Merge pull request #3256 from matrix-org/3218-official-prom

Switch to the Python Prometheus library
pull/3299/head
Amber Brown 2018-05-28 23:30:09 +10:00 committed by GitHub
commit 81717e8515
40 changed files with 583 additions and 1319 deletions

.gitignore vendored
View File

@@ -1,5 +1,6 @@
 *.pyc
 .*.swp
+*~
 .DS_Store
 _trial_temp/
@@ -13,6 +14,7 @@ docs/build/
 cmdclient_config.json
 homeserver*.db
 homeserver*.log
+homeserver*.log.*
 homeserver*.pid
 homeserver*.yaml
@@ -40,6 +42,7 @@ media_store/
 *.tac
 build/
+venv/
 localhost-800*/
 static/client/register/register_config.js

View File

@@ -57,7 +57,7 @@ class Auth(object):
         self.TOKEN_NOT_FOUND_HTTP_STATUS = 401
         self.token_cache = LruCache(CACHE_SIZE_FACTOR * 10000)
-        register_cache("token_cache", self.token_cache)
+        register_cache("cache", "token_cache", self.token_cache)

     @defer.inlineCallbacks
     def check_from_context(self, event, context, do_sig_check=True):

View File

@@ -34,8 +34,8 @@ from synapse.module_api import ModuleApi
 from synapse.http.additional_resource import AdditionalResource
 from synapse.http.server import RootRedirect
 from synapse.http.site import SynapseSite
-from synapse.metrics import register_memory_metrics
-from synapse.metrics.resource import METRICS_PREFIX, MetricsResource
+from synapse.metrics import RegistryProxy
+from synapse.metrics.resource import METRICS_PREFIX
 from synapse.python_dependencies import CONDITIONAL_REQUIREMENTS, \
     check_requirements
 from synapse.replication.http import ReplicationRestResource, REPLICATION_PREFIX
@@ -61,6 +61,8 @@ from twisted.web.resource import EncodingResourceWrapper, NoResource
 from twisted.web.server import GzipEncoderFactory
 from twisted.web.static import File
+from prometheus_client.twisted import MetricsResource

 logger = logging.getLogger("synapse.app.homeserver")
@@ -230,7 +232,7 @@ class SynapseHomeServer(HomeServer):
             resources[WEB_CLIENT_PREFIX] = build_resource_for_web_client(self)

         if name == "metrics" and self.get_config().enable_metrics:
-            resources[METRICS_PREFIX] = MetricsResource(self)
+            resources[METRICS_PREFIX] = MetricsResource(RegistryProxy())

         if name == "replication":
             resources[REPLICATION_PREFIX] = ReplicationRestResource(self)
@@ -362,8 +364,6 @@ def setup(config_options):
         hs.get_datastore().start_doing_background_updates()
         hs.get_federation_client().start_get_pdu_cache()

-        register_memory_metrics(hs)
-
     reactor.callWhenRunning(start)

     return hs
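Worth noting for anyone following along: MetricsResource now comes from prometheus_client itself, and it accepts anything with a collect() method as its registry (which is how RegistryProxy slots in above). A minimal, standalone sketch of serving metrics over Twisted with the library's default REGISTRY - the port and resource layout here are invented, not Synapse's actual wiring:

from prometheus_client import REGISTRY
from prometheus_client.twisted import MetricsResource
from twisted.internet import reactor
from twisted.web.resource import Resource
from twisted.web.server import Site

# Serve the default registry at http://localhost:9100/metrics
root = Resource()
root.putChild(b"metrics", MetricsResource(REGISTRY))
reactor.listenTCP(9100, Site(root))
reactor.run()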

View File

@@ -32,20 +32,17 @@ from synapse.federation.federation_base import (
     FederationBase,
     event_from_pdu_json,
 )
-import synapse.metrics
 from synapse.util import logcontext, unwrapFirstError
 from synapse.util.caches.expiringcache import ExpiringCache
 from synapse.util.logcontext import make_deferred_yieldable, run_in_background
 from synapse.util.logutils import log_function
 from synapse.util.retryutils import NotRetryingDestination
+from prometheus_client import Counter

 logger = logging.getLogger(__name__)

-# synapse.federation.federation_client is a silly name
-metrics = synapse.metrics.get_metrics_for("synapse.federation.client")
-
-sent_queries_counter = metrics.register_counter("sent_queries", labels=["type"])
+sent_queries_counter = Counter("synapse_federation_client_sent_queries", "", ["type"])

 PDU_RETRY_TIME_MS = 1 * 60 * 1000
@@ -108,7 +105,7 @@ class FederationClient(FederationBase):
             a Deferred which will eventually yield a JSON object from the
             response
         """
-        sent_queries_counter.inc(query_type)
+        sent_queries_counter.labels(query_type).inc()

         return self.transport_layer.make_query(
             destination, query_type, args, retry_on_dns_fail=retry_on_dns_fail,
@@ -127,7 +124,7 @@ class FederationClient(FederationBase):
             a Deferred which will eventually yield a JSON object from the
             response
         """
-        sent_queries_counter.inc("client_device_keys")
+        sent_queries_counter.labels("client_device_keys").inc()
         return self.transport_layer.query_client_keys(
             destination, content, timeout
         )
@@ -137,7 +134,7 @@ class FederationClient(FederationBase):
         """Query the device keys for a list of user ids hosted on a remote
         server.
         """
-        sent_queries_counter.inc("user_devices")
+        sent_queries_counter.labels("user_devices").inc()
         return self.transport_layer.query_user_devices(
             destination, user_id, timeout
         )
@@ -154,7 +151,7 @@ class FederationClient(FederationBase):
             a Deferred which will eventually yield a JSON object from the
             response
         """
-        sent_queries_counter.inc("client_one_time_keys")
+        sent_queries_counter.labels("client_one_time_keys").inc()
         return self.transport_layer.claim_client_keys(
             destination, content, timeout
         )
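This is the pattern repeated throughout the PR: the old wrapper's counter.inc(label_value) becomes a lookup of the labelled child followed by a plain inc(). A small illustration with made-up metric names:

from prometheus_client import Counter

sent_queries_counter = Counter(
    "example_sent_queries", "Outgoing queries, by query type", ["type"]
)

# Old Synapse wrapper:      sent_queries_counter.inc("client_device_keys")
# prometheus_client style:  pick the labelled child, then increment it
sent_queries_counter.labels("client_device_keys").inc()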

View File

@@ -27,12 +27,13 @@ from synapse.federation.federation_base import (
 from synapse.federation.persistence import TransactionActions
 from synapse.federation.units import Edu, Transaction
-import synapse.metrics
 from synapse.types import get_domain_from_id
 from synapse.util import async
 from synapse.util.caches.response_cache import ResponseCache
 from synapse.util.logutils import log_function
+from prometheus_client import Counter

 from six import iteritems

 # when processing incoming transactions, we try to handle multiple rooms in
@@ -41,17 +42,17 @@ TRANSACTION_CONCURRENCY_LIMIT = 10

 logger = logging.getLogger(__name__)

-# synapse.federation.federation_server is a silly name
-metrics = synapse.metrics.get_metrics_for("synapse.federation.server")
-
-received_pdus_counter = metrics.register_counter("received_pdus")
-received_edus_counter = metrics.register_counter("received_edus")
-received_queries_counter = metrics.register_counter("received_queries", labels=["type"])
+received_pdus_counter = Counter("synapse_federation_server_received_pdus", "")
+received_edus_counter = Counter("synapse_federation_server_received_edus", "")
+received_queries_counter = Counter(
+    "synapse_federation_server_received_queries", "", ["type"]
+)

 class FederationServer(FederationBase):

     def __init__(self, hs):
         super(FederationServer, self).__init__(hs)
@@ -131,7 +132,7 @@ class FederationServer(FederationBase):
         logger.debug("[%s] Transaction is new", transaction.transaction_id)

-        received_pdus_counter.inc_by(len(transaction.pdus))
+        received_pdus_counter.inc(len(transaction.pdus))

         pdus_by_room = {}
@@ -292,7 +293,7 @@ class FederationServer(FederationBase):
     @defer.inlineCallbacks
     def on_query_request(self, query_type, args):
-        received_queries_counter.inc(query_type)
+        received_queries_counter.labels(query_type).inc()
         resp = yield self.registry.on_query(query_type, args)
         defer.returnValue((200, resp))
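The other recurring change is inc_by(n), which has no direct equivalent: prometheus_client's Counter.inc() simply takes an optional amount. Sketch with invented names:

from prometheus_client import Counter

received_pdus_counter = Counter("example_received_pdus", "PDUs received")

pdus = ["event-a", "event-b", "event-c"]   # stand-in for transaction.pdus
received_pdus_counter.inc(len(pdus))       # was: received_pdus_counter.inc_by(len(pdus))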

View File

@@ -33,7 +33,7 @@ from .units import Edu
 from synapse.storage.presence import UserPresenceState
 from synapse.util.metrics import Measure
-import synapse.metrics
+from synapse.metrics import LaterGauge

 from blist import sorteddict
 from collections import namedtuple
@@ -45,9 +45,6 @@ from six import itervalues, iteritems
 logger = logging.getLogger(__name__)

-metrics = synapse.metrics.get_metrics_for(__name__)

 class FederationRemoteSendQueue(object):
     """A drop in replacement for TransactionQueue"""
@@ -77,10 +74,8 @@ class FederationRemoteSendQueue(object):
         # lambda binds to the queue rather than to the name of the queue which
         # changes. ARGH.
         def register(name, queue):
-            metrics.register_callback(
-                queue_name + "_size",
-                lambda: len(queue),
-            )
+            LaterGauge("synapse_federation_send_queue_%s_size" % (queue_name,),
+                       "", lambda: len(queue))

         for queue_name in [
             "presence_map", "presence_changed", "keyed_edu", "keyed_edu_changed",

View File

@@ -26,23 +26,23 @@ from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
 from synapse.util.metrics import measure_func
 from synapse.handlers.presence import format_user_presence_state, get_interested_remotes
 import synapse.metrics
+from synapse.metrics import LaterGauge
+from synapse.metrics import (
+    sent_edus_counter,
+    sent_transactions_counter,
+    events_processed_counter,
+)
+from prometheus_client import Counter

 import logging

 logger = logging.getLogger(__name__)

-metrics = synapse.metrics.get_metrics_for(__name__)
-client_metrics = synapse.metrics.get_metrics_for("synapse.federation.client")
-sent_pdus_destination_dist = client_metrics.register_distribution(
-    "sent_pdu_destinations"
+sent_pdus_destination_dist = Counter(
+    "synapse_federation_transaction_queue_sent_pdu_destinations", ""
 )
-sent_edus_counter = client_metrics.register_counter("sent_edus")
-sent_transactions_counter = client_metrics.register_counter("sent_transactions")
-events_processed_counter = client_metrics.register_counter("events_processed")

 class TransactionQueue(object):
@@ -69,8 +69,10 @@ class TransactionQueue(object):
         # done
         self.pending_transactions = {}

-        metrics.register_callback(
-            "pending_destinations",
+        LaterGauge(
+            "synapse_federation_transaction_queue_pending_destinations",
+            "",
+            [],
             lambda: len(self.pending_transactions),
         )
@@ -94,12 +96,16 @@ class TransactionQueue(object):
         # Map of destination -> (edu_type, key) -> Edu
         self.pending_edus_keyed_by_dest = edus_keyed = {}

-        metrics.register_callback(
-            "pending_pdus",
+        LaterGauge(
+            "synapse_federation_transaction_queue_pending_pdus",
+            "",
+            [],
             lambda: sum(map(len, pdus.values())),
         )
-        metrics.register_callback(
-            "pending_edus",
+        LaterGauge(
+            "synapse_federation_transaction_queue_pending_edus",
+            "",
+            [],
             lambda: (
                 sum(map(len, edus.values()))
                 + sum(map(len, presence.values()))
@@ -241,18 +247,15 @@ class TransactionQueue(object):
                 now = self.clock.time_msec()
                 ts = yield self.store.get_received_ts(events[-1].event_id)

-                synapse.metrics.event_processing_lag.set(
-                    now - ts, "federation_sender",
-                )
-                synapse.metrics.event_processing_last_ts.set(
-                    ts, "federation_sender",
-                )
+                synapse.metrics.event_processing_lag.labels(
+                    "federation_sender").set(now - ts)
+                synapse.metrics.event_processing_last_ts.labels(
+                    "federation_sender").set(ts)

-                events_processed_counter.inc_by(len(events))
+                events_processed_counter.inc(len(events))

-                synapse.metrics.event_processing_positions.set(
-                    next_token, "federation_sender",
-                )
+                synapse.metrics.event_processing_positions.labels(
+                    "federation_sender").set(next_token)

             finally:
                 self._is_processing = False
@@ -275,7 +278,7 @@ class TransactionQueue(object):
         if not destinations:
             return

-        sent_pdus_destination_dist.inc_by(len(destinations))
+        sent_pdus_destination_dist.inc(len(destinations))

         for destination in destinations:
             self.pending_pdus_by_dest.setdefault(destination, []).append(
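The event-processing gauges follow the same shape as the counters: select the child for a component name, then set() it. Self-contained sketch with an invented metric name and values:

import time

from prometheus_client import Gauge

event_processing_lag = Gauge("example_event_processing_lag", "", ["name"])

now = int(time.time() * 1000)
received_ts = now - 250           # pretend the event arrived 250 ms ago
event_processing_lag.labels("federation_sender").set(now - received_ts)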

View File

@@ -21,14 +21,13 @@ from synapse.util.metrics import Measure
 from synapse.util.logcontext import (
     make_deferred_yieldable, run_in_background,
 )
+from prometheus_client import Counter

 import logging

 logger = logging.getLogger(__name__)

-metrics = synapse.metrics.get_metrics_for(__name__)
-
-events_processed_counter = metrics.register_counter("events_processed")
+events_processed_counter = Counter("synapse_handlers_appservice_events_processed", "")

 def log_failure(failure):
@@ -128,18 +127,15 @@ class ApplicationServicesHandler(object):
                 now = self.clock.time_msec()
                 ts = yield self.store.get_received_ts(events[-1].event_id)

-                synapse.metrics.event_processing_positions.set(
-                    upper_bound, "appservice_sender",
-                )
+                synapse.metrics.event_processing_positions.labels(
+                    "appservice_sender").set(upper_bound)

-                events_processed_counter.inc_by(len(events))
+                events_processed_counter.inc(len(events))

-                synapse.metrics.event_processing_lag.set(
-                    now - ts, "appservice_sender",
-                )
-                synapse.metrics.event_processing_last_ts.set(
-                    ts, "appservice_sender",
-                )
+                synapse.metrics.event_processing_lag.labels(
+                    "appservice_sender").set(now - ts)
+                synapse.metrics.event_processing_last_ts.labels(
+                    "appservice_sender").set(ts)

             finally:
                 self.is_processing = False

View File

@@ -38,26 +38,29 @@ from synapse.util.logutils import log_function
 from synapse.util.metrics import Measure
 from synapse.util.wheel_timer import WheelTimer
 from synapse.types import UserID, get_domain_from_id
-import synapse.metrics
+from synapse.metrics import LaterGauge

 import logging

+from prometheus_client import Counter

 logger = logging.getLogger(__name__)

-metrics = synapse.metrics.get_metrics_for(__name__)
-
-notified_presence_counter = metrics.register_counter("notified_presence")
-federation_presence_out_counter = metrics.register_counter("federation_presence_out")
-presence_updates_counter = metrics.register_counter("presence_updates")
-timers_fired_counter = metrics.register_counter("timers_fired")
-federation_presence_counter = metrics.register_counter("federation_presence")
-bump_active_time_counter = metrics.register_counter("bump_active_time")
-
-get_updates_counter = metrics.register_counter("get_updates", labels=["type"])
-
-notify_reason_counter = metrics.register_counter("notify_reason", labels=["reason"])
-state_transition_counter = metrics.register_counter(
-    "state_transition", labels=["from", "to"]
+notified_presence_counter = Counter("synapse_handler_presence_notified_presence", "")
+federation_presence_out_counter = Counter(
+    "synapse_handler_presence_federation_presence_out", "")
+presence_updates_counter = Counter("synapse_handler_presence_presence_updates", "")
+timers_fired_counter = Counter("synapse_handler_presence_timers_fired", "")
+federation_presence_counter = Counter("synapse_handler_presence_federation_presence", "")
+bump_active_time_counter = Counter("synapse_handler_presence_bump_active_time", "")
+
+get_updates_counter = Counter("synapse_handler_presence_get_updates", "", ["type"])
+
+notify_reason_counter = Counter(
+    "synapse_handler_presence_notify_reason", "", ["reason"])
+state_transition_counter = Counter(
+    "synapse_handler_presence_state_transition", "", ["from", "to"]
 )
@@ -142,8 +145,9 @@ class PresenceHandler(object):
             for state in active_presence
         }

-        metrics.register_callback(
-            "user_to_current_state_size", lambda: len(self.user_to_current_state)
+        LaterGauge(
+            "synapse_handlers_presence_user_to_current_state_size", "", [],
+            lambda: len(self.user_to_current_state)
         )

         now = self.clock.time_msec()
@@ -213,7 +217,8 @@ class PresenceHandler(object):
             60 * 1000,
         )

-        metrics.register_callback("wheel_timer_size", lambda: len(self.wheel_timer))
+        LaterGauge("synapse_handlers_presence_wheel_timer_size", "", [],
+                   lambda: len(self.wheel_timer))

     @defer.inlineCallbacks
     def _on_shutdown(self):
@@ -316,10 +321,10 @@ class PresenceHandler(object):
             # TODO: We should probably ensure there are no races hereafter

-            presence_updates_counter.inc_by(len(new_states))
+            presence_updates_counter.inc(len(new_states))

             if to_notify:
-                notified_presence_counter.inc_by(len(to_notify))
+                notified_presence_counter.inc(len(to_notify))
                 yield self._persist_and_notify(to_notify.values())

             self.unpersisted_users_changes |= set(s.user_id for s in new_states)
@@ -330,7 +335,7 @@ class PresenceHandler(object):
                 if user_id not in to_notify
             }
             if to_federation_ping:
-                federation_presence_out_counter.inc_by(len(to_federation_ping))
+                federation_presence_out_counter.inc(len(to_federation_ping))

                 self._push_to_remotes(to_federation_ping.values())
@@ -368,7 +373,7 @@ class PresenceHandler(object):
                 for user_id in users_to_check
             ]

-            timers_fired_counter.inc_by(len(states))
+            timers_fired_counter.inc(len(states))

             changes = handle_timeouts(
                 states,
@@ -657,7 +662,7 @@ class PresenceHandler(object):
                 updates.append(prev_state.copy_and_replace(**new_fields))

         if updates:
-            federation_presence_counter.inc_by(len(updates))
+            federation_presence_counter.inc(len(updates))
             yield self._update_states(updates)

     @defer.inlineCallbacks
@@ -932,28 +937,28 @@ def should_notify(old_state, new_state):
         return False

     if old_state.status_msg != new_state.status_msg:
-        notify_reason_counter.inc("status_msg_change")
+        notify_reason_counter.labels("status_msg_change").inc()
         return True

     if old_state.state != new_state.state:
-        notify_reason_counter.inc("state_change")
-        state_transition_counter.inc(old_state.state, new_state.state)
+        notify_reason_counter.labels("state_change").inc()
+        state_transition_counter.labels(old_state.state, new_state.state).inc()
         return True

     if old_state.state == PresenceState.ONLINE:
         if new_state.currently_active != old_state.currently_active:
-            notify_reason_counter.inc("current_active_change")
+            notify_reason_counter.labels("current_active_change").inc()
            return True

         if new_state.last_active_ts - old_state.last_active_ts > LAST_ACTIVE_GRANULARITY:
             # Only notify about last active bumps if we're not currently acive
             if not new_state.currently_active:
-                notify_reason_counter.inc("last_active_change_online")
+                notify_reason_counter.labels("last_active_change_online").inc()
                 return True

     elif new_state.last_active_ts - old_state.last_active_ts > LAST_ACTIVE_GRANULARITY:
         # Always notify for a transition where last active gets bumped.
-        notify_reason_counter.inc("last_active_change_not_online")
+        notify_reason_counter.labels("last_active_change_not_online").inc()
         return True

     return False
@@ -1027,14 +1032,14 @@ class PresenceEventSource(object):
             if changed is not None and len(changed) < 500:
                 # For small deltas, its quicker to get all changes and then
                 # work out if we share a room or they're in our presence list
-                get_updates_counter.inc("stream")
+                get_updates_counter.labels("stream").inc()
                 for other_user_id in changed:
                     if other_user_id in users_interested_in:
                         user_ids_changed.add(other_user_id)
             else:
                 # Too many possible updates. Find all users we can see and check
                 # if any of them have changed.
-                get_updates_counter.inc("full")
+                get_updates_counter.labels("full").inc()

                 if from_key:
                     user_ids_changed = stream_change_cache.get_entities_changed(
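state_transition_counter above carries two labels; with prometheus_client the values are passed to labels() positionally, in the order the label names were declared. Illustrative sketch (names invented):

from prometheus_client import Counter

state_transition_counter = Counter(
    "example_presence_state_transition", "", ["from", "to"]
)

# Positional label values, in declaration order ("from", then "to"):
state_transition_counter.labels("online", "unavailable").inc()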

View File

@@ -23,7 +23,6 @@ from synapse.http import cancelled_to_request_timed_out_error
 from synapse.util.async import add_timeout_to_deferred
 from synapse.util.caches import CACHE_SIZE_FACTOR
 from synapse.util.logcontext import make_deferred_yieldable
-import synapse.metrics
 from synapse.http.endpoint import SpiderEndpoint

 from canonicaljson import encode_canonical_json
@@ -42,6 +41,7 @@ from twisted.web._newclient import ResponseDone

 from six import StringIO

+from prometheus_client import Counter

 import simplejson as json
 import logging
 import urllib
@@ -49,16 +49,9 @@ import urllib

 logger = logging.getLogger(__name__)

-metrics = synapse.metrics.get_metrics_for(__name__)
-
-outgoing_requests_counter = metrics.register_counter(
-    "requests",
-    labels=["method"],
-)
-incoming_responses_counter = metrics.register_counter(
-    "responses",
-    labels=["method", "code"],
-)
+outgoing_requests_counter = Counter("synapse_http_client_requests", "", ["method"])
+incoming_responses_counter = Counter("synapse_http_client_responses", "",
+                                     ["method", "code"])

 class SimpleHttpClient(object):
@@ -95,7 +88,7 @@ class SimpleHttpClient(object):
     def request(self, method, uri, *args, **kwargs):
         # A small wrapper around self.agent.request() so we can easily attach
         # counters to it
-        outgoing_requests_counter.inc(method)
+        outgoing_requests_counter.labels(method).inc()

         logger.info("Sending request %s %s", method, uri)
@@ -109,14 +102,14 @@ class SimpleHttpClient(object):
             )
             response = yield make_deferred_yieldable(request_deferred)

-            incoming_responses_counter.inc(method, response.code)
+            incoming_responses_counter.labels(method, response.code).inc()
             logger.info(
                 "Received response to %s %s: %s",
                 method, uri, response.code
             )
             defer.returnValue(response)
         except Exception as e:
-            incoming_responses_counter.inc(method, "ERR")
+            incoming_responses_counter.labels(method, "ERR").inc()
             logger.info(
                 "Error sending request to %s %s: %s %s",
                 method, uri, type(e).__name__, e.message
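One detail worth keeping in mind from the code above: failures are recorded on the same response counter under a sentinel "ERR" code rather than in a separate metric, which keeps the set of label values small. Roughly, with invented names:

from prometheus_client import Counter

incoming_responses_counter = Counter(
    "example_http_client_responses", "", ["method", "code"]
)

def fetch(method, do_request):
    try:
        response_code = do_request()
    except Exception:
        # Failures share the metric, under a sentinel "code" label value.
        incoming_responses_counter.labels(method, "ERR").inc()
        raise
    incoming_responses_counter.labels(method, str(response_code)).inc()
    return response_code

fetch("GET", lambda: 200)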

View File

@@ -45,19 +45,15 @@ from six.moves.urllib import parse as urlparse
 from six import string_types

+from prometheus_client import Counter

 logger = logging.getLogger(__name__)
 outbound_logger = logging.getLogger("synapse.http.outbound")

-metrics = synapse.metrics.get_metrics_for(__name__)
-
-outgoing_requests_counter = metrics.register_counter(
-    "requests",
-    labels=["method"],
-)
-incoming_responses_counter = metrics.register_counter(
-    "responses",
-    labels=["method", "code"],
-)
+outgoing_requests_counter = Counter("synapse_http_matrixfederationclient_requests",
+                                    "", ["method"])
+incoming_responses_counter = Counter("synapse_http_matrixfederationclient_responses",
+                                     "", ["method", "code"])

 MAX_LONG_RETRIES = 10

View File

@@ -16,137 +16,109 @@
 import logging

-import synapse.metrics
+from prometheus_client.core import Counter, Histogram
+from synapse.metrics import LaterGauge

 from synapse.util.logcontext import LoggingContext

 logger = logging.getLogger(__name__)

-metrics = synapse.metrics.get_metrics_for("synapse.http.server")
-
 # total number of responses served, split by method/servlet/tag
-response_count = metrics.register_counter(
-    "response_count",
-    labels=["method", "servlet", "tag"],
-    alternative_names=(
-        # the following are all deprecated aliases for the same metric
-        metrics.name_prefix + x for x in (
-            "_requests",
-            "_response_time:count",
-            "_response_ru_utime:count",
-            "_response_ru_stime:count",
-            "_response_db_txn_count:count",
-            "_response_db_txn_duration:count",
-        )
-    )
+response_count = Counter(
+    "synapse_http_server_response_count", "", ["method", "servlet", "tag"]
 )

-requests_counter = metrics.register_counter(
-    "requests_received",
-    labels=["method", "servlet", ],
+requests_counter = Counter(
+    "synapse_http_server_requests_received", "", ["method", "servlet"]
 )

-outgoing_responses_counter = metrics.register_counter(
-    "responses",
-    labels=["method", "code"],
+outgoing_responses_counter = Counter(
+    "synapse_http_server_responses", "", ["method", "code"]
 )

-response_timer = metrics.register_counter(
-    "response_time_seconds",
-    labels=["method", "servlet", "tag"],
-    alternative_names=(
-        metrics.name_prefix + "_response_time:total",
-    ),
+response_timer = Histogram(
+    "synapse_http_server_response_time_seconds", "sec", ["method", "servlet", "tag"]
 )

-response_ru_utime = metrics.register_counter(
-    "response_ru_utime_seconds", labels=["method", "servlet", "tag"],
-    alternative_names=(
-        metrics.name_prefix + "_response_ru_utime:total",
-    ),
+response_ru_utime = Counter(
+    "synapse_http_server_response_ru_utime_seconds", "sec", ["method", "servlet", "tag"]
 )

-response_ru_stime = metrics.register_counter(
-    "response_ru_stime_seconds", labels=["method", "servlet", "tag"],
-    alternative_names=(
-        metrics.name_prefix + "_response_ru_stime:total",
-    ),
+response_ru_stime = Counter(
+    "synapse_http_server_response_ru_stime_seconds", "sec", ["method", "servlet", "tag"]
 )

-response_db_txn_count = metrics.register_counter(
-    "response_db_txn_count", labels=["method", "servlet", "tag"],
-    alternative_names=(
-        metrics.name_prefix + "_response_db_txn_count:total",
-    ),
+response_db_txn_count = Counter(
+    "synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"]
 )

 # seconds spent waiting for db txns, excluding scheduling time, when processing
 # this request
-response_db_txn_duration = metrics.register_counter(
-    "response_db_txn_duration_seconds", labels=["method", "servlet", "tag"],
-    alternative_names=(
-        metrics.name_prefix + "_response_db_txn_duration:total",
-    ),
+response_db_txn_duration = Counter(
+    "synapse_http_server_response_db_txn_duration_seconds",
+    "",
+    ["method", "servlet", "tag"],
 )

 # seconds spent waiting for a db connection, when processing this request
-response_db_sched_duration = metrics.register_counter(
-    "response_db_sched_duration_seconds", labels=["method", "servlet", "tag"]
+response_db_sched_duration = Counter(
+    "synapse_http_server_response_db_sched_duration_seconds",
+    "",
+    ["method", "servlet", "tag"],
 )

 # size in bytes of the response written
-response_size = metrics.register_counter(
-    "response_size", labels=["method", "servlet", "tag"]
+response_size = Counter(
+    "synapse_http_server_response_size", "", ["method", "servlet", "tag"]
 )

 # In flight metrics are incremented while the requests are in flight, rather
 # than when the response was written.

-in_flight_requests_ru_utime = metrics.register_counter(
-    "in_flight_requests_ru_utime_seconds", labels=["method", "servlet"],
+in_flight_requests_ru_utime = Counter(
+    "synapse_http_server_in_flight_requests_ru_utime_seconds",
+    "",
+    ["method", "servlet"],
 )

-in_flight_requests_ru_stime = metrics.register_counter(
-    "in_flight_requests_ru_stime_seconds", labels=["method", "servlet"],
+in_flight_requests_ru_stime = Counter(
+    "synapse_http_server_in_flight_requests_ru_stime_seconds",
+    "",
+    ["method", "servlet"],
 )

-in_flight_requests_db_txn_count = metrics.register_counter(
-    "in_flight_requests_db_txn_count", labels=["method", "servlet"],
+in_flight_requests_db_txn_count = Counter(
+    "synapse_http_server_in_flight_requests_db_txn_count", "", ["method", "servlet"]
 )

 # seconds spent waiting for db txns, excluding scheduling time, when processing
 # this request
-in_flight_requests_db_txn_duration = metrics.register_counter(
-    "in_flight_requests_db_txn_duration_seconds", labels=["method", "servlet"],
+in_flight_requests_db_txn_duration = Counter(
+    "synapse_http_server_in_flight_requests_db_txn_duration_seconds",
+    "",
+    ["method", "servlet"],
 )

 # seconds spent waiting for a db connection, when processing this request
-in_flight_requests_db_sched_duration = metrics.register_counter(
-    "in_flight_requests_db_sched_duration_seconds", labels=["method", "servlet"]
+in_flight_requests_db_sched_duration = Counter(
+    "synapse_http_server_in_flight_requests_db_sched_duration_seconds",
+    "",
+    ["method", "servlet"],
 )

 # The set of all in flight requests, set[RequestMetrics]
 _in_flight_requests = set()

-def _collect_in_flight():
-    """Called just before metrics are collected, so we use it to update all
-    the in flight request metrics
-    """
-    for rm in _in_flight_requests:
-        rm.update_metrics()
-
-metrics.register_collector(_collect_in_flight)

 def _get_in_flight_counts():
     """Returns a count of all in flight requests by (method, server_name)

     Returns:
         dict[tuple[str, str], int]
     """
+    for rm in _in_flight_requests:
+        rm.update_metrics()

     # Map from (method, name) -> int, the number of in flight requests of that
     # type
@@ -158,16 +130,17 @@ def _get_in_flight_counts():

     return counts

-metrics.register_callback(
-    "in_flight_requests_count",
+LaterGauge(
+    "synapse_http_request_metrics_in_flight_requests_count",
+    "",
+    ["method", "servlet"],
     _get_in_flight_counts,
-    labels=["method", "servlet"]
 )

 class RequestMetrics(object):
-    def start(self, time_msec, name, method):
-        self.start = time_msec
+    def start(self, time_sec, name, method):
+        self.start = time_sec
         self.start_context = LoggingContext.current_context()
         self.name = name
         self.method = method
@@ -176,7 +149,7 @@ class RequestMetrics(object):

         _in_flight_requests.add(self)

-    def stop(self, time_msec, request):
+    def stop(self, time_sec, request):
         _in_flight_requests.discard(self)

         context = LoggingContext.current_context()
@@ -192,34 +165,29 @@ class RequestMetrics(object):
             )
             return

-        outgoing_responses_counter.inc(request.method, str(request.code))
+        outgoing_responses_counter.labels(request.method, str(request.code)).inc()

-        response_count.inc(request.method, self.name, tag)
+        response_count.labels(request.method, self.name, tag).inc()

-        response_timer.inc_by(
-            time_msec - self.start, request.method,
-            self.name, tag
+        response_timer.labels(request.method, self.name, tag).observe(
+            time_sec - self.start
         )

         ru_utime, ru_stime = context.get_resource_usage()

-        response_ru_utime.inc_by(
-            ru_utime, request.method, self.name, tag
-        )
-        response_ru_stime.inc_by(
-            ru_stime, request.method, self.name, tag
-        )
-        response_db_txn_count.inc_by(
-            context.db_txn_count, request.method, self.name, tag
-        )
-        response_db_txn_duration.inc_by(
-            context.db_txn_duration_ms / 1000., request.method, self.name, tag
-        )
-        response_db_sched_duration.inc_by(
-            context.db_sched_duration_ms / 1000., request.method, self.name, tag
+        response_ru_utime.labels(request.method, self.name, tag).inc(ru_utime)
+        response_ru_stime.labels(request.method, self.name, tag).inc(ru_stime)
+        response_db_txn_count.labels(request.method, self.name, tag).inc(
+            context.db_txn_count
+        )
+        response_db_txn_duration.labels(request.method, self.name, tag).inc(
+            context.db_txn_duration_sec
+        )
+        response_db_sched_duration.labels(request.method, self.name, tag).inc(
+            context.db_sched_duration_sec
         )

-        response_size.inc_by(request.sentLength, request.method, self.name, tag)
+        response_size.labels(request.method, self.name, tag).inc(request.sentLength)

         # We always call this at the end to ensure that we update the metrics
         # regardless of whether a call to /metrics while the request was in
@@ -229,27 +197,21 @@ class RequestMetrics(object):
     def update_metrics(self):
        """Updates the in flight metrics with values from this request.
        """
         diff = self._request_stats.update(self.start_context)

-        in_flight_requests_ru_utime.inc_by(
-            diff.ru_utime, self.method, self.name,
-        )
-        in_flight_requests_ru_stime.inc_by(
-            diff.ru_stime, self.method, self.name,
-        )
-        in_flight_requests_db_txn_count.inc_by(
-            diff.db_txn_count, self.method, self.name,
-        )
-        in_flight_requests_db_txn_duration.inc_by(
-            diff.db_txn_duration_ms / 1000., self.method, self.name,
-        )
-        in_flight_requests_db_sched_duration.inc_by(
-            diff.db_sched_duration_ms / 1000., self.method, self.name,
+        in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime)
+        in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime)
+        in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
+            diff.db_txn_count
+        )
+        in_flight_requests_db_txn_duration.labels(self.method, self.name).inc(
+            diff.db_txn_duration_sec
+        )
+        in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
+            diff.db_sched_duration_sec
         )
@@ -258,17 +220,21 @@ class _RequestStats(object):
     """

     __slots__ = [
-        "ru_utime", "ru_stime",
-        "db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms",
+        "ru_utime",
+        "ru_stime",
+        "db_txn_count",
+        "db_txn_duration_sec",
+        "db_sched_duration_sec",
     ]

-    def __init__(self, ru_utime, ru_stime, db_txn_count,
-                 db_txn_duration_ms, db_sched_duration_ms):
+    def __init__(
+        self, ru_utime, ru_stime, db_txn_count, db_txn_duration_sec, db_sched_duration_sec
+    ):
         self.ru_utime = ru_utime
         self.ru_stime = ru_stime
         self.db_txn_count = db_txn_count
-        self.db_txn_duration_ms = db_txn_duration_ms
-        self.db_sched_duration_ms = db_sched_duration_ms
+        self.db_txn_duration_sec = db_txn_duration_sec
+        self.db_sched_duration_sec = db_sched_duration_sec

     @staticmethod
     def from_context(context):
@@ -277,8 +243,8 @@ class _RequestStats(object):
         return _RequestStats(
             ru_utime, ru_stime,
             context.db_txn_count,
-            context.db_txn_duration_ms,
-            context.db_sched_duration_ms,
+            context.db_txn_duration_sec,
+            context.db_sched_duration_sec,
         )

     def update(self, context):
@@ -294,14 +260,14 @@ class _RequestStats(object):
             new.ru_utime - self.ru_utime,
             new.ru_stime - self.ru_stime,
             new.db_txn_count - self.db_txn_count,
-            new.db_txn_duration_ms - self.db_txn_duration_ms,
-            new.db_sched_duration_ms - self.db_sched_duration_ms,
+            new.db_txn_duration_sec - self.db_txn_duration_sec,
+            new.db_sched_duration_sec - self.db_sched_duration_sec,
         )

         self.ru_utime = new.ru_utime
         self.ru_stime = new.ru_stime
         self.db_txn_count = new.db_txn_count
-        self.db_txn_duration_ms = new.db_txn_duration_ms
-        self.db_sched_duration_ms = new.db_sched_duration_ms
+        self.db_txn_duration_sec = new.db_txn_duration_sec
+        self.db_sched_duration_sec = new.db_sched_duration_sec

         return diff
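response_timer is the one metric here that changes type as well as name: it becomes a Histogram, and observe() records a duration in seconds (this file also switches all durations from milliseconds to seconds). A minimal timing sketch with an invented metric name:

import time

from prometheus_client import Histogram

response_timer = Histogram(
    "example_response_time_seconds", "Time to handle a request (sec)",
    ["method", "servlet"],
)

start = time.time()
time.sleep(0.05)   # stand-in for handling the request
response_timer.labels("GET", "ExampleServlet").observe(time.time() - start)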

View File

@@ -210,8 +210,8 @@ def wrap_request_handler_with_logging(h):
                 # dispatching to the handler, so that the handler
                 # can update the servlet name in the request
                 # metrics
-                requests_counter.inc(request.method,
-                                     request.request_metrics.name)
+                requests_counter.labels(request.method,
+                                        request.request_metrics.name).inc()
                 yield d

     return wrapped_request_handler

View File

@@ -83,7 +83,7 @@ class SynapseRequest(Request):
         return Request.render(self, resrc)

     def _started_processing(self, servlet_name):
-        self.start_time = int(time.time() * 1000)
+        self.start_time = time.time()
         self.request_metrics = RequestMetrics()
         self.request_metrics.start(
             self.start_time, name=servlet_name, method=self.method,
@@ -102,26 +102,26 @@ class SynapseRequest(Request):
             context = LoggingContext.current_context()
             ru_utime, ru_stime = context.get_resource_usage()
             db_txn_count = context.db_txn_count
-            db_txn_duration_ms = context.db_txn_duration_ms
-            db_sched_duration_ms = context.db_sched_duration_ms
+            db_txn_duration_sec = context.db_txn_duration_sec
+            db_sched_duration_sec = context.db_sched_duration_sec
         except Exception:
             ru_utime, ru_stime = (0, 0)
-            db_txn_count, db_txn_duration_ms = (0, 0)
+            db_txn_count, db_txn_duration_sec = (0, 0)

-        end_time = int(time.time() * 1000)
+        end_time = time.time()

         self.site.access_logger.info(
             "%s - %s - {%s}"
-            " Processed request: %dms (%dms, %dms) (%dms/%dms/%d)"
+            " Processed request: %.3fsec (%.3fsec, %.3fsec) (%.3fsec/%.3fsec/%d)"
             " %sB %s \"%s %s %s\" \"%s\"",
             self.getClientIP(),
             self.site.site_tag,
             self.authenticated_entity,
             end_time - self.start_time,
-            int(ru_utime * 1000),
-            int(ru_stime * 1000),
-            db_sched_duration_ms,
-            db_txn_duration_ms,
+            ru_utime,
+            ru_stime,
+            db_sched_duration_sec,
+            db_txn_duration_sec,
             int(db_txn_count),
             self.sentLength,
             self.code,

View File

@@ -17,165 +17,170 @@ import logging
 import functools
 import time
 import gc
+import os
 import platform

+import attr
+from prometheus_client import Gauge, Histogram, Counter
+from prometheus_client.core import GaugeMetricFamily, REGISTRY

 from twisted.internet import reactor

-from .metric import (
-    CounterMetric, CallbackMetric, DistributionMetric, CacheMetric,
-    MemoryUsageMetric, GaugeMetric,
-)
-from .process_collector import register_process_collector

 logger = logging.getLogger(__name__)

-running_on_pypy = platform.python_implementation() == 'PyPy'
+running_on_pypy = platform.python_implementation() == "PyPy"
 all_metrics = []
 all_collectors = []
+all_gauges = {}

-class Metrics(object):
-    """ A single Metrics object gives a (mutable) slice view of the all_metrics
-    dict, allowing callers to easily register new metrics that are namespaced
-    nicely."""
-
-    def __init__(self, name):
-        self.name_prefix = name
-
-    def make_subspace(self, name):
-        return Metrics("%s_%s" % (self.name_prefix, name))
-
-    def register_collector(self, func):
-        all_collectors.append(func)
-
-    def _register(self, metric_class, name, *args, **kwargs):
-        full_name = "%s_%s" % (self.name_prefix, name)
-
-        metric = metric_class(full_name, *args, **kwargs)
-
-        all_metrics.append(metric)
-        return metric
-
-    def register_counter(self, *args, **kwargs):
-        """
-        Returns:
-            CounterMetric
-        """
-        return self._register(CounterMetric, *args, **kwargs)
-
-    def register_gauge(self, *args, **kwargs):
-        """
-        Returns:
-            GaugeMetric
-        """
-        return self._register(GaugeMetric, *args, **kwargs)
-
-    def register_callback(self, *args, **kwargs):
-        """
-        Returns:
-            CallbackMetric
-        """
-        return self._register(CallbackMetric, *args, **kwargs)
-
-    def register_distribution(self, *args, **kwargs):
-        """
-        Returns:
-            DistributionMetric
-        """
-        return self._register(DistributionMetric, *args, **kwargs)
-
-    def register_cache(self, *args, **kwargs):
-        """
-        Returns:
-            CacheMetric
-        """
-        return self._register(CacheMetric, *args, **kwargs)
-
-def register_memory_metrics(hs):
-    try:
-        import psutil
-        process = psutil.Process()
-        process.memory_info().rss
-    except (ImportError, AttributeError):
-        logger.warn(
-            "psutil is not installed or incorrect version."
-            " Disabling memory metrics."
-        )
-        return
-    metric = MemoryUsageMetric(hs, psutil)
-    all_metrics.append(metric)
+class RegistryProxy(object):
+
+    def collect(self):
+        for metric in REGISTRY.collect():
+            if not metric.name.startswith("__"):
+                yield metric
+
+@attr.s(hash=True)
+class LaterGauge(object):
+
+    name = attr.ib()
+    desc = attr.ib()
+    labels = attr.ib(hash=False)
+    caller = attr.ib()
+
+    def collect(self):
+
+        g = GaugeMetricFamily(self.name, self.desc, labels=self.labels)
+
+        try:
+            calls = self.caller()
+        except Exception as e:
+            print(e)
+            logger.err()
+            yield g
+
+        if isinstance(calls, dict):
+            for k, v in calls.items():
+                g.add_metric(k, v)
+        else:
+            g.add_metric([], calls)
+
+        yield g
+
+    def __attrs_post_init__(self):
+        self._register()
+
+    def _register(self):
+        if self.name in all_gauges.keys():
+            logger.warning("%s already registered, reregistering" % (self.name,))
+            REGISTRY.unregister(all_gauges.pop(self.name))
+
+        REGISTRY.register(self)
+        all_gauges[self.name] = self
-def get_metrics_for(pkg_name):
-    """ Returns a Metrics instance for conveniently creating metrics
-    namespaced with the given name prefix. """
-
-    # Convert a "package.name" to "package_name" because Prometheus doesn't
-    # let us use . in metric names
-    return Metrics(pkg_name.replace(".", "_"))
-
-def render_all():
-    strs = []
-
-    for collector in all_collectors:
-        collector()
-
-    for metric in all_metrics:
-        try:
-            strs += metric.render()
-        except Exception:
-            strs += ["# FAILED to render"]
-            logger.exception("Failed to render metric")
-
-    strs.append("")  # to generate a final CRLF
-
-    return "\n".join(strs)
-
-register_process_collector(get_metrics_for("process"))
-
-python_metrics = get_metrics_for("python")
-
-gc_time = python_metrics.register_distribution("gc_time", labels=["gen"])
-gc_unreachable = python_metrics.register_counter("gc_unreachable_total", labels=["gen"])
-python_metrics.register_callback(
-    "gc_counts", lambda: {(i,): v for i, v in enumerate(gc.get_count())}, labels=["gen"]
-)
-
-reactor_metrics = get_metrics_for("python.twisted.reactor")
-tick_time = reactor_metrics.register_distribution("tick_time")
-pending_calls_metric = reactor_metrics.register_distribution("pending_calls")
-
-synapse_metrics = get_metrics_for("synapse")
+#
+# Detailed CPU metrics
+#
+
+class CPUMetrics(object):
+
+    def __init__(self):
+        ticks_per_sec = 100
+        try:
+            # Try and get the system config
+            ticks_per_sec = os.sysconf('SC_CLK_TCK')
+        except (ValueError, TypeError, AttributeError):
+            pass
+
+        self.ticks_per_sec = ticks_per_sec
+
+    def collect(self):
+
+        with open("/proc/self/stat") as s:
+            line = s.read()
+            raw_stats = line.split(") ", 1)[1].split(" ")
+
+            user = GaugeMetricFamily("process_cpu_user_seconds_total", "")
+            user.add_metric([], float(raw_stats[11]) / self.ticks_per_sec)
+            yield user
+
+            sys = GaugeMetricFamily("process_cpu_system_seconds_total", "")
+            sys.add_metric([], float(raw_stats[12]) / self.ticks_per_sec)
+            yield sys
+
+REGISTRY.register(CPUMetrics())
+
+#
+# Python GC metrics
+#
+
+gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"])
+gc_time = Histogram(
+    "python_gc_time",
+    "Time taken to GC (sec)",
+    ["gen"],
+    buckets=[0.0025, 0.005, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 1.00, 2.50,
+             5.00, 7.50, 15.00, 30.00, 45.00, 60.00],
+)
+
+class GCCounts(object):
+
+    def collect(self):
+        cm = GaugeMetricFamily("python_gc_counts", "GC cycle counts", labels=["gen"])
+        for n, m in enumerate(gc.get_count()):
+            cm.add_metric([str(n)], m)
+
+        yield cm
+
+REGISTRY.register(GCCounts())
+
+#
+# Twisted reactor metrics
+#
+
+tick_time = Histogram(
+    "python_twisted_reactor_tick_time",
+    "Tick time of the Twisted reactor (sec)",
+    buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
+)
+pending_calls_metric = Histogram(
+    "python_twisted_reactor_pending_calls",
+    "Pending calls",
+    buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000],
+)
+
+#
+# Federation Metrics
+#
+
+sent_edus_counter = Counter("synapse_federation_client_sent_edus", "")
+sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "")
+events_processed_counter = Counter("synapse_federation_client_events_processed", "")

 # Used to track where various components have processed in the event stream,
 # e.g. federation sending, appservice sending, etc.
-event_processing_positions = synapse_metrics.register_gauge(
-    "event_processing_positions", labels=["name"],
-)
+event_processing_positions = Gauge("synapse_event_processing_positions", "", ["name"])

 # Used to track the current max events stream position
-event_persisted_position = synapse_metrics.register_gauge(
-    "event_persisted_position",
-)
+event_persisted_position = Gauge("synapse_event_persisted_position", "")

 # Used to track the received_ts of the last event processed by various
 # components
-event_processing_last_ts = synapse_metrics.register_gauge(
-    "event_processing_last_ts", labels=["name"],
-)
+event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name"])

 # Used to track the lag processing events. This is the time difference
 # between the last processed event's received_ts and the time it was
 # finished being processed.
-event_processing_lag = synapse_metrics.register_gauge(
-    "event_processing_lag", labels=["name"],
-)
+event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"])

 def runUntilCurrentTimer(func):
@@ -197,17 +202,17 @@ def runUntilCurrentTimer(func):
             num_pending += 1

         num_pending += len(reactor.threadCallQueue)
-        start = time.time() * 1000
+        start = time.time()
         ret = func(*args, **kwargs)
-        end = time.time() * 1000
+        end = time.time()

         # record the amount of wallclock time spent running pending calls.
         # This is a proxy for the actual amount of time between reactor polls,
         # since about 25% of time is actually spent running things triggered by
         # I/O events, but that is harder to capture without rewriting half the
         # reactor.
-        tick_time.inc_by(end - start)
-        pending_calls_metric.inc_by(num_pending)
+        tick_time.observe(end - start)
+        pending_calls_metric.observe(num_pending)

         if running_on_pypy:
             return ret
@@ -220,12 +225,12 @@ def runUntilCurrentTimer(func):
                 if threshold[i] < counts[i]:
                     logger.info("Collecting gc %d", i)

-                    start = time.time() * 1000
+                    start = time.time()
                     unreachable = gc.collect(i)
-                    end = time.time() * 1000
+                    end = time.time()

-                    gc_time.inc_by(end - start, i)
-                    gc_unreachable.inc_by(unreachable, i)
+                    gc_time.labels(i).observe(end - start)
+                    gc_unreachable.labels(i).set(unreachable)

         return ret
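RegistryProxy, LaterGauge, CPUMetrics and GCCounts above all use the same prometheus_client hook: any object with a collect() method that yields metric families can be registered. A minimal custom collector in the same style, with made-up data:

from prometheus_client.core import GaugeMetricFamily, REGISTRY

class QueueDepthCollector(object):
    """Reports the current depth of some in-memory queues at scrape time."""

    def __init__(self, queues):
        self.queues = queues

    def collect(self):
        g = GaugeMetricFamily("example_queue_depth", "Items queued", labels=["queue"])
        for name, queue in self.queues.items():
            g.add_metric([name], len(queue))
        yield g

REGISTRY.register(QueueDepthCollector({"inbound": [], "outbound": [1, 2, 3]}))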

View File

@@ -1,328 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2015, 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from itertools import chain
import logging
import re
logger = logging.getLogger(__name__)
def flatten(items):
"""Flatten a list of lists
Args:
items: iterable[iterable[X]]
Returns:
list[X]: flattened list
"""
return list(chain.from_iterable(items))
class BaseMetric(object):
"""Base class for metrics which report a single value per label set
"""
def __init__(self, name, labels=[], alternative_names=[]):
"""
Args:
name (str): principal name for this metric
labels (list(str)): names of the labels which will be reported
for this metric
alternative_names (iterable(str)): list of alternative names for
this metric. This can be useful to provide a migration path
when renaming metrics.
"""
self._names = [name] + list(alternative_names)
self.labels = labels # OK not to clone as we never write it
def dimension(self):
return len(self.labels)
def is_scalar(self):
return not len(self.labels)
def _render_labelvalue(self, value):
return '"%s"' % (_escape_label_value(value),)
def _render_key(self, values):
if self.is_scalar():
return ""
return "{%s}" % (
",".join(["%s=%s" % (k, self._render_labelvalue(v))
for k, v in zip(self.labels, values)])
)
def _render_for_labels(self, label_values, value):
"""Render this metric for a single set of labels
Args:
label_values (list[object]): values for each of the labels,
(which get stringified).
value: value of the metric at with these labels
Returns:
iterable[str]: rendered metric
"""
rendered_labels = self._render_key(label_values)
return (
"%s%s %.12g" % (name, rendered_labels, value)
for name in self._names
)
def render(self):
"""Render this metric
Each metric is rendered as:
name{label1="val1",label2="val2"} value
https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
Returns:
iterable[str]: rendered metrics
"""
raise NotImplementedError()
class CounterMetric(BaseMetric):
"""The simplest kind of metric; one that stores a monotonically-increasing
value that counts events or running totals.
Example use cases for Counters:
- Number of requests processed
- Number of items that were inserted into a queue
- Total amount of data that a system has processed
Counters can only go up (and be reset when the process restarts).
"""
def __init__(self, *args, **kwargs):
super(CounterMetric, self).__init__(*args, **kwargs)
# dict[list[str]]: value for each set of label values. the keys are the
# label values, in the same order as the labels in self.labels.
#
# (if the metric is a scalar, the (single) key is the empty tuple).
self.counts = {}
# Scalar metrics are never empty
if self.is_scalar():
self.counts[()] = 0.
def inc_by(self, incr, *values):
if len(values) != self.dimension():
raise ValueError(
"Expected as many values to inc() as labels (%d)" % (self.dimension())
)
# TODO: should assert that the tag values are all strings
if values not in self.counts:
self.counts[values] = incr
else:
self.counts[values] += incr
def inc(self, *values):
self.inc_by(1, *values)
def render(self):
return flatten(
self._render_for_labels(k, self.counts[k])
for k in sorted(self.counts.keys())
)
class GaugeMetric(BaseMetric):
"""A metric that can go up or down
"""
def __init__(self, *args, **kwargs):
super(GaugeMetric, self).__init__(*args, **kwargs)
# dict[list[str]]: value for each set of label values. the keys are the
# label values, in the same order as the labels in self.labels.
#
# (if the metric is a scalar, the (single) key is the empty tuple).
self.guages = {}
def set(self, v, *values):
if len(values) != self.dimension():
raise ValueError(
"Expected as many values to inc() as labels (%d)" % (self.dimension())
)
# TODO: should assert that the tag values are all strings
self.guages[values] = v
def render(self):
return flatten(
self._render_for_labels(k, self.guages[k])
for k in sorted(self.guages.keys())
)
class CallbackMetric(BaseMetric):
"""A metric that returns the numeric value returned by a callback whenever
it is rendered. Typically this is used to implement gauges that yield the
size or other state of some in-memory object by actively querying it."""
def __init__(self, name, callback, labels=[]):
super(CallbackMetric, self).__init__(name, labels=labels)
self.callback = callback
def render(self):
try:
value = self.callback()
except Exception:
logger.exception("Failed to render %s", self.name)
return ["# FAILED to render " + self.name]
if self.is_scalar():
return list(self._render_for_labels([], value))
return flatten(
self._render_for_labels(k, value[k])
for k in sorted(value.keys())
)
class DistributionMetric(object):
"""A combination of an event counter and an accumulator, which counts
both the number of events and accumulates the total value. Typically this
could be used to keep track of method-running times, or other distributions
of values that occur in discrete occurrences.
TODO(paul): Try to export some heatmap-style stats?
"""
def __init__(self, name, *args, **kwargs):
self.counts = CounterMetric(name + ":count", **kwargs)
self.totals = CounterMetric(name + ":total", **kwargs)
def inc_by(self, inc, *values):
self.counts.inc(*values)
self.totals.inc_by(inc, *values)
def render(self):
return self.counts.render() + self.totals.render()
class CacheMetric(object):
__slots__ = (
"name", "cache_name", "hits", "misses", "evicted_size", "size_callback",
)
def __init__(self, name, size_callback, cache_name):
self.name = name
self.cache_name = cache_name
self.hits = 0
self.misses = 0
self.evicted_size = 0
self.size_callback = size_callback
def inc_hits(self):
self.hits += 1
def inc_misses(self):
self.misses += 1
def inc_evictions(self, size=1):
self.evicted_size += size
def render(self):
size = self.size_callback()
hits = self.hits
total = self.misses + self.hits
return [
"""%s:hits{name="%s"} %d""" % (self.name, self.cache_name, hits),
"""%s:total{name="%s"} %d""" % (self.name, self.cache_name, total),
"""%s:size{name="%s"} %d""" % (self.name, self.cache_name, size),
"""%s:evicted_size{name="%s"} %d""" % (
self.name, self.cache_name, self.evicted_size
),
]
class MemoryUsageMetric(object):
"""Keeps track of the current memory usage, using psutil.
The class will keep the current min/max/sum/counts of rss over the last
WINDOW_SIZE_SEC, by polling UPDATE_HZ times per second
"""
UPDATE_HZ = 2 # number of times to get memory per second
WINDOW_SIZE_SEC = 30 # the size of the window in seconds
def __init__(self, hs, psutil):
clock = hs.get_clock()
self.memory_snapshots = []
self.process = psutil.Process()
clock.looping_call(self._update_curr_values, 1000 / self.UPDATE_HZ)
def _update_curr_values(self):
max_size = self.UPDATE_HZ * self.WINDOW_SIZE_SEC
self.memory_snapshots.append(self.process.memory_info().rss)
self.memory_snapshots[:] = self.memory_snapshots[-max_size:]
def render(self):
if not self.memory_snapshots:
return []
max_rss = max(self.memory_snapshots)
min_rss = min(self.memory_snapshots)
sum_rss = sum(self.memory_snapshots)
len_rss = len(self.memory_snapshots)
return [
"process_psutil_rss:max %d" % max_rss,
"process_psutil_rss:min %d" % min_rss,
"process_psutil_rss:total %d" % sum_rss,
"process_psutil_rss:count %d" % len_rss,
]
def _escape_character(m):
"""Replaces a single character with its escape sequence.
Args:
m (re.MatchObject): A match object whose first group is the single
character to replace
Returns:
str
"""
c = m.group(1)
if c == "\\":
return "\\\\"
elif c == "\"":
return "\\\""
elif c == "\n":
return "\\n"
return c
def _escape_label_value(value):
"""Takes a label value and escapes quotes, newlines and backslashes
"""
return re.sub(r"([\n\"\\])", _escape_character, str(value))
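For readers following the migration: everything this module did by hand (label handling, escaping, text-format rendering) is what prometheus_client's Counter and Gauge types provide out of the box. A minimal sketch of the equivalent calls, using illustrative metric names that Synapse does not actually export:

from prometheus_client import Counter, Gauge

# Labelled counter, the role CounterMetric played above.
requests = Counter("example_requests_total", "Requests handled", ["method"])
requests.labels("GET").inc()       # old style: counter.inc("GET")
requests.labels("PUT").inc(3)      # old style: counter.inc_by(3, "PUT")

# Labelled gauge, the role GaugeMetric played above.
queue_depth = Gauge("example_queue_depth", "Items waiting", ["queue"])
queue_depth.labels("inbound").set(12)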

View File

@ -1,123 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2015, 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from six import iteritems
TICKS_PER_SEC = 100
BYTES_PER_PAGE = 4096
HAVE_PROC_STAT = os.path.exists("/proc/stat")
HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
HAVE_PROC_SELF_LIMITS = os.path.exists("/proc/self/limits")
HAVE_PROC_SELF_FD = os.path.exists("/proc/self/fd")
# Field indexes from /proc/self/stat, taken from the proc(5) manpage
STAT_FIELDS = {
"utime": 14,
"stime": 15,
"starttime": 22,
"vsize": 23,
"rss": 24,
}
stats = {}
# In order to report process_start_time_seconds we need to know the
# machine's boot time, because the value in /proc/self/stat is relative to
# this
boot_time = None
if HAVE_PROC_STAT:
with open("/proc/stat") as _procstat:
for line in _procstat:
if line.startswith("btime "):
boot_time = int(line.split()[1])
def update_resource_metrics():
if HAVE_PROC_SELF_STAT:
global stats
with open("/proc/self/stat") as s:
line = s.read()
# line is PID (command) more stats go here ...
raw_stats = line.split(") ", 1)[1].split(" ")
for (name, index) in iteritems(STAT_FIELDS):
# subtract 3 from the index, because proc(5) is 1-based, and
# we've lost the first two fields in PID and COMMAND above
stats[name] = int(raw_stats[index - 3])
def _count_fds():
# Not every OS will have a /proc/self/fd directory
if not HAVE_PROC_SELF_FD:
return 0
return len(os.listdir("/proc/self/fd"))
def register_process_collector(process_metrics):
process_metrics.register_collector(update_resource_metrics)
if HAVE_PROC_SELF_STAT:
process_metrics.register_callback(
"cpu_user_seconds_total",
lambda: float(stats["utime"]) / TICKS_PER_SEC
)
process_metrics.register_callback(
"cpu_system_seconds_total",
lambda: float(stats["stime"]) / TICKS_PER_SEC
)
process_metrics.register_callback(
"cpu_seconds_total",
lambda: (float(stats["utime"] + stats["stime"])) / TICKS_PER_SEC
)
process_metrics.register_callback(
"virtual_memory_bytes",
lambda: int(stats["vsize"])
)
process_metrics.register_callback(
"resident_memory_bytes",
lambda: int(stats["rss"]) * BYTES_PER_PAGE
)
process_metrics.register_callback(
"start_time_seconds",
lambda: boot_time + int(stats["starttime"]) / TICKS_PER_SEC
)
if HAVE_PROC_SELF_FD:
process_metrics.register_callback(
"open_fds",
lambda: _count_fds()
)
if HAVE_PROC_SELF_LIMITS:
def _get_max_fds():
with open("/proc/self/limits") as limits:
for line in limits:
if not line.startswith("Max open files "):
continue
# Line is Max open files $SOFT $HARD
return int(line.split()[3])
return None
process_metrics.register_callback(
"max_fds",
lambda: _get_max_fds()
)
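The /proc-scraping callbacks above overlap heavily with prometheus_client's built-in ProcessCollector, which on Linux is registered on the default registry at import time, so process_cpu_seconds_total, process_resident_memory_bytes, process_open_fds, process_max_fds and process_start_time_seconds come for free. A quick way to confirm that, assuming nothing beyond a stock prometheus_client install:

from prometheus_client import REGISTRY, generate_latest

# Dump the default registry; on Linux the process_* family appears without
# any explicit registration on our part.
print(generate_latest(REGISTRY).decode("ascii"))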

View File

@ -13,27 +13,4 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from twisted.web.resource import Resource
import synapse.metrics
METRICS_PREFIX = "/_synapse/metrics" METRICS_PREFIX = "/_synapse/metrics"
class MetricsResource(Resource):
isLeaf = True
def __init__(self, hs):
Resource.__init__(self) # Resource is old-style, so no super()
self.hs = hs
def render_GET(self, request):
response = synapse.metrics.render_all()
request.setHeader("Content-Type", "text/plain")
request.setHeader("Content-Length", str(len(response)))
# Encode as UTF-8 (default)
return response.encode()
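The replacement for this hand-rolled resource is prometheus_client.twisted.MetricsResource, which renders whatever registry it is given. A standalone sketch follows; the port and mount point are arbitrary choices for illustration, not Synapse's configuration:

from twisted.internet import reactor
from twisted.web.resource import Resource
from twisted.web.server import Site

from prometheus_client import REGISTRY
from prometheus_client.twisted import MetricsResource

root = Resource()
root.putChild(b"metrics", MetricsResource(registry=REGISTRY))

reactor.listenTCP(8000, Site(root))   # scrape at http://localhost:8000/metrics
reactor.run()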

View File

@ -28,22 +28,20 @@ from synapse.util.logcontext import PreserveLoggingContext, run_in_background
from synapse.util.metrics import Measure from synapse.util.metrics import Measure
from synapse.types import StreamToken from synapse.types import StreamToken
from synapse.visibility import filter_events_for_client from synapse.visibility import filter_events_for_client
import synapse.metrics from synapse.metrics import LaterGauge
from collections import namedtuple from collections import namedtuple
from prometheus_client import Counter
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
metrics = synapse.metrics.get_metrics_for(__name__) notified_events_counter = Counter("synapse_notifier_notified_events", "")
notified_events_counter = metrics.register_counter("notified_events") users_woken_by_stream_counter = Counter(
"synapse_notifier_users_woken_by_stream", "", ["stream"])
users_woken_by_stream_counter = metrics.register_counter(
"users_woken_by_stream", labels=["stream"]
)
# TODO(paul): Should be shared somewhere # TODO(paul): Should be shared somewhere
@ -108,7 +106,7 @@ class _NotifierUserStream(object):
self.last_notified_ms = time_now_ms self.last_notified_ms = time_now_ms
noify_deferred = self.notify_deferred noify_deferred = self.notify_deferred
users_woken_by_stream_counter.inc(stream_key) users_woken_by_stream_counter.labels(stream_key).inc()
with PreserveLoggingContext(): with PreserveLoggingContext():
self.notify_deferred = ObservableDeferred(defer.Deferred()) self.notify_deferred = ObservableDeferred(defer.Deferred())
@ -197,14 +195,14 @@ class Notifier(object):
all_user_streams.add(x) all_user_streams.add(x)
return sum(stream.count_listeners() for stream in all_user_streams) return sum(stream.count_listeners() for stream in all_user_streams)
metrics.register_callback("listeners", count_listeners) LaterGauge("synapse_notifier_listeners", "", [], count_listeners)
metrics.register_callback( LaterGauge(
"rooms", "synapse_notifier_rooms", "", [],
lambda: count(bool, self.room_to_user_streams.values()), lambda: count(bool, self.room_to_user_streams.values()),
) )
metrics.register_callback( LaterGauge(
"users", "synapse_notifier_users", "", [],
lambda: len(self.user_to_user_stream), lambda: len(self.user_to_user_stream),
) )
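LaterGauge here is a Synapse helper that evaluates a callback at scrape time; for the simple unlabelled cases, plain prometheus_client offers the same callback style via Gauge.set_function. A hedged sketch with made-up names:

from prometheus_client import Gauge

user_streams = {}   # stand-in for the notifier's user_to_user_stream map

# The gauge's value is recomputed from the callback on every scrape.
users_gauge = Gauge("example_notifier_users", "Users with active streams")
users_gauge.set_function(lambda: len(user_streams))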

View File

@ -22,14 +22,13 @@ from .push_rule_evaluator import PushRuleEvaluatorForEvent
from synapse.event_auth import get_user_power_level from synapse.event_auth import get_user_power_level
from synapse.api.constants import EventTypes, Membership from synapse.api.constants import EventTypes, Membership
from synapse.metrics import get_metrics_for from synapse.util.caches import register_cache
from synapse.util.caches import metrics as cache_metrics
from synapse.util.caches.descriptors import cached from synapse.util.caches.descriptors import cached
from synapse.util.async import Linearizer from synapse.util.async import Linearizer
from synapse.state import POWER_KEY from synapse.state import POWER_KEY
from collections import namedtuple from collections import namedtuple
from prometheus_client import Counter
from six import itervalues, iteritems from six import itervalues, iteritems
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -37,21 +36,18 @@ logger = logging.getLogger(__name__)
rules_by_room = {} rules_by_room = {}
push_metrics = get_metrics_for(__name__)
push_rules_invalidation_counter = push_metrics.register_counter( push_rules_invalidation_counter = Counter(
"push_rules_invalidation_counter" "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", "")
) push_rules_state_size_counter = Counter(
push_rules_state_size_counter = push_metrics.register_counter( "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", "")
"push_rules_state_size_counter"
)
# Measures whether we use the fast path of using state deltas, or if we have to # Measures whether we use the fast path of using state deltas, or if we have to
# recalculate from scratch # recalculate from scratch
push_rules_delta_state_cache_metric = cache_metrics.register_cache( push_rules_delta_state_cache_metric = register_cache(
"cache", "cache",
size_callback=lambda: 0, # Meaningless size, as this isn't a cache that stores values "push_rules_delta_state_cache_metric",
cache_name="push_rules_delta_state_cache_metric", cache=[], # Meaningless size, as this isn't a cache that stores values
) )
@ -65,10 +61,10 @@ class BulkPushRuleEvaluator(object):
self.store = hs.get_datastore() self.store = hs.get_datastore()
self.auth = hs.get_auth() self.auth = hs.get_auth()
self.room_push_rule_cache_metrics = cache_metrics.register_cache( self.room_push_rule_cache_metrics = register_cache(
"cache", "cache",
size_callback=lambda: 0, # There's not good value for this "room_push_rule_cache",
cache_name="room_push_rule_cache", cache=[], # Meaningless size, as this isn't a cache that stores values
) )
@defer.inlineCallbacks @defer.inlineCallbacks
@ -310,7 +306,7 @@ class RulesForRoom(object):
current_state_ids = context.current_state_ids current_state_ids = context.current_state_ids
push_rules_delta_state_cache_metric.inc_misses() push_rules_delta_state_cache_metric.inc_misses()
push_rules_state_size_counter.inc_by(len(current_state_ids)) push_rules_state_size_counter.inc(len(current_state_ids))
logger.debug( logger.debug(
"Looking for member changes in %r %r", state_group, current_state_ids "Looking for member changes in %r %r", state_group, current_state_ids

View File

@ -20,22 +20,17 @@ from twisted.internet.error import AlreadyCalled, AlreadyCancelled
from . import push_rule_evaluator from . import push_rule_evaluator
from . import push_tools from . import push_tools
import synapse
from synapse.push import PusherConfigException from synapse.push import PusherConfigException
from synapse.util.logcontext import LoggingContext from synapse.util.logcontext import LoggingContext
from synapse.util.metrics import Measure from synapse.util.metrics import Measure
from prometheus_client import Counter
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
metrics = synapse.metrics.get_metrics_for(__name__) http_push_processed_counter = Counter("synapse_http_httppusher_http_pushes_processed", "")
http_push_processed_counter = metrics.register_counter( http_push_failed_counter = Counter("synapse_http_httppusher_http_pushes_failed", "")
"http_pushes_processed",
)
http_push_failed_counter = metrics.register_counter(
"http_pushes_failed",
)
class HttpPusher(object): class HttpPusher(object):

View File

@ -152,7 +152,7 @@ class PushRuleEvaluatorForEvent(object):
# Caches (glob, word_boundary) -> regex for push. See _glob_matches # Caches (glob, word_boundary) -> regex for push. See _glob_matches
regex_cache = LruCache(50000 * CACHE_SIZE_FACTOR) regex_cache = LruCache(50000 * CACHE_SIZE_FACTOR)
register_cache("regex_push_cache", regex_cache) register_cache("cache", "regex_push_cache", regex_cache)
def _glob_matches(glob, value, word_boundary=False): def _glob_matches(glob, value, word_boundary=False):

View File

@ -56,6 +56,7 @@ REQUIREMENTS = {
"msgpack-python>=0.3.0": ["msgpack"], "msgpack-python>=0.3.0": ["msgpack"],
"phonenumbers>=8.2.0": ["phonenumbers"], "phonenumbers>=8.2.0": ["phonenumbers"],
"six": ["six"], "six": ["six"],
"prometheus_client": ["prometheus_client"],
} }
CONDITIONAL_REQUIREMENTS = { CONDITIONAL_REQUIREMENTS = {
"web_client": { "web_client": {

View File

@ -60,22 +60,21 @@ from .commands import (
) )
from .streams import STREAMS_MAP from .streams import STREAMS_MAP
from synapse.metrics import LaterGauge
from synapse.util.stringutils import random_string from synapse.util.stringutils import random_string
from synapse.metrics.metric import CounterMetric
import logging from prometheus_client import Counter
import synapse.metrics
import struct from collections import defaultdict
import fcntl
from six import iterkeys, iteritems from six import iterkeys, iteritems
metrics = synapse.metrics.get_metrics_for(__name__) import logging
import struct
connection_close_counter = metrics.register_counter( import fcntl
"close_reason", labels=["reason_type"],
)
connection_close_counter = Counter(
"synapse_replication_tcp_protocol_close_reason", "", ["reason_type"])
# A list of all connected protocols. This allows us to send metrics about the # A list of all connected protocols. This allows us to send metrics about the
# connections. # connections.
@ -137,12 +136,8 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
# The LoopingCall for sending pings. # The LoopingCall for sending pings.
self._send_ping_loop = None self._send_ping_loop = None
self.inbound_commands_counter = CounterMetric( self.inbound_commands_counter = defaultdict(int)
"inbound_commands", labels=["command"], self.outbound_commands_counter = defaultdict(int)
)
self.outbound_commands_counter = CounterMetric(
"outbound_commands", labels=["command"],
)
def connectionMade(self): def connectionMade(self):
logger.info("[%s] Connection established", self.id()) logger.info("[%s] Connection established", self.id())
@ -202,7 +197,8 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
self.last_received_command = self.clock.time_msec() self.last_received_command = self.clock.time_msec()
self.inbound_commands_counter.inc(cmd_name) self.inbound_commands_counter[cmd_name] = (
self.inbound_commands_counter[cmd_name] + 1)
cmd_cls = COMMAND_MAP[cmd_name] cmd_cls = COMMAND_MAP[cmd_name]
try: try:
@ -252,8 +248,8 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
self._queue_command(cmd) self._queue_command(cmd)
return return
self.outbound_commands_counter.inc(cmd.NAME) self.outbound_commands_counter[cmd.NAME] = (
self.outbound_commands_counter[cmd.NAME] + 1)
string = "%s %s" % (cmd.NAME, cmd.to_line(),) string = "%s %s" % (cmd.NAME, cmd.to_line(),)
if "\n" in string: if "\n" in string:
raise Exception("Unexpected newline in command: %r", string) raise Exception("Unexpected newline in command: %r", string)
@ -318,9 +314,9 @@ class BaseReplicationStreamProtocol(LineOnlyReceiver):
def connectionLost(self, reason): def connectionLost(self, reason):
logger.info("[%s] Replication connection closed: %r", self.id(), reason) logger.info("[%s] Replication connection closed: %r", self.id(), reason)
if isinstance(reason, Failure): if isinstance(reason, Failure):
connection_close_counter.inc(reason.type.__name__) connection_close_counter.labels(reason.type.__name__).inc()
else: else:
connection_close_counter.inc(reason.__class__.__name__) connection_close_counter.labels(reason.__class__.__name__).inc()
try: try:
# Remove us from list of connections to be monitored # Remove us from list of connections to be monitored
@ -519,7 +515,7 @@ class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
def on_RDATA(self, cmd): def on_RDATA(self, cmd):
stream_name = cmd.stream_name stream_name = cmd.stream_name
inbound_rdata_count.inc(stream_name) inbound_rdata_count.labels(stream_name).inc()
try: try:
row = STREAMS_MAP[stream_name].ROW_TYPE(*cmd.row) row = STREAMS_MAP[stream_name].ROW_TYPE(*cmd.row)
@ -567,14 +563,12 @@ class ClientReplicationStreamProtocol(BaseReplicationStreamProtocol):
# The following simply registers metrics for the replication connections # The following simply registers metrics for the replication connections
metrics.register_callback( pending_commands = LaterGauge(
"pending_commands", "pending_commands", "", ["name", "conn_id"],
lambda: { lambda: {
(p.name, p.conn_id): len(p.pending_commands) (p.name, p.conn_id): len(p.pending_commands)
for p in connected_connections for p in connected_connections
}, })
labels=["name", "conn_id"],
)
def transport_buffer_size(protocol): def transport_buffer_size(protocol):
@ -584,14 +578,12 @@ def transport_buffer_size(protocol):
return 0 return 0
metrics.register_callback( transport_send_buffer = LaterGauge(
"transport_send_buffer", "synapse_replication_tcp_transport_send_buffer", "", ["name", "conn_id"],
lambda: { lambda: {
(p.name, p.conn_id): transport_buffer_size(p) (p.name, p.conn_id): transport_buffer_size(p)
for p in connected_connections for p in connected_connections
}, })
labels=["name", "conn_id"],
)
def transport_kernel_read_buffer_size(protocol, read=True): def transport_kernel_read_buffer_size(protocol, read=True):
@ -609,48 +601,38 @@ def transport_kernel_read_buffer_size(protocol, read=True):
return 0 return 0
metrics.register_callback( tcp_transport_kernel_send_buffer = LaterGauge(
"transport_kernel_send_buffer", "synapse_replication_tcp_transport_kernel_send_buffer", "", ["name", "conn_id"],
lambda: { lambda: {
(p.name, p.conn_id): transport_kernel_read_buffer_size(p, False) (p.name, p.conn_id): transport_kernel_read_buffer_size(p, False)
for p in connected_connections for p in connected_connections
}, })
labels=["name", "conn_id"],
)
metrics.register_callback( tcp_transport_kernel_read_buffer = LaterGauge(
"transport_kernel_read_buffer", "synapse_replication_tcp_transport_kernel_read_buffer", "", ["name", "conn_id"],
lambda: { lambda: {
(p.name, p.conn_id): transport_kernel_read_buffer_size(p, True) (p.name, p.conn_id): transport_kernel_read_buffer_size(p, True)
for p in connected_connections for p in connected_connections
}, })
labels=["name", "conn_id"],
)
metrics.register_callback( tcp_inbound_commands = LaterGauge(
"inbound_commands", "synapse_replication_tcp_inbound_commands", "", ["command", "name", "conn_id"],
lambda: { lambda: {
(k[0], p.name, p.conn_id): count (k[0], p.name, p.conn_id): count
for p in connected_connections for p in connected_connections
for k, count in iteritems(p.inbound_commands_counter.counts) for k, count in iteritems(p.inbound_commands_counter.counts)
}, })
labels=["command", "name", "conn_id"],
)
metrics.register_callback( tcp_outbound_commands = LaterGauge(
"outbound_commands", "synapse_replication_tcp_outbound_commands", "", ["command", "name", "conn_id"],
lambda: { lambda: {
(k[0], p.name, p.conn_id): count (k[0], p.name, p.conn_id): count
for p in connected_connections for p in connected_connections
for k, count in iteritems(p.outbound_commands_counter.counts) for k, count in iteritems(p.outbound_commands_counter.counts)
}, })
labels=["command", "name", "conn_id"],
)
# number of updates received for each RDATA stream # number of updates received for each RDATA stream
inbound_rdata_count = metrics.register_counter( inbound_rdata_count = Counter("synapse_replication_tcp_inbound_rdata_count", "",
"inbound_rdata_count", ["stream_name"])
labels=["stream_name"],
)
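The multi-label LaterGauges above return a dict keyed by label tuples; in raw prometheus_client the same effect is a custom collector yielding a GaugeMetricFamily at scrape time. A self-contained sketch, where the Connection class and metric name are illustrative only:

from prometheus_client.core import REGISTRY, GaugeMetricFamily


class Connection(object):
    def __init__(self, name, conn_id):
        self.name = name
        self.conn_id = conn_id
        self.pending_commands = []


connections = [Connection("client", "conn-1")]


class PendingCommandsCollector(object):
    def collect(self):
        # One sample per (name, conn_id) pair, computed when Prometheus scrapes.
        g = GaugeMetricFamily(
            "example_pending_commands", "Commands queued per connection",
            labels=["name", "conn_id"],
        )
        for conn in connections:
            g.add_metric([conn.name, conn.conn_id], len(conn.pending_commands))
        yield g


REGISTRY.register(PendingCommandsCollector())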

View File

@ -22,21 +22,21 @@ from .streams import STREAMS_MAP, FederationStream
from .protocol import ServerReplicationStreamProtocol from .protocol import ServerReplicationStreamProtocol
from synapse.util.metrics import Measure, measure_func from synapse.util.metrics import Measure, measure_func
from synapse.metrics import LaterGauge
import logging import logging
import synapse.metrics
from prometheus_client import Counter
from six import itervalues from six import itervalues
metrics = synapse.metrics.get_metrics_for(__name__) stream_updates_counter = Counter("synapse_replication_tcp_resource_stream_updates",
stream_updates_counter = metrics.register_counter( "", ["stream_name"])
"stream_updates", labels=["stream_name"] user_sync_counter = Counter("synapse_replication_tcp_resource_user_sync", "")
) federation_ack_counter = Counter("synapse_replication_tcp_resource_federation_ack", "")
user_sync_counter = metrics.register_counter("user_sync") remove_pusher_counter = Counter("synapse_replication_tcp_resource_remove_pusher", "")
federation_ack_counter = metrics.register_counter("federation_ack") invalidate_cache_counter = Counter("synapse_replication_tcp_resource_invalidate_cache",
remove_pusher_counter = metrics.register_counter("remove_pusher") "")
invalidate_cache_counter = metrics.register_counter("invalidate_cache") user_ip_cache_counter = Counter("synapse_replication_tcp_resource_user_ip_cache", "")
user_ip_cache_counter = metrics.register_counter("user_ip_cache")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -75,7 +75,8 @@ class ReplicationStreamer(object):
# Current connections. # Current connections.
self.connections = [] self.connections = []
metrics.register_callback("total_connections", lambda: len(self.connections)) LaterGauge("synapse_replication_tcp_resource_total_connections", "", [],
lambda: len(self.connections))
# List of streams that clients can subscribe to. # List of streams that clients can subscribe to.
# We only support federation stream if federation sending has been # We only support federation stream if federation sending has been
@ -87,17 +88,16 @@ class ReplicationStreamer(object):
self.streams_by_name = {stream.NAME: stream for stream in self.streams} self.streams_by_name = {stream.NAME: stream for stream in self.streams}
metrics.register_callback( LaterGauge(
"connections_per_stream", "synapse_replication_tcp_resource_connections_per_stream", "",
["stream_name"],
lambda: { lambda: {
(stream_name,): len([ (stream_name,): len([
conn for conn in self.connections conn for conn in self.connections
if stream_name in conn.replication_streams if stream_name in conn.replication_streams
]) ])
for stream_name in self.streams_by_name for stream_name in self.streams_by_name
}, })
labels=["stream_name"],
)
self.federation_sender = None self.federation_sender = None
if not hs.config.send_federation: if not hs.config.send_federation:
@ -177,7 +177,7 @@ class ReplicationStreamer(object):
logger.info( logger.info(
"Streaming: %s -> %s", stream.NAME, updates[-1][0] "Streaming: %s -> %s", stream.NAME, updates[-1][0]
) )
stream_updates_counter.inc_by(len(updates), stream.NAME) stream_updates_counter.labels(stream.NAME).inc(len(updates))
# Some streams return multiple rows with the same stream IDs, # Some streams return multiple rows with the same stream IDs,
# we need to make sure they get sent out in batches. We do # we need to make sure they get sent out in batches. We do

View File

@ -18,8 +18,8 @@ from synapse.api.errors import StoreError
from synapse.util.logcontext import LoggingContext, PreserveLoggingContext from synapse.util.logcontext import LoggingContext, PreserveLoggingContext
from synapse.util.caches.descriptors import Cache from synapse.util.caches.descriptors import Cache
from synapse.storage.engines import PostgresEngine from synapse.storage.engines import PostgresEngine
import synapse.metrics
from prometheus_client import Histogram
from twisted.internet import defer from twisted.internet import defer
@ -42,13 +42,10 @@ sql_logger = logging.getLogger("synapse.storage.SQL")
transaction_logger = logging.getLogger("synapse.storage.txn") transaction_logger = logging.getLogger("synapse.storage.txn")
perf_logger = logging.getLogger("synapse.storage.TIME") perf_logger = logging.getLogger("synapse.storage.TIME")
sql_scheduling_timer = Histogram("synapse_storage_schedule_time", "sec")
metrics = synapse.metrics.get_metrics_for("synapse.storage") sql_query_timer = Histogram("synapse_storage_query_time", "sec", ["verb"])
sql_txn_timer = Histogram("synapse_storage_transaction_time", "sec", ["desc"])
sql_scheduling_timer = metrics.register_distribution("schedule_time")
sql_query_timer = metrics.register_distribution("query_time", labels=["verb"])
sql_txn_timer = metrics.register_distribution("transaction_time", labels=["desc"])
class LoggingTransaction(object): class LoggingTransaction(object):
@ -113,7 +110,7 @@ class LoggingTransaction(object):
# Don't let logging failures stop SQL from working # Don't let logging failures stop SQL from working
pass pass
start = time.time() * 1000 start = time.time()
try: try:
return func( return func(
@ -123,9 +120,9 @@ class LoggingTransaction(object):
logger.debug("[SQL FAIL] {%s} %s", self.name, e) logger.debug("[SQL FAIL] {%s} %s", self.name, e)
raise raise
finally: finally:
msecs = (time.time() * 1000) - start secs = time.time() - start
sql_logger.debug("[SQL time] {%s} %f", self.name, msecs) sql_logger.debug("[SQL time] {%s} %f sec", self.name, secs)
sql_query_timer.inc_by(msecs, sql.split()[0]) sql_query_timer.labels(sql.split()[0]).observe(secs)
class PerformanceCounters(object): class PerformanceCounters(object):
@ -135,7 +132,7 @@ class PerformanceCounters(object):
def update(self, key, start_time, end_time=None): def update(self, key, start_time, end_time=None):
if end_time is None: if end_time is None:
end_time = time.time() * 1000 end_time = time.time()
duration = end_time - start_time duration = end_time - start_time
count, cum_time = self.current_counters.get(key, (0, 0)) count, cum_time = self.current_counters.get(key, (0, 0))
count += 1 count += 1
@ -225,7 +222,7 @@ class SQLBaseStore(object):
def _new_transaction(self, conn, desc, after_callbacks, exception_callbacks, def _new_transaction(self, conn, desc, after_callbacks, exception_callbacks,
logging_context, func, *args, **kwargs): logging_context, func, *args, **kwargs):
start = time.time() * 1000 start = time.time()
txn_id = self._TXN_ID txn_id = self._TXN_ID
# We don't really need these to be unique, so lets stop it from # We don't really need these to be unique, so lets stop it from
@ -285,17 +282,17 @@ class SQLBaseStore(object):
logger.debug("[TXN FAIL] {%s} %s", name, e) logger.debug("[TXN FAIL] {%s} %s", name, e)
raise raise
finally: finally:
end = time.time() * 1000 end = time.time()
duration = end - start duration = end - start
if logging_context is not None: if logging_context is not None:
logging_context.add_database_transaction(duration) logging_context.add_database_transaction(duration)
transaction_logger.debug("[TXN END] {%s} %f", name, duration) transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)
self._current_txn_total_time += duration self._current_txn_total_time += duration
self._txn_perf_counters.update(desc, start, end) self._txn_perf_counters.update(desc, start, end)
sql_txn_timer.inc_by(duration, desc) sql_txn_timer.labels(desc).observe(duration)
@defer.inlineCallbacks @defer.inlineCallbacks
def runInteraction(self, desc, func, *args, **kwargs): def runInteraction(self, desc, func, *args, **kwargs):
@ -352,13 +349,13 @@ class SQLBaseStore(object):
""" """
current_context = LoggingContext.current_context() current_context = LoggingContext.current_context()
start_time = time.time() * 1000 start_time = time.time()
def inner_func(conn, *args, **kwargs): def inner_func(conn, *args, **kwargs):
with LoggingContext("runWithConnection") as context: with LoggingContext("runWithConnection") as context:
sched_duration_ms = time.time() * 1000 - start_time sched_duration_sec = time.time() - start_time
sql_scheduling_timer.inc_by(sched_duration_ms) sql_scheduling_timer.observe(sched_duration_sec)
current_context.add_database_scheduled(sched_duration_ms) current_context.add_database_scheduled(sched_duration_sec)
if self.database_engine.is_connection_closed(conn): if self.database_engine.is_connection_closed(conn):
logger.debug("Reconnecting closed database connection") logger.debug("Reconnecting closed database connection")
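The Histogram usage introduced above follows the standard pattern of observing durations in seconds against a labelled timer; a small sketch with an illustrative metric name:

import time

from prometheus_client import Histogram

query_timer = Histogram("example_query_time_seconds", "SQL query duration", ["verb"])

start = time.time()
time.sleep(0.01)   # stand-in for actually running the query
query_timer.labels("SELECT").observe(time.time() - start)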

View File

@ -40,30 +40,27 @@ import synapse.metrics
from synapse.events import EventBase # noqa: F401 from synapse.events import EventBase # noqa: F401
from synapse.events.snapshot import EventContext # noqa: F401 from synapse.events.snapshot import EventContext # noqa: F401
from prometheus_client import Counter
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
persist_event_counter = Counter("synapse_storage_events_persisted_events", "")
metrics = synapse.metrics.get_metrics_for(__name__) event_counter = Counter("synapse_storage_events_persisted_events_sep", "",
persist_event_counter = metrics.register_counter("persisted_events") ["type", "origin_type", "origin_entity"])
event_counter = metrics.register_counter(
"persisted_events_sep", labels=["type", "origin_type", "origin_entity"]
)
# The number of times we are recalculating the current state # The number of times we are recalculating the current state
state_delta_counter = metrics.register_counter( state_delta_counter = Counter("synapse_storage_events_state_delta", "")
"state_delta",
)
# The number of times we are recalculating state when there is only a # The number of times we are recalculating state when there is only a
# single forward extremity # single forward extremity
state_delta_single_event_counter = metrics.register_counter( state_delta_single_event_counter = Counter(
"state_delta_single_event", "synapse_storage_events_state_delta_single_event", "")
)
# The number of times we are recalculating state when we could have reasonably # calculated the delta when we calculated the state for an event we were
# calculated the delta when we calculated the state for an event we were # calculated the delta when we calculated the state for an event we were
# persisting. # persisting.
state_delta_reuse_delta_counter = metrics.register_counter( state_delta_reuse_delta_counter = Counter(
"state_delta_reuse_delta", "synapse_storage_events_state_delta_reuse_delta", "")
)
def encode_json(json_object): def encode_json(json_object):
@ -445,7 +442,7 @@ class EventsStore(EventsWorkerStore):
state_delta_for_room=state_delta_for_room, state_delta_for_room=state_delta_for_room,
new_forward_extremeties=new_forward_extremeties, new_forward_extremeties=new_forward_extremeties,
) )
persist_event_counter.inc_by(len(chunk)) persist_event_counter.inc(len(chunk))
synapse.metrics.event_persisted_position.set( synapse.metrics.event_persisted_position.set(
chunk[-1][0].internal_metadata.stream_ordering, chunk[-1][0].internal_metadata.stream_ordering,
) )
@ -460,7 +457,7 @@ class EventsStore(EventsWorkerStore):
origin_type = "remote" origin_type = "remote"
origin_entity = get_domain_from_id(event.sender) origin_entity = get_domain_from_id(event.sender)
event_counter.inc(event.type, origin_type, origin_entity) event_counter.labels(event.type, origin_type, origin_entity).inc()
for room_id, new_state in current_state_for_room.iteritems(): for room_id, new_state in current_state_for_room.iteritems():
self.get_current_state_ids.prefill( self.get_current_state_ids.prefill(

View File

@ -13,7 +13,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import synapse.metrics from prometheus_client.core import Gauge, REGISTRY, GaugeMetricFamily
import os import os
from six.moves import intern from six.moves import intern
@ -21,23 +22,68 @@ import six
CACHE_SIZE_FACTOR = float(os.environ.get("SYNAPSE_CACHE_FACTOR", 0.5)) CACHE_SIZE_FACTOR = float(os.environ.get("SYNAPSE_CACHE_FACTOR", 0.5))
metrics = synapse.metrics.get_metrics_for("synapse.util.caches")
caches_by_name = {} caches_by_name = {}
# cache_counter = metrics.register_cache( collectors_by_name = {}
# "cache",
# lambda: {(name,): len(caches_by_name[name]) for name in caches_by_name.keys()}, cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
# labels=["name"], cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
# ) cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name"])
cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
response_cache_evicted = Gauge(
"synapse_util_caches_response_cache:evicted_size", "", ["name"]
)
response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
def register_cache(name, cache): def register_cache(cache_type, cache_name, cache):
caches_by_name[name] = cache
return metrics.register_cache( # Check if the metric is already registered. Unregister it, if so.
"cache", # This usually happens during tests, as at runtime these caches are
lambda: len(cache), # effectively singletons.
name, metric_name = "cache_%s_%s" % (cache_type, cache_name)
) if metric_name in collectors_by_name.keys():
REGISTRY.unregister(collectors_by_name[metric_name])
class CacheMetric(object):
hits = 0
misses = 0
evicted_size = 0
def inc_hits(self):
self.hits += 1
def inc_misses(self):
self.misses += 1
def inc_evictions(self, size=1):
self.evicted_size += size
def describe(self):
return []
def collect(self):
if cache_type == "response_cache":
response_cache_size.labels(cache_name).set(len(cache))
response_cache_hits.labels(cache_name).set(self.hits)
response_cache_evicted.labels(cache_name).set(self.evicted_size)
response_cache_total.labels(cache_name).set(self.hits + self.misses)
else:
cache_size.labels(cache_name).set(len(cache))
cache_hits.labels(cache_name).set(self.hits)
cache_evicted.labels(cache_name).set(self.evicted_size)
cache_total.labels(cache_name).set(self.hits + self.misses)
yield GaugeMetricFamily("__unused", "")
metric = CacheMetric()
REGISTRY.register(metric)
caches_by_name[cache_name] = cache
collectors_by_name[metric_name] = metric
return metric
KNOWN_KEYS = { KNOWN_KEYS = {
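A usage sketch of the new register_cache() signature above; the cache name is made up, and the computation is a stand-in:

from synapse.util.caches import register_cache

cache = {}
cache_metrics = register_cache("cache", "example_lookup_cache", cache)


def lookup(key):
    if key in cache:
        cache_metrics.inc_hits()
    else:
        cache_metrics.inc_misses()
        cache[key] = key.upper()   # stand-in for the real computation
    return cache[key]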

View File

@ -80,7 +80,7 @@ class Cache(object):
self.name = name self.name = name
self.keylen = keylen self.keylen = keylen
self.thread = None self.thread = None
self.metrics = register_cache(name, self.cache) self.metrics = register_cache("cache", name, self.cache)
def _on_evicted(self, evicted_count): def _on_evicted(self, evicted_count):
self.metrics.inc_evictions(evicted_count) self.metrics.inc_evictions(evicted_count)

View File

@ -55,7 +55,7 @@ class DictionaryCache(object):
__slots__ = [] __slots__ = []
self.sentinel = Sentinel() self.sentinel = Sentinel()
self.metrics = register_cache(name, self.cache) self.metrics = register_cache("dictionary", name, self.cache)
def check_thread(self): def check_thread(self):
expected_thread = self.thread expected_thread = self.thread

View File

@ -52,12 +52,12 @@ class ExpiringCache(object):
self._cache = OrderedDict() self._cache = OrderedDict()
self.metrics = register_cache(cache_name, self)
self.iterable = iterable self.iterable = iterable
self._size_estimate = 0 self._size_estimate = 0
self.metrics = register_cache("expiring", cache_name, self)
def start(self): def start(self):
if not self._expiry_ms: if not self._expiry_ms:
# Don't bother starting the loop if things never expire # Don't bother starting the loop if things never expire

View File

@ -17,7 +17,7 @@ import logging
from twisted.internet import defer from twisted.internet import defer
from synapse.util.async import ObservableDeferred from synapse.util.async import ObservableDeferred
from synapse.util.caches import metrics as cache_metrics from synapse.util.caches import register_cache
from synapse.util.logcontext import make_deferred_yieldable, run_in_background from synapse.util.logcontext import make_deferred_yieldable, run_in_background
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -38,15 +38,16 @@ class ResponseCache(object):
self.timeout_sec = timeout_ms / 1000. self.timeout_sec = timeout_ms / 1000.
self._name = name self._name = name
self._metrics = cache_metrics.register_cache( self._metrics = register_cache(
"response_cache", "response_cache", name, self
size_callback=lambda: self.size(),
cache_name=name,
) )
def size(self): def size(self):
return len(self.pending_result_cache) return len(self.pending_result_cache)
def __len__(self):
return self.size()
def get(self, key): def get(self, key):
"""Look up the given key. """Look up the given key.

View File

@ -38,7 +38,7 @@ class StreamChangeCache(object):
self._cache = sorteddict() self._cache = sorteddict()
self._earliest_known_stream_pos = current_stream_pos self._earliest_known_stream_pos = current_stream_pos
self.name = name self.name = name
self.metrics = register_cache(self.name, self._cache) self.metrics = register_cache("cache", self.name, self._cache)
for entity, stream_pos in prefilled_cache.items(): for entity, stream_pos in prefilled_cache.items():
self.entity_has_changed(entity, stream_pos) self.entity_has_changed(entity, stream_pos)

View File

@ -59,7 +59,7 @@ class LoggingContext(object):
__slots__ = [ __slots__ = [
"previous_context", "name", "ru_stime", "ru_utime", "previous_context", "name", "ru_stime", "ru_utime",
"db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms", "db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
"usage_start", "usage_start",
"main_thread", "alive", "main_thread", "alive",
"request", "tag", "request", "tag",
@ -84,10 +84,10 @@ class LoggingContext(object):
def stop(self): def stop(self):
pass pass
def add_database_transaction(self, duration_ms): def add_database_transaction(self, duration_sec):
pass pass
def add_database_scheduled(self, sched_ms): def add_database_scheduled(self, sched_sec):
pass pass
def __nonzero__(self): def __nonzero__(self):
@ -103,11 +103,11 @@ class LoggingContext(object):
self.ru_utime = 0. self.ru_utime = 0.
self.db_txn_count = 0 self.db_txn_count = 0
# ms spent waiting for db txns, excluding scheduling time # sec spent waiting for db txns, excluding scheduling time
self.db_txn_duration_ms = 0 self.db_txn_duration_sec = 0
# ms spent waiting for db txns to be scheduled # sec spent waiting for db txns to be scheduled
self.db_sched_duration_ms = 0 self.db_sched_duration_sec = 0
# If alive has the thread resource usage when the logcontext last # If alive has the thread resource usage when the logcontext last
# became active. # became active.
@ -230,18 +230,18 @@ class LoggingContext(object):
return ru_utime, ru_stime return ru_utime, ru_stime
def add_database_transaction(self, duration_ms): def add_database_transaction(self, duration_sec):
self.db_txn_count += 1 self.db_txn_count += 1
self.db_txn_duration_ms += duration_ms self.db_txn_duration_sec += duration_sec
def add_database_scheduled(self, sched_ms): def add_database_scheduled(self, sched_sec):
"""Record a use of the database pool """Record a use of the database pool
Args: Args:
sched_ms (int): number of milliseconds it took us to get a sched_sec (float): number of seconds it took us to get a
connection connection
""" """
self.db_sched_duration_ms += sched_ms self.db_sched_duration_sec += sched_sec
class LoggingContextFilter(logging.Filter): class LoggingContextFilter(logging.Filter):

View File

@ -96,7 +96,7 @@ def time_function(f):
id = _TIME_FUNC_ID id = _TIME_FUNC_ID
_TIME_FUNC_ID += 1 _TIME_FUNC_ID += 1
start = time.clock() * 1000 start = time.clock()
try: try:
_log_debug_as_f( _log_debug_as_f(
@ -107,10 +107,10 @@ def time_function(f):
r = f(*args, **kwargs) r = f(*args, **kwargs)
finally: finally:
end = time.clock() * 1000 end = time.clock()
_log_debug_as_f( _log_debug_as_f(
f, f,
"[FUNC END] {%s-%d} %f", "[FUNC END] {%s-%d} %.3f sec",
(func_name, id, end - start,), (func_name, id, end - start,),
) )

View File

@ -15,8 +15,8 @@
from twisted.internet import defer from twisted.internet import defer
from prometheus_client import Counter
from synapse.util.logcontext import LoggingContext from synapse.util.logcontext import LoggingContext
import synapse.metrics
from functools import wraps from functools import wraps
import logging import logging
@ -24,66 +24,26 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
block_counter = Counter("synapse_util_metrics_block_count", "", ["block_name"])
metrics = synapse.metrics.get_metrics_for(__name__) block_timer = Counter("synapse_util_metrics_block_time_seconds", "", ["block_name"])
# total number of times we have hit this block block_ru_utime = Counter(
block_counter = metrics.register_counter( "synapse_util_metrics_block_ru_utime_seconds", "", ["block_name"])
"block_count",
labels=["block_name"],
alternative_names=(
# the following are all deprecated aliases for the same metric
metrics.name_prefix + x for x in (
"_block_timer:count",
"_block_ru_utime:count",
"_block_ru_stime:count",
"_block_db_txn_count:count",
"_block_db_txn_duration:count",
)
)
)
block_timer = metrics.register_counter( block_ru_stime = Counter(
"block_time_seconds", "synapse_util_metrics_block_ru_stime_seconds", "", ["block_name"])
labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_timer:total",
),
)
block_ru_utime = metrics.register_counter( block_db_txn_count = Counter(
"block_ru_utime_seconds", labels=["block_name"], "synapse_util_metrics_block_db_txn_count", "", ["block_name"])
alternative_names=(
metrics.name_prefix + "_block_ru_utime:total",
),
)
block_ru_stime = metrics.register_counter(
"block_ru_stime_seconds", labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_ru_stime:total",
),
)
block_db_txn_count = metrics.register_counter(
"block_db_txn_count", labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_db_txn_count:total",
),
)
# seconds spent waiting for db txns, excluding scheduling time, in this block # seconds spent waiting for db txns, excluding scheduling time, in this block
block_db_txn_duration = metrics.register_counter( block_db_txn_duration = Counter(
"block_db_txn_duration_seconds", labels=["block_name"], "synapse_util_metrics_block_db_txn_duration_seconds", "", ["block_name"])
alternative_names=(
metrics.name_prefix + "_block_db_txn_duration:total",
),
)
# seconds spent waiting for a db connection, in this block # seconds spent waiting for a db connection, in this block
block_db_sched_duration = metrics.register_counter( block_db_sched_duration = Counter(
"block_db_sched_duration_seconds", labels=["block_name"], "synapse_util_metrics_block_db_sched_duration_seconds", "", ["block_name"])
)
def measure_func(name): def measure_func(name):
@ -102,7 +62,7 @@ class Measure(object):
__slots__ = [ __slots__ = [
"clock", "name", "start_context", "start", "new_context", "ru_utime", "clock", "name", "start_context", "start", "new_context", "ru_utime",
"ru_stime", "ru_stime",
"db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms", "db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
"created_context", "created_context",
] ]
@ -114,7 +74,7 @@ class Measure(object):
self.created_context = False self.created_context = False
def __enter__(self): def __enter__(self):
self.start = self.clock.time_msec() self.start = self.clock.time()
self.start_context = LoggingContext.current_context() self.start_context = LoggingContext.current_context()
if not self.start_context: if not self.start_context:
self.start_context = LoggingContext("Measure") self.start_context = LoggingContext("Measure")
@ -123,17 +83,17 @@ class Measure(object):
self.ru_utime, self.ru_stime = self.start_context.get_resource_usage() self.ru_utime, self.ru_stime = self.start_context.get_resource_usage()
self.db_txn_count = self.start_context.db_txn_count self.db_txn_count = self.start_context.db_txn_count
self.db_txn_duration_ms = self.start_context.db_txn_duration_ms self.db_txn_duration_sec = self.start_context.db_txn_duration_sec
self.db_sched_duration_ms = self.start_context.db_sched_duration_ms self.db_sched_duration_sec = self.start_context.db_sched_duration_sec
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
if isinstance(exc_type, Exception) or not self.start_context: if isinstance(exc_type, Exception) or not self.start_context:
return return
duration = self.clock.time_msec() - self.start duration = self.clock.time() - self.start
block_counter.inc(self.name) block_counter.labels(self.name).inc()
block_timer.inc_by(duration, self.name) block_timer.labels(self.name).inc(duration)
context = LoggingContext.current_context() context = LoggingContext.current_context()
@ -150,19 +110,13 @@ class Measure(object):
ru_utime, ru_stime = context.get_resource_usage() ru_utime, ru_stime = context.get_resource_usage()
block_ru_utime.inc_by(ru_utime - self.ru_utime, self.name) block_ru_utime.labels(self.name).inc(ru_utime - self.ru_utime)
block_ru_stime.inc_by(ru_stime - self.ru_stime, self.name) block_ru_stime.labels(self.name).inc(ru_stime - self.ru_stime)
block_db_txn_count.inc_by( block_db_txn_count.labels(self.name).inc(context.db_txn_count - self.db_txn_count)
context.db_txn_count - self.db_txn_count, self.name block_db_txn_duration.labels(self.name).inc(
) context.db_txn_duration_sec - self.db_txn_duration_sec)
block_db_txn_duration.inc_by( block_db_sched_duration.labels(self.name).inc(
(context.db_txn_duration_ms - self.db_txn_duration_ms) / 1000., context.db_sched_duration_sec - self.db_sched_duration_sec)
self.name
)
block_db_sched_duration.inc_by(
(context.db_sched_duration_ms - self.db_sched_duration_ms) / 1000.,
self.name
)
if self.created_context: if self.created_context:
self.start_context.__exit__(exc_type, exc_val, exc_tb) self.start_context.__exit__(exc_type, exc_val, exc_tb)
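The timing pattern Measure now relies on is a labelled Counter accumulating seconds rather than milliseconds, taking advantage of inc() accepting a float amount. Reduced to its essentials, with an illustrative block name:

import time

from prometheus_client import Counter

block_time = Counter("example_block_time_seconds", "Time spent in block", ["block_name"])

start = time.time()
time.sleep(0.01)   # the measured block
block_time.labels("example_block").inc(time.time() - start)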

View File

@ -12,3 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from twisted.trial import util
util.DEFAULT_TIMEOUT_DURATION = 10

View File

@ -1,192 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2015, 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tests import unittest
from synapse.metrics.metric import (
CounterMetric, CallbackMetric, DistributionMetric, CacheMetric,
_escape_label_value,
)
class CounterMetricTestCase(unittest.TestCase):
def test_scalar(self):
counter = CounterMetric("scalar")
self.assertEquals(counter.render(), [
'scalar 0',
])
counter.inc()
self.assertEquals(counter.render(), [
'scalar 1',
])
counter.inc_by(2)
self.assertEquals(counter.render(), [
'scalar 3'
])
def test_vector(self):
counter = CounterMetric("vector", labels=["method"])
# Empty counter doesn't yet know what values it has
self.assertEquals(counter.render(), [])
counter.inc("GET")
self.assertEquals(counter.render(), [
'vector{method="GET"} 1',
])
counter.inc("GET")
counter.inc("PUT")
self.assertEquals(counter.render(), [
'vector{method="GET"} 2',
'vector{method="PUT"} 1',
])
class CallbackMetricTestCase(unittest.TestCase):
def test_scalar(self):
d = dict()
metric = CallbackMetric("size", lambda: len(d))
self.assertEquals(metric.render(), [
'size 0',
])
d["key"] = "value"
self.assertEquals(metric.render(), [
'size 1',
])
def test_vector(self):
vals = dict()
metric = CallbackMetric("values", lambda: vals, labels=["type"])
self.assertEquals(metric.render(), [])
# Keys have to be tuples, even if they're 1-element
vals[("foo",)] = 1
vals[("bar",)] = 2
self.assertEquals(metric.render(), [
'values{type="bar"} 2',
'values{type="foo"} 1',
])
class DistributionMetricTestCase(unittest.TestCase):
def test_scalar(self):
metric = DistributionMetric("thing")
self.assertEquals(metric.render(), [
'thing:count 0',
'thing:total 0',
])
metric.inc_by(500)
self.assertEquals(metric.render(), [
'thing:count 1',
'thing:total 500',
])
def test_vector(self):
metric = DistributionMetric("queries", labels=["verb"])
self.assertEquals(metric.render(), [])
metric.inc_by(300, "SELECT")
metric.inc_by(200, "SELECT")
metric.inc_by(800, "INSERT")
self.assertEquals(metric.render(), [
'queries:count{verb="INSERT"} 1',
'queries:count{verb="SELECT"} 2',
'queries:total{verb="INSERT"} 800',
'queries:total{verb="SELECT"} 500',
])
class CacheMetricTestCase(unittest.TestCase):
def test_cache(self):
d = dict()
metric = CacheMetric("cache", lambda: len(d), "cache_name")
self.assertEquals(metric.render(), [
'cache:hits{name="cache_name"} 0',
'cache:total{name="cache_name"} 0',
'cache:size{name="cache_name"} 0',
'cache:evicted_size{name="cache_name"} 0',
])
metric.inc_misses()
d["key"] = "value"
self.assertEquals(metric.render(), [
'cache:hits{name="cache_name"} 0',
'cache:total{name="cache_name"} 1',
'cache:size{name="cache_name"} 1',
'cache:evicted_size{name="cache_name"} 0',
])
metric.inc_hits()
self.assertEquals(metric.render(), [
'cache:hits{name="cache_name"} 1',
'cache:total{name="cache_name"} 2',
'cache:size{name="cache_name"} 1',
'cache:evicted_size{name="cache_name"} 0',
])
metric.inc_evictions(2)
self.assertEquals(metric.render(), [
'cache:hits{name="cache_name"} 1',
'cache:total{name="cache_name"} 2',
'cache:size{name="cache_name"} 1',
'cache:evicted_size{name="cache_name"} 2',
])
class LabelValueEscapeTestCase(unittest.TestCase):
def test_simple(self):
string = "safjhsdlifhyskljfksdfh"
self.assertEqual(string, _escape_label_value(string))
def test_escape(self):
self.assertEqual(
"abc\\\"def\\nghi\\\\",
_escape_label_value("abc\"def\nghi\\"),
)
def test_sequence_of_escapes(self):
self.assertEqual(
"abc\\\"def\\nghi\\\\\\n",
_escape_label_value("abc\"def\nghi\\\n"),
)

View File

@ -51,7 +51,7 @@ usedevelop=true
commands = commands =
/usr/bin/find "{toxinidir}" -name '*.pyc' -delete /usr/bin/find "{toxinidir}" -name '*.pyc' -delete
coverage run {env:COVERAGE_OPTS:} --source="{toxinidir}/synapse" \ coverage run {env:COVERAGE_OPTS:} --source="{toxinidir}/synapse" \
"{envbindir}/trial" {env:TRIAL_FLAGS:} {posargs:tests/metrics tests/config} \ "{envbindir}/trial" {env:TRIAL_FLAGS:} {posargs:tests/config} \
{env:TOXSUFFIX:} {env:TOXSUFFIX:}
{env:DUMP_COVERAGE_COMMAND:coverage report -m} {env:DUMP_COVERAGE_COMMAND:coverage report -m}