Merge pull request #5499 from matrix-org/rav/cleanup_metrics
Cleanups and sanity-checking in cpu and db metricspull/5547/head
commit
e59a8cd2e5
|
@ -0,0 +1 @@
|
||||||
|
Some cleanups and sanity-checking in the CPU and database metrics.
|
|
@ -175,22 +175,22 @@ class PerformanceCounters(object):
|
||||||
self.current_counters = {}
|
self.current_counters = {}
|
||||||
self.previous_counters = {}
|
self.previous_counters = {}
|
||||||
|
|
||||||
def update(self, key, start_time, end_time=None):
|
def update(self, key, duration_secs):
|
||||||
if end_time is None:
|
|
||||||
end_time = time.time()
|
|
||||||
duration = end_time - start_time
|
|
||||||
count, cum_time = self.current_counters.get(key, (0, 0))
|
count, cum_time = self.current_counters.get(key, (0, 0))
|
||||||
count += 1
|
count += 1
|
||||||
cum_time += duration
|
cum_time += duration_secs
|
||||||
self.current_counters[key] = (count, cum_time)
|
self.current_counters[key] = (count, cum_time)
|
||||||
return end_time
|
|
||||||
|
|
||||||
def interval(self, interval_duration, limit=3):
|
def interval(self, interval_duration_secs, limit=3):
|
||||||
counters = []
|
counters = []
|
||||||
for name, (count, cum_time) in iteritems(self.current_counters):
|
for name, (count, cum_time) in iteritems(self.current_counters):
|
||||||
prev_count, prev_time = self.previous_counters.get(name, (0, 0))
|
prev_count, prev_time = self.previous_counters.get(name, (0, 0))
|
||||||
counters.append(
|
counters.append(
|
||||||
((cum_time - prev_time) / interval_duration, count - prev_count, name)
|
(
|
||||||
|
(cum_time - prev_time) / interval_duration_secs,
|
||||||
|
count - prev_count,
|
||||||
|
name,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.previous_counters = dict(self.current_counters)
|
self.previous_counters = dict(self.current_counters)
|
||||||
|
@ -221,7 +221,6 @@ class SQLBaseStore(object):
|
||||||
# is running in mainline, and we have some nice monitoring frontends
|
# is running in mainline, and we have some nice monitoring frontends
|
||||||
# to watch it
|
# to watch it
|
||||||
self._txn_perf_counters = PerformanceCounters()
|
self._txn_perf_counters = PerformanceCounters()
|
||||||
self._get_event_counters = PerformanceCounters()
|
|
||||||
|
|
||||||
self._get_event_cache = Cache(
|
self._get_event_cache = Cache(
|
||||||
"*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
|
"*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
|
||||||
|
@ -369,21 +368,13 @@ class SQLBaseStore(object):
|
||||||
time_then = self._previous_loop_ts
|
time_then = self._previous_loop_ts
|
||||||
self._previous_loop_ts = time_now
|
self._previous_loop_ts = time_now
|
||||||
|
|
||||||
ratio = (curr - prev) / (time_now - time_then)
|
duration = time_now - time_then
|
||||||
|
ratio = (curr - prev) / duration
|
||||||
|
|
||||||
top_three_counters = self._txn_perf_counters.interval(
|
top_three_counters = self._txn_perf_counters.interval(duration, limit=3)
|
||||||
time_now - time_then, limit=3
|
|
||||||
)
|
|
||||||
|
|
||||||
top_3_event_counters = self._get_event_counters.interval(
|
|
||||||
time_now - time_then, limit=3
|
|
||||||
)
|
|
||||||
|
|
||||||
perf_logger.info(
|
perf_logger.info(
|
||||||
"Total database time: %.3f%% {%s} {%s}",
|
"Total database time: %.3f%% {%s}", ratio * 100, top_three_counters
|
||||||
ratio * 100,
|
|
||||||
top_three_counters,
|
|
||||||
top_3_event_counters,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self._clock.looping_call(loop, 10000)
|
self._clock.looping_call(loop, 10000)
|
||||||
|
@ -465,7 +456,7 @@ class SQLBaseStore(object):
|
||||||
transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)
|
transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)
|
||||||
|
|
||||||
self._current_txn_total_time += duration
|
self._current_txn_total_time += duration
|
||||||
self._txn_perf_counters.update(desc, start, end)
|
self._txn_perf_counters.update(desc, duration)
|
||||||
sql_txn_timer.labels(desc).observe(duration)
|
sql_txn_timer.labels(desc).observe(duration)
|
||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
|
|
|
@ -336,10 +336,9 @@ class LoggingContext(object):
|
||||||
logger.warning("Called stop on logcontext %s without calling start", self)
|
logger.warning("Called stop on logcontext %s without calling start", self)
|
||||||
return
|
return
|
||||||
|
|
||||||
usage_end = get_thread_resource_usage()
|
utime_delta, stime_delta = self._get_cputime()
|
||||||
|
self._resource_usage.ru_utime += utime_delta
|
||||||
self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
|
self._resource_usage.ru_stime += stime_delta
|
||||||
self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
|
|
||||||
|
|
||||||
self.usage_start = None
|
self.usage_start = None
|
||||||
|
|
||||||
|
@ -357,13 +356,44 @@ class LoggingContext(object):
|
||||||
# can include resource usage so far.
|
# can include resource usage so far.
|
||||||
is_main_thread = threading.current_thread() is self.main_thread
|
is_main_thread = threading.current_thread() is self.main_thread
|
||||||
if self.alive and self.usage_start and is_main_thread:
|
if self.alive and self.usage_start and is_main_thread:
|
||||||
current = get_thread_resource_usage()
|
utime_delta, stime_delta = self._get_cputime()
|
||||||
res.ru_utime += current.ru_utime - self.usage_start.ru_utime
|
res.ru_utime += utime_delta
|
||||||
res.ru_stime += current.ru_stime - self.usage_start.ru_stime
|
res.ru_stime += stime_delta
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
def _get_cputime(self):
|
||||||
|
"""Get the cpu usage time so far
|
||||||
|
|
||||||
|
Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
|
||||||
|
"""
|
||||||
|
current = get_thread_resource_usage()
|
||||||
|
|
||||||
|
utime_delta = current.ru_utime - self.usage_start.ru_utime
|
||||||
|
stime_delta = current.ru_stime - self.usage_start.ru_stime
|
||||||
|
|
||||||
|
# sanity check
|
||||||
|
if utime_delta < 0:
|
||||||
|
logger.error(
|
||||||
|
"utime went backwards! %f < %f",
|
||||||
|
current.ru_utime,
|
||||||
|
self.usage_start.ru_utime,
|
||||||
|
)
|
||||||
|
utime_delta = 0
|
||||||
|
|
||||||
|
if stime_delta < 0:
|
||||||
|
logger.error(
|
||||||
|
"stime went backwards! %f < %f",
|
||||||
|
current.ru_stime,
|
||||||
|
self.usage_start.ru_stime,
|
||||||
|
)
|
||||||
|
stime_delta = 0
|
||||||
|
|
||||||
|
return utime_delta, stime_delta
|
||||||
|
|
||||||
def add_database_transaction(self, duration_sec):
|
def add_database_transaction(self, duration_sec):
|
||||||
|
if duration_sec < 0:
|
||||||
|
raise ValueError("DB txn time can only be non-negative")
|
||||||
self._resource_usage.db_txn_count += 1
|
self._resource_usage.db_txn_count += 1
|
||||||
self._resource_usage.db_txn_duration_sec += duration_sec
|
self._resource_usage.db_txn_duration_sec += duration_sec
|
||||||
|
|
||||||
|
@ -374,6 +404,8 @@ class LoggingContext(object):
|
||||||
sched_sec (float): number of seconds it took us to get a
|
sched_sec (float): number of seconds it took us to get a
|
||||||
connection
|
connection
|
||||||
"""
|
"""
|
||||||
|
if sched_sec < 0:
|
||||||
|
raise ValueError("DB scheduling time can only be non-negative")
|
||||||
self._resource_usage.db_sched_duration_sec += sched_sec
|
self._resource_usage.db_sched_duration_sec += sched_sec
|
||||||
|
|
||||||
def record_event_fetch(self, event_count):
|
def record_event_fetch(self, event_count):
|
||||||
|
|
Loading…
Reference in New Issue