Add OpenTracing for database activity. (#10113)

This adds quite a lot of OpenTracing decoration for database activity. Specifically, it adds tracing at four different levels (see the sketch after this list for how the spans nest):

 * emit a span for each "interaction" - ie, the top-level database function that we tend to call a "transaction", but which isn't really one, because it can end up running multiple transactions.
 * emit a span while we hold a database connection open
 * emit a span for each database transaction - this time an actual transaction.
 * emit a span for each database query.
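
To make the hierarchy concrete, here is a rough sketch (not part of this change: it uses the plain `opentracing` API rather than Synapse's wrappers, and the `get_users` interaction name and tag values are made up) of how the four levels end up nesting for a single `runInteraction` call:

```python
import opentracing

# The global tracer is a no-op unless a real tracer (e.g. jaeger) is installed,
# so this sketch runs standalone.
tracer = opentracing.global_tracer()

# 1. one span per "interaction" (named "db." + the interaction's description)
with tracer.start_active_span("db.get_users"):
    # 2. one span while we hold a connection from the pool
    with tracer.start_active_span("db.connection"):
        # 3. one span per actual transaction, tagged with its description and id
        with tracer.start_active_span(
            "db.txn",
            tags={"db.txn_desc": "get_users", "db.txn_id": "get_users-42"},
        ):
            # 4. one span per query run within that transaction
            with tracer.start_active_span(
                "db.query",
                tags={"db.type": "sql", "db.statement": "SELECT name FROM users"},
            ):
                pass  # cursor.execute(...) happens here in the real code
```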

I'm aware this might add quite a lot of overhead, but even just running it on a local Synapse the results look really interesting, and I hope the overhead can be offset just by turning down the sampling frequency and finding other ways of tracing requests of interest (eg, via the `force_tracing_for_users` setting).
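
As a purely illustrative sketch of the sampling knob (again, not part of this change, and assuming the `jaeger_client` library that Synapse's OpenTracing support is built on), turning the frequency down looks something like:

```python
from jaeger_client import Config

# Sample roughly 1% of traces rather than all of them.
config = Config(
    config={"sampler": {"type": "probabilistic", "param": 0.01}},
    service_name="synapse",
)
tracer = config.initialize_tracer()
```

In Synapse itself the equivalent sampler settings live under `jaeger_config` in the homeserver's tracing configuration.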
Richard van der Hoff 2021-06-03 16:31:56 +01:00 committed by GitHub
parent 1d143074c5
commit 9eea4646be
3 changed files with 57 additions and 30 deletions

changelog.d/10113.feature

@@ -0,0 +1 @@
+Report OpenTracing spans for database activity.

synapse/logging/opentracing.py

@@ -271,6 +271,12 @@ class SynapseTags:
     # HTTP request tag (used to distinguish full vs incremental syncs, etc)
     REQUEST_TAG = "request_tag"

+    # Text description of a database transaction
+    DB_TXN_DESC = "db.txn_desc"
+
+    # Uniqueish ID of a database transaction
+    DB_TXN_ID = "db.txn_id"
+

 # Block everything by default
 # A regex which matches the server_names to expose traces for.

synapse/storage/database.py

@@ -40,6 +40,7 @@ from twisted.enterprise import adbapi

 from synapse.api.errors import StoreError
 from synapse.config.database import DatabaseConnectionConfig
+from synapse.logging import opentracing
 from synapse.logging.context import (
     LoggingContext,
     current_context,
@@ -313,7 +314,14 @@ class LoggingTransaction:
         start = time.time()

         try:
-            return func(sql, *args)
+            with opentracing.start_active_span(
+                "db.query",
+                tags={
+                    opentracing.tags.DATABASE_TYPE: "sql",
+                    opentracing.tags.DATABASE_STATEMENT: sql,
+                },
+            ):
+                return func(sql, *args)
         except Exception as e:
             sql_logger.debug("[SQL FAIL] {%s} %s", self.name, e)
             raise
@@ -525,9 +533,16 @@ class DatabasePool:
             exception_callbacks=exception_callbacks,
         )
         try:
-            r = func(cursor, *args, **kwargs)
-            conn.commit()
-            return r
+            with opentracing.start_active_span(
+                "db.txn",
+                tags={
+                    opentracing.SynapseTags.DB_TXN_DESC: desc,
+                    opentracing.SynapseTags.DB_TXN_ID: name,
+                },
+            ):
+                r = func(cursor, *args, **kwargs)
+                conn.commit()
+                return r
         except self.engine.module.OperationalError as e:
             # This can happen if the database disappears mid
             # transaction.
@@ -653,16 +668,17 @@
             logger.warning("Starting db txn '%s' from sentinel context", desc)

         try:
-            result = await self.runWithConnection(
-                self.new_transaction,
-                desc,
-                after_callbacks,
-                exception_callbacks,
-                func,
-                *args,
-                db_autocommit=db_autocommit,
-                **kwargs,
-            )
+            with opentracing.start_active_span(f"db.{desc}"):
+                result = await self.runWithConnection(
+                    self.new_transaction,
+                    desc,
+                    after_callbacks,
+                    exception_callbacks,
+                    func,
+                    *args,
+                    db_autocommit=db_autocommit,
+                    **kwargs,
+                )

             for after_callback, after_args, after_kwargs in after_callbacks:
                 after_callback(*after_args, **after_kwargs)
@@ -718,25 +734,29 @@
             with LoggingContext(
                 str(curr_context), parent_context=parent_context
             ) as context:
-                sched_duration_sec = monotonic_time() - start_time
-                sql_scheduling_timer.observe(sched_duration_sec)
-                context.add_database_scheduled(sched_duration_sec)
+                with opentracing.start_active_span(
+                    operation_name="db.connection",
+                ):
+                    sched_duration_sec = monotonic_time() - start_time
+                    sql_scheduling_timer.observe(sched_duration_sec)
+                    context.add_database_scheduled(sched_duration_sec)

-                if self.engine.is_connection_closed(conn):
-                    logger.debug("Reconnecting closed database connection")
-                    conn.reconnect()
+                    if self.engine.is_connection_closed(conn):
+                        logger.debug("Reconnecting closed database connection")
+                        conn.reconnect()
+                        opentracing.log_kv({"message": "reconnected"})

-                try:
-                    if db_autocommit:
-                        self.engine.attempt_to_set_autocommit(conn, True)
+                    try:
+                        if db_autocommit:
+                            self.engine.attempt_to_set_autocommit(conn, True)

-                    db_conn = LoggingDatabaseConnection(
-                        conn, self.engine, "runWithConnection"
-                    )
-                    return func(db_conn, *args, **kwargs)
-                finally:
-                    if db_autocommit:
-                        self.engine.attempt_to_set_autocommit(conn, False)
+                        db_conn = LoggingDatabaseConnection(
+                            conn, self.engine, "runWithConnection"
+                        )
+                        return func(db_conn, *args, **kwargs)
+                    finally:
+                        if db_autocommit:
+                            self.engine.attempt_to_set_autocommit(conn, False)

         return await make_deferred_yieldable(
             self._db_pool.runWithConnection(inner_func, *args, **kwargs)