Add experimental configuration option to allow disabling legacy Prometheus metric names. (#13540)
Co-authored-by: David Robertson <davidr@element.io>
pull/13616/head
parent
2e2040c93e
commit
be4250c7a8
|
@ -0,0 +1 @@
|
||||||
|
Add experimental configuration option to allow disabling legacy Prometheus metric names.
|
|
@ -266,15 +266,48 @@ def register_start(
|
||||||
reactor.callWhenRunning(lambda: defer.ensureDeferred(wrapper()))
|
reactor.callWhenRunning(lambda: defer.ensureDeferred(wrapper()))
|
||||||
|
|
||||||
|
|
||||||
def listen_metrics(bind_addresses: Iterable[str], port: int) -> None:
|
def listen_metrics(
|
||||||
|
bind_addresses: Iterable[str], port: int, enable_legacy_metric_names: bool
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Start Prometheus metrics server.
|
Start Prometheus metrics server.
|
||||||
"""
|
"""
|
||||||
from synapse.metrics import RegistryProxy, start_http_server
|
from prometheus_client import start_http_server as start_http_server_prometheus
|
||||||
|
|
||||||
|
from synapse.metrics import (
|
||||||
|
RegistryProxy,
|
||||||
|
start_http_server as start_http_server_legacy,
|
||||||
|
)
|
||||||
|
|
||||||
for host in bind_addresses:
|
for host in bind_addresses:
|
||||||
logger.info("Starting metrics listener on %s:%d", host, port)
|
logger.info("Starting metrics listener on %s:%d", host, port)
|
||||||
start_http_server(port, addr=host, registry=RegistryProxy)
|
if enable_legacy_metric_names:
|
||||||
|
start_http_server_legacy(port, addr=host, registry=RegistryProxy)
|
||||||
|
else:
|
||||||
|
_set_prometheus_client_use_created_metrics(False)
|
||||||
|
start_http_server_prometheus(port, addr=host, registry=RegistryProxy)
|
||||||
|
|
||||||
|
|
||||||
|
def _set_prometheus_client_use_created_metrics(new_value: bool) -> None:
|
||||||
|
"""
|
||||||
|
Sets whether prometheus_client should expose `_created`-suffixed metrics for
|
||||||
|
all counters, histograms and summaries.
|
||||||
|
There is no programmatic way to disable this without poking at internals;
|
||||||
|
the proper way is to use an environment variable (`PROMETHEUS_DISABLE_CREATED_SERIES`) which prometheus_client
|
||||||
|
loads at import time.
|
||||||
|
|
||||||
|
The motivation for disabling these `_created` metrics is that they are not
|
||||||
|
useful to us, yet every one of them still takes up storage space in Prometheus.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import prometheus_client.metrics
|
||||||
|
|
||||||
|
if hasattr(prometheus_client.metrics, "_use_created"):
|
||||||
|
prometheus_client.metrics._use_created = new_value
|
||||||
|
else:
|
||||||
|
logger.error(
|
||||||
|
"Can't disable `_created` metrics in prometheus_client (brittle hack broken?)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def listen_manhole(
|
def listen_manhole(
|
||||||
|
|
|
@ -412,7 +412,11 @@ class GenericWorkerServer(HomeServer):
|
||||||
"enable_metrics is not True!"
|
"enable_metrics is not True!"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
_base.listen_metrics(listener.bind_addresses, listener.port)
|
_base.listen_metrics(
|
||||||
|
listener.bind_addresses,
|
||||||
|
listener.port,
|
||||||
|
enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.warning("Unsupported listener type: %s", listener.type)
|
logger.warning("Unsupported listener type: %s", listener.type)
|
||||||
|
|
||||||
|
|
|
@ -307,7 +307,11 @@ class SynapseHomeServer(HomeServer):
|
||||||
"enable_metrics is not True!"
|
"enable_metrics is not True!"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
_base.listen_metrics(listener.bind_addresses, listener.port)
|
_base.listen_metrics(
|
||||||
|
listener.bind_addresses,
|
||||||
|
listener.port,
|
||||||
|
enable_legacy_metric_names=self.config.metrics.enable_legacy_metrics,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# this shouldn't happen, as the listener type should have been checked
|
# this shouldn't happen, as the listener type should have been checked
|
||||||
# during parsing
|
# during parsing
|
||||||
|
|
|
@ -42,6 +42,35 @@ class MetricsConfig(Config):
|
||||||
|
|
||||||
def read_config(self, config: JsonDict, **kwargs: Any) -> None:
|
def read_config(self, config: JsonDict, **kwargs: Any) -> None:
|
||||||
self.enable_metrics = config.get("enable_metrics", False)
|
self.enable_metrics = config.get("enable_metrics", False)
|
||||||
|
|
||||||
|
"""
|
||||||
|
### `enable_legacy_metrics` (experimental)
|
||||||
|
|
||||||
|
**Experimental: this option may be removed or have its behaviour
|
||||||
|
changed at any time, with no notice.**
|
||||||
|
|
||||||
|
Set to `true` to publish both legacy and non-legacy Prometheus metric names,
|
||||||
|
or to `false` to only publish non-legacy Prometheus metric names.
|
||||||
|
Defaults to `true`. Has no effect if `enable_metrics` is `false`.
|
||||||
|
|
||||||
|
Legacy metric names include:
|
||||||
|
- metrics containing colons in the name, such as `synapse_util_caches_response_cache:hits`, because colons are supposed to be reserved for user-defined recording rules;
|
||||||
|
- counters that don't end with the `_total` suffix, such as `synapse_federation_client_sent_edus`, therefore not adhering to the OpenMetrics standard.
|
||||||
|
|
||||||
|
These legacy metric names are unconventional and not compliant with OpenMetrics standards.
|
||||||
|
They are included for backwards compatibility.
|
||||||
|
|
||||||
|
Example configuration:
|
||||||
|
```yaml
|
||||||
|
enable_legacy_metrics: false
|
||||||
|
```
|
||||||
|
|
||||||
|
See https://github.com/matrix-org/synapse/issues/11106 for context.
|
||||||
|
|
||||||
|
*Since v1.67.0.*
|
||||||
|
"""
|
||||||
|
self.enable_legacy_metrics = config.get("enable_legacy_metrics", True)
|
||||||
|
|
||||||
self.report_stats = config.get("report_stats", None)
|
self.report_stats = config.get("report_stats", None)
|
||||||
self.report_stats_endpoint = config.get(
|
self.report_stats_endpoint = config.get(
|
||||||
"report_stats_endpoint", "https://matrix.org/report-usage-stats/push"
|
"report_stats_endpoint", "https://matrix.org/report-usage-stats/push"
|
||||||
|
|
|
@ -46,12 +46,12 @@ from twisted.python.threadpool import ThreadPool
|
||||||
|
|
||||||
# This module is imported for its side effects; flake8 needn't warn that it's unused.
|
# This module is imported for its side effects; flake8 needn't warn that it's unused.
|
||||||
import synapse.metrics._reactor_metrics # noqa: F401
|
import synapse.metrics._reactor_metrics # noqa: F401
|
||||||
from synapse.metrics._exposition import (
|
from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
|
||||||
|
from synapse.metrics._legacy_exposition import (
|
||||||
MetricsResource,
|
MetricsResource,
|
||||||
generate_latest,
|
generate_latest,
|
||||||
start_http_server,
|
start_http_server,
|
||||||
)
|
)
|
||||||
from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
|
|
||||||
from synapse.metrics._types import Collector
|
from synapse.metrics._types import Collector
|
||||||
from synapse.util import SYNAPSE_VERSION
|
from synapse.util import SYNAPSE_VERSION
|
||||||
|
|
||||||
|
|
|
@ -80,7 +80,27 @@ def sample_line(line: Sample, name: str) -> str:
|
||||||
return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
|
return "{}{} {}{}\n".format(name, labelstr, floatToGoString(line.value), timestamp)
|
||||||
|
|
||||||
|
|
||||||
|
# Mapping from new metric names to legacy metric names.
|
||||||
|
# We translate these back to their old names when exposing them through our
|
||||||
|
# legacy vendored exporter.
|
||||||
|
# Only this legacy exposition module applies these name changes.
|
||||||
|
LEGACY_METRIC_NAMES = {
|
||||||
|
"synapse_util_caches_cache_hits": "synapse_util_caches_cache:hits",
|
||||||
|
"synapse_util_caches_cache_size": "synapse_util_caches_cache:size",
|
||||||
|
"synapse_util_caches_cache_evicted_size": "synapse_util_caches_cache:evicted_size",
|
||||||
|
"synapse_util_caches_cache_total": "synapse_util_caches_cache:total",
|
||||||
|
"synapse_util_caches_response_cache_size": "synapse_util_caches_response_cache:size",
|
||||||
|
"synapse_util_caches_response_cache_hits": "synapse_util_caches_response_cache:hits",
|
||||||
|
"synapse_util_caches_response_cache_evicted_size": "synapse_util_caches_response_cache:evicted_size",
|
||||||
|
"synapse_util_caches_response_cache_total": "synapse_util_caches_response_cache:total",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
|
def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> bytes:
|
||||||
|
"""
|
||||||
|
Generate metrics in legacy format. Modern metrics are generated directly
|
||||||
|
by prometheus-client.
|
||||||
|
"""
|
||||||
|
|
||||||
# Trigger the cache metrics to be rescraped, which updates the common
|
# Trigger the cache metrics to be rescraped, which updates the common
|
||||||
# metrics but do not produce metrics themselves
|
# metrics but do not produce metrics themselves
|
||||||
|
@ -94,7 +114,8 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
|
||||||
# No samples, don't bother.
|
# No samples, don't bother.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
mname = metric.name
|
# Translate to legacy metric name if it has one.
|
||||||
|
mname = LEGACY_METRIC_NAMES.get(metric.name, metric.name)
|
||||||
mnewname = metric.name
|
mnewname = metric.name
|
||||||
mtype = metric.type
|
mtype = metric.type
|
||||||
|
|
||||||
|
@ -124,7 +145,7 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
|
||||||
om_samples: Dict[str, List[str]] = {}
|
om_samples: Dict[str, List[str]] = {}
|
||||||
for s in metric.samples:
|
for s in metric.samples:
|
||||||
for suffix in ["_created", "_gsum", "_gcount"]:
|
for suffix in ["_created", "_gsum", "_gcount"]:
|
||||||
if s.name == metric.name + suffix:
|
if s.name == mname + suffix:
|
||||||
# OpenMetrics specific sample, put in a gauge at the end.
|
# OpenMetrics specific sample, put in a gauge at the end.
|
||||||
# (these come from gaugehistograms which don't get renamed,
|
# (these come from gaugehistograms which don't get renamed,
|
||||||
# so no need to faff with mnewname)
|
# so no need to faff with mnewname)
|
||||||
|
@ -140,12 +161,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
|
||||||
if emit_help:
|
if emit_help:
|
||||||
output.append(
|
output.append(
|
||||||
"# HELP {}{} {}\n".format(
|
"# HELP {}{} {}\n".format(
|
||||||
metric.name,
|
mname,
|
||||||
suffix,
|
suffix,
|
||||||
metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
|
metric.documentation.replace("\\", r"\\").replace("\n", r"\n"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
output.append(f"# TYPE {metric.name}{suffix} gauge\n")
|
output.append(f"# TYPE {mname}{suffix} gauge\n")
|
||||||
output.extend(lines)
|
output.extend(lines)
|
||||||
|
|
||||||
# Get rid of the weird colon things while we're at it
|
# Get rid of the weird colon things while we're at it
|
||||||
|
@ -170,11 +191,12 @@ def generate_latest(registry: CollectorRegistry, emit_help: bool = False) -> byt
|
||||||
# Get rid of the OpenMetrics specific samples (we should already have
|
# Get rid of the OpenMetrics specific samples (we should already have
|
||||||
# dealt with them above anyway.)
|
# dealt with them above anyway.)
|
||||||
for suffix in ["_created", "_gsum", "_gcount"]:
|
for suffix in ["_created", "_gsum", "_gcount"]:
|
||||||
if s.name == metric.name + suffix:
|
if s.name == mname + suffix:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
sample_name = LEGACY_METRIC_NAMES.get(s.name, s.name)
|
||||||
output.append(
|
output.append(
|
||||||
sample_line(s, s.name.replace(":total", "").replace(":", "_"))
|
sample_line(s, sample_name.replace(":total", "").replace(":", "_"))
|
||||||
)
|
)
|
||||||
|
|
||||||
return "".join(output).encode("utf-8")
|
return "".join(output).encode("utf-8")
|
|
@ -34,10 +34,10 @@ TRACK_MEMORY_USAGE = False
|
||||||
caches_by_name: Dict[str, Sized] = {}
|
caches_by_name: Dict[str, Sized] = {}
|
||||||
collectors_by_name: Dict[str, "CacheMetric"] = {}
|
collectors_by_name: Dict[str, "CacheMetric"] = {}
|
||||||
|
|
||||||
cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
|
cache_size = Gauge("synapse_util_caches_cache_size", "", ["name"])
|
||||||
cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
|
cache_hits = Gauge("synapse_util_caches_cache_hits", "", ["name"])
|
||||||
cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"])
|
cache_evicted = Gauge("synapse_util_caches_cache_evicted_size", "", ["name", "reason"])
|
||||||
cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
|
cache_total = Gauge("synapse_util_caches_cache_total", "", ["name"])
|
||||||
cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
|
cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
|
||||||
cache_memory_usage = Gauge(
|
cache_memory_usage = Gauge(
|
||||||
"synapse_util_caches_cache_size_bytes",
|
"synapse_util_caches_cache_size_bytes",
|
||||||
|
@ -45,12 +45,12 @@ cache_memory_usage = Gauge(
|
||||||
["name"],
|
["name"],
|
||||||
)
|
)
|
||||||
|
|
||||||
response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
|
response_cache_size = Gauge("synapse_util_caches_response_cache_size", "", ["name"])
|
||||||
response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
|
response_cache_hits = Gauge("synapse_util_caches_response_cache_hits", "", ["name"])
|
||||||
response_cache_evicted = Gauge(
|
response_cache_evicted = Gauge(
|
||||||
"synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"]
|
"synapse_util_caches_response_cache_evicted_size", "", ["name", "reason"]
|
||||||
)
|
)
|
||||||
response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
|
response_cache_total = Gauge("synapse_util_caches_response_cache_total", "", ["name"])
|
||||||
|
|
||||||
|
|
||||||
class EvictionReason(Enum):
|
class EvictionReason(Enum):
|
||||||
|
|
|
@ -12,7 +12,16 @@
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
try:
|
||||||
|
from importlib import metadata
|
||||||
|
except ImportError:
|
||||||
|
import importlib_metadata as metadata # type: ignore[no-redef]
|
||||||
|
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from pkg_resources import parse_version
|
||||||
|
|
||||||
|
from synapse.app._base import _set_prometheus_client_use_created_metrics
|
||||||
from synapse.metrics import REGISTRY, InFlightGauge, generate_latest
|
from synapse.metrics import REGISTRY, InFlightGauge, generate_latest
|
||||||
from synapse.util.caches.deferred_cache import DeferredCache
|
from synapse.util.caches.deferred_cache import DeferredCache
|
||||||
|
|
||||||
|
@ -162,3 +171,30 @@ class CacheMetricsTests(unittest.HomeserverTestCase):
|
||||||
|
|
||||||
self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
|
self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
|
||||||
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
|
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
|
||||||
|
|
||||||
|
|
||||||
|
class PrometheusMetricsHackTestCase(unittest.HomeserverTestCase):
|
||||||
|
if parse_version(metadata.version("prometheus_client")) < parse_version("0.14.0"):
|
||||||
|
skip = "prometheus-client too old"
|
||||||
|
|
||||||
|
def test_created_metrics_disabled(self) -> None:
|
||||||
|
"""
|
||||||
|
Tests that a brittle hack, to disable `_created` metrics, works.
|
||||||
|
This involves poking at the internals of prometheus-client.
|
||||||
|
It's not the end of the world if this doesn't work.
|
||||||
|
|
||||||
|
This test gives us a way to notice if prometheus-client changes
|
||||||
|
their internals.
|
||||||
|
"""
|
||||||
|
import prometheus_client.metrics
|
||||||
|
|
||||||
|
PRIVATE_FLAG_NAME = "_use_created"
|
||||||
|
|
||||||
|
# By default, the pesky `_created` metrics are enabled.
|
||||||
|
# Check this assumption is still valid.
|
||||||
|
self.assertTrue(getattr(prometheus_client.metrics, PRIVATE_FLAG_NAME))
|
||||||
|
|
||||||
|
with patch("prometheus_client.metrics") as mock:
|
||||||
|
setattr(mock, PRIVATE_FLAG_NAME, True)
|
||||||
|
_set_prometheus_client_use_created_metrics(False)
|
||||||
|
self.assertFalse(getattr(mock, PRIVATE_FLAG_NAME, False))
|
||||||
|
|
Loading…
Reference in New Issue