
Share some metrics between the Prometheus exporter and the phone home stats #13671

Merged
13 commits, merged on Sep 5, 2022
1 change: 1 addition & 0 deletions changelog.d/13671.misc
@@ -0,0 +1 @@
+Introduce a `SharedUsageMetrics` class to share some usage metrics between the Prometheus exporter and the phone home stats.
5 changes: 3 additions & 2 deletions synapse/app/_base.py
@@ -478,9 +478,10 @@ def run_sighup(*args: Any, **kwargs: Any) -> None:
     setup_sentry(hs)
     setup_sdnotify(hs)

-    # If background tasks are running on the main process, start collecting the
-    # phone home stats.
+    # If background tasks are running on the main process or this is the worker in
+    # charge of them, start collecting the phone home stats and shared usage metrics.
     if hs.config.worker.run_background_tasks:
+        await hs.get_shared_usage_metrics().setup()
         start_phone_stats_home(hs)

     # We now freeze all allocated objects in the hopes that (almost)
25 changes: 22 additions & 3 deletions synapse/app/phone_stats_home.py
@@ -20,6 +20,7 @@
 from prometheus_client import Gauge

 from synapse.metrics.background_process_metrics import wrap_as_background_process
+from synapse.metrics.shared_usage_metrics import SharedUsageMetrics
 from synapse.types import JsonDict

 if TYPE_CHECKING:
@@ -49,8 +50,21 @@
 async def phone_stats_home(
     hs: "HomeServer",
     stats: JsonDict,
+    shared_metrics: SharedUsageMetrics,
     stats_process: List[Tuple[int, "resource.struct_rusage"]] = _stats_process,
 ) -> None:
+    """Collect usage statistics and send them to the configured endpoint.
+
+    Args:
+        hs: the HomeServer object to use for gathering usage data.
+        stats: the dict in which to store the statistics sent to the configured
+            endpoint. Mostly used in tests to figure out the data that is supposed to
+            be sent.
+        shared_metrics: metrics shared between the Prometheus exporter and the phone
+            home stats, populated outside of this function.
+        stats_process: statistics about resource usage of the process.
+    """
+
     logger.info("Gathering stats for reporting")
     now = int(hs.get_clock().time())
     # Ensure the homeserver has started.
@@ -104,7 +118,7 @@ async def phone_stats_home(
     room_count = await store.get_room_count()
     stats["total_room_count"] = room_count

-    stats["daily_active_users"] = await store.count_daily_users()
+    stats["daily_active_users"] = shared_metrics.daily_active_users
     stats["monthly_active_users"] = await store.count_monthly_users()
     daily_active_e2ee_rooms = await store.count_daily_active_e2ee_rooms()
     stats["daily_active_e2ee_rooms"] = daily_active_e2ee_rooms
@@ -205,14 +219,19 @@ async def generate_monthly_active_users() -> None:
     clock.looping_call(generate_monthly_active_users, 5 * 60 * 1000)
     # End of monthly active user settings

+    # Collect metrics shared between the Prometheus exporter and the phone home stats.
+    shared_metrics = hs.get_shared_usage_metrics()
+
     if hs.config.metrics.report_stats:
         logger.info("Scheduling stats reporting for 3 hour intervals")
-        clock.looping_call(phone_stats_home, 3 * 60 * 60 * 1000, hs, stats)
+        clock.looping_call(
+            phone_stats_home, 3 * 60 * 60 * 1000, hs, stats, shared_metrics
+        )

     # We need to defer this init for the cases that we daemonize
     # otherwise the process ID we get is that of the non-daemon process
     clock.call_later(0, performance_stats_init)

     # We wait 5 minutes to send the first set of stats as the server can
     # be quite busy the first few minutes
-    clock.call_later(5 * 60, phone_stats_home, hs, stats)
+    clock.call_later(5 * 60, phone_stats_home, hs, stats, shared_metrics)
59 changes: 59 additions & 0 deletions synapse/metrics/shared_usage_metrics.py
@@ -0,0 +1,59 @@
+# Copyright 2022 The Matrix.org Foundation C.I.C
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from synapse.metrics.background_process_metrics import run_as_background_process
+
+if TYPE_CHECKING:
+    from synapse.server import HomeServer
+
+from prometheus_client import Gauge
+
+# Gauge to expose daily active users metrics
+current_dau_gauge = Gauge(
+    "synapse_admin_daily_active_users",
+    "Current daily active users count",
+)
+
+
+class SharedUsageMetrics:
+    """Usage metrics shared between the phone home stats and the prometheus exporter."""
+
+    def __init__(self, hs: "HomeServer") -> None:
+        self._store = hs.get_datastores().main
+        self._clock = hs.get_clock()
+
+        self.daily_active_users = -1

Contributor:

I'm not a fan of having each field have a sentinel value; it gives the impression that we may end up reporting -1 (although I see that you update upon initialisation). IMO we shouldn't report anything rather than report negative values. I can see this being potentially troublesome for the phone-home stats when we sum DAUs.

I think I would be tempted to restructure this a bit:

  • rename to SharedUsageMetricsManager or equivalent (not sure what the conventional term is in Synapse ... is it Controller?)
  • make SharedUsageMetrics a plain dataclass/attrs object, with daily_active_users as an int field
  • put a latest(): Optional[SharedUsageMetrics] method (or field maybe?) on SharedUsageMetricsManager

(N.B. I was going to suggest that the metrics be lazily computed, either on-phone-home or on-prometheus-scrape, but I realise we don't want to block the prometheus scrape endpoint on Twisted's reactor in case everything is going awry.)

This sort of design prevents individual metrics from having sentinel values like -1, which may escape; a rough sketch follows below.
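
A rough sketch of that shape (illustrative only, not code from this PR; SharedUsageMetricsManager and latest() are the names proposed above, and the attrs usage is an assumption based on Synapse's general style):

from typing import Optional

import attr


@attr.s(auto_attribs=True, frozen=True, slots=True)
class SharedUsageMetrics:
    """A plain value object holding one batch of collected metrics."""

    daily_active_users: int


class SharedUsageMetricsManager:
    """Owns collection; hands out the most recent batch, if any."""

    def __init__(self) -> None:
        self._latest: Optional[SharedUsageMetrics] = None

    def latest(self) -> Optional[SharedUsageMetrics]:
        # Returns None before the first collection, so callers can skip
        # reporting entirely instead of sending a sentinel like -1.
        return self._latest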


+    async def setup(self) -> None:
+        """Reads the current values for the shared usage metrics and starts a looping
+        call to keep them updated.
+        """
+        await self.update()
+        self._clock.looping_call(
+            run_as_background_process,
+            5 * 60 * 1000,
+            desc="update_shared_usage_metrics",
+            func=self.update,
+        )
+
+    async def update(self) -> None:
+        """Updates the shared usage metrics."""
+        await self.update_daily_active_users()

Contributor Author (@babolivier, Aug 30, 2022):

This feels very boilerplatey currently, but the idea is that we can add to this method if we add new metrics to this class.

Contributor:

I would be tempted to separate the stages of:

  • collecting the metrics (c.f. other comment; return a SharedUsageMetrics dataclass)
  • updating the gauges, given a SharedUsageMetrics object

One function per metric does feel a bit too boilerplatey to me.

As an idea:

async def _collect(self) -> SharedUsageMetrics:
    return SharedUsageMetrics(
        daily_active_users=await self._store.count_daily_users()
    )

async def _update_gauges(self, metrics: SharedUsageMetrics) -> None:
    current_dau_gauge.set(metrics.daily_active_users)

Just my opinion, but I like that a bit better.


Contributor Author:

So we would generate a new object each time we collect metrics? That sounds fairly inefficient to me.

Contributor Author:

I've implemented this idea in a slightly different way from your proposal, lmk what you think.

Contributor (@reivilibre, Sep 1, 2022):

> So we would generate a new object each time we collect metrics? That sounds fairly inefficient to me.

I think it's not a big cost for the clarity it gives. Remember this is Python, where basically everything, even integers, is an object, so one more object isn't going to be much harm (especially if it uses slots). The object is being removed in such a way that should be friendly with Python's refcounting, so I don't think there's much of a garbage collection cost here either.

I would prefer the way I proposed, as each metric is more self-contained in the code; the current proposal has the metric being updated differently in two branches and stored in an intermediate variable, which feels clunkier.

Even supposing objects were more inefficient, this runs so infrequently that I don't think it's worth premature optimisation; better to simplify the code to reduce footgun potential, I think.


Contributor Author:

I disagree. Even though, as you mention, the gain in optimising this is pretty negligible (for now at least; there's always the possibility that something else starts using it, such as modules), to me creating a new object every time something requests it is a bad code smell. Plus I think the code would be more confusing that way, because then we'd be updating the Prometheus gauge every 5 minutes by reading the count from the database, and then when the phone home stats ask for them we'd do the whole dance again. It feels both more logical and more efficient to me to have the CommonUsageMetricsManager keep a copy of the metrics in memory, keep that updated, and serve it whenever necessary.


Contributor:

Can you substantiate why you think it's a code smell? Creating an object or struct to hold associated values is such an accepted pattern elsewhere that I'm surprised it's objectionable. (In fact, mutating existing objects that you've already handed out is usually considered an awful crime, to the point that some disciplines insist on immutability in many places.)

I think the worse code smell here is not keeping the logic for one metric in one place, e.g. if you have metrics A, B and C then the code locality looks like:

1st proposal:

self.stats = Stats(
    a=count_a(),
    b=count_b(),
    c=count_c(),
)

2nd proposal:

a = count_a()
b = count_b()
c = count_c()
if ..:
    self.stats = Stats(a=a, b=b, c=c)
else:
    self.stats.a = a
    self.stats.b = b
    self.stats.c = c

If we annotate the two proposals' lines with the metric they concern, you can see my point that the locality of behaviour is improved in the first suggestion (which I believe is better for its readability):

1st proposal:

self.stats = Stats(
    a=count_a(),  # A
    b=count_b(),  # B
    c=count_c(),  # C
)

2nd proposal:

a = count_a()  # A
b = count_b()  # B
c = count_c()  # C
if ..:
    self.stats = Stats(a=a, b=b, c=c)  # A, B, C
else:
    self.stats.a = a  # A
    self.stats.b = b  # B
    self.stats.c = c  # C

I also think the branch here is just more clutter than it's worth. Pessimistically, this is now twice as much to go wrong and twice as much to test.

> the gain in optimising this is pretty negligible (for now at least, there's always the possibility that something else starts using it, such as modules)

I will note that I don't think the gain is pretty negligible; I think it's highly dubious. We create objects all the time in Synapse and in Python generally, whether they're integers, tuples, or instances of classes.

Of course, if we really want to talk about such tiny optimisations, then we might instead consider whether introducing an if here is bad because branching is inefficient, or we might consider that increasing the size of this function's code is bad because it makes caches ineffective. I don't think any of these are seriously relevant here, but I do think that code clarity and keeping the code easy to maintain is a big bonus generally.

> Plus I think the code would be more confusing that way, because then we'd be updating the Prometheus gauge every 5 minutes by reading the count from the database, and then when the phone home stats ask for them we'd do the whole dance again.

I think you misunderstand my statement here (hopefully having spelled out the proposals above clarifies it?). You don't have to recalculate each and every time we request it; just the style of handing back the record can be different: always create a new record when updating the metrics rather than mutating the previous one.


Contributor Author:

> You don't have to recalculate each and every time we request it; just the style of handing back the record can be different: always create a new record when updating the metrics rather than mutating the previous one.

Ah, I was misunderstanding it indeed. Your initial proposal of _update_gauges taking metrics as an argument somehow made me assume that something other than the CommonUsageMetricsManager would be handing over a metrics object to use to update the gauges, which made me super confused as to where it's supposed to be coming from. I think I see what you have in mind now, and it does make sense.
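
To make the converged idea concrete, a minimal sketch of a manager that rebuilds an immutable record on every refresh and serves it from memory (illustrative only; the manager name follows the CommonUsageMetricsManager mentioned above, the attrs usage is an assumption, and only the gauge and store calls come from this PR):

from typing import TYPE_CHECKING, Optional

import attr
from prometheus_client import Gauge

if TYPE_CHECKING:
    from synapse.server import HomeServer

current_dau_gauge = Gauge(
    "synapse_admin_daily_active_users",
    "Current daily active users count",
)


@attr.s(auto_attribs=True, frozen=True, slots=True)
class SharedUsageMetrics:
    """An immutable snapshot of the collected usage metrics."""

    daily_active_users: int


class SharedUsageMetricsManager:
    """Rebuilds the snapshot on a schedule and serves it from memory."""

    def __init__(self, hs: "HomeServer") -> None:
        self._store = hs.get_datastores().main
        self._metrics: Optional[SharedUsageMetrics] = None

    async def update(self) -> None:
        # Collect into a brand new record (never mutate the one handed
        # out previously), then refresh the Prometheus gauge from it.
        self._metrics = await self._collect()
        current_dau_gauge.set(float(self._metrics.daily_active_users))

    async def _collect(self) -> SharedUsageMetrics:
        # One place per metric: new fields and their queries go here.
        return SharedUsageMetrics(
            daily_active_users=await self._store.count_daily_users(),
        )

    def latest(self) -> Optional[SharedUsageMetrics]:
        # The phone home stats read the cached snapshot: no extra database
        # round-trip at reporting time, and None (rather than a -1
        # sentinel) before the first collection.
        return self._metrics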


+    async def update_daily_active_users(self) -> None:
+        """Updates the daily active users count."""
+        dau_count = await self._store.count_daily_users()
+        current_dau_gauge.set(float(dau_count))
+        self.daily_active_users = dau_count
6 changes: 6 additions & 0 deletions synapse/server.py
@@ -105,6 +105,7 @@
 from synapse.handlers.user_directory import UserDirectoryHandler
 from synapse.http.client import InsecureInterceptableContextFactory, SimpleHttpClient
 from synapse.http.matrixfederationclient import MatrixFederationHttpClient
+from synapse.metrics.shared_usage_metrics import SharedUsageMetrics
 from synapse.module_api import ModuleApi
 from synapse.notifier import Notifier
 from synapse.push.bulk_push_rule_evaluator import BulkPushRuleEvaluator
@@ -827,3 +828,8 @@ def get_request_ratelimiter(self) -> RequestRatelimiter:
             self.config.ratelimiting.rc_message,
             self.config.ratelimiting.rc_admin_redaction,
         )
+
+    @cache_in_self
+    def get_shared_usage_metrics(self) -> SharedUsageMetrics:
+        """Usage metrics shared between phone home stats and the prometheus exporter."""
+        return SharedUsageMetrics(self)
46 changes: 44 additions & 2 deletions tests/test_phone_home.py
@@ -15,8 +15,15 @@
 import resource
 from unittest import mock

+from twisted.test.proto_helpers import MemoryReactor
+
 from synapse.app.phone_stats_home import phone_stats_home
+from synapse.metrics.shared_usage_metrics import SharedUsageMetrics
+from synapse.rest import admin
+from synapse.rest.client import login, sync
+from synapse.server import HomeServer
 from synapse.types import JsonDict
+from synapse.util import Clock

 from tests.unittest import HomeserverTestCase
@@ -30,7 +37,9 @@ def test_performance_frozen_clock(self) -> None:
             (self.hs.get_clock().time(), resource.getrusage(resource.RUSAGE_SELF))
         ]
         stats: JsonDict = {}
-        self.get_success(phone_stats_home(self.hs, stats, past_stats))
+        self.get_success(
+            phone_stats_home(self.hs, stats, SharedUsageMetrics(self.hs), past_stats)
+        )
         self.assertEqual(stats["cpu_average"], 0)
@@ -47,5 +56,38 @@ def test_performance_100(self) -> None:
         stats: JsonDict = {}
         self.reactor.advance(1)
         # `old_resource` has type `Mock` instead of `struct_rusage`
-        self.get_success(phone_stats_home(self.hs, stats, past_stats))  # type: ignore[arg-type]
+        self.get_success(
+            phone_stats_home(
+                self.hs,
+                stats,
+                SharedUsageMetrics(self.hs),
+                past_stats,  # type: ignore[arg-type]
+            )
+        )
         self.assertApproximates(stats["cpu_average"], 100, tolerance=2.5)


+class SharedMetricsTestCase(HomeserverTestCase):
+    servlets = [
+        admin.register_servlets,
+        login.register_servlets,
+        sync.register_servlets,
+    ]
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.metrics = hs.get_shared_usage_metrics()
+        self.get_success(self.metrics.setup())
+
+    def test_dau(self) -> None:
+        """Tests that the daily active users count is correctly updated."""
+        self.assertEqual(self.metrics.daily_active_users, 0)
+
+        self.register_user("user", "password")
+        tok = self.login("user", "password")
+        self.make_request("GET", "/sync", access_token=tok)
+
+        self.assertEqual(self.metrics.daily_active_users, 0)
+
+        self.reactor.advance(3000)
+
+        self.assertEqual(self.metrics.daily_active_users, 1)