From 6a2c931b46dc592dc0130482d287d416da1ab127 Mon Sep 17 00:00:00 2001 From: PietroPasotti Date: Fri, 7 Jun 2024 14:46:10 +0200 Subject: [PATCH] Fix race between pebble-ready and relation-created (#126) --- src/charm.py | 6 ++++-- src/tempo.py | 40 ++++++++++++++++++++++++++++-------- tests/scenario/test_charm.py | 26 +++++++++++++++++++++++ 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/src/charm.py b/src/charm.py index 08c491be..4fe24393 100755 --- a/src/charm.py +++ b/src/charm.py @@ -177,7 +177,9 @@ def _on_cert_handler_changed(self, _): self.tempo.update_config(self._requested_receivers()) # sync scheme change with traefik and related consumers self._configure_ingress(_) - self.tempo.restart() + + if self.tempo.is_tempo_service_defined: + self.tempo.restart() # sync the server cert with the charm container. # technically, because of charm tracing, this will be called first thing on each event @@ -247,7 +249,7 @@ def _restart_if_receivers_changed(self, requested_receivers): if not updated: logger.debug("Config not updated; skipping tempo restart") if updated: - restarted = self.tempo.restart() + restarted = self.tempo.is_tempo_service_defined and self.tempo.restart() if not restarted: # assume that this will be handled at the next pebble-ready logger.debug("Cannot reconfigure/restart tempo at this time.") diff --git a/src/tempo.py b/src/tempo.py index d7bb4749..cc0fd86e 100644 --- a/src/tempo.py +++ b/src/tempo.py @@ -17,6 +17,7 @@ receiver_protocol_to_transport_protocol, ) from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer +from ops import ModelError from ops.pebble import Layer logger = logging.getLogger(__name__) @@ -157,6 +158,15 @@ def update_config(self, requested_receivers: Sequence[ReceiverProtocol]) -> bool return True return False + @property + def is_tempo_service_defined(self) -> bool: + """Check that the tempo service is present in the plan.""" + try: + self.container.get_service("tempo") + return True + except ModelError: + return False + @tenacity.retry( # if restart FAILS (this function returns False) retry=tenacity.retry_if_result(lambda r: r is False), @@ -176,23 +186,37 @@ def restart(self) -> bool: if not self.container.can_connect(): return False + if not self.is_tempo_service_defined: + # quite expensive to hit this every time, because of the retry, so let's warn. + logger.warning( + "you shouldn't call .restart() before .plan() has taken place." + "use .is_tempo_service_defined to guard against this situation." + ) + return False - self.container.stop("tempo") - service_status = self.container.get_service("tempo").current + try: + is_started = self.container.get_service("tempo").is_running() + except ModelError: + is_started = False # verify if tempo is already inactive, then try to start a new instance - if service_status == ops.pebble.ServiceStatus.INACTIVE: + if is_started: + try: + self.container.restart("tempo") + except ops.pebble.ChangeError: + # if tempo fails to start, we'll try again after some backoff + return False + else: try: self.container.start("tempo") except ops.pebble.ChangeError: # if tempo fails to start, we'll try again after retry backoff return False - # set the notice to start checking for tempo server readiness so we don't have to - # wait for an update-status - self.container.start("tempo-ready") - return True - return False + # set the notice to start checking for tempo server readiness so we don't have to + # wait for an update-status + self.container.start("tempo-ready") + return True def get_current_config(self) -> Optional[dict]: """Fetch the current configuration from the container.""" diff --git a/tests/scenario/test_charm.py b/tests/scenario/test_charm.py index c69a383f..ed1968bc 100644 --- a/tests/scenario/test_charm.py +++ b/tests/scenario/test_charm.py @@ -93,3 +93,29 @@ def test_tempo_restart_on_ingress_v2_changed(context, tmp_path, requested_protoc context.output_state.get_container("tempo").service_status["tempo"] is pebble.ServiceStatus.ACTIVE ) + + +def test_tempo_tracing_created_before_pebble_ready(context, tmp_path): + # GIVEN there is no plan yet + tempo = Container( + "tempo", + can_connect=True, + ) + + # WHEN + # the charm receives a tracing-relation-created requesting an otlp_grpc receiver + tracing = Relation( + "tracing", + remote_app_data={"receivers": '["otlp_http"]'}, + local_app_data={ + "receivers": '[{"protocol": {"name": "otlp_grpc", "type": "grpc"} , "url": "foo.com:10"}, ' + '{"protocol": {"name": "otlp_http", "type": "http"}, "url": "http://foo.com:11"}, ' + }, + ) + state = State(leader=True, containers=[tempo], relations=[tracing]) + state_out = context.run(tracing.created_event, state) + + # THEN + # tempo still has no services + tempo_out = state_out.get_container("tempo") + assert not tempo_out.services