Skip to content
This repository has been archived by the owner on Oct 8, 2024. It is now read-only.

Commit

Permalink
Fix race between pebble-ready and relation-created (#126)
Browse files Browse the repository at this point in the history
  • Loading branch information
PietroPasotti authored Jun 7, 2024
1 parent 3e76f23 commit 6a2c931
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 10 deletions.
6 changes: 4 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,9 @@ def _on_cert_handler_changed(self, _):
self.tempo.update_config(self._requested_receivers())
# sync scheme change with traefik and related consumers
self._configure_ingress(_)
self.tempo.restart()

if self.tempo.is_tempo_service_defined:
self.tempo.restart()

# sync the server cert with the charm container.
# technically, because of charm tracing, this will be called first thing on each event
Expand Down Expand Up @@ -247,7 +249,7 @@ def _restart_if_receivers_changed(self, requested_receivers):
if not updated:
logger.debug("Config not updated; skipping tempo restart")
if updated:
restarted = self.tempo.restart()
restarted = self.tempo.is_tempo_service_defined and self.tempo.restart()
if not restarted:
# assume that this will be handled at the next pebble-ready
logger.debug("Cannot reconfigure/restart tempo at this time.")
Expand Down
40 changes: 32 additions & 8 deletions src/tempo.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
receiver_protocol_to_transport_protocol,
)
from charms.traefik_route_k8s.v0.traefik_route import TraefikRouteRequirer
from ops import ModelError
from ops.pebble import Layer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -157,6 +158,15 @@ def update_config(self, requested_receivers: Sequence[ReceiverProtocol]) -> bool
return True
return False

@property
def is_tempo_service_defined(self) -> bool:
"""Check that the tempo service is present in the plan."""
try:
self.container.get_service("tempo")
return True
except ModelError:
return False

@tenacity.retry(
# if restart FAILS (this function returns False)
retry=tenacity.retry_if_result(lambda r: r is False),
Expand All @@ -176,23 +186,37 @@ def restart(self) -> bool:

if not self.container.can_connect():
return False
if not self.is_tempo_service_defined:
# quite expensive to hit this every time, because of the retry, so let's warn.
logger.warning(
"you shouldn't call .restart() before .plan() has taken place."
"use .is_tempo_service_defined to guard against this situation."
)
return False

self.container.stop("tempo")
service_status = self.container.get_service("tempo").current
try:
is_started = self.container.get_service("tempo").is_running()
except ModelError:
is_started = False

# verify if tempo is already inactive, then try to start a new instance
if service_status == ops.pebble.ServiceStatus.INACTIVE:
if is_started:
try:
self.container.restart("tempo")
except ops.pebble.ChangeError:
# if tempo fails to start, we'll try again after some backoff
return False
else:
try:
self.container.start("tempo")
except ops.pebble.ChangeError:
# if tempo fails to start, we'll try again after retry backoff
return False

# set the notice to start checking for tempo server readiness so we don't have to
# wait for an update-status
self.container.start("tempo-ready")
return True
return False
# set the notice to start checking for tempo server readiness so we don't have to
# wait for an update-status
self.container.start("tempo-ready")
return True

def get_current_config(self) -> Optional[dict]:
"""Fetch the current configuration from the container."""
Expand Down
26 changes: 26 additions & 0 deletions tests/scenario/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,29 @@ def test_tempo_restart_on_ingress_v2_changed(context, tmp_path, requested_protoc
context.output_state.get_container("tempo").service_status["tempo"]
is pebble.ServiceStatus.ACTIVE
)


def test_tempo_tracing_created_before_pebble_ready(context, tmp_path):
# GIVEN there is no plan yet
tempo = Container(
"tempo",
can_connect=True,
)

# WHEN
# the charm receives a tracing-relation-created requesting an otlp_grpc receiver
tracing = Relation(
"tracing",
remote_app_data={"receivers": '["otlp_http"]'},
local_app_data={
"receivers": '[{"protocol": {"name": "otlp_grpc", "type": "grpc"} , "url": "foo.com:10"}, '
'{"protocol": {"name": "otlp_http", "type": "http"}, "url": "http://foo.com:11"}, '
},
)
state = State(leader=True, containers=[tempo], relations=[tracing])
state_out = context.run(tracing.created_event, state)

# THEN
# tempo still has no services
tempo_out = state_out.get_container("tempo")
assert not tempo_out.services

0 comments on commit 6a2c931

Please sign in to comment.