added guard for permission errors on relation-get (#93)
* added guard for permission errors on relation-get

* vbump

* more lenient readiness check guard

* better error handling

* PR comments

* updated comment

* lint and fix tests
PietroPasotti authored Nov 14, 2024
1 parent 14c27b4 commit 303d1b7
Showing 4 changed files with 76 additions and 36 deletions.
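The headline change guards the application-databag write (relation-set) against the permission-denied ModelError that Juju can raise when a hook runs before 'install'. Reduced to its core, the pattern in the worker.py diff below looks roughly like this (an illustrative sketch, not the full Worker implementation; `logger` and `cluster` stand in for the real attributes):

import logging

from ops.model import ModelError

logger = logging.getLogger(__name__)


def publish_app_roles_safely(cluster, roles) -> None:
    """Publish the app roles, tolerating the transient pre-'install' permission error."""
    try:
        cluster.publish_app_roles(roles)
    except ModelError as e:
        # Before 'install', relation-set may be denied write access. The reconciler
        # pattern means the write is retried on the next event, so we only log here.
        if "ERROR permission denied (unauthorized access)" in e.args:
            logger.debug(
                "relation-set failed with a permission denied error. "
                "This could be a transient issue."
            )
        else:
            # unknown failure mode: re-raise
            raise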
84 changes: 60 additions & 24 deletions src/cosl/coordinated_workers/worker.py
@@ -18,7 +18,7 @@
import tenacity
import yaml
from ops import MaintenanceStatus, StatusBase
from ops.model import ActiveStatus, BlockedStatus, WaitingStatus
from ops.model import ActiveStatus, BlockedStatus, ModelError, WaitingStatus
from ops.pebble import Check, Layer, PathError, Plan, ProtocolError

from cosl import JujuTopology
@@ -77,6 +77,10 @@ class WorkerError(Exception):
"""Base class for exceptions raised by this module."""


class NoReadinessCheckEndpointConfiguredError(Exception):
"""Internal error when readiness check endpoint is missing."""


class ServiceEndpointStatus(Enum):
"""Status of the worker service managed by pebble."""

@@ -255,19 +259,27 @@ def status(self) -> ServiceEndpointStatus:
# we really don't want this code to raise errors, so we blanket catch all.
try:
services = self._container.get_services(*layer.services.keys())
running_status = {name: svc.is_running() for name, svc in services.items()}
if not all(running_status.values()):
if any(running_status.values()):
starting_services = tuple(
name for name, running in running_status.items() if not running
)
logger.info(
f"Some services which should be running are not: {starting_services}."
if not services:
logger.debug("No services found in pebble plan.")
else:
services_not_running = [
name for name, svc in services.items() if not svc.is_running()
]
if services_not_running:
logger.debug(
f"Some services which should be running are not: {services_not_running}."
)
return ServiceEndpointStatus.starting
return ServiceEndpointStatus.down
else:
logger.debug("All pebble services up.")

logger.info("All services are down.")
return ServiceEndpointStatus.down
# so far as pebble knows all services are up, now let's see if
# the readiness endpoint confirms that
return self.check_readiness()

except NoReadinessCheckEndpointConfiguredError:
# assume up
return ServiceEndpointStatus.up

except Exception:
logger.exception(
@@ -276,16 +288,11 @@ def status(self) -> ServiceEndpointStatus:
)
return ServiceEndpointStatus.down

return self.check_readiness()

def check_readiness(self) -> ServiceEndpointStatus:
"""If the user has configured a readiness check endpoint, GET it and check the workload status."""
check_endpoint = self._readiness_check_endpoint
if not check_endpoint:
raise WorkerError(
"cannot check readiness without a readiness_check_endpoint configured. "
"Pass one to Worker on __init__."
)
raise NoReadinessCheckEndpointConfiguredError()

try:
with urllib.request.urlopen(check_endpoint(self)) as response:
@@ -506,7 +513,20 @@ def _update_cluster_relation(self) -> None:
self.cluster.publish_unit_address(socket.getfqdn())
if self._charm.unit.is_leader() and self.roles:
logger.info(f"publishing roles: {self.roles}")
self.cluster.publish_app_roles(self.roles)
try:
self.cluster.publish_app_roles(self.roles)
except ModelError as e:
# if we are handling an event prior to 'install', we could be denied write access
# Swallowing the exception here relies on the reconciler pattern - this will be
# retried at the next occasion and eventually that'll be after 'install'.
if "ERROR permission denied (unauthorized access)" in e.args:
logger.debug(
"relation-set failed with a permission denied error. "
"This could be a transient issue."
)
else:
# let it burn, we clearly don't know what's going on
raise

def _running_worker_config(self) -> Optional[Dict[str, Any]]:
"""Return the worker config as dict, or None if retrieval failed."""
@@ -666,13 +686,26 @@ def restart(self):
# restart all services that our layer is responsible for
self._container.restart(*service_names)

except ops.pebble.ConnectionError:
logger.debug(
"failed to (re)start worker jobs because the container unexpectedly died; "
"this might mean the unit is still settling after deploy or an upgrade. "
"This should resolve itself."
# or it's a juju bug^TM
)
return False

except ops.pebble.ChangeError:
logger.error(
"failed to (re)start worker jobs. This usually means that an external resource (such as s3) "
"that the software needs to start is not available."
)
raise

except Exception:
logger.exception("failed to (re)start worker jobs due to an unexpected error.")
raise

try:
for attempt in tenacity.Retrying(
# status may report .down
@@ -691,15 +724,18 @@ def restart(self):
# set result to status; will retry unless it's up
attempt.retry_state.set_result(self.status is ServiceEndpointStatus.up)

except WorkerError:
# unable to check worker readiness: no readiness_check_endpoint configured.
# this status is already set on the unit so no need to log it
pass
except NoReadinessCheckEndpointConfiguredError:
# collect_unit_status will surface this to the user
logger.warning(
"could not check worker service readiness: no check endpoint configured. "
"Pass one to the Worker."
)
return True

except Exception:
logger.exception("unexpected error while attempting to determine worker status")

return False
return self.status is ServiceEndpointStatus.up

def running_version(self) -> Optional[str]:
"""Get the running version from the worker process."""
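The collapsed tenacity block in restart() above uses the standard retry-until-result pattern: each attempt records whether the worker reports ServiceEndpointStatus.up, and tenacity keeps retrying while the recorded result is falsy. A generic, self-contained sketch of that pattern (interval and timeout values are assumed, not taken from the diff):

import tenacity


def wait_until_ready(is_ready, timeout: float = 60.0, interval: float = 2.0) -> bool:
    """Poll ``is_ready()`` until it returns True; give up after ``timeout`` seconds."""
    try:
        for attempt in tenacity.Retrying(
            # keep retrying while the recorded result is False
            retry=tenacity.retry_if_result(lambda ready: ready is False),
            wait=tenacity.wait_fixed(interval),
            stop=tenacity.stop_after_delay(timeout),
        ):
            with attempt:
                ready = is_ready()
            # record the outcome of this attempt; tenacity inspects it to decide
            # whether to retry
            if not attempt.retry_state.outcome.failed:
                attempt.retry_state.set_result(ready)
    except tenacity.RetryError:
        # the stop condition fired before is_ready() ever returned True
        return False
    return True

In restart(), the is_ready predicate corresponds to checking self.status is ServiceEndpointStatus.up.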
19 changes: 10 additions & 9 deletions tests/test_coordinated_workers/test_coordinator.py
@@ -244,17 +244,18 @@ def test_s3_integration(
assert coordinator._s3_config["insecure"] is (not tls_ca_chain)


def test_tracing_receivers_urls(coordinator_state: State, coordinator_charm: ops.CharmBase):

charm_tracing_relation = Relation(
def test_tracing_receivers_urls(
coordinator_state: testing.State, coordinator_charm: ops.CharmBase
):
charm_tracing_relation = testing.Relation(
endpoint="my-charm-tracing",
remote_app_data={
"receivers": json.dumps(
[{"protocol": {"name": "otlp_http", "type": "http"}, "url": "1.2.3.4:4318"}]
)
},
)
workload_tracing_relation = Relation(
workload_tracing_relation = testing.Relation(
endpoint="my-workload-tracing",
remote_app_data={
"receivers": json.dumps(
@@ -265,11 +266,11 @@ def test_tracing_receivers_urls(coordinator_state: State, coordinator_charm: ops
)
},
)
ctx = Context(coordinator_charm, meta=coordinator_charm.META)
with ctx.manager(
"update-status",
state=coordinator_state.replace(
relations=[charm_tracing_relation, workload_tracing_relation]
ctx = testing.Context(coordinator_charm, meta=coordinator_charm.META)
with ctx(
ctx.on.update_status(),
state=dataclasses.replace(
coordinator_state, relations=[charm_tracing_relation, workload_tracing_relation]
),
) as mgr:
coordinator: Coordinator = mgr.charm.coordinator
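The test changes above track the newer ops.testing (Scenario 7) API: State objects are frozen dataclasses, so derived states are built with dataclasses.replace, and a Context is entered by calling it with an event and a state instead of via ctx.manager(...). A minimal sketch of the new calling convention (MyCharm is a placeholder charm class, not part of this diff):

import dataclasses

import ops
from ops import testing


class MyCharm(ops.CharmBase):
    """Placeholder charm used only to illustrate the testing API."""


ctx = testing.Context(MyCharm, meta={"name": "my-charm"})
base_state = testing.State(leader=True)

# derive a new immutable State rather than mutating the existing one
state_in = dataclasses.replace(base_state, leader=False)

with ctx(ctx.on.update_status(), state_in) as mgr:
    charm = mgr.charm        # inspect the live charm instance mid-event
    state_out = mgr.run()    # emit the event and collect the output state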
7 changes: 5 additions & 2 deletions tests/test_coordinated_workers/test_worker_status.py
@@ -9,7 +9,10 @@
from ops import testing

from cosl.coordinated_workers.interface import ClusterProviderAppData
from cosl.coordinated_workers.worker import Worker, WorkerError
from cosl.coordinated_workers.worker import (
NoReadinessCheckEndpointConfiguredError,
Worker,
)


@pytest.fixture(params=[True, False])
@@ -247,7 +250,7 @@ def test_access_readiness_no_endpoint_raises():
)

# THEN calling .check_readiness raises
with pytest.raises(WorkerError):
with pytest.raises(NoReadinessCheckEndpointConfiguredError):
worker.check_readiness() # noqa


2 changes: 1 addition & 1 deletion tox.ini
@@ -2,7 +2,7 @@
# See LICENSE file for licensing details.

[tox]
envlist = lint, static, unit
envlist = fetch-libs, lint, static, unit
isolated_build=true

[vars]
