From 7f12e09498542ada412232a272c44217a2367971 Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Thu, 18 Jul 2024 16:38:50 +0930 Subject: [PATCH] Workaround ceilometer-agent-compute not running This is a bug in nova-compute and/or ceilometer-agent ( https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 ), where the nova-compute `resume` action can sometimes fail. The workaround added here is to restart ceilometer-agent-compute if it fails for this reason. Fixes: #427 --- cou/apps/core.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/cou/apps/core.py b/cou/apps/core.py index 1581a90c..5bda216d 100644 --- a/cou/apps/core.py +++ b/cou/apps/core.py @@ -18,9 +18,10 @@ from cou.apps.base import LONG_IDLE_TIMEOUT, OpenStackApplication from cou.apps.factory import AppFactory -from cou.exceptions import ApplicationNotSupported +from cou.exceptions import ActionFailed, ApplicationNotSupported from cou.steps import PostUpgradeStep, PreUpgradeStep, UnitUpgradeStep, UpgradeStep -from cou.utils.juju_utils import Unit +from cou.utils import prompt_input +from cou.utils.juju_utils import Model, Unit from cou.utils.nova_compute import verify_empty_hypervisor from cou.utils.openstack import OpenStackRelease @@ -221,6 +222,63 @@ def _get_disable_scheduler_step(self, units: Optional[list[Unit]]) -> list[PreUp for unit in units_to_disable ] + def _get_resume_unit_step(self, unit: Unit, dependent: bool = False) -> UnitUpgradeStep: + """Override the resume unit step, because extra error handling is required. + + :param unit: Unit to be resumed. + :type unit: Unit + :param dependent: Whether the step is dependent of another step, defaults to False + :type dependent: bool, optional + :return: Step to resume a unit. + :rtype: UnitUpgradeStep + """ + # workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 + return UnitUpgradeStep( + description=( + f"Resume the unit: '{unit.name}', restarting ceilometer-agent-compute if needed" + ), + coro=resume_nova_compute_unit(self.model, unit), + dependent=dependent, + ) + + +async def resume_nova_compute_unit(model: Model, unit: Unit) -> None: + """Run the resume action on nova-compute, with workarounds. + + Includes a workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 + + :param model: juju model to work with + :type model: Model + :param unit: nova-compute unit to resume + :type unit: Unit + :raises ActionFailed: when the resume action fails with an unknown failure + """ + action = await model.run_action(unit.name, "resume", raise_on_failure=False) + + # If the action was successful, there is nothing left to do + if action.status == "completed": + return + + # If it failed because of https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 , + # apply the workaround. + if "Services not running that should be: ceilometer-agent-compute" in action.safe_data.get( + "message", "" + ): + logger.debug("Resume failed because ceilometer-agent-compute not running.") + logger.debug("Restarting ceilometer-agent-compute on %s", unit.name) + await model.run_on_unit(unit.name, "sudo systemctl restart ceilometer-agent-compute") + + # Update status manually, otherwise nova-compute and ceilometer-agent + # will be blocked until next update-status hook. + await model.update_status(unit.name) + for subordinate in unit.subordinates: + if subordinate.charm == "ceilometer-agent": + await model.update_status(subordinate.name) + + # Otherwise, it's an unknown error, so raise the exception + else: + raise ActionFailed(action) + @AppFactory.register_application(["swift-proxy", "swift-storage"]) class Swift(OpenStackApplication):