Skip to content

Commit

Permalink
Work around ceilometer-agent-compute not running
Browse files Browse the repository at this point in the history
A bug in nova-compute and/or ceilometer-agent
( https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 )
can sometimes cause the nova-compute `resume` action to fail.
The workaround added here restarts ceilometer-agent-compute
when the action fails for this reason.

Fixes: #427
  • Loading branch information
samuelallan72 committed Jul 22, 2024
1 parent 8ab380d commit 329dc8e
Showing 1 changed file with 59 additions and 2 deletions.
61 changes: 59 additions & 2 deletions cou/apps/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

from cou.apps.base import LONG_IDLE_TIMEOUT, OpenStackApplication
from cou.apps.factory import AppFactory
from cou.exceptions import ApplicationNotSupported
from cou.exceptions import ActionFailed, ApplicationNotSupported
from cou.steps import PostUpgradeStep, PreUpgradeStep, UnitUpgradeStep, UpgradeStep
from cou.utils.juju_utils import Unit
from cou.utils.juju_utils import Model, Unit
from cou.utils.nova_compute import verify_empty_hypervisor
from cou.utils.openstack import OpenStackRelease

Expand Down Expand Up @@ -221,6 +221,63 @@ def _get_disable_scheduler_step(self, units: Optional[list[Unit]]) -> list[PreUp
for unit in units_to_disable
]

def _get_resume_unit_step(self, unit: Unit, dependent: bool = False) -> UnitUpgradeStep:
    """Override the resume unit step, because extra error handling is required.

    The step does not run the plain resume action; it wraps the coroutine
    produced by ``resume_nova_compute_unit``, which also restarts
    ceilometer-agent-compute when the resume fails because of it.

    :param unit: Unit to be resumed.
    :type unit: Unit
    :param dependent: Whether the step is dependent on another step, defaults to False
    :type dependent: bool, optional
    :return: Step to resume a unit.
    :rtype: UnitUpgradeStep
    """
    step_description = (
        f"Resume the unit: '{unit.name}', restarting ceilometer-agent-compute if needed"
    )
    # workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585
    resume_coro = resume_nova_compute_unit(self.model, unit)
    return UnitUpgradeStep(
        description=step_description,
        coro=resume_coro,
        dependent=dependent,
    )


async def resume_nova_compute_unit(model: Model, unit: Unit) -> None:
    """Run the resume action on nova-compute, with workarounds.

    Includes a workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585

    The action is run with ``raise_on_failure=False`` so that its result can be
    inspected here. A failure whose message reports ceilometer-agent-compute as
    not running is handled by restarting that service and refreshing the status
    of the unit and of its ceilometer-agent subordinates. Any other failure is
    raised.

    :param model: juju model to work with
    :type model: Model
    :param unit: nova-compute unit to resume
    :type unit: Unit
    :raises ActionFailed: when the resume action fails with an unknown failure
    """
    result = await model.run_action(unit.name, "resume", raise_on_failure=False)

    # Nothing more to do when the resume action succeeded.
    if result.status == "completed":
        return

    # Anything other than the known ceilometer-agent-compute failure
    # ( https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 )
    # is an unknown error, so raise the exception.
    failure_message = result.safe_data.get("message", "")
    if "Services not running that should be: ceilometer-agent-compute" not in failure_message:
        raise ActionFailed(result)

    # Known failure: apply the workaround.
    logger.debug("Resume failed because ceilometer-agent-compute not running.")
    logger.debug("Restarting ceilometer-agent-compute on %s", unit.name)
    await model.run_on_unit(unit.name, "sudo systemctl restart ceilometer-agent-compute")

    # Update status manually, otherwise nova-compute and ceilometer-agent
    # will be blocked until next update-status hook.
    await model.update_status(unit.name)
    for subordinate in unit.subordinates:
        if subordinate.charm != "ceilometer-agent":
            continue
        await model.update_status(subordinate.name)


@AppFactory.register_application(["swift-proxy", "swift-storage"])
class Swift(OpenStackApplication):
Expand Down

0 comments on commit 329dc8e

Please sign in to comment.