From 79a532165136ab6596032af5c726f2aba6711284 Mon Sep 17 00:00:00 2001 From: Samuel Allan Date: Thu, 18 Jul 2024 16:38:50 +0930 Subject: [PATCH] Workaround ceilometer-agent-compute not running This is a bug in nova-compute and/or ceilometer-agent ( https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 ), where the nova-compute `resume` action can sometimes fail. The workaround added here is to restart ceilometer-agent-compute if it fails for this reason. Fixes: #427 --- cou/apps/core.py | 59 +++++++++++++++++++++++++++- tests/unit/apps/test_core.py | 74 +++++++++++++++++++++++++++++++++++- tests/unit/conftest.py | 1 + 3 files changed, 130 insertions(+), 4 deletions(-) diff --git a/cou/apps/core.py b/cou/apps/core.py index 1581a90c..cdf98f08 100644 --- a/cou/apps/core.py +++ b/cou/apps/core.py @@ -18,9 +18,9 @@ from cou.apps.base import LONG_IDLE_TIMEOUT, OpenStackApplication from cou.apps.factory import AppFactory -from cou.exceptions import ApplicationNotSupported +from cou.exceptions import ActionFailed, ApplicationNotSupported from cou.steps import PostUpgradeStep, PreUpgradeStep, UnitUpgradeStep, UpgradeStep -from cou.utils.juju_utils import Unit +from cou.utils.juju_utils import Model, Unit from cou.utils.nova_compute import verify_empty_hypervisor from cou.utils.openstack import OpenStackRelease @@ -221,6 +221,61 @@ def _get_disable_scheduler_step(self, units: Optional[list[Unit]]) -> list[PreUp for unit in units_to_disable ] + def _get_resume_unit_step(self, unit: Unit, dependent: bool = False) -> UnitUpgradeStep: + """Override the resume unit step, because extra error handling is required. + + :param unit: Unit to be resumed. + :type unit: Unit + :param dependent: Whether the step is dependent of another step, defaults to False + :type dependent: bool, optional + :return: Step to resume a unit. + :rtype: UnitUpgradeStep + """ + # workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 + return UnitUpgradeStep( + description=(f"Resume the unit: '{unit.name}'"), + coro=resume_nova_compute_unit(self.model, unit), + dependent=dependent, + ) + + +async def resume_nova_compute_unit(model: Model, unit: Unit) -> None: + """Run the resume action on nova-compute, with workarounds. + + Includes a workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 + + :param model: juju model to work with + :type model: Model + :param unit: nova-compute unit to resume + :type unit: Unit + :raises ActionFailed: when the resume action fails with an unknown failure + """ + action = await model.run_action(unit.name, "resume", raise_on_failure=False) + + # If the action was successful, there is nothing left to do + if action.status == "completed": + return + + # If it failed because of https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 , + # apply the workaround. + if "Services not running that should be: ceilometer-agent-compute" in action.safe_data.get( + "message", "" + ): + logger.debug("Resume failed because ceilometer-agent-compute not running.") + logger.debug("Restarting ceilometer-agent-compute on %s", unit.name) + await model.run_on_unit(unit.name, "sudo systemctl restart ceilometer-agent-compute") + + # Update status manually, otherwise nova-compute and ceilometer-agent + # will be blocked until next update-status hook. + await model.update_status(unit.name) + for subordinate in unit.subordinates: + if subordinate.charm == "ceilometer-agent": + await model.update_status(subordinate.name) + + # Otherwise, it's an unknown error, so raise the exception + else: + raise ActionFailed(action) + @AppFactory.register_application(["swift-proxy", "swift-storage"]) class Swift(OpenStackApplication): diff --git a/tests/unit/apps/test_core.py b/tests/unit/apps/test_core.py index 900781d6..e4453347 100644 --- a/tests/unit/apps/test_core.py +++ b/tests/unit/apps/test_core.py @@ -11,14 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, Mock, call, patch import pytest from juju.client._definitions import ApplicationStatus, UnitStatus from cou.apps.base import OpenStackApplication -from cou.apps.core import Keystone, NovaCompute, Swift +from cou.apps.core import Keystone, NovaCompute, Swift, resume_nova_compute_unit from cou.exceptions import ( + ActionFailed, ApplicationError, ApplicationNotSupported, HaltUpgradePlanGeneration, @@ -1144,3 +1145,72 @@ def test_core_wrong_channel(model): with pytest.raises(ApplicationError, match=exp_msg): app.generate_upgrade_plan(target, force=False) + + +@pytest.mark.asyncio +async def test_resume_nova_compute_success(model): + """Verify that the success case resumes without calling workarounds.""" + model.run_action.return_value = Mock(status="completed") + unit = Unit( + name="nova-compute/0", + machine=generate_cou_machine("0", "az-0"), + workload_version="21.2.4", + subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")], + ) + + await resume_nova_compute_unit(model, unit) + + model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False) + model.run_on_unit.assert_not_awaited() + model.update_status.assert_not_awaited() + + +@pytest.mark.asyncio +async def test_resume_nova_compute_unknown_failure(model): + """Verify that the unknown failure case bails out.""" + model.run_action.return_value = Mock( + status="failed", safe_data={"message": "it crashed and we don't know why"} + ) + unit = Unit( + name="nova-compute/0", + machine=generate_cou_machine("0", "az-0"), + workload_version="21.2.4", + subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")], + ) + + with pytest.raises(ActionFailed, match="it crashed and we don't know why"): + await resume_nova_compute_unit(model, unit) + + model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False) + model.run_on_unit.assert_not_awaited() + model.update_status.assert_not_awaited() + + +@pytest.mark.asyncio +async def test_resume_nova_compute_ceilometer_failure(model): + """Verify that the ceilometer failure case performs the workaround.""" + model.run_action.return_value = Mock( + status="failed", + safe_data={ + "message": ( + "Action resume failed: Couldn't resume: " + "ceilometer-agent-compute didn't resume cleanly.; " + "Services not running that should be: ceilometer-agent-compute" + ), + }, + ) + unit = Unit( + name="nova-compute/0", + machine=generate_cou_machine("0", "az-0"), + workload_version="21.2.4", + subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")], + ) + + await resume_nova_compute_unit(model, unit) + + model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False) + model.run_on_unit.assert_awaited_once_with( + "nova-compute/0", "sudo systemctl restart ceilometer-agent-compute" + ) + model.update_status.has_awaits(call("nova-compute/0"), call("ceilometer-agent/0")) + assert model.update_status.await_count == 2 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 45251830..b00c5d70 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -36,6 +36,7 @@ def model() -> AsyncMock: model.scp_from_unit = AsyncMock() model.set_application_config = AsyncMock() model.get_application_config = AsyncMock() + model.update_status = AsyncMock() return model