Workaround ceilometer-agent-compute not running

This is a bug in nova-compute and/or ceilometer-agent ( https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 ), where the nova-compute `resume` action can sometimes fail. The workaround added here is to restart ceilometer-agent-compute if it fails for this reason. Fixes: #427
canonical · Jul 22, 2024 · 79a5321 · 79a5321
1 parent 8ab380d
commit 79a5321
Show file tree

Hide file tree

Showing 3 changed files with 130 additions and 4 deletions.
diff --git a/cou/apps/core.py b/cou/apps/core.py
@@ -18,9 +18,9 @@
 
 from cou.apps.base import LONG_IDLE_TIMEOUT, OpenStackApplication
 from cou.apps.factory import AppFactory
-from cou.exceptions import ApplicationNotSupported
+from cou.exceptions import ActionFailed, ApplicationNotSupported
 from cou.steps import PostUpgradeStep, PreUpgradeStep, UnitUpgradeStep, UpgradeStep
-from cou.utils.juju_utils import Unit
+from cou.utils.juju_utils import Model, Unit
 from cou.utils.nova_compute import verify_empty_hypervisor
 from cou.utils.openstack import OpenStackRelease
 
@@ -221,6 +221,61 @@ def _get_disable_scheduler_step(self, units: Optional[list[Unit]]) -> list[PreUp
             for unit in units_to_disable
         ]
 
+    def _get_resume_unit_step(self, unit: Unit, dependent: bool = False) -> UnitUpgradeStep:
+        """Override the resume unit step, because extra error handling is required.
+
+        :param unit: Unit to be resumed.
+        :type unit: Unit
+        :param dependent: Whether the step is dependent of another step, defaults to False
+        :type dependent: bool, optional
+        :return: Step to resume a unit.
+        :rtype: UnitUpgradeStep
+        """
+        # workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585
+        return UnitUpgradeStep(
+            description=(f"Resume the unit: '{unit.name}'"),
+            coro=resume_nova_compute_unit(self.model, unit),
+            dependent=dependent,
+        )
+
+
+async def resume_nova_compute_unit(model: Model, unit: Unit) -> None:
+    """Run the resume action on nova-compute, with workarounds.
+
+    Includes a workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585
+
+    :param model: juju model to work with
+    :type model: Model
+    :param unit: nova-compute unit to resume
+    :type unit: Unit
+    :raises ActionFailed: when the resume action fails with an unknown failure
+    """
+    action = await model.run_action(unit.name, "resume", raise_on_failure=False)
+
+    # If the action was successful, there is nothing left to do
+    if action.status == "completed":
+        return
+
+    # If it failed because of https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 ,
+    # apply the workaround.
+    if "Services not running that should be: ceilometer-agent-compute" in action.safe_data.get(
+        "message", ""
+    ):
+        logger.debug("Resume failed because ceilometer-agent-compute not running.")
+        logger.debug("Restarting ceilometer-agent-compute on %s", unit.name)
+        await model.run_on_unit(unit.name, "sudo systemctl restart ceilometer-agent-compute")
+
+        # Update status manually, otherwise nova-compute and ceilometer-agent
+        # will be blocked until next update-status hook.
+        await model.update_status(unit.name)
+        for subordinate in unit.subordinates:
+            if subordinate.charm == "ceilometer-agent":
+                await model.update_status(subordinate.name)
+
+    # Otherwise, it's an unknown error, so raise the exception
+    else:
+        raise ActionFailed(action)
+
 
 @AppFactory.register_application(["swift-proxy", "swift-storage"])
 class Swift(OpenStackApplication):

diff --git a/tests/unit/apps/test_core.py b/tests/unit/apps/test_core.py
@@ -11,14 +11,15 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, Mock, call, patch
 
 import pytest
 from juju.client._definitions import ApplicationStatus, UnitStatus
 
 from cou.apps.base import OpenStackApplication
-from cou.apps.core import Keystone, NovaCompute, Swift
+from cou.apps.core import Keystone, NovaCompute, Swift, resume_nova_compute_unit
 from cou.exceptions import (
+    ActionFailed,
     ApplicationError,
     ApplicationNotSupported,
     HaltUpgradePlanGeneration,
@@ -1144,3 +1145,72 @@ def test_core_wrong_channel(model):
 
     with pytest.raises(ApplicationError, match=exp_msg):
         app.generate_upgrade_plan(target, force=False)
+
+
+@pytest.mark.asyncio
+async def test_resume_nova_compute_success(model):
+    """Verify that the success case resumes without calling workarounds."""
+    model.run_action.return_value = Mock(status="completed")
+    unit = Unit(
+        name="nova-compute/0",
+        machine=generate_cou_machine("0", "az-0"),
+        workload_version="21.2.4",
+        subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
+    )
+
+    await resume_nova_compute_unit(model, unit)
+
+    model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
+    model.run_on_unit.assert_not_awaited()
+    model.update_status.assert_not_awaited()
+
+
+@pytest.mark.asyncio
+async def test_resume_nova_compute_unknown_failure(model):
+    """Verify that the unknown failure case bails out."""
+    model.run_action.return_value = Mock(
+        status="failed", safe_data={"message": "it crashed and we don't know why"}
+    )
+    unit = Unit(
+        name="nova-compute/0",
+        machine=generate_cou_machine("0", "az-0"),
+        workload_version="21.2.4",
+        subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
+    )
+
+    with pytest.raises(ActionFailed, match="it crashed and we don't know why"):
+        await resume_nova_compute_unit(model, unit)
+
+    model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
+    model.run_on_unit.assert_not_awaited()
+    model.update_status.assert_not_awaited()
+
+
+@pytest.mark.asyncio
+async def test_resume_nova_compute_ceilometer_failure(model):
+    """Verify that the ceilometer failure case performs the workaround."""
+    model.run_action.return_value = Mock(
+        status="failed",
+        safe_data={
+            "message": (
+                "Action resume failed: Couldn't resume: "
+                "ceilometer-agent-compute didn't resume cleanly.; "
+                "Services not running that should be: ceilometer-agent-compute"
+            ),
+        },
+    )
+    unit = Unit(
+        name="nova-compute/0",
+        machine=generate_cou_machine("0", "az-0"),
+        workload_version="21.2.4",
+        subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
+    )
+
+    await resume_nova_compute_unit(model, unit)
+
+    model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
+    model.run_on_unit.assert_awaited_once_with(
+        "nova-compute/0", "sudo systemctl restart ceilometer-agent-compute"
+    )
+    model.update_status.has_awaits(call("nova-compute/0"), call("ceilometer-agent/0"))
+    assert model.update_status.await_count == 2
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
@@ -36,6 +36,7 @@ def model() -> AsyncMock:
     model.scp_from_unit = AsyncMock()
     model.set_application_config = AsyncMock()
     model.get_application_config = AsyncMock()
+    model.update_status = AsyncMock()
 
     return model