Skip to content

Commit

Permalink
Workaround ceilometer-agent-compute not running
Browse files Browse the repository at this point in the history
This is a bug in nova-compute and/or ceilometer-agent
( https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 ),
where the nova-compute `resume` action can sometimes fail.
The workaround added here is to restart ceilometer-agent-compute
if it fails for this reason.

Fixes: #427
  • Loading branch information
samuelallan72 committed Jul 22, 2024
1 parent 8ab380d commit 79a5321
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 4 deletions.
59 changes: 57 additions & 2 deletions cou/apps/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

from cou.apps.base import LONG_IDLE_TIMEOUT, OpenStackApplication
from cou.apps.factory import AppFactory
from cou.exceptions import ApplicationNotSupported
from cou.exceptions import ActionFailed, ApplicationNotSupported
from cou.steps import PostUpgradeStep, PreUpgradeStep, UnitUpgradeStep, UpgradeStep
from cou.utils.juju_utils import Unit
from cou.utils.juju_utils import Model, Unit
from cou.utils.nova_compute import verify_empty_hypervisor
from cou.utils.openstack import OpenStackRelease

Expand Down Expand Up @@ -221,6 +221,61 @@ def _get_disable_scheduler_step(self, units: Optional[list[Unit]]) -> list[PreUp
for unit in units_to_disable
]

def _get_resume_unit_step(self, unit: Unit, dependent: bool = False) -> UnitUpgradeStep:
"""Override the resume unit step, because extra error handling is required.
:param unit: Unit to be resumed.
:type unit: Unit
:param dependent: Whether the step is dependent of another step, defaults to False
:type dependent: bool, optional
:return: Step to resume a unit.
:rtype: UnitUpgradeStep
"""
# workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585
return UnitUpgradeStep(
description=(f"Resume the unit: '{unit.name}'"),
coro=resume_nova_compute_unit(self.model, unit),
dependent=dependent,
)


async def resume_nova_compute_unit(model: Model, unit: Unit) -> None:
"""Run the resume action on nova-compute, with workarounds.
Includes a workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585
:param model: juju model to work with
:type model: Model
:param unit: nova-compute unit to resume
:type unit: Unit
:raises ActionFailed: when the resume action fails with an unknown failure
"""
action = await model.run_action(unit.name, "resume", raise_on_failure=False)

# If the action was successful, there is nothing left to do
if action.status == "completed":
return

# If it failed because of https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 ,
# apply the workaround.
if "Services not running that should be: ceilometer-agent-compute" in action.safe_data.get(
"message", ""
):
logger.debug("Resume failed because ceilometer-agent-compute not running.")
logger.debug("Restarting ceilometer-agent-compute on %s", unit.name)
await model.run_on_unit(unit.name, "sudo systemctl restart ceilometer-agent-compute")

# Update status manually, otherwise nova-compute and ceilometer-agent
# will be blocked until next update-status hook.
await model.update_status(unit.name)
for subordinate in unit.subordinates:
if subordinate.charm == "ceilometer-agent":
await model.update_status(subordinate.name)

# Otherwise, it's an unknown error, so raise the exception
else:
raise ActionFailed(action)


@AppFactory.register_application(["swift-proxy", "swift-storage"])
class Swift(OpenStackApplication):
Expand Down
74 changes: 72 additions & 2 deletions tests/unit/apps/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, MagicMock, Mock, call, patch

import pytest
from juju.client._definitions import ApplicationStatus, UnitStatus

from cou.apps.base import OpenStackApplication
from cou.apps.core import Keystone, NovaCompute, Swift
from cou.apps.core import Keystone, NovaCompute, Swift, resume_nova_compute_unit
from cou.exceptions import (
ActionFailed,
ApplicationError,
ApplicationNotSupported,
HaltUpgradePlanGeneration,
Expand Down Expand Up @@ -1144,3 +1145,72 @@ def test_core_wrong_channel(model):

with pytest.raises(ApplicationError, match=exp_msg):
app.generate_upgrade_plan(target, force=False)


@pytest.mark.asyncio
async def test_resume_nova_compute_success(model):
"""Verify that the success case resumes without calling workarounds."""
model.run_action.return_value = Mock(status="completed")
unit = Unit(
name="nova-compute/0",
machine=generate_cou_machine("0", "az-0"),
workload_version="21.2.4",
subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
)

await resume_nova_compute_unit(model, unit)

model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
model.run_on_unit.assert_not_awaited()
model.update_status.assert_not_awaited()


@pytest.mark.asyncio
async def test_resume_nova_compute_unknown_failure(model):
"""Verify that the unknown failure case bails out."""
model.run_action.return_value = Mock(
status="failed", safe_data={"message": "it crashed and we don't know why"}
)
unit = Unit(
name="nova-compute/0",
machine=generate_cou_machine("0", "az-0"),
workload_version="21.2.4",
subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
)

with pytest.raises(ActionFailed, match="it crashed and we don't know why"):
await resume_nova_compute_unit(model, unit)

model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
model.run_on_unit.assert_not_awaited()
model.update_status.assert_not_awaited()


@pytest.mark.asyncio
async def test_resume_nova_compute_ceilometer_failure(model):
"""Verify that the ceilometer failure case performs the workaround."""
model.run_action.return_value = Mock(
status="failed",
safe_data={
"message": (
"Action resume failed: Couldn't resume: "
"ceilometer-agent-compute didn't resume cleanly.; "
"Services not running that should be: ceilometer-agent-compute"
),
},
)
unit = Unit(
name="nova-compute/0",
machine=generate_cou_machine("0", "az-0"),
workload_version="21.2.4",
subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
)

await resume_nova_compute_unit(model, unit)

model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
model.run_on_unit.assert_awaited_once_with(
"nova-compute/0", "sudo systemctl restart ceilometer-agent-compute"
)
model.update_status.has_awaits(call("nova-compute/0"), call("ceilometer-agent/0"))
assert model.update_status.await_count == 2
1 change: 1 addition & 0 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def model() -> AsyncMock:
model.scp_from_unit = AsyncMock()
model.set_application_config = AsyncMock()
model.get_application_config = AsyncMock()
model.update_status = AsyncMock()

return model

Expand Down

0 comments on commit 79a5321

Please sign in to comment.