Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workaround ceilometer-agent-compute not running #499

Merged
merged 1 commit into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 57 additions & 2 deletions cou/apps/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

from cou.apps.base import LONG_IDLE_TIMEOUT, OpenStackApplication
from cou.apps.factory import AppFactory
from cou.exceptions import ApplicationNotSupported
from cou.exceptions import ActionFailed, ApplicationNotSupported
from cou.steps import PostUpgradeStep, PreUpgradeStep, UnitUpgradeStep, UpgradeStep
from cou.utils.juju_utils import Unit
from cou.utils.juju_utils import Model, Unit
from cou.utils.nova_compute import verify_empty_hypervisor
from cou.utils.openstack import OpenStackRelease

Expand Down Expand Up @@ -221,6 +221,61 @@ def _get_disable_scheduler_step(self, units: Optional[list[Unit]]) -> list[PreUp
for unit in units_to_disable
]

def _get_resume_unit_step(self, unit: Unit, dependent: bool = False) -> UnitUpgradeStep:
"""Override the resume unit step, because extra error handling is required.

:param unit: Unit to be resumed.
:type unit: Unit
:param dependent: Whether the step is dependent of another step, defaults to False
:type dependent: bool, optional
:return: Step to resume a unit.
:rtype: UnitUpgradeStep
"""
# workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585
return UnitUpgradeStep(
description=(f"Resume the unit: '{unit.name}'"),
coro=resume_nova_compute_unit(self.model, unit),
dependent=dependent,
)


async def resume_nova_compute_unit(model: Model, unit: Unit) -> None:
"""Run the resume action on nova-compute, with workarounds.

Includes a workaround for https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585

:param model: juju model to work with
:type model: Model
:param unit: nova-compute unit to resume
:type unit: Unit
:raises ActionFailed: when the resume action fails with an unknown failure
"""
action = await model.run_action(unit.name, "resume", raise_on_failure=False)

# If the action was successful, there is nothing left to do
if action.status == "completed":
return

# If it failed because of https://bugs.launchpad.net/charm-ceilometer-agent/+bug/1947585 ,
# apply the workaround.
if "Services not running that should be: ceilometer-agent-compute" in action.safe_data.get(
"message", ""
):
logger.debug("Resume failed because ceilometer-agent-compute not running.")
logger.debug("Restarting ceilometer-agent-compute on %s", unit.name)
await model.run_on_unit(unit.name, "sudo systemctl restart ceilometer-agent-compute")

# Update status manually, otherwise nova-compute and ceilometer-agent
# will be blocked until next update-status hook.
await model.update_status(unit.name)
for subordinate in unit.subordinates:
if subordinate.charm == "ceilometer-agent":
await model.update_status(subordinate.name)
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved

# Otherwise, it's an unknown error, so raise the exception
else:
raise ActionFailed(action)


@AppFactory.register_application(["swift-proxy", "swift-storage"])
class Swift(OpenStackApplication):
Expand Down
74 changes: 72 additions & 2 deletions tests/unit/apps/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, MagicMock, Mock, call, patch

import pytest
from juju.client._definitions import ApplicationStatus, UnitStatus

from cou.apps.base import OpenStackApplication
from cou.apps.core import Keystone, NovaCompute, Swift
from cou.apps.core import Keystone, NovaCompute, Swift, resume_nova_compute_unit
from cou.exceptions import (
ActionFailed,
ApplicationError,
ApplicationNotSupported,
HaltUpgradePlanGeneration,
Expand Down Expand Up @@ -1144,3 +1145,72 @@ def test_core_wrong_channel(model):

with pytest.raises(ApplicationError, match=exp_msg):
app.generate_upgrade_plan(target, force=False)


@pytest.mark.asyncio
async def test_resume_nova_compute_success(model):
"""Verify that the success case resumes without calling workarounds."""
model.run_action.return_value = Mock(status="completed")
unit = Unit(
name="nova-compute/0",
machine=generate_cou_machine("0", "az-0"),
workload_version="21.2.4",
subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
)

await resume_nova_compute_unit(model, unit)

model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
model.run_on_unit.assert_not_awaited()
model.update_status.assert_not_awaited()


@pytest.mark.asyncio
async def test_resume_nova_compute_unknown_failure(model):
"""Verify that the unknown failure case bails out."""
model.run_action.return_value = Mock(
status="failed", safe_data={"message": "it crashed and we don't know why"}
)
unit = Unit(
name="nova-compute/0",
machine=generate_cou_machine("0", "az-0"),
workload_version="21.2.4",
subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
)

with pytest.raises(ActionFailed, match="it crashed and we don't know why"):
await resume_nova_compute_unit(model, unit)

model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
model.run_on_unit.assert_not_awaited()
model.update_status.assert_not_awaited()


@pytest.mark.asyncio
async def test_resume_nova_compute_ceilometer_failure(model):
"""Verify that the ceilometer failure case performs the workaround."""
model.run_action.return_value = Mock(
status="failed",
safe_data={
"message": (
"Action resume failed: Couldn't resume: "
"ceilometer-agent-compute didn't resume cleanly.; "
"Services not running that should be: ceilometer-agent-compute"
),
},
)
unit = Unit(
name="nova-compute/0",
machine=generate_cou_machine("0", "az-0"),
workload_version="21.2.4",
subordinates=[SubordinateUnit(name="ceilometer-agent/0", charm="ceilometer-agent")],
)

await resume_nova_compute_unit(model, unit)

model.run_action.assert_awaited_once_with("nova-compute/0", "resume", raise_on_failure=False)
model.run_on_unit.assert_awaited_once_with(
"nova-compute/0", "sudo systemctl restart ceilometer-agent-compute"
)
model.update_status.has_awaits(call("nova-compute/0"), call("ceilometer-agent/0"))
assert model.update_status.await_count == 2
1 change: 1 addition & 0 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def model() -> AsyncMock:
model.scp_from_unit = AsyncMock()
model.set_application_config = AsyncMock()
model.get_application_config = AsyncMock()
model.update_status = AsyncMock()

return model

Expand Down
Loading