[ISSUE-1215]: Disk Eject E2E: RKE2 - Auto drive replacement - 1 drive failure out of 2 drives #1217

Merged 4 commits on Jul 24, 2024
Changes from 1 commit
25 changes: 13 additions & 12 deletions tests/e2e-test-framework/framework/utils.py
@@ -403,7 +403,7 @@ def wait_volume(
expected_health (Optional[str], optional): The expected health of the volume. Defaults to None.
expected_usage (Optional[str], optional): The expected usage of the volume. Defaults to None.
expected_operational_status (Optional[str], optional): The expected operational status of the volume. Defaults to None.
timeout (int): The maximum time to wait for the volume in seconds. Defaults to 60.
timeout (int): The maximum time to wait for the volume in seconds. Defaults to 90.

Returns:
bool: True if the volume meets the expected status, health, and usage within the given timeout, False otherwise.
@@ -443,7 +443,7 @@ def wait_drive(
expected_health (Optional[str], optional): The expected health of the drive. Defaults to None.
expected_usage (Optional[str], optional): The expected usage of the drive. Defaults to None.
expected_led_state (Optional[str], optional): The expected LED state of the drive. Defaults to None.
timeout (int): The maximum time to wait for the drive in seconds. Defaults to 60.
timeout (int): The maximum time to wait for the drive in seconds. Defaults to 90.

Returns:
bool: True if the drive meets the expected status, health, and usage within the given timeout, False otherwise.
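Both wait helpers described above follow the same poll-until-timeout pattern: re-read the custom resource, compare every expectation that was supplied, and give up once the deadline passes. A minimal sketch of that pattern, assuming a caller-supplied get_state callable, the new 90-second default, and a 2-second poll interval (the names here are illustrative, not the framework's API):

import time
from typing import Callable, Dict, Optional


def wait_for_state(
    get_state: Callable[[], Dict[str, str]],
    expected: Dict[str, Optional[str]],
    timeout: int = 90,
    interval: int = 2,
) -> bool:
    """Poll get_state() until every non-None expectation matches, or the timeout elapses."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        state = get_state()
        if all(state.get(key) == value for key, value in expected.items() if value is not None):
            return True
        time.sleep(interval)
    return False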
@@ -775,23 +775,24 @@ def clear_pvc_and_pod(
for pvc in pvcs:
assert self.wait_volume(
name=pvc.spec.volume_name,
expected_usage=const.USAGE_RELEASED,
), f"Volume: {pvc.spec.volume_name} failed to reach expected usage: {const.USAGE_RELEASED}"
logging.info(f"volume: {pvc.spec.volume_name} reach expected usage: {const.USAGE_RELEASED}")
expected_usage=','.join([const.USAGE_RELEASED, const.USAGE_IN_USE]),
), f"Volume: {pvc.spec.volume_name} failed to reach expected usage: {','.join([const.USAGE_RELEASED, const.USAGE_IN_USE])}"
logging.info(f"volume: {pvc.spec.volume_name} reach expected usage")

time.sleep(30)
self.recreate_pod(name=pod_name, namespace=namespace)

def check_drive_cr_not_exist(self, drive_name: str, timeout: int = 120) -> bool:
def check_drive_cr_exist_or_not(self, drive_name: str, cr_existence: bool, timeout: int = 120) -> bool:
"""
Checks if a custom resource (CR) representing a drive with the given name does not exist.
Checks if a custom resource (CR) representing a drive with the given name exists or not.

Args:
drive_name (str): The name of the drive CR.
cr_existence (bool): Whether the drive CR should exist (True) or not (False).
timeout (int, optional): The timeout for checking the CR, defaults to 120.

Returns:
bool: True if the drive CR was removed within the given timeout, False otherwise.
bool: True if the drive CR existence matches the expectation within the given timeout, False otherwise.
"""
end_time = time.time() + timeout
while time.time() < end_time:
@@ -803,16 +804,16 @@ def check_drive_cr_not_exist(self, drive_name: str, timeout: int = 120) -> bool:
name=drive_name,
)
logging.warning(f"Drive CR '{drive_name}' still exists.")
if cr_existence:
return True
except ApiException as e:
if e.status == 404:
logging.info(f"Drive CR {drive_name} does not exist.")
return True
if not cr_existence:
return True
else:
raise
time.sleep(2)
logging.warning(
f"Drive CR '{drive_name}' did not reach the expected existence ({cr_existence}) within the {timeout} seconds timeout."
)
return False
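With the new cr_existence flag, the same helper can assert both that a removed drive's CR is gone and that an untouched drive's CR is still present. A minimal usage sketch mirroring the calls in the tests below (the drive names are placeholders; utils is the framework's Utils instance):

# Hypothetical drive names; in the tests they come from the drive CR metadata.
failed_drive_name = "drive-aaaa"
health_drive_name = "drive-bbbb"

# The removed drive's CR should disappear from the cluster.
assert utils.check_drive_cr_exist_or_not(
    drive_name=failed_drive_name,
    cr_existence=False,
), f"Drive CR {failed_drive_name} still exists"

# The healthy drive's CR should still be present.
assert utils.check_drive_cr_exist_or_not(
    drive_name=health_drive_name,
    cr_existence=True,
), f"Drive CR {health_drive_name} does not exist"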


@@ -168,8 +168,9 @@ def test_5921_auto_drive_replacement_with_multiple_volumes_per_pod(self):
# 15. check drive CR successfully removed
for drive in drives:
drive_name = drive["metadata"]["name"]
assert self.utils.check_drive_cr_not_exist(
drive_name=drive_name
assert self.utils.check_drive_cr_exist_or_not(
drive_name=drive_name,
cr_existence=False
), f"Drive CR {drive_name} still exists"
# 16. check for events DriveSuccessfullyRemoved in kubernetes events
for drive in drives:
@@ -0,0 +1,166 @@
import pytest
import logging
from typing import Dict

import framework.const as const

from framework.sts import STS
from framework.utils import Utils
from framework.drive import DriveUtils




class TestAutoDriveReplacementWithMultipleVolumesPerPodSingleFailure:
@classmethod
@pytest.fixture(autouse=True)
def setup_class(
cls,
namespace: str,
drive_utils_executors: Dict[str, DriveUtils],
utils: Utils,
):
cls.namespace = namespace
cls.name = "test-auto-dr-multiple-volumes-single-failure"
cls.timeout = 120
cls.replicas = 1

cls.utils = utils

cls.drive_utils = drive_utils_executors
cls.sts = STS(cls.namespace, cls.name, cls.replicas)
cls.sts.delete()
cls.sts.create(storage_classes=[const.SSD_SC, const.HDD_SC])

yield

cls.sts.delete()

@pytest.mark.hal
def test_5955_auto_drive_replacement_with_multiple_volumes_per_pod_single_failure(self):
# 1. get volumes and corresponding drives for deployed pod
assert (
self.sts.verify(self.timeout) is True
), f"STS: {self.name} failed to reach desired number of replicas: {self.replicas}"
pod = self.utils.list_pods(name_prefix=self.name)[0]
node_ip = self.utils.get_pod_node_ip(
pod_name=pod.metadata.name, namespace=self.namespace
)
volumes = self.utils.list_volumes(pod_name=pod.metadata.name)
# get all drives
drives = []
for volume in volumes:
drive = self.utils.get_drive_cr(
volume_name=volume["metadata"]["name"],
namespace=volume["metadata"]["namespace"])
drives.append(drive)
failed_drive = drives[0]
health_drive = drives[1]
failed_volume = volumes[0]
# 2. simulate drive failure. Annotate drive used by pod with health=BAD
failed_drive_name = failed_drive["metadata"]["name"]
self.utils.annotate_custom_resource(
resource_name=failed_drive_name,
resource_type="drives",
annotation_key="health",
annotation_value="BAD"
)
logging.info(f"drive: {failed_drive_name} was annotated with health=BAD")
# 3. wait until drive health is BAD, status=ONLINE, usage=RELEASING.
logging.info(f"Waiting for drive: {failed_drive_name}")
assert self.utils.wait_drive(
name=failed_drive_name,
expected_status=const.STATUS_ONLINE,
expected_health=const.HEALTH_BAD,
expected_usage=const.USAGE_RELEASING
), f"Drive {failed_drive_name} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}"
logging.info(f"drive {failed_drive_name} went in Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}")
# 4. wait until volume health is BAD, status=OPERATIVE, usage=RELEASING.
failed_volume_name = failed_volume["metadata"]["name"]
logging.info(f"Waiting for volume: {failed_volume_name}")
assert self.utils.wait_volume(
name=failed_volume_name,
expected_health=const.HEALTH_BAD,
expected_usage=const.USAGE_RELEASING,
expected_operational_status=const.STATUS_OPERATIVE
), f"Volume {failed_volume_name} failed to reach OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}"
logging.info(f"volume {failed_volume_name} went in OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}")
# 5. check events and locate event related to DriveHealthFailure
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_HEALTH_FAILURE,
), f"event {const.DRIVE_HEALTH_FAILURE} for drive {failed_drive_name} not found"
# 6. annotate volume with release=done
self.utils.annotate_custom_resource(
resource_name=failed_volume_name,
resource_type="volumes",
annotation_key="release",
annotation_value="done",
namespace=volume['metadata']['namespace']
)
logging.info(f"volume: {failed_volume_name} was annotated with release=done")
# 7. check drive usages are RELEASED
assert self.utils.wait_drive(
name=failed_drive_name,
expected_usage=const.USAGE_RELEASED
), f"Drive {failed_drive_name} failed to reach expected Usage: {const.USAGE_RELEASED}"
logging.info(f"drive {failed_drive_name} went in Usage: {const.USAGE_RELEASED}")
# 8. check volumes are RELEASED
assert self.utils.wait_volume(
name=failed_volume_name,
expected_usage=const.USAGE_RELEASED
), f"Volume {failed_volume_name} failed to reach expected Usage {const.USAGE_RELEASED}"
logging.info(f"volume {failed_volume_name} went in Usage: {const.USAGE_RELEASED}")
# 9. check event DriveReadyForRemoval is generated
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_READY_FOR_REMOVAL,
), f"event {const.DRIVE_READY_FOR_REMOVAL} for drive {failed_drive_name} not found"
# 10. check events and locate event related to VolumeBadHealth
assert self.utils.event_in(
resource_name=failed_volume_name,
reason=const.VOLUME_BAD_HEALTH,
), f"event {const.VOLUME_BAD_HEALTH} for volume {failed_volume_name} not found"
# 11. delete pod and pvc
self.utils.clear_pvc_and_pod(pod_name=pod.metadata.name, namespace=self.namespace)
# 12. check drive Usage to be REMOVED, Status to be ONLINE, and LED state to be 1 (if drive supports LED) or 2 (if drive does not support LED) #TODO: LED state 2 => another test case
assert self.utils.wait_drive(
name=failed_drive_name,
expected_status=const.STATUS_ONLINE,
expected_usage=const.USAGE_REMOVED,
expected_health=const.HEALTH_BAD,
expected_led_state=const.LED_STATE
), f"Drive {failed_drive_name} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState: {drive["spec"]["LEDState"]}"
logging.info(f"drive {failed_drive_name} went in Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState: {drive["spec"]["LEDState"]}")
# 13. check for events: DriveReadyForPhysicalRemoval
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_READY_FOR_PHYSICAL_REMOVAL,
), f"event {const.DRIVE_READY_FOR_PHYSICAL_REMOVAL} for drive {failed_drive_name} not found"
# 14. get Node ID on which drive resides, obtain path for affected drive, identify node name for corresponding node id and remove drive
failed_drive_path = failed_drive["spec"]["Path"]
assert failed_drive_path, f"Drive path for drive {failed_drive_name} not found"
logging.info(f"drive_path: {failed_drive_path}")

scsi_id = self.drive_utils[node_ip].get_scsi_id(failed_drive_path)
assert scsi_id, f"scsi_id for drive {failed_drive_name} not found"
logging.info(f"scsi_id: {scsi_id}")

self.drive_utils[node_ip].remove(scsi_id)
logging.info(f"drive {failed_drive_path}, {scsi_id} removed")
# 15. check the failed drive CR is successfully removed -> only that one is removed, the healthy drive CR should still be on the cluster
assert self.utils.check_drive_cr_exist_or_not(
drive_name=failed_drive_name,
cr_existence=False
), f"Drive CR {failed_drive_name} still exists"

health_drive_name = health_drive['metadata']['name']
assert self.utils.check_drive_cr_exist_or_not(
drive_name=health_drive_name,
cr_existence=True,
), f"Drive CR {health_drive_name} does not exist"
# 16. check for events DriveSuccessfullyRemoved in kubernetes events
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_SUCCESSFULLY_REMOVED,
), f"event {const.DRIVE_SUCCESSFULLY_REMOVED} for drive {failed_drive_name} not found"