[ISSUE-1215]: Disk Eject E2E: RKE2 - Auto drive replacement - 1 drive failure out of 2 drives #1217

Merged 4 commits on Jul 24, 2024
Changes from 1 commit
25 changes: 13 additions & 12 deletions tests/e2e-test-framework/framework/utils.py
@@ -403,7 +403,7 @@ def wait_volume(
expected_health (Optional[str], optional): The expected health of the volume. Defaults to None.
expected_usage (Optional[str], optional): The expected usage of the volume. Defaults to None.
expected_operational_status (Optional[str], optional): The expected operational status of the volume. Defaults to None.
timeout (int): The maximum time to wait for the volume in seconds. Defaults to 60.
timeout (int): The maximum time to wait for the volume in seconds. Defaults to 90.

Returns:
bool: True if the volume meets the expected status, health, and usage within the given timeout, False otherwise.
@@ -443,7 +443,7 @@ def wait_drive(
expected_health (Optional[str], optional): The expected health of the drive. Defaults to None.
expected_usage (Optional[str], optional): The expected usage of the drive. Defaults to None.
expected_led_state (Optional[str], optional): The expected LED state of the drive. Defaults to None.
timeout (int): The maximum time to wait for the drive in seconds. Defaults to 60.
timeout (int): The maximum time to wait for the drive in seconds. Defaults to 90.

Returns:
bool: True if the drive meets the expected status, health, and usage within the given timeout, False otherwise.
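Both wait helpers described above follow the same poll-until-timeout pattern: re-read the custom resource, compare every expectation that was supplied, and give up once the deadline passes. A minimal sketch of that pattern, assuming a caller-supplied get_state callable, the new 90-second default, and a 2-second poll interval (the names here are illustrative, not the framework's API):

import time
from typing import Callable, Dict, Optional


def wait_for_state(
    get_state: Callable[[], Dict[str, str]],
    expected: Dict[str, Optional[str]],
    timeout: int = 90,
    interval: int = 2,
) -> bool:
    """Poll get_state() until every non-None expectation matches, or the timeout elapses."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        state = get_state()
        if all(state.get(key) == value for key, value in expected.items() if value is not None):
            return True
        time.sleep(interval)
    return False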
@@ -775,23 +775,24 @@ def clear_pvc_and_pod(
for pvc in pvcs:
assert self.wait_volume(
name=pvc.spec.volume_name,
expected_usage=const.USAGE_RELEASED,
), f"Volume: {pvc.spec.volume_name} failed to reach expected usage: {const.USAGE_RELEASED}"
logging.info(f"volume: {pvc.spec.volume_name} reach expected usage: {const.USAGE_RELEASED}")
expected_usage=','.join([const.USAGE_RELEASED, const.USAGE_IN_USE]),
), f"Volume: {pvc.spec.volume_name} failed to reach expected usage: {','.join([const.USAGE_RELEASED, const.USAGE_IN_USE])}"
logging.info(f"volume: {pvc.spec.volume_name} reach expected usage")

time.sleep(30)
self.recreate_pod(name=pod_name, namespace=namespace)

def check_drive_cr_not_exist(self, drive_name: str, timeout: int = 120) -> bool:
def check_drive_cr_exist_or_not(self, drive_name: str, cr_existence: bool, timeout: int = 120) -> bool:
"""
Checks if a custom resource (CR) representing a drive with the given name does not exist.
Checks if a custom resource (CR) representing a drive with the given name exists or not.

Args:
drive_name (str): The name of the drive CR.
cr_existence (bool): Whether the drive CR should exist (True) or not (False).
timeout (int, optional): The timeout for checking the CR, defaults to 120.

Returns:
bool: True if the drive CR was removed within the given timeout, False otherwise.
bool: True if the drive CR existence matches the expectation within the given timeout, False otherwise.
"""
end_time = time.time() + timeout
while time.time() < end_time:
@@ -803,16 +804,16 @@ def check_drive_cr_not_exist(self, drive_name: str, timeout: int = 120) -> bool:
name=drive_name,
)
logging.warning(f"Drive CR '{drive_name}' still exists.")
if cr_existence:
return True
except ApiException as e:
if e.status == 404:
logging.info(f"Drive CR {drive_name} does not exist.")
return True
if not cr_existence:
return True
else:
raise
time.sleep(2)
logging.warning(
f"Drive CR '{drive_name}' did not reach the expected existence ({cr_existence}) within the {timeout} seconds timeout."
)
return False
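With the new cr_existence flag, the same helper can assert both that a removed drive's CR is gone and that an untouched drive's CR is still present. A minimal usage sketch mirroring the calls in the tests below (the drive names are placeholders; utils is the framework's Utils instance):

# Hypothetical drive names; in the tests they come from the drive CR metadata.
failed_drive_name = "drive-aaaa"
health_drive_name = "drive-bbbb"

# The removed drive's CR should disappear from the cluster.
assert utils.check_drive_cr_exist_or_not(
    drive_name=failed_drive_name,
    cr_existence=False,
), f"Drive CR {failed_drive_name} still exists"

# The healthy drive's CR should still be present.
assert utils.check_drive_cr_exist_or_not(
    drive_name=health_drive_name,
    cr_existence=True,
), f"Drive CR {health_drive_name} does not exist"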


@@ -168,8 +168,9 @@ def test_5921_auto_drive_replacement_with_multiple_volumes_per_pod(self):
# 15. check drive CR successfully removed
for drive in drives:
drive_name = drive["metadata"]["name"]
assert self.utils.check_drive_cr_not_exist(
drive_name=drive_name
assert self.utils.check_drive_cr_exist_or_not(
drive_name=drive_name,
cr_existence=False
), f"Drive CR {drive_name} still exists"
# 16. check for events DriveSuccessfullyRemoved in kubernetes events
for drive in drives:
@@ -0,0 +1,166 @@
import pytest
import logging
from typing import Dict

import framework.const as const

from framework.sts import STS
from framework.utils import Utils
from framework.drive import DriveUtils




class TestAutoDriveReplacementWithMultipleVolumesPerPodSingleFailure:
@classmethod
@pytest.fixture(autouse=True)
def setup_class(
cls,
namespace: str,
drive_utils_executors: Dict[str, DriveUtils],
utils: Utils,
):
cls.namespace = namespace
cls.name = "test-auto-dr-multiple-volumes-single-failure"
cls.timeout = 120
cls.replicas = 1

cls.utils = utils

cls.drive_utils = drive_utils_executors
cls.sts = STS(cls.namespace, cls.name, cls.replicas)
cls.sts.delete()
cls.sts.create(storage_classes=[const.SSD_SC, const.HDD_SC])

yield

cls.sts.delete()

@pytest.mark.hal
def test_5955_auto_drive_replacement_with_multiple_volumes_per_pod_single_failure(self):
# 1. get volumes and corresponding drives for deployed pod
assert (
self.sts.verify(self.timeout) is True
), f"STS: {self.name} failed to reach desired number of replicas: {self.replicas}"
pod = self.utils.list_pods(name_prefix=self.name)[0]
node_ip = self.utils.get_pod_node_ip(
pod_name=pod.metadata.name, namespace=self.namespace
)
volumes = self.utils.list_volumes(pod_name=pod.metadata.name)
# get all drives
drives = []
for volume in volumes:
drive = self.utils.get_drive_cr(
volume_name=volume["metadata"]["name"],
namespace=volume["metadata"]["namespace"])
drives.append(drive)
failed_drive = drives[0]
health_drive = drives[1]
failed_volume = volumes[0]
# 2. simulate drive failure. Annotate drive used by pod with health=BAD
failed_drive_name = failed_drive["metadata"]["name"]
self.utils.annotate_custom_resource(
resource_name=failed_drive_name,
resource_type="drives",
annotation_key="health",
annotation_value="BAD"
)
logging.info(f"drive: {failed_drive_name} was annotated with health=BAD")
# 3. wait until drive health is BAD, status=ONLINE, usage=RELEASING.
logging.info(f"Waiting for drive: {failed_drive_name}")
assert self.utils.wait_drive(
name=failed_drive_name,
expected_status=const.STATUS_ONLINE,
expected_health=const.HEALTH_BAD,
expected_usage=const.USAGE_RELEASING
), f"Drive {failed_drive_name} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}"
logging.info(f"drive {failed_drive_name} went in Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}")
# 4. wait until volume health is BAD, status=OPERATIVE, usage=RELEASING.
failed_volume_name = failed_volume["metadata"]["name"]
logging.info(f"Waiting for volume: {failed_volume_name}")
assert self.utils.wait_volume(
name=failed_volume_name,
expected_health=const.HEALTH_BAD,
expected_usage=const.USAGE_RELEASING,
expected_operational_status=const.STATUS_OPERATIVE
), f"Volume {failed_volume_name} failed to reach OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}"
logging.info(f"volume {failed_volume_name} went in OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}")
# 5. check events and locate event related to DriveHealthFailure
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_HEALTH_FAILURE,
), f"event {const.DRIVE_HEALTH_FAILURE} for drive {failed_drive_name} not found"
# 6. annotate volume with release=done
self.utils.annotate_custom_resource(
resource_name=failed_volume_name,
resource_type="volumes",
annotation_key="release",
annotation_value="done",
namespace=volume['metadata']['namespace']
)
logging.info(f"volume: {failed_volume_name} was annotated with release=done")
# 7. check drive usages are RELEASED
assert self.utils.wait_drive(
name=failed_drive_name,
expected_usage=const.USAGE_RELEASED
), f"Drive {failed_drive_name} failed to reach expected Usage: {const.USAGE_RELEASED}"
logging.info(f"drive {failed_drive_name} went in Usage: {const.USAGE_RELEASED}")
# 8. check volumes are RELEASED
assert self.utils.wait_volume(
name=failed_volume_name,
expected_usage=const.USAGE_RELEASED
), f"Volume {failed_volume_name} failed to reach expected Usage {const.USAGE_RELEASED}"
logging.info(f"volume {failed_volume_name} went in Usage: {const.USAGE_RELEASED}")
# 9. check event DriveReadyForRemoval is generated
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_READY_FOR_REMOVAL,
), f"event {const.DRIVE_READY_FOR_REMOVAL} for drive {failed_drive_name} not found"
# 10. check events and locate event related to VolumeBadHealth
assert self.utils.event_in(
resource_name=failed_volume_name,
reason=const.VOLUME_BAD_HEALTH,
), f"event {const.VOLUME_BAD_HEALTH} for volume {failed_volume_name} not found"
# 11. delete pod and pvc
self.utils.clear_pvc_and_pod(pod_name=pod.metadata.name, namespace=self.namespace)
# 12. check drive Usage to be REMOVED, Status to be ONLINE, and LED state to be 1 (if drive supports LED) or 2 (if drive does not support LED) #TODO: LED state 2 => another test case
assert self.utils.wait_drive(
name=failed_drive_name,
expected_status=const.STATUS_ONLINE,
expected_usage=const.USAGE_REMOVED,
expected_health=const.HEALTH_BAD,
expected_led_state=const.LED_STATE
), f"Drive {failed_drive_name} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState: {drive["spec"]["LEDState"]}"
logging.info(f"drive {failed_drive_name} went in Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState: {drive["spec"]["LEDState"]}")
# 13. check for events: DriveReadyForPhysicalRemoval
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_READY_FOR_PHYSICAL_REMOVAL,
), f"event {const.DRIVE_READY_FOR_PHYSICAL_REMOVAL} for drive {failed_drive_name} not found"
# 14. get Node ID on which drive resides, obtain path for affected drive, identify node name for corresponding node id and remove drive
failed_drive_path = failed_drive["spec"]["Path"]
assert failed_drive_path, f"Drive path for drive {failed_drive_name} not found"
logging.info(f"drive_path: {failed_drive_path}")

scsi_id = self.drive_utils[node_ip].get_scsi_id(failed_drive_path)
assert scsi_id, f"scsi_id for drive {failed_drive_name} not found"
logging.info(f"scsi_id: {scsi_id}")

self.drive_utils[node_ip].remove(scsi_id)
logging.info(f"drive {failed_drive_path}, {scsi_id} removed")
# 15. check the failed drive CR is successfully removed -> only that one is removed, the healthy drive CR should still be on the cluster
assert self.utils.check_drive_cr_exist_or_not(
drive_name=failed_drive_name,
cr_existence=False
), f"Drive CR {failed_drive_name} still exists"

health_drive_name = health_drive['metadata']['name']
assert self.utils.check_drive_cr_exist_or_not(
drive_name=health_drive_name,
cr_existence=True,
), f"Drive CR {health_drive_name} does not exist"
# 16. check for events DriveSuccessfullyRemoved in kubernetes events
assert self.utils.event_in(
resource_name=failed_drive_name,
reason=const.DRIVE_SUCCESSFULLY_REMOVED,
), f"event {const.DRIVE_SUCCESSFULLY_REMOVED} for drive {failed_drive_name} not found"