From da1c97d0ee531aeee5340126dbc27ed1b4baab54 Mon Sep 17 00:00:00 2001
From: Bo Wu
Date: Tue, 3 Dec 2024 09:51:07 -0800
Subject: [PATCH] reduce cost

---
 testsuite/replay-verify/archive_disk_utils.py | 27 ++++++++++++----
 testsuite/replay-verify/main.py               | 31 +++++++++++++------
 2 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/testsuite/replay-verify/archive_disk_utils.py b/testsuite/replay-verify/archive_disk_utils.py
index bb1e67a1d6060f..2ccf4424d0c8ca 100644
--- a/testsuite/replay-verify/archive_disk_utils.py
+++ b/testsuite/replay-verify/archive_disk_utils.py
@@ -22,6 +22,10 @@
 MAINNET_SNAPSHOT_NAME = "mainnet-archive"
 
 
+def get_region_from_zone(zone):
+    return zone.rsplit("-", 1)[0]
+
+
 def get_kubectl_credentials(project_id, region, cluster_name):
     try:
         # Command to get kubectl credentials for the cluster
@@ -141,6 +145,8 @@ def create_snapshot_with_gcloud(
         source_disk_link,
         "--project",
         target_project,
+        "--storage-location",
+        get_region_from_zone(source_zone),
     ]
 
     try:
@@ -199,7 +205,7 @@ def create_disk_pv_pvc_from_snapshot(
     wait_for_operation(project, zone, operation.name, compute_v1.ZoneOperationsClient())
     logger.info(f"Disk {disk_name} created from snapshot {og_snapshot_name}.")
 
-    region_name = zone.rsplit("-", 1)[0]
+    region_name = get_region_from_zone(zone)
     get_kubectl_credentials(project, region_name, cluster_name)
     # create_persistent_volume(disk_name, pv_name, pvc_name, namespace, True)
     # this is only for xfs replaying logs to repair the disk
@@ -228,6 +234,17 @@ def create_disk_pv_pvc_from_snapshot(
     time.sleep(10)
     logger.info(f"creating final snapshot")
     create_snapshot_with_gcloud(snapshot_name, project, disk_name, zone, project)
+    logger.info("deleting repair pvc and correpsonding pv and disks")
+    # delete the PVC used for repair
+    try:
+        client.CoreV1Api().delete_namespaced_persistent_volume_claim(
+            name=repair_pvc, namespace=namespace
+        )
+    except ApiException as e:
+        if e.status == 404:
+            print(f"PersistentVolumeClaim '{repair_pvc}' not found.")
+        else:
+            print(f"Exception when deleting PersistentVolumeClaim: {e}")
 
 
 def is_job_pod_cleanedup(namespace, job_name):
@@ -298,7 +315,7 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only)
                 fs_type="xfs",
                 read_only=read_only,
             ),
-            persistent_volume_reclaim_policy="Retain",
+            persistent_volume_reclaim_policy="Delete",
             storage_class_name="standard",
         ),
     )
@@ -427,7 +444,7 @@ def create_pvcs_from_snapshot(run_id, snapshot_name, namespace, pvc_num, label):
     return res
 
 
-def create_disk_pv_pvc(
+def create_repair_disk_and_its_snapshot(
     project, zone, cluster_name, og_snapshot_name, snapshot_name, prefix, namespace
 ):
     tasks = []
@@ -462,8 +479,6 @@ def create_disk_pv_pvc(
         except Exception as e:
             logger.error(f"Task generated an exception: {e}")
 
-    # start a self deleteing job to mount the xfs disks for repairing
-
 
 def parse_args():
     parser = argparse.ArgumentParser(
@@ -506,7 +521,7 @@ def parse_args():
         source_namespace,
         project_id,
     )
-    create_disk_pv_pvc(
+    create_repair_disk_and_its_snapshot(
         project_id,
         zone,
         cluster_name,
diff --git a/testsuite/replay-verify/main.py b/testsuite/replay-verify/main.py
index e8b29e363b4ee5..e3cb4acc0b26d1 100644
--- a/testsuite/replay-verify/main.py
+++ b/testsuite/replay-verify/main.py
@@ -28,6 +28,7 @@
 
 REPLAY_CONCURRENCY_LEVEL = 1
 
+
 class Network(Enum):
     TESTNET = 1
     MAINNET = 2
@@ -241,6 +242,7 @@ def get_pod_status(self):
     def get_humio_log_link(self):
         return construct_humio_url(self.label, self.name, self.start_time, time.time())
 
+
 class ReplayConfig:
     def __init__(self, network):
         if network == Network.TESTNET:
@@ -253,9 +255,10 @@ def __init__(self, network):
             self.concurrent_replayer = 18
             self.pvc_number = 8
             self.min_range_size = 10_000
-            self.range_size = 2_000_000 
+            self.range_size = 2_000_000
             self.timeout_secs = 400
 
+
 class TaskStats:
     def __init__(self, name):
         self.name = name
@@ -308,7 +311,7 @@ def __init__(
         self.image = image
         self.pvcs = []
         self.config = replay_config
-        
+
     def __str__(self):
         return f"""ReplayScheduler:
         id: {self.id}
@@ -360,7 +363,11 @@ def create_pvc_from_snapshot(self):
             else MAINNET_SNAPSHOT_NAME
         )
         pvcs = create_pvcs_from_snapshot(
-            self.id, snapshot_name, self.namespace, self.config.pvc_number, self.get_label()
+            self.id,
+            snapshot_name,
+            self.namespace,
+            self.config.pvc_number,
+            self.get_label(),
         )
         assert len(pvcs) == self.config.pvc_number, "failed to create all pvcs"
         self.pvcs = pvcs
@@ -504,12 +511,16 @@ def get_image(image_tag=None):
     shell = forge.LocalShell()
     git = forge.Git(shell)
     image_name = "tools"
-    default_latest_image = forge.find_recent_images(
-        shell,
-        git,
-        1,
-        image_name=image_name,
-    )[0] if image_tag is None else image_tag
+    default_latest_image = (
+        forge.find_recent_images(
+            shell,
+            git,
+            1,
+            image_name=image_name,
+        )[0]
+        if image_tag is None
+        else image_tag
+    )
     full_image = f"{forge.GAR_REPO_NAME}/{image_name}:{default_latest_image}"
     return full_image
 
@@ -546,7 +557,7 @@ def print_logs(failed_workpod_logs, txn_mismatch_logs):
         range_size=range_size,
         image=image,
        replay_config=config,
-        network= network,
+        network=network,
         namespace=args.namespace,
     )
     logger.info(f"scheduler: {scheduler}")
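
For context (an illustrative sketch, not part of the commit above): Compute Engine stores a snapshot in the multi-region closest to the source disk unless a storage location is given, so passing --storage-location with the region derived from the source zone pins each snapshot to a single region, which is presumably the cost reduction the subject line refers to. The sketch below mirrors the zone-to-region mapping assumed by the new get_region_from_zone helper; the example zones are hypothetical inputs, not values taken from the patch.

# Illustrative sketch only: mirrors the helper added in archive_disk_utils.py.
def get_region_from_zone(zone: str) -> str:
    # GCE zone names are "<region>-<zone suffix>", e.g. "us-central1-a".
    return zone.rsplit("-", 1)[0]


if __name__ == "__main__":
    for zone in ("us-central1-a", "europe-west1-b", "asia-east1-c"):
        # e.g. "us-central1-a -> us-central1"
        print(f"{zone} -> {get_region_from_zone(zone)}")

The reclaim-policy switch from Retain to Delete appears to serve the same goal: when the repair PV is released, Kubernetes deletes the backing GCE disk instead of leaving it, and its storage cost, behind.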