From 0d05b2a07201fac8f2db878def10603eea4e34b8 Mon Sep 17 00:00:00 2001 From: Christian Kadner Date: Wed, 1 Dec 2021 10:07:01 -0800 Subject: [PATCH] Load performance test pipelines from files * Add perf_test_config.yaml with paths of test pipelines * Add a 'Status' column to output CSV file * Update compile_pyfile to remove loaded pipeline module after compilation to allow subsequent imports of another module with the same name Resolves #778 Signed-off-by: Christian Kadner --- sdk/python/kfp_tekton/compiler/main.py | 1 + sdk/python/tests/perf_test_config.yaml | 39 ++++ .../tests/{compiler => }/performance_tests.py | 205 +++++++++++------- 3 files changed, 165 insertions(+), 80 deletions(-) create mode 100644 sdk/python/tests/perf_test_config.yaml rename sdk/python/tests/{compiler => }/performance_tests.py (65%) diff --git a/sdk/python/kfp_tekton/compiler/main.py b/sdk/python/kfp_tekton/compiler/main.py index b24fb45804..e0baa4e16f 100644 --- a/sdk/python/kfp_tekton/compiler/main.py +++ b/sdk/python/kfp_tekton/compiler/main.py @@ -82,6 +82,7 @@ def compile_pyfile(pyfile, function_name, output_path, type_check, tekton_pipeli _compile_pipeline_function(pipeline_funcs, function_name, output_path, type_check, tekton_pipeline_conf) finally: del sys.path[0] + sys.modules.pop(os.path.splitext(filename)[0]) def main(): diff --git a/sdk/python/tests/perf_test_config.yaml b/sdk/python/tests/perf_test_config.yaml new file mode 100644 index 0000000000..0776c33447 --- /dev/null +++ b/sdk/python/tests/perf_test_config.yaml @@ -0,0 +1,39 @@ +# Copyright 2021 kubeflow.org +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pipelines to be loaded in performance_test.py +# relative paths are assumed to be relative to the project root +pipeline_scripts: +# - name: "Sequential" +# path: "sdk/python/tests/compiler/testdata/sequential.py" +# - name: "Flip-Coin-testdata" +# path: "sdk/python/tests/compiler/testdata/condition.py" +# - name: "Retry" +# path: "sdk/python/tests/compiler/testdata/retry.py" +# - name: "Loop-Static" +# path: "sdk/python/tests/compiler/testdata/loop_static.py" +# - name: "Conditions-and-Loops" +# path: "sdk/python/tests/compiler/testdata/conditions_and_loops.py" +# - name: "With-item-Nested" +# path: "sdk/python/tests/compiler/testdata/withitem_nested.py" +# - name: "Condition-and-recur" +# path: "sdk/python/tests/compiler/testdata/cond_recur.py" + - name: "Flip-Coin-samples" + path: "samples/flip-coin/condition.py" # condition with simple python + - name: "Flip-Coin-Custom-Task" + path: "samples/flip-coin-custom-task/condition.py" # custom task condition with simple python + - path: "samples/lightweight-component/calc_pipeline.py" # python in script with package install + - path: "samples/trusted-ai/trusted-ai.py" # long-running python component with S3 file passing + - path: "samples/nested-loops/withitem_nested.py" # nested loop pipelines, many CRDs + - path: "sdk/python/tests/compiler/testdata/cond_recur.yaml" # calling itself 42 times recursively, controller cannot estimate duration diff --git a/sdk/python/tests/compiler/performance_tests.py b/sdk/python/tests/performance_tests.py similarity index 65% rename from sdk/python/tests/compiler/performance_tests.py rename to sdk/python/tests/performance_tests.py index 8e6f10f22f..49c6e072df 100755 --- a/sdk/python/tests/compiler/performance_tests.py +++ b/sdk/python/tests/performance_tests.py @@ -18,20 +18,22 @@ import datetime import functools import os -import sys # noqa import tempfile import time import threading import json +import yaml from collections import defaultdict from datetime import datetime as dt from datetime import timedelta from os import environ as env -from typing import Callable, Dict, Mapping +from os.path import pathsep +from pathlib import Path +from typing import Dict, Mapping from kfp_server_api import ApiException, ApiRun, ApiRunDetail -from kfp_tekton.compiler import TektonCompiler +from kfp_tekton.compiler.main import compile_pyfile from kfp_tekton._client import TektonClient from kfp_tekton.compiler.pipeline_utils import TektonPipelineConf @@ -40,12 +42,13 @@ # load test settings from environment variables # ============================================================================= -# TODO: turn env vars into script parameters +# TODO: turn env vars into script parameters, use argparse PUBLIC_IP = env.get("PUBLIC_IP") NAMESPACE = env.get("NAMESPACE", None) USER_INFO = env.get("USER_INFO") CONNECT_SID = env.get("CONNECT_SID") NUM_WORKERS = int(env.get("NUM_WORKERS", 1)) +TEST_CONFIG = env.get("TEST_CONFIG") or Path(__file__).parents[0].joinpath("perf_test_config.yaml") EXPERIMENT = env.get("EXPERIMENT_NAME", "PERF_TEST") OUTPUT_FILE = env.get("OUTPUT_FILE", f"perf_test_{dt.now().strftime('%Y%m%d_%H%M%S')}_N{NUM_WORKERS}_{PUBLIC_IP}.csv") OUTPUT_SEP = env.get("OUTPUT_SEP", ",") @@ -57,6 +60,7 @@ f" USER_INFO: {USER_INFO}\n" f" CONNECT_SID: {CONNECT_SID}\n" f" NUM_WORKERS: {NUM_WORKERS}\n" + f" TEST_CONFIG: {TEST_CONFIG}\n" f" EXPERIMENT: {EXPERIMENT}\n" f" OUTPUT_FILE: {OUTPUT_FILE}\n" f" OUTPUT_SEP: {OUTPUT_SEP}\n") @@ -66,8 +70,6 @@ # local variables # 
============================================================================= -# kfp_tekton_root_dir = os.path.abspath(__file__).replace("sdk/python/tests/compiler/performance_tests.py", "") - execution_times: Dict[str, Dict[str, timedelta]] = defaultdict(dict) @@ -131,15 +133,24 @@ def _synchronized_function(*args, **kwargs): @time_it # time_it inside the synchronized block so idle wait is not recorded def compile_pipeline(*, # force kwargs for time_it decorator to get pipeline_name pipeline_name: str, - pipeline_func: Callable) -> str: + pipeline_script: Path) -> str: file_name = pipeline_name + '.yaml' tmpdir = tempfile.gettempdir() # TODO: keep compiled pipelines? pipeline_package_path = os.path.join(tmpdir, file_name) pipeline_conf = TektonPipelineConf() - TektonCompiler().compile(pipeline_func=pipeline_func, - package_path=pipeline_package_path, - pipeline_conf=pipeline_conf) + + try: + compile_pyfile(pyfile=pipeline_script, + function_name=None, + output_path=pipeline_package_path, + type_check=True, + tekton_pipeline_conf=pipeline_conf) + + except ValueError as e: + print(f"{e.__class__.__name__} trying to compile {pipeline_script}: {str(e)}") + + # TODO: delete those files after running test or keep for inspection? return pipeline_package_path @@ -151,27 +162,35 @@ def submit_pipeline_run(*, # force kwargs for time_it decorator to get pipeline client = get_client() experiment = client.create_experiment(EXPERIMENT) # get or create - run_result = None - while run_result is None: # TODO: add timeout or max retries on ApiException - try: - run_result: ApiRun = client.run_pipeline( - experiment_id=experiment.id, - job_name=pipeline_name, - pipeline_package_path=pipeline_file, - params=arguments) - except ApiException as e: - print(f"KFP Server Exception: '{e.reason}' {e.status} '{e.body}'" - f" {datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')}") - time.sleep(1) - client = get_client() - return run_result.id + try: + run_result: ApiRun = client.run_pipeline( + experiment_id=experiment.id, + job_name=pipeline_name, + pipeline_package_path=pipeline_file, + params=arguments) + return run_result.id + + except ApiException as e: + print(f"KFP Server Exception trying to submit pipeline {pipeline_file}:" + f" '{e.reason}' {e.status} '{e.body}'" + f" {datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')}") + + except Exception as e: + print(f"Exception trying to submit pipeline {pipeline_file}:" + f" '{str(e)}'" + f" {datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')}") + + return None @time_it def wait_for_run_to_complete(*, # force kwargs so the time_it decorator can get pipeline_name pipeline_name: str, run_id: str) -> ApiRunDetail: + if not run_id: + return None + client = get_client() status = None @@ -182,8 +201,8 @@ def wait_for_run_to_complete(*, # force kwargs so the time_it decorator can get run: ApiRun = run_detail.run status = run.status except ApiException as e: # TODO: add timeout or max retries on ApiError - print(f"KFP Server Exception: {e.reason}") - time.sleep(1) + print(f"KFP Server Exception waiting for {pipeline_name} run {run_id}: {e.reason}") + time.sleep(10) time.sleep(0.1) @@ -205,85 +224,104 @@ def get_client() -> TektonClient: return client -def load_pipeline_functions() -> [(Callable, str)]: - pipeline_functions = [] +def get_project_root_dir() -> Path: + + script_path_presumed = "sdk/python/tests/performance_tests.py" + script_path_actually = Path(__file__) + project_root_folder = script_path_actually.parents[3] + + assert script_path_actually == 
project_root_folder.joinpath(script_path_presumed), \ + "Can not determine project root folder. Was this script file moved or renamed?" + + return project_root_folder + - from testdata.sequential import sequential_pipeline - pipeline_functions.append((sequential_pipeline, "sequential_pipeline")) +def load_test_config() -> dict: - from testdata.condition import flipcoin - pipeline_functions.append((flipcoin, "flipcoin")) + # script_path = Path(__file__) + # script_dir = script_path.parents[0] + # config_file = script_dir.joinpath("perf_test_config.yaml") - from testdata.compose import save_most_frequent_word - pipeline_functions.append((save_most_frequent_word, "compose")) + with open(TEST_CONFIG, "r") as f: + test_config = yaml.safe_load(f) - from testdata.retry import retry_sample_pipeline - pipeline_functions.append((retry_sample_pipeline, "retry")) + return test_config - from testdata.loop_static import pipeline as loop_static - pipeline_functions.append((loop_static, "loop_static")) - from testdata.conditions_and_loops import conditions_and_loops - pipeline_functions.append((conditions_and_loops, "conditions_and_loops")) +def load_pipeline_scripts() -> [(Path, str)]: - # from testdata.loop_in_recursion import flipcoin as loop_in_loop - # pipeline_functions.append((loop_in_loop, "loop_in_recursion")) - # - # from testdata.condition_custom_task import flipcoin_pipeline - # pipeline_functions.append((flipcoin_pipeline, "condition_custom_task")) + pipeline_files_with_name = [] + test_config = load_test_config() + project_dir = get_project_root_dir() - # TODO: add more pipelines + for path_name_dict in test_config["pipeline_scripts"]: - # NOTE: loading samples from outside package scope is hacky - # sys.path.insert(1, '/Users/dummy/projects/kfp-tekton/samples/lightweight-component') - # from calc_pipeline import calc_pipeline - # pipeline_functions.append(calc_pipeline) + path = path_name_dict["path"] + name = path_name_dict.get("name") or Path(path).stem - return pipeline_functions + if not path.startswith(pathsep): + # path assumed to be relative to project root + fp: Path = project_dir.joinpath(path) + else: + # path is absolute + fp = Path(path) + assert fp.exists(), f"Cannot find file: {fp.resolve()}" -def run_concurrently(pipelinefunc_name_tuples: [(Callable, str)]) -> [(str, str)]: - pipeline_status = [] + pipeline_files_with_name.append((fp, name)) + + print(f"Loaded {len(pipeline_files_with_name)} pipelines from {TEST_CONFIG}\n") + + return pipeline_files_with_name + + +def run_concurrently(pipelinescript_name_tuples: [(Path, str)]): with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: performance_tests = ( - executor.submit(run_single_pipeline_performance_test, func, name) - for (func, name) in pipelinefunc_name_tuples + executor.submit(run_single_pipeline_performance_test, pipeline_script, name) + for (pipeline_script, name) in pipelinescript_name_tuples ) for performance_test in concurrent.futures.as_completed(performance_tests): try: - run_details = performance_test.result().run - pipeline_status.append((run_details.name, run_details.status)) + run_details = performance_test.result() # noqa F841 except Exception as e: error = f"{e.__class__.__name__}: {str(e)}" print(error) - pipeline_status.append(("unknown pipeline", error)) - return pipeline_status +def run_single_pipeline_performance_test(pipeline_script: Path, + pipeline_name: str): + try: + pipeline_file = compile_pipeline(pipeline_name=pipeline_name, pipeline_script=pipeline_script) + run_id = 
submit_pipeline_run(pipeline_name=pipeline_name, pipeline_file=pipeline_file) + run_details = wait_for_run_to_complete(pipeline_name=pipeline_name, run_id=run_id) -def run_single_pipeline_performance_test(pipeline_func: Callable, - pipeline_name: str) -> ApiRunDetail: + status = run_details.run.status if run_details else "Error" + task_details = parse_run_details(run_details) - pipeline_file = compile_pipeline(pipeline_name=pipeline_name, pipeline_func=pipeline_func) - run_id = submit_pipeline_run(pipeline_name=pipeline_name, pipeline_file=pipeline_file) - run_details = wait_for_run_to_complete(pipeline_name=pipeline_name, run_id=run_id) - task_details = parse_run_details(run_details) + append_exec_times_to_output_file(pipeline_name, status, task_details) - append_exec_times_to_output_file(pipeline_name, task_details) - - return run_details + except Exception as e: + error = f"{e.__class__.__name__} while testing '{pipeline_name}': {str(e)}" + print(error) def parse_run_details(run_details: ApiRunDetail) -> dict: - rev = {} + task_details = {} + + if not run_details: + return {} + pipelinerun = json.loads(run_details.to_dict()["pipeline_runtime"]["workflow_manifest"]) status = pipelinerun["status"] def get_details(data): + info = {} total = timedelta(0) count = 0 + for key in data.keys(): run = data[key] status = run["status"] @@ -302,15 +340,17 @@ def get_details(data): return info if "taskRuns" in status: - rev["taskRuns"] = get_details(status["taskRuns"]) + task_details["taskRuns"] = get_details(status["taskRuns"]) if "runs" in status: - rev["run"] = get_details(status["runs"]) + task_details["runs"] = get_details(status["runs"]) - return rev + return task_details -def append_exec_times_to_output_file(pipeline_name: str, tasks: dict): +def append_exec_times_to_output_file(pipeline_name: str, + status: str = "", + tasks: dict = {}): compile_time = str(execution_times[pipeline_name][compile_pipeline.__name__]) submit_time = str(execution_times[pipeline_name][submit_pipeline_run.__name__]) @@ -319,16 +359,18 @@ def append_exec_times_to_output_file(pipeline_name: str, tasks: dict): taskrun_elapsed = timedelta(0) runs = 0 run_elapsed = timedelta(0) + if "taskRuns" in tasks: taskruns = tasks["taskRuns"]["count"] taskrun_elapsed = tasks["taskRuns"]["total_elapsed"] + if "runs" in tasks: runs = tasks["runs"]["count"] run_elapsed = tasks["runs"]["total_elapsed"] with open(OUTPUT_FILE, "a") as f: f.write(OUTPUT_SEP.join([ - pipeline_name, compile_time, submit_time, run_time, + pipeline_name, str(status), compile_time, submit_time, run_time, str(taskruns), str(runs), str(taskrun_elapsed), str(run_elapsed) ])) f.write("\n") @@ -337,20 +379,23 @@ def append_exec_times_to_output_file(pipeline_name: str, tasks: dict): def create_output_file(): with open(OUTPUT_FILE, "w") as f: - f.write(OUTPUT_SEP.join(["Pipeline", "Compile", "Submit", "Run", - "Num_TaskRuns", "Num_Runs", "Total_TaskRun_Time", "Total_Run_Time"]) + "\n") + + f.write(OUTPUT_SEP.join([ + "Pipeline", "Status", "Compile_Time", "Submit_Time", "Run_Time", + "Num_TaskRuns", "Num_Runs", + "Total_TaskRun_Time", "Total_Run_Time"]) + "\n") def run_performance_tests(): create_output_file() - pipeline_functions = load_pipeline_functions() + pipeline_scripts = load_pipeline_scripts() if NUM_WORKERS == 1: # TODO: use `run_concurrently()` even with 1 worker - for func, name in pipeline_functions: - run_single_pipeline_performance_test(func, name) + for script, name in pipeline_scripts: + run_single_pipeline_performance_test(script, name) else: - 
run_concurrently(pipeline_functions)
+        run_concurrently(pipeline_scripts)


 if __name__ == '__main__':
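
Note on the compile_pyfile change: a minimal sketch of the module-caching issue it works around. Two of the configured pipeline scripts, samples/flip-coin/condition.py and samples/flip-coin-custom-task/condition.py, both import under the module name "condition"; Python caches the first import in sys.modules, so without evicting that entry a later compile of a same-named script would silently reuse the first file. The helper below is illustrative only and is not part of this patch.

import importlib
import os
import sys


def load_pipeline_module(pyfile: str):
    """Import a pipeline script by file path, then drop it from the module cache."""
    sys.path.insert(0, os.path.dirname(os.path.abspath(pyfile)))
    module_name = os.path.splitext(os.path.basename(pyfile))[0]
    try:
        return importlib.import_module(module_name)
    finally:
        del sys.path[0]
        # same idea as the sys.modules.pop(...) added to compile_pyfile above:
        # a later script with the same basename gets imported fresh
        sys.modules.pop(module_name, None)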