From 17451c5ef9b1d39a5b216684cf85555faaed3b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georges=20Lorr=C3=A9?= <35808396+GeorgesLorre@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:13:26 +0200 Subject: [PATCH 01/31] Feature/vertex compiler (#411) --- pyproject.toml | 7 +- src/fondant/compiler.py | 118 ++- src/fondant/component_spec.py | 316 ++++--- src/fondant/executor.py | 38 +- src/fondant/runner.py | 5 +- .../example_1/docker-compose.yml | 18 +- .../example_1/kubeflow_pipeline.yml | 845 ++++++++++++------ .../example_1/vertex_pipeline.json | 338 +++++++ .../example_1/vertex_pipeline.yml | 556 ++++++++++++ .../example_2/docker-compose.yml | 14 +- .../example_2/kubeflow_pipeline.yml | 601 ++++++++----- .../example_2/vertex_pipeline.yml | 402 +++++++++ .../compiled_pipeline/kubeflow_pipeline.yml | 333 ++++--- .../component_specs/kubeflow_component.yaml | 296 ++++-- tests/test_cli.py | 13 +- tests/test_compiler.py | 63 +- tests/test_component_specs.py | 6 +- 17 files changed, 3069 insertions(+), 900 deletions(-) create mode 100644 tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json create mode 100644 tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml create mode 100644 tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml diff --git a/pyproject.toml b/pyproject.toml index 49ac38e42..bf3c2732f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ classifiers = [ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", "Topic :: Software Development", "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", @@ -41,8 +40,9 @@ classifiers = [ ] [tool.poetry.dependencies] -python = ">= 3.8" dask = {extras = ["dataframe", "distributed", "diagnostics"], version = ">= 2023.4.1"} +python = ">= 3.8 < 3.11" +dask = {extras = ["dataframe"], 
version = ">= 2023.4.1"} importlib-resources = { version = ">= 1.3", python = "<3.9" } jsonschema = ">= 4.18" pyarrow = ">= 11.0.0" @@ -51,7 +51,7 @@ fsspec = { version = ">= 2023.4.0", optional = true} gcsfs = { version = ">= 2023.4.0", optional = true } s3fs = { version = ">= 2023.4.0", optional = true } adlfs = { version = ">= 2023.4.0", optional = true } -kfp = { version = ">= 1.8.19, < 2", optional = true } +kfp = { version = "2.0.1", optional = true } pandas = { version = ">= 1.3.5", optional = true } [tool.poetry.extras] @@ -59,6 +59,7 @@ aws = ["fsspec", "s3fs"] azure = ["fsspec", "adlfs"] gcp = ["fsspec", "gcsfs"] kfp = ["kfp"] +vertex = ["kfp"] [tool.poetry.group.test.dependencies] pre-commit = "^3.1.1" diff --git a/src/fondant/compiler.py b/src/fondant/compiler.py index 48ca4542f..6292bd9b0 100644 --- a/src/fondant/compiler.py +++ b/src/fondant/compiler.py @@ -253,6 +253,7 @@ def compile( output_path: the path where to save the Kubeflow pipeline spec """ run_id = pipeline.get_run_id() + pipeline.validate(run_id=run_id) @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) def kfp_pipeline(): @@ -273,28 +274,32 @@ def kfp_pipeline(): logger.info(f"Compiling service for {component_name}") # convert ComponentOp to Kubeflow component - kubeflow_component_op = self.kfp.components.load_component( + kubeflow_component_op = self.kfp.components.load_component_from_text( text=component_op.component_spec.kubeflow_specification.to_string(), ) + # # Set image pull policy to always # Execute the Kubeflow component and pass in the output manifest path from # the previous component. 
component_args = component_op.arguments - component_task = kubeflow_component_op( - input_manifest_path=manifest_path, - metadata=metadata.to_json(), - **component_args, - ) - # Set optional configurations - component_task = self._set_configuration( - component_task, - component_op, - ) - - # Set image pull policy to always - component_task.container.set_image_pull_policy("Always") + if previous_component_task is not None: + component_task = kubeflow_component_op( + input_manifest_path=manifest_path, + metadata=metadata.to_json(), + **component_args, + ) + component_task.after(previous_component_task) + else: + component_task = kubeflow_component_op( + metadata=metadata.to_json(), + **component_args, + ) + component_task + # Set optional configurations + # component_task, + # component_op, # Set the execution order of the component task to be after the previous # component task. if previous_component_task is not None: @@ -305,9 +310,7 @@ def kfp_pipeline(): previous_component_task = component_task - self.pipeline = pipeline - self.pipeline.validate(run_id=run_id) - logger.info(f"Compiling {self.pipeline.name} to {output_path}") + logger.info(f"Compiling {pipeline.name} to {output_path}") self.kfp.compiler.Compiler().compile(kfp_pipeline, output_path) # type: ignore logger.info("Pipeline compiled successfully") @@ -334,3 +337,84 @@ def _set_configuration(self, task, fondant_component_operation): task.apply(self.kfp_gcp.use_preemptible_nodepool()) return task + + +class VertexCompiler(Compiler): + def __init__(self): + self.resolve_imports() + + def resolve_imports(self): + """Resolve imports for the Vertex compiler.""" + try: + import kfp + + self.kfp = kfp + + except ImportError: + msg = """You need to install kfp to use the Vertex compiler,\n + you can install it with `pip install fondant[vertex]`""" + raise ImportError( + msg, + ) + + def compile( + self, + pipeline: Pipeline, + output_path: str = "vertex_pipeline.yml", + ) -> None: + """Compile a pipeline to 
vertex pipeline spec and save it to a specified output path. + + Args: + pipeline: the pipeline to compile + output_path: the path where to save the Kubeflow pipeline spec + """ + run_id = pipeline.get_run_id() + pipeline.validate(run_id=run_id) + logger.info(f"Compiling {pipeline.name} to {output_path}") + + @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) + def kfp_pipeline(): + previous_component_task = None + manifest_path = None + for component_name, component in pipeline._graph.items(): + logger.info(f"Compiling service for {component_name}") + + component_op = component["fondant_component_op"] + # convert ComponentOp to Kubeflow component + kubeflow_component_op = self.kfp.components.load_component_from_text( + text=component_op.component_spec.kubeflow_specification.to_string(), + ) + + # Execute the Kubeflow component and pass in the output manifest path from + # the previous component. + + component_args = component_op.arguments + metadata = Metadata( + pipeline_name=pipeline.name, + run_id=run_id, + base_path=pipeline.base_path, + component_id=component_name, + cache_key=component_op.get_component_cache_key(), + ) + # Set the execution order of the component task to be after the previous + # component task. + if previous_component_task is not None: + component_task = kubeflow_component_op( + input_manifest_path=manifest_path, + metadata=metadata.to_json(), + **component_args, + ) + component_task.after(previous_component_task) + + else: + component_task = kubeflow_component_op( + metadata=metadata.to_json(), + **component_args, + ) + # Update the manifest path to be the output path of the current component task. 
+ manifest_path = component_task.outputs["output_manifest_path"] + + previous_component_task = component_task + + self.kfp.compiler.Compiler().compile(kfp_pipeline, output_path) # type: ignore + logger.info("Pipeline compiled successfully") diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index a71762a87..06f3d9bde 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -1,11 +1,11 @@ """This module defines classes to represent an Fondant component specification.""" -import ast import copy import json import pkgutil import types import typing as t from dataclasses import dataclass +from distutils.util import strtobool from pathlib import Path import jsonschema.exceptions @@ -17,14 +17,15 @@ from fondant.exceptions import InvalidComponentSpec from fondant.schema import Field, KubeflowCommandArguments, Type -# TODO: remove after upgrading to kfpv2 +# # TODO: remove after upgrading to kfpv2 + kubeflow_to_python_type_dict = { - "String": str, - "Integer": int, - "Float": float, - "Boolean": ast.literal_eval, - "JsonObject": json.loads, - "JsonArray": json.loads, + "STRING": str, + "NUMBER_INTEGER": int, + "NUMBER_DOUBLE": float, + "BOOLEAN": lambda x: bool(strtobool(x)), + "STRUCT": json.loads, + "LIST": json.loads, } @@ -33,15 +34,13 @@ def kubeflow2python_type(type_: str) -> t.Any: return lambda value: map_fn(value) if value != "None" else None # type: ignore -# TODO: Change after upgrading to kfp v2 -# :https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/ python2kubeflow_type = { - "str": "String", - "int": "Integer", - "float": "Float", - "bool": "Boolean", - "dict": "JsonObject", - "list": "JsonArray", + "str": "STRING", + "int": "NUMBER_INTEGER", + "float": "NUMBER_DOUBLE", + "bool": "BOOLEAN", + "dict": "STRUCT", + "list": "LIST", } @@ -230,101 +229,159 @@ class KubeflowComponentSpec: def __init__(self, specification: t.Dict[str, t.Any]) -> None: self._specification = specification + 
@staticmethod + def convert_arguments(fondant_component): + args = {} + for arg in fondant_component.args.values(): + args[arg.name] = { + "parameterType": python2kubeflow_type[arg.type], + "description": arg.description, + **( + {"defaultValue": arg.default, "isOptional": True} + if arg.default is not None + else {} + ), + } + return args + @classmethod - def from_fondant_component_spec( - cls, - fondant_component: ComponentSpec, - ) -> "KubeflowComponentSpec": - """Create a Kubeflow component spec from a Fondant component spec.""" - specification = { - "name": fondant_component.name, - "description": fondant_component.description, - "inputs": [ - { - "name": "input_manifest_path", + def from_fondant_component_spec(cls, fondant_component: ComponentSpec): + """Generate a Kubeflow component spec from a ComponentOp.""" + input_definitions = { + "artifacts": { + "input_manifest_path": { "description": "Path to the input manifest", - "type": "String", + "artifactType": { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "isOptional": True, }, - { - "name": "metadata", - "description": "Metadata arguments containing the run id and base path", - "type": "String", - }, - { - "name": "component_spec", + }, + "parameters": { + "component_spec": { "description": "The component specification as a dictionary", - "type": "JsonObject", - "default": "None", + "defaultValue": {}, + "isOptional": True, + "parameterType": "STRUCT", }, - { - "name": "input_partition_rows", - "description": "The number of rows to load per partition. Set to override the" - " automatic partitioning", - "type": "String", - "default": "None", + "input_partition_rows": { + "description": "The number of rows to load per partition." 
+ + " Set to override the automatic partitioning", + "isOptional": True, + "parameterType": "STRING", + "defaultValue": "None", }, - { - "name": "cache", + "cache": { + "parameterType": "BOOLEAN", "description": "Set to False to disable caching, True by default.", - "type": "Boolean", - "default": "True", - }, - { - "name": "cluster_type", - "description": "The type of cluster to use for distributed execution", - "type": "String", - "default": "default", + "defaultValue": True, + "isOptional": True, }, - { - "name": "client_kwargs", - "description": "Keyword arguments used to initialise the dask client", - "type": "JsonObject", - "default": "{}", + "metadata": { + "description": "Metadata arguments containing the run id and base path", + "parameterType": "STRING", }, - *( - { - "name": arg.name, - "description": arg.description, - "type": python2kubeflow_type[arg.type], - **({"default": arg.default} if arg.default is not None else {}), - } - for arg in fondant_component.args.values() - ), - ], - "outputs": [ - { - "name": "output_manifest_path", + **cls.convert_arguments(fondant_component), + }, + } + + cleaned_component_name = fondant_component.name.replace("-", "_").replace( + " ", + "_", + ) + output_definitions = { + "artifacts": { + "output_manifest_path": { + "artifactType": { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, "description": "Path to the output manifest", - "type": "String", }, - ], - "implementation": { - "container": { - "image": fondant_component.image, - "command": [ - "fondant", - "execute", - "main", - "--input_manifest_path", - {"inputPath": "input_manifest_path"}, - "--metadata", - {"inputValue": "metadata"}, - "--component_spec", - {"inputValue": "component_spec"}, - "--input_partition_rows", - {"inputValue": "input_partition_rows"}, - "--cache", - {"inputValue": "cache"}, - *cls._dump_args(fondant_component.args.values()), - "--output_manifest_path", - {"outputPath": "output_manifest_path"}, - "--cluster_type", - 
{"inputValue": "cluster_type"}, - "--client_kwargs", - {"inputValue": "client_kwargs"}, - ], + }, + } + + specification = { + "components": { + "comp-" + + cleaned_component_name: { + "executorLabel": "exec-" + cleaned_component_name, + "inputDefinitions": input_definitions, + "outputDefinitions": output_definitions, + }, + }, + "deploymentSpec": { + "executors": { + "exec-" + + cleaned_component_name: { + "container": { + "args": [ + "--input_manifest_path", + "{{$.inputs.artifacts['input_manifest_path'].uri}}", + "--metadata", + "{{$.inputs.parameters['metadata']}}", + "--component_spec", + "{{$.inputs.parameters['component_spec']}}", + "--input_partition_rows", + "{{$.inputs.parameters['input_partition_rows']}}", + "--cache", + "{{$.inputs.parameters['cache']}}", + *cls._dump_args(fondant_component.args.values()), + "--output_manifest_path", + "{{$.outputs.artifacts['output_manifest_path'].uri}}", + ], + "command": ["fondant", "execute", "main"], + "image": fondant_component.image, + }, + }, }, }, + "pipelineInfo": {"name": cleaned_component_name}, + "root": { + "dag": { + "outputs": { + "artifacts": { + "output_manifest_path": { + "artifactSelectors": [ + { + "outputArtifactKey": "output_manifest_path", + "producerSubtask": cleaned_component_name, + }, + ], + }, + }, + }, + "tasks": { + cleaned_component_name: { + "cachingOptions": {"enableCache": True}, + "componentRef": {"name": "comp-" + cleaned_component_name}, + "inputs": { + "artifacts": { + "input_manifest_path": { + "componentInputArtifact": "input_manifest_path", + }, + }, + "parameters": { + "component_spec": { + "componentInputParameter": "component_spec", + }, + "input_partition_rows": { + "componentInputParameter": "input_partition_rows", + }, + "metadata": {"componentInputParameter": "metadata"}, + "cache": {"componentInputParameter": "cache"}, + }, + }, + "taskInfo": {"name": cleaned_component_name}, + }, + }, + }, + "inputDefinitions": input_definitions, + "outputDefinitions": 
output_definitions, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.1", } return cls(specification) @@ -337,7 +394,7 @@ def _dump_args(args: t.Iterable[Argument]) -> KubeflowCommandArguments: arg_name_cmd = f"--{arg_name}" dumped_args.append(arg_name_cmd) - dumped_args.append({"inputValue": arg_name}) + dumped_args.append("{{$.inputs.parameters['" + f"{arg_name}" + "']}}") return dumped_args @@ -359,31 +416,54 @@ def to_string(self) -> str: @property def input_arguments(self) -> t.Mapping[str, Argument]: """The input arguments of the component as an immutable mapping.""" - return types.MappingProxyType( - { - info["name"]: Argument( - name=info["name"], - description=info["description"], - type=info["type"], - default=info["default"] if "default" in info else None, + args = {} + input_definitions = self._specification["root"]["inputDefinitions"] + + if "artifacts" in input_definitions: + for arg_name, arg_info in input_definitions["artifacts"].items(): + args[arg_name] = Argument( + name=arg_name, + description=arg_info["description"], + type="STRING", + default=None, ) - for info in self._specification["inputs"] - }, - ) + if "parameters" in input_definitions: + for arg_name, arg_info in input_definitions["parameters"].items(): + args[arg_name] = Argument( + name=arg_name, + description=arg_info["description"], + type=arg_info["parameterType"], + default=arg_info["defaultValue"] + if "defaultValue" in arg_info + else None, + ) + return types.MappingProxyType(args) @property def output_arguments(self) -> t.Mapping[str, Argument]: """The output arguments of the component as an immutable mapping.""" - return types.MappingProxyType( - { - info["name"]: Argument( - name=info["name"], - description=info["description"], - type=info["type"], + args = {} + output_definitions = self._specification["root"]["outputDefinitions"] + + if "artifacts" in output_definitions: + for arg_name, arg_info in output_definitions["artifacts"].items(): + args[arg_name] = Argument( 
+ name=arg_name, + description=arg_info["description"], + type="STRING", + default=None, ) - for info in self._specification["outputs"] - }, - ) + if "parameters" in output_definitions: + for arg_name, arg_info in output_definitions["parameters"].items(): + args[arg_name] = Argument( + name=arg_name, + description=arg_info["description"], + type=arg_info["parameterType"], + default=arg_info["defaultValue"] + if "defaultValue" in arg_info + else None, + ) + return types.MappingProxyType(args) def __repr__(self) -> str: return f"{self.__class__.__name__}({self._specification!r})" diff --git a/src/fondant/executor.py b/src/fondant/executor.py index 8126199d8..b4a568b8f 100644 --- a/src/fondant/executor.py +++ b/src/fondant/executor.py @@ -5,12 +5,12 @@ components take care of processing, filtering and extending the data. """ import argparse -import ast import json import logging import os import typing as t from abc import abstractmethod +from distutils.util import strtobool from pathlib import Path import dask @@ -110,7 +110,7 @@ def from_args(cls) -> "Executor": """Create an executor from a passed argument containing the specification as a dict.""" parser = argparse.ArgumentParser() parser.add_argument("--component_spec", type=json.loads) - parser.add_argument("--cache", type=ast.literal_eval) + parser.add_argument("--cache", type=lambda x: bool(strtobool(x))) parser.add_argument("--input_partition_rows", type=validate_partition_number) parser.add_argument("--cluster_type", type=str) parser.add_argument("--client_kwargs", type=json.loads) @@ -399,42 +399,14 @@ def upload_manifest(self, manifest: Manifest, save_path: t.Union[str, Path]): """ Uploads the manifest to the specified destination. - If the save_path points to the kubeflow output artifact temporary path, - it will be saved both in a specific base path and the native kfp artifact path. - Args: manifest: The Manifest object to be uploaded. save_path: The path where the Manifest object will be saved. 
""" - is_kubeflow_output = ( - str(save_path) == "/tmp/outputs/output_manifest_path/data" # nosec - ) - - if is_kubeflow_output: - # Save to the expected base path directory - save_path_base_path = ( - f"{manifest.base_path}/{manifest.pipeline_name}/{manifest.run_id}/" - f"{manifest.component_id}/manifest.json" - ) - # Upload manifest and it's reference if cache is False - manifest.to_file(save_path_base_path) - logger.info(f"Saving output manifest to {save_path_base_path}") - self._upload_cache_key( - manifest=manifest, - manifest_save_path=save_path_base_path, - ) - # Write manifest to the native kfp artifact path that will be passed as an artifact - # and read by the next component - manifest.to_file(save_path) - else: - # Local runner - manifest.to_file(save_path) - logger.info(f"Saving output manifest to {save_path}") - self._upload_cache_key( - manifest=manifest, - manifest_save_path=save_path, - ) + Path(save_path).parent.mkdir(parents=True, exist_ok=True) + manifest.to_file(save_path) + logger.info(f"Saving output manifest to {save_path}") class DaskLoadExecutor(Executor[DaskLoadComponent]): diff --git a/src/fondant/runner.py b/src/fondant/runner.py index 4ff924a8e..ff75ae1a3 100644 --- a/src/fondant/runner.py +++ b/src/fondant/runner.py @@ -1,4 +1,3 @@ -import json import logging import subprocess # nosec from abc import ABC, abstractmethod @@ -85,6 +84,4 @@ def get_name_from_spec(self, input_spec: str): """Get the name of the pipeline from the spec.""" with open(input_spec) as f: spec = yaml.safe_load(f) - return json.loads( - spec["metadata"]["annotations"]["pipelines.kubeflow.org/pipeline_spec"], - )["name"] + return spec["pipelineInfo"]["name"] diff --git a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml index ec40cd6c1..954616903 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml +++ 
b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml @@ -1,4 +1,4 @@ -name: test_pipeline +name: testpipeline services: first_component: build: @@ -6,10 +6,10 @@ services: context: tests/example_pipelines/valid_pipeline/example_1/first_component command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json - --storage_args - a dummy string arg - --input_partition_rows @@ -41,10 +41,10 @@ services: context: tests/example_pipelines/valid_pipeline/example_1/second_component command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "second_component", "cache_key": "2"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/second_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json - --storage_args - a dummy string arg - --input_partition_rows @@ -60,7 +60,7 @@ services: "array", "items": {"type": "float32"}}}}}, "args": {"storage_args": {"description": "Storage arguments", "type": "str"}}}' - --input_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json depends_on: first_component: condition: service_completed_successfully @@ -73,10 +73,10 @@ services: context: tests/example_pipelines/valid_pipeline/example_1/third_component command: - 
--metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "third_component", "cache_key": "3"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/third_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json - --storage_args - a dummy string arg - --cache @@ -92,7 +92,7 @@ services: false}, "args": {"storage_args": {"description": "Storage arguments", "type": "str"}}}' - --input_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/second_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json depends_on: second_component: condition: service_completed_successfully diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index 2dbdc253d..c5990e87e 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -1,305 +1,556 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - annotations: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00' - pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test - pipeline", "name": "test_pipeline"}' - generateName: test-pipeline- - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 -spec: - arguments: - parameters: [] - entrypoint: test-pipeline - serviceAccountName: pipeline-runner - templates: - - affinity: - nodeAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - preference: - matchExpressions: - - key: cloud.google.com/gke-preemptible - operator: In - values: - - 
'true' - weight: 50 - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "first_component", "cache_key": "1"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "description": "This is an example component", "image": "example_component:latest", - "name": "First component", "produces": {"captions": {"fields": {"data": {"type": - "string"}}}, "images": {"fields": {"data": {"type": "binary"}}}}}' - - --input_partition_rows - - disable - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - resources: - limits: - nvidia.com/gpu: 1 - inputs: +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline +components: + comp-First_component: + executorLabel: exec-First_component + inputDefinitions: artifacts: - - name: input_manifest_path - path: /tmp/inputs/input_manifest_path/data - raw: - data: '' - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"First component\", \"produces\": {\"captions\": {\"fields\": - {\"data\": {\"type\": \"string\"}}}, \"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}}}}}", "input_partition_rows": "disable", "metadata": - "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", \"run_id\": - 
\"test_pipeline-20230101000000\", \"component_id\": \"first_component\", - \"cache_key\": \"1\"}", "storage_args": "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. 
Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "First component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: first-component - outputs: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: artifacts: - - name: first-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - tolerations: - - effect: NoSchedule - key: preemptible - operator: Equal - value: 'true' - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "second_component", "cache_key": "2"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "consumes": {"images": {"fields": {"data": {"type": "binary"}}}}, "description": - "This is an example component", "image": "example_component:latest", "name": - "Second component", "produces": {"embeddings": {"fields": {"data": {"items": - {"type": "float32"}, "type": "array"}}}}}' - - --input_partition_rows - - '10' - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - inputs: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-Second_component: + executorLabel: exec-Second_component + inputDefinitions: artifacts: - - name: first-component-output_manifest_path - path: /tmp/inputs/input_manifest_path/data - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": 
{\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"consumes\": - {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"Second component\", \"produces\": {\"embeddings\": {\"fields\": - {\"data\": {\"items\": {\"type\": \"float32\"}, \"type\": \"array\"}}}}}", - "input_partition_rows": "10", "metadata": "{\"base_path\": \"/foo/bar\", - \"pipeline_name\": \"test_pipeline\", \"run_id\": \"test_pipeline-20230101000000\", - \"component_id\": \"second_component\", \"cache_key\": \"2\"}", "storage_args": - "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "e8f5a26a42664b1e4774da40117b542baa9676368d9e05262a40e4fd10be0e68"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. 
Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "Second component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: second-component - outputs: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: artifacts: - - name: second-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - dag: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-Third_component: + executorLabel: exec-Third_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-first-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: First_component tasks: - - name: first-component - template: first-component - - arguments: - artifacts: - - from: '{{tasks.first-component.outputs.artifacts.first-component-output_manifest_path}}' - name: first-component-output_manifest_path - dependencies: + First_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-First_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: First_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-second-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: Second_component + tasks: + Second_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-Second_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: Second_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-third-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: Third_component + tasks: + Third_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-Third_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: Third_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest +deploymentSpec: + executors: + exec-First_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest + exec-Second_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest + exec-Third_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - 
--input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: + dag: + tasks: + first-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: disable + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + second-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-second-component + dependentTasks: - first-component - name: second-component - template: second-component - - arguments: + inputs: artifacts: - - from: '{{tasks.second-component.outputs.artifacts.second-component-output_manifest_path}}' - name: second-component-output_manifest_path - dependencies: + input_manifest_path: + taskOutputArtifact: + outputArtifactKey: output_manifest_path + producerTask: first-component + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + 
consumes: + images: + fields: + data: + type: binary + description: This is an example component + image: example_component:latest + name: Second component + produces: + embeddings: + fields: + data: + items: + type: float32 + type: array + input_partition_rows: + runtimeValue: + constant: '10' + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "second_component", + "cache_key": "2"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: second-component + third-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-third-component + dependentTasks: - second-component - name: third-component - template: third-component - name: test-pipeline - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "third_component", "cache_key": "3"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "consumes": {"captions": {"fields": {"data": {"type": "string"}}}, "embeddings": - {"fields": {"data": {"items": {"type": "float32"}, "type": "array"}}}, "images": - {"fields": {"data": {"type": "binary"}}}}, "description": "This is an example - component", "image": "example_component:latest", "name": "Third component", - "produces": {"additionalSubsets": false, "images": {"fields": {"data": {"type": - "binary"}}}}}' - - --input_partition_rows - - None - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - inputs: - artifacts: - - name: 
second-component-output_manifest_path - path: /tmp/inputs/input_manifest_path/data - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"consumes\": - {\"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}, \"embeddings\": - {\"fields\": {\"data\": {\"items\": {\"type\": \"float32\"}, \"type\": \"array\"}}}, - \"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"Third component\", \"produces\": {\"additionalSubsets\": false, - \"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}}", "input_partition_rows": - "None", "metadata": "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", - \"run_id\": \"test_pipeline-20230101000000\", \"component_id\": \"third_component\", - \"cache_key\": \"3\"}", "storage_args": "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "8d2ae6379592151eea3b644c61fb091a68a431ac15ed24064cb66434cabf6e08"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": 
"input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "Third component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: third-component - outputs: - artifacts: - - name: third-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data + inputs: + artifacts: + input_manifest_path: + taskOutputArtifact: + outputArtifactKey: output_manifest_path + producerTask: second-component + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + consumes: + captions: + fields: + data: + type: string + embeddings: + fields: + data: + items: + type: float32 + type: array + images: + fields: + data: + type: binary + description: This is an example component + image: example_component:latest + name: Third component + 
produces: + additionalSubsets: false + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: None + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "third_component", + "cache_key": "3"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: third-component +schemaVersion: 2.1.0 +sdkVersion: kfp-2.0.1 diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json new file mode 100644 index 000000000..28ba9c42c --- /dev/null +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json @@ -0,0 +1,338 @@ +{ + "pipelineSpec": { + "components": { + "comp-first-component": { + "executorLabel": "exec-first-component", + "inputDefinitions": { + "parameters": { + "component_spec": { + "type": "STRING" + }, + "input_manifest_path": { + "type": "STRING" + }, + "input_partition_rows": { + "type": "STRING" + }, + "metadata": { + "type": "STRING" + }, + "storage_args": { + "type": "STRING" + } + } + }, + "outputDefinitions": { + "parameters": { + "output_manifest_path": { + "type": "STRING" + } + } + } + }, + "comp-second-component": { + "executorLabel": "exec-second-component", + "inputDefinitions": { + "parameters": { + "component_spec": { + "type": "STRING" + }, + "input_manifest_path": { + "type": "STRING" + }, + "input_partition_rows": { + "type": "STRING" + }, + "metadata": { + "type": "STRING" + }, + "storage_args": { + "type": "STRING" + } + } + }, + "outputDefinitions": { + "parameters": { + "output_manifest_path": { + "type": "STRING" + } + } + } + }, + "comp-third-component": { + "executorLabel": "exec-third-component", + "inputDefinitions": { + "parameters": { + "component_spec": { + "type": "STRING" + }, + "input_manifest_path": { + "type": "STRING" + }, + 
"input_partition_rows": { + "type": "STRING" + }, + "metadata": { + "type": "STRING" + }, + "some_list": { + "type": "STRING" + }, + "storage_args": { + "type": "STRING" + } + } + }, + "outputDefinitions": { + "parameters": { + "output_manifest_path": { + "type": "STRING" + } + } + } + } + }, + "deploymentSpec": { + "executors": { + "exec-first-component": { + "container": { + "command": [ + "python3", + "main.py", + "--input_manifest_path", + "{{$.inputs.parameters['input_manifest_path']}}", + "--metadata", + "{{$.inputs.parameters['metadata']}}", + "--component_spec", + "{{$.inputs.parameters['component_spec']}}", + "--input_partition_rows", + "{{$.inputs.parameters['input_partition_rows']}}", + "--storage_args", + "{{$.inputs.parameters['storage_args']}}", + "--output_manifest_path", + "{{$.outputs.parameters['output_manifest_path'].output_file}}" + ], + "image": "example_component:latest" + } + }, + "exec-second-component": { + "container": { + "command": [ + "python3", + "main.py", + "--input_manifest_path", + "{{$.inputs.parameters['input_manifest_path']}}", + "--metadata", + "{{$.inputs.parameters['metadata']}}", + "--component_spec", + "{{$.inputs.parameters['component_spec']}}", + "--input_partition_rows", + "{{$.inputs.parameters['input_partition_rows']}}", + "--storage_args", + "{{$.inputs.parameters['storage_args']}}", + "--output_manifest_path", + "{{$.outputs.parameters['output_manifest_path'].output_file}}" + ], + "image": "example_component:latest" + } + }, + "exec-third-component": { + "container": { + "command": [ + "python3", + "main.py", + "--input_manifest_path", + "{{$.inputs.parameters['input_manifest_path']}}", + "--metadata", + "{{$.inputs.parameters['metadata']}}", + "--component_spec", + "{{$.inputs.parameters['component_spec']}}", + "--input_partition_rows", + "{{$.inputs.parameters['input_partition_rows']}}", + "--storage_args", + "{{$.inputs.parameters['storage_args']}}", + "--some_list", + "{{$.inputs.parameters['some_list']}}", + 
"--output_manifest_path", + "{{$.outputs.parameters['output_manifest_path'].output_file}}" + ], + "image": "example_component:latest" + } + } + } + }, + "pipelineInfo": { + "name": "testpipeline" + }, + "root": { + "dag": { + "tasks": { + "first-component": { + "cachingOptions": { + "enableCache": true + }, + "componentRef": { + "name": "comp-first-component" + }, + "inputs": { + "parameters": { + "component_spec": { + "runtimeValue": { + "constantValue": { + "stringValue": "{\"name\": \"First component\", \"description\": \"This is an example component\", \"image\": \"example_component:latest\", \"produces\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}, \"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}}, \"args\": {\"storage_args\": {\"description\": \"Storage arguments\", \"type\": \"str\"}}}" + } + } + }, + "input_manifest_path": { + "runtimeValue": { + "constantValue": { + "stringValue": "" + } + } + }, + "input_partition_rows": { + "runtimeValue": { + "constantValue": { + "stringValue": "disable" + } + } + }, + "metadata": { + "runtimeValue": { + "constantValue": { + "stringValue": "{\"base_path\": \"/foo/bar\", \"run_id\": \"{{workflow.name}}\"}" + } + } + }, + "storage_args": { + "runtimeValue": { + "constantValue": { + "stringValue": "a dummy string arg" + } + } + } + } + }, + "taskInfo": { + "name": "first-component" + } + }, + "second-component": { + "cachingOptions": { + "enableCache": true + }, + "componentRef": { + "name": "comp-second-component" + }, + "dependentTasks": [ + "first-component" + ], + "inputs": { + "parameters": { + "component_spec": { + "runtimeValue": { + "constantValue": { + "stringValue": "{\"name\": \"Second component\", \"description\": \"This is an example component\", \"image\": \"example_component:latest\", \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}, \"produces\": {\"embeddings\": {\"fields\": {\"data\": {\"type\": \"array\", \"items\": {\"type\": \"float32\"}}}}}, 
\"args\": {\"storage_args\": {\"description\": \"Storage arguments\", \"type\": \"str\"}}}" + } + } + }, + "input_manifest_path": { + "taskOutputParameter": { + "outputParameterKey": "output_manifest_path", + "producerTask": "first-component" + } + }, + "input_partition_rows": { + "runtimeValue": { + "constantValue": { + "stringValue": "10" + } + } + }, + "metadata": { + "runtimeValue": { + "constantValue": { + "stringValue": "{\"base_path\": \"/foo/bar\", \"run_id\": \"{{workflow.name}}\"}" + } + } + }, + "storage_args": { + "runtimeValue": { + "constantValue": { + "stringValue": "a dummy string arg" + } + } + } + } + }, + "taskInfo": { + "name": "second-component" + } + }, + "third-component": { + "cachingOptions": { + "enableCache": true + }, + "componentRef": { + "name": "comp-third-component" + }, + "dependentTasks": [ + "second-component" + ], + "inputs": { + "parameters": { + "component_spec": { + "runtimeValue": { + "constantValue": { + "stringValue": "{\"name\": \"Third component\", \"description\": \"This is an example component\", \"image\": \"example_component:latest\", \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}, \"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}, \"embeddings\": {\"fields\": {\"data\": {\"type\": \"array\", \"items\": {\"type\": \"float32\"}}}}}, \"produces\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}, \"additionalSubsets\": false}, \"args\": {\"storage_args\": {\"description\": \"Storage arguments\", \"type\": \"str\"}, \"some_list\": {\"description\": \"Some list\", \"type\": \"list\", \"items\": {\"type\": \"int\"}}}}" + } + } + }, + "input_manifest_path": { + "taskOutputParameter": { + "outputParameterKey": "output_manifest_path", + "producerTask": "second-component" + } + }, + "input_partition_rows": { + "runtimeValue": { + "constantValue": { + "stringValue": "None" + } + } + }, + "metadata": { + "runtimeValue": { + "constantValue": { + "stringValue": 
"{\"base_path\": \"/foo/bar\", \"run_id\": \"{{workflow.name}}\"}" + } + } + }, + "some_list": { + "runtimeValue": { + "constantValue": { + "stringValue": "[1, 2, 3]" + } + } + }, + "storage_args": { + "runtimeValue": { + "constantValue": { + "stringValue": "a dummy string arg" + } + } + } + } + }, + "taskInfo": { + "name": "third-component" + } + } + } + } + }, + "schemaVersion": "2.0.0", + "sdkVersion": "kfp-1.8.22" + }, + "runtimeConfig": {} +} \ No newline at end of file diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml new file mode 100644 index 000000000..c5990e87e --- /dev/null +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -0,0 +1,556 @@ +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline +components: + comp-First_component: + executorLabel: exec-First_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-Second_component: + executorLabel: exec-Second_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-Third_component: + executorLabel: exec-Third_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. 
+ isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-first-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: First_component + tasks: + First_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-First_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: First_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-second-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: Second_component + tasks: + Second_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-Second_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: Second_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-third-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: Third_component + tasks: + Third_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-Third_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: Third_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest +deploymentSpec: + executors: + exec-First_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest + exec-Second_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest + exec-Third_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - 
--input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: + dag: + tasks: + first-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: disable + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + second-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-second-component + dependentTasks: + - first-component + inputs: + artifacts: + input_manifest_path: + taskOutputArtifact: + outputArtifactKey: output_manifest_path + producerTask: first-component + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + consumes: + images: + fields: + data: + type: binary + description: This is an example component + image: example_component:latest + name: Second component + produces: + embeddings: + fields: + data: + items: + type: float32 + 
type: array + input_partition_rows: + runtimeValue: + constant: '10' + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "second_component", + "cache_key": "2"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: second-component + third-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-third-component + dependentTasks: + - second-component + inputs: + artifacts: + input_manifest_path: + taskOutputArtifact: + outputArtifactKey: output_manifest_path + producerTask: second-component + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + consumes: + captions: + fields: + data: + type: string + embeddings: + fields: + data: + items: + type: float32 + type: array + images: + fields: + data: + type: binary + description: This is an example component + image: example_component:latest + name: Third component + produces: + additionalSubsets: false + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: None + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "third_component", + "cache_key": "3"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: third-component +schemaVersion: 2.1.0 +sdkVersion: kfp-2.0.1 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml index a662b8311..d4a027725 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml @@ -1,4 +1,4 @@ -name: test_pipeline +name: testpipeline services: 
first_component: build: @@ -6,10 +6,10 @@ services: context: tests/example_pipelines/valid_pipeline/example_1/first_component command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json - --storage_args - a dummy string arg - --cache @@ -26,12 +26,13 @@ services: - 8787:8787 volumes: [] image_cropping: + image: ghcr.io/ml6team/image_cropping:dev command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", "cache_key": "2"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/image_cropping/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json - --cropping_threshold - '0' - --padding @@ -52,11 +53,10 @@ services: for the image cropping. 
The padding is added to all borders of the image.", "type": "int", "default": 10}}}' - --input_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json depends_on: first_component: condition: service_completed_successfully - image: ghcr.io/ml6team/image_cropping:dev ports: - 8787:8787 volumes: [] diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index c583ce539..749d5cf75 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -1,211 +1,402 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - annotations: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00' - pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test - pipeline", "name": "test_pipeline"}' - generateName: test-pipeline- - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 -spec: - arguments: - parameters: [] - entrypoint: test-pipeline - serviceAccountName: pipeline-runner - templates: - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "first_component", "cache_key": "1"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "description": "This is an example component", "image": "example_component:latest", - "name": "First component", "produces": {"captions": {"fields": {"data": {"type": - "string"}}}, "images": {"fields": {"data": {"type": "binary"}}}}}' - - 
--input_partition_rows - - None - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - inputs: +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline +components: + comp-First_component: + executorLabel: exec-First_component + inputDefinitions: artifacts: - - name: input_manifest_path - path: /tmp/inputs/input_manifest_path/data - raw: - data: '' - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"First component\", \"produces\": {\"captions\": {\"fields\": - {\"data\": {\"type\": \"string\"}}}, \"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}}}}}", "input_partition_rows": "None", "metadata": - "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", \"run_id\": - \"test_pipeline-20230101000000\", \"component_id\": \"first_component\", - \"cache_key\": \"1\"}", "storage_args": "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, 
- "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "First component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: first-component - outputs: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. 
+ isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: artifacts: - - name: first-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "image_cropping", "cache_key": "2"}' - - --component_spec - - '{"args": {"cropping_threshold": {"default": -30, "description": "Threshold - parameter used for detecting borders. A lower (negative) parameter results - in a more performant border detection, but can cause overcropping. Default - is -30", "type": "int"}, "padding": {"default": 10, "description": "Padding - for the image cropping. 
The padding is added to all borders of the image.", - "type": "int"}}, "consumes": {"images": {"fields": {"data": {"type": "binary"}}}}, - "description": "Component that removes single-colored borders around images - and crops them appropriately", "image": "ghcr.io/ml6team/image_cropping:dev", - "name": "Image cropping", "produces": {"images": {"fields": {"data": {"type": - "binary"}, "height": {"type": "int32"}, "width": {"type": "int32"}}}}}' - - --input_partition_rows - - None - - --cache - - 'True' - - --cropping_threshold - - '0' - - --padding - - '0' - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: ghcr.io/ml6team/image_cropping:dev - imagePullPolicy: Always - inputs: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-Image_cropping: + executorLabel: exec-Image_cropping + inputDefinitions: artifacts: - - name: first-component-output_manifest_path - path: /tmp/inputs/input_manifest_path/data - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "True", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"cropping_threshold\": - {\"default\": -30, \"description\": \"Threshold parameter used for detecting - borders. A lower (negative) parameter results in a more performant border - detection, but can cause overcropping. Default is -30\", \"type\": \"int\"}, - \"padding\": {\"default\": 10, \"description\": \"Padding for the image - cropping. 
The padding is added to all borders of the image.\", \"type\": - \"int\"}}, \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": - \"binary\"}}}}, \"description\": \"Component that removes single-colored - borders around images and crops them appropriately\", \"image\": \"ghcr.io/ml6team/image_cropping:dev\", - \"name\": \"Image cropping\", \"produces\": {\"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}, \"height\": {\"type\": \"int32\"}, \"width\": {\"type\": - \"int32\"}}}}}", "cropping_threshold": "0", "input_partition_rows": "None", - "metadata": "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", - \"run_id\": \"test_pipeline-20230101000000\", \"component_id\": \"image_cropping\", - \"cache_key\": \"2\"}", "padding": "0"}' - pipelines.kubeflow.org/component_ref: '{"digest": "d31e5d546956a42a470f033e2be84f229d3e926dfa7a7a1703c94ff47a1cb992"}' - pipelines.kubeflow.org/component_spec: '{"description": "Component that removes - single-colored borders around images and crops them appropriately", "implementation": - {"container": {"command": ["fondant", "execute", "main", "--input_manifest_path", - {"inputPath": "input_manifest_path"}, "--metadata", {"inputValue": "metadata"}, - "--component_spec", {"inputValue": "component_spec"}, "--input_partition_rows", - {"inputValue": "input_partition_rows"}, "--cache", {"inputValue": "cache"}, - "--cropping_threshold", {"inputValue": "cropping_threshold"}, "--padding", - {"inputValue": "padding"}, "--output_manifest_path", {"outputPath": "output_manifest_path"}, - "--cluster_type", {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": - "client_kwargs"}], "image": "ghcr.io/ml6team/image_cropping:dev"}}, "inputs": - [{"description": "Path to the input manifest", "name": "input_manifest_path", - "type": "String"}, {"description": "Metadata arguments containing the run - id and base path", "name": "metadata", "type": "String"}, {"default": "None", - "description": "The component 
specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"default": - -30, "description": "Threshold parameter used for detecting borders. A lower - (negative) parameter results in a more performant border detection, but - can cause overcropping. Default is -30", "name": "cropping_threshold", "type": - "Integer"}, {"default": 10, "description": "Padding for the image cropping. - The padding is added to all borders of the image.", "name": "padding", "type": - "Integer"}], "name": "Image cropping", "outputs": [{"description": "Path - to the output manifest", "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: image-cropping - outputs: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + cropping_threshold: + defaultValue: -30.0 + description: Threshold parameter used for detecting borders. 
A lower (negative) + parameter results in a more performant border detection, but can cause + overcropping. Default is -30 + isOptional: true + parameterType: NUMBER_INTEGER + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + padding: + defaultValue: 10.0 + description: Padding for the image cropping. The padding is added to all + borders of the image. + isOptional: true + parameterType: NUMBER_INTEGER + outputDefinitions: artifacts: - - name: image-cropping-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - dag: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-first-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: First_component tasks: - - name: first-component - template: first-component - - arguments: - artifacts: - - from: '{{tasks.first-component.outputs.artifacts.first-component-output_manifest_path}}' - name: first-component-output_manifest_path - dependencies: + First_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-First_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: First_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + 
parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-image-cropping: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: Image_cropping + tasks: + Image_cropping: + cachingOptions: + enableCache: true + componentRef: + name: comp-Image_cropping + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: Image_cropping + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. 
+ isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + cropping_threshold: + defaultValue: -30.0 + description: Threshold parameter used for detecting borders. A lower (negative) + parameter results in a more performant border detection, but can cause + overcropping. Default is -30 + isOptional: true + parameterType: NUMBER_INTEGER + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + padding: + defaultValue: 10.0 + description: Padding for the image cropping. The padding is added to all + borders of the image. + isOptional: true + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest +deploymentSpec: + executors: + exec-First_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest + exec-Image_cropping: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - 
'{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --cropping_threshold + - '{{$.inputs.parameters[''cropping_threshold'']}}' + - --padding + - '{{$.inputs.parameters[''padding'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: ghcr.io/ml6team/image_cropping:dev +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: + dag: + tasks: + first-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: None + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + image-cropping: + cachingOptions: + enableCache: true + componentRef: + name: comp-image-cropping + dependentTasks: - first-component - name: image-cropping - template: image-cropping - name: test-pipeline + inputs: + artifacts: + input_manifest_path: + taskOutputArtifact: + outputArtifactKey: output_manifest_path + producerTask: first-component + parameters: + cache: + runtimeValue: + constant: true + component_spec: + runtimeValue: + constant: + args: + cropping_threshold: + default: -30.0 + description: Threshold parameter used for detecting 
borders. + A lower (negative) parameter results in a more performant + border detection, but can cause overcropping. Default is -30 + type: int + padding: + default: 10.0 + description: Padding for the image cropping. The padding is + added to all borders of the image. + type: int + consumes: + images: + fields: + data: + type: binary + description: Component that removes single-colored borders around + images and crops them appropriately + image: ghcr.io/ml6team/image_cropping:dev + name: Image cropping + produces: + images: + fields: + data: + type: binary + height: + type: int32 + width: + type: int32 + cropping_threshold: + runtimeValue: + constant: 0.0 + input_partition_rows: + runtimeValue: + constant: None + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", + "cache_key": "2"}' + padding: + runtimeValue: + constant: 0.0 + taskInfo: + name: image-cropping +schemaVersion: 2.1.0 +sdkVersion: kfp-2.0.1 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml new file mode 100644 index 000000000..749d5cf75 --- /dev/null +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -0,0 +1,402 @@ +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline +components: + comp-First_component: + executorLabel: exec-First_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. 
+ isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-Image_cropping: + executorLabel: exec-Image_cropping + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + cropping_threshold: + defaultValue: -30.0 + description: Threshold parameter used for detecting borders. A lower (negative) + parameter results in a more performant border detection, but can cause + overcropping. Default is -30 + isOptional: true + parameterType: NUMBER_INTEGER + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + padding: + defaultValue: 10.0 + description: Padding for the image cropping. The padding is added to all + borders of the image. 
+ isOptional: true + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-first-component: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: First_component + tasks: + First_component: + cachingOptions: + enableCache: true + componentRef: + name: comp-First_component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: First_component + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest + comp-image-cropping: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: Image_cropping + tasks: + Image_cropping: + cachingOptions: + enableCache: true + componentRef: + name: comp-Image_cropping + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + cache: + componentInputParameter: cache + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + taskInfo: + name: Image_cropping + inputDefinitions: + artifacts: + input_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the input manifest + isOptional: true + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + component_spec: + defaultValue: {} + description: The component specification as a dictionary + isOptional: true + parameterType: STRUCT + cropping_threshold: + defaultValue: -30.0 + description: Threshold parameter used for detecting borders. A lower (negative) + parameter results in a more performant border detection, but can cause + overcropping. Default is -30 + isOptional: true + parameterType: NUMBER_INTEGER + input_partition_rows: + defaultValue: None + description: The number of rows to load per partition. 
Set to override the + automatic partitioning + isOptional: true + parameterType: STRING + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + padding: + defaultValue: 10.0 + description: Padding for the image cropping. The padding is added to all + borders of the image. + isOptional: true + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest +deploymentSpec: + executors: + exec-First_component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest + exec-Image_cropping: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --cropping_threshold + - '{{$.inputs.parameters[''cropping_threshold'']}}' + - --padding + - '{{$.inputs.parameters[''padding'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: ghcr.io/ml6team/image_cropping:dev +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: 
+ dag: + tasks: + first-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: None + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + image-cropping: + cachingOptions: + enableCache: true + componentRef: + name: comp-image-cropping + dependentTasks: + - first-component + inputs: + artifacts: + input_manifest_path: + taskOutputArtifact: + outputArtifactKey: output_manifest_path + producerTask: first-component + parameters: + cache: + runtimeValue: + constant: true + component_spec: + runtimeValue: + constant: + args: + cropping_threshold: + default: -30.0 + description: Threshold parameter used for detecting borders. + A lower (negative) parameter results in a more performant + border detection, but can cause overcropping. Default is -30 + type: int + padding: + default: 10.0 + description: Padding for the image cropping. The padding is + added to all borders of the image. 
+ type: int + consumes: + images: + fields: + data: + type: binary + description: Component that removes single-colored borders around + images and crops them appropriately + image: ghcr.io/ml6team/image_cropping:dev + name: Image cropping + produces: + images: + fields: + data: + type: binary + height: + type: int32 + width: + type: int32 + cropping_threshold: + runtimeValue: + constant: 0.0 + input_partition_rows: + runtimeValue: + constant: None + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", + "cache_key": "2"}' + padding: + runtimeValue: + constant: 0.0 + taskInfo: + name: image-cropping +schemaVersion: 2.1.0 +sdkVersion: kfp-2.0.1 diff --git a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml index 4709fe966..732fc02f4 100644 --- a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml @@ -1,110 +1,223 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - annotations: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00' - pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test - pipeline", "name": "test_pipeline"}' - generateName: test-pipeline- - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 -spec: - arguments: - parameters: [] - entrypoint: test-pipeline - serviceAccountName: pipeline-runner - templates: - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "first_component", "cache_key": "b72c8e370be017d5a679a60d3984ab9d"}' - - 
--component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "description": "This is an example component", "image": "example_component:latest", - "name": "First component", "produces": {"captions": {"fields": {"data": {"type": - "string"}}}, "images": {"fields": {"data": {"type": "binary"}}}}}' - - --input_partition_rows - - None - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - resources: - limits: - nvidia.com/gpu: 1 - inputs: - artifacts: - - name: input_manifest_path - path: /tmp/inputs/input_manifest_path/data - raw: - data: '' - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"First component\", \"produces\": {\"captions\": {\"fields\": - {\"data\": {\"type\": \"string\"}}}, \"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}}}}}", "input_partition_rows": "None", "metadata": - "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", \"run_id\": - \"test_pipeline-20230101000000\", \"component_id\": \"first_component\", - \"cache_key\": \"b72c8e370be017d5a679a60d3984ab9d\"}", "storage_args": "a - dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - 
{"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "First component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: first-component - nodeSelector: - a_node_pool_label: a_node_pool - outputs: - artifacts: - - name: first-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - dag: - tasks: - - name: 
first-component - template: first-component - name: test-pipeline +{ + "components": + { + "comp-Example_component": + { + "executorLabel": "exec-Example_component", + "inputDefinitions": + { + "artifacts": + { + "input_manifest_path": + { + "description": "Path to the input manifest", + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "isOptional": True, + }, + }, + "parameters": + { + "component_spec": + { + "description": "The component specification as a dictionary", + "defaultValue": {}, + "isOptional": True, + "parameterType": "STRUCT", + }, + "input_partition_rows": + { + "description": "The number of rows to load per partition. Set to override the automatic partitioning", + "isOptional": True, + "parameterType": "STRING", + "defaultValue": "None", + }, + "cache": + { + "parameterType": "BOOLEAN", + "description": "Set to False to disable caching, True by default.", + "defaultValue": True, + "isOptional": True, + }, + "metadata": + { + "description": "Metadata arguments containing the run id and base path", + "parameterType": "STRING", + }, + "storage_args": + { + "parameterType": "STRING", + "description": "Storage arguments", + }, + }, + }, + "outputDefinitions": + { + "artifacts": + { + "output_manifest_path": + { + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "description": "Path to the output manifest", + }, + }, + }, + }, + }, + "deploymentSpec": + { + "executors": + { + "exec-Example_component": + { + "container": + { + "args": + [ + "--input_manifest_path", + "{{$.inputs.artifacts['input_manifest_path'].uri}}", + "--metadata", + "{{$.inputs.parameters['metadata']}}", + "--component_spec", + "{{$.inputs.parameters['component_spec']}}", + "--input_partition_rows", + "{{$.inputs.parameters['input_partition_rows']}}", + "--cache", + "{{$.inputs.parameters['cache']}}", + "--storage_args", + "{{$.inputs.parameters['storage_args']}}", + "--output_manifest_path", + 
"{{$.outputs.artifacts['output_manifest_path'].uri}}", + ], + "command": ["fondant", "execute", "main"], + "image": "example_component:latest", + }, + }, + }, + }, + "pipelineInfo": { "name": "Example_component" }, + "root": + { + "dag": + { + "outputs": + { + "artifacts": + { + "output_manifest_path": + { + "artifactSelectors": + [ + { + "outputArtifactKey": "output_manifest_path", + "producerSubtask": "Example_component", + }, + ], + }, + }, + }, + "tasks": + { + "Example_component": + { + "cachingOptions": { "enableCache": True }, + "componentRef": { "name": "comp-Example_component" }, + "inputs": + { + "artifacts": + { + "input_manifest_path": + { "componentInputArtifact": "input_manifest_path" }, + }, + "parameters": + { + "component_spec": + { "componentInputParameter": "component_spec" }, + "input_partition_rows": + { + "componentInputParameter": "input_partition_rows", + }, + "metadata": { "componentInputParameter": "metadata" }, + "cache": { "componentInputParameter": "cache" }, + }, + }, + "taskInfo": { "name": "Example_component" }, + }, + }, + }, + "inputDefinitions": + { + "artifacts": + { + "input_manifest_path": + { + "description": "Path to the input manifest", + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "isOptional": True, + }, + }, + "parameters": + { + "component_spec": + { + "description": "The component specification as a dictionary", + "defaultValue": {}, + "isOptional": True, + "parameterType": "STRUCT", + }, + "input_partition_rows": + { + "description": "The number of rows to load per partition. 
Set to override the automatic partitioning", + "isOptional": True, + "parameterType": "STRING", + "defaultValue": "None", + }, + "cache": + { + "parameterType": "BOOLEAN", + "description": "Set to False to disable caching, True by default.", + "defaultValue": True, + "isOptional": True, + }, + "metadata": + { + "description": "Metadata arguments containing the run id and base path", + "parameterType": "STRING", + }, + "storage_args": + { + "parameterType": "STRING", + "description": "Storage arguments", + }, + }, + }, + "outputDefinitions": + { + "artifacts": + { + "output_manifest_path": + { + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "description": "Path to the output manifest", + }, + }, + }, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.1", +} diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/example_specs/component_specs/kubeflow_component.yaml index 0f2026115..6d04bc018 100644 --- a/tests/example_specs/component_specs/kubeflow_component.yaml +++ b/tests/example_specs/component_specs/kubeflow_component.yaml @@ -1,62 +1,234 @@ -name: Example component -description: This is an example component -inputs: -- name: input_manifest_path - description: Path to the input manifest - type: String -- name: metadata - description: Metadata arguments containing the run id and base path - type: String -- name: component_spec - description: The component specification as a dictionary - type: JsonObject - default: None -- name: input_partition_rows - description: The number of rows to load per partition. Set to override the automatic - partitioning - type: String - default: None -- name: cache - description: Set to False to disable caching, True by default. 
- type: Boolean - default: 'True' -- name: cluster_type - description: The type of cluster to use for distributed execution - type: String - default: default -- name: client_kwargs - description: Keyword arguments used to initialise the dask client - type: JsonObject - default: '{}' -- name: storage_args - description: Storage arguments - type: String -outputs: -- name: output_manifest_path - description: Path to the output manifest - type: String -implementation: - container: - image: example_component:latest - command: - - fondant - - execute - - main - - --input_manifest_path - - inputPath: input_manifest_path - - --metadata - - inputValue: metadata - - --component_spec - - inputValue: component_spec - - --input_partition_rows - - inputValue: input_partition_rows - - --cache - - inputValue: cache - - --storage_args - - inputValue: storage_args - - --output_manifest_path - - outputPath: output_manifest_path - - --cluster_type - - inputValue: cluster_type - - --client_kwargs - - inputValue: client_kwargs +{ + "components": + { + "comp-Example_component": + { + "executorLabel": "exec-Example_component", + "inputDefinitions": + { + "artifacts": + { + "input_manifest_path": + { + "description": "Path to the input manifest", + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "isOptional": True, + }, + }, + "parameters": + { + "component_spec": + { + "description": "The component specification as a dictionary", + "defaultValue": {}, + "isOptional": True, + "parameterType": "STRUCT", + }, + "input_partition_rows": + { + "description": "The number of rows to load per partition. 
Set to override the automatic partitioning", + "isOptional": True, + "parameterType": "STRING", + "defaultValue": "None", + }, + "cache": + { + "parameterType": "BOOLEAN", + "description": "Set to False to disable caching, True by default.", + "defaultValue": True, + "isOptional": True, + }, + "metadata": + { + "description": "Metadata arguments containing the run id and base path", + "parameterType": "STRING", + }, + "storage_args": + { + "parameterType": "STRING", + "description": "Storage arguments", + }, + }, + }, + "outputDefinitions": + { + "artifacts": + { + "output_manifest_path": + { + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "description": "Path to the output manifest", + }, + }, + }, + }, + }, + "deploymentSpec": + { + "executors": + { + "exec-Example_component": + { + "container": + { + "args": + [ + "--input_manifest_path", + "{{$.inputs.artifacts['input_manifest_path'].uri}}", + "--metadata", + "{{$.inputs.parameters['metadata']}}", + "--component_spec", + "{{$.inputs.parameters['component_spec']}}", + "--input_partition_rows", + "{{$.inputs.parameters['input_partition_rows']}}", + "--cache", + "{{$.inputs.parameters['cache']}}", + "--storage_args", + "{{$.inputs.parameters['storage_args']}}", + "--output_manifest_path", + "{{$.outputs.artifacts['output_manifest_path'].uri}}", + ], + "command": ["fondant", "execute", "main"], + "image": "example_component:latest", + }, + }, + }, + }, + "pipelineInfo": { "name": "Example_component" }, + "root": + { + "dag": + { + "outputs": + { + "artifacts": + { + "output_manifest_path": + { + "artifactSelectors": + [ + { + "outputArtifactKey": "output_manifest_path", + "producerSubtask": "Example_component", + }, + ], + }, + }, + }, + "tasks": + { + "Example_component": + { + "cachingOptions": { "enableCache": True }, + "componentRef": + { "name": "comp-Example_component" }, + "inputs": + { + "artifacts": + { + "input_manifest_path": + { + "componentInputArtifact": 
"input_manifest_path", + }, + }, + "parameters": + { + "component_spec": + { + "componentInputParameter": "component_spec", + }, + "input_partition_rows": + { + "componentInputParameter": "input_partition_rows", + }, + "metadata": + { + "componentInputParameter": "metadata", + }, + "cache": + { + "componentInputParameter": "cache", + }, + }, + }, + "taskInfo": { "name": "Example_component" }, + }, + }, + }, + "inputDefinitions": + { + "artifacts": + { + "input_manifest_path": + { + "description": "Path to the input manifest", + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "isOptional": True, + }, + }, + "parameters": + { + "component_spec": + { + "description": "The component specification as a dictionary", + "defaultValue": {}, + "isOptional": True, + "parameterType": "STRUCT", + }, + "input_partition_rows": + { + "description": "The number of rows to load per partition. Set to override the automatic partitioning", + "isOptional": True, + "parameterType": "STRING", + "defaultValue": "None", + }, + "cache": + { + "parameterType": "BOOLEAN", + "description": "Set to False to disable caching, True by default.", + "defaultValue": True, + "isOptional": True, + }, + "metadata": + { + "description": "Metadata arguments containing the run id and base path", + "parameterType": "STRING", + }, + "storage_args": + { + "parameterType": "STRING", + "description": "Storage arguments", + }, + }, + }, + "outputDefinitions": + { + "artifacts": + { + "output_manifest_path": + { + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "description": "Path to the output manifest", + }, + }, + }, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.1", +} diff --git a/tests/test_cli.py b/tests/test_cli.py index 33b742763..55824a0e1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -141,7 +141,9 @@ def test_local_logic(tmp_path_factory): def test_kfp_compile(tmp_path_factory): - with 
tmp_path_factory.mktemp("temp") as fn: + with tmp_path_factory.mktemp("temp") as fn, patch( + "fondant.compiler.KubeFlowCompiler.compile", + ) as mock_compiler: args = argparse.Namespace( ref=__name__, kubeflow=True, @@ -149,6 +151,10 @@ def test_kfp_compile(tmp_path_factory): output_path=str(fn / "kubeflow_pipelines.yml"), ) compile(args) + mock_compiler.assert_called_once_with( + pipeline=TEST_PIPELINE, + output_path=str(fn / "kubeflow_pipelines.yml"), + ) def test_local_run(tmp_path_factory): @@ -218,9 +224,12 @@ def test_kfp_run(tmp_path_factory): ) run(args) mock_runner.assert_called_once_with(host="localhost") - with patch("fondant.cli.KubeflowRunner") as mock_runner, tmp_path_factory.mktemp( + with patch("fondant.cli.KubeflowRunner") as mock_runner, patch( + "fondant.cli.KubeFlowCompiler", + ) as mock_compiler, tmp_path_factory.mktemp( "temp", ) as fn: + mock_compiler.compile.return_value = "some/path" args = argparse.Namespace( kubeflow=True, local=False, diff --git a/tests/test_compiler.py b/tests/test_compiler.py index b8f1364bd..acf54566a 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -5,7 +5,7 @@ import pytest import yaml -from fondant.compiler import DockerCompiler, KubeFlowCompiler +from fondant.compiler import DockerCompiler, KubeFlowCompiler, VertexCompiler from fondant.pipeline import ComponentOp, Pipeline COMPONENTS_PATH = Path("./tests/example_pipelines/valid_pipeline") @@ -84,7 +84,7 @@ def now(cls): @pytest.fixture(params=TEST_PIPELINES) def setup_pipeline(request, tmp_path, monkeypatch): pipeline = Pipeline( - pipeline_name="test_pipeline", + pipeline_name="testpipeline", pipeline_description="description of the test pipeline", base_path="/foo/bar", ) @@ -141,7 +141,7 @@ def test_docker_local_path(setup_pipeline, tmp_path_factory): with open(fn / "docker-compose.yml") as f_spec: spec = yaml.safe_load(f_spec) - expected_run_id = "test_pipeline-20230101000000" + expected_run_id = "testpipeline-20230101000000" for name, 
service in spec["services"].items(): # check if volumes are defined correctly @@ -153,10 +153,11 @@ def test_docker_local_path(setup_pipeline, tmp_path_factory): "type": "bind", }, ] + cleaned_pipeline_name = pipeline.name.replace("_", "") # check if commands are patched to use the working dir commands_with_dir = [ - f"{work_dir}/{pipeline.name}/{expected_run_id}/{name}/manifest.json", - f'{{"base_path": "{work_dir}", "pipeline_name": "{pipeline.name}",' + f"{work_dir}/{cleaned_pipeline_name}/{expected_run_id}/{name}/manifest.json", + f'{{"base_path": "{work_dir}", "pipeline_name": "{cleaned_pipeline_name}",' f' "run_id": "{expected_run_id}", "component_id": "{name}",' f' "cache_key": "{cache_key}"}}', ] @@ -178,15 +179,16 @@ def test_docker_remote_path(setup_pipeline, tmp_path_factory): with open(fn / "docker-compose.yml") as f_spec: spec = yaml.safe_load(f_spec) - expected_run_id = "test_pipeline-20230101000000" + expected_run_id = "testpipeline-20230101000000" for name, service in spec["services"].items(): cache_key = cache_dict[name] # check that no volumes are created assert service["volumes"] == [] # check if commands are patched to use the remote dir + cleaned_pipeline_name = pipeline.name.replace("_", "") commands_with_dir = [ - f"{remote_dir}/{pipeline.name}/{expected_run_id}/{name}/manifest.json", - f'{{"base_path": "{remote_dir}", "pipeline_name": "{pipeline.name}",' + f"{remote_dir}/{cleaned_pipeline_name}/{expected_run_id}/{name}/manifest.json", + f'{{"base_path": "{remote_dir}", "pipeline_name": "{cleaned_pipeline_name}",' f' "run_id": "{expected_run_id}", "component_id": "{name}",' f' "cache_key": "{cache_key}"}}', ] @@ -233,30 +235,13 @@ def test_kubeflow_compiler(setup_pipeline, tmp_path_factory): assert yaml.safe_load(src) == yaml.safe_load(truth) -@pytest.mark.usefixtures("_freeze_time") -def test_kubeflow_configuration(tmp_path_factory): - """Test that the kubeflow pipeline can be configured.""" - pipeline = Pipeline( - 
pipeline_name="test_pipeline", - pipeline_description="description of the test pipeline", - base_path="/foo/bar", - ) - component_1 = ComponentOp( - Path(COMPONENTS_PATH / "example_1" / "first_component"), - arguments={"storage_args": "a dummy string arg"}, - node_pool_name="a_node_pool", - node_pool_label="a_node_pool_label", - number_of_gpus=1, - ) - pipeline.add_op(component_1) - compiler = KubeFlowCompiler() - with tmp_path_factory.mktemp("temp") as fn: - output_path = str(fn / "kubeflow_pipeline.yml") - compiler.compile(pipeline=pipeline, output_path=output_path) - with open(output_path) as src, open( - VALID_PIPELINE / "kubeflow_pipeline.yml", - ) as truth: - assert yaml.safe_load(src) == yaml.safe_load(truth) +# @pytest.mark.usefixtures("_freeze_time") +# def test_kubeflow_configuration(tmp_path_factory): +# """Test that the kubeflow pipeline can be configured.""" +# with tmp_path_factory.mktemp("temp") as fn: +# with open(output_path) as src, open( +# VALID_PIPELINE / "kubeflow_pipeline.yml", +# ) as truth: def test_kfp_import(): @@ -266,3 +251,17 @@ def test_kfp_import(): sys.modules["kfp"] = None with pytest.raises(ImportError): _ = KubeFlowCompiler() + + +@pytest.mark.usefixtures("_freeze_time") +def test_vertex_compiler(setup_pipeline, tmp_path_factory): + """Test compiling a pipeline to vertex.""" + example_dir, pipeline, _ = setup_pipeline + compiler = VertexCompiler() + with tmp_path_factory.mktemp("temp") as fn: + output_path = str(fn / "vertex_pipeline.json") + compiler.compile(pipeline=pipeline, output_path=output_path) + with open(output_path) as src, open( + VALID_PIPELINE / example_dir / "vertex_pipeline.yml", + ) as truth: + assert yaml.safe_load(src) == yaml.safe_load(truth) diff --git a/tests/test_component_specs.py b/tests/test_component_specs.py index 56499515d..12baf3805 100644 --- a/tests/test_component_specs.py +++ b/tests/test_component_specs.py @@ -6,7 +6,11 @@ import pytest import yaml -from fondant.component_spec import 
ComponentSpec, ComponentSubset, KubeflowComponentSpec +from fondant.component_spec import ( + ComponentSpec, + ComponentSubset, + KubeflowComponentSpec, +) from fondant.exceptions import InvalidComponentSpec from fondant.schema import Type From 52102a54d0efb981207f410095c8a651477eed98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georges=20Lorr=C3=A9?= <35808396+GeorgesLorre@users.noreply.github.com> Date: Fri, 15 Sep 2023 09:45:52 +0200 Subject: [PATCH 02/31] Add vertex runner (#429) adresses: https://github.com/ml6team/fondant/issues/417 --- .github/workflows/pipeline.yaml | 2 +- pyproject.toml | 6 +++--- src/fondant/runner.py | 36 +++++++++++++++++++++++++++++++++ tests/test_runner.py | 19 ++++++++++++++++- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index 38bc56cce..11deb2329 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10'] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index bf3c2732f..b2b38025b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,9 +40,8 @@ classifiers = [ ] [tool.poetry.dependencies] -dask = {extras = ["dataframe", "distributed", "diagnostics"], version = ">= 2023.4.1"} python = ">= 3.8 < 3.11" -dask = {extras = ["dataframe"], version = ">= 2023.4.1"} +dask = {extras = ["dataframe", "distributed", "diagnostics"], version = ">= 2023.4.1"} importlib-resources = { version = ">= 1.3", python = "<3.9" } jsonschema = ">= 4.18" pyarrow = ">= 11.0.0" @@ -53,13 +52,14 @@ s3fs = { version = ">= 2023.4.0", optional = true } adlfs = { version = ">= 2023.4.0", optional = true } kfp = { version = "2.0.1", optional = true } pandas = { version = ">= 1.3.5", optional = true } +google-cloud-aiplatform = { 
version = "1.32.0", optional = true} [tool.poetry.extras] aws = ["fsspec", "s3fs"] azure = ["fsspec", "adlfs"] gcp = ["fsspec", "gcsfs"] kfp = ["kfp"] -vertex = ["kfp"] +vertex = ["kfp", "google-cloud-aiplatform"] [tool.poetry.group.test.dependencies] pre-commit = "^3.1.1" diff --git a/src/fondant/runner.py b/src/fondant/runner.py index ff75ae1a3..59035e084 100644 --- a/src/fondant/runner.py +++ b/src/fondant/runner.py @@ -1,5 +1,6 @@ import logging import subprocess # nosec +import typing as t from abc import ABC, abstractmethod import yaml @@ -85,3 +86,38 @@ def get_name_from_spec(self, input_spec: str): with open(input_spec) as f: spec = yaml.safe_load(f) return spec["pipelineInfo"]["name"] + + +class VertexRunner(Runner): + def __resolve_imports(self): + import google.cloud.aiplatform as aip + + self.aip = aip + + def __init__( + self, + project_id: str, + project_region: str, + service_account: t.Optional[str] = None, + ): + self.__resolve_imports() + + self.aip.init( + project=project_id, + location=project_region, + ) + self.service_account = service_account + + def run(self, input_spec: str, *args, **kwargs): + job = self.aip.PipelineJob( + display_name=self.get_name_from_spec(input_spec), + template_path=input_spec, + enable_caching=False, + ) + job.submit(service_account=self.service_account) + + def get_name_from_spec(self, input_spec: str): + """Get the name of the pipeline from the spec.""" + with open(input_spec) as f: + spec = yaml.safe_load(f) + return spec["pipelineInfo"]["name"] diff --git a/tests/test_runner.py b/tests/test_runner.py index 975359db3..b73deb32f 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -4,7 +4,7 @@ from unittest import mock import pytest -from fondant.runner import DockerRunner, KubeflowRunner +from fondant.runner import DockerRunner, KubeflowRunner, VertexRunner VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") @@ -79,3 +79,20 @@ def test_kfp_import(): sys.modules["kfp"] = None with 
pytest.raises(ImportError): _ = KubeflowRunner(host="some_host") + + +def test_vertex_runner(): + input_spec_path = str(VALID_PIPELINE / "kubeflow_pipeline.yml") + with mock.patch("google.cloud.aiplatform.init", return_value=None), mock.patch( + "google.cloud.aiplatform.PipelineJob", + ): + runner = VertexRunner(project_id="some_project", project_region="some_region") + runner.run(input_spec=input_spec_path) + + # test with service account + runner2 = VertexRunner( + project_id="some_project", + project_region="some_region", + service_account="some_account", + ) + runner2.run(input_spec=input_spec_path) From a0ea8e4099088cdc7fcd3299bfb0d59fb21f7439 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Fri, 15 Sep 2023 15:10:07 +0200 Subject: [PATCH 03/31] Add hardware configs (#433) PR that adds option to set hardware accelerators to KFP V2 (nodepool Notes: * Each component was previously configured as a sub-pipeline because of an error in detecting a certain field in the V2 spec (`exec- str: + """Cleans and converts a name to be kfp V2 compatible. 
+ + Taken from https://github.com/kubeflow/pipelines/blob/ + cfe671c485d4ee8514290ee81ca2785e8bda5c9b/sdk/python/kfp/dsl/utils.py#L52 + """ + return ( + re.sub("-+", "-", re.sub("[^-0-9a-z]+", "-", name.lower())) + .lstrip("-") + .rstrip("-") + ) + @classmethod def from_fondant_component_spec(cls, fondant_component: ComponentSpec): """Generate a Kubeflow component spec from a ComponentOp.""" @@ -286,10 +300,8 @@ def from_fondant_component_spec(cls, fondant_component: ComponentSpec): }, } - cleaned_component_name = fondant_component.name.replace("-", "_").replace( - " ", - "_", - ) + cleaned_component_name = cls.sanitize_component_name(fondant_component.name) + output_definitions = { "artifacts": { "output_manifest_path": { diff --git a/src/fondant/pipeline.py b/src/fondant/pipeline.py index 87081d86e..e95bd833e 100644 --- a/src/fondant/pipeline.py +++ b/src/fondant/pipeline.py @@ -20,6 +20,28 @@ logger = logging.getLogger(__name__) +valid_accelerator_types = [ + "GPU", + "TPU", +] + +# Taken from https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform_v1/types +# /accelerator_type.py +valid_vertex_accelerator_types = [ + "ACCELERATOR_TYPE_UNSPECIFIED", + "NVIDIA_TESLA_K80", + "NVIDIA_TESLA_P100", + "NVIDIA_TESLA_V100", + "NVIDIA_TESLA_P4", + "NVIDIA_TESLA_T4", + "NVIDIA_TESLA_A100", + "NVIDIA_A100_80GB", + "NVIDIA_L4", + "TPU_V2", + "TPU_V3", + "TPU_V4_POD", +] + class ComponentOp: """ @@ -31,7 +53,12 @@ class ComponentOp: arguments: A dictionary containing the argument name and value for the operation. input_partition_rows: The number of rows to load per partition. Set to override the automatic partitioning - number_of_gpus: The number of gpus to assign to the operation + number_of_accelerators: The number of accelerators to assign to the operation (GPU, TPU) + accelerator_name: The name of the accelerator to assign. If you're using a cluster setup + on GKE, select "GPU" for GPU or "TPU" for TPU. 
Make sure + that you select a nodepool with the available hardware. If you're running the + pipeline on Vertex, then select one of the machines specified in the list of + accelerators here https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec. node_pool_label: The label of the node pool to which the operation will be assigned. node_pool_name: The name of the node pool to which the operation will be assigned. cache: Set to False to disable caching, True by default. @@ -45,8 +72,8 @@ class ComponentOp: Note: - A Fondant Component operation is created by defining a Fondant Component and its input arguments. - - The `number_of_gpus`, `node_pool_label`, `node_pool_name`, `cache`, `cluster_type` and - `client_kwargs` attributes are optional and can be used to specify additional + - The `accelerator_name`, `node_pool_label`, `node_pool_name` + attributes are optional and can be used to specify additional configurations for the operation. More information on the optional attributes that can be assigned to kfp components here: https://kubeflow-pipelines.readthedocs.io/en/1.8.13/source/kfp.dsl.html @@ -60,7 +87,8 @@ def __init__( *, arguments: t.Optional[t.Dict[str, t.Any]] = None, input_partition_rows: t.Optional[t.Union[str, int]] = None, - number_of_gpus: t.Optional[int] = None, + number_of_accelerators: t.Optional[int] = None, + accelerator_name: t.Optional[str] = None, node_pool_label: t.Optional[str] = None, node_pool_name: t.Optional[str] = None, cache: t.Optional[bool] = True, @@ -89,14 +117,20 @@ def __init__( self._add_component_argument("client_kwargs", client_kwargs) self.arguments.setdefault("component_spec", self.component_spec.specification) - - self.number_of_gpus = number_of_gpus self.node_pool_label, self.node_pool_name = self._validate_node_pool_spec( node_pool_label, node_pool_name, ) self.preemptible = preemptible + ( + self.number_of_accelerators, + self.accelerator_name, + ) = self._validate_accelerator_spec( + number_of_accelerators, + 
accelerator_name, + ) + def _configure_caching_from_image_tag( self, cache: t.Optional[bool], @@ -143,8 +177,8 @@ def _add_component_argument( self.argument_name = argument_value self.arguments[argument_name] = argument_value + @staticmethod def _validate_node_pool_spec( - self, node_pool_label, node_pool_name, ) -> t.Tuple[t.Optional[str], t.Optional[str]]: @@ -156,6 +190,23 @@ def _validate_node_pool_spec( ) return node_pool_label, node_pool_name + def _validate_accelerator_spec( + self, + number_of_accelerators, + accelerator_name, + ) -> t.Tuple[t.Optional[int], t.Optional[str]]: + """Validate accelerator specification.""" + if bool(number_of_accelerators) != bool(accelerator_name): + msg = ( + "Both number of accelerators and accelerator name must be specified or both must" + " be None." + ) + raise InvalidPipelineDefinition( + msg, + ) + + return number_of_accelerators, accelerator_name + @property def dockerfile_path(self) -> t.Optional[Path]: path = self.component_dir / "Dockerfile" @@ -168,7 +219,8 @@ def from_registry( *, arguments: t.Optional[t.Dict[str, t.Any]] = None, input_partition_rows: t.Optional[t.Union[int, str]] = None, - number_of_gpus: t.Optional[int] = None, + number_of_accelerators: t.Optional[int] = None, + accelerator_name: t.Optional[str] = None, node_pool_label: t.Optional[str] = None, node_pool_name: t.Optional[str] = None, cache: t.Optional[bool] = True, @@ -183,7 +235,12 @@ def from_registry( arguments: A dictionary containing the argument name and value for the operation. input_partition_rows: The number of rows to load per partition. Set to override the automatic partitioning - number_of_gpus: The number of gpus to assign to the operation + number_of_accelerators: The number of accelerators to assign to the operation (GPU, TPU) + accelerator_name: The name of the accelerator to assign. If you're using a cluster setup + on GKE, select "GPU" for GPU or "TPU" for TPU. Make + sure that you select a nodepool with the available hardware. 
If you're running the + pipeline on Vertex, then select one of the machines specified in the list of + accelerators here https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec. node_pool_label: The label of the node pool to which the operation will be assigned. node_pool_name: The name of the node pool to which the operation will be assigned. cache: Set to False to disable caching, True by default. @@ -205,7 +262,8 @@ def from_registry( components_dir, arguments=arguments, input_partition_rows=input_partition_rows, - number_of_gpus=number_of_gpus, + number_of_accelerators=number_of_accelerators, + accelerator_name=accelerator_name, node_pool_label=node_pool_label, node_pool_name=node_pool_name, cache=cache, @@ -247,7 +305,8 @@ def get_nested_dict_hash(input_dict): "component_spec_hash": get_nested_dict_hash(component_spec_dict), "arguments": arguments, "input_partition_rows": self.input_partition_rows, - "number_of_gpus": self.number_of_gpus, + "number_of_accelerators": self.number_of_accelerators, + "accelerator_name": self.accelerator_name, "node_pool_name": self.node_pool_name, } diff --git a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml index 954616903..ec9258c1b 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml @@ -24,16 +24,6 @@ services: {"type": "binary"}}}, "captions": {"fields": {"data": {"type": "string"}}}}, "args": {"storage_args": {"description": "Storage arguments", "type": "str"}}}' depends_on: {} - deploy: - resources: - reservations: - devices: - - capabilities: - - gpu - count: 1 - driver: nvidia - ports: - - 8787:8787 volumes: [] second_component: build: diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index 
c5990e87e..b48723b17 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -2,185 +2,31 @@ # Name: testpipeline # Description: description of the test pipeline components: - comp-First_component: - executorLabel: exec-First_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest - comp-Second_component: - executorLabel: exec-Second_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. 
- isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest - comp-Third_component: - executorLabel: exec-Third_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest comp-first-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: First_component - tasks: - First_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-First_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: First_component + executorLabel: exec-first-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -188,66 +34,31 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest comp-second-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: Second_component - tasks: - Second_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-Second_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: Second_component + executorLabel: exec-second-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -255,66 +66,31 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest comp-third-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: Third_component - tasks: - Third_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-Third_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: Third_component + executorLabel: exec-third-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -322,10 +98,9 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest deploymentSpec: executors: - exec-First_component: + exec-first-component: container: args: - --input_manifest_path @@ -347,7 +122,7 @@ deploymentSpec: - execute - main image: example_component:latest - exec-Second_component: + exec-second-component: container: args: - --input_manifest_path @@ -369,7 +144,7 @@ deploymentSpec: - execute - main image: example_component:latest - exec-Third_component: + exec-third-component: container: args: - --input_manifest_path diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml index c5990e87e..b48723b17 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -2,185 +2,31 @@ # Name: testpipeline # Description: description of the test pipeline components: - comp-First_component: - executorLabel: exec-First_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest - comp-Second_component: - executorLabel: exec-Second_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest - comp-Third_component: - executorLabel: exec-Third_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. 
- isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest comp-first-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: First_component - tasks: - First_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-First_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: First_component + executorLabel: exec-first-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -188,66 +34,31 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest comp-second-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: Second_component - tasks: - Second_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-Second_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: Second_component + executorLabel: exec-second-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -255,66 +66,31 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest comp-third-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: Third_component - tasks: - Third_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-Third_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: Third_component + executorLabel: exec-third-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -322,10 +98,9 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest deploymentSpec: executors: - exec-First_component: + exec-first-component: container: args: - --input_manifest_path @@ -347,7 +122,7 @@ deploymentSpec: - execute - main image: example_component:latest - exec-Second_component: + exec-second-component: container: args: - --input_manifest_path @@ -369,7 +144,7 @@ deploymentSpec: - execute - main image: example_component:latest - exec-Third_component: + exec-third-component: container: args: - --input_manifest_path diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index 749d5cf75..e534b1a36 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -2,155 +2,31 @@ # Name: testpipeline # Description: description of the test pipeline components: - comp-First_component: - executorLabel: exec-First_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest - comp-Image_cropping: - executorLabel: exec-Image_cropping - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - cropping_threshold: - defaultValue: -30.0 - description: Threshold parameter used for detecting borders. A lower (negative) - parameter results in a more performant border detection, but can cause - overcropping. Default is -30 - isOptional: true - parameterType: NUMBER_INTEGER - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - padding: - defaultValue: 10.0 - description: Padding for the image cropping. The padding is added to all - borders of the image. 
- isOptional: true - parameterType: NUMBER_INTEGER - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest comp-first-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: First_component - tasks: - First_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-First_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: First_component + executorLabel: exec-first-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -158,75 +34,36 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest comp-image-cropping: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: Image_cropping - tasks: - Image_cropping: - cachingOptions: - enableCache: true - componentRef: - name: comp-Image_cropping - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: Image_cropping + executorLabel: exec-image-cropping inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT cropping_threshold: defaultValue: -30.0 - description: Threshold parameter used for detecting borders. A lower (negative) - parameter results in a more performant border detection, but can cause - overcropping. Default is -30 isOptional: true parameterType: NUMBER_INTEGER input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING padding: defaultValue: 10.0 - description: Padding for the image cropping. The padding is added to all - borders of the image. isOptional: true parameterType: NUMBER_INTEGER outputDefinitions: @@ -235,10 +72,9 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest deploymentSpec: executors: - exec-First_component: + exec-first-component: container: args: - --input_manifest_path @@ -260,7 +96,7 @@ deploymentSpec: - execute - main image: example_component:latest - exec-Image_cropping: + exec-image-cropping: container: args: - --input_manifest_path diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml index 749d5cf75..e534b1a36 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -2,155 +2,31 @@ # Name: testpipeline # Description: description of the test pipeline components: - comp-First_component: - executorLabel: exec-First_component - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - description: Storage arguments - parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest - comp-Image_cropping: - executorLabel: exec-Image_cropping - inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the input manifest - isOptional: true - parameters: - cache: - defaultValue: true - description: Set to False to disable caching, True by default. - isOptional: true - parameterType: BOOLEAN - component_spec: - defaultValue: {} - description: The component specification as a dictionary - isOptional: true - parameterType: STRUCT - cropping_threshold: - defaultValue: -30.0 - description: Threshold parameter used for detecting borders. A lower (negative) - parameter results in a more performant border detection, but can cause - overcropping. Default is -30 - isOptional: true - parameterType: NUMBER_INTEGER - input_partition_rows: - defaultValue: None - description: The number of rows to load per partition. Set to override the - automatic partitioning - isOptional: true - parameterType: STRING - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - padding: - defaultValue: 10.0 - description: Padding for the image cropping. The padding is added to all - borders of the image. 
- isOptional: true - parameterType: NUMBER_INTEGER - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest comp-first-component: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: First_component - tasks: - First_component: - cachingOptions: - enableCache: true - componentRef: - name: comp-First_component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: First_component + executorLabel: exec-first-component inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING storage_args: - description: Storage arguments parameterType: STRING outputDefinitions: artifacts: @@ -158,75 +34,36 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest comp-image-cropping: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: Image_cropping - tasks: - Image_cropping: - cachingOptions: - enableCache: true - componentRef: - name: comp-Image_cropping - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - cache: - componentInputParameter: cache - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - taskInfo: - name: Image_cropping + executorLabel: exec-image-cropping inputDefinitions: artifacts: input_manifest_path: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the input manifest isOptional: true parameters: cache: defaultValue: true - description: Set to False to disable caching, True by default. isOptional: true parameterType: BOOLEAN component_spec: defaultValue: {} - description: The component specification as a dictionary isOptional: true parameterType: STRUCT cropping_threshold: defaultValue: -30.0 - description: Threshold parameter used for detecting borders. A lower (negative) - parameter results in a more performant border detection, but can cause - overcropping. Default is -30 isOptional: true parameterType: NUMBER_INTEGER input_partition_rows: defaultValue: None - description: The number of rows to load per partition. 
Set to override the - automatic partitioning isOptional: true parameterType: STRING metadata: - description: Metadata arguments containing the run id and base path parameterType: STRING padding: defaultValue: 10.0 - description: Padding for the image cropping. The padding is added to all - borders of the image. isOptional: true parameterType: NUMBER_INTEGER outputDefinitions: @@ -235,10 +72,9 @@ components: artifactType: schemaTitle: system.Artifact schemaVersion: 0.0.1 - description: Path to the output manifest deploymentSpec: executors: - exec-First_component: + exec-first-component: container: args: - --input_manifest_path @@ -260,7 +96,7 @@ deploymentSpec: - execute - main image: example_component:latest - exec-Image_cropping: + exec-image-cropping: container: args: - --input_manifest_path diff --git a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml index 732fc02f4..a622b4d62 100644 --- a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml @@ -1,9 +1,9 @@ { "components": { - "comp-Example_component": + "comp-example-component": { - "executorLabel": "exec-Example_component", + "executorLabel": "exec-example-component", "inputDefinitions": { "artifacts": @@ -75,7 +75,7 @@ { "executors": { - "exec-Example_component": + "exec-example-component": { "container": { @@ -102,7 +102,7 @@ }, }, }, - "pipelineInfo": { "name": "Example_component" }, + "pipelineInfo": { "name": "example-component" }, "root": { "dag": @@ -117,7 +117,7 @@ [ { "outputArtifactKey": "output_manifest_path", - "producerSubtask": "Example_component", + "producerSubtask": "example-component", }, ], }, @@ -125,10 +125,10 @@ }, "tasks": { - "Example_component": + "example-component": { "cachingOptions": { "enableCache": True }, - "componentRef": { "name": "comp-Example_component" }, + "componentRef": { "name": "comp-example-component" }, 
"inputs": { "artifacts": @@ -148,7 +148,7 @@ "cache": { "componentInputParameter": "cache" }, }, }, - "taskInfo": { "name": "Example_component" }, + "taskInfo": { "name": "example-component" }, }, }, }, diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/example_specs/component_specs/kubeflow_component.yaml index 6d04bc018..995aa67a7 100644 --- a/tests/example_specs/component_specs/kubeflow_component.yaml +++ b/tests/example_specs/component_specs/kubeflow_component.yaml @@ -1,9 +1,9 @@ { "components": { - "comp-Example_component": + "comp-example-component": { - "executorLabel": "exec-Example_component", + "executorLabel": "exec-example-component", "inputDefinitions": { "artifacts": @@ -75,7 +75,7 @@ { "executors": { - "exec-Example_component": + "exec-example-component": { "container": { @@ -102,7 +102,7 @@ }, }, }, - "pipelineInfo": { "name": "Example_component" }, + "pipelineInfo": { "name": "example-component" }, "root": { "dag": @@ -117,7 +117,7 @@ [ { "outputArtifactKey": "output_manifest_path", - "producerSubtask": "Example_component", + "producerSubtask": "example-component", }, ], }, @@ -125,11 +125,11 @@ }, "tasks": { - "Example_component": + "example-component": { "cachingOptions": { "enableCache": True }, "componentRef": - { "name": "comp-Example_component" }, + { "name": "comp-example-component" }, "inputs": { "artifacts": @@ -159,7 +159,7 @@ }, }, }, - "taskInfo": { "name": "Example_component" }, + "taskInfo": { "name": "example-component" }, }, }, }, diff --git a/tests/test_compiler.py b/tests/test_compiler.py index acf54566a..44c95f958 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -5,7 +5,9 @@ import pytest import yaml + from fondant.compiler import DockerCompiler, KubeFlowCompiler, VertexCompiler +from fondant.exceptions import InvalidPipelineDefinition from fondant.pipeline import ComponentOp, Pipeline COMPONENTS_PATH = Path("./tests/example_pipelines/valid_pipeline") @@ -21,8 +23,6 @@ 
Path(COMPONENTS_PATH / "example_1" / "first_component"), arguments={"storage_args": "a dummy string arg"}, input_partition_rows="disable", - number_of_gpus=1, - preemptible=True, ), "cache_key": "1", }, @@ -221,6 +221,68 @@ def test_docker_extra_volumes(setup_pipeline, tmp_path_factory): ) +@pytest.mark.usefixtures("_freeze_time") +def test_docker_configuration(tmp_path_factory): + """Test that extra volumes are applied correctly.""" + pipeline = Pipeline( + pipeline_name="test_pipeline", + pipeline_description="description of the test pipeline", + base_path="/foo/bar", + ) + component_1 = ComponentOp( + Path(COMPONENTS_PATH / "example_1" / "first_component"), + arguments={"storage_args": "a dummy string arg"}, + number_of_accelerators=1, + accelerator_name="GPU", + ) + + expected_resources = { + "reservations": { + "devices": [ + { + "capabilities": ["gpu"], + "count": 1, + "driver": "nvidia", + }, + ], + }, + } + + pipeline.add_op(component_1) + compiler = DockerCompiler() + with tmp_path_factory.mktemp("temp") as fn: + output_path = str(fn / "docker-compose.yaml") + compiler.compile(pipeline=pipeline, output_path=output_path) + # read the generated docker-compose file + with open(output_path) as f_spec: + spec = yaml.safe_load(f_spec) + assert ( + spec["services"]["first_component"]["deploy"]["resources"] + == expected_resources + ) + + +@pytest.mark.usefixtures("_freeze_time") +def test_invalid_docker_configuration(tmp_path_factory): + """Test that extra volumes are applied correctly.""" + pipeline = Pipeline( + pipeline_name="test_pipeline", + pipeline_description="description of the test pipeline", + base_path="/foo/bar", + ) + component_1 = ComponentOp( + Path(COMPONENTS_PATH / "example_1" / "first_component"), + arguments={"storage_args": "a dummy string arg"}, + number_of_accelerators=1, + accelerator_name="unknown resource", + ) + + pipeline.add_op(component_1) + compiler = DockerCompiler() + with pytest.raises(InvalidPipelineDefinition): + 
compiler.compile(pipeline=pipeline, output_path="kubeflow_pipeline.yml") + + @pytest.mark.usefixtures("_freeze_time") def test_kubeflow_compiler(setup_pipeline, tmp_path_factory): """Test compiling a pipeline to kubeflow.""" @@ -235,13 +297,71 @@ def test_kubeflow_compiler(setup_pipeline, tmp_path_factory): assert yaml.safe_load(src) == yaml.safe_load(truth) -# @pytest.mark.usefixtures("_freeze_time") -# def test_kubeflow_configuration(tmp_path_factory): -# """Test that the kubeflow pipeline can be configured.""" -# with tmp_path_factory.mktemp("temp") as fn: -# with open(output_path) as src, open( -# VALID_PIPELINE / "kubeflow_pipeline.yml", -# ) as truth: +@pytest.mark.usefixtures("_freeze_time") +def test_kubeflow_configuration(tmp_path_factory): + """Test that the kubeflow pipeline can be configured.""" + node_pool_label = "dummy_label" + node_pool_name = "dummy_label" + + pipeline = Pipeline( + pipeline_name="test_pipeline", + pipeline_description="description of the test pipeline", + base_path="/foo/bar", + ) + component_1 = ComponentOp( + Path(COMPONENTS_PATH / "example_1" / "first_component"), + arguments={"storage_args": "a dummy string arg"}, + node_pool_label=node_pool_label, + node_pool_name=node_pool_name, + number_of_accelerators=1, + accelerator_name="GPU", + ) + pipeline.add_op(component_1) + compiler = KubeFlowCompiler() + with tmp_path_factory.mktemp("temp") as fn: + output_path = str(fn / "kubeflow_pipeline.yml") + compiler.compile(pipeline=pipeline, output_path=output_path) + with open(output_path) as src: + # Two specs are present and loaded in the yaml file (component spec and k8s specs) + compiled_specs = yaml.load_all(src, Loader=yaml.FullLoader) + for spec in compiled_specs: + if "platforms" in spec: + component_kubernetes_spec = spec["platforms"]["kubernetes"][ + "deploymentSpec" + ]["executors"]["exec-first-component"] + assert component_kubernetes_spec["nodeSelector"]["labels"] == { + node_pool_label: node_pool_name, + } + + elif 
"deploymentSpec" in spec: + component_resources = spec["deploymentSpec"]["executors"][ + "exec-first-component" + ]["container"]["resources"] + assert component_resources["accelerator"]["count"] == "1" + assert ( + component_resources["accelerator"]["type"] == "nvidia.com/gpu" + ) + + +@pytest.mark.usefixtures("_freeze_time") +def test_invalid_kubeflow_configuration(tmp_path_factory): + """Test that an error is returned when an invalid resource is provided.""" + pipeline = Pipeline( + pipeline_name="test_pipeline", + pipeline_description="description of the test pipeline", + base_path="/foo/bar", + ) + component_1 = ComponentOp( + Path(COMPONENTS_PATH / "example_1" / "first_component"), + arguments={"storage_args": "a dummy string arg"}, + number_of_accelerators=1, + accelerator_name="unknown resource", + ) + + pipeline.add_op(component_1) + compiler = KubeFlowCompiler() + with pytest.raises(InvalidPipelineDefinition): + compiler.compile(pipeline=pipeline, output_path="kubeflow_pipeline.yml") def test_kfp_import(): @@ -265,3 +385,54 @@ def test_vertex_compiler(setup_pipeline, tmp_path_factory): VALID_PIPELINE / example_dir / "vertex_pipeline.yml", ) as truth: assert yaml.safe_load(src) == yaml.safe_load(truth) + + +@pytest.mark.usefixtures("_freeze_time") +def test_vertex_configuration(tmp_path_factory): + """Test that the kubeflow pipeline can be configured.""" + pipeline = Pipeline( + pipeline_name="test_pipeline", + pipeline_description="description of the test pipeline", + base_path="/foo/bar", + ) + component_1 = ComponentOp( + Path(COMPONENTS_PATH / "example_1" / "first_component"), + arguments={"storage_args": "a dummy string arg"}, + number_of_accelerators=1, + accelerator_name="NVIDIA_TESLA_K80", + ) + pipeline.add_op(component_1) + compiler = VertexCompiler() + with tmp_path_factory.mktemp("temp") as fn: + output_path = str(fn / "vertex_pipeline.yml") + compiler.compile(pipeline=pipeline, output_path=output_path) + with open(output_path) as src: + # Two 
specs are present and loaded in the yaml file (component spec and k8s specs) + compiled_specs = yaml.safe_load(src) + + component_resources = compiled_specs["deploymentSpec"]["executors"][ + "exec-first-component" + ]["container"]["resources"] + assert component_resources["accelerator"]["count"] == "1" + assert component_resources["accelerator"]["type"] == "NVIDIA_TESLA_K80" + + +@pytest.mark.usefixtures("_freeze_time") +def test_invalid_vertex_configuration(tmp_path_factory): + """Test that extra volumes are applied correctly.""" + pipeline = Pipeline( + pipeline_name="test_pipeline", + pipeline_description="description of the test pipeline", + base_path="/foo/bar", + ) + component_1 = ComponentOp( + Path(COMPONENTS_PATH / "example_1" / "first_component"), + arguments={"storage_args": "a dummy string arg"}, + number_of_accelerators=1, + accelerator_name="unknown resource", + ) + + pipeline.add_op(component_1) + compiler = VertexCompiler() + with pytest.raises(InvalidPipelineDefinition): + compiler.compile(pipeline=pipeline, output_path="kubeflow_pipeline.yml") diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index c674ad2a2..0acad2aa2 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -53,6 +53,13 @@ def test_component_op( node_pool_label="dummy_label", ) + with pytest.raises(InvalidPipelineDefinition): + ComponentOp( + Path(components_path / component_names[0]), + arguments=component_args, + number_of_accelerators=1, + ) + @pytest.mark.parametrize( "valid_pipeline_example", From 65e45532966a035ccd1ecc7217d221edbfb5cf51 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Mon, 18 Sep 2023 17:12:11 +0200 Subject: [PATCH 04/31] Fix v2 defaults (#436) PR that fixes the v2 default /optional types. 
KFP v2 has two different optional types: - Optional values that default to None -> defined by setting `isOptional` to True and not passing the default value - Optional values that default to another value -> defined by setting the value in `defaultValue` and having `isOptional` False or empty. This PR adjusts the componentSpec accordingly. Another nice win is that we do not have to define bools/dicts/lists as strings in the component spec. A few things to note: * Due to strict typing, mixing different data types is not allowed -> we had to change the type of `input_partition_rows` to int, which means it does not accept the value `disable` anymore. This needs to be fixed in a separate PR * `inf` as float is not allowed --- .../download_images/fondant_component.yaml | 2 +- docs/components/component_spec.md | 2 - src/fondant/compiler.py | 15 +- src/fondant/component_spec.py | 33 +- src/fondant/executor.py | 2 +- src/fondant/schemas/component_spec.json | 9 + .../example_1/docker-compose.yml | 2 +- .../example_1/kubeflow_pipeline.yml | 19 +- .../example_1/vertex_pipeline.yml | 19 +- .../example_2/kubeflow_pipeline.yml | 15 +- .../example_2/vertex_pipeline.yml | 15 +- .../compiled_pipeline/kubeflow_pipeline.yml | 2 - .../component_specs/kubeflow_component.yaml | 333 ++++++------------ .../components/arguments/component.yaml | 8 +- .../arguments/component_default_args.yaml | 8 +- tests/test_compiler.py | 4 +- tests/test_component.py | 7 +- 17 files changed, 176 insertions(+), 319 deletions(-) diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index 665ed4912..0852da808 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -39,7 +39,7 @@ args: resize_only_if_bigger: description: If True, resize only if image is bigger than image_size. type: bool - default: 'False' + default: False min_image_size: description: Minimum size of the images.
type: int diff --git a/docs/components/component_spec.md b/docs/components/component_spec.md index 2f1eda471..98719db16 100644 --- a/docs/components/component_spec.md +++ b/docs/components/component_spec.md @@ -124,8 +124,6 @@ The `args` section describes which arguments the component takes. Each argument `description` and a `type`, which should be one of the builtin Python types. Additionally, you can set an optional `default` value for each argument. -_Note:_ default iterable arguments such as `dict` and `list` have to be passed as a string -(e.g. `'{"foo":1, "bar":2}`, `'["foo","bar]'`) ```yaml args: custom_argument: diff --git a/src/fondant/compiler.py b/src/fondant/compiler.py index 336702efd..1b364dfcb 100644 --- a/src/fondant/compiler.py +++ b/src/fondant/compiler.py @@ -305,10 +305,14 @@ def kfp_pipeline(): text=component_op.component_spec.kubeflow_specification.to_string(), ) + # Remove None values from arguments + component_args = { + k: v for k, v in component_op.arguments.items() if v is not None + } + # # Set image pull policy to always # Execute the Kubeflow component and pass in the output manifest path from # the previous component. - component_args = component_op.arguments if previous_component_task is not None: component_task = kubeflow_component_op( @@ -414,10 +418,11 @@ def kfp_pipeline(): text=component_op.component_spec.kubeflow_specification.to_string(), ) - # Execute the Kubeflow component and pass in the output manifest path from - # the previous component. + # Remove None values from arguments + component_args = { + k: v for k, v in component_op.arguments.items() if v is not None + } - component_args = component_op.arguments metadata = Metadata( pipeline_name=pipeline.name, run_id=run_id, @@ -428,6 +433,8 @@ def kfp_pipeline(): # Set the execution order of the component task to be after the previous # component task. 
if previous_component_task is not None: + # Execute the Kubeflow component and pass in the output manifest path from + # the previous component. component_task = kubeflow_component_op( input_manifest_path=manifest_path, metadata=metadata.to_json(), diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index 8cdec36fc..43d14676c 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -55,12 +55,14 @@ class Argument: description: argument description type: the python argument type (str, int, ...) default: default value of the argument (defaults to None) + optional: whether an argument is optional or not (defaults to False) """ name: str description: str type: str default: t.Optional[str] = None + optional: t.Optional[bool] = False class ComponentSubset: @@ -192,12 +194,18 @@ def outputs_additional_subsets(self) -> bool: @property def args(self) -> t.Dict[str, Argument]: + def _is_optional(arg_information): + if "default" in arg_information: + return arg_information["default"] == "None" + return False + return { name: Argument( name=name, description=arg_info["description"], type=arg_info["type"], default=arg_info["default"] if "default" in arg_info else None, + optional=_is_optional(arg_info), ) for name, arg_info in self._specification.get("args", {}).items() } @@ -234,15 +242,19 @@ def __init__(self, specification: t.Dict[str, t.Any]) -> None: def convert_arguments(fondant_component): args = {} for arg in fondant_component.args.values(): + arg_type_dict = {} + + if arg.optional and arg.default == "None": + arg_type_dict["isOptional"] = True + if arg.default is not None and arg.default != "None": + arg_type_dict["defaultValue"] = arg.default + args[arg.name] = { "parameterType": python2kubeflow_type[arg.type], "description": arg.description, - **( - {"defaultValue": arg.default, "isOptional": True} - if arg.default is not None - else {} - ), + **arg_type_dict, } + return args @staticmethod @@ -283,8 +295,7 @@ def 
from_fondant_component_spec(cls, fondant_component: ComponentSpec): "description": "The number of rows to load per partition." + " Set to override the automatic partitioning", "isOptional": True, - "parameterType": "STRING", - "defaultValue": "None", + "parameterType": "NUMBER_INTEGER", }, "cache": { "parameterType": "BOOLEAN", @@ -438,6 +449,7 @@ def input_arguments(self) -> t.Mapping[str, Argument]: description=arg_info["description"], type="STRING", default=None, + optional=False, ) if "parameters" in input_definitions: for arg_name, arg_info in input_definitions["parameters"].items(): @@ -448,6 +460,9 @@ def input_arguments(self) -> t.Mapping[str, Argument]: default=arg_info["defaultValue"] if "defaultValue" in arg_info else None, + optional=arg_info["isOptional"] + if "isOptional" in arg_info + else False, ) return types.MappingProxyType(args) @@ -464,6 +479,7 @@ def output_arguments(self) -> t.Mapping[str, Argument]: description=arg_info["description"], type="STRING", default=None, + optional=False, ) if "parameters" in output_definitions: for arg_name, arg_info in output_definitions["parameters"].items(): @@ -474,6 +490,9 @@ def output_arguments(self) -> t.Mapping[str, Argument]: default=arg_info["defaultValue"] if "defaultValue" in arg_info else None, + optional=arg_info["isOptional"] + if "isOptional" in arg_info + else False, ) return types.MappingProxyType(args) diff --git a/src/fondant/executor.py b/src/fondant/executor.py index b4a568b8f..417273e92 100644 --- a/src/fondant/executor.py +++ b/src/fondant/executor.py @@ -188,7 +188,7 @@ def _add_and_parse_args(cls, spec: ComponentSpec): if arg.name in cls.optional_fondant_arguments(): input_required = False default = None - elif arg.default is not None: + elif arg.default is not None or arg.optional is True: input_required = False default = arg.default else: diff --git a/src/fondant/schemas/component_spec.json b/src/fondant/schemas/component_spec.json index 9badfc01b..6079e7bae 100644 --- 
a/src/fondant/schemas/component_spec.json +++ b/src/fondant/schemas/component_spec.json @@ -88,6 +88,15 @@ }, { "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array" + }, + { + "type": "object" } ] } diff --git a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml index ec9258c1b..7191ad0ea 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml @@ -13,7 +13,7 @@ services: - --storage_args - a dummy string arg - --input_partition_rows - - disable + - '10' - --cache - 'False' - --cluster_type diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index b48723b17..a855ab698 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -1,6 +1,3 @@ -# PIPELINE DEFINITION -# Name: testpipeline -# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component @@ -21,9 +18,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -53,9 +49,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -85,9 +80,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -203,7 +197,7 @@ root: type: binary 
input_partition_rows: runtimeValue: - constant: disable + constant: 10.0 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -255,7 +249,7 @@ root: type: array input_partition_rows: runtimeValue: - constant: '10' + constant: 10.0 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -314,9 +308,6 @@ root: fields: data: type: binary - input_partition_rows: - runtimeValue: - constant: None metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml index b48723b17..a855ab698 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -1,6 +1,3 @@ -# PIPELINE DEFINITION -# Name: testpipeline -# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component @@ -21,9 +18,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -53,9 +49,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -85,9 +80,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -203,7 +197,7 @@ root: type: binary input_partition_rows: runtimeValue: - constant: disable + constant: 10.0 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": 
"testpipeline", @@ -255,7 +249,7 @@ root: type: array input_partition_rows: runtimeValue: - constant: '10' + constant: 10.0 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -314,9 +308,6 @@ root: fields: data: type: binary - input_partition_rows: - runtimeValue: - constant: None metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index e534b1a36..c9ba66940 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -1,6 +1,3 @@ -# PIPELINE DEFINITION -# Name: testpipeline -# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component @@ -21,9 +18,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -57,9 +53,8 @@ components: isOptional: true parameterType: NUMBER_INTEGER input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING padding: @@ -155,9 +150,6 @@ root: fields: data: type: binary - input_partition_rows: - runtimeValue: - constant: None metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -221,9 +213,6 @@ root: cropping_threshold: runtimeValue: constant: 0.0 - input_partition_rows: - runtimeValue: - constant: None metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml 
b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml index e534b1a36..c9ba66940 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -1,6 +1,3 @@ -# PIPELINE DEFINITION -# Name: testpipeline -# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component @@ -21,9 +18,8 @@ components: isOptional: true parameterType: STRUCT input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING storage_args: @@ -57,9 +53,8 @@ components: isOptional: true parameterType: NUMBER_INTEGER input_partition_rows: - defaultValue: None isOptional: true - parameterType: STRING + parameterType: NUMBER_INTEGER metadata: parameterType: STRING padding: @@ -155,9 +150,6 @@ root: fields: data: type: binary - input_partition_rows: - runtimeValue: - constant: None metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -221,9 +213,6 @@ root: cropping_threshold: runtimeValue: constant: 0.0 - input_partition_rows: - runtimeValue: - constant: None metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", diff --git a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml index a622b4d62..18b2b38cf 100644 --- a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml @@ -33,7 +33,6 @@ "description": "The number of rows to load per partition. Set to override the automatic partitioning", "isOptional": True, "parameterType": "STRING", - "defaultValue": "None", }, "cache": { @@ -181,7 +180,6 @@ "description": "The number of rows to load per partition. 
Set to override the automatic partitioning", "isOptional": True, "parameterType": "STRING", - "defaultValue": "None", }, "cache": { diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/example_specs/component_specs/kubeflow_component.yaml index 995aa67a7..d175ea854 100644 --- a/tests/example_specs/component_specs/kubeflow_component.yaml +++ b/tests/example_specs/component_specs/kubeflow_component.yaml @@ -1,234 +1,99 @@ -{ - "components": - { - "comp-example-component": - { - "executorLabel": "exec-example-component", - "inputDefinitions": - { - "artifacts": - { - "input_manifest_path": - { - "description": "Path to the input manifest", - "artifactType": - { - "schemaTitle": "system.Artifact", - "schemaVersion": "0.0.1", - }, - "isOptional": True, - }, - }, - "parameters": - { - "component_spec": - { - "description": "The component specification as a dictionary", - "defaultValue": {}, - "isOptional": True, - "parameterType": "STRUCT", - }, - "input_partition_rows": - { - "description": "The number of rows to load per partition. 
Set to override the automatic partitioning", - "isOptional": True, - "parameterType": "STRING", - "defaultValue": "None", - }, - "cache": - { - "parameterType": "BOOLEAN", - "description": "Set to False to disable caching, True by default.", - "defaultValue": True, - "isOptional": True, - }, - "metadata": - { - "description": "Metadata arguments containing the run id and base path", - "parameterType": "STRING", - }, - "storage_args": - { - "parameterType": "STRING", - "description": "Storage arguments", - }, - }, - }, - "outputDefinitions": - { - "artifacts": - { - "output_manifest_path": - { - "artifactType": - { - "schemaTitle": "system.Artifact", - "schemaVersion": "0.0.1", - }, - "description": "Path to the output manifest", - }, - }, - }, - }, - }, - "deploymentSpec": - { - "executors": - { - "exec-example-component": - { - "container": - { - "args": - [ - "--input_manifest_path", - "{{$.inputs.artifacts['input_manifest_path'].uri}}", - "--metadata", - "{{$.inputs.parameters['metadata']}}", - "--component_spec", - "{{$.inputs.parameters['component_spec']}}", - "--input_partition_rows", - "{{$.inputs.parameters['input_partition_rows']}}", - "--cache", - "{{$.inputs.parameters['cache']}}", - "--storage_args", - "{{$.inputs.parameters['storage_args']}}", - "--output_manifest_path", - "{{$.outputs.artifacts['output_manifest_path'].uri}}", - ], - "command": ["fondant", "execute", "main"], - "image": "example_component:latest", - }, - }, - }, - }, - "pipelineInfo": { "name": "example-component" }, - "root": - { - "dag": - { - "outputs": - { - "artifacts": - { - "output_manifest_path": - { - "artifactSelectors": - [ - { - "outputArtifactKey": "output_manifest_path", - "producerSubtask": "example-component", - }, - ], - }, - }, - }, - "tasks": - { - "example-component": - { - "cachingOptions": { "enableCache": True }, - "componentRef": - { "name": "comp-example-component" }, - "inputs": - { - "artifacts": - { - "input_manifest_path": - { - "componentInputArtifact": 
"input_manifest_path", - }, - }, - "parameters": - { - "component_spec": - { - "componentInputParameter": "component_spec", - }, - "input_partition_rows": - { - "componentInputParameter": "input_partition_rows", - }, - "metadata": - { - "componentInputParameter": "metadata", - }, - "cache": - { - "componentInputParameter": "cache", - }, - }, - }, - "taskInfo": { "name": "example-component" }, - }, - }, - }, - "inputDefinitions": - { - "artifacts": - { - "input_manifest_path": - { - "description": "Path to the input manifest", - "artifactType": - { - "schemaTitle": "system.Artifact", - "schemaVersion": "0.0.1", - }, - "isOptional": True, - }, - }, - "parameters": - { - "component_spec": - { - "description": "The component specification as a dictionary", - "defaultValue": {}, - "isOptional": True, - "parameterType": "STRUCT", - }, - "input_partition_rows": - { - "description": "The number of rows to load per partition. Set to override the automatic partitioning", - "isOptional": True, - "parameterType": "STRING", - "defaultValue": "None", - }, - "cache": - { - "parameterType": "BOOLEAN", - "description": "Set to False to disable caching, True by default.", - "defaultValue": True, - "isOptional": True, - }, - "metadata": - { - "description": "Metadata arguments containing the run id and base path", - "parameterType": "STRING", - }, - "storage_args": - { - "parameterType": "STRING", - "description": "Storage arguments", - }, - }, - }, - "outputDefinitions": - { - "artifacts": - { - "output_manifest_path": - { - "artifactType": - { - "schemaTitle": "system.Artifact", - "schemaVersion": "0.0.1", - }, - "description": "Path to the output manifest", - }, - }, - }, - }, - "schemaVersion": "2.1.0", - "sdkVersion": "kfp-2.0.1", -} +components: + comp-example-component: + executorLabel: exec-example-component + inputDefinitions: &id001 + artifacts: + input_manifest_path: + description: Path to the input manifest + artifactType: + schemaTitle: system.Artifact + schemaVersion: 
0.0.1 + isOptional: true + parameters: + component_spec: + description: The component specification as a dictionary + defaultValue: {} + isOptional: true + parameterType: STRUCT + input_partition_rows: + description: The number of rows to load per partition. Set to override the + automatic partitioning + isOptional: true + parameterType: NUMBER_INTEGER + cache: + parameterType: BOOLEAN + description: Set to False to disable caching, True by default. + defaultValue: true + isOptional: true + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + storage_args: + parameterType: STRING + description: Storage arguments + outputDefinitions: &id002 + artifacts: + output_manifest_path: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + description: Path to the output manifest +deploymentSpec: + executors: + exec-example-component: + container: + args: + - --input_manifest_path + - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + command: + - fondant + - execute + - main + image: example_component:latest +pipelineInfo: + name: example-component +root: + dag: + outputs: + artifacts: + output_manifest_path: + artifactSelectors: + - outputArtifactKey: output_manifest_path + producerSubtask: example-component + tasks: + example-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-example-component + inputs: + artifacts: + input_manifest_path: + componentInputArtifact: input_manifest_path + parameters: + component_spec: + componentInputParameter: component_spec + 
input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + cache: + componentInputParameter: cache + taskInfo: + name: example-component + inputDefinitions: *id001 + outputDefinitions: *id002 +schemaVersion: 2.1.0 +sdkVersion: kfp-2.0.1 diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/example_specs/components/arguments/component.yaml index cc14f5221..0ef6cb7c9 100644 --- a/tests/example_specs/components/arguments/component.yaml +++ b/tests/example_specs/components/arguments/component.yaml @@ -18,19 +18,19 @@ args: bool_false_default_arg: description: default bool argument type: bool - default: 'False' + default: False bool_true_default_arg: description: default bool argument type: bool - default: 'True' + default: True list_default_arg: description: default list argument type: list - default: '["foo", "bar"]' + default: ["foo", "bar"] dict_default_arg: description: default dict argument type: dict - default: '{"foo":1, "bar":2}' + default: {"foo":1, "bar":2} string_default_arg_none: description: default string argument type: str diff --git a/tests/example_specs/components/arguments/component_default_args.yaml b/tests/example_specs/components/arguments/component_default_args.yaml index 2d582fbfe..816211c04 100644 --- a/tests/example_specs/components/arguments/component_default_args.yaml +++ b/tests/example_specs/components/arguments/component_default_args.yaml @@ -18,19 +18,19 @@ args: bool_false_default_arg: description: default bool argument type: bool - default: 'False' + default: False bool_true_default_arg: description: default bool argument type: bool - default: 'True' + default: True list_default_arg: description: default list argument type: list - default: '["foo", "bar"]' + default: ["foo", "bar"] dict_default_arg: description: default dict argument type: dict - default: '{"foo":1, "bar":2}' + default: {"foo":1, "bar":2} string_default_arg_none: description: default 
string argument type: str diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 44c95f958..20b47b1e2 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -22,7 +22,7 @@ "component_op": ComponentOp( Path(COMPONENTS_PATH / "example_1" / "first_component"), arguments={"storage_args": "a dummy string arg"}, - input_partition_rows="disable", + input_partition_rows=10, ), "cache_key": "1", }, @@ -30,7 +30,7 @@ "component_op": ComponentOp( Path(COMPONENTS_PATH / "example_1" / "second_component"), arguments={"storage_args": "a dummy string arg"}, - input_partition_rows="10", + input_partition_rows=10, ), "cache_key": "2", }, diff --git a/tests/test_component.py b/tests/test_component.py index 75169e93b..0465371e3 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -123,13 +123,13 @@ def _process_dataset(self, manifest: Manifest) -> t.Union[None, dd.DataFrame]: assert executor.input_partition_rows == expected_partition_row_arg assert executor.cache is True assert executor.user_arguments == { - "string_default_arg": "foo", "integer_default_arg": 0, "float_default_arg": 3.14, "bool_false_default_arg": False, "bool_true_default_arg": True, "list_default_arg": ["foo", "bar"], "dict_default_arg": {"foo": 1, "bar": 2}, + "string_default_arg": "foo", "string_default_arg_none": None, "integer_default_arg_none": None, "float_default_arg_none": None, @@ -286,7 +286,7 @@ def test_dask_transform_component(metadata): "--value", "1", "--input_partition_rows", - "disable", + "10", "--output_manifest_path", str(components_path / "output_manifest.json"), "--component_spec", @@ -308,7 +308,8 @@ def transform(self, dataframe): executor_factory = ExecutorFactory(MyDaskComponent) executor = executor_factory.get_executor() - assert executor.input_partition_rows == "disable" + expected_input_partition_rows = 10 + assert executor.input_partition_rows == expected_input_partition_rows transform = patch_method_class(MyDaskComponent.transform) with 
mock.patch.object( MyDaskComponent, From 03b1c3b6d2b2c9051750e1e84ebbd7abdf781d2e Mon Sep 17 00:00:00 2001 From: Georges Lorre Date: Tue, 19 Sep 2023 10:21:46 +0200 Subject: [PATCH 05/31] Make ComponentSpec the base for arg building --- src/fondant/component_spec.py | 175 +++++++++++++++++----------------- src/fondant/executor.py | 9 +- 2 files changed, 89 insertions(+), 95 deletions(-) diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index 43d14676c..eca09bac9 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -18,42 +18,16 @@ from fondant.exceptions import InvalidComponentSpec from fondant.schema import Field, KubeflowCommandArguments, Type -# # TODO: remove after upgrading to kfpv2 - -kubeflow_to_python_type_dict = { - "STRING": str, - "NUMBER_INTEGER": int, - "NUMBER_DOUBLE": float, - "BOOLEAN": lambda x: bool(strtobool(x)), - "STRUCT": json.loads, - "LIST": json.loads, -} - - -def kubeflow2python_type(type_: str) -> t.Any: - map_fn = kubeflow_to_python_type_dict[type_] - return lambda value: map_fn(value) if value != "None" else None # type: ignore - - -python2kubeflow_type = { - "str": "STRING", - "int": "NUMBER_INTEGER", - "float": "NUMBER_DOUBLE", - "bool": "BOOLEAN", - "dict": "STRUCT", - "list": "LIST", -} - @dataclass class Argument: """ - Kubeflow component argument. + Component argument. Args: name: name of the argument description: argument description - type: the python argument type (str, int, ...) + type: the python argument type in str format (str, int, ...) 
default: default value of the argument (defaults to None) optional: whether an argument is optional or not (defaults to False) """ @@ -64,6 +38,30 @@ class Argument: default: t.Optional[str] = None optional: t.Optional[bool] = False + @property + def python_type(self) -> t.Any: + lookup = { + "str": str, + "int": int, + "float": float, + "bool": bool, + "dict": json.loads, + "list": json.loads, + } + return lookup[self.type] + + @property + def kubeflow_type(self) -> str: + lookup = { + "str": "STRING", + "int": "NUMBER_INTEGER", + "float": "NUMBER_DOUBLE", + "bool": "BOOLEAN", + "dict": "STRUCT", + "list": "LIST", + } + return lookup[self.type] + class ComponentSubset: """ @@ -214,6 +212,63 @@ def _is_optional(arg_information): def specification(self) -> t.Dict[str, t.Any]: return copy.deepcopy(self._specification) + @property + def input_arguments(self) -> t.Mapping[str, Argument]: + """The input arguments (default + custom) of the component as an immutable mapping.""" + args = self.args + + # Add default arguments + args.update( + { + "input_manifest_path": Argument( + name="input_manifest_path", + description="Path to the input manifest", + type="str", + default=None, + ), + "component_spec": Argument( + name="component_spec", + description="The component specification as a dictionary", + type="dict", + default={}, + ), + "input_partition_rows": Argument( + name="input_partition_rows", + description="The number of rows to load per partition. 
\ + Set to override the automatic partitioning", + type="str", + default=None, + ), + "cache": Argument( + name="cache", + description="Set to False to disable caching, True by default.", + type="bool", + default=True, + ), + "metadata": Argument( + name="metadata", + description="Metadata arguments containing the run id and base path", + type="str", + default=None, + ), + }, + ) + return types.MappingProxyType(args) + + @property + def output_arguments(self) -> t.Mapping[str, Argument]: + """The output arguments of the component as an immutable mapping.""" + return types.MappingProxyType( + { + "output_manifest_path": Argument( + name="output_manifest_path", + description="Path to the output manifest", + type="str", + default=None, + ), + }, + ) + @property def kubeflow_specification(self) -> "KubeflowComponentSpec": return KubeflowComponentSpec.from_fondant_component_spec(self) @@ -239,7 +294,7 @@ def __init__(self, specification: t.Dict[str, t.Any]) -> None: self._specification = specification @staticmethod - def convert_arguments(fondant_component): + def convert_arguments(fondant_component: ComponentSpec): args = {} for arg in fondant_component.args.values(): arg_type_dict = {} @@ -250,7 +305,7 @@ def convert_arguments(fondant_component): arg_type_dict["defaultValue"] = arg.default args[arg.name] = { - "parameterType": python2kubeflow_type[arg.type], + "parameterType": arg.kubeflow_type, "description": arg.description, **arg_type_dict, } @@ -436,65 +491,5 @@ def to_string(self) -> str: """Return the component specification as a string.""" return json.dumps(self._specification) - @property - def input_arguments(self) -> t.Mapping[str, Argument]: - """The input arguments of the component as an immutable mapping.""" - args = {} - input_definitions = self._specification["root"]["inputDefinitions"] - - if "artifacts" in input_definitions: - for arg_name, arg_info in input_definitions["artifacts"].items(): - args[arg_name] = Argument( - name=arg_name, - 
description=arg_info["description"], - type="STRING", - default=None, - optional=False, - ) - if "parameters" in input_definitions: - for arg_name, arg_info in input_definitions["parameters"].items(): - args[arg_name] = Argument( - name=arg_name, - description=arg_info["description"], - type=arg_info["parameterType"], - default=arg_info["defaultValue"] - if "defaultValue" in arg_info - else None, - optional=arg_info["isOptional"] - if "isOptional" in arg_info - else False, - ) - return types.MappingProxyType(args) - - @property - def output_arguments(self) -> t.Mapping[str, Argument]: - """The output arguments of the component as an immutable mapping.""" - args = {} - output_definitions = self._specification["root"]["outputDefinitions"] - - if "artifacts" in output_definitions: - for arg_name, arg_info in output_definitions["artifacts"].items(): - args[arg_name] = Argument( - name=arg_name, - description=arg_info["description"], - type="STRING", - default=None, - optional=False, - ) - if "parameters" in output_definitions: - for arg_name, arg_info in output_definitions["parameters"].items(): - args[arg_name] = Argument( - name=arg_name, - description=arg_info["description"], - type=arg_info["parameterType"], - default=arg_info["defaultValue"] - if "defaultValue" in arg_info - else None, - optional=arg_info["isOptional"] - if "isOptional" in arg_info - else False, - ) - return types.MappingProxyType(args) - def __repr__(self) -> str: return f"{self.__class__.__name__}({self._specification!r})" diff --git a/src/fondant/executor.py b/src/fondant/executor.py index 417273e92..20ae725a8 100644 --- a/src/fondant/executor.py +++ b/src/fondant/executor.py @@ -26,7 +26,7 @@ DaskWriteComponent, PandasTransformComponent, ) -from fondant.component_spec import Argument, ComponentSpec, kubeflow2python_type +from fondant.component_spec import Argument, ComponentSpec from fondant.data_io import DaskDataLoader, DaskDataWriter from fondant.manifest import Manifest, Metadata from 
fondant.schema import validate_partition_number @@ -197,7 +197,7 @@ def _add_and_parse_args(cls, spec: ComponentSpec): parser.add_argument( f"--{arg.name}", - type=kubeflow2python_type(arg.type), # type: ignore + type=arg.python_type, # type: ignore required=input_required, default=default, help=arg.description, @@ -221,9 +221,8 @@ def _get_component_arguments(spec: ComponentSpec) -> t.Dict[str, Argument]: Input and output arguments of the component. """ component_arguments: t.Dict[str, Argument] = {} - kubeflow_component_spec = spec.kubeflow_specification - component_arguments.update(kubeflow_component_spec.input_arguments) - component_arguments.update(kubeflow_component_spec.output_arguments) + component_arguments.update(spec.input_arguments) + component_arguments.update(spec.output_arguments) return component_arguments @abstractmethod From 6faf90c2b06c49a5ac10b70a81f8c8061f793aab Mon Sep 17 00:00:00 2001 From: Georges Lorre Date: Tue, 19 Sep 2023 11:58:09 +0200 Subject: [PATCH 06/31] Make ComponentSpec the base for arg building --- src/fondant/component_spec.py | 6 ++--- .../components/arguments/component.yaml | 19 ++++++++-------- tests/test_component.py | 22 ++++++++----------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index eca09bac9..fb156c7e4 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -6,7 +6,6 @@ import types import typing as t from dataclasses import dataclass -from distutils.util import strtobool from pathlib import Path import jsonschema.exceptions @@ -35,7 +34,7 @@ class Argument: name: str description: str type: str - default: t.Optional[str] = None + default: t.Any = None optional: t.Optional[bool] = False @property @@ -238,6 +237,7 @@ def input_arguments(self) -> t.Mapping[str, Argument]: Set to override the automatic partitioning", type="str", default=None, + optional=True, ), "cache": Argument( name="cache", @@ -307,7 +307,7 
@@ def convert_arguments(fondant_component: ComponentSpec): args[arg.name] = { "parameterType": arg.kubeflow_type, "description": arg.description, - **arg_type_dict, + **arg_type_dict, # type: ignore } return args diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/example_specs/components/arguments/component.yaml index 0ef6cb7c9..021ce6ce6 100644 --- a/tests/example_specs/components/arguments/component.yaml +++ b/tests/example_specs/components/arguments/component.yaml @@ -38,31 +38,32 @@ args: integer_default_arg_none: description: default integer argument type: int - default: None + default: 0 float_default_arg_none: description: default float argument type: float - default: None + default: 0.0 bool_default_arg_none: description: default bool argument type: bool - default: None + default: False list_default_arg_none: description: default list argument type: list - default: None + default: [] dict_default_arg_none: description: default dict argument type: dict - default: None + default: {} override_default_arg: description: argument with default python value type that can be overriden type: str default: foo - override_default_none_arg: - description: argument with default None value type that can be overriden with a valid python type - type: float - default: None override_default_arg_with_none: description: argument with default python type that can be overriden with None type: str + optional_arg: + description: optional argument + type: str + optional: True + default: None diff --git a/tests/test_component.py b/tests/test_component.py index 0465371e3..1473aabe6 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -16,11 +16,7 @@ ) from fondant.component_spec import ComponentSpec from fondant.data_io import DaskDataLoader, DaskDataWriter -from fondant.executor import ( - Executor, - ExecutorFactory, - PandasTransformExecutor, -) +from fondant.executor import Executor, ExecutorFactory, PandasTransformExecutor from 
fondant.manifest import Manifest, Metadata components_path = Path(__file__).parent / "example_specs/components" @@ -130,15 +126,15 @@ def _process_dataset(self, manifest: Manifest) -> t.Union[None, dd.DataFrame]: "list_default_arg": ["foo", "bar"], "dict_default_arg": {"foo": 1, "bar": 2}, "string_default_arg": "foo", - "string_default_arg_none": None, - "integer_default_arg_none": None, - "float_default_arg_none": None, - "bool_default_arg_none": None, - "list_default_arg_none": None, - "dict_default_arg_none": None, + "string_default_arg_none": "None", + "integer_default_arg_none": 0, + "float_default_arg_none": 0.0, + "bool_default_arg_none": False, + "list_default_arg_none": [], + "dict_default_arg_none": {}, "override_default_arg": "bar", - "override_default_none_arg": 3.14, - "override_default_arg_with_none": None, + "override_default_arg_with_none": "None", + "optional_arg": "None", } From 99016d73efe51e94f018f35067aa0f4ed49f7b71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georges=20Lorr=C3=A9?= <35808396+GeorgesLorre@users.noreply.github.com> Date: Thu, 21 Sep 2023 15:32:41 +0200 Subject: [PATCH 07/31] Feature/no artifacts (#444) --- src/fondant/compiler.py | 87 ++++---- src/fondant/component_spec.py | 203 ++++++------------ src/fondant/executor.py | 3 +- .../example_1/kubeflow_pipeline.yml | 127 ++++++----- .../example_1/vertex_pipeline.yml | 127 ++++++----- .../example_2/kubeflow_pipeline.yml | 83 ++++--- .../example_2/vertex_pipeline.yml | 83 ++++--- .../component_specs/kubeflow_component.yaml | 186 ++++++++-------- tests/test_component_specs.py | 8 +- 9 files changed, 457 insertions(+), 450 deletions(-) diff --git a/src/fondant/compiler.py b/src/fondant/compiler.py index 1b364dfcb..9b84a1cce 100644 --- a/src/fondant/compiler.py +++ b/src/fondant/compiler.py @@ -285,21 +285,10 @@ def compile( @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) def kfp_pipeline(): previous_component_task = None - manifest_path = "" - for 
component_name, component in pipeline._graph.items(): - component_op = component["fondant_component_op"] - - metadata = Metadata( - pipeline_name=pipeline.name, - run_id=run_id, - base_path=pipeline.base_path, - component_id=component_name, - cache_key=component_op.get_component_cache_key(), - ) - logger.info(f"Compiling service for {component_name}") + component_op = component["fondant_component_op"] # convert ComponentOp to Kubeflow component kubeflow_component_op = self.kfp.components.load_component_from_text( text=component_op.component_spec.kubeflow_specification.to_string(), @@ -310,33 +299,47 @@ def kfp_pipeline(): k: v for k, v in component_op.arguments.items() if v is not None } - # # Set image pull policy to always - # Execute the Kubeflow component and pass in the output manifest path from - # the previous component. + metadata = Metadata( + pipeline_name=pipeline.name, + run_id=run_id, + base_path=pipeline.base_path, + component_id=component_name, + cache_key=component_op.get_component_cache_key(), + ) - if previous_component_task is not None: - component_task = kubeflow_component_op( - input_manifest_path=manifest_path, - metadata=metadata.to_json(), - **component_args, - ) - component_task.after(previous_component_task) + output_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{component_name}/manifest.json" + ) + # Set the execution order of the component task to be after the previous + # component task. 
+ if component["dependencies"]: + for dependency in component["dependencies"]: + input_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{dependency}/manifest.json" + ) + component_task = kubeflow_component_op( + input_manifest_path=input_manifest_path, + output_manifest_path=output_manifest_path, + metadata=metadata.to_json(), + **component_args, + ) + component_task.after(previous_component_task) else: component_task = kubeflow_component_op( metadata=metadata.to_json(), + output_manifest_path=output_manifest_path, **component_args, ) - # Set optional configurations + # Set optional arguments component_task = self._set_configuration( component_task, component_op, ) - # Update the manifest path to be the output path of the current component task. - manifest_path = component_task.outputs["output_manifest_path"] - previous_component_task = component_task logger.info(f"Compiling {pipeline.name} to {output_path}") @@ -408,7 +411,6 @@ def compile( @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) def kfp_pipeline(): previous_component_task = None - manifest_path = None for component_name, component in pipeline._graph.items(): logger.info(f"Compiling service for {component_name}") @@ -430,21 +432,31 @@ def kfp_pipeline(): component_id=component_name, cache_key=component_op.get_component_cache_key(), ) + + output_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{component_name}/manifest.json" + ) # Set the execution order of the component task to be after the previous # component task. - if previous_component_task is not None: - # Execute the Kubeflow component and pass in the output manifest path from - # the previous component. 
- component_task = kubeflow_component_op( - input_manifest_path=manifest_path, - metadata=metadata.to_json(), - **component_args, - ) - component_task.after(previous_component_task) + if component["dependencies"]: + for dependency in component["dependencies"]: + input_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{dependency}/manifest.json" + ) + component_task = kubeflow_component_op( + input_manifest_path=input_manifest_path, + output_manifest_path=output_manifest_path, + metadata=metadata.to_json(), + **component_args, + ) + component_task.after(previous_component_task) else: component_task = kubeflow_component_op( metadata=metadata.to_json(), + output_manifest_path=output_manifest_path, **component_args, ) @@ -454,9 +466,6 @@ def kfp_pipeline(): component_op, ) - # Update the manifest path to be the output path of the current component task. - manifest_path = component_task.outputs["output_manifest_path"] - previous_component_task = component_task self.kfp.compiler.Compiler().compile(kfp_pipeline, output_path) # type: ignore diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index fb156c7e4..39e5744c1 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -190,84 +190,76 @@ def outputs_additional_subsets(self) -> bool: return self._specification.get("produces", {}).get("additionalSubsets", True) @property - def args(self) -> t.Dict[str, Argument]: + def args(self) -> t.Mapping[str, Argument]: def _is_optional(arg_information): if "default" in arg_information: return arg_information["default"] == "None" return False - return { - name: Argument( - name=name, - description=arg_info["description"], - type=arg_info["type"], - default=arg_info["default"] if "default" in arg_info else None, - optional=_is_optional(arg_info), - ) - for name, arg_info in self._specification.get("args", {}).items() - } - - @property - def specification(self) -> t.Dict[str, t.Any]: - return 
copy.deepcopy(self._specification) + args = self.default_arguments - @property - def input_arguments(self) -> t.Mapping[str, Argument]: - """The input arguments (default + custom) of the component as an immutable mapping.""" - args = self.args - - # Add default arguments args.update( { - "input_manifest_path": Argument( - name="input_manifest_path", - description="Path to the input manifest", - type="str", - default=None, - ), - "component_spec": Argument( - name="component_spec", - description="The component specification as a dictionary", - type="dict", - default={}, - ), - "input_partition_rows": Argument( - name="input_partition_rows", - description="The number of rows to load per partition. \ - Set to override the automatic partitioning", - type="str", - default=None, - optional=True, - ), - "cache": Argument( - name="cache", - description="Set to False to disable caching, True by default.", - type="bool", - default=True, - ), - "metadata": Argument( - name="metadata", - description="Metadata arguments containing the run id and base path", - type="str", - default=None, - ), + name: Argument( + name=name, + description=arg_info["description"], + type=arg_info["type"], + default=arg_info["default"] if "default" in arg_info else None, + optional=_is_optional(arg_info), + ) + for name, arg_info in self._specification.get("args", {}).items() }, ) return types.MappingProxyType(args) @property - def output_arguments(self) -> t.Mapping[str, Argument]: - """The output arguments of the component as an immutable mapping.""" - return types.MappingProxyType( - { - "output_manifest_path": Argument( - name="output_manifest_path", - description="Path to the output manifest", - type="str", - default=None, - ), - }, - ) + def specification(self) -> t.Dict[str, t.Any]: + return copy.deepcopy(self._specification) + + @property + def default_arguments(self) -> t.Dict[str, Argument]: + """Add the default arguments of a fondant component.""" + return { + "input_manifest_path": 
Argument( + name="input_manifest_path", + description="Path to the input manifest", + type="str", + default=None, + optional=True, + ), + "component_spec": Argument( + name="component_spec", + description="The component specification as a dictionary", + type="dict", + default={}, + ), + "input_partition_rows": Argument( + name="input_partition_rows", + description="The number of rows to load per partition. \ + Set to override the automatic partitioning", + type="int", + default=None, + optional=True, + ), + "cache": Argument( + name="cache", + description="Set to False to disable caching, True by default.", + type="bool", + default=True, + ), + "metadata": Argument( + name="metadata", + description="Metadata arguments containing the run id and base path", + type="str", + default=None, + ), + "output_manifest_path": Argument( + name="output_manifest_path", + description="Path to the output manifest", + type="str", + default=None, + ), + } @property def kubeflow_specification(self) -> "KubeflowComponentSpec": @@ -299,9 +291,9 @@ def convert_arguments(fondant_component: ComponentSpec): for arg in fondant_component.args.values(): arg_type_dict = {} - if arg.optional and arg.default == "None": + if arg.optional and arg.default is None: arg_type_dict["isOptional"] = True - if arg.default is not None and arg.default != "None": + if arg.default is not None and arg.default is not None: arg_type_dict["defaultValue"] = arg.default args[arg.name] = { @@ -329,64 +321,19 @@ def sanitize_component_name(name: str) -> str: def from_fondant_component_spec(cls, fondant_component: ComponentSpec): """Generate a Kubeflow component spec from a ComponentOp.""" input_definitions = { - "artifacts": { - "input_manifest_path": { - "description": "Path to the input manifest", - "artifactType": { - "schemaTitle": "system.Artifact", - "schemaVersion": "0.0.1", - }, - "isOptional": True, - }, - }, "parameters": { - "component_spec": { - "description": "The component specification as a 
dictionary", - "defaultValue": {}, - "isOptional": True, - "parameterType": "STRUCT", - }, - "input_partition_rows": { - "description": "The number of rows to load per partition." - + " Set to override the automatic partitioning", - "isOptional": True, - "parameterType": "NUMBER_INTEGER", - }, - "cache": { - "parameterType": "BOOLEAN", - "description": "Set to False to disable caching, True by default.", - "defaultValue": True, - "isOptional": True, - }, - "metadata": { - "description": "Metadata arguments containing the run id and base path", - "parameterType": "STRING", - }, **cls.convert_arguments(fondant_component), }, } cleaned_component_name = cls.sanitize_component_name(fondant_component.name) - output_definitions = { - "artifacts": { - "output_manifest_path": { - "artifactType": { - "schemaTitle": "system.Artifact", - "schemaVersion": "0.0.1", - }, - "description": "Path to the output manifest", - }, - }, - } - specification = { "components": { "comp-" + cleaned_component_name: { "executorLabel": "exec-" + cleaned_component_name, "inputDefinitions": input_definitions, - "outputDefinitions": output_definitions, }, }, "deploymentSpec": { @@ -396,7 +343,7 @@ def from_fondant_component_spec(cls, fondant_component: ComponentSpec): "container": { "args": [ "--input_manifest_path", - "{{$.inputs.artifacts['input_manifest_path'].uri}}", + "{{$.inputs.parameters['input_manifest_path']}}", "--metadata", "{{$.inputs.parameters['metadata']}}", "--component_spec", @@ -407,7 +354,7 @@ def from_fondant_component_spec(cls, fondant_component: ComponentSpec): "{{$.inputs.parameters['cache']}}", *cls._dump_args(fondant_component.args.values()), "--output_manifest_path", - "{{$.outputs.artifacts['output_manifest_path'].uri}}", + "{{$.inputs.parameters['output_manifest_path']}}", ], "command": ["fondant", "execute", "main"], "image": fondant_component.image, @@ -418,37 +365,14 @@ def from_fondant_component_spec(cls, fondant_component: ComponentSpec): "pipelineInfo": {"name": 
cleaned_component_name}, "root": { "dag": { - "outputs": { - "artifacts": { - "output_manifest_path": { - "artifactSelectors": [ - { - "outputArtifactKey": "output_manifest_path", - "producerSubtask": cleaned_component_name, - }, - ], - }, - }, - }, "tasks": { cleaned_component_name: { "cachingOptions": {"enableCache": True}, "componentRef": {"name": "comp-" + cleaned_component_name}, "inputs": { - "artifacts": { - "input_manifest_path": { - "componentInputArtifact": "input_manifest_path", - }, - }, "parameters": { - "component_spec": { - "componentInputParameter": "component_spec", - }, - "input_partition_rows": { - "componentInputParameter": "input_partition_rows", - }, - "metadata": {"componentInputParameter": "metadata"}, - "cache": {"componentInputParameter": "cache"}, + param: {"componentInputParameter": param} + for param in input_definitions["parameters"] }, }, "taskInfo": {"name": cleaned_component_name}, @@ -456,7 +380,6 @@ def from_fondant_component_spec(cls, fondant_component: ComponentSpec): }, }, "inputDefinitions": input_definitions, - "outputDefinitions": output_definitions, }, "schemaVersion": "2.1.0", "sdkVersion": "kfp-2.0.1", diff --git a/src/fondant/executor.py b/src/fondant/executor.py index 20ae725a8..d338df2a7 100644 --- a/src/fondant/executor.py +++ b/src/fondant/executor.py @@ -221,8 +221,7 @@ def _get_component_arguments(spec: ComponentSpec) -> t.Dict[str, Argument]: Input and output arguments of the component. 
""" component_arguments: t.Dict[str, Argument] = {} - component_arguments.update(spec.input_arguments) - component_arguments.update(spec.output_arguments) + component_arguments.update(spec.args) return component_arguments @abstractmethod diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index a855ab698..4f44daea7 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -1,13 +1,10 @@ +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -17,28 +14,21 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 comp-second-component: executorLabel: exec-second-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -48,28 +38,21 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: 
STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 comp-third-component: executorLabel: exec-third-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -79,26 +62,25 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 deploymentSpec: executors: exec-first-component: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -107,10 +89,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -120,7 +114,7 @@ deploymentSpec: container: args: - --input_manifest_path - 
- '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -129,10 +123,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -142,7 +148,7 @@ deploymentSpec: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -151,10 +157,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - 
'{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -203,6 +221,9 @@ root: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json storage_args: runtimeValue: constant: a dummy string arg @@ -216,11 +237,6 @@ root: dependentTasks: - first-component inputs: - artifacts: - input_manifest_path: - taskOutputArtifact: - outputArtifactKey: output_manifest_path - producerTask: first-component parameters: cache: runtimeValue: @@ -247,6 +263,9 @@ root: items: type: float32 type: array + input_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json input_partition_rows: runtimeValue: constant: 10.0 @@ -255,6 +274,9 @@ root: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "second_component", "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json storage_args: runtimeValue: constant: a dummy string arg @@ -268,11 +290,6 @@ root: dependentTasks: - second-component inputs: - artifacts: - input_manifest_path: - taskOutputArtifact: - outputArtifactKey: output_manifest_path - producerTask: second-component parameters: cache: runtimeValue: @@ -308,11 +325,17 @@ root: fields: data: type: binary + input_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "third_component", "cache_key": "3"}' + output_manifest_path: + runtimeValue: + constant: 
/foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json storage_args: runtimeValue: constant: a dummy string arg diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml index a855ab698..4f44daea7 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -1,13 +1,10 @@ +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -17,28 +14,21 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 comp-second-component: executorLabel: exec-second-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -48,28 +38,21 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: 
system.Artifact - schemaVersion: 0.0.1 comp-third-component: executorLabel: exec-third-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -79,26 +62,25 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 deploymentSpec: executors: exec-first-component: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -107,10 +89,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -120,7 +114,7 @@ deploymentSpec: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - 
'{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -129,10 +123,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -142,7 +148,7 @@ deploymentSpec: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -151,10 +157,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ 
-203,6 +221,9 @@ root: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json storage_args: runtimeValue: constant: a dummy string arg @@ -216,11 +237,6 @@ root: dependentTasks: - first-component inputs: - artifacts: - input_manifest_path: - taskOutputArtifact: - outputArtifactKey: output_manifest_path - producerTask: first-component parameters: cache: runtimeValue: @@ -247,6 +263,9 @@ root: items: type: float32 type: array + input_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json input_partition_rows: runtimeValue: constant: 10.0 @@ -255,6 +274,9 @@ root: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "second_component", "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json storage_args: runtimeValue: constant: a dummy string arg @@ -268,11 +290,6 @@ root: dependentTasks: - second-component inputs: - artifacts: - input_manifest_path: - taskOutputArtifact: - outputArtifactKey: output_manifest_path - producerTask: second-component parameters: cache: runtimeValue: @@ -308,11 +325,17 @@ root: fields: data: type: binary + input_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "third_component", "cache_key": "3"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json storage_args: 
runtimeValue: constant: a dummy string arg diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index c9ba66940..a8837327c 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -1,13 +1,10 @@ +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -17,28 +14,21 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 comp-image-cropping: executorLabel: exec-image-cropping inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -52,28 +42,27 @@ components: defaultValue: -30.0 isOptional: true parameterType: NUMBER_INTEGER + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING padding: defaultValue: 10.0 isOptional: true parameterType: NUMBER_INTEGER - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 deploymentSpec: 
executors: exec-first-component: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -82,10 +71,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -95,7 +96,7 @@ deploymentSpec: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -104,12 +105,24 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --cropping_threshold - '{{$.inputs.parameters[''cropping_threshold'']}}' - --padding - 
'{{$.inputs.parameters[''padding'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -155,6 +168,9 @@ root: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json storage_args: runtimeValue: constant: a dummy string arg @@ -168,11 +184,6 @@ root: dependentTasks: - first-component inputs: - artifacts: - input_manifest_path: - taskOutputArtifact: - outputArtifactKey: output_manifest_path - producerTask: first-component parameters: cache: runtimeValue: @@ -213,11 +224,17 @@ root: cropping_threshold: runtimeValue: constant: 0.0 + input_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json padding: runtimeValue: constant: 0.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml index c9ba66940..a8837327c 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -1,13 +1,10 @@ +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline components: comp-first-component: executorLabel: exec-first-component inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - 
schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -17,28 +14,21 @@ components: defaultValue: {} isOptional: true parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING storage_args: parameterType: STRING - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 comp-image-cropping: executorLabel: exec-image-cropping inputDefinitions: - artifacts: - input_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true parameters: cache: defaultValue: true @@ -52,28 +42,27 @@ components: defaultValue: -30.0 isOptional: true parameterType: NUMBER_INTEGER + input_manifest_path: + isOptional: true + parameterType: STRING input_partition_rows: isOptional: true parameterType: NUMBER_INTEGER metadata: parameterType: STRING + output_manifest_path: + parameterType: STRING padding: defaultValue: 10.0 isOptional: true parameterType: NUMBER_INTEGER - outputDefinitions: - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 deploymentSpec: executors: exec-first-component: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -82,10 +71,22 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - 
'{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -95,7 +96,7 @@ deploymentSpec: container: args: - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --component_spec @@ -104,12 +105,24 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' - --cropping_threshold - '{{$.inputs.parameters[''cropping_threshold'']}}' - --padding - '{{$.inputs.parameters[''padding'']}}' - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' + - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute @@ -155,6 +168,9 @@ root: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json storage_args: runtimeValue: constant: a dummy string arg @@ -168,11 +184,6 @@ root: dependentTasks: - first-component inputs: - 
artifacts: - input_manifest_path: - taskOutputArtifact: - outputArtifactKey: output_manifest_path - producerTask: first-component parameters: cache: runtimeValue: @@ -213,11 +224,17 @@ root: cropping_threshold: runtimeValue: constant: 0.0 + input_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: /foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json padding: runtimeValue: constant: 0.0 diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/example_specs/component_specs/kubeflow_component.yaml index d175ea854..2401cc282 100644 --- a/tests/example_specs/component_specs/kubeflow_component.yaml +++ b/tests/example_specs/component_specs/kubeflow_component.yaml @@ -1,99 +1,99 @@ components: - comp-example-component: - executorLabel: exec-example-component - inputDefinitions: &id001 - artifacts: - input_manifest_path: - description: Path to the input manifest - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - isOptional: true - parameters: - component_spec: - description: The component specification as a dictionary - defaultValue: {} - isOptional: true - parameterType: STRUCT - input_partition_rows: - description: The number of rows to load per partition. Set to override the - automatic partitioning - isOptional: true - parameterType: NUMBER_INTEGER - cache: - parameterType: BOOLEAN - description: Set to False to disable caching, True by default. 
- defaultValue: true - isOptional: true - metadata: - description: Metadata arguments containing the run id and base path - parameterType: STRING - storage_args: - parameterType: STRING - description: Storage arguments - outputDefinitions: &id002 - artifacts: - output_manifest_path: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - description: Path to the output manifest + comp-example-component: + executorLabel: exec-example-component + inputDefinitions: &id001 + parameters: + input_manifest_path: + parameterType: STRING + description: Path to the input manifest + isOptional: true + component_spec: + parameterType: STRUCT + description: The component specification as a dictionary + defaultValue: {} + input_partition_rows: + parameterType: NUMBER_INTEGER + description: The number of rows to load per partition. Set + to override the automatic partitioning + isOptional: true + cache: + parameterType: BOOLEAN + description: Set to False to disable caching, True by default. 
+ defaultValue: true + metadata: + parameterType: STRING + description: Metadata arguments containing the run id and base + path + output_manifest_path: + parameterType: STRING + description: Path to the output manifest + storage_args: + parameterType: STRING + description: Storage arguments deploymentSpec: - executors: - exec-example-component: - container: - args: - - --input_manifest_path - - '{{$.inputs.artifacts[''input_manifest_path''].uri}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.outputs.artifacts[''output_manifest_path''].uri}}' - command: - - fondant - - execute - - main - image: example_component:latest + executors: + exec-example-component: + container: + args: + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' + command: + - fondant + - execute + - main + image: 
example_component:latest pipelineInfo: - name: example-component + name: example-component root: - dag: - outputs: - artifacts: - output_manifest_path: - artifactSelectors: - - outputArtifactKey: output_manifest_path - producerSubtask: example-component - tasks: - example-component: - cachingOptions: - enableCache: true - componentRef: - name: comp-example-component - inputs: - artifacts: - input_manifest_path: - componentInputArtifact: input_manifest_path - parameters: - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - metadata: - componentInputParameter: metadata - cache: - componentInputParameter: cache - taskInfo: - name: example-component - inputDefinitions: *id001 - outputDefinitions: *id002 + dag: + tasks: + example-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-example-component + inputs: + parameters: + input_manifest_path: + componentInputParameter: input_manifest_path + component_spec: + componentInputParameter: component_spec + input_partition_rows: + componentInputParameter: input_partition_rows + cache: + componentInputParameter: cache + metadata: + componentInputParameter: metadata + output_manifest_path: + componentInputParameter: output_manifest_path + storage_args: + componentInputParameter: storage_args + taskInfo: + name: example-component + inputDefinitions: *id001 schemaVersion: 2.1.0 sdkVersion: kfp-2.0.1 diff --git a/tests/test_component_specs.py b/tests/test_component_specs.py index 12baf3805..0b0909a77 100644 --- a/tests/test_component_specs.py +++ b/tests/test_component_specs.py @@ -6,11 +6,7 @@ import pytest import yaml -from fondant.component_spec import ( - ComponentSpec, - ComponentSubset, - KubeflowComponentSpec, -) +from fondant.component_spec import ComponentSpec, ComponentSubset, KubeflowComponentSpec from fondant.exceptions import InvalidComponentSpec from fondant.schema import Type @@ -84,7 +80,7 @@ def 
test_component_spec_no_args(valid_fondant_schema_no_args): assert fondant_component.name == "Example component" assert fondant_component.description == "This is an example component" - assert fondant_component.args == {} + assert fondant_component.args == fondant_component.default_arguments def test_component_spec_to_file(valid_fondant_schema): From 12b74aea93f25a5f1a3d7aea70f874cff87ee039 Mon Sep 17 00:00:00 2001 From: Georges Lorre Date: Thu, 21 Sep 2023 23:30:05 +0200 Subject: [PATCH 08/31] Add more default/optional argument logic --- src/fondant/executor.py | 17 ++++++++++++++--- .../components/arguments/component.yaml | 1 - tests/test_component.py | 6 +++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/fondant/executor.py b/src/fondant/executor.py index d338df2a7..24ac10039 100644 --- a/src/fondant/executor.py +++ b/src/fondant/executor.py @@ -188,9 +188,12 @@ def _add_and_parse_args(cls, spec: ComponentSpec): if arg.name in cls.optional_fondant_arguments(): input_required = False default = None - elif arg.default is not None or arg.optional is True: + elif arg.default is not None and arg.optional is False: input_required = False default = arg.default + elif arg.default is not None and arg.optional is True: + input_required = False + default = None else: input_required = True default = None @@ -204,6 +207,10 @@ def _add_and_parse_args(cls, spec: ComponentSpec): ) args, _ = parser.parse_known_args() + args.__dict__ = { + k: v if v != "None" else None for k, v in args.__dict__.items() + } + return args @staticmethod @@ -415,7 +422,7 @@ def _is_previous_cached(self, input_manifest: Manifest) -> bool: @staticmethod def optional_fondant_arguments() -> t.List[str]: - return ["input_manifest_path"] + return ["input_manifest_path", "input_partition_rows"] def _load_or_create_manifest(self) -> Manifest: return Manifest.create( @@ -479,6 +486,10 @@ def _execute_component( class PandasTransformExecutor(TransformExecutor[PandasTransformComponent]): + 
@staticmethod + def optional_fondant_arguments() -> t.List[str]: + return ["input_manifest_path", "input_partition_rows"] + @staticmethod def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """Factory that creates a function to wrap the component transform function. The wrapper: @@ -583,7 +594,7 @@ class DaskWriteExecutor(Executor[DaskWriteComponent]): @staticmethod def optional_fondant_arguments() -> t.List[str]: - return ["output_manifest_path"] + return ["input_partition_rows", "output_manifest_path"] def _load_or_create_manifest(self) -> Manifest: return Manifest.from_file(self.input_manifest_path) diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/example_specs/components/arguments/component.yaml index 021ce6ce6..659ed0026 100644 --- a/tests/example_specs/components/arguments/component.yaml +++ b/tests/example_specs/components/arguments/component.yaml @@ -65,5 +65,4 @@ args: optional_arg: description: optional argument type: str - optional: True default: None diff --git a/tests/test_component.py b/tests/test_component.py index 1473aabe6..215386d37 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -126,15 +126,15 @@ def _process_dataset(self, manifest: Manifest) -> t.Union[None, dd.DataFrame]: "list_default_arg": ["foo", "bar"], "dict_default_arg": {"foo": 1, "bar": 2}, "string_default_arg": "foo", - "string_default_arg_none": "None", + "string_default_arg_none": None, "integer_default_arg_none": 0, "float_default_arg_none": 0.0, "bool_default_arg_none": False, "list_default_arg_none": [], "dict_default_arg_none": {}, "override_default_arg": "bar", - "override_default_arg_with_none": "None", - "optional_arg": "None", + "override_default_arg_with_none": None, + "optional_arg": None, } From 32a8c04788cf17c9b6404b8a06a4a482dd4d5918 Mon Sep 17 00:00:00 2001 From: Georges Lorre Date: Mon, 2 Oct 2023 16:50:31 +0200 Subject: [PATCH 09/31] Add cluser_type to default args --- 
src/fondant/compiler.py | 7 +- src/fondant/component_spec.py | 8 + .../example_1/docker-compose.yml | 2 + .../example_1/kubeflow_pipeline.yml | 33 +++ .../example_1/vertex_pipeline.yml | 33 +++ .../example_2/kubeflow_pipeline.yml | 22 ++ .../example_2/vertex_pipeline.yml | 22 ++ .../component_specs/kubeflow_component.yaml | 195 +++++++++--------- 8 files changed, 227 insertions(+), 95 deletions(-) diff --git a/src/fondant/compiler.py b/src/fondant/compiler.py index 9b84a1cce..bd58fcbb9 100644 --- a/src/fondant/compiler.py +++ b/src/fondant/compiler.py @@ -370,8 +370,11 @@ def _set_configuration(self, task, fondant_component_operation): task.set_accelerator_type("cloud-tpus.google.com/v3") if node_pool_name is not None and node_pool_label is not None: - task.add_node_selector_constraint(node_pool_label, node_pool_name) - + task = self.kfp_kubernetes.add_node_selector( + task, + node_pool_label, + node_pool_name, + ) return task diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index 39e5744c1..87b8c2a61 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -247,6 +247,12 @@ def default_arguments(self) -> t.Dict[str, Argument]: type="bool", default=True, ), + "cluster_type": Argument( + name="cluster_type", + description="The cluster type to use for the execution", + type="str", + default="default", + ), "metadata": Argument( name="metadata", description="Metadata arguments containing the run id and base path", @@ -352,6 +358,8 @@ def from_fondant_component_spec(cls, fondant_component: ComponentSpec): "{{$.inputs.parameters['input_partition_rows']}}", "--cache", "{{$.inputs.parameters['cache']}}", + "--cluster_type", + "{{$.inputs.parameters['cluster_type']}}", *cls._dump_args(fondant_component.args.values()), "--output_manifest_path", "{{$.inputs.parameters['output_manifest_path']}}", diff --git a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml 
b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml index 7191ad0ea..362459873 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml @@ -25,6 +25,8 @@ services: "args": {"storage_args": {"description": "Storage arguments", "type": "str"}}}' depends_on: {} volumes: [] + ports: + - 8787:8787 second_component: build: args: [] diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index 4f44daea7..183eb537d 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -10,6 +10,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -34,6 +38,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -58,6 +66,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -89,6 +101,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -97,6 +111,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - 
'{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -123,6 +139,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -131,6 +149,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -157,6 +177,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -165,6 +187,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -194,6 +218,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: @@ -241,6 +268,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: @@ -294,6 +324,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml index 4f44daea7..183eb537d 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml +++ 
b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -10,6 +10,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -34,6 +38,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -58,6 +66,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -89,6 +101,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -97,6 +111,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -123,6 +139,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -131,6 +149,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -157,6 +177,8 @@ deploymentSpec: - 
'{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -165,6 +187,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -194,6 +218,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: @@ -241,6 +268,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: @@ -294,6 +324,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index a8837327c..5899d9cdd 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -10,6 +10,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -34,6 +38,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -71,6 +79,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - 
'{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -79,6 +89,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -105,6 +117,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -113,6 +127,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -144,6 +160,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: @@ -188,6 +207,9 @@ root: cache: runtimeValue: constant: true + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml index a8837327c..5899d9cdd 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -10,6 +10,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -34,6 +38,10 @@ components: defaultValue: true isOptional: true parameterType: BOOLEAN + cluster_type: 
+ defaultValue: default + isOptional: true + parameterType: STRING component_spec: defaultValue: {} isOptional: true @@ -71,6 +79,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -79,6 +89,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -105,6 +117,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec @@ -113,6 +127,8 @@ deploymentSpec: - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' - --metadata - '{{$.inputs.parameters[''metadata'']}}' - --output_manifest_path @@ -144,6 +160,9 @@ root: cache: runtimeValue: constant: false + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: @@ -188,6 +207,9 @@ root: cache: runtimeValue: constant: true + cluster_type: + runtimeValue: + constant: default component_spec: runtimeValue: constant: diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/example_specs/component_specs/kubeflow_component.yaml index 2401cc282..dd8797de6 100644 --- a/tests/example_specs/component_specs/kubeflow_component.yaml +++ b/tests/example_specs/component_specs/kubeflow_component.yaml @@ -1,99 +1,108 @@ components: - comp-example-component: - executorLabel: exec-example-component - 
inputDefinitions: &id001 - parameters: - input_manifest_path: - parameterType: STRING - description: Path to the input manifest - isOptional: true - component_spec: - parameterType: STRUCT - description: The component specification as a dictionary - defaultValue: {} - input_partition_rows: - parameterType: NUMBER_INTEGER - description: The number of rows to load per partition. Set - to override the automatic partitioning - isOptional: true - cache: - parameterType: BOOLEAN - description: Set to False to disable caching, True by default. - defaultValue: true - metadata: - parameterType: STRING - description: Metadata arguments containing the run id and base - path - output_manifest_path: - parameterType: STRING - description: Path to the output manifest - storage_args: - parameterType: STRING - description: Storage arguments + comp-example-component: + executorLabel: exec-example-component + inputDefinitions: &id001 + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + parameterType: BOOLEAN + cluster_type: + defaultValue: default + description: The cluster type to use for the execution + parameterType: STRING + component_spec: + defaultValue: {} + description: The component specification as a dictionary + parameterType: STRUCT + input_manifest_path: + description: Path to the input manifest + isOptional: true + parameterType: STRING + input_partition_rows: + description: The number of rows to load per partition. 
Set + to override the automatic partitioning + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + output_manifest_path: + description: Path to the output manifest + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING deploymentSpec: - executors: - exec-example-component: - container: - args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - command: - - fondant - - execute - - main - image: example_component:latest + executors: + exec-example-component: + container: + args: + - --input_manifest_path + - '{{$.inputs.parameters[''input_manifest_path'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' + - --input_manifest_path + - 
'{{$.inputs.parameters[''input_manifest_path'']}}' + - --component_spec + - '{{$.inputs.parameters[''component_spec'']}}' + - --input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' + - --cache + - '{{$.inputs.parameters[''cache'']}}' + - --cluster_type + - '{{$.inputs.parameters[''cluster_type'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' + command: + - fondant + - execute + - main + image: example_component:latest pipelineInfo: - name: example-component + name: example-component root: - dag: - tasks: - example-component: - cachingOptions: - enableCache: true - componentRef: - name: comp-example-component - inputs: - parameters: - input_manifest_path: - componentInputParameter: input_manifest_path - component_spec: - componentInputParameter: component_spec - input_partition_rows: - componentInputParameter: input_partition_rows - cache: - componentInputParameter: cache - metadata: - componentInputParameter: metadata - output_manifest_path: - componentInputParameter: output_manifest_path - storage_args: - componentInputParameter: storage_args - taskInfo: - name: example-component - inputDefinitions: *id001 + dag: + tasks: + example-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-example-component + inputs: + parameters: + cache: + componentInputParameter: cache + cluster_type: + componentInputParameter: cluster_type + component_spec: + componentInputParameter: component_spec + input_manifest_path: + componentInputParameter: input_manifest_path + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + output_manifest_path: + componentInputParameter: output_manifest_path + storage_args: + 
componentInputParameter: storage_args + taskInfo: + name: example-component + inputDefinitions: *id001 schemaVersion: 2.1.0 sdkVersion: kfp-2.0.1 From e49867b5aa0fce2194b5e1bdc67f441bc26b1df4 Mon Sep 17 00:00:00 2001 From: Georges Lorre Date: Mon, 2 Oct 2023 19:45:09 +0200 Subject: [PATCH 10/31] Fix ruff error --- tests/test_compiler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 20b47b1e2..0b92b08c9 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -5,7 +5,6 @@ import pytest import yaml - from fondant.compiler import DockerCompiler, KubeFlowCompiler, VertexCompiler from fondant.exceptions import InvalidPipelineDefinition from fondant.pipeline import ComponentOp, Pipeline From cf179f1304ac01a2d2f66fb1e8b8fdd76ac10ef0 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Mon, 9 Oct 2023 22:40:52 +0200 Subject: [PATCH 11/31] Fix isOptional and defaultValue conversion --- src/fondant/component_spec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index 87b8c2a61..c675fe9be 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -297,9 +297,9 @@ def convert_arguments(fondant_component: ComponentSpec): for arg in fondant_component.args.values(): arg_type_dict = {} - if arg.optional and arg.default is None: + if arg.optional or arg.default is None: arg_type_dict["isOptional"] = True - if arg.default is not None and arg.default is not None: + if arg.default is not None: arg_type_dict["defaultValue"] = arg.default args[arg.name] = { From 55fc4fee9b056d43305a056e6f6524b6bfa07089 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Mon, 9 Oct 2023 22:41:28 +0200 Subject: [PATCH 12/31] Update runner to use KfP v2 API --- src/fondant/runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fondant/runner.py b/src/fondant/runner.py index 59035e084..df898c97d 100644 
--- a/src/fondant/runner.py +++ b/src/fondant/runner.py @@ -73,18 +73,18 @@ def run( job_name = self.get_name_from_spec(input_spec) + "_run" # TODO add logic to see if pipeline exists runner = self.client.run_pipeline( - experiment_id=experiment.id, + experiment_id=experiment.experiment_id, job_name=job_name, pipeline_package_path=input_spec, ) - pipeline_url = f"{self.host}/#/runs/details/{runner.id}" + pipeline_url = f"{self.host}/#/runs/details/{runner.run_id}" logger.info(f"Pipeline is running at: {pipeline_url}") def get_name_from_spec(self, input_spec: str): """Get the name of the pipeline from the spec.""" with open(input_spec) as f: - spec = yaml.safe_load(f) + spec, *_ = yaml.safe_load_all(f) return spec["pipelineInfo"]["name"] From fa14ea06bae436f00065007dc9d5eab18f0e2ebb Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Mon, 9 Oct 2023 23:12:27 +0200 Subject: [PATCH 13/31] Change input_partition_rows to accept -1 as default --- src/fondant/component_spec.py | 2 +- src/fondant/data_io.py | 68 +++++++++++++++++------------------ src/fondant/executor.py | 7 ++-- src/fondant/pipeline.py | 7 +--- src/fondant/schema.py | 10 ------ 5 files changed, 39 insertions(+), 55 deletions(-) diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index c675fe9be..97fa01139 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -238,7 +238,7 @@ def default_arguments(self) -> t.Dict[str, Argument]: description="The number of rows to load per partition. 
\ Set to override the automatic partitioning", type="int", - default=None, + default=-1, optional=True, ), "cache": Argument( diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 75e4a6bf5..d324b19af 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -24,7 +24,7 @@ def __init__( *, manifest: Manifest, component_spec: ComponentSpec, - input_partition_rows: t.Optional[t.Union[int, str]] = None, + input_partition_rows: int, ): super().__init__(manifest=manifest, component_spec=component_spec) self.input_partition_rows = input_partition_rows @@ -36,42 +36,42 @@ def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: Returns: The partitioned dataframe. """ - if self.input_partition_rows != "disable": - if isinstance(self.input_partition_rows, int): - # Only load the index column to trigger a faster compute of the rows - total_rows = len(dataframe.index) - # +1 to handle any remainder rows - n_partitions = (total_rows // self.input_partition_rows) + 1 - dataframe = dataframe.repartition(npartitions=n_partitions) - logger.info( - f"Total number of rows is {total_rows}.\n" - f"Repartitioning the data from {dataframe.partitions} partitions to have" - f" {n_partitions} such that the number of partitions per row is approximately" - f"{self.input_partition_rows}", - ) + if self.input_partition_rows > 1: + # Only load the index column to trigger a faster compute of the rows + total_rows = len(dataframe.index) + # +1 to handle any remainder rows + n_partitions = (total_rows // self.input_partition_rows) + 1 + dataframe = dataframe.repartition(npartitions=n_partitions) + logger.info( + f"Total number of rows is {total_rows}.\n" + f"Repartitioning the data from {dataframe.partitions} partitions to have" + f" {n_partitions} such that the number of partitions per row is approximately" + f"{self.input_partition_rows}", + ) - elif self.input_partition_rows is None: - n_partitions = dataframe.npartitions - n_workers = os.cpu_count() 
- if n_partitions < n_workers: # type: ignore - logger.info( - f"The number of partitions of the input dataframe is {n_partitions}. The " - f"available number of workers is {n_workers}.", - ) - dataframe = dataframe.repartition(npartitions=n_workers) - logger.info( - f"Repartitioning the data to {n_workers} partitions before processing" - f" to maximize worker usage", - ) - else: - msg = ( - f"{self.input_partition_rows} is not a valid argument. Choose either " - f"the number of partitions or set to 'disable' to disable automated " - f"partitioning" + elif self.input_partition_rows == -1: + n_partitions = dataframe.npartitions + n_workers = os.cpu_count() + if n_partitions < n_workers: # type: ignore + logger.info( + f"The number of partitions of the input dataframe is {n_partitions}. The " + f"available number of workers is {n_workers}.", ) - raise ValueError( - msg, + dataframe = dataframe.repartition(npartitions=n_workers) + logger.info( + f"Repartitioning the data to {n_workers} partitions before processing" + f" to maximize worker usage", ) + else: + msg = ( + f"{self.input_partition_rows} is not a valid value for the 'input_partition_rows' " + f"parameter. It should be a number larger than 0 to indicate the number of " + f"expected rows per partition, or '-1' to let Fondant optimize the number of " + f"partitions based on the number of available workers." 
+ ) + raise ValueError( + msg, + ) return dataframe diff --git a/src/fondant/executor.py b/src/fondant/executor.py index 24ac10039..41e97829f 100644 --- a/src/fondant/executor.py +++ b/src/fondant/executor.py @@ -29,7 +29,6 @@ from fondant.component_spec import Argument, ComponentSpec from fondant.data_io import DaskDataLoader, DaskDataWriter from fondant.manifest import Manifest, Metadata -from fondant.schema import validate_partition_number dask.config.set({"dataframe.convert-string": False}) logger = logging.getLogger(__name__) @@ -65,7 +64,7 @@ def __init__( output_manifest_path: t.Union[str, Path], metadata: t.Dict[str, t.Any], user_arguments: t.Dict[str, t.Any], - input_partition_rows: t.Optional[t.Union[str, int]] = None, + input_partition_rows: int, cluster_type: t.Optional[str] = None, client_kwargs: t.Optional[dict] = None, ) -> None: @@ -111,7 +110,7 @@ def from_args(cls) -> "Executor": parser = argparse.ArgumentParser() parser.add_argument("--component_spec", type=json.loads) parser.add_argument("--cache", type=lambda x: bool(strtobool(x))) - parser.add_argument("--input_partition_rows", type=validate_partition_number) + parser.add_argument("--input_partition_rows", type=int) parser.add_argument("--cluster_type", type=str) parser.add_argument("--client_kwargs", type=json.loads) args, _ = parser.parse_known_args() @@ -140,7 +139,7 @@ def from_spec( component_spec: ComponentSpec, *, cache: bool, - input_partition_rows: t.Optional[t.Union[str, int]], + input_partition_rows: int, cluster_type: t.Optional[str], client_kwargs: t.Optional[dict], ) -> "Executor": diff --git a/src/fondant/pipeline.py b/src/fondant/pipeline.py index e95bd833e..0e2a296cf 100644 --- a/src/fondant/pipeline.py +++ b/src/fondant/pipeline.py @@ -16,7 +16,6 @@ from fondant.component_spec import ComponentSpec from fondant.exceptions import InvalidPipelineDefinition from fondant.manifest import Manifest -from fondant.schema import validate_partition_number logger = 
logging.getLogger(__name__) @@ -107,11 +106,7 @@ def __init__( self.client_kwargs = client_kwargs self.arguments = arguments or {} - self._add_component_argument( - "input_partition_rows", - input_partition_rows, - validate_partition_number, - ) + self._add_component_argument("input_partition_rows", input_partition_rows) self._add_component_argument("cache", self.cache) self._add_component_argument("cluster_type", cluster_type) self._add_component_argument("client_kwargs", client_kwargs) diff --git a/src/fondant/schema.py b/src/fondant/schema.py index 46a73b1ef..fea6ebe37 100644 --- a/src/fondant/schema.py +++ b/src/fondant/schema.py @@ -157,16 +157,6 @@ class Field(t.NamedTuple): type: Type -def validate_partition_number(arg_value): - if arg_value in ["disable", None, "None"]: - return arg_value if arg_value != "None" else None - try: - return int(arg_value) - except ValueError: - msg = f"Invalid format for '{arg_value}'. The value must be an integer or set to 'disable'" - raise InvalidTypeSchema(msg) - - def validate_partition_size(arg_value): if arg_value in ["disable", None, "None"]: return arg_value if arg_value != "None" else None From cd82baebed8bdc6357b7c05775ce650a725b0912 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 00:12:32 +0200 Subject: [PATCH 14/31] Update load_from_hf_hub defaults --- components/load_from_hf_hub/fondant_component.yaml | 4 ++-- components/load_from_hf_hub/src/main.py | 11 +++++------ .../load_from_hf_hub/fondant_component.yaml | 6 +++--- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 532b77d25..64090a6f2 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -19,11 +19,11 @@ args: description: Optional argument, a list containing the original image column names in case the dataset on the hub contains them. 
Used to format the image from HF hub format to a byte string. type: list - default: None + default: [] n_rows_to_load: description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale type: int - default: None + default: -1 index_column: description: Column to set index to in the load component, if not specified a default globally unique index will be set type: str diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 25fd6f989..e49e73e46 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -1,6 +1,5 @@ """This component loads a seed dataset from the hub.""" import logging -import typing as t import dask import dask.dataframe as dd @@ -20,9 +19,9 @@ def __init__(self, *_, dataset_name: str, column_name_mapping: dict, - image_column_names: t.Optional[list], - n_rows_to_load: t.Optional[int], - index_column: t.Optional[str], + image_column_names: list, + n_rows_to_load: int, + index_column: str, ) -> None: """ Args: @@ -60,7 +59,7 @@ def load(self) -> dd.DataFrame: dask_df = dask_df.rename(columns=self.column_name_mapping) # 4) Optional: only return specific amount of rows - if self.n_rows_to_load is not None: + if self.n_rows_to_load > 0: partitions_length = 0 npartitions = 1 for npartitions, partition in enumerate(dask_df.partitions, start=1): @@ -73,7 +72,7 @@ def load(self) -> dd.DataFrame: dask_df = dd.from_pandas(dask_df, npartitions=npartitions) # 4) Set the index - if self.index_column is None: + if self.index_column == "None": logger.info( "Index column not specified, setting a globally unique index", ) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index ae646ea54..19631eb1f 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ 
b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:dev +image: ghcr.io/ml6team/load_from_hf_hub:e49867b produces: images: @@ -50,11 +50,11 @@ args: description: Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. type: list - default: None + default: [] n_rows_to_load: description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale type: int - default: None + default: -1 index_column: description: Column to set index to in the load component, if not specified a default globally unique index will be set type: str From 987054b65fae1a833b53296e3a0b789101b00ef9 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 00:12:43 +0200 Subject: [PATCH 15/31] Update download_images defaults --- components/download_images/fondant_component.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index 0852da808..c1314d3d1 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -47,4 +47,4 @@ args: max_aspect_ratio: description: Maximum aspect ratio of the images. 
type: float - default: 'inf' \ No newline at end of file + default: 99.9 \ No newline at end of file From 347eec468706417b78fccfbc93688ce3a5b3809c Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Tue, 10 Oct 2023 15:15:30 +0200 Subject: [PATCH 16/31] re-enable cache --- src/fondant/compiler.py | 9 +++++++-- src/fondant/executor.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/fondant/compiler.py b/src/fondant/compiler.py index 1c919c97c..848be6b69 100644 --- a/src/fondant/compiler.py +++ b/src/fondant/compiler.py @@ -287,6 +287,7 @@ def compile( """ run_id = pipeline.get_run_id() pipeline.validate(run_id=run_id) + logger.info(f"Compiling {pipeline.name} to {output_path}") @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) def kfp_pipeline(): @@ -430,6 +431,8 @@ def compile( @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) def kfp_pipeline(): previous_component_task = None + component_cache_key = None + for component_name, component in pipeline._graph.items(): logger.info(f"Compiling service for {component_name}") @@ -443,13 +446,15 @@ def kfp_pipeline(): component_args = { k: v for k, v in component_op.arguments.items() if v is not None } - + component_cache_key = component_op.get_component_cache_key( + previous_component_cache=component_cache_key, + ) metadata = Metadata( pipeline_name=pipeline.name, run_id=run_id, base_path=pipeline.base_path, component_id=component_name, - cache_key=component_op.get_component_cache_key(), + cache_key=component_cache_key, ) output_manifest_path = ( diff --git a/src/fondant/executor.py b/src/fondant/executor.py index 41e97829f..16941663b 100644 --- a/src/fondant/executor.py +++ b/src/fondant/executor.py @@ -411,6 +411,7 @@ def upload_manifest(self, manifest: Manifest, save_path: t.Union[str, Path]): Path(save_path).parent.mkdir(parents=True, exist_ok=True) manifest.to_file(save_path) logger.info(f"Saving output manifest to {save_path}") + 
self._upload_cache_key(manifest=manifest, manifest_save_path=save_path) class DaskLoadExecutor(Executor[DaskLoadComponent]): From c89998cde306167b3d38c8e1f62fcd3bbbacf336 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 15:26:35 +0200 Subject: [PATCH 17/31] Fix tests --- src/fondant/component_spec.py | 25 +- src/fondant/data_io.py | 2 +- .../example_1/kubeflow_pipeline.yml | 165 ++++----- .../example_1/vertex_pipeline.json | 338 ------------------ .../example_1/vertex_pipeline.yml | 161 +++------ .../example_2/kubeflow_pipeline.yml | 138 ++++--- .../example_2/vertex_pipeline.yml | 138 ++++--- .../component_specs/kubeflow_component.yaml | 18 +- tests/test_compiler.py | 7 +- tests/test_data_io.py | 13 - tests/test_runner.py | 6 +- 11 files changed, 255 insertions(+), 756 deletions(-) delete mode 100644 tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index 97fa01139..969916b4e 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -224,14 +224,12 @@ def default_arguments(self) -> t.Dict[str, Argument]: name="input_manifest_path", description="Path to the input manifest", type="str", - default=None, optional=True, ), "component_spec": Argument( name="component_spec", description="The component specification as a dictionary", type="dict", - default={}, ), "input_partition_rows": Argument( name="input_partition_rows", @@ -239,7 +237,6 @@ def default_arguments(self) -> t.Dict[str, Argument]: Set to override the automatic partitioning", type="int", default=-1, - optional=True, ), "cache": Argument( name="cache", @@ -257,13 +254,11 @@ def default_arguments(self) -> t.Dict[str, Argument]: name="metadata", description="Metadata arguments containing the run id and base path", type="str", - default=None, ), "output_manifest_path": Argument( name="output_manifest_path", description="Path to the output manifest", type="str", - 
default=None, ), } @@ -297,7 +292,7 @@ def convert_arguments(fondant_component: ComponentSpec): for arg in fondant_component.args.values(): arg_type_dict = {} - if arg.optional or arg.default is None: + if arg.optional or arg.default is not None: arg_type_dict["isOptional"] = True if arg.default is not None: arg_type_dict["defaultValue"] = arg.default @@ -347,23 +342,7 @@ def from_fondant_component_spec(cls, fondant_component: ComponentSpec): "exec-" + cleaned_component_name: { "container": { - "args": [ - "--input_manifest_path", - "{{$.inputs.parameters['input_manifest_path']}}", - "--metadata", - "{{$.inputs.parameters['metadata']}}", - "--component_spec", - "{{$.inputs.parameters['component_spec']}}", - "--input_partition_rows", - "{{$.inputs.parameters['input_partition_rows']}}", - "--cache", - "{{$.inputs.parameters['cache']}}", - "--cluster_type", - "{{$.inputs.parameters['cluster_type']}}", - *cls._dump_args(fondant_component.args.values()), - "--output_manifest_path", - "{{$.inputs.parameters['output_manifest_path']}}", - ], + "args": cls._dump_args(fondant_component.args.values()), "command": ["fondant", "execute", "main"], "image": fondant_component.image, }, diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 9ab90506d..f5ce0e5eb 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -24,7 +24,7 @@ def __init__( *, manifest: Manifest, component_spec: ComponentSpec, - input_partition_rows: int, + input_partition_rows: int = -1, ): super().__init__(manifest=manifest, component_spec=component_spec) self.input_partition_rows = input_partition_rows diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index 183eb537d..85fe4853f 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -1,6 +1,7 @@ # PIPELINE 
DEFINITION # Name: testpipeline # Description: description of the test pipeline +--- components: comp-first-component: executorLabel: exec-first-component @@ -15,13 +16,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -43,13 +43,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -71,13 +70,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -91,74 +89,49 @@ deploymentSpec: exec-first-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path 
- - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - fondant - execute - main image: example_component:latest + resources: + memoryLimit: 0.512 + memoryRequest: 0.256 exec-second-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - 
"--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - fondant - execute @@ -167,36 +140,22 @@ deploymentSpec: exec-third-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - 
"{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - fondant - execute @@ -242,7 +201,7 @@ root: type: binary input_partition_rows: runtimeValue: - constant: 10.0 + constant: 10 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -250,7 +209,7 @@ root: "cache_key": "1"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg @@ -295,10 +254,10 @@ root: type: array input_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" input_partition_rows: runtimeValue: - constant: 10.0 + constant: 10 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -306,7 +265,7 @@ root: "cache_key": "2"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg @@ -360,7 +319,7 @@ root: type: binary input_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -368,7 +327,7 @@ root: 
"cache_key": "3"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json deleted file mode 100644 index 28ba9c42c..000000000 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.json +++ /dev/null @@ -1,338 +0,0 @@ -{ - "pipelineSpec": { - "components": { - "comp-first-component": { - "executorLabel": "exec-first-component", - "inputDefinitions": { - "parameters": { - "component_spec": { - "type": "STRING" - }, - "input_manifest_path": { - "type": "STRING" - }, - "input_partition_rows": { - "type": "STRING" - }, - "metadata": { - "type": "STRING" - }, - "storage_args": { - "type": "STRING" - } - } - }, - "outputDefinitions": { - "parameters": { - "output_manifest_path": { - "type": "STRING" - } - } - } - }, - "comp-second-component": { - "executorLabel": "exec-second-component", - "inputDefinitions": { - "parameters": { - "component_spec": { - "type": "STRING" - }, - "input_manifest_path": { - "type": "STRING" - }, - "input_partition_rows": { - "type": "STRING" - }, - "metadata": { - "type": "STRING" - }, - "storage_args": { - "type": "STRING" - } - } - }, - "outputDefinitions": { - "parameters": { - "output_manifest_path": { - "type": "STRING" - } - } - } - }, - "comp-third-component": { - "executorLabel": "exec-third-component", - "inputDefinitions": { - "parameters": { - "component_spec": { - "type": "STRING" - }, - "input_manifest_path": { - "type": "STRING" - }, - "input_partition_rows": { - "type": "STRING" - }, - "metadata": { - "type": "STRING" - }, - "some_list": { - "type": "STRING" - }, - "storage_args": { - "type": "STRING" - } - } - }, - 
"outputDefinitions": { - "parameters": { - "output_manifest_path": { - "type": "STRING" - } - } - } - } - }, - "deploymentSpec": { - "executors": { - "exec-first-component": { - "container": { - "command": [ - "python3", - "main.py", - "--input_manifest_path", - "{{$.inputs.parameters['input_manifest_path']}}", - "--metadata", - "{{$.inputs.parameters['metadata']}}", - "--component_spec", - "{{$.inputs.parameters['component_spec']}}", - "--input_partition_rows", - "{{$.inputs.parameters['input_partition_rows']}}", - "--storage_args", - "{{$.inputs.parameters['storage_args']}}", - "--output_manifest_path", - "{{$.outputs.parameters['output_manifest_path'].output_file}}" - ], - "image": "example_component:latest" - } - }, - "exec-second-component": { - "container": { - "command": [ - "python3", - "main.py", - "--input_manifest_path", - "{{$.inputs.parameters['input_manifest_path']}}", - "--metadata", - "{{$.inputs.parameters['metadata']}}", - "--component_spec", - "{{$.inputs.parameters['component_spec']}}", - "--input_partition_rows", - "{{$.inputs.parameters['input_partition_rows']}}", - "--storage_args", - "{{$.inputs.parameters['storage_args']}}", - "--output_manifest_path", - "{{$.outputs.parameters['output_manifest_path'].output_file}}" - ], - "image": "example_component:latest" - } - }, - "exec-third-component": { - "container": { - "command": [ - "python3", - "main.py", - "--input_manifest_path", - "{{$.inputs.parameters['input_manifest_path']}}", - "--metadata", - "{{$.inputs.parameters['metadata']}}", - "--component_spec", - "{{$.inputs.parameters['component_spec']}}", - "--input_partition_rows", - "{{$.inputs.parameters['input_partition_rows']}}", - "--storage_args", - "{{$.inputs.parameters['storage_args']}}", - "--some_list", - "{{$.inputs.parameters['some_list']}}", - "--output_manifest_path", - "{{$.outputs.parameters['output_manifest_path'].output_file}}" - ], - "image": "example_component:latest" - } - } - } - }, - "pipelineInfo": { - "name": 
"testpipeline" - }, - "root": { - "dag": { - "tasks": { - "first-component": { - "cachingOptions": { - "enableCache": true - }, - "componentRef": { - "name": "comp-first-component" - }, - "inputs": { - "parameters": { - "component_spec": { - "runtimeValue": { - "constantValue": { - "stringValue": "{\"name\": \"First component\", \"description\": \"This is an example component\", \"image\": \"example_component:latest\", \"produces\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}, \"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}}, \"args\": {\"storage_args\": {\"description\": \"Storage arguments\", \"type\": \"str\"}}}" - } - } - }, - "input_manifest_path": { - "runtimeValue": { - "constantValue": { - "stringValue": "" - } - } - }, - "input_partition_rows": { - "runtimeValue": { - "constantValue": { - "stringValue": "disable" - } - } - }, - "metadata": { - "runtimeValue": { - "constantValue": { - "stringValue": "{\"base_path\": \"/foo/bar\", \"run_id\": \"{{workflow.name}}\"}" - } - } - }, - "storage_args": { - "runtimeValue": { - "constantValue": { - "stringValue": "a dummy string arg" - } - } - } - } - }, - "taskInfo": { - "name": "first-component" - } - }, - "second-component": { - "cachingOptions": { - "enableCache": true - }, - "componentRef": { - "name": "comp-second-component" - }, - "dependentTasks": [ - "first-component" - ], - "inputs": { - "parameters": { - "component_spec": { - "runtimeValue": { - "constantValue": { - "stringValue": "{\"name\": \"Second component\", \"description\": \"This is an example component\", \"image\": \"example_component:latest\", \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}, \"produces\": {\"embeddings\": {\"fields\": {\"data\": {\"type\": \"array\", \"items\": {\"type\": \"float32\"}}}}}, \"args\": {\"storage_args\": {\"description\": \"Storage arguments\", \"type\": \"str\"}}}" - } - } - }, - "input_manifest_path": { - "taskOutputParameter": { - 
"outputParameterKey": "output_manifest_path", - "producerTask": "first-component" - } - }, - "input_partition_rows": { - "runtimeValue": { - "constantValue": { - "stringValue": "10" - } - } - }, - "metadata": { - "runtimeValue": { - "constantValue": { - "stringValue": "{\"base_path\": \"/foo/bar\", \"run_id\": \"{{workflow.name}}\"}" - } - } - }, - "storage_args": { - "runtimeValue": { - "constantValue": { - "stringValue": "a dummy string arg" - } - } - } - } - }, - "taskInfo": { - "name": "second-component" - } - }, - "third-component": { - "cachingOptions": { - "enableCache": true - }, - "componentRef": { - "name": "comp-third-component" - }, - "dependentTasks": [ - "second-component" - ], - "inputs": { - "parameters": { - "component_spec": { - "runtimeValue": { - "constantValue": { - "stringValue": "{\"name\": \"Third component\", \"description\": \"This is an example component\", \"image\": \"example_component:latest\", \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}, \"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}, \"embeddings\": {\"fields\": {\"data\": {\"type\": \"array\", \"items\": {\"type\": \"float32\"}}}}}, \"produces\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}, \"additionalSubsets\": false}, \"args\": {\"storage_args\": {\"description\": \"Storage arguments\", \"type\": \"str\"}, \"some_list\": {\"description\": \"Some list\", \"type\": \"list\", \"items\": {\"type\": \"int\"}}}}" - } - } - }, - "input_manifest_path": { - "taskOutputParameter": { - "outputParameterKey": "output_manifest_path", - "producerTask": "second-component" - } - }, - "input_partition_rows": { - "runtimeValue": { - "constantValue": { - "stringValue": "None" - } - } - }, - "metadata": { - "runtimeValue": { - "constantValue": { - "stringValue": "{\"base_path\": \"/foo/bar\", \"run_id\": \"{{workflow.name}}\"}" - } - } - }, - "some_list": { - "runtimeValue": { - "constantValue": { - "stringValue": "[1, 2, 3]" - } - } - 
}, - "storage_args": { - "runtimeValue": { - "constantValue": { - "stringValue": "a dummy string arg" - } - } - } - } - }, - "taskInfo": { - "name": "third-component" - } - } - } - } - }, - "schemaVersion": "2.0.0", - "sdkVersion": "kfp-1.8.22" - }, - "runtimeConfig": {} -} \ No newline at end of file diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml index 183eb537d..9bfd1054b 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -15,13 +15,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -43,13 +42,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -71,13 +69,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -91,36 +88,22 @@ deploymentSpec: exec-first-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - 
--cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - fondant - execute @@ -129,36 +112,22 @@ deploymentSpec: exec-second-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - 
'{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - fondant - execute @@ -167,36 +136,22 @@ deploymentSpec: exec-third-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - 
'{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - fondant - execute @@ -242,7 +197,7 @@ root: type: binary input_partition_rows: runtimeValue: - constant: 10.0 + constant: 10 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -250,7 +205,7 @@ root: "cache_key": "1"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg @@ -295,10 +250,10 @@ root: type: array input_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" input_partition_rows: runtimeValue: - constant: 10.0 + constant: 10 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -306,7 +261,7 @@ root: "cache_key": "2"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json + constant: 
"/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg @@ -360,7 +315,7 @@ root: type: binary input_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -368,7 +323,7 @@ root: "cache_key": "3"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index 5899d9cdd..9d46ddb75 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -1,6 +1,7 @@ # PIPELINE DEFINITION # Name: testpipeline # Description: description of the test pipeline +--- components: comp-first-component: executorLabel: exec-first-component @@ -15,13 +16,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -43,17 +43,16 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT cropping_threshold: - defaultValue: -30.0 + defaultValue: -30 isOptional: true parameterType: NUMBER_INTEGER input_manifest_path: isOptional: true parameterType: STRING 
input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -61,7 +60,7 @@ components: output_manifest_path: parameterType: STRING padding: - defaultValue: 10.0 + defaultValue: 10 isOptional: true parameterType: NUMBER_INTEGER deploymentSpec: @@ -69,36 +68,22 @@ deploymentSpec: exec-first-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - 
fondant - execute @@ -107,38 +92,24 @@ deploymentSpec: exec-image-cropping: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --cropping_threshold - - '{{$.inputs.parameters[''cropping_threshold'']}}' - - --padding - - '{{$.inputs.parameters[''padding'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--cropping_threshold" + - "{{$.inputs.parameters['cropping_threshold']}}" + - "--padding" + - "{{$.inputs.parameters['padding']}}" command: - fondant - execute @@ -189,7 +160,7 @@ root: "cache_key": "1"}' output_manifest_path: runtimeValue: - constant: 
/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg @@ -215,13 +186,13 @@ root: constant: args: cropping_threshold: - default: -30.0 + default: -30 description: Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 type: int padding: - default: 10.0 + default: 10 description: Padding for the image cropping. The padding is added to all borders of the image. type: int @@ -230,8 +201,21 @@ root: fields: data: type: binary - description: Component that removes single-colored borders around - images and crops them appropriately + description: "This component crops out image borders. This is typically + useful when working with graphical \nimages that have single-color + borders (e.g. logos, icons, etc.).\n\nThe component takes an image + and calculates which color is most present in the border. It then + \ncrops the image in order to minimize this single-color border. + The `padding` argument will add \nextra border to the image before + cropping it, in order to avoid cutting off parts of the image.\nThe + resulting crop will always be square. If a crop is not possible, + the component will return \nthe original image.\n\n#### Examples\nExamples + of image cropping by removing the single-color border. Left side + is original image, \nright side is border-cropped image.\n\n![Example + of image cropping by removing the single-color border. Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example + of image cropping by removing the single-color border. 
Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n" image: ghcr.io/ml6team/image_cropping:dev name: Image cropping produces: @@ -245,10 +229,10 @@ root: type: int32 cropping_threshold: runtimeValue: - constant: 0.0 + constant: 0 input_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -256,10 +240,10 @@ root: "cache_key": "2"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json" padding: runtimeValue: - constant: 0.0 + constant: 0 taskInfo: name: image-cropping schemaVersion: 2.1.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml index 5899d9cdd..9d46ddb75 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -1,6 +1,7 @@ # PIPELINE DEFINITION # Name: testpipeline # Description: description of the test pipeline +--- components: comp-first-component: executorLabel: exec-first-component @@ -15,13 +16,12 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -43,17 +43,16 @@ components: isOptional: true parameterType: STRING component_spec: - defaultValue: {} - isOptional: true parameterType: STRUCT cropping_threshold: - 
defaultValue: -30.0 + defaultValue: -30 isOptional: true parameterType: NUMBER_INTEGER input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -61,7 +60,7 @@ components: output_manifest_path: parameterType: STRING padding: - defaultValue: 10.0 + defaultValue: 10 isOptional: true parameterType: NUMBER_INTEGER deploymentSpec: @@ -69,36 +68,22 @@ deploymentSpec: exec-first-component: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --storage_args - - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - 
"--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" command: - fondant - execute @@ -107,38 +92,24 @@ deploymentSpec: exec-image-cropping: container: args: - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' - - --cropping_threshold - - '{{$.inputs.parameters[''cropping_threshold'']}}' - - --padding - - '{{$.inputs.parameters[''padding'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--cropping_threshold" + - "{{$.inputs.parameters['cropping_threshold']}}" + - "--padding" + - 
"{{$.inputs.parameters['padding']}}" command: - fondant - execute @@ -189,7 +160,7 @@ root: "cache_key": "1"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" storage_args: runtimeValue: constant: a dummy string arg @@ -215,13 +186,13 @@ root: constant: args: cropping_threshold: - default: -30.0 + default: -30 description: Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 type: int padding: - default: 10.0 + default: 10 description: Padding for the image cropping. The padding is added to all borders of the image. type: int @@ -230,8 +201,21 @@ root: fields: data: type: binary - description: Component that removes single-colored borders around - images and crops them appropriately + description: "This component crops out image borders. This is typically + useful when working with graphical \nimages that have single-color + borders (e.g. logos, icons, etc.).\n\nThe component takes an image + and calculates which color is most present in the border. It then + \ncrops the image in order to minimize this single-color border. + The `padding` argument will add \nextra border to the image before + cropping it, in order to avoid cutting off parts of the image.\nThe + resulting crop will always be square. If a crop is not possible, + the component will return \nthe original image.\n\n#### Examples\nExamples + of image cropping by removing the single-color border. Left side + is original image, \nright side is border-cropped image.\n\n![Example + of image cropping by removing the single-color border. Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example + of image cropping by removing the single-color border. 
Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n" image: ghcr.io/ml6team/image_cropping:dev name: Image cropping produces: @@ -245,10 +229,10 @@ root: type: int32 cropping_threshold: runtimeValue: - constant: 0.0 + constant: 0 input_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -256,10 +240,10 @@ root: "cache_key": "2"}' output_manifest_path: runtimeValue: - constant: /foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json" padding: runtimeValue: - constant: 0.0 + constant: 0 taskInfo: name: image-cropping schemaVersion: 2.1.0 diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/example_specs/component_specs/kubeflow_component.yaml index dd8797de6..4ecedde7d 100644 --- a/tests/example_specs/component_specs/kubeflow_component.yaml +++ b/tests/example_specs/component_specs/kubeflow_component.yaml @@ -6,13 +6,14 @@ components: cache: defaultValue: true description: Set to False to disable caching, True by default. + isOptional: true parameterType: BOOLEAN cluster_type: defaultValue: default description: The cluster type to use for the execution + isOptional: true parameterType: STRING component_spec: - defaultValue: {} description: The component specification as a dictionary parameterType: STRUCT input_manifest_path: @@ -20,6 +21,7 @@ components: isOptional: true parameterType: STRING input_partition_rows: + defaultValue: -1 description: The number of rows to load per partition. 
Set to override the automatic partitioning isOptional: true @@ -40,18 +42,6 @@ deploymentSpec: args: - --input_manifest_path - '{{$.inputs.parameters[''input_manifest_path'']}}' - - --metadata - - '{{$.inputs.parameters[''metadata'']}}' - - --component_spec - - '{{$.inputs.parameters[''component_spec'']}}' - - --input_partition_rows - - '{{$.inputs.parameters[''input_partition_rows'']}}' - - --cache - - '{{$.inputs.parameters[''cache'']}}' - - --cluster_type - - '{{$.inputs.parameters[''cluster_type'']}}' - - --input_manifest_path - - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec - '{{$.inputs.parameters[''component_spec'']}}' - --input_partition_rows @@ -66,8 +56,6 @@ deploymentSpec: - '{{$.inputs.parameters[''output_manifest_path'']}}' - --storage_args - '{{$.inputs.parameters[''storage_args'']}}' - - --output_manifest_path - - '{{$.inputs.parameters[''output_manifest_path'']}}' command: - fondant - execute diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 654274624..2de64b88c 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -518,10 +518,11 @@ def test_caching_dependency_kfp(tmp_path_factory): compiler.compile(pipeline=pipeline, output_path=output_path) with open(output_path) as src: spec = yaml.safe_load(src) - commands = spec["spec"]["templates"][1]["container"]["command"] - cache_key = json.loads(commands[commands.index("--metadata") + 1])[ - "cache_key" + params = spec["root"]["dag"]["tasks"]["second-component"]["inputs"][ + "parameters" ] + metadata = params["metadata"]["runtimeValue"]["constant"] + cache_key = json.loads(metadata)["cache_key"] second_component_cache_key_dict[arg] = cache_key assert ( diff --git a/tests/test_data_io.py b/tests/test_data_io.py index 5b63e3292..56e0bad04 100644 --- a/tests/test_data_io.py +++ b/tests/test_data_io.py @@ -89,18 +89,6 @@ def test_load_dataframe_rows(manifest, component_spec): assert dataframe.npartitions == expected_partitions -def 
test_load_dataframe_disable(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" - dl = DaskDataLoader( - manifest=manifest, - component_spec=component_spec, - input_partition_rows="disable", - ) - dataframe = dl.load_dataframe() - expected_partitions = 3 # original partitions - assert dataframe.npartitions == expected_partitions - - def test_write_index( tmp_path_factory, dataframe, @@ -128,7 +116,6 @@ def test_write_index( ) -# def test_write_subsets( tmp_path_factory, dataframe, diff --git a/tests/test_runner.py b/tests/test_runner.py index b73deb32f..a8afee28e 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -31,7 +31,7 @@ def test_docker_runner(): class MockKfpClient: def __init__(self, host): self.host = host - self._experiments = {"Default": SimpleNamespace(id="123")} + self._experiments = {"Default": SimpleNamespace(experiment_id="123")} def get_experiment(self, experiment_name): try: @@ -40,11 +40,11 @@ def get_experiment(self, experiment_name): raise ValueError def create_experiment(self, experiment_name): - self._experiments[experiment_name] = SimpleNamespace(id="456") + self._experiments[experiment_name] = SimpleNamespace(experiment_id="456") return self.get_experiment(experiment_name) def run_pipeline(self, experiment_id, job_name, pipeline_package_path): - return SimpleNamespace(id="xyz") + return SimpleNamespace(run_id="xyz") def test_kubeflow_runner(): From 1815d02cdd40d0e7d373c0aed3bbe2b0b3a01205 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 15:38:14 +0200 Subject: [PATCH 18/31] Update datacomp pipeline --- examples/pipelines/datacomp/pipeline.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 68fd8166a..94c8b7e33 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -19,7 +19,6 @@ 
pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) # define ops @@ -45,18 +44,16 @@ }, node_pool_label="node_pool", node_pool_name="n2-standard-64-pool", - cache=False, ) download_images_op = ComponentOp.from_registry( name="download_images", arguments={ "retries": 2, "min_image_size": 0, - "max_aspect_ratio": float("inf"), }, node_pool_label="node_pool", node_pool_name="n2-standard-64-pool", - input_partition_rows=1000, + input_partition_rows=-1, cache=False, ) detect_text_op = ComponentOp( @@ -77,12 +74,12 @@ cache=False, ) embed_images_op = ComponentOp.from_registry( - name="image_embedding", + name="embed_images", arguments={ "batch_size": 2, }, node_pool_label="node_pool", - node_pool_name="model-inference-mega-pool", + node_pool_name="model-inference-pool", number_of_accelerators=1, accelerator_name="GPU", cache=False, @@ -104,9 +101,9 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) -# pipeline.add_op(download_images_op, dependencies=load_from_hub_op) +pipeline.add_op(download_images_op, dependencies=load_from_hub_op) # pipeline.add_op(detect_text_op, dependencies=download_images_op) # pipeline.add_op(mask_images_op, dependencies=detect_text_op) -# pipeline.add_op(embed_images_op, dependencies=mask_images_op) +pipeline.add_op(embed_images_op, dependencies=download_images_op) # pipeline.add_op(add_clip_score_op, dependencies=embed_images_op) # pipeline.add_op(filter_clip_score_op, dependencies=add_clip_score_op) From 931df5654c75b7154da871e5a82a9fb501d13e4d Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 15:49:18 +0200 Subject: [PATCH 19/31] Remove python version upper bound --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 54e25a26e..8682da5f5 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = ">= 3.8 < 3.11" +python = ">= 3.8" dask = {extras = ["dataframe", "distributed", "diagnostics"], version = ">= 2023.4.1"} importlib-resources = { version = ">= 1.3", python = "<3.9" } jsonschema = ">= 4.18" From 4342aa82eb83353857f8986a154762e4dc5c5116 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 15:50:09 +0200 Subject: [PATCH 20/31] Re-add test suite for Python 3.11 --- .github/workflows/pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index 11deb2329..38bc56cce 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} From a789c03c6365a6771b82b5084f5c6826a5235e0e Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 16:09:21 +0200 Subject: [PATCH 21/31] Add Python 3.12 upper bound --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8682da5f5..687db4533 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = ">= 3.8" +python = ">= 3.8, <3.12" dask = {extras = ["dataframe", "distributed", "diagnostics"], version = ">= 2023.4.1"} importlib-resources = { version = ">= 1.3", python = "<3.9" } jsonschema = ">= 4.18" From 21088941fa21a0a68d94e4b8d2e049342666f423 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 16:18:51 +0200 Subject: [PATCH 22/31] Add gcp dependencies to vertex extra --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 687db4533..7982ce3d4 100644 
--- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ aws = ["fsspec", "s3fs"] azure = ["fsspec", "adlfs"] gcp = ["fsspec", "gcsfs"] kfp = ["kfp"] -vertex = ["kfp", "google-cloud-aiplatform"] +vertex = ["kfp", "google-cloud-aiplatform", "fsspec", "gcsfs"] [tool.poetry.group.test.dependencies] pre-commit = "^3.1.1" From b778f5e064b7ad99eeb21a62ae5c26c594af148c Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 16:20:47 +0200 Subject: [PATCH 23/31] Address PR comments --- src/fondant/component_spec.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index 969916b4e..3698ae883 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -191,13 +191,7 @@ def outputs_additional_subsets(self) -> bool: @property def args(self) -> t.Mapping[str, Argument]: - def _is_optional(arg_information): - if "default" in arg_information: - return arg_information["default"] == "None" - return False - args = self.default_arguments - args.update( { name: Argument( @@ -205,7 +199,7 @@ def _is_optional(arg_information): description=arg_info["description"], type=arg_info["type"], default=arg_info["default"] if "default" in arg_info else None, - optional=_is_optional(arg_info), + optional=arg_info.get("default") == "None", ) for name, arg_info in self._specification.get("args", {}).items() }, @@ -320,7 +314,7 @@ def sanitize_component_name(name: str) -> str: @classmethod def from_fondant_component_spec(cls, fondant_component: ComponentSpec): - """Generate a Kubeflow component spec from a ComponentOp.""" + """Generate a Kubeflow component spec from a Fondant component spec.""" input_definitions = { "parameters": { **cls.convert_arguments(fondant_component), From c7e3a9facd1912810cda95c52f81a08d0b6d9dda Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 16:54:55 +0200 Subject: [PATCH 24/31] Add Python 3.12 trove classifier to pyproject.toml 
--- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7982ce3d4..f94cb6e19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ classifiers = [ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Software Development", "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", From c6601eb9984eacfe373b8652574d74d06990e83f Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 17:26:35 +0200 Subject: [PATCH 25/31] Lower python upper bound to 3.11 again to prevent slow dependency resolving --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f94cb6e19..c1f1155a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = ">= 3.8, <3.12" +python = ">= 3.8, <3.11" dask = {extras = ["dataframe", "distributed", "diagnostics"], version = ">= 2023.4.1"} importlib-resources = { version = ">= 1.3", python = "<3.9" } jsonschema = ">= 4.18" @@ -51,16 +51,16 @@ fsspec = { version = ">= 2023.4.0", optional = true} gcsfs = { version = ">= 2023.4.0", optional = true } s3fs = { version = ">= 2023.4.0", optional = true } adlfs = { version = ">= 2023.4.0", optional = true } -kfp = { version = "2.0.1", optional = true, extras =["kubernetes"] } +kfp = { version = "2.3.0", optional = true, extras =["kubernetes"] } pandas = { version = ">= 1.3.5", optional = true } -google-cloud-aiplatform = { version = "1.32.0", optional = true} +google-cloud-aiplatform = { version = "1.34.0", optional = true} [tool.poetry.extras] aws = ["fsspec", "s3fs"] azure = ["fsspec", "adlfs"] gcp = ["fsspec", "gcsfs"] kfp = ["kfp"] -vertex = ["kfp", "google-cloud-aiplatform", "fsspec", "gcsfs"] +vertex = 
["google-cloud-aiplatform"] [tool.poetry.group.test.dependencies] pre-commit = "^3.1.1" From d32dffc8c8879890f18e1b2d33b9954581359dd4 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 17:30:38 +0200 Subject: [PATCH 26/31] Remove 3.11 test suite --- .github/workflows/pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index 38bc56cce..11deb2329 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10'] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} From fef720e437e1698328a6bada429de9aaf65aecf5 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 17:31:46 +0200 Subject: [PATCH 27/31] Address PR comments --- examples/pipelines/datacomp/pipeline.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 94c8b7e33..0846c2451 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -53,8 +53,6 @@ }, node_pool_label="node_pool", node_pool_name="n2-standard-64-pool", - input_partition_rows=-1, - cache=False, ) detect_text_op = ComponentOp( component_dir="components/detect_text", @@ -65,13 +63,11 @@ node_pool_name="model-inference-mega-pool", number_of_accelerators=1, accelerator_name="GPU", - cache=False, ) mask_images_op = ComponentOp( component_dir="components/mask_images", node_pool_label="node_pool", node_pool_name="n2-standard-64-pool", - cache=False, ) embed_images_op = ComponentOp.from_registry( name="embed_images", @@ -82,13 +78,11 @@ node_pool_name="model-inference-pool", number_of_accelerators=1, accelerator_name="GPU", - cache=False, ) add_clip_score_op = ComponentOp( 
component_dir="components/add_clip_score", node_pool_label="node_pool", node_pool_name="n2-standard-64-pool", - cache=False, ) filter_clip_score_op = ComponentOp( component_dir="components/filter_clip_score", From 3cb3a27940f4ca103ca0bfb17793682cc5ee46a2 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 17:50:45 +0200 Subject: [PATCH 28/31] Changes based on self-review --- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 19631eb1f..50f983acd 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:e49867b +image: ghcr.io/ml6team/load_from_hf_hub:dev produces: images: diff --git a/pyproject.toml b/pyproject.toml index c1f1155a7..e3bc81e92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ aws = ["fsspec", "s3fs"] azure = ["fsspec", "adlfs"] gcp = ["fsspec", "gcsfs"] kfp = ["kfp"] -vertex = ["google-cloud-aiplatform"] +vertex = ["kfp", "google-cloud-aiplatform"] [tool.poetry.group.test.dependencies] pre-commit = "^3.1.1" From 35dad13c81fc2aabfabeb105a536e764b2771d98 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 18:06:35 +0200 Subject: [PATCH 29/31] Update component defaults for kfpv2 --- components/load_from_parquet/fondant_component.yaml | 4 ++-- components/load_from_parquet/src/main.py | 8 ++++---- components/segment_images/fondant_component.yaml | 2 +- components/write_to_hf_hub/fondant_component.yaml | 4 ++-- components/write_to_hf_hub/src/main.py | 6 +++--- 
.../extract_images_from_warc/fondant_component.yaml | 2 +- .../components/read_warc_paths/fondant_component.yaml | 2 +- .../commoncrawl/components/read_warc_paths/src/main.py | 4 ++-- .../components/generate_prompts/fondant_component.yaml | 2 +- .../components/generate_prompts/src/main.py | 2 +- .../write_to_hub_controlnet/fondant_component.yaml | 4 ++-- .../components/add_clip_score/fondant_component.yaml | 1 - .../components/load_from_hf_hub/fondant_component.yaml | 4 ++-- .../components/load_from_hf_hub/fondant_component.yaml | 4 ++-- .../components/write_to_hf_hub/fondant_component.yaml | 4 ++-- .../components/load_from_hub/fondant_component.yaml | 4 ++-- 16 files changed, 28 insertions(+), 29 deletions(-) diff --git a/components/load_from_parquet/fondant_component.yaml b/components/load_from_parquet/fondant_component.yaml index 73606b090..9f128a1cb 100644 --- a/components/load_from_parquet/fondant_component.yaml +++ b/components/load_from_parquet/fondant_component.yaml @@ -15,11 +15,11 @@ args: column_name_mapping: description: Mapping of the consumed dataset type: dict - default: None + default: {} n_rows_to_load: description: Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale type: int - default: None + default: -1 index_column: description: Column to set index to in the load component, if not specified a default globally unique index will be set type: str diff --git a/components/load_from_parquet/src/main.py b/components/load_from_parquet/src/main.py index 429924bac..84d68c73c 100644 --- a/components/load_from_parquet/src/main.py +++ b/components/load_from_parquet/src/main.py @@ -19,8 +19,8 @@ def __init__(self, spec: ComponentSpec, *_, dataset_uri: str, - column_name_mapping: t.Optional[dict], - n_rows_to_load: t.Optional[int], + column_name_mapping: dict, + n_rows_to_load: int, index_column: t.Optional[str], ) -> None: """ @@ -45,12 +45,12 @@ def load(self) -> dd.DataFrame: dask_df = dd.read_parquet(self.dataset_uri) # 2) Rename columns - if self.column_name_mapping is not None: + if self.column_name_mapping: logger.info("Renaming columns...") dask_df = dask_df.rename(columns=self.column_name_mapping) # 3) Optional: only return specific amount of rows - if self.n_rows_to_load is not None: + if self.n_rows_to_load > 0: partitions_length = 0 npartitions = 1 for npartitions, partition in enumerate(dask_df.partitions, start=1): diff --git a/components/segment_images/fondant_component.yaml b/components/segment_images/fondant_component.yaml index f0f73a7f1..8f32d14f6 100644 --- a/components/segment_images/fondant_component.yaml +++ b/components/segment_images/fondant_component.yaml @@ -22,4 +22,4 @@ args: batch_size: description: batch size to use type: int - batch_size: 8 \ No newline at end of file + default: 8 \ No newline at end of file diff --git a/components/write_to_hf_hub/fondant_component.yaml b/components/write_to_hf_hub/fondant_component.yaml index 88be6331c..59c69a093 100644 --- a/components/write_to_hf_hub/fondant_component.yaml +++ b/components/write_to_hf_hub/fondant_component.yaml @@ -21,8 +21,8 @@ args: image_column_names: description: A list containing the image column 
names. Used to format to image to HF hub format type: list - default: None + default: [] column_name_mapping: description: Mapping of the consumed fondant column names to the written hub column names type: dict - default: None \ No newline at end of file + default: {} \ No newline at end of file diff --git a/components/write_to_hf_hub/src/main.py b/components/write_to_hf_hub/src/main.py index 772b04648..022ff7802 100644 --- a/components/write_to_hf_hub/src/main.py +++ b/components/write_to_hf_hub/src/main.py @@ -39,8 +39,8 @@ def __init__(self, hf_token: str, username: str, dataset_name: str, - image_column_names: t.Optional[list], - column_name_mapping: t.Optional[dict], + image_column_names: list, + column_name_mapping: dict, ): """ Args: @@ -87,7 +87,7 @@ def write( # Map image column to hf data format feature_encoder = datasets.Image(decode=True) - if self.image_column_names is not None: + if self.image_column_names: for image_column_name in self.image_column_names: dataframe[image_column_name] = dataframe[image_column_name].map( lambda x: convert_bytes_to_image(x, feature_encoder), diff --git a/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml b/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml index 01c8e38f4..175edf2ea 100644 --- a/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml +++ b/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml @@ -26,4 +26,4 @@ args: extract_plain_text: description: If set to true the data contains the plain text without html tags type: bool - default: "False" \ No newline at end of file + default: False \ No newline at end of file diff --git a/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml b/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml index 229afb05e..8b774da57 100644 --- 
a/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml +++ b/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml @@ -15,4 +15,4 @@ args: n_records_to_download: description: Number of records to download type: int - default: None \ No newline at end of file + default: -1 \ No newline at end of file diff --git a/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py b/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py index 7c642fba3..094c9ce3e 100644 --- a/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py +++ b/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py @@ -18,7 +18,7 @@ def __init__( self, *_, common_crawl_indices: t.List[str], - n_records_to_download: t.Optional[int] = None, + n_records_to_download: int, ): self.index_urls = [ self.build_index_url(index_name) for index_name in common_crawl_indices @@ -38,7 +38,7 @@ def load(self) -> dd.DataFrame: warc_urls.extend([line.decode() for line in extracted.split(b"\n")]) df = pd.DataFrame(warc_urls, columns=["warc_url"]) - if self.n_records_to_download is not None: + if self.n_records_to_download > 0: df = df.head(self.n_records_to_download) return dd.from_pandas(df, npartitions=len(df) // 100) diff --git a/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml b/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml index b47ccf119..b98226870 100644 --- a/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml +++ b/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml @@ -12,4 +12,4 @@ args: n_rows_to_load: description: Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale type: int - default: None \ No newline at end of file + default: -1 \ No newline at end of file diff --git a/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py b/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py index 9d58a287a..fff2ef46c 100644 --- a/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py +++ b/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py @@ -114,7 +114,7 @@ def load(self) -> dd.DataFrame: pandas_df = pd.DataFrame(prompts, columns=["prompts_text"]) - if self.n_rows_to_load: + if self.n_rows_to_load > 0: pandas_df = pandas_df.head(self.n_rows_to_load) df = dd.from_pandas(pandas_df, npartitions=1) diff --git a/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml b/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml index 4915810f0..62a7c8209 100644 --- a/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml +++ b/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml @@ -31,8 +31,8 @@ args: image_column_names: description: A list containing the image column names. 
Used to format to image to HF hub format type: list - default: None + default: [] column_name_mapping: description: Mapping of the consumed fondant column names to the written hub column names type: dict - default: None \ No newline at end of file + default: {} \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml b/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml index c91b2d7a3..eb973b865 100644 --- a/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml @@ -16,7 +16,6 @@ consumes: items: type: float32 - produces: imagetext: fields: diff --git a/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml index a53eab85e..fda34b610 100644 --- a/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml @@ -31,11 +31,11 @@ args: description: Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. type: list - default: None + default: [] n_rows_to_load: description: Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale type: int - default: None + default: -1 index_column: description: Column to set index to in the load component, if not specified a default globally unique index will be set type: str diff --git a/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml index f79232360..aa92302e5 100644 --- a/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml @@ -24,11 +24,11 @@ args: description: Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. type: list - default: None + default: [] n_rows_to_load: description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale type: int - default: None + default: -1 index_column: description: Column to set index to in the load component, if not specified a default globally unique index will be set type: str diff --git a/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml b/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml index 9d94e6aa4..4e7119f2c 100644 --- a/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml +++ b/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml @@ -26,8 +26,8 @@ args: image_column_names: description: A list containing the image column names. 
Used to format to image to HF hub format type: list - default: None + default: [] column_name_mapping: description: Mapping of the consumed fondant column names to the written hub column names type: dict - default: None \ No newline at end of file + default: {} \ No newline at end of file diff --git a/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml b/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml index 288314b79..379d12f0c 100644 --- a/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml +++ b/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml @@ -33,11 +33,11 @@ args: description: A list containing the original hub image column names. Used to format the image from HF hub format to a byte string type: list - default: None + default: [] n_rows_to_load: description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale type: int - default: None + default: -1 index_column: description: Column to set index to in the load component, if not specified a default globally unique index will be set type: str From c7ab7d570739bf56d635e7b7009c5c3745c11620 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 10 Oct 2023 18:20:51 +0200 Subject: [PATCH 30/31] Fix tests for kfp 2.3.0 --- .../example_1/kubeflow_pipeline.yml | 2 +- .../example_1/vertex_pipeline.yml | 6 +++--- .../example_2/kubeflow_pipeline.yml | 18 +++++++++--------- .../example_2/vertex_pipeline.yml | 16 ++++++++-------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index 85fe4853f..75e3fed87 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -334,4 +334,4 @@ root: 
taskInfo: name: third-component schemaVersion: 2.1.0 -sdkVersion: kfp-2.0.1 +sdkVersion: kfp-2.3.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml index 9bfd1054b..744ef2026 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -197,7 +197,7 @@ root: type: binary input_partition_rows: runtimeValue: - constant: 10 + constant: 10.0 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -253,7 +253,7 @@ root: constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" input_partition_rows: runtimeValue: - constant: 10 + constant: 10.0 metadata: runtimeValue: constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", @@ -330,4 +330,4 @@ root: taskInfo: name: third-component schemaVersion: 2.1.0 -sdkVersion: kfp-2.0.1 +sdkVersion: kfp-2.3.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index 9d46ddb75..c12879807 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -21,7 +21,7 @@ components: isOptional: true parameterType: STRING input_partition_rows: - defaultValue: -1 + defaultValue: -1.0 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -45,14 +45,14 @@ components: component_spec: parameterType: STRUCT cropping_threshold: - defaultValue: -30 + defaultValue: -30.0 isOptional: true parameterType: NUMBER_INTEGER input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: - defaultValue: -1 + defaultValue: -1.0 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -60,7 +60,7 @@ components: 
output_manifest_path: parameterType: STRING padding: - defaultValue: 10 + defaultValue: 10.0 isOptional: true parameterType: NUMBER_INTEGER deploymentSpec: @@ -186,13 +186,13 @@ root: constant: args: cropping_threshold: - default: -30 + default: -30.0 description: Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 type: int padding: - default: 10 + default: 10.0 description: Padding for the image cropping. The padding is added to all borders of the image. type: int @@ -229,7 +229,7 @@ root: type: int32 cropping_threshold: runtimeValue: - constant: 0 + constant: 0.0 input_manifest_path: runtimeValue: constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" @@ -243,8 +243,8 @@ root: constant: "/foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json" padding: runtimeValue: - constant: 0 + constant: 0.0 taskInfo: name: image-cropping schemaVersion: 2.1.0 -sdkVersion: kfp-2.0.1 +sdkVersion: kfp-2.3.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml index 9d46ddb75..305108c98 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -45,14 +45,14 @@ components: component_spec: parameterType: STRUCT cropping_threshold: - defaultValue: -30 + defaultValue: -30.0 isOptional: true parameterType: NUMBER_INTEGER input_manifest_path: isOptional: true parameterType: STRING input_partition_rows: - defaultValue: -1 + defaultValue: -1.0 isOptional: true parameterType: NUMBER_INTEGER metadata: @@ -60,7 +60,7 @@ components: output_manifest_path: parameterType: STRING padding: - defaultValue: 10 + defaultValue: 10.0 isOptional: true parameterType: NUMBER_INTEGER deploymentSpec: @@ -186,13 +186,13 @@ 
root: constant: args: cropping_threshold: - default: -30 + default: -30.0 description: Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 type: int padding: - default: 10 + default: 10.0 description: Padding for the image cropping. The padding is added to all borders of the image. type: int @@ -229,7 +229,7 @@ root: type: int32 cropping_threshold: runtimeValue: - constant: 0 + constant: 0.0 input_manifest_path: runtimeValue: constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" @@ -243,8 +243,8 @@ root: constant: "/foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json" padding: runtimeValue: - constant: 0 + constant: 0.0 taskInfo: name: image-cropping schemaVersion: 2.1.0 -sdkVersion: kfp-2.0.1 +sdkVersion: kfp-2.3.0 From 0c6d4035986d51bee270f185e4d678ea11e833ba Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 11 Oct 2023 10:15:51 +0200 Subject: [PATCH 31/31] disable kfpv2 default caching --- src/fondant/compiler.py | 6 ++++++ .../compiled_pipeline/example_1/kubeflow_pipeline.yml | 9 +++------ .../compiled_pipeline/example_1/vertex_pipeline.yml | 9 +++------ .../compiled_pipeline/example_2/kubeflow_pipeline.yml | 6 ++---- .../compiled_pipeline/example_2/vertex_pipeline.yml | 6 ++---- 5 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/fondant/compiler.py b/src/fondant/compiler.py index 848be6b69..907185c51 100644 --- a/src/fondant/compiler.py +++ b/src/fondant/compiler.py @@ -352,6 +352,9 @@ def kfp_pipeline(): component_op, ) + # Disable caching + component_task.set_caching_options(enable_caching=False) + previous_component_task = component_task logger.info(f"Compiling {pipeline.name} to {output_path}") @@ -490,6 +493,9 @@ def kfp_pipeline(): component_op, ) + # Disable caching + component_task.set_caching_options(enable_caching=False) + previous_component_task 
= component_task self.kfp.compiler.Compiler().compile(kfp_pipeline, output_path) # type: ignore diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index 75e3fed87..1e148f5c3 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -168,8 +168,7 @@ root: dag: tasks: first-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-first-component inputs: @@ -216,8 +215,7 @@ root: taskInfo: name: first-component second-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-second-component dependentTasks: @@ -272,8 +270,7 @@ root: taskInfo: name: second-component third-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-third-component dependentTasks: diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml index 744ef2026..bf34acc3f 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -164,8 +164,7 @@ root: dag: tasks: first-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-first-component inputs: @@ -212,8 +211,7 @@ root: taskInfo: name: first-component second-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-second-component dependentTasks: @@ -268,8 +266,7 @@ root: taskInfo: name: second-component third-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-third-component dependentTasks: diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml 
b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index c12879807..1fe3922b3 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -122,8 +122,7 @@ root: dag: tasks: first-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-first-component inputs: @@ -167,8 +166,7 @@ root: taskInfo: name: first-component image-cropping: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-image-cropping dependentTasks: diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml index 305108c98..882728792 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -122,8 +122,7 @@ root: dag: tasks: first-component: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-first-component inputs: @@ -167,8 +166,7 @@ root: taskInfo: name: first-component image-cropping: - cachingOptions: - enableCache: true + cachingOptions: {} componentRef: name: comp-image-cropping dependentTasks: