diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml
index 38bc56cce..11deb2329 100644
--- a/.github/workflows/pipeline.yaml
+++ b/.github/workflows/pipeline.yaml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.8', '3.9', '3.10']
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index adfaf43b3..aa7d5d0de 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -56,11 +56,11 @@ repos:
           - types-requests
         pass_filenames: false
 
-  - repo: local
-    hooks:
-      - id: generate_component_readmes
-        name: Generate component READMEs
-        language: python
-        entry: python scripts/component_readme/generate_readme.py
-        files: ^components/.*/fondant_component.yaml
-        additional_dependencies: ["fondant"]
\ No newline at end of file
+# - repo: local
+#   hooks:
+#     - id: generate_component_readmes
+#       name: Generate component READMEs
+#       language: python
+#       entry: python scripts/component_readme/generate_readme.py
+#       files: ^components/.*/fondant_component.yaml
+#       additional_dependencies: ["fondant"]
\ No newline at end of file
diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml
index 4e329e79e..fdd3e7f83 100644
--- a/components/download_images/fondant_component.yaml
+++ b/components/download_images/fondant_component.yaml
@@ -54,7 +54,7 @@ args:
   resize_only_if_bigger:
     description: If True, resize only if image is bigger than image_size.
     type: bool
-    default: 'False'
+    default: False
   min_image_size:
     description: Minimum size of the images.
     type: int
@@ -62,4 +62,4 @@ args:
   max_aspect_ratio:
     description: Maximum aspect ratio of the images.
     type: float
-    default: 'inf'
\ No newline at end of file
+    default: 99.9
\ No newline at end of file
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 532b77d25..64090a6f2 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -19,11 +19,11 @@ args:
     description: Optional argument, a list containing the original image column names in case the
       dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
     type: list
-    default: None
+    default: []
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load.
       Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index 25fd6f989..e49e73e46 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -1,6 +1,5 @@
 """This component loads a seed dataset from the hub."""
 import logging
-import typing as t
 
 import dask
 import dask.dataframe as dd
@@ -20,9 +19,9 @@ def __init__(self,
         *_,
         dataset_name: str,
         column_name_mapping: dict,
-        image_column_names: t.Optional[list],
-        n_rows_to_load: t.Optional[int],
-        index_column: t.Optional[str],
+        image_column_names: list,
+        n_rows_to_load: int,
+        index_column: str,
     ) -> None:
         """
         Args:
@@ -60,7 +59,7 @@ def load(self) -> dd.DataFrame:
         dask_df = dask_df.rename(columns=self.column_name_mapping)
 
         # 4) Optional: only return specific amount of rows
-        if self.n_rows_to_load is not None:
+        if self.n_rows_to_load > 0:
             partitions_length = 0
             npartitions = 1
             for npartitions, partition in enumerate(dask_df.partitions, start=1):
@@ -73,7 +72,7 @@ def load(self) -> dd.DataFrame:
             dask_df = dd.from_pandas(dask_df, npartitions=npartitions)
 
         # 4) Set the index
-        if self.index_column is None:
+        if self.index_column == "None":
             logger.info(
                 "Index column not specified, setting a globally unique index",
             )
diff --git a/components/load_from_parquet/fondant_component.yaml b/components/load_from_parquet/fondant_component.yaml
index 73606b090..9f128a1cb 100644
--- a/components/load_from_parquet/fondant_component.yaml
+++ b/components/load_from_parquet/fondant_component.yaml
@@ -15,11 +15,11 @@ args:
   column_name_mapping:
     description: Mapping of the consumed dataset
    type: dict
-    default: None
+    default: {}
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load.
       Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
diff --git a/components/load_from_parquet/src/main.py b/components/load_from_parquet/src/main.py
index 429924bac..84d68c73c 100644
--- a/components/load_from_parquet/src/main.py
+++ b/components/load_from_parquet/src/main.py
@@ -19,8 +19,8 @@ def __init__(self,
         spec: ComponentSpec,
         *_,
         dataset_uri: str,
-        column_name_mapping: t.Optional[dict],
-        n_rows_to_load: t.Optional[int],
+        column_name_mapping: dict,
+        n_rows_to_load: int,
         index_column: t.Optional[str],
     ) -> None:
         """
@@ -45,12 +45,12 @@ def load(self) -> dd.DataFrame:
         dask_df = dd.read_parquet(self.dataset_uri)
 
         # 2) Rename columns
-        if self.column_name_mapping is not None:
+        if self.column_name_mapping:
             logger.info("Renaming columns...")
             dask_df = dask_df.rename(columns=self.column_name_mapping)
 
         # 3) Optional: only return specific amount of rows
-        if self.n_rows_to_load is not None:
+        if self.n_rows_to_load > 0:
             partitions_length = 0
             npartitions = 1
             for npartitions, partition in enumerate(dask_df.partitions, start=1):
diff --git a/components/segment_images/fondant_component.yaml b/components/segment_images/fondant_component.yaml
index f0f73a7f1..8f32d14f6 100644
--- a/components/segment_images/fondant_component.yaml
+++ b/components/segment_images/fondant_component.yaml
@@ -22,4 +22,4 @@ args:
   batch_size:
     description: batch size to use
     type: int
-    batch_size: 8
\ No newline at end of file
+    default: 8
\ No newline at end of file
diff --git a/components/write_to_hf_hub/fondant_component.yaml b/components/write_to_hf_hub/fondant_component.yaml
index 88be6331c..59c69a093 100644
--- a/components/write_to_hf_hub/fondant_component.yaml
+++ b/components/write_to_hf_hub/fondant_component.yaml
@@ -21,8 +21,8 @@ args:
   image_column_names:
     description: A list containing the image column names. Used to format to image to HF hub format
     type: list
-    default: None
+    default: []
   column_name_mapping:
     description: Mapping of the consumed fondant column names to the written hub column names
     type: dict
-    default: None
\ No newline at end of file
+    default: {}
\ No newline at end of file
diff --git a/components/write_to_hf_hub/src/main.py b/components/write_to_hf_hub/src/main.py
index 772b04648..022ff7802 100644
--- a/components/write_to_hf_hub/src/main.py
+++ b/components/write_to_hf_hub/src/main.py
@@ -39,8 +39,8 @@ def __init__(self,
         hf_token: str,
         username: str,
         dataset_name: str,
-        image_column_names: t.Optional[list],
-        column_name_mapping: t.Optional[dict],
+        image_column_names: list,
+        column_name_mapping: dict,
     ):
         """
         Args:
@@ -87,7 +87,7 @@ def write(
         # Map image column to hf data format
         feature_encoder = datasets.Image(decode=True)
 
-        if self.image_column_names is not None:
+        if self.image_column_names:
             for image_column_name in self.image_column_names:
                 dataframe[image_column_name] = dataframe[image_column_name].map(
                     lambda x: convert_bytes_to_image(x, feature_encoder),
diff --git a/docs/components/component_spec.md b/docs/components/component_spec.md
index 8769e0f06..c1f222dd0 100644
--- a/docs/components/component_spec.md
+++ b/docs/components/component_spec.md
@@ -127,9 +127,6 @@ The `args` section describes which arguments the component takes. Each argument
 `description` and a `type`, which should be one of the builtin Python types. Additionally, you can
 set an optional `default` value for each argument.
 
-_Note:_ default iterable arguments such as `dict` and `list` have to be passed as a string
-(e.g. `'{"foo":1, "bar":2}`, `'["foo","bar]'`)
-
 ```yaml
 args:
   custom_argument:
diff --git a/docs/pipeline.md b/docs/pipeline.md
index 5367fb544..cae389326 100644
--- a/docs/pipeline.md
+++ b/docs/pipeline.md
@@ -30,7 +30,8 @@ def build_pipeline():
             "batch_size": 2,
             "max_new_tokens": 50,
         },
-        number_of_gpus=1,
+        number_of_accelerators=1,
+        accelerator_name="GPU",
         node_pool_label="node_pool",
         node_pool_name="model-inference-pool",
     )
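The `number_of_gpus` → `number_of_accelerators`/`accelerator_name` rename above is the user-facing API change of this PR. A minimal sketch of a pipeline definition against the new interface (the registry component name and paths are illustrative, not taken from this diff):

```python
from fondant.pipeline import ComponentOp, Pipeline

pipeline = Pipeline(
    pipeline_name="example-pipeline",      # hypothetical name
    base_path="gs://my-bucket/artifacts",  # hypothetical artifact location
)

# Both accelerator fields must be set together; ComponentOp validates this.
caption_op = ComponentOp.from_registry(
    name="caption_images",  # illustrative registry component
    arguments={"batch_size": 2, "max_new_tokens": 50},
    number_of_accelerators=1,
    accelerator_name="GPU",  # "GPU"/"TPU" on GKE; Vertex expects machine-specific names
    node_pool_label="node_pool",
    node_pool_name="model-inference-pool",
)
pipeline.add_op(caption_op)
```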
diff --git a/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml b/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml
index 01c8e38f4..175edf2ea 100644
--- a/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml
+++ b/examples/pipelines/commoncrawl/components/extract_images_from_warc/fondant_component.yaml
@@ -26,4 +26,4 @@ args:
   extract_plain_text:
     description: If set to true the data contains the plain text without html tags
     type: bool
-    default: "False"
\ No newline at end of file
+    default: False
\ No newline at end of file
diff --git a/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml b/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml
index 229afb05e..8b774da57 100644
--- a/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml
+++ b/examples/pipelines/commoncrawl/components/read_warc_paths/fondant_component.yaml
@@ -15,4 +15,4 @@ args:
   n_records_to_download:
     description: Number of records to download
     type: int
-    default: None
\ No newline at end of file
+    default: -1
\ No newline at end of file
diff --git a/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py b/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py
index 7c642fba3..094c9ce3e 100644
--- a/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py
+++ b/examples/pipelines/commoncrawl/components/read_warc_paths/src/main.py
@@ -18,7 +18,7 @@ def __init__(
         self,
         *_,
         common_crawl_indices: t.List[str],
-        n_records_to_download: t.Optional[int] = None,
+        n_records_to_download: int,
     ):
         self.index_urls = [
             self.build_index_url(index_name) for index_name in common_crawl_indices
@@ -38,7 +38,7 @@ def load(self) -> dd.DataFrame:
                 warc_urls.extend([line.decode() for line in extracted.split(b"\n")])
 
         df = pd.DataFrame(warc_urls, columns=["warc_url"])
-        if self.n_records_to_download is not None:
+        if self.n_records_to_download > 0:
             df = df.head(self.n_records_to_download)
 
         return dd.from_pandas(df, npartitions=len(df) // 100)
diff --git a/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml b/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml
index b47ccf119..b98226870 100644
--- a/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml
+++ b/examples/pipelines/controlnet-interior-design/components/generate_prompts/fondant_component.yaml
@@ -12,4 +12,4 @@ args:
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load.
       Useful for testing pipeline runs on a small scale
     type: int
-    default: None
\ No newline at end of file
+    default: -1
\ No newline at end of file
diff --git a/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py b/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py
index 9d58a287a..fff2ef46c 100644
--- a/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py
+++ b/examples/pipelines/controlnet-interior-design/components/generate_prompts/src/main.py
@@ -114,7 +114,7 @@ def load(self) -> dd.DataFrame:
 
         pandas_df = pd.DataFrame(prompts, columns=["prompts_text"])
 
-        if self.n_rows_to_load:
+        if self.n_rows_to_load > 0:
             pandas_df = pandas_df.head(self.n_rows_to_load)
 
         df = dd.from_pandas(pandas_df, npartitions=1)
diff --git a/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml b/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml
index 4915810f0..62a7c8209 100644
--- a/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml
+++ b/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml
@@ -31,8 +31,8 @@ args:
   image_column_names:
     description: A list containing the image column names. Used to format to image to HF hub format
     type: list
-    default: None
+    default: []
   column_name_mapping:
     description: Mapping of the consumed fondant column names to the written hub column names
     type: dict
-    default: None
\ No newline at end of file
+    default: {}
\ No newline at end of file
diff --git a/examples/pipelines/controlnet-interior-design/pipeline.py b/examples/pipelines/controlnet-interior-design/pipeline.py
index f176cf5fc..433036a59 100644
--- a/examples/pipelines/controlnet-interior-design/pipeline.py
+++ b/examples/pipelines/controlnet-interior-design/pipeline.py
@@ -45,7 +45,8 @@
         "batch_size": 2,
         "max_new_tokens": 50,
     },
-    number_of_gpus=1,
+    number_of_accelerators=1,
+    accelerator_name="GPU",
 )
 segment_images_op = ComponentOp.from_registry(
     name="segment_images",
@@ -53,7 +54,8 @@
         "model_id": "openmmlab/upernet-convnext-small",
         "batch_size": 2,
     },
-    number_of_gpus=1,
+    number_of_accelerators=1,
+    accelerator_name="GPU",
 )
 
 write_to_hub_controlnet = ComponentOp(
diff --git a/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml b/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml
index c91b2d7a3..eb973b865 100644
--- a/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/add_clip_score/fondant_component.yaml
@@ -16,7 +16,6 @@ consumes:
       items:
         type: float32
 
-
 produces:
   imagetext:
     fields:
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index ae646ea54..50f983acd 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -50,11 +50,11 @@ args:
     description: Optional argument, a list containing the original image column names in case the
       dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
     type: list
-    default: None
+    default: []
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load.
       Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index cc384ffd2..0846c2451 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -19,7 +19,6 @@
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
     base_path=PipelineConfigs.BASE_PATH,
-    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 
 # define ops
@@ -41,70 +40,64 @@
         "dataset_name": "nielsr/datacomp-small-with-text-embeddings",
         "column_name_mapping": load_component_column_mapping,
         "index_column": "uid",
-        # "n_rows_to_load": 1000,
+        "n_rows_to_load": 1000,
+    },
+    node_pool_label="node_pool",
+    node_pool_name="n2-standard-64-pool",
+)
+download_images_op = ComponentOp.from_registry(
+    name="download_images",
+    arguments={
+        "retries": 2,
+        "min_image_size": 0,
+    },
+    node_pool_label="node_pool",
+    node_pool_name="n2-standard-64-pool",
+)
+detect_text_op = ComponentOp(
+    component_dir="components/detect_text",
+    arguments={
+        "batch_size": 2,
+    },
+    node_pool_label="node_pool",
+    node_pool_name="model-inference-mega-pool",
+    number_of_accelerators=1,
+    accelerator_name="GPU",
+)
+mask_images_op = ComponentOp(
+    component_dir="components/mask_images",
+    node_pool_label="node_pool",
+    node_pool_name="n2-standard-64-pool",
+)
+embed_images_op = ComponentOp.from_registry(
+    name="embed_images",
+    arguments={
+        "batch_size": 2,
+    },
+    node_pool_label="node_pool",
+    node_pool_name="model-inference-pool",
+    number_of_accelerators=1,
+    accelerator_name="GPU",
+)
+add_clip_score_op = ComponentOp(
+    component_dir="components/add_clip_score",
+    node_pool_label="node_pool",
+    node_pool_name="n2-standard-64-pool",
+)
+filter_clip_score_op = ComponentOp(
+    component_dir="components/filter_clip_score",
+    arguments={
+        "pct_threshold": 0.5,
     },
     node_pool_label="node_pool",
     node_pool_name="n2-standard-64-pool",
 )
-# download_images_op = ComponentOp.from_registry(
-#     name="download_images",
-#     arguments={
-#         "retries": 2,
-#         "min_image_size": 0,
-#         "max_aspect_ratio": float("inf"),
-#     },
-#     node_pool_label="node_pool",
-#     node_pool_name="n2-standard-64-pool",
-#     input_partition_rows=1000,
-#     cache=False,
-# )
-# detect_text_op = ComponentOp(
-#     component_dir="components/detect_text",
-#     arguments={
-#         "batch_size": 2,
-#     },
-#     node_pool_label="node_pool",
-#     node_pool_name="model-inference-mega-pool",
-#     number_of_gpus=1,
-#     cache=False,
-# )
-# mask_images_op = ComponentOp(
-#     component_dir="components/mask_images",
-#     node_pool_label="node_pool",
-#     node_pool_name="n2-standard-64-pool",
-#     cache=False,
-# )
-# embed_images_op = ComponentOp.from_registry(
-#     name="embed_images",
-#     arguments={
-#         "batch_size": 2,
-#     },
-#     node_pool_label="node_pool",
-#     node_pool_name="model-inference-mega-pool",
-#     number_of_gpus=1,
-#     cache=False,
-# )
-# add_clip_score_op = ComponentOp(
-#     component_dir="components/add_clip_score",
-#     node_pool_label="node_pool",
-#     node_pool_name="n2-standard-64-pool",
-#     cache=False,
-# )
-# filter_clip_score_op = ComponentOp(
-#     component_dir="components/filter_clip_score",
-#     arguments={
-#         "pct_threshold": 0.5,
-#     },
-#     node_pool_label="node_pool",
-#     node_pool_name="n2-standard-64-pool",
-# )
-
 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-# pipeline.add_op(download_images_op, dependencies=load_from_hub_op)
+pipeline.add_op(download_images_op, dependencies=load_from_hub_op)
 # pipeline.add_op(detect_text_op, dependencies=download_images_op)
 # pipeline.add_op(mask_images_op, dependencies=detect_text_op)
-# pipeline.add_op(embed_images_op, dependencies=mask_images_op)
+pipeline.add_op(embed_images_op, dependencies=download_images_op)
 # pipeline.add_op(add_clip_score_op, dependencies=embed_images_op)
 # pipeline.add_op(filter_clip_score_op, dependencies=add_clip_score_op)
diff --git a/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml
index a53eab85e..fda34b610 100644
--- a/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/filter-cc-25m/components/load_from_hf_hub/fondant_component.yaml
@@ -31,11 +31,11 @@ args:
     description: Optional argument, a list containing the original image column names in case the
       dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
     type: list
-    default: None
+    default: []
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load.
       Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
diff --git a/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml
index f79232360..aa92302e5 100644
--- a/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/finetune_stable_diffusion/components/load_from_hf_hub/fondant_component.yaml
@@ -24,11 +24,11 @@ args:
     description: Optional argument, a list containing the original image column names in case the
       dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
     type: list
-    default: None
+    default: []
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load.
       Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
diff --git a/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml b/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml
index 9d94e6aa4..4e7119f2c 100644
--- a/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/finetune_stable_diffusion/components/write_to_hf_hub/fondant_component.yaml
@@ -26,8 +26,8 @@ args:
   image_column_names:
     description: A list containing the image column names.
       Used to format to image to HF hub format
     type: list
-    default: None
+    default: []
   column_name_mapping:
     description: Mapping of the consumed fondant column names to the written hub column names
     type: dict
-    default: None
\ No newline at end of file
+    default: {}
\ No newline at end of file
diff --git a/examples/pipelines/finetune_stable_diffusion/pipeline.py b/examples/pipelines/finetune_stable_diffusion/pipeline.py
index 1e598121a..d3654f8e8 100644
--- a/examples/pipelines/finetune_stable_diffusion/pipeline.py
+++ b/examples/pipelines/finetune_stable_diffusion/pipeline.py
@@ -69,7 +69,8 @@
         "batch_size": 2,
         "max_new_tokens": 50,
     },
-    number_of_gpus=1,
+    number_of_accelerators=1,
+    accelerator_name="GPU",
 )
 
 write_to_hub = ComponentOp(
@@ -80,7 +81,8 @@
         "hf_token": "hf_token",
         "image_column_names": ["images_data"],
     },
-    number_of_gpus=1,
+    number_of_accelerators=1,
+    accelerator_name="GPU",
 )
 
 pipeline = Pipeline(
diff --git a/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml b/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml
index 288314b79..379d12f0c 100644
--- a/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml
+++ b/examples/pipelines/starcoder/components/load_from_hub/fondant_component.yaml
@@ -33,11 +33,11 @@ args:
     description: A list containing the original hub image column names. Used to format
       the image from HF hub format to a byte string
     type: list
-    default: None
+    default: []
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load.
       Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
diff --git a/pyproject.toml b/pyproject.toml
index 49ac38e42..e3bc81e92 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ classifiers = [
 ]
 
 [tool.poetry.dependencies]
-python = ">= 3.8"
+python = ">= 3.8, <3.11"
 dask = {extras = ["dataframe", "distributed", "diagnostics"], version = ">= 2023.4.1"}
 importlib-resources = { version = ">= 1.3", python = "<3.9" }
 jsonschema = ">= 4.18"
@@ -51,14 +51,16 @@
 fsspec = { version = ">= 2023.4.0", optional = true}
 gcsfs = { version = ">= 2023.4.0", optional = true }
 s3fs = { version = ">= 2023.4.0", optional = true }
 adlfs = { version = ">= 2023.4.0", optional = true }
-kfp = { version = ">= 1.8.19, < 2", optional = true }
+kfp = { version = "2.3.0", optional = true, extras = ["kubernetes"] }
 pandas = { version = ">= 1.3.5", optional = true }
+google-cloud-aiplatform = { version = "1.34.0", optional = true}
 
 [tool.poetry.extras]
 aws = ["fsspec", "s3fs"]
 azure = ["fsspec", "adlfs"]
 gcp = ["fsspec", "gcsfs"]
 kfp = ["kfp"]
+vertex = ["kfp", "google-cloud-aiplatform"]
 
 [tool.poetry.group.test.dependencies]
 pre-commit = "^3.1.1"
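A pattern worth calling out in the yaml changes above: kfp v2 passes typed parameters rather than strings, and it has no representation for Python `None`, so every `default: None` becomes a typed sentinel (`-1` for ints, `[]`/`{}` for iterables) and the components now test for the sentinel instead of `None`. A small sketch of the convention, not code from the repo:

```python
import typing as t


def resolve_n_rows(n_rows_to_load: int) -> t.Optional[int]:
    """-1 (or any non-positive value) is the sentinel for "load everything",
    mirroring the `default: -1` entries in the component specs above."""
    return n_rows_to_load if n_rows_to_load > 0 else None
```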
"devices": [ - { - "driver": "nvidia", - "count": component_op.number_of_gpus, - "capabilities": ["gpu"], - }, - ], - }, - }, - } + self._set_configuration(services, component_op, component_name) if component_op.dockerfile_path is not None: logger.info( @@ -225,6 +217,40 @@ def _generate_spec( "services": services, } + @staticmethod + def _set_configuration(services, fondant_component_operation, component_name): + accelerator_name = fondant_component_operation.accelerator_name + accelerator_number = fondant_component_operation.number_of_accelerators + + if accelerator_name is not None: + if accelerator_name not in valid_accelerator_types: + msg = ( + f"Configured accelerator `{accelerator_name}`" + f" is not a valid accelerator type for Docker Compose compiler." + f" Available options: {valid_vertex_accelerator_types}" + ) + raise InvalidPipelineDefinition(msg) + + if accelerator_name == "GPU": + services[component_name]["deploy"] = { + "resources": { + "reservations": { + "devices": [ + { + "driver": "nvidia", + "count": accelerator_number, + "capabilities": ["gpu"], + }, + ], + }, + }, + } + elif accelerator_name == "TPU": + msg = "TPU configuration is not yet implemented for Docker Compose " + raise NotImplementedError(msg) + + return services + class KubeFlowCompiler(Compiler): """Compiler that creates a Kubeflow pipeline spec from a pipeline.""" @@ -236,10 +262,11 @@ def _resolve_imports(self): """Resolve imports for the Kubeflow compiler.""" try: import kfp - import kfp.gcp as kfp_gcp + import kfp.kubernetes as kfp_kubernetes self.kfp = kfp - self.kfp_gcp = kfp_gcp + self.kfp_kubernetes = kfp_kubernetes + except ImportError: msg = """You need to install kfp to use the Kubeflow compiler,\n you can install it with `pip install fondant[kfp]`""" @@ -259,15 +286,27 @@ def compile( output_path: the path where to save the Kubeflow pipeline spec """ run_id = pipeline.get_run_id() + pipeline.validate(run_id=run_id) + logger.info(f"Compiling {pipeline.name} to {output_path}") @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) def kfp_pipeline(): previous_component_task = None - manifest_path = "" component_cache_key = None for component_name, component in pipeline._graph.items(): + logger.info(f"Compiling service for {component_name}") + component_op = component["fondant_component_op"] + # convert ComponentOp to Kubeflow component + kubeflow_component_op = self.kfp.components.load_component_from_text( + text=component_op.component_spec.kubeflow_specification.to_string(), + ) + + # Remove None values from arguments + component_args = { + k: v for k, v in component_op.arguments.items() if v is not None + } component_cache_key = component_op.get_component_cache_key( previous_component_cache=component_cache_key, @@ -280,54 +319,55 @@ def kfp_pipeline(): cache_key=component_cache_key, ) - logger.info(f"Compiling service for {component_name}") - - # convert ComponentOp to Kubeflow component - kubeflow_component_op = self.kfp.components.load_component( - text=component_op.component_spec.kubeflow_specification.to_string(), + output_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{component_name}/manifest.json" ) + # Set the execution order of the component task to be after the previous + # component task. 
+ if component["dependencies"]: + for dependency in component["dependencies"]: + input_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{dependency}/manifest.json" + ) + component_task = kubeflow_component_op( + input_manifest_path=input_manifest_path, + output_manifest_path=output_manifest_path, + metadata=metadata.to_json(), + **component_args, + ) + component_task.after(previous_component_task) - # Execute the Kubeflow component and pass in the output manifest path from - # the previous component. - component_args = component_op.arguments + else: + component_task = kubeflow_component_op( + metadata=metadata.to_json(), + output_manifest_path=output_manifest_path, + **component_args, + ) - component_task = kubeflow_component_op( - input_manifest_path=manifest_path, - metadata=metadata.to_json(), - **component_args, - ) - # Set optional configurations + # Set optional arguments component_task = self._set_configuration( component_task, component_op, ) - # Set image pull policy to always - component_task.container.set_image_pull_policy("Always") - - # Set the execution order of the component task to be after the previous - # component task. - if previous_component_task is not None: - component_task.after(previous_component_task) - - # Update the manifest path to be the output path of the current component task. - manifest_path = component_task.outputs["output_manifest_path"] + # Disable caching + component_task.set_caching_options(enable_caching=False) previous_component_task = component_task - self.pipeline = pipeline - self.pipeline.validate(run_id=run_id) - logger.info(f"Compiling {self.pipeline.name} to {output_path}") + logger.info(f"Compiling {pipeline.name} to {output_path}") self.kfp.compiler.Compiler().compile(kfp_pipeline, output_path) # type: ignore logger.info("Pipeline compiled successfully") def _set_configuration(self, task, fondant_component_operation): # Unpack optional specifications - number_of_gpus = fondant_component_operation.number_of_gpus + number_of_accelerators = fondant_component_operation.number_of_accelerators + accelerator_name = fondant_component_operation.accelerator_name node_pool_label = fondant_component_operation.node_pool_label node_pool_name = fondant_component_operation.node_pool_name - preemptible = fondant_component_operation.preemptible memory_request = fondant_component_operation.memory_request memory_limit = fondant_component_operation.memory_limit @@ -336,17 +376,147 @@ def _set_configuration(self, task, fondant_component_operation): task.set_memory_request(memory_request) if memory_limit is not None: task.set_memory_limit(memory_limit) - if number_of_gpus is not None: - task.set_gpu_limit(number_of_gpus) + if accelerator_name is not None: + if accelerator_name not in valid_accelerator_types: + msg = ( + f"Configured accelerator `{accelerator_name}` is not a valid accelerator type" + f"for Kubeflow compiler. Available options: {valid_accelerator_types}" + ) + raise InvalidPipelineDefinition(msg) + + task.set_accelerator_limit(number_of_accelerators) + if accelerator_name == "GPU": + task.set_accelerator_type("nvidia.com/gpu") + elif accelerator_name == "TPU": + task.set_accelerator_type("cloud-tpus.google.com/v3") if node_pool_name is not None and node_pool_label is not None: - task.add_node_selector_constraint(node_pool_label, node_pool_name) - if preemptible is True: - logger.warning( - f"Preemptible VM enabled on component `{fondant_component_operation.name}`. 
Please" - f" note that Preemptible nodepools only works on clusters setup on GCP and " - f"with nodepools pre-configured with preemptible VMs. More info here:" - f" https://v1-6-branch.kubeflow.org/docs/distributions/gke/pipelines/preemptible/", + task = self.kfp_kubernetes.add_node_selector( + task, + node_pool_label, + node_pool_name, + ) + return task + + +class VertexCompiler(Compiler): + def __init__(self): + self.resolve_imports() + + def resolve_imports(self): + """Resolve imports for the Vertex compiler.""" + try: + import kfp + + self.kfp = kfp + + except ImportError: + msg = """You need to install kfp to use the Vertex compiler,\n + you can install it with `pip install fondant[vertex]`""" + raise ImportError( + msg, ) - task.apply(self.kfp_gcp.use_preemptible_nodepool()) + + def compile( + self, + pipeline: Pipeline, + output_path: str = "vertex_pipeline.yml", + ) -> None: + """Compile a pipeline to vertex pipeline spec and save it to a specified output path. + + Args: + pipeline: the pipeline to compile + output_path: the path where to save the Kubeflow pipeline spec + """ + run_id = pipeline.get_run_id() + pipeline.validate(run_id=run_id) + logger.info(f"Compiling {pipeline.name} to {output_path}") + + @self.kfp.dsl.pipeline(name=pipeline.name, description=pipeline.description) + def kfp_pipeline(): + previous_component_task = None + component_cache_key = None + + for component_name, component in pipeline._graph.items(): + logger.info(f"Compiling service for {component_name}") + + component_op = component["fondant_component_op"] + # convert ComponentOp to Kubeflow component + kubeflow_component_op = self.kfp.components.load_component_from_text( + text=component_op.component_spec.kubeflow_specification.to_string(), + ) + + # Remove None values from arguments + component_args = { + k: v for k, v in component_op.arguments.items() if v is not None + } + component_cache_key = component_op.get_component_cache_key( + previous_component_cache=component_cache_key, + ) + metadata = Metadata( + pipeline_name=pipeline.name, + run_id=run_id, + base_path=pipeline.base_path, + component_id=component_name, + cache_key=component_cache_key, + ) + + output_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{component_name}/manifest.json" + ) + # Set the execution order of the component task to be after the previous + # component task. 
+ if component["dependencies"]: + for dependency in component["dependencies"]: + input_manifest_path = ( + f"{pipeline.base_path}/{pipeline.name}/" + f"{run_id}/{dependency}/manifest.json" + ) + component_task = kubeflow_component_op( + input_manifest_path=input_manifest_path, + output_manifest_path=output_manifest_path, + metadata=metadata.to_json(), + **component_args, + ) + component_task.after(previous_component_task) + + else: + component_task = kubeflow_component_op( + metadata=metadata.to_json(), + output_manifest_path=output_manifest_path, + **component_args, + ) + + # Set optional arguments + component_task = self._set_configuration( + component_task, + component_op, + ) + + # Disable caching + component_task.set_caching_options(enable_caching=False) + + previous_component_task = component_task + + self.kfp.compiler.Compiler().compile(kfp_pipeline, output_path) # type: ignore + logger.info("Pipeline compiled successfully") + + @staticmethod + def _set_configuration(task, fondant_component_operation): + # Unpack optional specifications + number_of_accelerators = fondant_component_operation.number_of_accelerators + accelerator_name = fondant_component_operation.accelerator_name + + # Assign optional specification + if number_of_accelerators is not None: + task.set_accelerator_limit(number_of_accelerators) + if accelerator_name not in valid_vertex_accelerator_types: + msg = ( + f"Configured accelerator `{accelerator_name}` is not a valid accelerator type" + f"for Vertex compiler. Available options: {valid_vertex_accelerator_types}" + ) + raise InvalidPipelineDefinition(msg) + + task.set_accelerator_type(accelerator_name) return task diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py index a71762a87..3698ae883 100644 --- a/src/fondant/component_spec.py +++ b/src/fondant/component_spec.py @@ -1,8 +1,8 @@ """This module defines classes to represent an Fondant component specification.""" -import ast import copy import json import pkgutil +import re import types import typing as t from dataclasses import dataclass @@ -17,50 +17,49 @@ from fondant.exceptions import InvalidComponentSpec from fondant.schema import Field, KubeflowCommandArguments, Type -# TODO: remove after upgrading to kfpv2 -kubeflow_to_python_type_dict = { - "String": str, - "Integer": int, - "Float": float, - "Boolean": ast.literal_eval, - "JsonObject": json.loads, - "JsonArray": json.loads, -} - - -def kubeflow2python_type(type_: str) -> t.Any: - map_fn = kubeflow_to_python_type_dict[type_] - return lambda value: map_fn(value) if value != "None" else None # type: ignore - - -# TODO: Change after upgrading to kfp v2 -# :https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/ -python2kubeflow_type = { - "str": "String", - "int": "Integer", - "float": "Float", - "bool": "Boolean", - "dict": "JsonObject", - "list": "JsonArray", -} - @dataclass class Argument: """ - Kubeflow component argument. + Component argument. Args: name: name of the argument description: argument description - type: the python argument type (str, int, ...) + type: the python argument type in str format (str, int, ...) 
diff --git a/src/fondant/component_spec.py b/src/fondant/component_spec.py
index a71762a87..3698ae883 100644
--- a/src/fondant/component_spec.py
+++ b/src/fondant/component_spec.py
@@ -1,8 +1,8 @@
 """This module defines classes to represent an Fondant component specification."""
-import ast
 import copy
 import json
 import pkgutil
+import re
 import types
 import typing as t
 from dataclasses import dataclass
@@ -17,50 +17,49 @@
 from fondant.exceptions import InvalidComponentSpec
 from fondant.schema import Field, KubeflowCommandArguments, Type
 
-# TODO: remove after upgrading to kfpv2
-kubeflow_to_python_type_dict = {
-    "String": str,
-    "Integer": int,
-    "Float": float,
-    "Boolean": ast.literal_eval,
-    "JsonObject": json.loads,
-    "JsonArray": json.loads,
-}
-
-
-def kubeflow2python_type(type_: str) -> t.Any:
-    map_fn = kubeflow_to_python_type_dict[type_]
-    return lambda value: map_fn(value) if value != "None" else None  # type: ignore
-
-
-# TODO: Change after upgrading to kfp v2
-# :https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/
-python2kubeflow_type = {
-    "str": "String",
-    "int": "Integer",
-    "float": "Float",
-    "bool": "Boolean",
-    "dict": "JsonObject",
-    "list": "JsonArray",
-}
-
 
 @dataclass
 class Argument:
     """
-    Kubeflow component argument.
+    Component argument.
 
     Args:
         name: name of the argument
        description: argument description
-        type: the python argument type (str, int, ...)
+        type: the python argument type in str format (str, int, ...)
         default: default value of the argument (defaults to None)
+        optional: whether an argument is optional or not (defaults to False)
     """
 
     name: str
     description: str
     type: str
-    default: t.Optional[str] = None
+    default: t.Any = None
+    optional: t.Optional[bool] = False
+
+    @property
+    def python_type(self) -> t.Any:
+        lookup = {
+            "str": str,
+            "int": int,
+            "float": float,
+            "bool": bool,
+            "dict": json.loads,
+            "list": json.loads,
+        }
+        return lookup[self.type]
+
+    @property
+    def kubeflow_type(self) -> str:
+        lookup = {
+            "str": "STRING",
+            "int": "NUMBER_INTEGER",
+            "float": "NUMBER_DOUBLE",
+            "bool": "BOOLEAN",
+            "dict": "STRUCT",
+            "list": "LIST",
+        }
+        return lookup[self.type]
 
 
 class ComponentSubset:
@@ -191,21 +190,72 @@ def outputs_additional_subsets(self) -> bool:
         return self._specification.get("produces", {}).get("additionalSubsets", True)
 
     @property
-    def args(self) -> t.Dict[str, Argument]:
-        return {
-            name: Argument(
-                name=name,
-                description=arg_info["description"],
-                type=arg_info["type"],
-                default=arg_info["default"] if "default" in arg_info else None,
-            )
-            for name, arg_info in self._specification.get("args", {}).items()
-        }
+    def args(self) -> t.Mapping[str, Argument]:
+        args = self.default_arguments
+        args.update(
+            {
+                name: Argument(
+                    name=name,
+                    description=arg_info["description"],
+                    type=arg_info["type"],
+                    default=arg_info["default"] if "default" in arg_info else None,
+                    optional=arg_info.get("default") == "None",
+                )
+                for name, arg_info in self._specification.get("args", {}).items()
+            },
+        )
+        return types.MappingProxyType(args)
 
     @property
     def specification(self) -> t.Dict[str, t.Any]:
         return copy.deepcopy(self._specification)
 
+    @property
+    def default_arguments(self) -> t.Dict[str, Argument]:
+        """Add the default arguments of a fondant component."""
+        return {
+            "input_manifest_path": Argument(
+                name="input_manifest_path",
+                description="Path to the input manifest",
+                type="str",
+                optional=True,
+            ),
+            "component_spec": Argument(
+                name="component_spec",
+                description="The component specification as a dictionary",
+                type="dict",
+            ),
+            "input_partition_rows": Argument(
+                name="input_partition_rows",
+                description="The number of rows to load per partition. \
+                        Set to override the automatic partitioning",
+                type="int",
+                default=-1,
+            ),
+            "cache": Argument(
+                name="cache",
+                description="Set to False to disable caching, True by default.",
+                type="bool",
+                default=True,
+            ),
+            "cluster_type": Argument(
+                name="cluster_type",
+                description="The cluster type to use for the execution",
+                type="str",
+                default="default",
+            ),
+            "metadata": Argument(
+                name="metadata",
+                description="Metadata arguments containing the run id and base path",
+                type="str",
+            ),
+            "output_manifest_path": Argument(
+                name="output_manifest_path",
+                description="Path to the output manifest",
+                type="str",
+            ),
+        }
+
     @property
     def kubeflow_specification(self) -> "KubeflowComponentSpec":
         return KubeflowComponentSpec.from_fondant_component_spec(self)
@@ -230,101 +280,90 @@ class KubeflowComponentSpec:
     def __init__(self, specification: t.Dict[str, t.Any]) -> None:
         self._specification = specification
 
+    @staticmethod
+    def convert_arguments(fondant_component: ComponentSpec):
+        args = {}
+        for arg in fondant_component.args.values():
+            arg_type_dict = {}
+
+            if arg.optional or arg.default is not None:
+                arg_type_dict["isOptional"] = True
+            if arg.default is not None:
+                arg_type_dict["defaultValue"] = arg.default
+
+            args[arg.name] = {
+                "parameterType": arg.kubeflow_type,
+                "description": arg.description,
+                **arg_type_dict,  # type: ignore
+            }
+
+        return args
+
+    @staticmethod
+    def sanitize_component_name(name: str) -> str:
+        """Cleans and converts a name to be kfp V2 compatible.
+
+        Taken from https://github.com/kubeflow/pipelines/blob/
+        cfe671c485d4ee8514290ee81ca2785e8bda5c9b/sdk/python/kfp/dsl/utils.py#L52
+        """
+        return (
+            re.sub("-+", "-", re.sub("[^-0-9a-z]+", "-", name.lower()))
+            .lstrip("-")
+            .rstrip("-")
+        )
+
     @classmethod
-    def from_fondant_component_spec(
-        cls,
-        fondant_component: ComponentSpec,
-    ) -> "KubeflowComponentSpec":
-        """Create a Kubeflow component spec from a Fondant component spec."""
+    def from_fondant_component_spec(cls, fondant_component: ComponentSpec):
+        """Generate a Kubeflow component spec from a Fondant component spec."""
+        input_definitions = {
+            "parameters": {
+                **cls.convert_arguments(fondant_component),
+            },
+        }
+
+        cleaned_component_name = cls.sanitize_component_name(fondant_component.name)
+
         specification = {
-            "name": fondant_component.name,
-            "description": fondant_component.description,
-            "inputs": [
-                {
-                    "name": "input_manifest_path",
-                    "description": "Path to the input manifest",
-                    "type": "String",
-                },
-                {
-                    "name": "metadata",
-                    "description": "Metadata arguments containing the run id and base path",
-                    "type": "String",
-                },
-                {
-                    "name": "component_spec",
-                    "description": "The component specification as a dictionary",
-                    "type": "JsonObject",
-                    "default": "None",
-                },
-                {
-                    "name": "input_partition_rows",
-                    "description": "The number of rows to load per partition. Set to override the"
-                    " automatic partitioning",
-                    "type": "String",
-                    "default": "None",
-                },
-                {
-                    "name": "cache",
-                    "description": "Set to False to disable caching, True by default.",
-                    "type": "Boolean",
-                    "default": "True",
-                },
-                {
-                    "name": "cluster_type",
-                    "description": "The type of cluster to use for distributed execution",
-                    "type": "String",
-                    "default": "default",
-                },
-                {
-                    "name": "client_kwargs",
-                    "description": "Keyword arguments used to initialise the dask client",
-                    "type": "JsonObject",
-                    "default": "{}",
-                },
-                *(
-                    {
-                        "name": arg.name,
-                        "description": arg.description,
-                        "type": python2kubeflow_type[arg.type],
-                        **({"default": arg.default} if arg.default is not None else {}),
-                    }
-                    for arg in fondant_component.args.values()
-                ),
-            ],
-            "outputs": [
-                {
-                    "name": "output_manifest_path",
-                    "description": "Path to the output manifest",
-                    "type": "String",
-                },
-            ],
-            "implementation": {
-                "container": {
-                    "image": fondant_component.image,
-                    "command": [
-                        "fondant",
-                        "execute",
-                        "main",
-                        "--input_manifest_path",
-                        {"inputPath": "input_manifest_path"},
-                        "--metadata",
-                        {"inputValue": "metadata"},
-                        "--component_spec",
-                        {"inputValue": "component_spec"},
-                        "--input_partition_rows",
-                        {"inputValue": "input_partition_rows"},
-                        "--cache",
-                        {"inputValue": "cache"},
-                        *cls._dump_args(fondant_component.args.values()),
-                        "--output_manifest_path",
-                        {"outputPath": "output_manifest_path"},
-                        "--cluster_type",
-                        {"inputValue": "cluster_type"},
-                        "--client_kwargs",
-                        {"inputValue": "client_kwargs"},
-                    ],
+            "components": {
+                "comp-"
+                + cleaned_component_name: {
+                    "executorLabel": "exec-" + cleaned_component_name,
+                    "inputDefinitions": input_definitions,
                 },
             },
+            "deploymentSpec": {
+                "executors": {
+                    "exec-"
+                    + cleaned_component_name: {
+                        "container": {
+                            "args": cls._dump_args(fondant_component.args.values()),
+                            "command": ["fondant", "execute", "main"],
+                            "image": fondant_component.image,
+                        },
+                    },
+                },
+            },
+            "pipelineInfo": {"name": cleaned_component_name},
+            "root": {
+                "dag": {
+                    "tasks": {
+                        cleaned_component_name: {
+                            "cachingOptions": {"enableCache": True},
+                            "componentRef": {"name": "comp-" + cleaned_component_name},
+                            "inputs": {
+                                "parameters": {
+                                    param: {"componentInputParameter": param}
+                                    for param in input_definitions["parameters"]
+                                },
+                            },
+                            "taskInfo": {"name": cleaned_component_name},
+                        },
+                    },
+                },
+                "inputDefinitions": input_definitions,
+            },
+            "schemaVersion": "2.1.0",
+            "sdkVersion": "kfp-2.0.1",
         }
 
         return cls(specification)
@@ -337,7 +376,7 @@ def _dump_args(args: t.Iterable[Argument]) -> KubeflowCommandArguments:
             arg_name_cmd = f"--{arg_name}"
 
             dumped_args.append(arg_name_cmd)
-            dumped_args.append({"inputValue": arg_name})
+            dumped_args.append("{{$.inputs.parameters['" + f"{arg_name}" + "']}}")
 
         return dumped_args
 
@@ -356,34 +395,5 @@ def to_string(self) -> str:
         """Return the component specification as a string."""
         return json.dumps(self._specification)
 
-    @property
-    def input_arguments(self) -> t.Mapping[str, Argument]:
-        """The input arguments of the component as an immutable mapping."""
-        return types.MappingProxyType(
-            {
-                info["name"]: Argument(
-                    name=info["name"],
-                    description=info["description"],
-                    type=info["type"],
-                    default=info["default"] if "default" in info else None,
-                )
-                for info in self._specification["inputs"]
-            },
-        )
-
-    @property
-    def output_arguments(self) -> t.Mapping[str, Argument]:
-        """The output arguments of the component as an immutable mapping."""
-        return types.MappingProxyType(
-            {
-                info["name"]: Argument(
-                    name=info["name"],
-                    description=info["description"],
-                    type=info["type"],
-                )
-                for info in self._specification["outputs"]
-            },
-        )
-
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}({self._specification!r})"
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index e4ca43ab0..f5ce0e5eb 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -24,7 +24,7 @@ def __init__(
         *,
         manifest: Manifest,
         component_spec: ComponentSpec,
-        input_partition_rows: t.Optional[t.Union[int, str]] = None,
+        input_partition_rows: int = -1,
     ):
         super().__init__(manifest=manifest, component_spec=component_spec)
         self.input_partition_rows = input_partition_rows
@@ -38,46 +38,46 @@ def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame:
         """
         n_workers: int = os.cpu_count()  # type: ignore
 
-        if self.input_partition_rows != "disable":
-            if isinstance(self.input_partition_rows, int):
-                # Only load the index column to trigger a faster compute of the rows
-                total_rows = len(dataframe.index)
-                # +1 to handle any remainder rows
-                n_partitions = (total_rows // self.input_partition_rows) + 1
-                dataframe = dataframe.repartition(npartitions=n_partitions)
-                logger.info(
-                    f"Total number of rows is {total_rows}.\n"
-                    f"Repartitioning the data from {dataframe.partitions} partitions to have"
-                    f" {n_partitions} such that the number of partitions per row is approximately"
-                    f"{self.input_partition_rows}",
+        if self.input_partition_rows > 1:
+            # Only load the index column to trigger a faster compute of the rows
+            total_rows = len(dataframe.index)
+            # +1 to handle any remainder rows
+            n_partitions = (total_rows // self.input_partition_rows) + 1
+            dataframe = dataframe.repartition(npartitions=n_partitions)
+            logger.info(
+                f"Total number of rows is {total_rows}.\n"
+                f"Repartitioning the data from {dataframe.partitions} partitions to have"
+                f" {n_partitions} such that the number of partitions per row is approximately"
+                f" {self.input_partition_rows}",
+            )
+            if n_partitions < n_workers:
+                logger.warning(
+                    "Setting the `input partition rows` has caused the system to not utilize"
+                    f" all available workers {n_partitions} out of {n_workers} are used.",
                 )
-                if n_partitions < n_workers:
-                    logger.warning(
-                        "Setting the `input partition rows` has caused the system to not utilize"
-                        f" all available workers {n_partitions} out of {n_workers} are used.",
-                    )
-
-            elif self.input_partition_rows is None:
-                n_partitions = dataframe.npartitions
-                if n_partitions < n_workers:  # type: ignore
-                    logger.info(
-                        f"The number of partitions of the input dataframe is {n_partitions}. The "
-                        f"available number of workers is {n_workers}.",
-                    )
-                    dataframe = dataframe.repartition(npartitions=n_workers)
-                    logger.info(
-                        f"Repartitioning the data to {n_workers} partitions before processing"
-                        f" to maximize worker usage",
-                    )
-            else:
-                msg = (
-                    f"{self.input_partition_rows} is not a valid argument. Choose either "
-                    f"the number of partitions or set to 'disable' to disable automated "
-                    f"partitioning"
+
+        elif self.input_partition_rows == -1:
+            n_partitions = dataframe.npartitions
+            if n_partitions < n_workers:  # type: ignore
+                logger.info(
+                    f"The number of partitions of the input dataframe is {n_partitions}. The "
+                    f"available number of workers is {n_workers}.",
                 )
-                raise ValueError(
-                    msg,
+                dataframe = dataframe.repartition(npartitions=n_workers)
+                logger.info(
+                    f"Repartitioning the data to {n_workers} partitions before processing"
+                    f" to maximize worker usage",
                 )
+        else:
+            msg = (
+                f"{self.input_partition_rows} is not a valid value for the 'input_partition_rows' "
+                f"parameter. It should be a number larger than 0 to indicate the number of "
+                f"expected rows per partition, or '-1' to let Fondant optimize the number of "
+                f"partitions based on the number of available workers."
+            )
+            raise ValueError(
+                msg,
+            )
 
         return dataframe
diff --git a/src/fondant/executor.py b/src/fondant/executor.py
index 8126199d8..16941663b 100644
--- a/src/fondant/executor.py
+++ b/src/fondant/executor.py
@@ -5,12 +5,12 @@
 components take care of processing, filtering and extending the data.
 """
 import argparse
-import ast
 import json
 import logging
 import os
 import typing as t
 from abc import abstractmethod
+from distutils.util import strtobool
 from pathlib import Path
 
 import dask
@@ -26,10 +26,9 @@
     DaskWriteComponent,
     PandasTransformComponent,
 )
-from fondant.component_spec import Argument, ComponentSpec, kubeflow2python_type
+from fondant.component_spec import Argument, ComponentSpec
 from fondant.data_io import DaskDataLoader, DaskDataWriter
 from fondant.manifest import Manifest, Metadata
-from fondant.schema import validate_partition_number
 
 dask.config.set({"dataframe.convert-string": False})
 logger = logging.getLogger(__name__)
@@ -65,7 +64,7 @@ def __init__(
         output_manifest_path: t.Union[str, Path],
         metadata: t.Dict[str, t.Any],
         user_arguments: t.Dict[str, t.Any],
-        input_partition_rows: t.Optional[t.Union[str, int]] = None,
+        input_partition_rows: int,
         cluster_type: t.Optional[str] = None,
         client_kwargs: t.Optional[dict] = None,
     ) -> None:
@@ -110,8 +109,8 @@ def from_args(cls) -> "Executor":
         """Create an executor from a passed argument containing the specification as a dict."""
         parser = argparse.ArgumentParser()
         parser.add_argument("--component_spec", type=json.loads)
-        parser.add_argument("--cache", type=ast.literal_eval)
-        parser.add_argument("--input_partition_rows", type=validate_partition_number)
+        parser.add_argument("--cache", type=lambda x: bool(strtobool(x)))
+        parser.add_argument("--input_partition_rows", type=int)
         parser.add_argument("--cluster_type", type=str)
         parser.add_argument("--client_kwargs", type=json.loads)
         args, _ = parser.parse_known_args()
@@ -140,7 +139,7 @@ def from_spec(
         component_spec: ComponentSpec,
         *,
         cache: bool,
-        input_partition_rows: t.Optional[t.Union[str, int]],
+        input_partition_rows: int,
         cluster_type: t.Optional[str],
         client_kwargs: t.Optional[dict],
     ) -> "Executor":
@@ -188,22 +187,29 @@ def _add_and_parse_args(cls, spec: ComponentSpec):
             if arg.name in cls.optional_fondant_arguments():
                 input_required = False
                 default = None
-            elif arg.default is not None:
+            elif arg.default is not None and arg.optional is False:
                 input_required = False
                 default = arg.default
+            elif arg.default is not None and arg.optional is True:
+                input_required = False
+                default = None
             else:
                 input_required = True
                 default = None
 
             parser.add_argument(
                 f"--{arg.name}",
-                type=kubeflow2python_type(arg.type),  # type: ignore
+                type=arg.python_type,  # type: ignore
                 required=input_required,
                 default=default,
                 help=arg.description,
            )
 
         args, _ = parser.parse_known_args()
 
+        args.__dict__ = {
+            k: v if v != "None" else None for k, v in args.__dict__.items()
+        }
+
         return args
 
     @staticmethod
@@ -221,9 +227,7 @@ def _get_component_arguments(spec: ComponentSpec) -> t.Dict[str, Argument]:
             Input and output arguments of the component.
         """
         component_arguments: t.Dict[str, Argument] = {}
-        kubeflow_component_spec = spec.kubeflow_specification
-        component_arguments.update(kubeflow_component_spec.input_arguments)
-        component_arguments.update(kubeflow_component_spec.output_arguments)
+        component_arguments.update(spec.args)
         return component_arguments
 
     @abstractmethod
@@ -399,42 +403,15 @@ def upload_manifest(self, manifest: Manifest, save_path: t.Union[str, Path]):
         """
         Uploads the manifest to the specified destination.
 
-        If the save_path points to the kubeflow output artifact temporary path,
-        it will be saved both in a specific base path and the native kfp artifact path.
-
         Args:
             manifest: The Manifest object to be uploaded.
             save_path: The path where the Manifest object will be saved.
         """
-        is_kubeflow_output = (
-            str(save_path) == "/tmp/outputs/output_manifest_path/data"  # nosec
-        )
-
-        if is_kubeflow_output:
-            # Save to the expected base path directory
-            save_path_base_path = (
-                f"{manifest.base_path}/{manifest.pipeline_name}/{manifest.run_id}/"
-                f"{manifest.component_id}/manifest.json"
-            )
-            # Upload manifest and it's reference if cache is False
-            manifest.to_file(save_path_base_path)
-            logger.info(f"Saving output manifest to {save_path_base_path}")
-            self._upload_cache_key(
-                manifest=manifest,
-                manifest_save_path=save_path_base_path,
-            )
-            # Write manifest to the native kfp artifact path that will be passed as an artifact
-            # and read by the next component
-            manifest.to_file(save_path)
-        else:
-            # Local runner
-            manifest.to_file(save_path)
-            logger.info(f"Saving output manifest to {save_path}")
-            self._upload_cache_key(
-                manifest=manifest,
-                manifest_save_path=save_path,
-            )
+        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
+        manifest.to_file(save_path)
+        logger.info(f"Saving output manifest to {save_path}")
+        self._upload_cache_key(manifest=manifest, manifest_save_path=save_path)
 
 
 class DaskLoadExecutor(Executor[DaskLoadComponent]):
@@ -445,7 +422,7 @@ def _is_previous_cached(self, input_manifest: Manifest) -> bool:
 
     @staticmethod
     def optional_fondant_arguments() -> t.List[str]:
-        return ["input_manifest_path"]
+        return ["input_manifest_path", "input_partition_rows"]
 
     def _load_or_create_manifest(self) -> Manifest:
         return Manifest.create(
@@ -509,6 +486,10 @@ def _execute_component(
 
 class PandasTransformExecutor(TransformExecutor[PandasTransformComponent]):
+    @staticmethod
+    def optional_fondant_arguments() -> t.List[str]:
+        return ["input_manifest_path", "input_partition_rows"]
+
     @staticmethod
     def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable:
         """Factory that creates a function to wrap the component transform function.
         The wrapper:
@@ -613,7 +594,7 @@ class DaskWriteExecutor(Executor[DaskWriteComponent]):
 
     @staticmethod
     def optional_fondant_arguments() -> t.List[str]:
-        return ["output_manifest_path"]
+        return ["input_partition_rows", "output_manifest_path"]
 
     def _load_or_create_manifest(self) -> Manifest:
         return Manifest.from_file(self.input_manifest_path)
The wrapper: @@ -613,7 +594,7 @@ class DaskWriteExecutor(Executor[DaskWriteComponent]): @staticmethod def optional_fondant_arguments() -> t.List[str]: - return ["output_manifest_path"] + return ["input_partition_rows", "output_manifest_path"] def _load_or_create_manifest(self) -> Manifest: return Manifest.from_file(self.input_manifest_path) diff --git a/src/fondant/pipeline.py b/src/fondant/pipeline.py index 3aacb5f8e..420fb7ccf 100644 --- a/src/fondant/pipeline.py +++ b/src/fondant/pipeline.py @@ -16,10 +16,31 @@ from fondant.component_spec import ComponentSpec from fondant.exceptions import InvalidPipelineDefinition from fondant.manifest import Manifest -from fondant.schema import validate_partition_number logger = logging.getLogger(__name__) +valid_accelerator_types = [ + "GPU", + "TPU", +] + +# Taken from https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform_v1/types +# /accelerator_type.py +valid_vertex_accelerator_types = [ + "ACCELERATOR_TYPE_UNSPECIFIED", + "NVIDIA_TESLA_K80", + "NVIDIA_TESLA_P100", + "NVIDIA_TESLA_V100", + "NVIDIA_TESLA_P4", + "NVIDIA_TESLA_T4", + "NVIDIA_TESLA_A100", + "NVIDIA_A100_80GB", + "NVIDIA_L4", + "TPU_V2", + "TPU_V3", + "TPU_V4_POD", +] + class ComponentOp: """ @@ -31,7 +52,12 @@ class ComponentOp: arguments: A dictionary containing the argument name and value for the operation. input_partition_rows: The number of rows to load per partition. Set to override the automatic partitioning - number_of_gpus: The number of gpus to assign to the operation + number_of_accelerators: The number of accelerators to assign to the operation (GPU, TPU) + accelerator_name: The name of the accelerator to assign. If you're using a cluster setup + on GKE, select "GPU" for GPU or "TPU" for TPU. Make sure + that you select a nodepool with the available hardware. If you're running the + pipeline on Vertex, then select one of the machines specified in the list of + accelerators here https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec. node_pool_label: The label of the node pool to which the operation will be assigned. node_pool_name: The name of the node pool to which the operation will be assigned. cache: Set to False to disable caching, True by default. @@ -49,8 +75,8 @@ class ComponentOp: Note: - A Fondant Component operation is created by defining a Fondant Component and its input arguments. - - The `number_of_gpus`, `node_pool_label`, `node_pool_name`, `cache`, `cluster_type` and - `client_kwargs` attributes are optional and can be used to specify additional + - The `accelerator_name`, `node_pool_label`, `node_pool_name` + attributes are optional and can be used to specify additional configurations for the operation. 
More information on the optional attributes that can be assigned to kfp components here: https://kubeflow-pipelines.readthedocs.io/en/1.8.13/source/kfp.dsl.html @@ -64,7 +90,8 @@ def __init__( *, arguments: t.Optional[t.Dict[str, t.Any]] = None, input_partition_rows: t.Optional[t.Union[str, int]] = None, - number_of_gpus: t.Optional[int] = None, + number_of_accelerators: t.Optional[int] = None, + accelerator_name: t.Optional[str] = None, node_pool_label: t.Optional[str] = None, node_pool_name: t.Optional[str] = None, cache: t.Optional[bool] = True, @@ -85,18 +112,13 @@ def __init__( self.client_kwargs = client_kwargs self.arguments = arguments or {} - self._add_component_argument( - "input_partition_rows", - input_partition_rows, - validate_partition_number, - ) + self._add_component_argument("input_partition_rows", input_partition_rows) self._add_component_argument("cache", self.cache) self._add_component_argument("cluster_type", cluster_type) self._add_component_argument("client_kwargs", client_kwargs) self.arguments.setdefault("component_spec", self.component_spec.specification) - self.number_of_gpus = number_of_gpus self.memory_request = memory_request self.memory_limit = memory_limit self.node_pool_label, self.node_pool_name = self._validate_node_pool_spec( @@ -105,6 +127,14 @@ def __init__( ) self.preemptible = preemptible + ( + self.number_of_accelerators, + self.accelerator_name, + ) = self._validate_accelerator_spec( + number_of_accelerators, + accelerator_name, + ) + def _configure_caching_from_image_tag( self, cache: t.Optional[bool], @@ -151,8 +181,8 @@ def _add_component_argument( self.argument_name = argument_value self.arguments[argument_name] = argument_value + @staticmethod def _validate_node_pool_spec( - self, node_pool_label, node_pool_name, ) -> t.Tuple[t.Optional[str], t.Optional[str]]: @@ -164,6 +194,23 @@ def _validate_node_pool_spec( ) return node_pool_label, node_pool_name + def _validate_accelerator_spec( + self, + number_of_accelerators, + accelerator_name, + ) -> t.Tuple[t.Optional[int], t.Optional[str]]: + """Validate accelerator specification.""" + if bool(number_of_accelerators) != bool(accelerator_name): + msg = ( + "Both number of accelerators and accelerator name must be specified or both must" + " be None." + ) + raise InvalidPipelineDefinition( + msg, + ) + + return number_of_accelerators, accelerator_name + @property def dockerfile_path(self) -> t.Optional[Path]: path = self.component_dir / "Dockerfile" @@ -176,7 +223,8 @@ def from_registry( *, arguments: t.Optional[t.Dict[str, t.Any]] = None, input_partition_rows: t.Optional[t.Union[int, str]] = None, - number_of_gpus: t.Optional[int] = None, + number_of_accelerators: t.Optional[int] = None, + accelerator_name: t.Optional[str] = None, node_pool_label: t.Optional[str] = None, node_pool_name: t.Optional[str] = None, cache: t.Optional[bool] = True, @@ -193,7 +241,12 @@ def from_registry( arguments: A dictionary containing the argument name and value for the operation. input_partition_rows: The number of rows to load per partition. Set to override the automatic partitioning - number_of_gpus: The number of gpus to assign to the operation + number_of_accelerators: The number of accelerators to assign to the operation (GPU, TPU) + accelerator_name: The name of the accelerator to assign. If you're using a cluster setup + on GKE, select "GPU" for GPU or "TPU" for TPU. Make + sure that you select a nodepool with the available hardware. 
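The `_validate_accelerator_spec` check added above makes the two fields an exclusive pair: passing only one of them raises `InvalidPipelineDefinition`. A hedged sketch of that failure mode (the directory is a placeholder; with a real component dir, the spec loads and the XOR validation then trips):

    from fondant.exceptions import InvalidPipelineDefinition
    from fondant.pipeline import ComponentOp

    try:
        # accelerator_name is missing, so the XOR validation raises.
        ComponentOp("components/embed_images", number_of_accelerators=1)
    except InvalidPipelineDefinition as err:
        print(err)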
If you're running the + pipeline on Vertex, then select one of the machines specified in the list of + accelerators here https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec. node_pool_label: The label of the node pool to which the operation will be assigned. node_pool_name: The name of the node pool to which the operation will be assigned. cache: Set to False to disable caching, True by default. @@ -218,7 +271,8 @@ def from_registry( components_dir, arguments=arguments, input_partition_rows=input_partition_rows, - number_of_gpus=number_of_gpus, + number_of_accelerators=number_of_accelerators, + accelerator_name=accelerator_name, node_pool_label=node_pool_label, node_pool_name=node_pool_name, cache=cache, @@ -265,7 +319,8 @@ def get_nested_dict_hash(input_dict): "component_spec_hash": get_nested_dict_hash(component_spec_dict), "arguments": arguments, "input_partition_rows": self.input_partition_rows, - "number_of_gpus": self.number_of_gpus, + "number_of_accelerators": self.number_of_accelerators, + "accelerator_name": self.accelerator_name, "node_pool_name": self.node_pool_name, } diff --git a/src/fondant/runner.py b/src/fondant/runner.py index 4ff924a8e..df898c97d 100644 --- a/src/fondant/runner.py +++ b/src/fondant/runner.py @@ -1,6 +1,6 @@ -import json import logging import subprocess # nosec +import typing as t from abc import ABC, abstractmethod import yaml @@ -73,18 +73,51 @@ def run( job_name = self.get_name_from_spec(input_spec) + "_run" # TODO add logic to see if pipeline exists runner = self.client.run_pipeline( - experiment_id=experiment.id, + experiment_id=experiment.experiment_id, job_name=job_name, pipeline_package_path=input_spec, ) - pipeline_url = f"{self.host}/#/runs/details/{runner.id}" + pipeline_url = f"{self.host}/#/runs/details/{runner.run_id}" logger.info(f"Pipeline is running at: {pipeline_url}") + def get_name_from_spec(self, input_spec: str): + """Get the name of the pipeline from the spec.""" + with open(input_spec) as f: + spec, *_ = yaml.safe_load_all(f) + return spec["pipelineInfo"]["name"] + + +class VertexRunner(Runner): + def __resolve_imports(self): + import google.cloud.aiplatform as aip + + self.aip = aip + + def __init__( + self, + project_id: str, + project_region: str, + service_account: t.Optional[str] = None, + ): + self.__resolve_imports() + + self.aip.init( + project=project_id, + location=project_region, + ) + self.service_account = service_account + + def run(self, input_spec: str, *args, **kwargs): + job = self.aip.PipelineJob( + display_name=self.get_name_from_spec(input_spec), + template_path=input_spec, + enable_caching=False, + ) + job.submit(service_account=self.service_account) + def get_name_from_spec(self, input_spec: str): """Get the name of the pipeline from the spec.""" with open(input_spec) as f: spec = yaml.safe_load(f) - return json.loads( - spec["metadata"]["annotations"]["pipelines.kubeflow.org/pipeline_spec"], - )["name"] + return spec["pipelineInfo"]["name"] diff --git a/src/fondant/schema.py b/src/fondant/schema.py index 46a73b1ef..fea6ebe37 100644 --- a/src/fondant/schema.py +++ b/src/fondant/schema.py @@ -157,16 +157,6 @@ class Field(t.NamedTuple): type: Type -def validate_partition_number(arg_value): - if arg_value in ["disable", None, "None"]: - return arg_value if arg_value != "None" else None - try: - return int(arg_value) - except ValueError: - msg = f"Invalid format for '{arg_value}'. 
The value must be an integer or set to 'disable'" - raise InvalidTypeSchema(msg) - - def validate_partition_size(arg_value): if arg_value in ["disable", None, "None"]: return arg_value if arg_value != "None" else None diff --git a/src/fondant/schemas/component_spec.json b/src/fondant/schemas/component_spec.json index 9badfc01b..6079e7bae 100644 --- a/src/fondant/schemas/component_spec.json +++ b/src/fondant/schemas/component_spec.json @@ -88,6 +88,15 @@ }, { "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array" + }, + { + "type": "object" } ] } diff --git a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml index ec40cd6c1..362459873 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/docker-compose.yml @@ -1,4 +1,4 @@ -name: test_pipeline +name: testpipeline services: first_component: build: @@ -6,14 +6,14 @@ services: context: tests/example_pipelines/valid_pipeline/example_1/first_component command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json - --storage_args - a dummy string arg - --input_partition_rows - - disable + - '10' - --cache - 'False' - --cluster_type @@ -24,27 +24,19 @@ services: {"type": "binary"}}}, "captions": {"fields": {"data": {"type": "string"}}}}, "args": {"storage_args": {"description": "Storage arguments", "type": "str"}}}' depends_on: {} - deploy: - resources: - reservations: - devices: - - capabilities: - - gpu - count: 1 - driver: nvidia + volumes: [] ports: - 8787:8787 - volumes: [] second_component: build: args: [] context: tests/example_pipelines/valid_pipeline/example_1/second_component command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "second_component", "cache_key": "2"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/second_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json - --storage_args - a dummy string arg - --input_partition_rows @@ -60,7 +52,7 @@ services: "array", "items": {"type": "float32"}}}}}, "args": {"storage_args": {"description": "Storage arguments", "type": "str"}}}' - --input_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json depends_on: first_component: condition: service_completed_successfully @@ -73,10 +65,10 @@ services: context: tests/example_pipelines/valid_pipeline/example_1/third_component command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "third_component", "cache_key": "3"}' - 
--output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/third_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json - --storage_args - a dummy string arg - --cache @@ -92,7 +84,7 @@ services: false}, "args": {"storage_args": {"description": "Storage arguments", "type": "str"}}}' - --input_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/second_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json depends_on: second_component: condition: service_completed_successfully diff --git a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml index ba0cd1499..1e148f5c3 100644 --- a/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_1/kubeflow_pipeline.yml @@ -1,308 +1,334 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - annotations: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00' - pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test - pipeline", "name": "test_pipeline"}' - generateName: test-pipeline- - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 -spec: - arguments: - parameters: [] - entrypoint: test-pipeline - serviceAccountName: pipeline-runner - templates: - - affinity: - nodeAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - preference: - matchExpressions: - - key: cloud.google.com/gke-preemptible - operator: In - values: - - 'true' - weight: 50 - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "first_component", "cache_key": "1"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "description": "This is an example component", "image": "example_component:latest", - "name": "First component", "produces": {"captions": {"fields": {"data": {"type": - "string"}}}, "images": {"fields": {"data": {"type": "binary"}}}}}' - - --input_partition_rows - - disable - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - resources: - limits: - memory: 512M - nvidia.com/gpu: 1 - requests: - memory: 256M - inputs: - artifacts: - - name: input_manifest_path - path: /tmp/inputs/input_manifest_path/data - raw: - data: '' - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"First component\", \"produces\": {\"captions\": {\"fields\": - {\"data\": {\"type\": \"string\"}}}, \"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}}}}}", "input_partition_rows": "disable", "metadata": - "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", \"run_id\": - 
\"test_pipeline-20230101000000\", \"component_id\": \"first_component\", - \"cache_key\": \"1\"}", "storage_args": "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "First component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: first-component - outputs: - artifacts: - - name: first-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - tolerations: - - effect: NoSchedule - key: preemptible - operator: Equal - value: 'true' - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "second_component", "cache_key": "2"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "consumes": {"images": {"fields": {"data": {"type": "binary"}}}}, "description": - "This is an example component", "image": "example_component:latest", "name": - "Second component", "produces": {"embeddings": {"fields": {"data": {"items": - {"type": "float32"}, "type": "array"}}}}}' - - --input_partition_rows - - '10' - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - inputs: - artifacts: - - name: 
first-component-output_manifest_path - path: /tmp/inputs/input_manifest_path/data - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"consumes\": - {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"Second component\", \"produces\": {\"embeddings\": {\"fields\": - {\"data\": {\"items\": {\"type\": \"float32\"}, \"type\": \"array\"}}}}}", - "input_partition_rows": "10", "metadata": "{\"base_path\": \"/foo/bar\", - \"pipeline_name\": \"test_pipeline\", \"run_id\": \"test_pipeline-20230101000000\", - \"component_id\": \"second_component\", \"cache_key\": \"2\"}", "storage_args": - "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "e8f5a26a42664b1e4774da40117b542baa9676368d9e05262a40e4fd10be0e68"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. 
Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "Second component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: second-component - outputs: - artifacts: - - name: second-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - dag: - tasks: - - name: first-component - template: first-component - - arguments: - artifacts: - - from: '{{tasks.first-component.outputs.artifacts.first-component-output_manifest_path}}' - name: first-component-output_manifest_path - dependencies: +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline +--- +components: + comp-first-component: + executorLabel: exec-first-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING + comp-second-component: + executorLabel: exec-second-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING + comp-third-component: + executorLabel: exec-third-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING +deploymentSpec: + executors: + exec-first-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - 
"{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest + resources: + memoryLimit: 0.512 + memoryRequest: 0.256 + exec-second-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest + exec-third-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: + dag: + tasks: + first-component: + cachingOptions: {} + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: 10 + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + second-component: + cachingOptions: {} + componentRef: + name: comp-second-component + dependentTasks: - first-component - name: second-component - template: second-component - - arguments: - artifacts: - - from: '{{tasks.second-component.outputs.artifacts.second-component-output_manifest_path}}' - name: second-component-output_manifest_path - dependencies: + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + consumes: + images: + fields: + data: + type: binary + description: This is an 
example component + image: example_component:latest + name: Second component + produces: + embeddings: + fields: + data: + items: + type: float32 + type: array + input_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + input_partition_rows: + runtimeValue: + constant: 10 + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "second_component", + "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: second-component + third-component: + cachingOptions: {} + componentRef: + name: comp-third-component + dependentTasks: - second-component - name: third-component - template: third-component - name: test-pipeline - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "third_component", "cache_key": "3"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "consumes": {"captions": {"fields": {"data": {"type": "string"}}}, "embeddings": - {"fields": {"data": {"items": {"type": "float32"}, "type": "array"}}}, "images": - {"fields": {"data": {"type": "binary"}}}}, "description": "This is an example - component", "image": "example_component:latest", "name": "Third component", - "produces": {"additionalSubsets": false, "images": {"fields": {"data": {"type": - "binary"}}}}}' - - --input_partition_rows - - None - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - inputs: - artifacts: - - name: second-component-output_manifest_path - path: /tmp/inputs/input_manifest_path/data - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"consumes\": - {\"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}, \"embeddings\": - {\"fields\": {\"data\": {\"items\": {\"type\": \"float32\"}, \"type\": \"array\"}}}, - \"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"Third component\", \"produces\": {\"additionalSubsets\": false, - \"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}}", "input_partition_rows": - "None", "metadata": "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", - \"run_id\": \"test_pipeline-20230101000000\", \"component_id\": \"third_component\", - \"cache_key\": \"3\"}", "storage_args": "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "8d2ae6379592151eea3b644c61fb091a68a431ac15ed24064cb66434cabf6e08"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", 
"--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "Third component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: third-component - outputs: - artifacts: - - name: third-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + consumes: + captions: + fields: + data: + type: string + embeddings: + fields: + data: + items: + type: float32 + type: array + images: + fields: + data: + type: binary + description: This is an example component + image: example_component:latest + name: Third component + produces: + additionalSubsets: false + images: + fields: + data: + type: binary + input_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "third_component", + "cache_key": "3"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: third-component +schemaVersion: 2.1.0 +sdkVersion: kfp-2.3.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml new file mode 100644 index 000000000..bf34acc3f --- /dev/null +++ b/tests/example_pipelines/compiled_pipeline/example_1/vertex_pipeline.yml @@ -0,0 +1,330 @@ +# PIPELINE DEFINITION +# Name: 
testpipeline +# Description: description of the test pipeline +components: + comp-first-component: + executorLabel: exec-first-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING + comp-second-component: + executorLabel: exec-second-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING + comp-third-component: + executorLabel: exec-third-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING +deploymentSpec: + executors: + exec-first-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest + exec-second-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest + exec-third-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + 
- "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: + dag: + tasks: + first-component: + cachingOptions: {} + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + input_partition_rows: + runtimeValue: + constant: 10.0 + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + second-component: + cachingOptions: {} + componentRef: + name: comp-second-component + dependentTasks: + - first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + consumes: + images: + fields: + data: + type: binary + description: This is an example component + image: example_component:latest + name: Second component + produces: + embeddings: + fields: + data: + items: + type: float32 + type: array + input_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + input_partition_rows: + runtimeValue: + constant: 10.0 + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "second_component", + "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: second-component + third-component: + cachingOptions: {} + componentRef: + name: comp-third-component + dependentTasks: + - second-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + consumes: + captions: + fields: + data: + type: string + embeddings: + fields: + data: + items: + type: float32 + type: array + images: + fields: + data: + type: binary + description: This is an example component + image: example_component:latest + name: Third component + produces: + additionalSubsets: false + images: + fields: + data: + type: binary + input_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/second_component/manifest.json" + metadata: + 
runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "third_component", + "cache_key": "3"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/third_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: third-component +schemaVersion: 2.1.0 +sdkVersion: kfp-2.3.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml index 1452bde94..bffa065c3 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml @@ -1,4 +1,4 @@ -name: test_pipeline +name: testpipeline services: first_component: build: @@ -6,10 +6,10 @@ services: context: tests/example_pipelines/valid_pipeline/example_1/first_component command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "first_component", "cache_key": "1"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json - --storage_args - a dummy string arg - --cache @@ -26,12 +26,13 @@ services: - 8787:8787 volumes: [] image_cropping: + image: ghcr.io/ml6team/image_cropping:dev command: - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", + - '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", "cache_key": "2"}' - --output_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/image_cropping/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json - --cropping_threshold - '0' - --padding @@ -64,11 +65,10 @@ services: for the image cropping. 
The padding is added to all borders of the image.", "type": "int", "default": 10}}}' - --input_manifest_path - - /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json + - /foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json depends_on: first_component: condition: service_completed_successfully - image: ghcr.io/ml6team/image_cropping:dev ports: - 8787:8787 volumes: [] diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index 02cade54a..1fe3922b3 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -1,236 +1,248 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - generateName: test-pipeline- - annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.22, pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00', - pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test - pipeline", "name": "test_pipeline"}'} - labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.22} -spec: - entrypoint: test-pipeline - templates: - - name: first-component - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "first_component", "cache_key": "1"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "description": "This is an example component", "image": "example_component:latest", - "name": "First component", "produces": {"captions": {"fields": {"data": {"type": - "string"}}}, "images": {"fields": {"data": {"type": "binary"}}}}}' - - --input_partition_rows - - None - - --cache - - "False" - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - inputs: - artifacts: - - name: input_manifest_path - path: /tmp/inputs/input_manifest_path/data - raw: {data: ''} - outputs: - artifacts: - - {name: first-component-output_manifest_path, path: /tmp/outputs/output_manifest_path/data} - metadata: - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - pipelines.kubeflow.org/enable_caching: "true" - annotations: {pipelines.kubeflow.org/component_spec: '{"description": "This - is an example component", "implementation": {"container": {"command": ["fondant", - "execute", "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, - "--metadata", {"inputValue": "metadata"}, "--component_spec", {"inputValue": - "component_spec"}, "--input_partition_rows", {"inputValue": "input_partition_rows"}, - "--cache", {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": 
"metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "First component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}', pipelines.kubeflow.org/component_ref: '{"digest": - "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}', pipelines.kubeflow.org/arguments.parameters: '{"cache": - "False", "client_kwargs": "{}", "cluster_type": "default", "component_spec": - "{\"args\": {\"storage_args\": {\"description\": \"Storage arguments\", - \"type\": \"str\"}}, \"description\": \"This is an example component\", - \"image\": \"example_component:latest\", \"name\": \"First component\", - \"produces\": {\"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}, - \"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}}", "input_partition_rows": - "None", "metadata": "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", - \"run_id\": \"test_pipeline-20230101000000\", \"component_id\": \"first_component\", - \"cache_key\": \"1\"}", "storage_args": "a dummy string arg"}'} - - name: image-cropping - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "image_cropping", "cache_key": "2"}' - - --component_spec - - '{"args": {"cropping_threshold": {"default": -30, "description": "Threshold - parameter used for detecting borders. A lower (negative) parameter results - in a more performant border detection, but can cause overcropping. Default - is -30", "type": "int"}, "padding": {"default": 10, "description": "Padding - for the image cropping. The padding is added to all borders of the image.", - "type": "int"}}, "consumes": {"images": {"fields": {"data": {"type": "binary"}}}}, - "description": "This component crops out image borders. This is typically - useful when working with graphical \nimages that have single-color borders - (e.g. logos, icons, etc.).\n\nThe component takes an image and calculates - which color is most present in the border. It then \ncrops the image in order - to minimize this single-color border. The `padding` argument will add \nextra - border to the image before cropping it, in order to avoid cutting off parts - of the image.\nThe resulting crop will always be square. If a crop is not - possible, the component will return \nthe original image.\n\n#### Examples\nExamples - of image cropping by removing the single-color border. Left side is original - image, \nright side is border-cropped image.\n\n![Example of image cropping - by removing the single-color border. 
Left side is original, right side is - cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example - of image cropping by removing the single-color border. Left side is original, - right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n", - "image": "ghcr.io/ml6team/image_cropping:dev", "name": "Image cropping", "produces": - {"images": {"fields": {"data": {"type": "binary"}, "height": {"type": "int32"}, - "width": {"type": "int32"}}}}}' - - --input_partition_rows - - None - - --cache - - "True" - - --cropping_threshold - - '0' - - --padding - - '0' - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: ghcr.io/ml6team/image_cropping:dev - imagePullPolicy: Always - inputs: - artifacts: - - {name: first-component-output_manifest_path, path: /tmp/inputs/input_manifest_path/data} - outputs: - artifacts: - - {name: image-cropping-output_manifest_path, path: /tmp/outputs/output_manifest_path/data} - metadata: - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - pipelines.kubeflow.org/enable_caching: "true" - annotations: {pipelines.kubeflow.org/component_spec: '{"description": "This - component crops out image borders. This is typically useful when working - with graphical \nimages that have single-color borders (e.g. logos, icons, - etc.).\n\nThe component takes an image and calculates which color is most - present in the border. It then \ncrops the image in order to minimize this - single-color border. The `padding` argument will add \nextra border to the - image before cropping it, in order to avoid cutting off parts of the image.\nThe - resulting crop will always be square. If a crop is not possible, the component - will return \nthe original image.\n\n#### Examples\nExamples of image cropping - by removing the single-color border. Left side is original image, \nright - side is border-cropped image.\n\n![Example of image cropping by removing - the single-color border. Left side is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example - of image cropping by removing the single-color border. Left side is original, - right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n", - "implementation": {"container": {"command": ["fondant", "execute", "main", - "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--cropping_threshold", {"inputValue": "cropping_threshold"}, - "--padding", {"inputValue": "padding"}, "--output_manifest_path", {"outputPath": - "output_manifest_path"}, "--cluster_type", {"inputValue": "cluster_type"}, - "--client_kwargs", {"inputValue": "client_kwargs"}], "image": "ghcr.io/ml6team/image_cropping:dev"}}, - "inputs": [{"description": "Path to the input manifest", "name": "input_manifest_path", - "type": "String"}, {"description": "Metadata arguments containing the run - id and base path", "name": "metadata", "type": "String"}, {"default": "None", - "description": "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. 
Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"default": - -30, "description": "Threshold parameter used for detecting borders. A lower - (negative) parameter results in a more performant border detection, but - can cause overcropping. Default is -30", "name": "cropping_threshold", "type": - "Integer"}, {"default": 10, "description": "Padding for the image cropping. - The padding is added to all borders of the image.", "name": "padding", "type": - "Integer"}], "name": "Image cropping", "outputs": [{"description": "Path - to the output manifest", "name": "output_manifest_path", "type": "String"}]}', - pipelines.kubeflow.org/component_ref: '{"digest": "bd073ded3bbd5c9bc5fd3abd3b8e8d19d65c17d6914f117596c78a5eddbd99d0"}', - pipelines.kubeflow.org/arguments.parameters: '{"cache": "True", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"cropping_threshold\": - {\"default\": -30, \"description\": \"Threshold parameter used for detecting - borders. A lower (negative) parameter results in a more performant border - detection, but can cause overcropping. Default is -30\", \"type\": \"int\"}, - \"padding\": {\"default\": 10, \"description\": \"Padding for the image - cropping. The padding is added to all borders of the image.\", \"type\": - \"int\"}}, \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": - \"binary\"}}}}, \"description\": \"This component crops out image borders. - This is typically useful when working with graphical \\nimages that have - single-color borders (e.g. logos, icons, etc.).\\n\\nThe component takes - an image and calculates which color is most present in the border. It then - \\ncrops the image in order to minimize this single-color border. The `padding` - argument will add \\nextra border to the image before cropping it, in order - to avoid cutting off parts of the image.\\nThe resulting crop will always - be square. If a crop is not possible, the component will return \\nthe original - image.\\n\\n#### Examples\\nExamples of image cropping by removing the single-color - border. Left side is original image, \\nright side is border-cropped image.\\n\\n![Example - of image cropping by removing the single-color border. Left side is original, - right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\\n![Example - of image cropping by removing the single-color border. 
Left side is original, - right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\\n\", - \"image\": \"ghcr.io/ml6team/image_cropping:dev\", \"name\": \"Image cropping\", - \"produces\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}, - \"height\": {\"type\": \"int32\"}, \"width\": {\"type\": \"int32\"}}}}}", - "cropping_threshold": "0", "input_partition_rows": "None", "metadata": "{\"base_path\": - \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", \"run_id\": \"test_pipeline-20230101000000\", - \"component_id\": \"image_cropping\", \"cache_key\": \"2\"}", "padding": - "0"}'} - - name: test-pipeline - dag: - tasks: - - {name: first-component, template: first-component} - - name: image-cropping - template: image-cropping - dependencies: [first-component] - arguments: - artifacts: - - {name: first-component-output_manifest_path, from: '{{tasks.first-component.outputs.artifacts.first-component-output_manifest_path}}'} - arguments: - parameters: [] - serviceAccountName: pipeline-runner +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline +--- +components: + comp-first-component: + executorLabel: exec-first-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1.0 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING + comp-image-cropping: + executorLabel: exec-image-cropping + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + cropping_threshold: + defaultValue: -30.0 + isOptional: true + parameterType: NUMBER_INTEGER + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1.0 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + padding: + defaultValue: 10.0 + isOptional: true + parameterType: NUMBER_INTEGER +deploymentSpec: + executors: + exec-first-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest + exec-image-cropping: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - 
"{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--cropping_threshold" + - "{{$.inputs.parameters['cropping_threshold']}}" + - "--padding" + - "{{$.inputs.parameters['padding']}}" + command: + - fondant + - execute + - main + image: ghcr.io/ml6team/image_cropping:dev +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: + dag: + tasks: + first-component: + cachingOptions: {} + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + image-cropping: + cachingOptions: {} + componentRef: + name: comp-image-cropping + dependentTasks: + - first-component + inputs: + parameters: + cache: + runtimeValue: + constant: true + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + cropping_threshold: + default: -30.0 + description: Threshold parameter used for detecting borders. + A lower (negative) parameter results in a more performant + border detection, but can cause overcropping. Default is -30 + type: int + padding: + default: 10.0 + description: Padding for the image cropping. The padding is + added to all borders of the image. + type: int + consumes: + images: + fields: + data: + type: binary + description: "This component crops out image borders. This is typically + useful when working with graphical \nimages that have single-color + borders (e.g. logos, icons, etc.).\n\nThe component takes an image + and calculates which color is most present in the border. It then + \ncrops the image in order to minimize this single-color border. + The `padding` argument will add \nextra border to the image before + cropping it, in order to avoid cutting off parts of the image.\nThe + resulting crop will always be square. If a crop is not possible, + the component will return \nthe original image.\n\n#### Examples\nExamples + of image cropping by removing the single-color border. Left side + is original image, \nright side is border-cropped image.\n\n![Example + of image cropping by removing the single-color border. Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example + of image cropping by removing the single-color border. 
Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n" + image: ghcr.io/ml6team/image_cropping:dev + name: Image cropping + produces: + images: + fields: + data: + type: binary + height: + type: int32 + width: + type: int32 + cropping_threshold: + runtimeValue: + constant: 0.0 + input_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", + "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json" + padding: + runtimeValue: + constant: 0.0 + taskInfo: + name: image-cropping +schemaVersion: 2.1.0 +sdkVersion: kfp-2.3.0 diff --git a/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml new file mode 100644 index 000000000..882728792 --- /dev/null +++ b/tests/example_pipelines/compiled_pipeline/example_2/vertex_pipeline.yml @@ -0,0 +1,248 @@ +# PIPELINE DEFINITION +# Name: testpipeline +# Description: description of the test pipeline +--- +components: + comp-first-component: + executorLabel: exec-first-component + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + storage_args: + parameterType: STRING + comp-image-cropping: + executorLabel: exec-image-cropping + inputDefinitions: + parameters: + cache: + defaultValue: true + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + isOptional: true + parameterType: STRING + component_spec: + parameterType: STRUCT + cropping_threshold: + defaultValue: -30.0 + isOptional: true + parameterType: NUMBER_INTEGER + input_manifest_path: + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1.0 + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + parameterType: STRING + output_manifest_path: + parameterType: STRING + padding: + defaultValue: 10.0 + isOptional: true + parameterType: NUMBER_INTEGER +deploymentSpec: + executors: + exec-first-component: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - "{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--storage_args" + - "{{$.inputs.parameters['storage_args']}}" + command: + - fondant + - execute + - main + image: example_component:latest + exec-image-cropping: + container: + args: + - "--input_manifest_path" + - "{{$.inputs.parameters['input_manifest_path']}}" + - "--component_spec" + - 
"{{$.inputs.parameters['component_spec']}}" + - "--input_partition_rows" + - "{{$.inputs.parameters['input_partition_rows']}}" + - "--cache" + - "{{$.inputs.parameters['cache']}}" + - "--cluster_type" + - "{{$.inputs.parameters['cluster_type']}}" + - "--metadata" + - "{{$.inputs.parameters['metadata']}}" + - "--output_manifest_path" + - "{{$.inputs.parameters['output_manifest_path']}}" + - "--cropping_threshold" + - "{{$.inputs.parameters['cropping_threshold']}}" + - "--padding" + - "{{$.inputs.parameters['padding']}}" + command: + - fondant + - execute + - main + image: ghcr.io/ml6team/image_cropping:dev +pipelineInfo: + description: description of the test pipeline + name: testpipeline +root: + dag: + tasks: + first-component: + cachingOptions: {} + componentRef: + name: comp-first-component + inputs: + parameters: + cache: + runtimeValue: + constant: false + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + storage_args: + description: Storage arguments + type: str + description: This is an example component + image: example_component:latest + name: First component + produces: + captions: + fields: + data: + type: string + images: + fields: + data: + type: binary + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "first_component", + "cache_key": "1"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + storage_args: + runtimeValue: + constant: a dummy string arg + taskInfo: + name: first-component + image-cropping: + cachingOptions: {} + componentRef: + name: comp-image-cropping + dependentTasks: + - first-component + inputs: + parameters: + cache: + runtimeValue: + constant: true + cluster_type: + runtimeValue: + constant: default + component_spec: + runtimeValue: + constant: + args: + cropping_threshold: + default: -30.0 + description: Threshold parameter used for detecting borders. + A lower (negative) parameter results in a more performant + border detection, but can cause overcropping. Default is -30 + type: int + padding: + default: 10.0 + description: Padding for the image cropping. The padding is + added to all borders of the image. + type: int + consumes: + images: + fields: + data: + type: binary + description: "This component crops out image borders. This is typically + useful when working with graphical \nimages that have single-color + borders (e.g. logos, icons, etc.).\n\nThe component takes an image + and calculates which color is most present in the border. It then + \ncrops the image in order to minimize this single-color border. + The `padding` argument will add \nextra border to the image before + cropping it, in order to avoid cutting off parts of the image.\nThe + resulting crop will always be square. If a crop is not possible, + the component will return \nthe original image.\n\n#### Examples\nExamples + of image cropping by removing the single-color border. Left side + is original image, \nright side is border-cropped image.\n\n![Example + of image cropping by removing the single-color border. Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example + of image cropping by removing the single-color border. 
Left side + is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n" + image: ghcr.io/ml6team/image_cropping:dev + name: Image cropping + produces: + images: + fields: + data: + type: binary + height: + type: int32 + width: + type: int32 + cropping_threshold: + runtimeValue: + constant: 0.0 + input_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/first_component/manifest.json" + metadata: + runtimeValue: + constant: '{"base_path": "/foo/bar", "pipeline_name": "testpipeline", + "run_id": "testpipeline-20230101000000", "component_id": "image_cropping", + "cache_key": "2"}' + output_manifest_path: + runtimeValue: + constant: "/foo/bar/testpipeline/testpipeline-20230101000000/image_cropping/manifest.json" + padding: + runtimeValue: + constant: 0.0 + taskInfo: + name: image-cropping +schemaVersion: 2.1.0 +sdkVersion: kfp-2.3.0 diff --git a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml index 4709fe966..18b2b38cf 100644 --- a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml @@ -1,110 +1,221 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - annotations: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00' - pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test - pipeline", "name": "test_pipeline"}' - generateName: test-pipeline- - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 -spec: - arguments: - parameters: [] - entrypoint: test-pipeline - serviceAccountName: pipeline-runner - templates: - - container: - args: [] - command: - - fondant - - execute - - main - - --input_manifest_path - - /tmp/inputs/input_manifest_path/data - - --metadata - - '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000", - "component_id": "first_component", "cache_key": "b72c8e370be017d5a679a60d3984ab9d"}' - - --component_spec - - '{"args": {"storage_args": {"description": "Storage arguments", "type": "str"}}, - "description": "This is an example component", "image": "example_component:latest", - "name": "First component", "produces": {"captions": {"fields": {"data": {"type": - "string"}}}, "images": {"fields": {"data": {"type": "binary"}}}}}' - - --input_partition_rows - - None - - --cache - - 'False' - - --storage_args - - a dummy string arg - - --output_manifest_path - - /tmp/outputs/output_manifest_path/data - - --cluster_type - - default - - --client_kwargs - - '{}' - image: example_component:latest - imagePullPolicy: Always - resources: - limits: - nvidia.com/gpu: 1 - inputs: - artifacts: - - name: input_manifest_path - path: /tmp/inputs/input_manifest_path/data - raw: - data: '' - metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"First component\", \"produces\": {\"captions\": {\"fields\": - {\"data\": {\"type\": \"string\"}}}, \"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}}}}}", "input_partition_rows": "None", "metadata": - "{\"base_path\": \"/foo/bar\", 
\"pipeline_name\": \"test_pipeline\", \"run_id\": - \"test_pipeline-20230101000000\", \"component_id\": \"first_component\", - \"cache_key\": \"b72c8e370be017d5a679a60d3984ab9d\"}", "storage_args": "a - dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, - "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", - {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], - "image": "example_component:latest"}}, "inputs": [{"description": "Path - to the input manifest", "name": "input_manifest_path", "type": "String"}, - {"description": "Metadata arguments containing the run id and base path", - "name": "metadata", "type": "String"}, {"default": "None", "description": - "The component specification as a dictionary", "name": "component_spec", - "type": "JsonObject"}, {"default": "None", "description": "The number of - rows to load per partition. Set to override the automatic partitioning", - "name": "input_partition_rows", "type": "String"}, {"default": "True", "description": - "Set to False to disable caching, True by default.", "name": "cache", "type": - "Boolean"}, {"default": "default", "description": "The type of cluster to - use for distributed execution", "name": "cluster_type", "type": "String"}, - {"default": "{}", "description": "Keyword arguments used to initialise the - dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": - "Storage arguments", "name": "storage_args", "type": "String"}], "name": - "First component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: first-component - nodeSelector: - a_node_pool_label: a_node_pool - outputs: - artifacts: - - name: first-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - dag: - tasks: - - name: first-component - template: first-component - name: test-pipeline +{ + "components": + { + "comp-example-component": + { + "executorLabel": "exec-example-component", + "inputDefinitions": + { + "artifacts": + { + "input_manifest_path": + { + "description": "Path to the input manifest", + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "isOptional": True, + }, + }, + "parameters": + { + "component_spec": + { + "description": "The component specification as a dictionary", + "defaultValue": {}, + "isOptional": True, + "parameterType": "STRUCT", + }, + "input_partition_rows": + { + "description": "The number of rows to load per partition. 
Set to override the automatic partitioning", + "isOptional": True, + "parameterType": "STRING", + }, + "cache": + { + "parameterType": "BOOLEAN", + "description": "Set to False to disable caching, True by default.", + "defaultValue": True, + "isOptional": True, + }, + "metadata": + { + "description": "Metadata arguments containing the run id and base path", + "parameterType": "STRING", + }, + "storage_args": + { + "parameterType": "STRING", + "description": "Storage arguments", + }, + }, + }, + "outputDefinitions": + { + "artifacts": + { + "output_manifest_path": + { + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "description": "Path to the output manifest", + }, + }, + }, + }, + }, + "deploymentSpec": + { + "executors": + { + "exec-example-component": + { + "container": + { + "args": + [ + "--input_manifest_path", + "{{$.inputs.artifacts['input_manifest_path'].uri}}", + "--metadata", + "{{$.inputs.parameters['metadata']}}", + "--component_spec", + "{{$.inputs.parameters['component_spec']}}", + "--input_partition_rows", + "{{$.inputs.parameters['input_partition_rows']}}", + "--cache", + "{{$.inputs.parameters['cache']}}", + "--storage_args", + "{{$.inputs.parameters['storage_args']}}", + "--output_manifest_path", + "{{$.outputs.artifacts['output_manifest_path'].uri}}", + ], + "command": ["fondant", "execute", "main"], + "image": "example_component:latest", + }, + }, + }, + }, + "pipelineInfo": { "name": "example-component" }, + "root": + { + "dag": + { + "outputs": + { + "artifacts": + { + "output_manifest_path": + { + "artifactSelectors": + [ + { + "outputArtifactKey": "output_manifest_path", + "producerSubtask": "example-component", + }, + ], + }, + }, + }, + "tasks": + { + "example-component": + { + "cachingOptions": { "enableCache": True }, + "componentRef": { "name": "comp-example-component" }, + "inputs": + { + "artifacts": + { + "input_manifest_path": + { "componentInputArtifact": "input_manifest_path" }, + }, + "parameters": + { + "component_spec": + { "componentInputParameter": "component_spec" }, + "input_partition_rows": + { + "componentInputParameter": "input_partition_rows", + }, + "metadata": { "componentInputParameter": "metadata" }, + "cache": { "componentInputParameter": "cache" }, + }, + }, + "taskInfo": { "name": "example-component" }, + }, + }, + }, + "inputDefinitions": + { + "artifacts": + { + "input_manifest_path": + { + "description": "Path to the input manifest", + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "isOptional": True, + }, + }, + "parameters": + { + "component_spec": + { + "description": "The component specification as a dictionary", + "defaultValue": {}, + "isOptional": True, + "parameterType": "STRUCT", + }, + "input_partition_rows": + { + "description": "The number of rows to load per partition. 
Set to override the automatic partitioning", + "isOptional": True, + "parameterType": "STRING", + }, + "cache": + { + "parameterType": "BOOLEAN", + "description": "Set to False to disable caching, True by default.", + "defaultValue": True, + "isOptional": True, + }, + "metadata": + { + "description": "Metadata arguments containing the run id and base path", + "parameterType": "STRING", + }, + "storage_args": + { + "parameterType": "STRING", + "description": "Storage arguments", + }, + }, + }, + "outputDefinitions": + { + "artifacts": + { + "output_manifest_path": + { + "artifactType": + { + "schemaTitle": "system.Artifact", + "schemaVersion": "0.0.1", + }, + "description": "Path to the output manifest", + }, + }, + }, + }, + "schemaVersion": "2.1.0", + "sdkVersion": "kfp-2.0.1", +} diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/example_specs/component_specs/kubeflow_component.yaml index 0f2026115..4ecedde7d 100644 --- a/tests/example_specs/component_specs/kubeflow_component.yaml +++ b/tests/example_specs/component_specs/kubeflow_component.yaml @@ -1,62 +1,96 @@ -name: Example component -description: This is an example component -inputs: -- name: input_manifest_path - description: Path to the input manifest - type: String -- name: metadata - description: Metadata arguments containing the run id and base path - type: String -- name: component_spec - description: The component specification as a dictionary - type: JsonObject - default: None -- name: input_partition_rows - description: The number of rows to load per partition. Set to override the automatic - partitioning - type: String - default: None -- name: cache - description: Set to False to disable caching, True by default. - type: Boolean - default: 'True' -- name: cluster_type - description: The type of cluster to use for distributed execution - type: String - default: default -- name: client_kwargs - description: Keyword arguments used to initialise the dask client - type: JsonObject - default: '{}' -- name: storage_args - description: Storage arguments - type: String -outputs: -- name: output_manifest_path - description: Path to the output manifest - type: String -implementation: - container: - image: example_component:latest - command: - - fondant - - execute - - main +components: + comp-example-component: + executorLabel: exec-example-component + inputDefinitions: &id001 + parameters: + cache: + defaultValue: true + description: Set to False to disable caching, True by default. + isOptional: true + parameterType: BOOLEAN + cluster_type: + defaultValue: default + description: The cluster type to use for the execution + isOptional: true + parameterType: STRING + component_spec: + description: The component specification as a dictionary + parameterType: STRUCT + input_manifest_path: + description: Path to the input manifest + isOptional: true + parameterType: STRING + input_partition_rows: + defaultValue: -1 + description: The number of rows to load per partition. 
Set + to override the automatic partitioning + isOptional: true + parameterType: NUMBER_INTEGER + metadata: + description: Metadata arguments containing the run id and base path + parameterType: STRING + output_manifest_path: + description: Path to the output manifest + parameterType: STRING + storage_args: + description: Storage arguments + parameterType: STRING +deploymentSpec: + executors: + exec-example-component: + container: + args: - --input_manifest_path - - inputPath: input_manifest_path - - --metadata - - inputValue: metadata + - '{{$.inputs.parameters[''input_manifest_path'']}}' - --component_spec - - inputValue: component_spec + - '{{$.inputs.parameters[''component_spec'']}}' - --input_partition_rows - - inputValue: input_partition_rows + - '{{$.inputs.parameters[''input_partition_rows'']}}' - --cache - - inputValue: cache - - --storage_args - - inputValue: storage_args - - --output_manifest_path - - outputPath: output_manifest_path + - '{{$.inputs.parameters[''cache'']}}' - --cluster_type - - inputValue: cluster_type - - --client_kwargs - - inputValue: client_kwargs + - '{{$.inputs.parameters[''cluster_type'']}}' + - --metadata + - '{{$.inputs.parameters[''metadata'']}}' + - --output_manifest_path + - '{{$.inputs.parameters[''output_manifest_path'']}}' + - --storage_args + - '{{$.inputs.parameters[''storage_args'']}}' + command: + - fondant + - execute + - main + image: example_component:latest +pipelineInfo: + name: example-component +root: + dag: + tasks: + example-component: + cachingOptions: + enableCache: true + componentRef: + name: comp-example-component + inputs: + parameters: + cache: + componentInputParameter: cache + cluster_type: + componentInputParameter: cluster_type + component_spec: + componentInputParameter: component_spec + input_manifest_path: + componentInputParameter: input_manifest_path + input_partition_rows: + componentInputParameter: input_partition_rows + metadata: + componentInputParameter: metadata + output_manifest_path: + componentInputParameter: output_manifest_path + storage_args: + componentInputParameter: storage_args + taskInfo: + name: example-component + inputDefinitions: *id001 +schemaVersion: 2.1.0 +sdkVersion: kfp-2.0.1 diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/example_specs/components/arguments/component.yaml index cc14f5221..659ed0026 100644 --- a/tests/example_specs/components/arguments/component.yaml +++ b/tests/example_specs/components/arguments/component.yaml @@ -18,19 +18,19 @@ args: bool_false_default_arg: description: default bool argument type: bool - default: 'False' + default: False bool_true_default_arg: description: default bool argument type: bool - default: 'True' + default: True list_default_arg: description: default list argument type: list - default: '["foo", "bar"]' + default: ["foo", "bar"] dict_default_arg: description: default dict argument type: dict - default: '{"foo":1, "bar":2}' + default: {"foo":1, "bar":2} string_default_arg_none: description: default string argument type: str @@ -38,31 +38,31 @@ args: integer_default_arg_none: description: default integer argument type: int - default: None + default: 0 float_default_arg_none: description: default float argument type: float - default: None + default: 0.0 bool_default_arg_none: description: default bool argument type: bool - default: None + default: False list_default_arg_none: description: default list argument type: list - default: None + default: [] dict_default_arg_none: description: default dict argument type: dict - 
default: None + default: {} override_default_arg: description: argument with default python value type that can be overriden type: str default: foo - override_default_none_arg: - description: argument with default None value type that can be overriden with a valid python type - type: float - default: None override_default_arg_with_none: description: argument with default python type that can be overriden with None type: str + optional_arg: + description: optional argument + type: str + default: None diff --git a/tests/example_specs/components/arguments/component_default_args.yaml b/tests/example_specs/components/arguments/component_default_args.yaml index 2d582fbfe..816211c04 100644 --- a/tests/example_specs/components/arguments/component_default_args.yaml +++ b/tests/example_specs/components/arguments/component_default_args.yaml @@ -18,19 +18,19 @@ args: bool_false_default_arg: description: default bool argument type: bool - default: 'False' + default: False bool_true_default_arg: description: default bool argument type: bool - default: 'True' + default: True list_default_arg: description: default list argument type: list - default: '["foo", "bar"]' + default: ["foo", "bar"] dict_default_arg: description: default dict argument type: dict - default: '{"foo":1, "bar":2}' + default: {"foo":1, "bar":2} string_default_arg_none: description: default string argument type: str diff --git a/tests/test_cli.py b/tests/test_cli.py index b8326f148..c3ba577e7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -143,7 +143,9 @@ def test_local_logic(tmp_path_factory): def test_kfp_compile(tmp_path_factory): - with tmp_path_factory.mktemp("temp") as fn: + with tmp_path_factory.mktemp("temp") as fn, patch( + "fondant.compiler.KubeFlowCompiler.compile", + ) as mock_compiler: args = argparse.Namespace( ref=__name__, kubeflow=True, @@ -151,6 +153,10 @@ def test_kfp_compile(tmp_path_factory): output_path=str(fn / "kubeflow_pipelines.yml"), ) compile_kfp(args) + mock_compiler.assert_called_once_with( + pipeline=TEST_PIPELINE, + output_path=str(fn / "kubeflow_pipelines.yml"), + ) def test_local_run(tmp_path_factory): @@ -220,9 +226,12 @@ def test_kfp_run(tmp_path_factory): ) run_kfp(args) mock_runner.assert_called_once_with(host="localhost") - with patch("fondant.cli.KubeflowRunner") as mock_runner, tmp_path_factory.mktemp( + with patch("fondant.cli.KubeflowRunner") as mock_runner, patch( + "fondant.cli.KubeFlowCompiler", + ) as mock_compiler, tmp_path_factory.mktemp( "temp", ) as fn: + mock_compiler.compile.return_value = "some/path" args = argparse.Namespace( kubeflow=True, local=False, diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 4d77a0937..2de64b88c 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -6,7 +6,8 @@ import pytest import yaml -from fondant.compiler import DockerCompiler, KubeFlowCompiler +from fondant.compiler import DockerCompiler, KubeFlowCompiler, VertexCompiler +from fondant.exceptions import InvalidPipelineDefinition from fondant.pipeline import ComponentOp, Pipeline COMPONENTS_PATH = Path("./tests/example_pipelines/valid_pipeline") @@ -21,8 +22,7 @@ "component_op": ComponentOp( Path(COMPONENTS_PATH / "example_1" / "first_component"), arguments={"storage_args": "a dummy string arg"}, - input_partition_rows="disable", - number_of_gpus=1, + input_partition_rows=10, preemptible=True, memory_limit="512M", memory_request="256M", @@ -33,7 +33,7 @@ "component_op": ComponentOp( Path(COMPONENTS_PATH / "example_1" / "second_component"), 
arguments={"storage_args": "a dummy string arg"}, - input_partition_rows="10", + input_partition_rows=10, ), "cache_key": "2", }, @@ -87,7 +87,7 @@ def now(cls): @pytest.fixture(params=TEST_PIPELINES) def setup_pipeline(request, tmp_path, monkeypatch): pipeline = Pipeline( - pipeline_name="test_pipeline", + pipeline_name="testpipeline", pipeline_description="description of the test pipeline", base_path="/foo/bar", ) @@ -144,7 +144,7 @@ def test_docker_local_path(setup_pipeline, tmp_path_factory): with open(fn / "docker-compose.yml") as f_spec: spec = yaml.safe_load(f_spec) - expected_run_id = "test_pipeline-20230101000000" + expected_run_id = "testpipeline-20230101000000" for name, service in spec["services"].items(): # check if volumes are defined correctly @@ -156,10 +156,11 @@ def test_docker_local_path(setup_pipeline, tmp_path_factory): "type": "bind", }, ] + cleaned_pipeline_name = pipeline.name.replace("_", "") # check if commands are patched to use the working dir commands_with_dir = [ - f"{work_dir}/{pipeline.name}/{expected_run_id}/{name}/manifest.json", - f'{{"base_path": "{work_dir}", "pipeline_name": "{pipeline.name}",' + f"{work_dir}/{cleaned_pipeline_name}/{expected_run_id}/{name}/manifest.json", + f'{{"base_path": "{work_dir}", "pipeline_name": "{cleaned_pipeline_name}",' f' "run_id": "{expected_run_id}", "component_id": "{name}",' f' "cache_key": "{cache_key}"}}', ] @@ -181,15 +182,16 @@ def test_docker_remote_path(setup_pipeline, tmp_path_factory): with open(fn / "docker-compose.yml") as f_spec: spec = yaml.safe_load(f_spec) - expected_run_id = "test_pipeline-20230101000000" + expected_run_id = "testpipeline-20230101000000" for name, service in spec["services"].items(): cache_key = cache_dict[name] # check that no volumes are created assert service["volumes"] == [] # check if commands are patched to use the remote dir + cleaned_pipeline_name = pipeline.name.replace("_", "") commands_with_dir = [ - f"{remote_dir}/{pipeline.name}/{expected_run_id}/{name}/manifest.json", - f'{{"base_path": "{remote_dir}", "pipeline_name": "{pipeline.name}",' + f"{remote_dir}/{cleaned_pipeline_name}/{expected_run_id}/{name}/manifest.json", + f'{{"base_path": "{remote_dir}", "pipeline_name": "{cleaned_pipeline_name}",' f' "run_id": "{expected_run_id}", "component_id": "{name}",' f' "cache_key": "{cache_key}"}}', ] @@ -222,6 +224,68 @@ def test_docker_extra_volumes(setup_pipeline, tmp_path_factory): ) +@pytest.mark.usefixtures("_freeze_time") +def test_docker_configuration(tmp_path_factory): + """Test that extra volumes are applied correctly.""" + pipeline = Pipeline( + pipeline_name="test_pipeline", + pipeline_description="description of the test pipeline", + base_path="/foo/bar", + ) + component_1 = ComponentOp( + Path(COMPONENTS_PATH / "example_1" / "first_component"), + arguments={"storage_args": "a dummy string arg"}, + number_of_accelerators=1, + accelerator_name="GPU", + ) + + expected_resources = { + "reservations": { + "devices": [ + { + "capabilities": ["gpu"], + "count": 1, + "driver": "nvidia", + }, + ], + }, + } + + pipeline.add_op(component_1) + compiler = DockerCompiler() + with tmp_path_factory.mktemp("temp") as fn: + output_path = str(fn / "docker-compose.yaml") + compiler.compile(pipeline=pipeline, output_path=output_path) + # read the generated docker-compose file + with open(output_path) as f_spec: + spec = yaml.safe_load(f_spec) + assert ( + spec["services"]["first_component"]["deploy"]["resources"] + == expected_resources + ) + + +@pytest.mark.usefixtures("_freeze_time") 
+@pytest.mark.usefixtures("_freeze_time")
+def test_invalid_docker_configuration(tmp_path_factory):
+    """Test that an invalid accelerator name raises an InvalidPipelineDefinition."""
+    pipeline = Pipeline(
+        pipeline_name="test_pipeline",
+        pipeline_description="description of the test pipeline",
+        base_path="/foo/bar",
+    )
+    component_1 = ComponentOp(
+        Path(COMPONENTS_PATH / "example_1" / "first_component"),
+        arguments={"storage_args": "a dummy string arg"},
+        number_of_accelerators=1,
+        accelerator_name="unknown resource",
+    )
+
+    pipeline.add_op(component_1)
+    compiler = DockerCompiler()
+    with pytest.raises(InvalidPipelineDefinition):
+        compiler.compile(pipeline=pipeline, output_path="kubeflow_pipeline.yml")
+
+
 @pytest.mark.usefixtures("_freeze_time")
 def test_kubeflow_compiler(setup_pipeline, tmp_path_factory):
     """Test compiling a pipeline to kubeflow."""
@@ -239,6 +303,9 @@ def test_kubeflow_compiler(setup_pipeline, tmp_path_factory):
 @pytest.mark.usefixtures("_freeze_time")
 def test_kubeflow_configuration(tmp_path_factory):
     """Test that the kubeflow pipeline can be configured."""
+    node_pool_label = "dummy_label"
+    node_pool_name = "dummy_name"
+
     pipeline = Pipeline(
         pipeline_name="test_pipeline",
         pipeline_description="description of the test pipeline",
@@ -247,19 +314,57 @@ def test_kubeflow_configuration(tmp_path_factory):
     component_1 = ComponentOp(
         Path(COMPONENTS_PATH / "example_1" / "first_component"),
         arguments={"storage_args": "a dummy string arg"},
-        node_pool_name="a_node_pool",
-        node_pool_label="a_node_pool_label",
-        number_of_gpus=1,
+        node_pool_label=node_pool_label,
+        node_pool_name=node_pool_name,
+        number_of_accelerators=1,
+        accelerator_name="GPU",
     )
     pipeline.add_op(component_1)
     compiler = KubeFlowCompiler()
     with tmp_path_factory.mktemp("temp") as fn:
         output_path = str(fn / "kubeflow_pipeline.yml")
         compiler.compile(pipeline=pipeline, output_path=output_path)
-        with open(output_path) as src, open(
-            VALID_PIPELINE / "kubeflow_pipeline.yml",
-        ) as truth:
-            assert yaml.safe_load(src) == yaml.safe_load(truth)
+        with open(output_path) as src:
+            # Two specs are present and loaded in the yaml file (component spec and k8s specs)
+            compiled_specs = yaml.load_all(src, Loader=yaml.FullLoader)
+            for spec in compiled_specs:
+                if "platforms" in spec:
+                    component_kubernetes_spec = spec["platforms"]["kubernetes"][
+                        "deploymentSpec"
+                    ]["executors"]["exec-first-component"]
+                    assert component_kubernetes_spec["nodeSelector"]["labels"] == {
+                        node_pool_label: node_pool_name,
+                    }
+
+                elif "deploymentSpec" in spec:
+                    component_resources = spec["deploymentSpec"]["executors"][
+                        "exec-first-component"
+                    ]["container"]["resources"]
+                    assert component_resources["accelerator"]["count"] == "1"
+                    assert (
+                        component_resources["accelerator"]["type"] == "nvidia.com/gpu"
+                    )
+
+
+@pytest.mark.usefixtures("_freeze_time")
+def test_invalid_kubeflow_configuration(tmp_path_factory):
+    """Test that an error is returned when an invalid resource is provided."""
+    pipeline = Pipeline(
+        pipeline_name="test_pipeline",
+        pipeline_description="description of the test pipeline",
+        base_path="/foo/bar",
+    )
+    component_1 = ComponentOp(
+        Path(COMPONENTS_PATH / "example_1" / "first_component"),
+        arguments={"storage_args": "a dummy string arg"},
+        number_of_accelerators=1,
+        accelerator_name="unknown resource",
+    )
+
+    pipeline.add_op(component_1)
+    compiler = KubeFlowCompiler()
+    with pytest.raises(InvalidPipelineDefinition):
+        compiler.compile(pipeline=pipeline, output_path="kubeflow_pipeline.yml")


 def test_kfp_import():
@@ -271,6 +376,71 @@
             _ = KubeFlowCompiler()
+
+
+@pytest.mark.usefixtures("_freeze_time")
+def test_vertex_compiler(setup_pipeline, tmp_path_factory):
+    """Test compiling a pipeline to vertex."""
+    example_dir, pipeline, _ = setup_pipeline
+    compiler = VertexCompiler()
+    with tmp_path_factory.mktemp("temp") as fn:
+        output_path = str(fn / "vertex_pipeline.json")
+        compiler.compile(pipeline=pipeline, output_path=output_path)
+        with open(output_path) as src, open(
+            VALID_PIPELINE / example_dir / "vertex_pipeline.yml",
+        ) as truth:
+            assert yaml.safe_load(src) == yaml.safe_load(truth)
+
+
+@pytest.mark.usefixtures("_freeze_time")
+def test_vertex_configuration(tmp_path_factory):
+    """Test that the vertex pipeline can be configured."""
+    pipeline = Pipeline(
+        pipeline_name="test_pipeline",
+        pipeline_description="description of the test pipeline",
+        base_path="/foo/bar",
+    )
+    component_1 = ComponentOp(
+        Path(COMPONENTS_PATH / "example_1" / "first_component"),
+        arguments={"storage_args": "a dummy string arg"},
+        number_of_accelerators=1,
+        accelerator_name="NVIDIA_TESLA_K80",
+    )
+    pipeline.add_op(component_1)
+    compiler = VertexCompiler()
+    with tmp_path_factory.mktemp("temp") as fn:
+        output_path = str(fn / "vertex_pipeline.yml")
+        compiler.compile(pipeline=pipeline, output_path=output_path)
+        with open(output_path) as src:
+            compiled_specs = yaml.safe_load(src)
+
+        component_resources = compiled_specs["deploymentSpec"]["executors"][
+            "exec-first-component"
+        ]["container"]["resources"]
+        assert component_resources["accelerator"]["count"] == "1"
+        assert component_resources["accelerator"]["type"] == "NVIDIA_TESLA_K80"
+
+
+@pytest.mark.usefixtures("_freeze_time")
+def test_invalid_vertex_configuration(tmp_path_factory):
+    """Test that an invalid accelerator name raises an InvalidPipelineDefinition."""
+    pipeline = Pipeline(
+        pipeline_name="test_pipeline",
+        pipeline_description="description of the test pipeline",
+        base_path="/foo/bar",
+    )
+    component_1 = ComponentOp(
+        Path(COMPONENTS_PATH / "example_1" / "first_component"),
+        arguments={"storage_args": "a dummy string arg"},
+        number_of_accelerators=1,
+        accelerator_name="unknown resource",
+    )
+
+    pipeline.add_op(component_1)
+    compiler = VertexCompiler()
+    with pytest.raises(InvalidPipelineDefinition):
+        compiler.compile(pipeline=pipeline, output_path="kubeflow_pipeline.yml")
+
+
 def test_caching_dependency_docker(tmp_path_factory):
     """Test that the component cache key changes when a depending
     component cache key change for the docker compiler.
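The hunk that follows updates test_caching_dependency_kfp for the KFP v2 IR, where per-task arguments no longer appear in container commands but under root.dag.tasks, and the Fondant metadata (including the cache key) travels as a JSON string constant. A standalone sketch of the lookup the updated test performs, assuming a compiled spec on disk (the file name is illustrative):

    import json

    import yaml

    with open("kubeflow_pipeline.yml") as src:  # illustrative path
        spec = yaml.safe_load(src)

    # Navigate the KFP v2 IR to the task's metadata parameter, then parse
    # the JSON constant to recover the cache key.
    params = spec["root"]["dag"]["tasks"]["second-component"]["inputs"]["parameters"]
    metadata = json.loads(params["metadata"]["runtimeValue"]["constant"])
    cache_key = metadata["cache_key"]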
@@ -348,10 +518,11 @@ def test_caching_dependency_kfp(tmp_path_factory): compiler.compile(pipeline=pipeline, output_path=output_path) with open(output_path) as src: spec = yaml.safe_load(src) - commands = spec["spec"]["templates"][1]["container"]["command"] - cache_key = json.loads(commands[commands.index("--metadata") + 1])[ - "cache_key" + params = spec["root"]["dag"]["tasks"]["second-component"]["inputs"][ + "parameters" ] + metadata = params["metadata"]["runtimeValue"]["constant"] + cache_key = json.loads(metadata)["cache_key"] second_component_cache_key_dict[arg] = cache_key assert ( diff --git a/tests/test_component.py b/tests/test_component.py index 75169e93b..215386d37 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -16,11 +16,7 @@ ) from fondant.component_spec import ComponentSpec from fondant.data_io import DaskDataLoader, DaskDataWriter -from fondant.executor import ( - Executor, - ExecutorFactory, - PandasTransformExecutor, -) +from fondant.executor import Executor, ExecutorFactory, PandasTransformExecutor from fondant.manifest import Manifest, Metadata components_path = Path(__file__).parent / "example_specs/components" @@ -123,22 +119,22 @@ def _process_dataset(self, manifest: Manifest) -> t.Union[None, dd.DataFrame]: assert executor.input_partition_rows == expected_partition_row_arg assert executor.cache is True assert executor.user_arguments == { - "string_default_arg": "foo", "integer_default_arg": 0, "float_default_arg": 3.14, "bool_false_default_arg": False, "bool_true_default_arg": True, "list_default_arg": ["foo", "bar"], "dict_default_arg": {"foo": 1, "bar": 2}, + "string_default_arg": "foo", "string_default_arg_none": None, - "integer_default_arg_none": None, - "float_default_arg_none": None, - "bool_default_arg_none": None, - "list_default_arg_none": None, - "dict_default_arg_none": None, + "integer_default_arg_none": 0, + "float_default_arg_none": 0.0, + "bool_default_arg_none": False, + "list_default_arg_none": [], + "dict_default_arg_none": {}, "override_default_arg": "bar", - "override_default_none_arg": 3.14, "override_default_arg_with_none": None, + "optional_arg": None, } @@ -286,7 +282,7 @@ def test_dask_transform_component(metadata): "--value", "1", "--input_partition_rows", - "disable", + "10", "--output_manifest_path", str(components_path / "output_manifest.json"), "--component_spec", @@ -308,7 +304,8 @@ def transform(self, dataframe): executor_factory = ExecutorFactory(MyDaskComponent) executor = executor_factory.get_executor() - assert executor.input_partition_rows == "disable" + expected_input_partition_rows = 10 + assert executor.input_partition_rows == expected_input_partition_rows transform = patch_method_class(MyDaskComponent.transform) with mock.patch.object( MyDaskComponent, diff --git a/tests/test_component_specs.py b/tests/test_component_specs.py index 56499515d..0b0909a77 100644 --- a/tests/test_component_specs.py +++ b/tests/test_component_specs.py @@ -80,7 +80,7 @@ def test_component_spec_no_args(valid_fondant_schema_no_args): assert fondant_component.name == "Example component" assert fondant_component.description == "This is an example component" - assert fondant_component.args == {} + assert fondant_component.args == fondant_component.default_arguments def test_component_spec_to_file(valid_fondant_schema): diff --git a/tests/test_data_io.py b/tests/test_data_io.py index 5b63e3292..56e0bad04 100644 --- a/tests/test_data_io.py +++ b/tests/test_data_io.py @@ -89,18 +89,6 @@ def test_load_dataframe_rows(manifest, 
component_spec): assert dataframe.npartitions == expected_partitions -def test_load_dataframe_disable(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" - dl = DaskDataLoader( - manifest=manifest, - component_spec=component_spec, - input_partition_rows="disable", - ) - dataframe = dl.load_dataframe() - expected_partitions = 3 # original partitions - assert dataframe.npartitions == expected_partitions - - def test_write_index( tmp_path_factory, dataframe, @@ -128,7 +116,6 @@ def test_write_index( ) -# def test_write_subsets( tmp_path_factory, dataframe, diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index c674ad2a2..0acad2aa2 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -53,6 +53,13 @@ def test_component_op( node_pool_label="dummy_label", ) + with pytest.raises(InvalidPipelineDefinition): + ComponentOp( + Path(components_path / component_names[0]), + arguments=component_args, + number_of_accelerators=1, + ) + @pytest.mark.parametrize( "valid_pipeline_example", diff --git a/tests/test_runner.py b/tests/test_runner.py index 975359db3..a8afee28e 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -4,7 +4,7 @@ from unittest import mock import pytest -from fondant.runner import DockerRunner, KubeflowRunner +from fondant.runner import DockerRunner, KubeflowRunner, VertexRunner VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") @@ -31,7 +31,7 @@ def test_docker_runner(): class MockKfpClient: def __init__(self, host): self.host = host - self._experiments = {"Default": SimpleNamespace(id="123")} + self._experiments = {"Default": SimpleNamespace(experiment_id="123")} def get_experiment(self, experiment_name): try: @@ -40,11 +40,11 @@ def get_experiment(self, experiment_name): raise ValueError def create_experiment(self, experiment_name): - self._experiments[experiment_name] = SimpleNamespace(id="456") + self._experiments[experiment_name] = SimpleNamespace(experiment_id="456") return self.get_experiment(experiment_name) def run_pipeline(self, experiment_id, job_name, pipeline_package_path): - return SimpleNamespace(id="xyz") + return SimpleNamespace(run_id="xyz") def test_kubeflow_runner(): @@ -79,3 +79,20 @@ def test_kfp_import(): sys.modules["kfp"] = None with pytest.raises(ImportError): _ = KubeflowRunner(host="some_host") + + +def test_vertex_runner(): + input_spec_path = str(VALID_PIPELINE / "kubeflow_pipeline.yml") + with mock.patch("google.cloud.aiplatform.init", return_value=None), mock.patch( + "google.cloud.aiplatform.PipelineJob", + ): + runner = VertexRunner(project_id="some_project", project_region="some_region") + runner.run(input_spec=input_spec_path) + + # test with service account + runner2 = VertexRunner( + project_id="some_project", + project_region="some_region", + service_account="some_account", + ) + runner2.run(input_spec=input_spec_path)
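Taken together, the new Vertex tests exercise a compile-then-run flow. A usage sketch assembled from the calls in this diff (project IDs, paths, and the pipeline arguments are the test placeholders; the tests themselves mock google.cloud.aiplatform rather than submitting a real job):

    from fondant.compiler import VertexCompiler
    from fondant.pipeline import Pipeline
    from fondant.runner import VertexRunner

    pipeline = Pipeline(
        pipeline_name="test_pipeline",
        pipeline_description="description of the test pipeline",
        base_path="/foo/bar",
    )

    # Compile the pipeline to a Vertex-compatible spec, then submit it.
    compiler = VertexCompiler()
    compiler.compile(pipeline=pipeline, output_path="vertex_pipeline.yml")

    runner = VertexRunner(
        project_id="some_project",
        project_region="some_region",
        service_account="some_account",  # optional, as in test_vertex_runner
    )
    runner.run(input_spec="vertex_pipeline.yml")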