From b4fe222601fa692a1c5e3779ef9702f4eb1f2728 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 23 Nov 2023 10:37:32 +0100 Subject: [PATCH 1/4] Update core package (#653) First PR related to the data structure redesign. Implements the following: - New manifest structure (including validation, and evolution) - New ComponentSpec structure (including validation) - Removes `Subsets` and `Index` Not all tests are running successfully. But these are already quite a few changes. Therefore, I've created a PR on feature branch `feature/redesign-dataset-format-and-interface`, to have quicker feedback loops. --------- Co-authored-by: Robbe Sneyders Co-authored-by: Philippe Moussalli --- src/fondant/core/component_spec.py | 58 +---- src/fondant/core/manifest.py | 239 ++++++++--------- src/fondant/core/schema.py | 31 ++- src/fondant/core/schemas/component_spec.json | 32 +-- src/fondant/core/schemas/manifest.json | 17 +- .../component_specs/invalid_component.yaml} | 10 +- .../component_specs/kubeflow_component.yaml | 0 .../component_specs/valid_component.yaml} | 21 +- .../valid_component_no_args.yaml | 13 +- .../evolution_examples/1/component.yaml} | 14 +- .../evolution_examples/1/output_manifest.json | 36 +++ .../evolution_examples/2}/component.yaml | 10 +- .../evolution_examples/2/output_manifest.json | 33 +++ .../evolution_examples/3/component.yaml | 16 ++ .../evolution_examples/3/output_manifest.json | 29 +++ .../evolution_examples/4/component.yaml | 12 + .../evolution_examples/4/output_manifest.json | 29 +++ .../evolution_examples/input_manifest.json | 29 +++ .../examples/manifests/invalid_manifest.json | 14 + .../examples/manifests/valid_manifest.json | 29 +++ tests/{ => core}/test_component_specs.py | 28 +- tests/core/test_manifest.py | 246 ++++++++++++++++++ tests/{ => core}/test_manifest_evolution.py | 9 +- tests/{ => core}/test_schema.py | 0 .../component_specs/valid_component.yaml | 29 --- .../components/input_manifest.json | 22 -- 
.../evolution_examples/1/output_manifest.json | 46 ---- .../evolution_examples/2/component.yaml | 23 -- .../evolution_examples/2/output_manifest.json | 38 --- .../evolution_examples/3/component.yaml | 24 -- .../evolution_examples/3/output_manifest.json | 32 --- .../evolution_examples/4/output_manifest.json | 38 --- .../evolution_examples/5/component.yaml | 21 -- .../evolution_examples/5/output_manifest.json | 29 --- .../evolution_examples/6/component.yaml | 22 -- .../evolution_examples/6/output_manifest.json | 21 -- .../evolution_examples/7/component.yaml | 22 -- .../evolution_examples/7/output_manifest.json | 21 -- .../evolution_examples/8/output_manifest.json | 35 --- .../evolution_examples/input_manifest.json | 35 --- .../manifests/invalid_manifest.json | 14 - .../manifests/valid_manifest.json | 35 --- .../component_1/manifest.json | 36 --- .../example_component/Dockerfile | 0 .../example_component/fondant_component.yaml | 0 .../example_data/components/1.yaml | 0 .../{ => examples}/example_data/manifest.json | 0 .../{ => examples}/example_data/raw/split.py | 0 .../example_data/raw/testset.parquet | Bin .../subsets_input/index/part.0.parquet | Bin .../subsets_input/index/part.1.parquet | Bin .../subsets_input/index/part.2.parquet | Bin .../subsets_input/properties/part.0.parquet | Bin .../subsets_input/properties/part.1.parquet | Bin .../subsets_input/properties/part.2.parquet | Bin .../subsets_input/types/part.0.parquet | Bin .../subsets_input/types/part.1.parquet | Bin .../subsets_input/types/part.2.parquet | Bin .../example_modules/component.py | 0 .../example_modules/invalid_component.py | 0 .../invalid_double_components.py | 0 .../invalid_double_pipeline.py | 0 .../example_modules/pipeline.py | 0 .../compiled_pipeline/kubeflow_pipeline.yml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 
.../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../example_1/first_component/Dockerfile | 0 .../first_component/fondant_component.yaml | 0 .../example_1/fourth_component/Dockerfile | 0 .../fourth_component/fondant_component.yaml | 0 .../example_1/second_component/Dockerfile | 0 .../second_component/fondant_component.yaml | 0 .../example_1/third_component/Dockerfile | 0 .../third_component/fondant_component.yaml | 0 .../components/arguments/component.yaml | 0 .../arguments/component_default_args.yaml | 0 .../components/arguments/input_manifest.json | 14 +- .../example_specs/components/component.yaml | 18 +- .../components/input_manifest.json | 17 ++ .../example_pipeline/cache/42.txt | 0 .../component_1/manifest.json | 31 +++ .../component_2/manifest.json | 0 .../component_1/manifest.json | 0 .../component_2/manifest.json | 0 tests/test_component.py | 44 +--- tests/test_manifest.py | 239 ----------------- 90 files changed, 745 insertions(+), 1116 deletions(-) rename tests/{example_specs/evolution_examples/4/component.yaml => core/examples/component_specs/invalid_component.yaml} (84%) rename tests/{example_specs => core/examples}/component_specs/kubeflow_component.yaml (100%) rename tests/{example_specs/evolution_examples/1/component.yaml => core/examples/component_specs/valid_component.yaml} (62%) rename tests/{example_specs => core/examples}/component_specs/valid_component_no_args.yaml (59%) rename tests/{example_specs/component_specs/invalid_component.yaml => core/examples/evolution_examples/1/component.yaml} (59%) create mode 100644 tests/core/examples/evolution_examples/1/output_manifest.json rename tests/{example_specs/evolution_examples/8 => core/examples/evolution_examples/2}/component.yaml (69%) create mode 100644 tests/core/examples/evolution_examples/2/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/3/component.yaml create mode 100644 
tests/core/examples/evolution_examples/3/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/4/component.yaml create mode 100644 tests/core/examples/evolution_examples/4/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/input_manifest.json create mode 100644 tests/core/examples/manifests/invalid_manifest.json create mode 100644 tests/core/examples/manifests/valid_manifest.json rename tests/{ => core}/test_component_specs.py (85%) create mode 100644 tests/core/test_manifest.py rename tests/{ => core}/test_manifest_evolution.py (83%) rename tests/{ => core}/test_schema.py (100%) delete mode 100644 tests/example_specs/component_specs/valid_component.yaml delete mode 100644 tests/example_specs/components/input_manifest.json delete mode 100644 tests/example_specs/evolution_examples/1/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/2/component.yaml delete mode 100644 tests/example_specs/evolution_examples/2/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/3/component.yaml delete mode 100644 tests/example_specs/evolution_examples/3/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/4/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/5/component.yaml delete mode 100644 tests/example_specs/evolution_examples/5/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/6/component.yaml delete mode 100644 tests/example_specs/evolution_examples/6/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/7/component.yaml delete mode 100644 tests/example_specs/evolution_examples/7/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/8/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/input_manifest.json delete mode 100644 tests/example_specs/manifests/invalid_manifest.json delete mode 100644 
tests/example_specs/manifests/valid_manifest.json delete mode 100644 tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename tests/{ => examples}/example_component/Dockerfile (100%) rename tests/{ => examples}/example_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_data/components/1.yaml (100%) rename tests/{ => examples}/example_data/manifest.json (100%) rename tests/{ => examples}/example_data/raw/split.py (100%) rename tests/{ => examples}/example_data/raw/testset.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.0.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.2.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.0.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.2.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.0.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.2.parquet (100%) rename tests/{ => examples}/example_modules/component.py (100%) rename tests/{ => examples}/example_modules/invalid_component.py (100%) rename tests/{ => examples}/example_modules/invalid_double_components.py (100%) rename tests/{ => examples}/example_modules/invalid_double_pipeline.py (100%) rename tests/{ => examples}/example_modules/pipeline.py (100%) rename tests/{ => examples}/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml (100%) rename tests/{ => 
examples}/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_specs/components/arguments/component.yaml (100%) rename tests/{ => examples}/example_specs/components/arguments/component_default_args.yaml (100%) rename tests/{ => examples}/example_specs/components/arguments/input_manifest.json (60%) rename tests/{ => examples}/example_specs/components/component.yaml (56%) create mode 100644 tests/examples/example_specs/components/input_manifest.json rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/cache/42.txt (100%) create 
mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json (100%) rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json (100%) rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json (100%) delete mode 100644 tests/test_manifest.py diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index cf177e07c..4dd945568 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -66,34 +66,6 @@ def kubeflow_type(self) -> str: return lookup[self.type] -class ComponentSubset: - """ - Class representing a Fondant Component subset. - - Args: - specification: the part of the component json representing the subset - """ - - def __init__(self, specification: t.Dict[str, t.Any]) -> None: - self._specification = specification - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - @property - def fields(self) -> t.Mapping[str, Field]: - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - @property - def additional_fields(self) -> bool: - return self._specification.get("additionalFields", True) - - class ComponentSpec: """ Class representing a Fondant component specification. 
@@ -190,39 +162,25 @@ def tags(self) -> t.List[str]: return self._specification.get("tags", None) @property - def index(self): - return ComponentSubset({"fields": {}}) - - @property - def consumes(self) -> t.Mapping[str, ComponentSubset]: - """The subsets consumed by the component as an immutable mapping.""" + def consumes(self) -> t.Mapping[str, Field]: + """The fields consumed by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in self._specification.get("consumes", {}).items() - if name != "additionalSubsets" + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification.get("consumes", {}).items() }, ) @property - def produces(self) -> t.Mapping[str, ComponentSubset]: - """The subsets produced by the component as an immutable mapping.""" + def produces(self) -> t.Mapping[str, Field]: + """The fields produced by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in self._specification.get("produces", {}).items() - if name != "additionalSubsets" + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification.get("produces", {}).items() }, ) - @property - def accepts_additional_subsets(self) -> bool: - return self._specification.get("consumes", {}).get("additionalSubsets", True) - - @property - def outputs_additional_subsets(self) -> bool: - return self._specification.get("produces", {}).get("additionalSubsets", True) - @property def args(self) -> t.Mapping[str, Argument]: args = self.default_arguments diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 692c4e7cd..fc750620d 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -4,6 +4,7 @@ import pkgutil import types import typing as t +from collections import OrderedDict from dataclasses import asdict, dataclass from pathlib import Path @@ -18,59 +19,6 
@@ from fondant.core.schema import Field, Type -class Subset: - """ - Class representing a Fondant subset. - - Args: - specification: The part of the manifest json representing the subset - base_path: The base path which the subset location is defined relative to - """ - - def __init__(self, specification: dict, *, base_path: str) -> None: - self._specification = specification - self._base_path = base_path - - @property - def location(self) -> str: - """The absolute location of the subset.""" - return self._base_path + self._specification["location"] - - @property - def fields(self) -> t.Mapping[str, Field]: - """The fields of the subset returned as an immutable mapping.""" - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - def add_field(self, name: str, type_: Type, *, overwrite: bool = False) -> None: - if not overwrite and name in self._specification["fields"]: - msg = f"A field with name {name} already exists" - raise ValueError(msg) - - self._specification["fields"][name] = type_.to_json() - - def remove_field(self, name: str) -> None: - del self._specification["fields"][name] - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - -class Index(Subset): - """Special case of a subset for the index, which has fixed fields.""" - - @property - def fields(self) -> t.Dict[str, Field]: - return { - "id": Field(name="id", type=Type("string")), - "source": Field(name="source", type=Type("string")), - } - - @dataclass class Metadata: """ @@ -171,8 +119,8 @@ def create( specification = { "metadata": metadata.to_dict(), - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, - "subsets": {}, + "index": {"location": f"/{component_id}"}, + "fields": {}, } return cls(specification) @@ -196,6 +144,10 @@ def copy(self) -> "Manifest": def metadata(self) -> t.Dict[str, t.Any]: return 
self._specification["metadata"] + @property + def index(self) -> Field: + return Field(name="Index", location=self._specification["index"]["location"]) + def update_metadata(self, key: str, value: t.Any) -> None: self.metadata[key] = value @@ -203,6 +155,44 @@ def update_metadata(self, key: str, value: t.Any) -> None: def base_path(self) -> str: return self.metadata["base_path"] + @property + def field_mapping(self) -> t.Mapping[str, t.List[str]]: + """ + Retrieve a mapping of field locations to corresponding field names. + A dictionary where keys are field locations and values are lists + of column names. + + The method returns an immutable OrderedDict where the first dict element contains the + location of the dataframe with the index. This allows an efficient left join operation. + + Example: + { + "/base_path/component_1": ["Name", "HP"], + "/base_path/component_2": ["Type 1", "Type 2"], + } + """ + field_mapping = {} + for field_name, field in {"id": self.index, **self.fields}.items(): + location = ( + f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" + ) + if location in field_mapping: + field_mapping[location].append(field_name) + else: + field_mapping[location] = [field_name] + + # Sort field mapping that the first dataset contains the index + sorted_keys = sorted( + field_mapping.keys(), + key=lambda key: "id" in field_mapping[key], + reverse=True, + ) + sorted_field_mapping = OrderedDict( + (key, field_mapping[key]) for key in sorted_keys + ) + + return types.MappingProxyType(sorted_field_mapping) + @property def run_id(self) -> str: return self.metadata["run_id"] @@ -220,39 +210,61 @@ def cache_key(self) -> str: return self.metadata["cache_key"] @property - def index(self) -> Index: - return Index(self._specification["index"], base_path=self.base_path) - - @property - def subsets(self) -> t.Mapping[str, Subset]: - """The subsets of the manifest as an immutable mapping.""" + def fields(self) -> t.Mapping[str, Field]: + """The fields 
of the manifest as an immutable mapping.""" return types.MappingProxyType( { - name: Subset(subset, base_path=self.base_path) - for name, subset in self._specification["subsets"].items() + name: Field( + name=name, + type=Type(field["type"]), + location=field["location"], + ) + for name, field in self._specification["fields"].items() }, ) - def add_subset( - self, - name: str, - fields: t.Iterable[t.Union[Field, t.Tuple[str, Type]]], - ) -> None: - if name in self._specification["subsets"]: - msg = f"A subset with name {name} already exists" + def add_or_update_field(self, field: Field, overwrite: bool = False): + """Add or update field to manifest.""" + if field.name == "index": + self._add_or_update_index(field, overwrite=True) + elif overwrite is False and field.name in self._specification["fields"]: + msg = ( + f"A field with name {field.name} already exists. Set overwrite to true, " + f"if you want to update the field." + ) + raise ValueError(msg) + else: + self._specification["fields"][field.name] = { + "location": f"/{self.component_id}", + **field.type.to_json(), + } + + def _add_or_update_index(self, field: Field, overwrite: bool = True): + """Add or update the manifest index.""" + if overwrite is False: + msg = ( + "The index already exists. Set overwrite to true, " + "if you want to update the index." + ) + raise ValueError(msg) + + if field.name != "index": + msg = ( + f"The field name is {field.name}. If you try to update the index, set the field" + f"name to `index`." 
+ ) raise ValueError(msg) - self._specification["subsets"][name] = { - "location": f"/{self.pipeline_name}/{self.run_id}/{self.component_id}/{name}", - "fields": {name: type_.to_json() for name, type_ in fields}, + self._specification["index"] = { + "location": f"/{field.location}", } - def remove_subset(self, name: str) -> None: - if name not in self._specification["subsets"]: - msg = f"Subset {name} not found in specification" + def remove_field(self, name: str) -> None: + if name not in self._specification["fields"]: + msg = f"Field {name} not found in specification" raise ValueError(msg) - del self._specification["subsets"][name] + del self._specification["fields"][name] def evolve( # noqa : PLR0912 (too many branches) self, @@ -274,68 +286,23 @@ def evolve( # noqa : PLR0912 (too many branches) # Update `component_id` of the metadata component_id = component_spec.component_folder_name evolved_manifest.update_metadata(key="component_id", value=component_id) + if run_id is not None: evolved_manifest.update_metadata(key="run_id", value=run_id) - # Update index location as this is currently always rewritten - evolved_manifest.index._specification[ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/index" - - # If additionalSubsets is False in consumes, - # Remove all subsets from the manifest that are not listed - if not component_spec.accepts_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.consumes: - evolved_manifest.remove_subset(subset_name) - - # If additionalSubsets is False in produces, - # Remove all subsets from the manifest that are not listed - if not component_spec.outputs_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.produces: - evolved_manifest.remove_subset(subset_name) - - # If additionalFields is False for a consumed subset, - # Remove all fields from that subset that are not listed - for 
subset_name, subset in component_spec.consumes.items(): - if subset_name in evolved_manifest.subsets and not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) - - # For each output subset defined in the component, add or update it - for subset_name, subset in component_spec.produces.items(): - # Subset is already in manifest, update it - if subset_name in evolved_manifest.subsets: - # If additional fields are not allowed, remove the fields not defined in the - # component spec produces section - if not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) - - # Add fields defined in the component spec produces section - # Overwrite to persist changes to the field (eg. type of column) - for field in subset.fields.values(): - evolved_manifest.subsets[subset_name].add_field( - field.name, - field.type, - overwrite=True, - ) - - # Update subset location as this is currently always rewritten - evolved_manifest.subsets[subset_name]._specification[ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/{subset_name}" - - # Subset is not yet in manifest, add it - else: - evolved_manifest.add_subset(subset_name, subset.fields.values()) + # Update index location as this is always rewritten + evolved_manifest.add_or_update_field( + Field(name="index", location=component_spec.component_folder_name), + ) + + # TODO handle additionalFields + + # Add or update all produced fields defined in the component spec + for name, field in component_spec.produces.items(): + # If field was not part of the input manifest, add field to output manifest. + # If field was part of the input manifest and got produced by the component, update + # the manifest field. 
+ evolved_manifest.add_or_update_field(field, overwrite=True) return evolved_manifest diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index ca9bb0944..dc940b5f7 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -5,6 +5,7 @@ import os import re import typing as t +from dataclasses import dataclass from enum import Enum import pyarrow as pa @@ -161,11 +162,33 @@ def __eq__(self, other): return False -class Field(t.NamedTuple): - """Class representing a single field or column in a Fondant subset.""" +class Field: + """Class representing a single field or column in a Fondant dataset.""" - name: str - type: Type + def __init__( + self, + name: str, + type: Type = None, + location: str = "", + ) -> None: + self._name = name + self._type = type + self._location = location + + @property + def name(self) -> str: + """The name of the field.""" + return self._name + + @property + def type(self) -> Type: + """The absolute location of the field.""" + return self._type + + @property + def location(self) -> str: + """The relative location of the field.""" + return self._location def validate_partition_size(arg_value): diff --git a/src/fondant/core/schemas/component_spec.json b/src/fondant/core/schemas/component_spec.json index 8d684a3e5..064ea027d 100644 --- a/src/fondant/core/schemas/component_spec.json +++ b/src/fondant/core/schemas/component_spec.json @@ -28,44 +28,16 @@ } }, "consumes": { - "$ref": "#/definitions/subsets" + "$ref": "common.json#/definitions/fields" }, "produces": { - "$ref": "#/definitions/subsets" + "$ref": "common.json#/definitions/fields" }, "args": { "$ref": "#/definitions/args" } }, "definitions": { - "subset": { - "type": "object", - "properties": { - "fields": { - "$ref": "common.json#/definitions/fields" - }, - "additionalFields": { - "type": "boolean", - "default": true - } - }, - "required": [ - "fields" - ] - }, - "subsets": { - "type": "object", - "properties": { - "additionalSubsets": { - "type": 
"boolean", - "default": true - } - }, - "minProperties": 1, - "additionalProperties": { - "$ref": "#/definitions/subset" - } - }, "args": { "type": "object", "minProperties": 1, diff --git a/src/fondant/core/schemas/manifest.json b/src/fondant/core/schemas/manifest.json index 00ad6d1cc..77365dd5f 100644 --- a/src/fondant/core/schemas/manifest.json +++ b/src/fondant/core/schemas/manifest.json @@ -37,36 +37,33 @@ "location" ] }, - "subsets": { - "$ref": "#/definitions/subsets" + "fields": { + "$ref": "#/definitions/fields" } }, "required": [ "metadata", "index", - "subsets" + "fields" ], "definitions": { - "subset": { + "field": { "type": "object", "properties": { "location": { "type": "string", "pattern": "/.*" - }, - "fields": { - "$ref": "common.json#/definitions/fields" } }, "required": [ "location", - "fields" + "type" ] }, - "subsets": { + "fields": { "type": "object", "additionalProperties": { - "$ref": "#/definitions/subset" + "$ref": "#/definitions/field" } } } diff --git a/tests/example_specs/evolution_examples/4/component.yaml b/tests/core/examples/component_specs/invalid_component.yaml similarity index 84% rename from tests/example_specs/evolution_examples/4/component.yaml rename to tests/core/examples/component_specs/invalid_component.yaml index 067b06da0..d1c88c444 100644 --- a/tests/example_specs/evolution_examples/4/component.yaml +++ b/tests/core/examples/component_specs/invalid_component.yaml @@ -7,14 +7,14 @@ consumes: fields: data: type: binary - + produces: - images: + captions: fields: - encoding: + data: type: string -args: +Arguments: storage_args: description: Storage arguments - type: str + type: str \ No newline at end of file diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/core/examples/component_specs/kubeflow_component.yaml similarity index 100% rename from tests/example_specs/component_specs/kubeflow_component.yaml rename to tests/core/examples/component_specs/kubeflow_component.yaml diff --git 
a/tests/example_specs/evolution_examples/1/component.yaml b/tests/core/examples/component_specs/valid_component.yaml similarity index 62% rename from tests/example_specs/evolution_examples/1/component.yaml rename to tests/core/examples/component_specs/valid_component.yaml index 22ae0feb1..1215af1bd 100644 --- a/tests/example_specs/evolution_examples/1/component.yaml +++ b/tests/core/examples/component_specs/valid_component.yaml @@ -1,20 +1,21 @@ name: Example component description: This is an example component image: example_component:latest +tags: + - Data loading consumes: images: - fields: - data: - type: binary - -produces: + type: binary + embeddings: - fields: - data: - type: array - items: - type: float32 + type: array + items: + type: float32 + +produces: + captions: + type: string args: storage_args: diff --git a/tests/example_specs/component_specs/valid_component_no_args.yaml b/tests/core/examples/component_specs/valid_component_no_args.yaml similarity index 59% rename from tests/example_specs/component_specs/valid_component_no_args.yaml rename to tests/core/examples/component_specs/valid_component_no_args.yaml index c3adfa6aa..de11cb2ee 100644 --- a/tests/example_specs/component_specs/valid_component_no_args.yaml +++ b/tests/core/examples/component_specs/valid_component_no_args.yaml @@ -4,12 +4,13 @@ image: example_component:latest consumes: images: - fields: - data: - type: binary + type: binary + + embeddings: + type: array + items: + type: float32 produces: captions: - fields: - data: - type: string \ No newline at end of file + type: string diff --git a/tests/example_specs/component_specs/invalid_component.yaml b/tests/core/examples/evolution_examples/1/component.yaml similarity index 59% rename from tests/example_specs/component_specs/invalid_component.yaml rename to tests/core/examples/evolution_examples/1/component.yaml index 3fc8128b5..e91ae6f46 100644 --- a/tests/example_specs/component_specs/invalid_component.yaml +++ 
b/tests/core/examples/evolution_examples/1/component.yaml @@ -3,14 +3,16 @@ description: This is an example component image: example_component:latest consumes: - images: - data: binary + images_data: + type: binary produces: - captions: - data: string + embeddings_data: + type: array + items: + type: float32 -Arguments: +args: storage_args: description: Storage arguments - type: str \ No newline at end of file + type: str diff --git a/tests/core/examples/evolution_examples/1/output_manifest.json b/tests/core/examples/evolution_examples/1/output_manifest.json new file mode 100644 index 000000000..2a73e5f29 --- /dev/null +++ b/tests/core/examples/evolution_examples/1/output_manifest.json @@ -0,0 +1,36 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + }, + "embeddings_data": { + "type": "array", + "items": { + "type": "float32" + }, + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/8/component.yaml b/tests/core/examples/evolution_examples/2/component.yaml similarity index 69% rename from tests/example_specs/evolution_examples/8/component.yaml rename to tests/core/examples/evolution_examples/2/component.yaml index 5c204b9c2..2352adcb5 100644 --- a/tests/example_specs/evolution_examples/8/component.yaml +++ b/tests/core/examples/evolution_examples/2/component.yaml @@ -3,10 +3,12 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + 
type: binary + +produces: + images_encoding: + type: string args: storage_args: diff --git a/tests/core/examples/evolution_examples/2/output_manifest.json b/tests/core/examples/evolution_examples/2/output_manifest.json new file mode 100644 index 000000000..ca1f6f361 --- /dev/null +++ b/tests/core/examples/evolution_examples/2/output_manifest.json @@ -0,0 +1,33 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + }, + "images_encoding": { + "type": "string", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/3/component.yaml b/tests/core/examples/evolution_examples/3/component.yaml new file mode 100644 index 000000000..13b1427b3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/component.yaml @@ -0,0 +1,16 @@ +name: Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +produces: + images_data: + type: string + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/3/output_manifest.json b/tests/core/examples/evolution_examples/3/output_manifest.json new file mode 100644 index 000000000..b11f7d8a3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component_1" + }, + "index":{ + 
"location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "string", + "location":"/example_component_1" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/4/component.yaml b/tests/core/examples/evolution_examples/4/component.yaml new file mode 100644 index 000000000..1b766036d --- /dev/null +++ b/tests/core/examples/evolution_examples/4/component.yaml @@ -0,0 +1,12 @@ +name: Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/4/output_manifest.json b/tests/core/examples/evolution_examples/4/output_manifest.json new file mode 100644 index 000000000..929c380ab --- /dev/null +++ b/tests/core/examples/evolution_examples/4/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component_1" + }, + "index":{ + "location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/input_manifest.json b/tests/core/examples/evolution_examples/input_manifest.json new file mode 100644 index 000000000..664367cc2 --- /dev/null +++ 
b/tests/core/examples/evolution_examples/input_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"12345", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/manifests/invalid_manifest.json b/tests/core/examples/manifests/invalid_manifest.json new file mode 100644 index 000000000..51ec6c5e5 --- /dev/null +++ b/tests/core/examples/manifests/invalid_manifest.json @@ -0,0 +1,14 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "gs://bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component1" + }, + "fields": { + "images": {} + } +} \ No newline at end of file diff --git a/tests/core/examples/manifests/valid_manifest.json b/tests/core/examples/manifests/valid_manifest.json new file mode 100644 index 000000000..0f7c58126 --- /dev/null +++ b/tests/core/examples/manifests/valid_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "gs://bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component1" + }, + "fields":{ + "images": { + "location": "/component1", + "type": "binary" + }, + "height": { + "location": "/component2", + "type": "int32" + }, + "width": { + "location": "/component2", + "type": "int32" + }, + "caption": { + "location": "/component3", + "type": "string" + } + } +} \ No newline at end of file diff --git a/tests/test_component_specs.py b/tests/core/test_component_specs.py 
similarity index 85% rename from tests/test_component_specs.py rename to tests/core/test_component_specs.py index caf0344de..dcbf4c2ed 100644 --- a/tests/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -8,13 +8,12 @@ import yaml from fondant.core.component_spec import ( ComponentSpec, - ComponentSubset, KubeflowComponentSpec, ) from fondant.core.exceptions import InvalidComponentSpec from fondant.core.schema import Type -component_specs_path = Path(__file__).parent / "example_specs/component_specs" +component_specs_path = Path(__file__).parent / "examples/component_specs" @pytest.fixture() @@ -49,12 +48,19 @@ def test_component_spec_pkgutil_error(mock_get_data): def test_component_spec_validation(valid_fondant_schema, invalid_fondant_schema): - """Test that the manifest is validated correctly on instantiation.""" + """Test that the component spec is validated correctly on instantiation.""" ComponentSpec(valid_fondant_schema) with pytest.raises(InvalidComponentSpec): ComponentSpec(invalid_fondant_schema) +def test_component_spec_load_from_file(valid_fondant_schema, invalid_fondant_schema): + """Test that the component spec is validated correctly on instantiation.""" + ComponentSpec.from_file(component_specs_path / "valid_component.yaml") + with pytest.raises(InvalidComponentSpec): + ComponentSpec.from_file(component_specs_path / "invalid_component.yaml") + + def test_attribute_access(valid_fondant_schema): """ Test that attributes can be accessed as expected: @@ -65,8 +71,8 @@ def test_attribute_access(valid_fondant_schema): assert fondant_component.name == "Example component" assert fondant_component.description == "This is an example component" - assert fondant_component.consumes["images"].fields["data"].type == Type("binary") - assert fondant_component.consumes["embeddings"].fields["data"].type == Type.list( + assert fondant_component.consumes["images"].type == Type("binary") + assert fondant_component.consumes["embeddings"].type == Type.list( 
Type("float32"), ) @@ -129,15 +135,3 @@ def test_kubeflow_component_spec_repr(valid_kubeflow_schema): kubeflow_component_spec = KubeflowComponentSpec(valid_kubeflow_schema) expected_repr = f"KubeflowComponentSpec({valid_kubeflow_schema!r})" assert repr(kubeflow_component_spec) == expected_repr - - -def test_component_subset_repr(): - """Test that the __repr__ method of ComponentSubset returns the expected string.""" - component_subset_schema = { - "name": "Example subset", - "description": "This is an example subset", - } - - component_subset = ComponentSubset(component_subset_schema) - expected_repr = f"ComponentSubset({component_subset_schema!r})" - assert repr(component_subset) == expected_repr diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py new file mode 100644 index 000000000..0b255b9df --- /dev/null +++ b/tests/core/test_manifest.py @@ -0,0 +1,246 @@ +import json +import pkgutil +from collections import OrderedDict +from pathlib import Path + +import pytest +from fondant.core.component_spec import ComponentSpec +from fondant.core.exceptions import InvalidManifest +from fondant.core.manifest import Field, Manifest, Type + +manifest_path = Path(__file__).parent / "examples" / "manifests" +component_specs_path = Path(__file__).parent / "examples" / "component_specs" + + +@pytest.fixture() +def valid_manifest(): + with open(manifest_path / "valid_manifest.json") as f: + return json.load(f) + + +@pytest.fixture() +def invalid_manifest(): + with open(manifest_path / "invalid_manifest.json") as f: + return json.load(f) + + +def test_manifest_validation(valid_manifest, invalid_manifest): + """Test that the manifest is validated correctly on instantiation.""" + Manifest(valid_manifest) + with pytest.raises(InvalidManifest): + Manifest(invalid_manifest) + + +def test_set_base_path(valid_manifest): + """Test altering the base path in the manifest.""" + manifest = Manifest(valid_manifest) + tmp_path = "/tmp/base_path" + 
manifest.update_metadata(key="base_path", value=tmp_path) + + assert manifest.base_path == tmp_path + assert manifest._specification["metadata"]["base_path"] == tmp_path + + +def test_from_to_file(valid_manifest): + """Test reading from and writing to file.""" + tmp_path = "/tmp/manifest.json" + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(valid_manifest, f) + + manifest = Manifest.from_file(tmp_path) + assert manifest.metadata == valid_manifest["metadata"] + + manifest.to_file(tmp_path) + with open(tmp_path, encoding="utf-8") as f: + assert json.load(f) == valid_manifest + + +def test_attribute_access(valid_manifest): + """ + Test that attributes can be accessed as expected: + - Fixed properties should be accessible as an attribute + - Dynamic properties should be accessible by lookup. + """ + manifest = Manifest(valid_manifest) + + assert manifest.metadata == valid_manifest["metadata"] + assert manifest.index.location == "/component1" + assert manifest.fields["images"].location == "/component1" + assert manifest.fields["images"].type == Type("binary") + + +def test_manifest_creation(): + """Test the stepwise creation of a manifest via the Manifest class.""" + base_path = "gs://bucket" + run_id = "run_id" + pipeline_name = "pipeline_name" + component_id = "component_id" + cache_key = "42" + + manifest = Manifest.create( + pipeline_name=pipeline_name, + base_path=base_path, + run_id=run_id, + component_id=component_id, + cache_key=cache_key, + ) + + manifest.add_or_update_field(Field(name="width", type=Type("int32"))) + manifest.add_or_update_field(Field(name="height", type=Type("int32"))) + manifest.add_or_update_field(Field(name="data", type=Type("binary"))) + + assert manifest._specification == { + "metadata": { + "pipeline_name": pipeline_name, + "base_path": base_path, + "run_id": run_id, + "component_id": component_id, + "cache_key": cache_key, + }, + "index": {"location": f"/{component_id}"}, + "fields": { + "width": { + "type": "int32", + 
"location": f"/{component_id}", + }, + "height": { + "type": "int32", + "location": f"/{component_id}", + }, + "data": { + "type": "binary", + "location": f"/{component_id}", + }, + }, + } + + +def test_manifest_repr(): + manifest = Manifest.create( + pipeline_name="NAME", + base_path="/", + run_id="A", + component_id="1", + cache_key="42", + ) + assert ( + manifest.__repr__() + == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A'," + " 'component_id': '1', 'cache_key': '42'}," + " 'index': {'location': '/1'}, 'fields': {}})" + ) + + +def test_manifest_alteration(valid_manifest): + """Test alteration functionalities of a manifest via the Manifest class.""" + manifest = Manifest(valid_manifest) + + # test adding a subset + manifest.add_or_update_field(Field(name="width2", type=Type("int32"))) + manifest.add_or_update_field(Field(name="height2", type=Type("int32"))) + + assert "width2" in manifest.fields + assert "height2" in manifest.fields + + # test adding a duplicate subset + with pytest.raises(ValueError, match="A field with name width2 already exists"): + manifest.add_or_update_field(Field(name="width2", type=Type("int32"))) + + # test removing a subset + manifest.remove_field("width2") + assert "images2" not in manifest.fields + + # test removing a nonexistant subset + with pytest.raises(ValueError, match="Field pictures not found in specification"): + manifest.remove_field("pictures") + + +def test_manifest_copy_and_adapt(valid_manifest): + """Test that a manifest can be copied and adapted without changing the original.""" + manifest = Manifest(valid_manifest) + new_manifest = manifest.copy() + new_manifest.remove_field("images") + assert manifest._specification == valid_manifest + assert new_manifest._specification != valid_manifest + + +def test_no_validate_schema(monkeypatch, valid_manifest): + monkeypatch.setattr(pkgutil, "get_data", lambda package, resource: None) + with pytest.raises(FileNotFoundError): + 
Manifest(valid_manifest) + + +def test_evolve_manifest(): + """Test that the fields are evolved as expected.""" + run_id = "A" + spec = ComponentSpec.from_file(component_specs_path / "valid_component.yaml") + input_manifest = Manifest.create( + pipeline_name="NAME", + base_path="/base_path", + run_id=run_id, + component_id="component_1", + cache_key="42", + ) + + output_manifest = input_manifest.evolve(component_spec=spec, run_id=run_id) + + assert output_manifest.base_path == input_manifest.base_path + assert output_manifest.run_id == run_id + assert output_manifest.index.location == "/" + spec.component_folder_name + assert output_manifest.fields["captions"].type.name == "string" + + +def test_fields(): + """Test that the fields can added and updated as expected.""" + run_id = "A" + manifest = Manifest.create( + pipeline_name="NAME", + base_path="/base_path", + run_id=run_id, + component_id="component_1", + cache_key="42", + ) + + # add a field + manifest.add_or_update_field(Field(name="field_1", type=Type("int32"))) + assert "field_1" in manifest.fields + + # add a duplicate field, but overwrite (update) + manifest.add_or_update_field( + Field(name="field_1", type=Type("string")), + overwrite=True, + ) + assert manifest.fields["field_1"].type.name == "string" + + # add duplicate field + with pytest.raises( + ValueError, + match="A field with name field_1 already exists. 
Set overwrite to true, " + "if you want to update the field.", + ): + manifest.add_or_update_field( + Field(name="field_1", type=Type("string")), + overwrite=False, + ) + + # delete a field + manifest.remove_field(name="field_1") + assert "field_1" not in manifest.fields + + +def test_field_mapping(valid_manifest): + """Test field mapping generation.""" + manifest = Manifest(valid_manifest) + manifest.add_or_update_field(Field(name="index", location="component2")) + field_mapping = manifest.field_mapping + assert field_mapping == OrderedDict( + { + "gs://bucket/test_pipeline/test_pipeline_12345/component2": [ + "id", + "height", + "width", + ], + "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], + "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], + }, + ) diff --git a/tests/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py similarity index 83% rename from tests/test_manifest_evolution.py rename to tests/core/test_manifest_evolution.py index c79b76aaf..0d9181701 100644 --- a/tests/test_manifest_evolution.py +++ b/tests/core/test_manifest_evolution.py @@ -6,7 +6,7 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -examples_path = Path(__file__).parent / "example_specs/evolution_examples" +examples_path = Path(__file__).parent / "examples/evolution_examples" @pytest.fixture() @@ -41,7 +41,7 @@ def test_component_spec_location_update(): with open(examples_path / "input_manifest.json") as f: input_manifest = json.load(f) - with open(examples_path / "7/component.yaml") as f: + with open(examples_path / "4/component.yaml") as f: specification = yaml.safe_load(f) manifest = Manifest(input_manifest) @@ -50,7 +50,4 @@ def test_component_spec_location_update(): component_spec=component_spec, ) - assert ( - evolved_manifest._specification["subsets"]["images"]["location"] - == "/test_pipeline/12345/example_component/images" - ) + assert evolved_manifest.index.location 
== "/" + component_spec.component_folder_name diff --git a/tests/test_schema.py b/tests/core/test_schema.py similarity index 100% rename from tests/test_schema.py rename to tests/core/test_schema.py diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/example_specs/component_specs/valid_component.yaml deleted file mode 100644 index c4b99e837..000000000 --- a/tests/example_specs/component_specs/valid_component.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest -tags: - - Data loading - -consumes: - images: - fields: - data: - type: binary - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - captions: - fields: - data: - type: string - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/example_specs/components/input_manifest.json b/tests/example_specs/components/input_manifest.json deleted file mode 100644 index 7af13d599..000000000 --- a/tests/example_specs/components/input_manifest.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "/bucket", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/index/12345/example_component" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - } - } - } - - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/1/output_manifest.json b/tests/example_specs/evolution_examples/1/output_manifest.json deleted file mode 100644 index 17b94c0b0..000000000 --- a/tests/example_specs/evolution_examples/1/output_manifest.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - 
"location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ - "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/2/component.yaml b/tests/example_specs/evolution_examples/2/component.yaml deleted file mode 100644 index f37ff99d1..000000000 --- a/tests/example_specs/evolution_examples/2/component.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - additionalSubsets: false - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/2/output_manifest.json b/tests/example_specs/evolution_examples/2/output_manifest.json deleted file mode 100644 index 3a40b1c9d..000000000 --- a/tests/example_specs/evolution_examples/2/output_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ 
- "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/3/component.yaml b/tests/example_specs/evolution_examples/3/component.yaml deleted file mode 100644 index 6753a083b..000000000 --- a/tests/example_specs/evolution_examples/3/component.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - additionalFields: false - additionalSubsets: false - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/3/output_manifest.json b/tests/example_specs/evolution_examples/3/output_manifest.json deleted file mode 100644 index a9abda6d0..000000000 --- a/tests/example_specs/evolution_examples/3/output_manifest.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ - "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/4/output_manifest.json b/tests/example_specs/evolution_examples/4/output_manifest.json deleted file mode 100644 index 24af4f2ac..000000000 --- a/tests/example_specs/evolution_examples/4/output_manifest.json +++ /dev/null @@ 
-1,38 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - }, - "encoding":{ - "type":"string" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/5/component.yaml b/tests/example_specs/evolution_examples/5/component.yaml deleted file mode 100644 index 93aaf68b3..000000000 --- a/tests/example_specs/evolution_examples/5/component.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - encoding: - type: string - additionalFields: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/5/output_manifest.json b/tests/example_specs/evolution_examples/5/output_manifest.json deleted file mode 100644 index 8bcf6141d..000000000 --- a/tests/example_specs/evolution_examples/5/output_manifest.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "encoding":{ - "type":"string" - } - } - }, - "captions":{ - 
"location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/6/component.yaml b/tests/example_specs/evolution_examples/6/component.yaml deleted file mode 100644 index 065061791..000000000 --- a/tests/example_specs/evolution_examples/6/component.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - encoding: - type: string - additionalFields: false - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/6/output_manifest.json b/tests/example_specs/evolution_examples/6/output_manifest.json deleted file mode 100644 index b7521bf66..000000000 --- a/tests/example_specs/evolution_examples/6/output_manifest.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "encoding":{ - "type":"string" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/7/component.yaml b/tests/example_specs/evolution_examples/7/component.yaml deleted file mode 100644 index 5746ffa4d..000000000 --- a/tests/example_specs/evolution_examples/7/component.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - data: - type: string - additionalFields: false - 
additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/7/output_manifest.json b/tests/example_specs/evolution_examples/7/output_manifest.json deleted file mode 100644 index a9eb8a308..000000000 --- a/tests/example_specs/evolution_examples/7/output_manifest.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "data":{ - "type":"string" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/8/output_manifest.json b/tests/example_specs/evolution_examples/8/output_manifest.json deleted file mode 100644 index de2621c49..000000000 --- a/tests/example_specs/evolution_examples/8/output_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "gs://bucket", - "run_id": "custom_run_id", - "component_id": "example_component" - }, - "index": { - "location": "/test_pipeline/custom_run_id/example_component/index" - }, - "subsets": { - "images": { - "location": "/test_pipeline/12345/example_component/images", - "fields": { - "width": { - "type": "int32" - }, - "height": { - "type": "int32" - }, - "data": { - "type": "binary" - } - } - }, - "captions": { - "location": "/test_pipeline/12345/example_component/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} diff --git a/tests/example_specs/evolution_examples/input_manifest.json b/tests/example_specs/evolution_examples/input_manifest.json deleted file mode 100644 index 2ecf37243..000000000 --- a/tests/example_specs/evolution_examples/input_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - 
"metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"12345", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/12345/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/manifests/invalid_manifest.json b/tests/example_specs/manifests/invalid_manifest.json deleted file mode 100644 index 3fe8b1097..000000000 --- a/tests/example_specs/manifests/invalid_manifest.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "metadata": { - "base_path": "gs://bucket" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": [] - } - } -} \ No newline at end of file diff --git a/tests/example_specs/manifests/valid_manifest.json b/tests/example_specs/manifests/valid_manifest.json deleted file mode 100644 index 9bc00c512..000000000 --- a/tests/example_specs/manifests/valid_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "gs://bucket", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json 
b/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json deleted file mode 100644 index 541775f84..000000000 --- a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2023", - "component_id": "component_1", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_component/Dockerfile b/tests/examples/example_component/Dockerfile similarity index 100% rename from tests/example_component/Dockerfile rename to tests/examples/example_component/Dockerfile diff --git a/tests/example_component/fondant_component.yaml b/tests/examples/example_component/fondant_component.yaml similarity index 100% rename from tests/example_component/fondant_component.yaml rename to tests/examples/example_component/fondant_component.yaml diff --git a/tests/example_data/components/1.yaml b/tests/examples/example_data/components/1.yaml similarity index 100% rename from tests/example_data/components/1.yaml rename to tests/examples/example_data/components/1.yaml diff --git a/tests/example_data/manifest.json b/tests/examples/example_data/manifest.json similarity index 100% rename from tests/example_data/manifest.json rename to tests/examples/example_data/manifest.json diff --git a/tests/example_data/raw/split.py b/tests/examples/example_data/raw/split.py similarity index 100% rename from tests/example_data/raw/split.py rename to tests/examples/example_data/raw/split.py diff 
--git a/tests/example_data/raw/testset.parquet b/tests/examples/example_data/raw/testset.parquet similarity index 100% rename from tests/example_data/raw/testset.parquet rename to tests/examples/example_data/raw/testset.parquet diff --git a/tests/example_data/subsets_input/index/part.0.parquet b/tests/examples/example_data/subsets_input/index/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.0.parquet rename to tests/examples/example_data/subsets_input/index/part.0.parquet diff --git a/tests/example_data/subsets_input/index/part.1.parquet b/tests/examples/example_data/subsets_input/index/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.1.parquet rename to tests/examples/example_data/subsets_input/index/part.1.parquet diff --git a/tests/example_data/subsets_input/index/part.2.parquet b/tests/examples/example_data/subsets_input/index/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.2.parquet rename to tests/examples/example_data/subsets_input/index/part.2.parquet diff --git a/tests/example_data/subsets_input/properties/part.0.parquet b/tests/examples/example_data/subsets_input/properties/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.0.parquet rename to tests/examples/example_data/subsets_input/properties/part.0.parquet diff --git a/tests/example_data/subsets_input/properties/part.1.parquet b/tests/examples/example_data/subsets_input/properties/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.1.parquet rename to tests/examples/example_data/subsets_input/properties/part.1.parquet diff --git a/tests/example_data/subsets_input/properties/part.2.parquet b/tests/examples/example_data/subsets_input/properties/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.2.parquet rename to 
tests/examples/example_data/subsets_input/properties/part.2.parquet diff --git a/tests/example_data/subsets_input/types/part.0.parquet b/tests/examples/example_data/subsets_input/types/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.0.parquet rename to tests/examples/example_data/subsets_input/types/part.0.parquet diff --git a/tests/example_data/subsets_input/types/part.1.parquet b/tests/examples/example_data/subsets_input/types/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.1.parquet rename to tests/examples/example_data/subsets_input/types/part.1.parquet diff --git a/tests/example_data/subsets_input/types/part.2.parquet b/tests/examples/example_data/subsets_input/types/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.2.parquet rename to tests/examples/example_data/subsets_input/types/part.2.parquet diff --git a/tests/example_modules/component.py b/tests/examples/example_modules/component.py similarity index 100% rename from tests/example_modules/component.py rename to tests/examples/example_modules/component.py diff --git a/tests/example_modules/invalid_component.py b/tests/examples/example_modules/invalid_component.py similarity index 100% rename from tests/example_modules/invalid_component.py rename to tests/examples/example_modules/invalid_component.py diff --git a/tests/example_modules/invalid_double_components.py b/tests/examples/example_modules/invalid_double_components.py similarity index 100% rename from tests/example_modules/invalid_double_components.py rename to tests/examples/example_modules/invalid_double_components.py diff --git a/tests/example_modules/invalid_double_pipeline.py b/tests/examples/example_modules/invalid_double_pipeline.py similarity index 100% rename from tests/example_modules/invalid_double_pipeline.py rename to tests/examples/example_modules/invalid_double_pipeline.py diff --git 
a/tests/example_modules/pipeline.py b/tests/examples/example_modules/pipeline.py similarity index 100% rename from tests/example_modules/pipeline.py rename to tests/examples/example_modules/pipeline.py diff --git a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml similarity index 100% rename from tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml rename to tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml diff --git a/tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml 
b/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml rename to 
tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile similarity index 100% rename from 
tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/examples/example_specs/components/arguments/component.yaml similarity index 100% rename from tests/example_specs/components/arguments/component.yaml rename to tests/examples/example_specs/components/arguments/component.yaml diff --git a/tests/example_specs/components/arguments/component_default_args.yaml b/tests/examples/example_specs/components/arguments/component_default_args.yaml similarity index 100% rename from tests/example_specs/components/arguments/component_default_args.yaml rename to tests/examples/example_specs/components/arguments/component_default_args.yaml diff --git a/tests/example_specs/components/arguments/input_manifest.json b/tests/examples/example_specs/components/arguments/input_manifest.json similarity index 60% rename from tests/example_specs/components/arguments/input_manifest.json rename to tests/examples/example_specs/components/arguments/input_manifest.json index d98ddd95b..9ee2494f9 100644 --- a/tests/example_specs/components/arguments/input_manifest.json +++ b/tests/examples/example_specs/components/arguments/input_manifest.json @@ -7,16 +7,12 @@ "cache_key": "00" }, "index": { - "location": "/index" + "location": "/component_1" }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - } - } + "fields": { + "data": { + "type": 
"binary", + "location": "/component_1" } } } \ No newline at end of file diff --git a/tests/example_specs/components/component.yaml b/tests/examples/example_specs/components/component.yaml similarity index 56% rename from tests/example_specs/components/component.yaml rename to tests/examples/example_specs/components/component.yaml index 19c8d5856..973cc3e6b 100644 --- a/tests/example_specs/components/component.yaml +++ b/tests/examples/example_specs/components/component.yaml @@ -3,19 +3,15 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - additionalFields: false + images_data: + type: array + items: + type: float32 +additionalFields: false args: diff --git a/tests/examples/example_specs/components/input_manifest.json b/tests/examples/example_specs/components/input_manifest.json new file mode 100644 index 000000000..80fa0b91d --- /dev/null +++ b/tests/examples/example_specs/components/input_manifest.json @@ -0,0 +1,17 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "/bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/example_component" + }, + "fields": { + "data": { + "location": "/example_component", + "type": "binary" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/cache/42.txt b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/cache/42.txt rename to tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json 
b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json new file mode 100644 index 000000000..47c2fe949 --- /dev/null +++ b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_2023", + "component_id": "component_1", + "cache_key": "42" + }, + "index": { + "location": "/component_1" + }, + "fields": + { + "data": { + "type": "binary", + "location": "/component_1" + }, + "height": { + "type": "int32", + "location": "/component_1" + }, + "width": { + "type": "int32", + "location": "/component_1" + }, + "captions": { + "type": "string", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json 
b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json diff --git a/tests/test_component.py b/tests/test_component.py index e759bd367..e5dcb3bc3 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -377,38 +377,22 @@ def test_wrap_transform(): "description": "Component for testing", "image": "component:test", "consumes": { - "image": { - "fields": { - "height": { - "type": "int16", - }, - "width": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "image_width": { + "type": "int16", + }, + "caption_text": { + "type": "string", }, }, "produces": { - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "caption_text": { + "type": "string", }, - "image": { - "fields": { - "height": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, }, }, @@ -425,9 +409,9 @@ def test_wrap_transform(): def transform(dataframe: pd.DataFrame) -> pd.DataFrame: # Check hierarchical columns assert dataframe.columns.tolist() == [ - ("image", "height"), - ("image", "width"), - ("caption", "text"), + "image_height", + "image_width", + "caption_text", ] return dataframe diff --git a/tests/test_manifest.py b/tests/test_manifest.py deleted file mode 100644 index 3af3ea425..000000000 --- a/tests/test_manifest.py +++ /dev/null @@ -1,239 +0,0 @@ -import json -import pkgutil -from pathlib import Path - -import pytest -from fondant.core.exceptions import InvalidManifest -from fondant.core.manifest import Field, Index, Manifest, Subset, Type - -manifest_path = Path(__file__).parent / "example_specs/manifests" - - -@pytest.fixture() -def 
valid_manifest(): - with open(manifest_path / "valid_manifest.json") as f: - return json.load(f) - - -@pytest.fixture() -def invalid_manifest(): - with open(manifest_path / "invalid_manifest.json") as f: - return json.load(f) - - -def test_manifest_validation(valid_manifest, invalid_manifest): - """Test that the manifest is validated correctly on instantiation.""" - Manifest(valid_manifest) - with pytest.raises(InvalidManifest): - Manifest(invalid_manifest) - - -def test_subset_init(): - """Test initializing a subset.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = Subset(specification=subset_spec, base_path="/tmp") - assert subset.location == "/tmp/images/ABC/123" - assert ( - subset.__repr__() - == "Subset({'location': '/images/ABC/123', 'fields': {'data': {'type': 'binary'}}})" - ) - - -def test_subset_fields(): - """Test manipulating subset fields.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = Subset(specification=subset_spec, base_path="/tmp") - - # add a field - subset.add_field(name="data2", type_=Type("binary")) - assert "data2" in subset.fields - - # add a duplicate field - with pytest.raises(ValueError, match="A field with name data2 already exists"): - subset.add_field(name="data2", type_=Type("binary")) - - # add a duplicate field but overwrite - subset.add_field(name="data2", type_=Type("string"), overwrite=True) - assert subset.fields["data2"].type == Type("string") - - # remove a field - subset.remove_field(name="data2") - assert "data2" not in subset.fields - - -def test_set_base_path(valid_manifest): - """Test altering the base path in the manifest.""" - manifest = Manifest(valid_manifest) - tmp_path = "/tmp/base_path" - manifest.update_metadata(key="base_path", value=tmp_path) - - assert manifest.base_path == tmp_path - assert manifest._specification["metadata"]["base_path"] == tmp_path - - 
-def test_from_to_file(valid_manifest): - """Test reading from and writing to file.""" - tmp_path = "/tmp/manifest.json" - with open(tmp_path, "w", encoding="utf-8") as f: - json.dump(valid_manifest, f) - - manifest = Manifest.from_file(tmp_path) - assert manifest.metadata == valid_manifest["metadata"] - - manifest.to_file(tmp_path) - with open(tmp_path, encoding="utf-8") as f: - assert json.load(f) == valid_manifest - - -def test_attribute_access(valid_manifest): - """ - Test that attributes can be accessed as expected: - - Fixed properties should be accessible as an attribute - - Dynamic properties should be accessible by lookup. - """ - manifest = Manifest(valid_manifest) - - assert manifest.metadata == valid_manifest["metadata"] - assert manifest.index.location == "gs://bucket/index" - assert manifest.subsets["images"].location == "gs://bucket/images" - assert manifest.subsets["images"].fields["data"].type == Type("binary") - - -def test_manifest_creation(): - """Test the stepwise creation of a manifest via the Manifest class.""" - base_path = "gs://bucket" - run_id = "run_id" - pipeline_name = "pipeline_name" - component_id = "component_id" - cache_key = "42" - - manifest = Manifest.create( - pipeline_name=pipeline_name, - base_path=base_path, - run_id=run_id, - component_id=component_id, - cache_key=cache_key, - ) - - manifest.add_subset("images", [("width", Type("int32")), ("height", Type("int32"))]) - manifest.subsets["images"].add_field("data", Type("binary")) - - assert manifest._specification == { - "metadata": { - "pipeline_name": pipeline_name, - "base_path": base_path, - "run_id": run_id, - "component_id": component_id, - "cache_key": cache_key, - }, - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, - "subsets": { - "images": { - "location": f"/{pipeline_name}/{run_id}/{component_id}/images", - "fields": { - "width": { - "type": "int32", - }, - "height": { - "type": "int32", - }, - "data": { - "type": "binary", - }, - }, - }, 
- }, - } - - -def test_manifest_repr(): - manifest = Manifest.create( - pipeline_name="NAME", - base_path="/", - run_id="A", - component_id="1", - cache_key="42", - ) - assert ( - manifest.__repr__() - == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A'," - " 'component_id': '1', 'cache_key': '42'}," - " 'index': {'location': '/NAME/A/1/index'}, 'subsets': {}})" - ) - - -def test_manifest_alteration(valid_manifest): - """Test alteration functionalities of a manifest via the Manifest class.""" - manifest = Manifest(valid_manifest) - - # test adding a subset - manifest.add_subset( - "images2", - [("width", Type("int32")), ("height", Type("int32"))], - ) - assert "images2" in manifest.subsets - - # test adding a duplicate subset - with pytest.raises(ValueError, match="A subset with name images2 already exists"): - manifest.add_subset( - "images2", - [("width", Type("int32")), ("height", Type("int32"))], - ) - - # test removing a subset - manifest.remove_subset("images2") - assert "images2" not in manifest.subsets - - # test removing a nonexistant subset - with pytest.raises(ValueError, match="Subset pictures not found in specification"): - manifest.remove_subset("pictures") - - -def test_manifest_copy_and_adapt(valid_manifest): - """Test that a manifest can be copied and adapted without changing the original.""" - manifest = Manifest(valid_manifest) - new_manifest = manifest.copy() - new_manifest.remove_subset("images") - assert manifest._specification == valid_manifest - assert new_manifest._specification != valid_manifest - - -def test_no_validate_schema(monkeypatch, valid_manifest): - monkeypatch.setattr(pkgutil, "get_data", lambda package, resource: None) - with pytest.raises(FileNotFoundError): - Manifest(valid_manifest) - - -def test_index_fields(): - """Test that the fields property of Index returns the expected fields.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - 
} - - index = Index(specification=subset_spec, base_path="/tmp") - - expected_fields = { - "id": Field(name="id", type=Type("string")), - "source": Field(name="source", type=Type("string")), - } - - assert index.fields == expected_fields From bb3b623a5a587ef4523c8bf41292e84726e8e902 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 23 Nov 2023 14:47:54 +0100 Subject: [PATCH 2/4] Refactor component package (#654) Refactor component package as part of #643 --------- Co-authored-by: Robbe Sneyders Co-authored-by: Philippe Moussalli --- src/fondant/component/data_io.py | 175 ++++++------------ src/fondant/component/executor.py | 34 +--- src/fondant/core/manifest.py | 48 +---- .../component_specs/arguments/component.yaml | 68 +++++++ .../arguments/component_default_args.yaml | 69 +++++++ .../arguments/input_manifest.json | 18 ++ .../examples/component_specs/component.yaml | 23 +++ .../component_specs/input_manifest.json | 17 ++ .../component/examples/data/components/1.yaml | 29 +++ tests/component/examples/data/manifest.json | 29 +++ .../component_1/part.0.parquet | Bin 0 -> 3542 bytes .../component_1/part.1.parquet | Bin 0 -> 3526 bytes .../component_1/part.2.parquet | Bin 0 -> 3584 bytes .../component_2/part.0.parquet | Bin 0 -> 3018 bytes .../component_2/part.1.parquet | Bin 0 -> 3085 bytes .../component_2/part.2.parquet | Bin 0 -> 3066 bytes .../example_pipeline/cache/42.txt | 1 + .../component_1/manifest.json | 31 ++++ tests/{ => component}/test_component.py | 4 +- tests/{ => component}/test_data_io.py | 128 +++++-------- tests/core/test_manifest.py | 19 -- tests/examples/example_data/raw/split.py | 10 +- 22 files changed, 421 insertions(+), 282 deletions(-) create mode 100644 tests/component/examples/component_specs/arguments/component.yaml create mode 100644 tests/component/examples/component_specs/arguments/component_default_args.yaml create mode 100644 tests/component/examples/component_specs/arguments/input_manifest.json create mode 100644 
tests/component/examples/component_specs/component.yaml create mode 100644 tests/component/examples/component_specs/input_manifest.json create mode 100644 tests/component/examples/data/components/1.yaml create mode 100644 tests/component/examples/data/manifest.json create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.1.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.0.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.1.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet create mode 100644 tests/component/examples/mock_base_path/example_pipeline/cache/42.txt create mode 100644 tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename tests/{ => component}/test_component.py (99%) rename tests/{ => component}/test_data_io.py (61%) diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 7023c1ee2..79a181f8d 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -1,16 +1,19 @@ import logging import os import typing as t +from collections import defaultdict import dask.dataframe as dd from dask.diagnostics import ProgressBar from dask.distributed import Client -from fondant.core.component_spec import ComponentSpec, ComponentSubset +from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest logger = logging.getLogger(__name__) +DEFAULT_INDEX_NAME = "id" + class DataIO: def __init__(self, *, manifest: Manifest, component_spec: ComponentSpec) -> None: @@ -82,73 +85,48 @@ def 
partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: return dataframe - def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: + def load_dataframe(self) -> dd.DataFrame: """ - Function that loads a subset from the manifest as a Dask dataframe. - - Args: - subset_name: the name of the subset to load - fields: the fields to load from the subset + Function that loads the subsets defined in the component spec as a single Dask dataframe for + the user. Returns: - The subset as a dask dataframe + The Dask dataframe with all columns defined in the manifest field mapping """ - subset = self.manifest.subsets[subset_name] - remote_path = subset.location - - logger.info(f"Loading subset {subset_name} with fields {fields}...") + dataframe = None + field_mapping = defaultdict(list) - subset_df = dd.read_parquet( - remote_path, - columns=fields, - calculate_divisions=True, + # Add index field to field mapping to guarantee start reading with the index dataframe + field_mapping[self.manifest.get_field_location(DEFAULT_INDEX_NAME)].append( + DEFAULT_INDEX_NAME, ) - # add subset prefix to columns - subset_df = subset_df.rename( - columns={col: subset_name + "_" + col for col in subset_df.columns}, - ) + for field_name in self.component_spec.consumes: + location = self.manifest.get_field_location(field_name) + field_mapping[location].append(field_name) - return subset_df - - def _load_index(self) -> dd.DataFrame: - """ - Function that loads the index from the manifest as a Dask dataframe. - - Returns: - The index as a dask dataframe - """ - # get index subset from the manifest - index = self.manifest.index - # get remote path - remote_path = index.location - - # load index from parquet, expecting id and source columns - return dd.read_parquet(remote_path, calculate_divisions=True) - - def load_dataframe(self) -> dd.DataFrame: - """ - Function that loads the subsets defined in the component spec as a single Dask dataframe for - the user. 
+ for location, fields in field_mapping.items(): + if DEFAULT_INDEX_NAME in fields: + fields.remove(DEFAULT_INDEX_NAME) - Returns: - The Dask dataframe with the field columns in the format (_) - as well as the index columns. - """ - # load index into dataframe - dataframe = self._load_index() - for name, subset in self.component_spec.consumes.items(): - fields = list(subset.fields.keys()) - subset_df = self._load_subset(name, fields) - # left joins -> filter on index - dataframe = dd.merge( - dataframe, - subset_df, - left_index=True, - right_index=True, - how="left", + partial_df = dd.read_parquet( + location, + columns=fields, + index=DEFAULT_INDEX_NAME, + calculate_divisions=True, ) + if dataframe is None: + # ensure that the index is set correctly and divisions are known. + dataframe = partial_df + else: + dataframe = dataframe.merge( + partial_df, + how="left", + left_index=True, + right_index=True, + ) + dataframe = self.partition_loaded_dataframe(dataframe) logging.info(f"Columns of dataframe: {list(dataframe.columns)}") @@ -170,79 +148,48 @@ def write_dataframe( dataframe: dd.DataFrame, dask_client: t.Optional[Client] = None, ) -> None: - write_tasks = [] + columns_to_produce = [ + column_name for column_name, field in self.component_spec.produces.items() + ] - dataframe.index = dataframe.index.rename("id") + dataframe.index = dataframe.index.rename(DEFAULT_INDEX_NAME) - # Turn index into an empty dataframe so we can write it - index_df = dataframe.index.to_frame().drop(columns=["id"]) - write_index_task = self._write_subset( - index_df, - subset_name="index", - subset_spec=self.component_spec.index, - ) - write_tasks.append(write_index_task) + # validation that all columns are in the dataframe + self.validate_dataframe_columns(dataframe, columns_to_produce) - for subset_name, subset_spec in self.component_spec.produces.items(): - subset_df = self._extract_subset_dataframe( - dataframe, - subset_name=subset_name, - subset_spec=subset_spec, - ) - 
write_subset_task = self._write_subset( - subset_df, - subset_name=subset_name, - subset_spec=subset_spec, - ) - write_tasks.append(write_subset_task) + dataframe = dataframe[columns_to_produce] + write_task = self._write_dataframe(dataframe) with ProgressBar(): logging.info("Writing data...") - # alternative implementation possible: futures = client.compute(...) - dd.compute(*write_tasks, scheduler=dask_client) + dd.compute(write_task, scheduler=dask_client) @staticmethod - def _extract_subset_dataframe( - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.DataFrame: - """Create subset dataframe to save with the original field name as the column name.""" - # Create a new dataframe with only the columns needed for the output subset - subset_columns = [f"{subset_name}_{field}" for field in subset_spec.fields] - try: - subset_df = dataframe[subset_columns] - except KeyError as e: + def validate_dataframe_columns(dataframe: dd.DataFrame, columns: t.List[str]): + """Validates that all columns are available in the dataset.""" + missing_fields = [] + for col in columns: + if col not in dataframe.columns: + missing_fields.append(col) + + if missing_fields: msg = ( - f"Field {e.args[0]} defined in output subset {subset_name} " + f"Fields {missing_fields} defined in output dataset " f"but not found in dataframe" ) raise ValueError( msg, ) - # Remove the subset prefix from the column names - subset_df = subset_df.rename( - columns={col: col[(len(f"{subset_name}_")) :] for col in subset_columns}, + def _write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar: + """Create dataframe writing task.""" + location = ( + self.manifest.base_path + "/" + self.component_spec.component_folder_name ) - - return subset_df - - def _write_subset( - self, - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.core.Scalar: - if subset_name == "index": - location = self.manifest.index.location - else: - 
location = self.manifest.subsets[subset_name].location - - schema = {field.name: field.type.value for field in subset_spec.fields.values()} - + schema = { + field.name: field.type.value + for field in self.component_spec.produces.values() + } return self._create_write_task(dataframe, location=location, schema=schema) @staticmethod diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 3d4d6097f..d77200da8 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -491,14 +491,11 @@ def optional_fondant_arguments() -> t.List[str]: @staticmethod def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """Factory that creates a function to wrap the component transform function. The wrapper: - - Converts the columns to hierarchical format before passing the dataframe to the - transform function - Removes extra columns from the returned dataframe which are not defined in the component spec `produces` section - Sorts the columns from the returned dataframe according to the order in the component spec `produces` section to match the order in the `meta` argument passed to Dask's `map_partitions`. - - Flattens the returned dataframe columns. 
Args: transform: Transform method to wrap @@ -506,27 +503,13 @@ def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """ def wrapped_transform(dataframe: pd.DataFrame) -> pd.DataFrame: - # Switch to hierarchical columns - dataframe.columns = pd.MultiIndex.from_tuples( - tuple(column.split("_")) for column in dataframe.columns - ) - # Call transform method dataframe = transform(dataframe) # Drop columns not in specification - columns = [ - (subset_name, field) - for subset_name, subset in spec.produces.items() - for field in subset.fields - ] - dataframe = dataframe[columns] - - # Switch to flattened columns - dataframe.columns = [ - "_".join(column) for column in dataframe.columns.to_flat_index() - ] - return dataframe + columns = [name for name, field in spec.produces.items()] + + return dataframe[columns] return wrapped_transform @@ -552,11 +535,8 @@ def _execute_component( # Create meta dataframe with expected format meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series(dtype=pd.ArrowDtype(field.type.value)) meta_df = pd.DataFrame(meta_dict).set_index("id") wrapped_transform = self.wrap_transform(component.transform, spec=self.spec) @@ -573,8 +553,10 @@ def _execute_component( return dataframe + # TODO: fix in #244 def _infer_index_change(self) -> bool: """Infer if this component changes the index based on its component spec.""" + """ if not self.spec.accepts_additional_subsets: return True if not self.spec.outputs_additional_subsets: @@ -585,6 +567,8 @@ def _infer_index_change(self) -> bool: return any( not subset.additional_fields for subset in self.spec.produces.values() ) + """ + return False class 
DaskWriteExecutor(Executor[DaskWriteComponent]): diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index fc750620d..013ce2b71 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -4,7 +4,6 @@ import pkgutil import types import typing as t -from collections import OrderedDict from dataclasses import asdict, dataclass from pathlib import Path @@ -146,7 +145,7 @@ def metadata(self) -> t.Dict[str, t.Any]: @property def index(self) -> Field: - return Field(name="Index", location=self._specification["index"]["location"]) + return Field(name="id", location=self._specification["index"]["location"]) def update_metadata(self, key: str, value: t.Any) -> None: self.metadata[key] = value @@ -155,43 +154,16 @@ def update_metadata(self, key: str, value: t.Any) -> None: def base_path(self) -> str: return self.metadata["base_path"] - @property - def field_mapping(self) -> t.Mapping[str, t.List[str]]: - """ - Retrieve a mapping of field locations to corresponding field names. - A dictionary where keys are field locations and values are lists - of column names. - - The method returns an immutable OrderedDict where the first dict element contains the - location of the dataframe with the index. This allows an efficient left join operation. 
- - Example: - { - "/base_path/component_1": ["Name", "HP"], - "/base_path/component_2": ["Type 1", "Type 2"], - } - """ - field_mapping = {} - for field_name, field in {"id": self.index, **self.fields}.items(): - location = ( - f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" - ) - if location in field_mapping: - field_mapping[location].append(field_name) - else: - field_mapping[location] = [field_name] - - # Sort field mapping that the first dataset contains the index - sorted_keys = sorted( - field_mapping.keys(), - key=lambda key: "id" in field_mapping[key], - reverse=True, - ) - sorted_field_mapping = OrderedDict( - (key, field_mapping[key]) for key in sorted_keys - ) + def get_field_location(self, field_name: str): + """Return absolute path to the field location.""" + if field_name == "id": + return f"{self.base_path}/{self.pipeline_name}/{self.run_id}{self.index.location}" + if field_name not in self.fields: + msg = f"Field {field_name} is not available in the manifest." 
+ raise ValueError(msg) - return types.MappingProxyType(sorted_field_mapping) + field = self.fields[field_name] + return f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" @property def run_id(self) -> str: diff --git a/tests/component/examples/component_specs/arguments/component.yaml b/tests/component/examples/component_specs/arguments/component.yaml new file mode 100644 index 000000000..659ed0026 --- /dev/null +++ b/tests/component/examples/component_specs/arguments/component.yaml @@ -0,0 +1,68 @@ +name: Example component +description: This is an example component +image: example_component:latest + +args: + string_default_arg: + description: default string argument + type: str + default: foo + integer_default_arg: + description: default integer argument + type: int + default: 0 + float_default_arg: + description: default float argument + type: float + default: 3.14 + bool_false_default_arg: + description: default bool argument + type: bool + default: False + bool_true_default_arg: + description: default bool argument + type: bool + default: True + list_default_arg: + description: default list argument + type: list + default: ["foo", "bar"] + dict_default_arg: + description: default dict argument + type: dict + default: {"foo":1, "bar":2} + string_default_arg_none: + description: default string argument + type: str + default: None + integer_default_arg_none: + description: default integer argument + type: int + default: 0 + float_default_arg_none: + description: default float argument + type: float + default: 0.0 + bool_default_arg_none: + description: default bool argument + type: bool + default: False + list_default_arg_none: + description: default list argument + type: list + default: [] + dict_default_arg_none: + description: default dict argument + type: dict + default: {} + override_default_arg: + description: argument with default python value type that can be overriden + type: str + default: foo + override_default_arg_with_none: + 
description: argument with default python type that can be overriden with None + type: str + optional_arg: + description: optional argument + type: str + default: None diff --git a/tests/component/examples/component_specs/arguments/component_default_args.yaml b/tests/component/examples/component_specs/arguments/component_default_args.yaml new file mode 100644 index 000000000..816211c04 --- /dev/null +++ b/tests/component/examples/component_specs/arguments/component_default_args.yaml @@ -0,0 +1,69 @@ +name: Example component +description: This is an example component +image: example_component:latest + +args: + string_default_arg: + description: default string argument + type: str + default: foo + integer_default_arg: + description: default integer argument + type: int + default: 1 + float_default_arg: + description: default float argument + type: float + default: 3.14 + bool_false_default_arg: + description: default bool argument + type: bool + default: False + bool_true_default_arg: + description: default bool argument + type: bool + default: True + list_default_arg: + description: default list argument + type: list + default: ["foo", "bar"] + dict_default_arg: + description: default dict argument + type: dict + default: {"foo":1, "bar":2} + string_default_arg_none: + description: default string argument + type: str + default: None + integer_default_arg_none: + description: default integer argument + type: int + default: None + float_default_arg_none: + description: default float argument + type: float + default: None + bool_default_arg_none: + description: default bool argument + type: bool + default: None + list_default_arg_none: + description: default list argument + type: list + default: None + dict_default_arg_none: + description: default dict argument + type: dict + default: None + override_default_arg: + description: argument with default python value type that can be overriden + type: str + default: foo + override_default_none_arg: + description: argument 
with default None value type that can be overriden with a valid python type + type: float + default: None + override_default_arg_with_none: + description: argument with default python type that can be overriden with None + type: str + diff --git a/tests/component/examples/component_specs/arguments/input_manifest.json b/tests/component/examples/component_specs/arguments/input_manifest.json new file mode 100644 index 000000000..9ee2494f9 --- /dev/null +++ b/tests/component/examples/component_specs/arguments/input_manifest.json @@ -0,0 +1,18 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_123", + "component_id": "component_1", + "cache_key": "00" + }, + "index": { + "location": "/component_1" + }, + "fields": { + "data": { + "type": "binary", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/component/examples/component_specs/component.yaml b/tests/component/examples/component_specs/component.yaml new file mode 100644 index 000000000..973cc3e6b --- /dev/null +++ b/tests/component/examples/component_specs/component.yaml @@ -0,0 +1,23 @@ +name: Example component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + +produces: + images_data: + type: array + items: + type: float32 +additionalFields: false + + +args: + flag: + description: user argument + type: str + value: + description: integer value + type: int diff --git a/tests/component/examples/component_specs/input_manifest.json b/tests/component/examples/component_specs/input_manifest.json new file mode 100644 index 000000000..80fa0b91d --- /dev/null +++ b/tests/component/examples/component_specs/input_manifest.json @@ -0,0 +1,17 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "/bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": 
"/example_component" + }, + "fields": { + "data": { + "location": "/example_component", + "type": "binary" + } + } +} \ No newline at end of file diff --git a/tests/component/examples/data/components/1.yaml b/tests/component/examples/data/components/1.yaml new file mode 100644 index 000000000..95e5e578f --- /dev/null +++ b/tests/component/examples/data/components/1.yaml @@ -0,0 +1,29 @@ +name: Test component 1 +description: This is an example component +image: example_component:latest + +consumes: + Name: + type: "string" + HP: + type: "int32" + + Type 1: + type: "string" + Type 2: + type: "string" + +produces: + Name: + type: "string" + HP: + type: "int32" + Type 1: + type: "string" + Type 2: + type: "string" + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/component/examples/data/manifest.json b/tests/component/examples/data/manifest.json new file mode 100644 index 000000000..cc579fef1 --- /dev/null +++ b/tests/component/examples/data/manifest.json @@ -0,0 +1,29 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "tests/component/examples/data", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component_1" + }, + "fields": { + "Name": { + "type": "string", + "location": "/component_1" + }, + "HP": { + "type": "int32", + "location": "/component_1" + }, + "Type 1": { + "type": "string", + "location": "/component_2" + }, + "Type 2": { + "type": "string", + "location": "/component_2" + } + } +} \ No newline at end of file diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fa5d96dad64c5e6291eb03909f8542a6af2faf7b GIT binary patch literal 3542 zcmcInUu+{s8J|sTXOpvYz9Vba;48-5IoCExZ706gM-IAn;(wPoj&rdcLltJdYkT7L 
zu5Y~?$9IZ^Djq686_3CJLe&WhLI{Z`B&aHcctAYx!Ye|61OgrqPZdHH-^@DynnQv@ zE&pa`zVDmw?|id%S*pYZm+((s`1;g_APeX^LT|Y?rV&DETa)ns+vXH33Wp=^N{EM9 zRn=|I8=HY?ri0BM))h54>sK|^(7Bhnke@ulrotsdI}dGQTAIoQ{WWY_#x-9!t>J-X zkb%lYLm6zTW}m2>DQ>D3?i&i%%BO7$s4Y`fJ)pm)s*2{GnY)j>wvM?}C=CRYXc`xu zD-)%yjyU3;@fwz8aAlv5yE0e7mW3^MN`wh%!pszR`YJY!A-9II?wPO~)UJ22KH#SP zk|`Tq?o(o!baAs|bMvsB&`CGE15ph+raBaSUhuiY{p4neD29pk-+k@#+;`lwJOPL1 zIj{ert*Uxr9-&un%p>EP*V`p}oAZV;n%cs|90^_|KRYv5wY5Gun%)(4h*eVu-0saH zX}7ge-)^nr8JcfqZc9-JIB>(C(+s1>ZH22uQ~R3TQ@Mb@ zj7>b?9=oSqYV+1+9%^Ig*cz9~=owZ=K!PH47NE3#!6SG=chGG&vmm-z{9q^Xvk4ew z&U5wUuy+lI@^36`{C>53bMt}vO8r~C&;IfFbGKht$TvFQ{l4~>w_+fRQ1ownNbm-e z$m0)+t`hF4!bCt0{Cp9(~?BmhAk9XarloF*Z0OgZ_EkFWr8z2Eb4cGxZ19%be3BY|o7SI4J0qy~| z0sDX`py1eAr zu6*xD-wt?VPu}^%lVAVhOF#V0bFca_5B&w%`>QV`xW?I=$d_5}e&7!LeKPu0Z}cD2 zq5s14Q`7ntkMPFiNk6|f>EdV-CukBUA$S+S4e$V_06btCFax*(@B(~*S-@3*A8-wD z9q=K*96$gB03Qa3fFR%_fR6$~fExf>kdFbvfR6(r09u@zfLj1s2q-Zt@0k|)6OPDpV;Neg%T!>#uhag^xpVa*3ai)aeBg;Y`seG> zH?KmyF5CJy9z3mH!HpymJ%eUP?P0-#1n;d0r>SnCjS%=OxPSoXWIQ1X3dq4waP|Mv zg&*u;jDDc03w%Kg$WcDjPp36!wm*9x~*v-(ITp*>|dlgn)j_yp9)57oiq(ajexXn_xht#h9MuQ zO^ecdaMdf=!XW-QqUsV?_CFe7DKu4V8751!Mr0G4 zBbw#@2%GTYr-lzzb3hE8QsXO2v88wfQVN$4K-_mC9D_uG5j?JNygu z6PMN`N!kX`&wWW+k5-N;o}>WR{!oUdB4zDc%7S_XQzpv4A-Oz}WY9L42YvW08OZlm7O znxn#xmD_VgJ+n_Hl*VD=RawXZ!QN0q86UB#(S}Er6Dof{se3aA3eMw4x^W zN{5Xc#6W$@6sy(sJ!>mdOtq>Pe1AmcYs&+2>Q|$_VnMKh1AhuMXrq_?N%nPhmGxp!RyWRvk35DVzBReiyo(ZKg+0cjR8B(~PHL zYCWydm}vc!wY_nEF6R$?h?Neu39XG>tH$QpA zd>!@o^kS!}ciCA0oiHyRDkGMsM5A7Oglj3A<>6pGws9cYn>)o6jrGjOhSt84c74FX1j-5EO*;U$V3xhmn#sqCL*iM|?3>iqO@ediCC`@71ez%3L+dIGEqLnHSRqMr5d0DC#GU#c7J7QgTPZrs;9ksFlDhTOwQ#~ygk7}`*gO$pQf z0A~nDl~qjNW6PQ%4^fA{?VR;WvY{|@O9nC}2dv^utWVbw%} zra3?7ne!S0tk4e@GFTG_qVpGv83P?*1shI^qW?}|=eOs}nmRI_-wRZcE{muy(H|`A zD%w!jJl~+tgJ0z1wMIw6Q5jUlQqu-V~U>^VR}9fXG|3 z?p=(X@3AR7>}YC-{s)!0B{_^6w?1$Tx^yM5D?!vEHfV1ksc56V3GsY+9`#LSME}N{ zMFaI^W*(~@WY7=1JF0w4f1XVN!6z8gZ+o*NU3I>_xosexACW|E%PM5jU?|A1v;^He 
z<6>NaHEMC@f+W^K2nX%hFQ=dvY1d1eL2n8LvOicV{9&VfySRrRHs0^O@#oVw7oUpq zbLqnu%3pr2dqJ5Bzv~GwUVof&&HGtL3H2~DBZY}qe`J&6d@1;Zdx!nglSj`Y zKMus+S^Rh*_B0_n*Glr!kA5}v%};m=x`+ypt4rl*@;E0_xIAvuZK=gFM^qjB1CRXVqqeVbLoE?Nguw_;BYb$ zvcuCQ2xz>!P_}&4lP&ufKeB4BtrFeI{ho_2ZTE^3VE)Q>Y}xmy{qN02x^BEL>Fl}MwjcO!SBA)p{Mc716wb~k!=C3H%$Uanf*so%4!lKq?e zE2Kra6vuHD0BPGCx5_0tT$nEH@FYeO zMe=|#$EAj#pLi|TQwFU}ZM6|A9r4+k$*VG3K zBt;nI)^ceR*2X2izEAWAV*R*ZNOE$krzm`;+U9#{v$YF)nRt}X42hj#6V^8yrM8$= zhSpp%ALV-r*sm%%dDsDaQNEFgH&&`~Aw%qPG9Sx7YSl{-dp@d&nKam_O1VR=gGYIV z@0E^P8Q24fDVeX;wwmTnGQZxgO~ige^y@2q`8=*a~6=y*{wwgSc~Dsbbkt6B6I6N+T;&g9h?EX1GRX~JuSJ^msw8H%&U4n@9TFZde386cxwy&+s@ryi zOd8Hi`s8w+Cj6*1Q^+cp^Q`k~C04TcXSN`Jus+f}KOe{I4TvMxD~($9bcA2Yk@(2t`Q-R&RBrv@%K;7VVQ# zwq*OHPZh`G1SeTif%I1WSwoK0EVpYJ>=5Trr-CiC=50?91X$~k`yfl|8CvDS$X414 zGI4evSPLx`@MN8FTVgG-RPFKo`xG10k@0AN&1FH?m#qseIKHrg_EqNJfL{srA60^X Kv&!%v+&=+gWo-8V literal 0 HcmV?d00001 diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d226a424901fa9ff3679ae88c43487366d1b2722 GIT binary patch literal 3584 zcmcInYiuJ|6`o1#tex1bTRhfSM96EJXsafv?IhdHDl0K|;#cg%aT42cx1@I-_S|?L z-g(4!meBGpEiANMo~&Yf{y%~pa! zNB-{PoO91P_ndp~j4NC<$#|F-{mgGKl^B_!9;K*Xd2U^zD5|C_mTvHV7_KUkU>Uk7 zd4p_UYPSX3iUt>KV2=SeWAnNu(aUTZ8F~+O=qG3L*iv;(k)&~o3I+p$j5PWKu@V+K z*s`@YJs(=rTP^IL2-GF%FgU?h<$qHfaHu9r~PK&BTItzgU2X@6ifXBbVEUXdTNq0 zEG*cXPJbiZL|x*+m2esK86ylN z)mwi#diLsF5r0em-Vc<&-tW%*4y31l=$~W)(M4)36lFc@NR^n8h#dLTc;t2e^gl06 z``*c1bOw$i#f(tUgrK7+m_I+t{E;p(3Fmk|pgmV9KTR$BFL|iFppQxcng9+^3Hm&k zaNi=_4+7;Wz$_pOSObIsp8zBP9|F7vcpPvO@O03(ybmVHBf1^&qsBB(!^HaD8gf`) zcsPnsS06$J<`b{Gq4Qpjd6K_mJz=x5RG58r2j~L^fJ49?z()Z`fR6#513VA-IDn+! 
zlYmbFJ`MN`;In`i0G|WA2zUwbdB7I{cL84ndpSMKzT?!Av|Xfvf_HV|?=fexheaoQ_kEEE-svB`fBN^AA$u2Xy@Ll2%U-m!NU>w4 zAWJG@#waE*HR2Z56jho8pBWDj;M5EUSfnCybTYd9f2_p!_JB^mrvZ%IWh3&mFY>cT zp=hHs>{u6RA~M;Zd+x-sCQAK1L04>5Gvha6Psgzse`Y2YcS(B=UvfBi;u_p#Afwre z0un7ODdOIFnyY!w8gz+Z+}5xFug*9~Yqr`QoG^5LA6~13u1XdXk%d6~PF&JJ9Nu1J z?ap>~_Imv2=+v3>Uc|}GN;I7F%@c)oJcl1qD=9jsZ` z7KrTm{hpA|Z1;i@i2Q}^xU~P#h!a9XLh#YzgjvD70Urk>%3XMo!|Pn)-^&+ zF3ct7l5q$@d}#U%Gc}i-n~xt+Y=Em)H|jUsx@7<5dy+g!ZiVBxO#pe;IBtPUx4A6G zwYf~&p{3L3_&q{gJ>lm!GwvK(M~lNRa9odrbj!+kX7O4XZ{Rc!^iDxZn1xJnpHKCZe0{Nn>WiIbLso=A;aVYU!P-TcukR3fTdeoZ zQij7#RZ;j{wZ*GhtGNYoxy2-(>k&P@U0A={SZ|4WrRVT6#U!sPpuegVaIX#el6)h* z*qE;_3OS;e!+ffEyIEgPxcsCd=CYupDi`+kwlpXzyt;nS%)uUrPnlw^wz_L=WQt3z z+Bx43iF|$D#3z1biW|H2+7DhtV_gw#kYWKSf2;>zwYJ{RK$J|x!n zH!;Z#U#}&0>s49It)AqpwbdhWJhesg?&RWPOiayHLh zUfWEqyZf`6kUvV^a~d^)8=UaCDUMLcg?M#x1wfa-!c}7LdV#}fEiLZK$7|shotJFV>@>2I&R{|9^14^=VyDI@!0hz zj#EWq*#)XtvSLx83bA0xk_BRi!~(Hk!Kz5GVB19(EI98?;yP(6fmB95^SSrF^X@(G z-6wDG*#K)}zp}88EopX|Wu_Ts>-s3eFo~AfZrkt@*IKG&YD*A1YHI}Fk#5C~N2bxL zi@Iezw$aeq4Z{J`s;A1lUx@2{&8S$$ zZ=Wb&pa^Id+2x^17G~|{XxAt$6>Jw3qQ=?F&qCL~pSFyQLDb~5c7yp0R^OA+lO+Uc z3C}Ffzb(EeHs2p3o~0K^Pc7p!JNsxP4N=LeJs7cBm@x}u9Xn%VM$S1H8!!skfir*u z7z4(Ev%my!4tNbX4@?46z%=kWZ~^sGIaOaFdtV`hCzy;FExzy0UI-PwnV_C@vUZ}h*uGpF`_KH*_)hr7$F zDRfyLI-FmQ`o10aJ+*uOLk6da5SVEjyPhks~u4hJ0imGbE<)GoEv0pZYw^&%pDv&Pv&?QoOx^!j$@e&+5c@e zZ5+0k^msO;%7Zy1by%Uv8+^%p)c$Jc!$Q1Jn`t{Sk7jh17D?=JjR)o-iD^+((UvN9 z8ZD~2u1PJi)hD7i`(mrr=n=AAwbj-d2C)Vf7yJtW4r*|RQx7?V3xS0YcfhzN_-yv3 zaMkQf*C%6F$Q7u>c%I(`$aRC~7x_qqkMn$mk5;HH5;^AY5##E>pX8(FH|~^$`jb50 z<1r3TLJ}#G2fi_~jrhn%$-3SxCGv}fU}jfZ%XcJWUAw78B-nS7y1$)_rfO1fFCYow zvM7XiN`cn4#{RY}N^_^u-CwbM~vE7f&fN@UAYJ>DtZLR=yokP@Jf1;SII4Q)NefB+(5sQ&&?$5v^C-% zv@Tvrt0nXu+-d)z(GWt6^or0ynHorzVOWi$iP4JGPhV*iUnIuEJqnt28o$Y7H29FG zY$k_h9t_&doth7&j?jZM!AGvZ z>X?{gtNYwdK6R!+b+p#^oi3U<9@iDbcndK$!16ZW z9l*PQs{jTN1;hZGfH)ulU;#-02S@>~0n&gBAPd+6@1^z<@<5ta|ju1X~M`|d#K)MNX`r^c3n_nc-0mxv)L=Q|ZU_mm3SM~?| 
z8?UZMUg3uPe}(h@Fyobj9`7%wFA#)((dRY=Ll!!|?_75%)sU5gU8$vYb)!RFnY&J@ zGIeutj1P%u0i~Q#;LtJ#0!h%Xr#Pch)67o%cvrqx?Sb7A=@9;WHtdY3JtE=sw@4JChFP! zDhoBBoMNd`DPFO*VyX37=|t{_h|e!~)L~w+R8c8MHEuJmj?Mcyzr-d$r^MlCZZaP+ zEA;vv)Pm{*4Z=O8v>w87VLr~th9snKAN#E;**L;gMp5WTEf~eeY@VO zS>+G*%JfV8MzkU7g@zO!owL*xc%(@Y*CiSyOG;{^njgx^lqu?x?dMCO3SVf*Y<#$m zwe23#wp-!D8qy8#1-w2l<|FWV9A}(9uwO#irtKe-hbbC)IvabrHf6|z)3g05p9qP| z9Lf*0Ixs6nsv#MBcz-}`@SfV2ZH?(_Idv%T>s_38GagDenQmq~1v)e8vmC3MWMoZ| zUcoUk>dHa7nNe$bhDEJ0;K3{ENk&1p%|X90jG2ReMW1;NcUltphph{*sMT0)5AL}C zpwWPWjQEP;LD?n*Sp!2n0Zk08i2e8#vzQ7p9`8{IlfnEv1EV1x{2TD2!Tuw5;df^a{t@^G DkXjW| literal 0 HcmV?d00001 diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1ae8001c02ac7c803353728bb9e7bb919d774ecc GIT binary patch literal 3066 zcmcImOKcle6dgN`;}l0NsAFtoiO6cLF4AB-acHV8I(A~muHz3_PcY2pl8a%OSzVD@4NV%vd$J)ko6f;ucfD=Mwx|6G zt)jLxqhj+M&ej_>LAM<^yinJgb;CAM;Dwgb7{aK9W>-=*X#lUWcEvVyBqb>hd?d8S zpldQ$msXwlnrR4?y1_E+9Ge+qRrV0;WS#yg=GY)j3RgdMqR$6d8K4l<7iCYFxI zxN^CCdH9N4o^hsc-##<^-47jx3Emj+vuQ#Gc{zFG)IuLPB7KqLV-QHimx!bW144t-*NcW6;sN$A$Zxs<*#osp1C42Cz!QdAT z|J2@u`xbK7$=(^rKvuG_HwPRtGi+m&;UNc89&s@hKm|0Q3akP;Py-C04w%3NpaC>T zT#gpW&?XtyVCewsKo{5mHh~v`E#M{KW#A(43UCQ{6?hGJ9e4w{47>@v1zZ8%2Cf3{ z0Ph0t0oQ={fe(NWfscTXM_i|_Lp^eRVm-(L%B*@{a<6O0<8{f~F&^gFK2|s*-&*_2 zdGPe8dq(ijeKwZ)_0+|I&Aq2BsZ|KpB1_9Mmw|8o0T$M$Zs zYcgGZR9L5$JXbwrdkMVD80+pNqhEgUci473eRs>@0M z(MG#wc6*MxSj8zy>@}q&$U;j%_+~&c5ROBGj9p$z96u4*+PY6>H;aIEijf?8yIbR)xldwR6jijL z2(5a9s;+BdLuhn~=w?@FH0m9aY)xr2wYov9;mPsPcsKwx0^3vfIV0oY@rl3|<8^bn z+*$sl)t26i!_SZ_T$$lGZV@2Y8IGIeVihjIaTP9Jp|)6Tm%l@dvps)`i(B8g(=61V z;u_Afk8zY;Dk3E~J>FrW7KUs)m*TlJeOEl)Svsz4qeKnF7#meZewoC#Wq!SxiE~=1rt4xdR~BoDR_Pq#lF_i3?2tU2 zBE}aB*|I#Rcc?F(4vRG%`Ezr{|0EEZzX!bC1AB}raR6C>#hB|aOn{9#>ACXgei zrmFRd(oO4PExT4qq6We#p3diIi>z4k5@dstQ2B;YGzj3m-imO 
zluSUU#6~|inU9*q$n+{|A@xCna7W2chiG2Rmol;;iRjy<{#FURkt0;>>9RhZiIX+h z(UH^gR2!~q)mt)4{Q7Dk@({lfQ^i_NmBRgV=Gy{KG+Cx~iAKI9&FTG`Z^(0LQ>;mr zpU;Pid`^{AAw(_dG^x>`z4gETK;`_n4+;5$;Kb9 zO#wVowQRSt@kNtCF!w_XldjdwNZ_G>()C=?#IOZd2aX*-)m+wdc>J zp);r5=UCDuBWsG0Nt$C`n_SOS=e6=8mn5yR6NgvQlbk|sGdumpapnU0m3%IyXs68) zf3J1%N?I+cui{So4~>QpVx(7$4$4-MJ<~xNy P3)YX}@uzbE|N8#{@S7Po literal 0 HcmV?d00001 diff --git a/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt new file mode 100644 index 000000000..4a9ff8afc --- /dev/null +++ b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt @@ -0,0 +1 @@ +tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file diff --git a/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json new file mode 100644 index 000000000..47c2fe949 --- /dev/null +++ b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_2023", + "component_id": "component_1", + "cache_key": "42" + }, + "index": { + "location": "/component_1" + }, + "fields": + { + "data": { + "type": "binary", + "location": "/component_1" + }, + "height": { + "type": "int32", + "location": "/component_1" + }, + "width": { + "type": "int32", + "location": "/component_1" + }, + "captions": { + "type": "string", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/test_component.py b/tests/component/test_component.py similarity index 99% rename from tests/test_component.py rename to tests/component/test_component.py index 
e5dcb3bc3..830ce2963 100644 --- a/tests/test_component.py +++ b/tests/component/test_component.py @@ -23,8 +23,8 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest, Metadata -components_path = Path(__file__).parent / "example_specs/components" -base_path = Path(__file__).parent / "example_specs/mock_base_path" +components_path = Path(__file__).parent / "examples/component_specs" +base_path = Path(__file__).parent / "examples/mock_base_path" N_PARTITIONS = 2 diff --git a/tests/test_data_io.py b/tests/component/test_data_io.py similarity index 61% rename from tests/test_data_io.py rename to tests/component/test_data_io.py index 9ade4a329..30a4b7c10 100644 --- a/tests/test_data_io.py +++ b/tests/component/test_data_io.py @@ -8,8 +8,10 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -manifest_path = Path(__file__).parent / "example_data/manifest.json" -component_spec_path = Path(__file__).parent / "example_data/components/1.yaml" +manifest_path = Path(__file__).parent / "examples/data/manifest.json" +component_spec_path = ( + Path(__file__).parent / "examples/data/components/1.yaml" +) NUMBER_OF_TEST_ROWS = 151 @@ -37,33 +39,16 @@ def dataframe(manifest, component_spec): return data_loader.load_dataframe() -def test_load_index(manifest, component_spec): - """Test the loading of just the index.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - index_df = data_loader._load_index() - assert len(index_df) == NUMBER_OF_TEST_ROWS - assert index_df.index.name == "id" - - -def test_load_subset(manifest, component_spec): - """Test the loading of one field of a subset.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - subset_df = data_loader._load_subset(subset_name="types", fields=["Type 1"]) - assert len(subset_df) == NUMBER_OF_TEST_ROWS - assert list(subset_df.columns) == ["types_Type 1"] - assert 
subset_df.index.name == "id" - - def test_load_dataframe(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader(manifest=manifest, component_spec=component_spec) dataframe = dl.load_dataframe() assert len(dataframe) == NUMBER_OF_TEST_ROWS assert list(dataframe.columns) == [ - "properties_Name", - "properties_HP", - "types_Type 1", - "types_Type 2", + "Name", + "HP", + "Type 1", + "Type 2", ] assert dataframe.index.name == "id" @@ -78,7 +63,7 @@ def test_load_dataframe_default(manifest, component_spec): def test_load_dataframe_rows(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader( manifest=manifest, component_spec=component_spec, @@ -89,34 +74,7 @@ def test_load_dataframe_rows(manifest, component_spec): assert dataframe.npartitions == expected_partitions -def test_write_index( - tmp_path_factory, - dataframe, - manifest, - component_spec, - dask_client, -): - """Test writing out the index.""" - with tmp_path_factory.mktemp("temp") as fn: - # override the base path of the manifest with the temp dir - manifest.update_metadata("base_path", str(fn)) - data_writer = DaskDataWriter( - manifest=manifest, - component_spec=component_spec, - ) - # write out index to temp dir - data_writer.write_dataframe(dataframe, dask_client) - number_workers = os.cpu_count() - # read written data and assert - dataframe = dd.read_parquet(fn / "index") - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert dataframe.index.name == "id" - assert dataframe.npartitions in list( - range(number_workers - 1, number_workers + 2), - ) - - -def test_write_subsets( +def test_write_dataset( tmp_path_factory, dataframe, manifest, @@ -125,11 +83,7 @@ def test_write_subsets( ): """Test writing out subsets.""" # Dictionary 
specifying the expected subsets to write and their column names - subset_columns_dict = { - "index": [], - "properties": ["Name", "HP"], - "types": ["Type 1", "Type 2"], - } + columns = ["Name", "HP", "Type 1", "Type 2"] with tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) @@ -137,13 +91,13 @@ def test_write_subsets( # write dataframe to temp dir data_writer.write_dataframe(dataframe, dask_client) # read written data and assert - for subset, subset_columns in subset_columns_dict.items(): - dataframe = dd.read_parquet(fn / subset) - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert list(dataframe.columns) == subset_columns - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert len(dataframe) == NUMBER_OF_TEST_ROWS + assert list(dataframe.columns) == columns + assert dataframe.index.name == "id" +# TODO: check if this is still needed? def test_write_reset_index( tmp_path_factory, dataframe, @@ -151,7 +105,7 @@ def test_write_reset_index( component_spec, dask_client, ): - """Test writing out the index and subsets that have no dask index and checking + """Test writing out the index and fields that have no dask index and checking if the id index was created. 
""" dataframe = dataframe.reset_index(drop=True) @@ -160,10 +114,8 @@ def test_write_reset_index( data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) data_writer.write_dataframe(dataframe, dask_client) - - for subset in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / subset) - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" @pytest.mark.parametrize("partitions", list(range(1, 5))) @@ -189,29 +141,51 @@ def test_write_divisions( # noqa: PLR0913 data_writer.write_dataframe(dataframe, dask_client) - for target in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / target) - assert dataframe.index.name == "id" - assert dataframe.npartitions == partitions + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" + assert dataframe.npartitions == partitions + + +def test_write_fields_invalid( + tmp_path_factory, + dataframe, + manifest, + component_spec, + dask_client, +): + """Test writing out fields but the dataframe columns are incomplete.""" + with tmp_path_factory.mktemp("temp") as fn: + # override the base path of the manifest with the temp dir + manifest.update_metadata("base_path", str(fn)) + # Drop one of the columns required in the output + dataframe = dataframe.drop(["Type 2"], axis=1) + data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) + expected_error_msg = ( + r"Fields \['Type 2'\] defined in output dataset " + r"but not found in dataframe" + ) + with pytest.raises(ValueError, match=expected_error_msg): + data_writer.write_dataframe(dataframe, dask_client) -def test_write_subsets_invalid( +def test_write_fields_invalid_several_fields_missing( tmp_path_factory, dataframe, manifest, component_spec, dask_client, ): - """Test writing out subsets but the dataframe columns are incomplete.""" + """Test writing out fields but the dataframe columns are incomplete.""" with 
tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) # Drop one of the columns required in the output - dataframe = dataframe.drop(["types_Type 2"], axis=1) + dataframe = dataframe.drop(["Type 1"], axis=1) + dataframe = dataframe.drop(["Type 2"], axis=1) data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) expected_error_msg = ( - r"Field \['types_Type 2'\] not in index defined in output subset " - r"types but not found in dataframe" + r"Fields \['Type 1', 'Type 2'\] defined in output dataset " + r"but not found in dataframe" ) with pytest.raises(ValueError, match=expected_error_msg): data_writer.write_dataframe(dataframe, dask_client) diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index 0b255b9df..c24d27c9c 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -1,6 +1,5 @@ import json import pkgutil -from collections import OrderedDict from pathlib import Path import pytest @@ -226,21 +225,3 @@ def test_fields(): # delete a field manifest.remove_field(name="field_1") assert "field_1" not in manifest.fields - - -def test_field_mapping(valid_manifest): - """Test field mapping generation.""" - manifest = Manifest(valid_manifest) - manifest.add_or_update_field(Field(name="index", location="component2")) - field_mapping = manifest.field_mapping - assert field_mapping == OrderedDict( - { - "gs://bucket/test_pipeline/test_pipeline_12345/component2": [ - "id", - "height", - "width", - ], - "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], - "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], - }, - ) diff --git a/tests/examples/example_data/raw/split.py b/tests/examples/example_data/raw/split.py index 6800ee323..ade466125 100644 --- a/tests/examples/example_data/raw/split.py +++ b/tests/examples/example_data/raw/split.py @@ -13,7 +13,7 @@ import dask.dataframe as dd 
data_path = Path(__file__).parent -output_path = Path(__file__).parent.parent / "subsets_input/" +output_path = Path(__file__).parent.parent def split_into_subsets(): @@ -22,17 +22,13 @@ def split_into_subsets(): master_df = master_df.set_index("id", sorted=True) master_df = master_df.repartition(divisions=[0, 50, 100, 151], force=True) - # create index subset - index_df = master_df.index.to_frame().drop(columns=["id"]) - index_df.to_parquet(output_path / "index") - # create properties subset properties_df = master_df[["Name", "HP"]] - properties_df.to_parquet(output_path / "properties") + properties_df.to_parquet(output_path / "component_1") # create types subset types_df = master_df[["Type 1", "Type 2"]] - types_df.to_parquet(output_path / "types") + types_df.to_parquet(output_path / "component_2") if __name__ == "__main__": From e4eadf3ddb59925dd2d86c66abf2d70513ce3eb7 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 24 Nov 2023 08:46:50 +0100 Subject: [PATCH 3/4] Use new data format (#667) This PR applies the usage of the new data format: - fixes all tests - update component specifications and component code - remove subset field usage in `pipeline.py` --------- Co-authored-by: Robbe Sneyders --- components/caption_images/README.md | 6 +- .../caption_images/fondant_component.yaml | 12 +-- components/caption_images/src/main.py | 4 +- components/chunk_text/README.md | 8 +- components/chunk_text/fondant_component.yaml | 16 ++-- components/chunk_text/src/main.py | 7 +- .../chunk_text/tests/chunk_text_test.py | 6 +- components/download_images/README.md | 10 +-- .../download_images/fondant_component.yaml | 24 +++--- components/download_images/src/main.py | 5 +- .../download_images/tests/test_component.py | 8 +- components/embed_images/README.md | 6 +- .../embed_images/fondant_component.yaml | 18 ++--- components/embed_images/src/main.py | 4 +- components/embed_text/README.md | 8 +- components/embed_text/fondant_component.yaml | 22 +++--- 
components/embed_text/src/main.py | 4 +- .../embedding_based_laion_retrieval/README.md | 6 +- .../fondant_component.yaml | 18 ++--- .../src/main.py | 6 +- components/filter_image_resolution/README.md | 5 +- .../fondant_component.yaml | 10 +-- .../filter_image_resolution/src/main.py | 4 +- components/filter_text_length/README.md | 3 +- .../filter_text_length/fondant_component.yaml | 6 +- components/filter_text_length/src/main.py | 4 +- .../tests/text_length_filter_test.py | 2 +- components/image_cropping/README.md | 10 +-- .../image_cropping/fondant_component.yaml | 20 ++--- components/image_cropping/src/main.py | 4 +- .../image_resolution_extraction/README.md | 10 +-- .../fondant_component.yaml | 20 ++--- .../image_resolution_extraction/src/main.py | 7 +- components/index_weaviate/README.md | 5 +- .../index_weaviate/fondant_component.yaml | 14 ++-- components/language_filter/README.md | 3 +- .../language_filter/fondant_component.yaml | 6 +- components/language_filter/src/main.py | 2 +- components/load_from_files/README.md | 5 +- .../load_from_files/fondant_component.yaml | 12 ++- components/load_from_hf_hub/README.md | 3 +- .../load_from_hf_hub/fondant_component.yaml | 10 +-- components/load_from_hf_hub/src/main.py | 25 +++---- components/load_from_parquet/README.md | 3 +- .../load_from_parquet/fondant_component.yaml | 6 +- components/load_from_parquet/src/main.py | 25 +++---- components/minhash_generator/README.md | 6 +- .../minhash_generator/fondant_component.yaml | 16 ++-- components/minhash_generator/src/main.py | 4 +- components/normalize_text/README.md | 3 +- .../normalize_text/fondant_component.yaml | 6 +- components/normalize_text/src/main.py | 12 +-- .../prompt_based_laion_retrieval/README.md | 6 +- .../fondant_component.yaml | 14 ++-- .../prompt_based_laion_retrieval/src/main.py | 6 +- components/resize_images/README.md | 6 +- .../resize_images/fondant_component.yaml | 12 +-- components/resize_images/src/main.py | 2 +- components/segment_images/README.md 
| 6 +- .../segment_images/fondant_component.yaml | 12 +-- components/segment_images/src/main.py | 2 +- components/write_to_hf_hub/README.md | 3 +- .../write_to_hf_hub/fondant_component.yaml | 8 +- components/write_to_hf_hub/src/main.py | 19 +++-- scripts/component_readme/readme_template.md | 14 +--- src/fondant/core/manifest.py | 8 +- src/fondant/core/schema.py | 3 +- src/fondant/pipeline/pipeline.py | 56 ++++++-------- tests/component/test_data_io.py | 4 +- tests/examples/example_data/components/1.yaml | 35 --------- tests/examples/example_data/manifest.json | 35 --------- tests/examples/example_data/raw/split.py | 35 --------- .../examples/example_data/raw/testset.parquet | Bin 15048 -> 0 bytes .../subsets_input/index/part.0.parquet | Bin 1701 -> 0 bytes .../subsets_input/index/part.1.parquet | Bin 1707 -> 0 bytes .../subsets_input/index/part.2.parquet | Bin 1715 -> 0 bytes .../subsets_input/properties/part.0.parquet | Bin 3542 -> 0 bytes .../subsets_input/properties/part.1.parquet | Bin 3526 -> 0 bytes .../subsets_input/properties/part.2.parquet | Bin 3584 -> 0 bytes .../subsets_input/types/part.0.parquet | Bin 3018 -> 0 bytes .../subsets_input/types/part.1.parquet | Bin 3085 -> 0 bytes .../subsets_input/types/part.2.parquet | Bin 3066 -> 0 bytes .../second_component/fondant_component.yaml | 27 ------- .../second_component/fondant_component.yaml | 29 -------- .../second_component/fondant_component.yaml | 27 ------- .../fourth_component/fondant_component.yaml | 38 ---------- .../third_component/fondant_component.yaml | 33 --------- .../components/arguments/component.yaml | 68 ----------------- .../arguments/component_default_args.yaml | 69 ------------------ .../components/arguments/input_manifest.json | 18 ----- .../components/input_manifest.json | 17 ----- .../example_pipeline/cache/42.txt | 1 - .../component_1/manifest.json | 31 -------- .../component_2/manifest.json | 36 --------- .../component_1/manifest.json | 36 --------- .../component_2/manifest.json | 36 
--------- .../components/dummy_component/Dockerfile | 0 .../components/dummy_component/README.md | 0 .../dummy_component/fondant_component.yaml | 8 +- .../dummy_component/requirements.txt | 0 .../components/dummy_component/src/main.py | 0 .../load_from_parquet/fondant_component.yaml | 4 +- .../sample_pipeline_test/data/sample.parquet | Bin .../test_sample_pipeline.py | 6 +- .../compiled_pipeline/kubeflow_pipeline.yml | 0 .../first_component/fondant_component.yaml | 12 +-- .../second_component/fondant_component.yaml | 19 +++-- .../first_component/fondant_component.yaml | 17 ++--- .../second_component/fondant_component.yaml | 24 ++++++ .../first_component/fondant_component.yaml | 18 ++--- .../second_component/fondant_component.yaml | 21 ++++++ .../example_1/first_component/Dockerfile | 0 .../first_component/fondant_component.yaml | 19 ++--- .../example_1/fourth_component/Dockerfile | 0 .../fourth_component/fondant_component.yaml | 29 ++++++++ .../example_1/second_component/Dockerfile | 0 .../second_component/fondant_component.yaml | 18 +++++ .../example_1/third_component/Dockerfile | 0 .../third_component/fondant_component.yaml} | 19 ++--- tests/{ => pipeline}/test_compiler.py | 4 +- tests/{ => pipeline}/test_pipeline.py | 4 +- tests/{ => pipeline}/test_runner.py | 2 +- tests/test_cli.py | 32 ++++---- tox.ini | 2 +- 124 files changed, 420 insertions(+), 1059 deletions(-) delete mode 100644 tests/examples/example_data/components/1.yaml delete mode 100644 tests/examples/example_data/manifest.json delete mode 100644 tests/examples/example_data/raw/split.py delete mode 100644 tests/examples/example_data/raw/testset.parquet delete mode 100644 tests/examples/example_data/subsets_input/index/part.0.parquet delete mode 100644 tests/examples/example_data/subsets_input/index/part.1.parquet delete mode 100644 tests/examples/example_data/subsets_input/index/part.2.parquet delete mode 100644 tests/examples/example_data/subsets_input/properties/part.0.parquet delete mode 100644 
tests/examples/example_data/subsets_input/properties/part.1.parquet delete mode 100644 tests/examples/example_data/subsets_input/properties/part.2.parquet delete mode 100644 tests/examples/example_data/subsets_input/types/part.0.parquet delete mode 100644 tests/examples/example_data/subsets_input/types/part.1.parquet delete mode 100644 tests/examples/example_data/subsets_input/types/part.2.parquet delete mode 100644 tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml delete mode 100644 tests/examples/example_specs/components/arguments/component.yaml delete mode 100644 tests/examples/example_specs/components/arguments/component_default_args.yaml delete mode 100644 tests/examples/example_specs/components/arguments/input_manifest.json delete mode 100644 tests/examples/example_specs/components/input_manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json rename tests/{ => 
integration_tests}/sample_pipeline_test/components/dummy_component/Dockerfile (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/README.md (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/fondant_component.yaml (73%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/requirements.txt (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/src/main.py (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml (95%) rename tests/{ => integration_tests}/sample_pipeline_test/data/sample.parquet (100%) rename tests/{ => integration_tests}/test_sample_pipeline.py (91%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/compiled_pipeline/kubeflow_pipeline.yml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_1/first_component/fondant_component.yaml (62%) rename tests/{examples/example_pipelines/valid_pipeline => pipeline/examples/pipelines/invalid_pipeline}/example_1/second_component/fondant_component.yaml (55%) rename tests/{examples/example_pipelines/valid_pipeline/example_1 => pipeline/examples/pipelines/invalid_pipeline/example_2}/first_component/fondant_component.yaml (61%) create mode 100644 tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_3/first_component/fondant_component.yaml (53%) create mode 100644 tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/first_component/Dockerfile (100%) rename tests/{examples/example_pipelines/invalid_pipeline/example_2 => 
pipeline/examples/pipelines/valid_pipeline/example_1}/first_component/fondant_component.yaml (50%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/fourth_component/Dockerfile (100%) create mode 100644 tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/second_component/Dockerfile (100%) create mode 100644 tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/third_component/Dockerfile (100%) rename tests/{examples/example_specs/components/component.yaml => pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml} (59%) rename tests/{ => pipeline}/test_compiler.py (99%) rename tests/{ => pipeline}/test_pipeline.py (98%) rename tests/{ => pipeline}/test_runner.py (98%) diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 8bb38e996..401747cbb 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -7,13 +7,11 @@ This component captions images using a BLIP model from the Hugging Face hub **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- captions - - text: string +- captions_text: string ### Arguments diff --git a/components/caption_images/fondant_component.yaml b/components/caption_images/fondant_component.yaml index 7a72cd815..3da8e4720 100644 --- a/components/caption_images/fondant_component.yaml +++ b/components/caption_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - text: - type: utf8 + captions_text: + type: utf8 args: 
model_id: diff --git a/components/caption_images/src/main.py b/components/caption_images/src/main.py index 934ea09ce..86be52b40 100644 --- a/components/caption_images/src/main.py +++ b/components/caption_images/src/main.py @@ -90,7 +90,7 @@ def __init__( self.max_new_tokens = max_new_tokens def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - images = dataframe["images"]["data"] + images = dataframe["images_data"] results: t.List[pd.Series] = [] for batch in np.split( @@ -112,4 +112,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ).T results.append(captions) - return pd.concat(results).to_frame(name=("captions", "text")) + return pd.concat(results).to_frame(name=("captions_text")) diff --git a/components/chunk_text/README.md b/components/chunk_text/README.md index 97b3309e0..a12d74980 100644 --- a/components/chunk_text/README.md +++ b/components/chunk_text/README.md @@ -11,14 +11,12 @@ consists of the id of the original document followed by the chunk index. 
**This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - data: string - - original_document_id: string +- text_data: string +- text_original_document_id: string ### Arguments diff --git a/components/chunk_text/fondant_component.yaml b/components/chunk_text/fondant_component.yaml index d266b4dac..159e67556 100644 --- a/components/chunk_text/fondant_component.yaml +++ b/components/chunk_text/fondant_component.yaml @@ -10,18 +10,14 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - data: - type: string - original_document_id: - type: string + text_data: + type: string + text_original_document_id: + type: string args: chunk_size: diff --git a/components/chunk_text/src/main.py b/components/chunk_text/src/main.py index 8c41220d2..da46cbbd7 100644 --- a/components/chunk_text/src/main.py +++ b/components/chunk_text/src/main.py @@ -38,7 +38,7 @@ def __init__( def chunk_text(self, row) -> t.List[t.Tuple]: # Multi-index df has id under the name attribute doc_id = row.name - text_data = row[("text", "data")] + text_data = row[("text_data")] docs = self.text_splitter.create_documents([text_data]) return [ (doc_id, f"{doc_id}_{chunk_id}", chunk.page_content) @@ -63,9 +63,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ) results_df = results_df.set_index("id") - # Set multi-index column for the expected subset and field - results_df.columns = pd.MultiIndex.from_product( - [["text"], results_df.columns], - ) - return results_df diff --git a/components/chunk_text/tests/chunk_text_test.py b/components/chunk_text/tests/chunk_text_test.py index a47683ed3..f95180f98 100644 --- a/components/chunk_text/tests/chunk_text_test.py +++ b/components/chunk_text/tests/chunk_text_test.py @@ -7,7 +7,7 @@ def test_transform(): """Test chunk component method.""" input_dataframe = pd.DataFrame( { - ("text", "data"): [ + ("text_data"): [ 
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo", "ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis", "parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec,", @@ -25,8 +25,8 @@ def test_transform(): expected_output_dataframe = pd.DataFrame( { - ("text", "original_document_id"): ["a", "a", "a", "b", "b", "c", "c"], - ("text", "data"): [ + ("text_original_document_id"): ["a", "a", "a", "b", "b", "c", "c"], + ("text_data"): [ "Lorem ipsum dolor sit amet, consectetuer", "amet, consectetuer adipiscing elit. Aenean", "elit. Aenean commodo", diff --git a/components/download_images/README.md b/components/download_images/README.md index b491007b5..6ed54d66d 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -14,15 +14,13 @@ from the img2dataset library. **This component consumes:** -- images - - url: string +- images_url: string **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index 1982a96ba..abe19c653 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -13,21 +13,17 @@ tags: - Image processing consumes: - images: - fields: - url: - type: string + images_url: + type: string produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 - additionalFields: false + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 +# additionalFields: false args: timeout: @@ -53,7 +49,7 @@ args: description: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". 
type: str default: 'border' - resize_only_if_bigger: + resize_only_if_bigger: description: If True, resize only if image is bigger than image_size. type: bool default: False diff --git a/components/download_images/src/main.py b/components/download_images/src/main.py index 8a37b86eb..070859e07 100644 --- a/components/download_images/src/main.py +++ b/components/download_images/src/main.py @@ -119,7 +119,7 @@ async def download_dataframe() -> None: images = await asyncio.gather( *[ self.download_and_resize_image(id_, url, semaphore=semaphore) - for id_, url in zip(dataframe.index, dataframe["images"]["url"]) + for id_, url in zip(dataframe.index, dataframe["images_url"]) ], ) results.extend(images) @@ -134,8 +134,5 @@ async def download_dataframe() -> None: results_df = results_df.dropna() results_df = results_df.set_index("id", drop=True) - results_df.columns = pd.MultiIndex.from_product( - [["images"], results_df.columns], - ) return results_df diff --git a/components/download_images/tests/test_component.py b/components/download_images/tests/test_component.py index 1f690e6e5..d851ecd73 100644 --- a/components/download_images/tests/test_component.py +++ b/components/download_images/tests/test_component.py @@ -45,7 +45,7 @@ def test_transform(respx_mock): input_dataframe = pd.DataFrame( { - ("images", "url"): urls, + "images_url": urls, }, index=pd.Index(ids, name="id"), ) @@ -55,9 +55,9 @@ def test_transform(respx_mock): resized_images = [component.resizer(io.BytesIO(image))[0] for image in images] expected_dataframe = pd.DataFrame( { - ("images", "data"): resized_images, - ("images", "width"): [image_size] * len(ids), - ("images", "height"): [image_size] * len(ids), + "images_data": resized_images, + "images_width": [image_size] * len(ids), + "images_height": [image_size] * len(ids), }, index=pd.Index(ids, name="id"), ) diff --git a/components/embed_images/README.md b/components/embed_images/README.md index eec02f577..23e746136 100644 --- 
a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -7,13 +7,11 @@ Component that generates CLIP embeddings from images **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- embeddings - - data: list +- embeddings_data: list ### Arguments diff --git a/components/embed_images/fondant_component.yaml b/components/embed_images/fondant_component.yaml index a176b2f6b..86fdb53a4 100644 --- a/components/embed_images/fondant_component.yaml +++ b/components/embed_images/fondant_component.yaml @@ -2,21 +2,17 @@ name: Embed images description: Component that generates CLIP embeddings from images image: fndnt/embed_images:dev tags: - - Image processing + - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: model_id: diff --git a/components/embed_images/src/main.py b/components/embed_images/src/main.py index 03c647dc0..a0270b1e8 100644 --- a/components/embed_images/src/main.py +++ b/components/embed_images/src/main.py @@ -90,7 +90,7 @@ def __init__( self.batch_size = batch_size def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - images = dataframe["images"]["data"] + images = dataframe["images_data"] results: t.List[pd.Series] = [] for batch in np.split( @@ -110,4 +110,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ).T results.append(embeddings) - return pd.concat(results).to_frame(name=("embeddings", "data")) + return pd.concat(results).to_frame(name=("embeddings_data")) diff --git a/components/embed_text/README.md b/components/embed_text/README.md index a30a9ec4f..c53a779b9 100644 --- a/components/embed_text/README.md +++ b/components/embed_text/README.md @@ -7,14 +7,12 @@ Component that generates embeddings of text passages. 
**This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - data: string - - embedding: list +- text_data: string +- text_embedding: list ### Arguments diff --git a/components/embed_text/fondant_component.yaml b/components/embed_text/fondant_component.yaml index 2e34c5c0a..a1a3ca816 100644 --- a/components/embed_text/fondant_component.yaml +++ b/components/embed_text/fondant_component.yaml @@ -5,21 +5,17 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 - + text_data: + type: string + text_embedding: + type: array + items: + type: float32 + args: model_provider: description: | diff --git a/components/embed_text/src/main.py b/components/embed_text/src/main.py index c8c2acfde..3fdc08e47 100644 --- a/components/embed_text/src/main.py +++ b/components/embed_text/src/main.py @@ -65,7 +65,7 @@ def get_embeddings_vectors(self, texts): return self.embedding_model.embed_documents(texts.tolist()) def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - dataframe[("text", "embedding")] = self.get_embeddings_vectors( - dataframe[("text", "data")], + dataframe["text_embedding"] = self.get_embeddings_vectors( + dataframe["text_data"], ) return dataframe diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index 454253416..f19d55b03 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -9,13 +9,11 @@ used to find images similar to the embedded images / captions. 
**This component consumes:** -- embeddings - - data: list +- embeddings_data: list **This component produces:** -- images - - url: string +- images_url: string ### Arguments diff --git a/components/embedding_based_laion_retrieval/fondant_component.yaml b/components/embedding_based_laion_retrieval/fondant_component.yaml index d93e634a3..af147c158 100644 --- a/components/embedding_based_laion_retrieval/fondant_component.yaml +++ b/components/embedding_based_laion_retrieval/fondant_component.yaml @@ -7,19 +7,15 @@ tags: - Data retrieval consumes: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 produces: - images: - fields: - url: - type: string - additionalSubsets: false + images_url: + type: string +# additionalFields: false args: num_images: diff --git a/components/embedding_based_laion_retrieval/src/main.py b/components/embedding_based_laion_retrieval/src/main.py index b350e6142..0f7697dc3 100644 --- a/components/embedding_based_laion_retrieval/src/main.py +++ b/components/embedding_based_laion_retrieval/src/main.py @@ -58,18 +58,18 @@ async def async_query(): embedding_input=embedding.tolist(), ), ) - for embedding in dataframe["embeddings"]["data"] + for embedding in dataframe["embeddings_data"] ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)[["id", "url"]] + results_df = pd.DataFrame(results)["id", "url"] results_df = results_df.set_index("id") # Cast the index to string results_df.index = results_df.index.astype(str) - results_df.columns = [["images"], ["url"]] + results_df.columns = ["images_url"] return results_df diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 1bc0c27f5..e7093e680 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -7,9 +7,8 @@ Component 
that filters images based on minimum size and max aspect ratio **This component consumes:** -- images - - width: int32 - - height: int32 +- images_width: int32 +- images_height: int32 **This component produces no data.** diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index 0512d87f9..b6ff8cbe7 100644 --- a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -5,12 +5,10 @@ tags: - Image processing consumes: - images: - fields: - width: - type: int32 - height: - type: int32 + images_width: + type: int32 + images_height: + type: int32 args: min_image_dim: diff --git a/components/filter_image_resolution/src/main.py b/components/filter_image_resolution/src/main.py index 8fbfdfa77..b169196ec 100644 --- a/components/filter_image_resolution/src/main.py +++ b/components/filter_image_resolution/src/main.py @@ -23,8 +23,8 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio: float) -> None: self.max_aspect_ratio = max_aspect_ratio def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - width = dataframe["images"]["width"] - height = dataframe["images"]["height"] + width = dataframe["images_width"] + height = dataframe["images_height"] min_image_dim = np.minimum(width, height) max_image_dim = np.maximum(width, height) aspect_ratio = max_image_dim / min_image_dim diff --git a/components/filter_text_length/README.md b/components/filter_text_length/README.md index ed89dd128..4c5730180 100644 --- a/components/filter_text_length/README.md +++ b/components/filter_text_length/README.md @@ -7,8 +7,7 @@ A component that filters out text based on their length **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/filter_text_length/fondant_component.yaml b/components/filter_text_length/fondant_component.yaml index fee0fb242..2451f5981 
100644 --- a/components/filter_text_length/fondant_component.yaml +++ b/components/filter_text_length/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: min_characters_length: diff --git a/components/filter_text_length/src/main.py b/components/filter_text_length/src/main.py index 3e2f472a4..e3a6b0d61 100644 --- a/components/filter_text_length/src/main.py +++ b/components/filter_text_length/src/main.py @@ -23,10 +23,10 @@ def __init__(self, *_, min_characters_length: int, min_words_length: int): def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: """Filter out text based on their length.""" - caption_num_words = dataframe["text"]["data"].apply( + caption_num_words = dataframe["text_data"].apply( lambda x: len(fasttext.tokenize(x)), ) - caption_num_chars = dataframe["text"]["data"].apply(len) + caption_num_chars = dataframe["text_data"].apply(len) mask = (caption_num_words >= self.min_words_length) & ( caption_num_chars >= self.min_characters_length diff --git a/components/filter_text_length/tests/text_length_filter_test.py b/components/filter_text_length/tests/text_length_filter_test.py index eea98864e..55c927e79 100644 --- a/components/filter_text_length/tests/text_length_filter_test.py +++ b/components/filter_text_length/tests/text_length_filter_test.py @@ -24,6 +24,6 @@ def test_run_component_test(): # Then: dataframe only contains one row assert len(dataframe) == 1 assert ( - dataframe.loc[2]["text"]["data"] + dataframe.loc[2]["text_data"] == "This a valid sentence which should be still there" ) diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index 5d679c457..e59af3af6 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -22,15 +22,13 @@ right side is border-cropped image. 
**This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/image_cropping/fondant_component.yaml b/components/image_cropping/fondant_component.yaml index 416bc2c1d..130b14324 100644 --- a/components/image_cropping/fondant_component.yaml +++ b/components/image_cropping/fondant_component.yaml @@ -20,20 +20,16 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 args: cropping_threshold: diff --git a/components/image_cropping/src/main.py b/components/image_cropping/src/main.py index c670fdeb8..6a62e309c 100644 --- a/components/image_cropping/src/main.py +++ b/components/image_cropping/src/main.py @@ -46,12 +46,12 @@ def __init__( def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: # crop images - dataframe["images"]["data"] = dataframe["images"]["data"].apply( + dataframe["images_data"] = dataframe["images_data"].apply( lambda image: remove_borders(image, self.cropping_threshold, self.padding), ) # extract width and height - dataframe["images"][["width", "height"]] = dataframe["images"]["data"].apply( + dataframe["images_width", "images_height"] = dataframe["images_data"].apply( extract_dimensions, axis=1, result_type="expand", diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index a69a4df4e..77e11742d 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -7,15 +7,13 @@ Component that extracts image resolution data from the images **This component consumes:** -- images - 
- data: binary +- images_data: binary **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/image_resolution_extraction/fondant_component.yaml b/components/image_resolution_extraction/fondant_component.yaml index 1ddbf4afb..f840da680 100644 --- a/components/image_resolution_extraction/fondant_component.yaml +++ b/components/image_resolution_extraction/fondant_component.yaml @@ -5,17 +5,13 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 \ No newline at end of file + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 \ No newline at end of file diff --git a/components/image_resolution_extraction/src/main.py b/components/image_resolution_extraction/src/main.py index 823b7b70f..a8715d831 100644 --- a/components/image_resolution_extraction/src/main.py +++ b/components/image_resolution_extraction/src/main.py @@ -38,8 +38,9 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: """ logger.info("Filtering dataset...") - dataframe[[("images", "width"), ("images", "height")]] = dataframe[ - [("images", "data")] - ].apply(lambda x: extract_dimensions(x.images.data), axis=1) + dataframe["images_width", "images_height"] = dataframe[["images_data"]].apply( + lambda x: extract_dimensions(x.images.data), + axis=1, + ) return dataframe diff --git a/components/index_weaviate/README.md b/components/index_weaviate/README.md index ce4729c52..efa6286a0 100644 --- a/components/index_weaviate/README.md +++ b/components/index_weaviate/README.md @@ -7,9 +7,8 @@ Component that takes embeddings of text snippets and indexes them into a weaviat **This component consumes:** -- text - - data: string - - embedding: list +- text_data: string +- 
text_embedding: list **This component produces no data.** diff --git a/components/index_weaviate/fondant_component.yaml b/components/index_weaviate/fondant_component.yaml index d20d168fd..cb06ad683 100644 --- a/components/index_weaviate/fondant_component.yaml +++ b/components/index_weaviate/fondant_component.yaml @@ -5,14 +5,12 @@ tags: - Data writing consumes: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 + text_data: + type: string + text_embedding: + type: array + items: + type: float32 args: weaviate_url: diff --git a/components/language_filter/README.md b/components/language_filter/README.md index c3afd6435..3aebe1e26 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -7,8 +7,7 @@ A component that filters text based on the provided language. **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/language_filter/fondant_component.yaml b/components/language_filter/fondant_component.yaml index ab59a58be..3a98f27f7 100644 --- a/components/language_filter/fondant_component.yaml +++ b/components/language_filter/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: language: diff --git a/components/language_filter/src/main.py b/components/language_filter/src/main.py index f306512e4..4c753d1b4 100644 --- a/components/language_filter/src/main.py +++ b/components/language_filter/src/main.py @@ -38,7 +38,7 @@ def predict_lang(self, text: str): def is_language(self, row): """Predict if text of a row is written in the defined language.""" - return self.language in self.predict_lang(row["text"]) + return self.language in self.predict_lang(row["text_data"]) class LanguageFilterComponent(PandasTransformComponent): diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index 
834f568e5..9a618f176 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -11,9 +11,8 @@ location. It supports the following formats: .zip, gzip, tar and tar.gz. **This component produces:** -- file - - filename: string - - content: binary +- file_filename: string +- file_content: binary ### Arguments diff --git a/components/load_from_files/fondant_component.yaml b/components/load_from_files/fondant_component.yaml index 11416e5b5..2e0167b9d 100644 --- a/components/load_from_files/fondant_component.yaml +++ b/components/load_from_files/fondant_component.yaml @@ -7,13 +7,11 @@ tags: - Data loading produces: - file: - fields: - filename: - type: string - content: - type: binary - + file_filename: + type: string + file_content: + type: binary + args: directory_uri: description: Local or remote path to the directory containing the files diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index 1faa0175a..e14e6f440 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -9,8 +9,7 @@ Component that loads a dataset from the hub **This component produces:** -- dummy_variable - - data: binary +- dummy_variable: binary ### Arguments diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index d6a625971..7e72f2b22 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data loading produces: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: #TODO: fill in here + type: binary args: dataset_name: @@ -19,10 +17,10 @@ args: type: dict default: {} image_column_names: - description: Optional argument, a list containing the original image column names in case the + description: Optional argument, a list containing the original image column names in case the dataset on the 
hub contains them. Used to format the image from HF hub format to a byte string. type: list - default: [] + default: [ ] n_rows_to_load: description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale type: int diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index b978a96af..ccb2dd2ab 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -54,16 +54,12 @@ def get_columns_to_keep(self) -> t.List[str]: else: invert_column_name_mapping = {} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - column_name = f"{subset_name}_{field_name}" - if ( - invert_column_name_mapping - and column_name in invert_column_name_mapping - ): - columns.append(invert_column_name_mapping[column_name]) - else: - columns.append(column_name) + for field_name, field in self.spec.produces.items(): + column_name = field_name + if invert_column_name_mapping and column_name in invert_column_name_mapping: + columns.append(invert_column_name_mapping[column_name]) + else: + columns.append(column_name) if self.index_column is not None: columns.append(self.index_column) @@ -99,11 +95,10 @@ def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): def _get_meta_df() -> pd.DataFrame: meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) return pd.DataFrame(meta_dict).set_index("id") meta = _get_meta_df() diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index c83f7e9e8..d6bda66c3 100644 --- 
a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -9,8 +9,7 @@ Component that loads a dataset from a parquet uri **This component produces:** -- dummy_variable - - data: binary +- dummy_variable: binary ### Arguments diff --git a/components/load_from_parquet/fondant_component.yaml b/components/load_from_parquet/fondant_component.yaml index 5cc5796fa..894069c59 100644 --- a/components/load_from_parquet/fondant_component.yaml +++ b/components/load_from_parquet/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data loading produces: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: + type: binary args: dataset_uri: diff --git a/components/load_from_parquet/src/main.py b/components/load_from_parquet/src/main.py index ddd338552..117ae10ce 100644 --- a/components/load_from_parquet/src/main.py +++ b/components/load_from_parquet/src/main.py @@ -50,16 +50,12 @@ def get_columns_to_keep(self) -> t.List[str]: else: invert_column_name_mapping = {} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - column_name = f"{subset_name}_{field_name}" - if ( - invert_column_name_mapping - and column_name in invert_column_name_mapping - ): - columns.append(invert_column_name_mapping[column_name]) - else: - columns.append(column_name) + for field_name, field in self.spec.produces.items(): + column_name = field_name + if invert_column_name_mapping and column_name in invert_column_name_mapping: + columns.append(invert_column_name_mapping[column_name]) + else: + columns.append(column_name) if self.index_column is not None: columns.append(self.index_column) @@ -85,11 +81,10 @@ def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): def _get_meta_df() -> pd.DataFrame: meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - 
meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) return pd.DataFrame(meta_dict).set_index("id") meta = _get_meta_df() diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 422fdc7af..5fc4cb86e 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -7,13 +7,11 @@ A component that generates minhashes of text. **This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - minhash: list +- text_minhash: list ### Arguments diff --git a/components/minhash_generator/fondant_component.yaml b/components/minhash_generator/fondant_component.yaml index 6528112ef..1747982f8 100644 --- a/components/minhash_generator/fondant_component.yaml +++ b/components/minhash_generator/fondant_component.yaml @@ -5,18 +5,14 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - minhash: - type: array - items: - type: uint64 + text_minhash: + type: array + items: + type: uint64 args: shingle_ngram_size: description: Define size of ngram used for the shingle generation diff --git a/components/minhash_generator/src/main.py b/components/minhash_generator/src/main.py index c8034334b..f61e34fcb 100644 --- a/components/minhash_generator/src/main.py +++ b/components/minhash_generator/src/main.py @@ -51,10 +51,10 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - dataframe[("text", "shingles")] = dataframe[("text", "data")].apply( + dataframe["text_shingles"] = dataframe["text_data"].apply( create_shingles, ) - dataframe[("text", "minhash")] = dataframe[("text", "shingles")].apply( + dataframe["text_minhash"] = dataframe["text_shingles"].apply( 
compute_minhash, ) diff --git a/components/normalize_text/README.md b/components/normalize_text/README.md index edc955a79..3609ba0de 100644 --- a/components/normalize_text/README.md +++ b/components/normalize_text/README.md @@ -19,8 +19,7 @@ the training of large language models. **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/normalize_text/fondant_component.yaml b/components/normalize_text/fondant_component.yaml index d6551f578..fd9cfc4cb 100644 --- a/components/normalize_text/fondant_component.yaml +++ b/components/normalize_text/fondant_component.yaml @@ -17,10 +17,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: remove_additional_whitespaces: diff --git a/components/normalize_text/src/main.py b/components/normalize_text/src/main.py index 47220fba4..a98b7b36b 100644 --- a/components/normalize_text/src/main.py +++ b/components/normalize_text/src/main.py @@ -89,31 +89,31 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Pandas dataframe """ if self.normalize_lines: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( normalize_lines, ) if self.do_lowercase: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( lambda x: x.lower(), ) if self.apply_nfc: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( self._do_nfc_normalization, ) if self.remove_punctuation: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( _remove_punctuation, ) if self.remove_additional_whitespaces: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( 
_remove_additional_whitespaces, ) # remove all empty rows - dataframe = dataframe[dataframe[("text", "data")].astype(bool)] + dataframe = dataframe[dataframe["text_data"].astype(bool)] return dataframe diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index af43a9826..8d7ffcf70 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -12,13 +12,11 @@ This component doesn’t return the actual images, only URLs. **This component consumes:** -- prompts - - text: string +- prompts_text: string **This component produces:** -- images - - url: string +- images_url: string ### Arguments diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml index fdd7589dc..02ea08349 100644 --- a/components/prompt_based_laion_retrieval/fondant_component.yaml +++ b/components/prompt_based_laion_retrieval/fondant_component.yaml @@ -10,17 +10,13 @@ tags: - Data retrieval consumes: - prompts: - fields: - text: - type: string + prompts_text: + type: string produces: - images: - fields: - url: - type: string - additionalSubsets: false + images_url: + type: string +# additionalFields: false args: num_images: diff --git a/components/prompt_based_laion_retrieval/src/main.py b/components/prompt_based_laion_retrieval/src/main.py index c9459060f..2168f5ef0 100644 --- a/components/prompt_based_laion_retrieval/src/main.py +++ b/components/prompt_based_laion_retrieval/src/main.py @@ -56,18 +56,18 @@ async def async_query(): self.client.query, prompt, ) - for prompt in dataframe["prompts"]["text"] + for prompt in dataframe["prompts_text"] ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)[["id", "url"]] + results_df = pd.DataFrame(results)["id", "url"] results_df = results_df.set_index("id") # 
Cast the index to string results_df.index = results_df.index.astype(str) - results_df.columns = [["images"], ["url"]] + results_df.columns = ["images_url"] return results_df diff --git a/components/resize_images/README.md b/components/resize_images/README.md index 593b2ca76..89561e7a5 100644 --- a/components/resize_images/README.md +++ b/components/resize_images/README.md @@ -7,13 +7,11 @@ Component that resizes images based on given width and height **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- images - - data: binary +- images_data: binary ### Arguments diff --git a/components/resize_images/fondant_component.yaml b/components/resize_images/fondant_component.yaml index 6ab866d12..6112815c4 100644 --- a/components/resize_images/fondant_component.yaml +++ b/components/resize_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary + images_data: + type: binary args: resize_width: diff --git a/components/resize_images/src/main.py b/components/resize_images/src/main.py index 434dd29db..d5d4207bb 100644 --- a/components/resize_images/src/main.py +++ b/components/resize_images/src/main.py @@ -29,6 +29,6 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: axis=1, ) - dataframe[("images", "data")] = result + dataframe["images_data"] = result return dataframe diff --git a/components/segment_images/README.md b/components/segment_images/README.md index 9f475d516..418eacb13 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -7,13 +7,11 @@ Component that creates segmentation masks for images using a model from the Hugg **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- segmentations - - data: binary +- segmentations_data: binary ### Arguments diff 
--git a/components/segment_images/fondant_component.yaml b/components/segment_images/fondant_component.yaml index fca45e541..34fbd9fcd 100644 --- a/components/segment_images/fondant_component.yaml +++ b/components/segment_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - segmentations: - fields: - data: - type: binary + segmentations_data: + type: binary args: model_id: diff --git a/components/segment_images/src/main.py b/components/segment_images/src/main.py index 0f8f46faa..4e06c5d89 100644 --- a/components/segment_images/src/main.py +++ b/components/segment_images/src/main.py @@ -150,4 +150,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: results.append(segmentations) - return pd.concat(results).to_frame(name=("segmentations", "data")) + return pd.concat(results).to_frame(name=("segmentations_data")) diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 54978470a..ec80bf334 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -7,8 +7,7 @@ Component that writes a dataset to the hub **This component consumes:** -- dummy_variable - - data: binary +- dummy_variable: binary **This component produces no data.** diff --git a/components/write_to_hf_hub/fondant_component.yaml b/components/write_to_hf_hub/fondant_component.yaml index 363f2507c..b4391fbbc 100644 --- a/components/write_to_hf_hub/fondant_component.yaml +++ b/components/write_to_hf_hub/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data writing consumes: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: + type: binary args: hf_token: @@ -23,7 +21,7 @@ args: image_column_names: description: A list containing the image column names. 
Used to format to image to HF hub format type: list - default: [] + default: [ ] column_name_mapping: description: Mapping of the consumed fondant column names to the written hub column names type: dict diff --git a/components/write_to_hf_hub/src/main.py b/components/write_to_hf_hub/src/main.py index 0ed01b961..6d464f0f2 100644 --- a/components/write_to_hf_hub/src/main.py +++ b/components/write_to_hf_hub/src/main.py @@ -74,16 +74,15 @@ def write( # Get columns to write and schema write_columns = [] schema_dict = {} - for subset_name, subset in self.spec.consumes.items(): - for field in subset.fields.values(): - column_name = f"{subset_name}_{field.name}" - write_columns.append(column_name) - if self.image_column_names and column_name in self.image_column_names: - schema_dict[column_name] = datasets.Image() - else: - schema_dict[column_name] = generate_from_arrow_type( - field.type.value, - ) + for field_name, field in self.spec.consumes.items(): + column_name = field.name + write_columns.append(column_name) + if self.image_column_names and column_name in self.image_column_names: + schema_dict[column_name] = datasets.Image() + else: + schema_dict[column_name] = generate_from_arrow_type( + field.type.value, + ) schema = datasets.Features(schema_dict).arrow_schema dataframe = dataframe[write_columns] diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md index 1266b56d3..54ad2e417 100644 --- a/scripts/component_readme/readme_template.md +++ b/scripts/component_readme/readme_template.md @@ -8,11 +8,8 @@ {% if consumes %} **This component consumes:** -{% for subset_name, subset in consumes.items() %} -- {{ subset_name }} -{% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} -{% endfor %} +{% for field_name, field in consumes.items() %} +- {{ field.name }}: {{ field.type.value }} {% endfor %} {% else %} **This component consumes no data.** @@ -21,11 +18,8 @@ {% if produces %} **This 
component produces:** -{% for subset_name, subset in produces.items() %} -- {{ subset_name }} -{% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} -{% endfor %} +{% for field_name, field in produces.items() %} +- {{ field.name }}: {{ field.type.value }} {% endfor %} {% else %} **This component produces no data.** diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 013ce2b71..58c8ab045 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -188,7 +188,7 @@ def fields(self) -> t.Mapping[str, Field]: { name: Field( name=name, - type=Type(field["type"]), + type=Type.from_json(field), location=field["location"], ) for name, field in self._specification["fields"].items() @@ -222,8 +222,8 @@ def _add_or_update_index(self, field: Field, overwrite: bool = True): if field.name != "index": msg = ( - f"The field name is {field.name}. If you try to update the index, set the field" - f"name to `index`." + f"The field name is {field.name}. If you try to update the index, " # nosec B608 + f"set the field name to `index`." 
) raise ValueError(msg) @@ -238,7 +238,7 @@ def remove_field(self, name: str) -> None: del self._specification["fields"][name] - def evolve( # noqa : PLR0912 (too many branches) + def evolve( # : PLR0912 (too many branches) self, component_spec: ComponentSpec, *, diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index dc940b5f7..2599b5de1 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -5,7 +5,6 @@ import os import re import typing as t -from dataclasses import dataclass from enum import Enum import pyarrow as pa @@ -168,7 +167,7 @@ class Field: def __init__( self, name: str, - type: Type = None, + type: Type = Type("null"), location: str = "", ) -> None: self._name = name diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index 36f81b7db..05be61c17 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -443,13 +443,13 @@ def _validate_pipeline_definition(self, run_id: str): if not load_component: # Check subset exists for ( - component_subset_name, - component_subset, + component_field_name, + component_field, ) in component_spec.consumes.items(): - if component_subset_name not in manifest.subsets: + if component_field_name not in manifest.fields: msg = ( - f"Component '{component_spec.name}' is trying to invoke the subset " - f"'{component_subset_name}', which has not been defined or created " + f"Component '{component_spec.name}' is trying to invoke the field " + f"'{component_field_name}', which has not been defined or created " f"in the previous components." 
) raise InvalidPipelineDefinition( @@ -457,36 +457,22 @@ def _validate_pipeline_definition(self, run_id: str): ) # Get the corresponding manifest fields - manifest_fields = manifest.subsets[component_subset_name].fields - - # Check fields - for field_name, subset_field in component_subset.fields.items(): - # Check if invoked field exists - if field_name not in manifest_fields: - msg = ( - f"The invoked subset '{component_subset_name}' of the " - f"'{component_spec.name}' component does not match the " - f"previously created subset definition.\n The component is " - f"trying to invoke the field '{field_name}' which has not been " - f"previously defined. Current available fields are " - f"{manifest_fields}\n" - ) - raise InvalidPipelineDefinition( - msg, - ) - # Check if the invoked field schema matches the current schema - if subset_field != manifest_fields[field_name]: - msg = ( - f"The invoked subset '{component_subset_name}' of the " - f"'{component_spec.name}' component does not match the " - f"previously created subset definition.\n The '{field_name}' " - f"field is currently defined with the following schema:\n" - f"{manifest_fields[field_name]}\nThe current component to " - f"trying to invoke it with this schema:\n{subset_field}" - ) - raise InvalidPipelineDefinition( - msg, - ) + manifest_field = manifest.fields[component_field_name] + + # Check if the invoked field schema matches the current schema + if component_field.type != manifest_field.type: + msg = ( + f"The invoked field '{component_field_name}' of the " + f"'{component_spec.name}' component does not match the " + f"previously created field type.\n The '{manifest_field.name}' " + f"field is currently defined with the following type:\n" + f"{manifest_field.type}\nThe current component to " + f"trying to invoke it with this type:\n{component_field.type}" + ) + raise InvalidPipelineDefinition( + msg, + ) + manifest = manifest.evolve(component_spec, run_id=run_id) load_component = False diff --git 
a/tests/component/test_data_io.py b/tests/component/test_data_io.py index 30a4b7c10..d9dad121f 100644 --- a/tests/component/test_data_io.py +++ b/tests/component/test_data_io.py @@ -9,9 +9,7 @@ from fondant.core.manifest import Manifest manifest_path = Path(__file__).parent / "examples/data/manifest.json" -component_spec_path = ( - Path(__file__).parent / "examples/data/components/1.yaml" -) +component_spec_path = Path(__file__).parent / "examples/data/components/1.yaml" NUMBER_OF_TEST_ROWS = 151 diff --git a/tests/examples/example_data/components/1.yaml b/tests/examples/example_data/components/1.yaml deleted file mode 100644 index 0c245a512..000000000 --- a/tests/examples/example_data/components/1.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: Test component 1 -description: This is an example component -image: example_component:latest - -consumes: - properties: - fields: - Name: - type: "string" - HP: - type: "int32" - types: - fields: - Type 1: - type: "string" - Type 2: - type: "string" - -produces: - properties: - fields: - Name: - type: "string" - HP: - type: "int32" - types: - fields: - Type 1: - type: "string" - Type 2: - type: "string" -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/examples/example_data/manifest.json b/tests/examples/example_data/manifest.json deleted file mode 100644 index 8fe4ef16b..000000000 --- a/tests/examples/example_data/manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "tests/example_data/subsets_input", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/index" - }, - "subsets": { - "properties": { - "location": "/properties", - "fields": { - "Name": { - "type": "string" - }, - "HP": { - "type": "int32" - } - } - }, - "types": { - "location": "/types", - "fields": { - "Type 1": { - "type": "string" - }, - "Type 2": { - "type": "string" - } - } - } - } - } \ No 
newline at end of file diff --git a/tests/examples/example_data/raw/split.py b/tests/examples/example_data/raw/split.py deleted file mode 100644 index ade466125..000000000 --- a/tests/examples/example_data/raw/split.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -This is a small script to split the raw data into different subsets to be used while testing. - -The data is the 151 first pokemon and the following fields are available: - -'id', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', -'Sp. Atk', 'Sp. Def', 'Speed', 'source', 'Legendary' - - -""" -from pathlib import Path - -import dask.dataframe as dd - -data_path = Path(__file__).parent -output_path = Path(__file__).parent.parent - - -def split_into_subsets(): - # read in complete dataset - master_df = dd.read_parquet(path=data_path / "testset.parquet") - master_df = master_df.set_index("id", sorted=True) - master_df = master_df.repartition(divisions=[0, 50, 100, 151], force=True) - - # create properties subset - properties_df = master_df[["Name", "HP"]] - properties_df.to_parquet(output_path / "component_1") - - # create types subset - types_df = master_df[["Type 1", "Type 2"]] - types_df.to_parquet(output_path / "component_2") - - -if __name__ == "__main__": - split_into_subsets() diff --git a/tests/examples/example_data/raw/testset.parquet b/tests/examples/example_data/raw/testset.parquet deleted file mode 100644 index e7b9c625f0c104d9fb7c08137912df65d1915cd9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15048 zcmdU$3v?URna5{r$&qX+krTi0r726d9GWh)1lpxMx;-tW6xyEMF70NarCpZo?*ALf zugC^^8gfnrKWXOPJ9F>-{`dRty)#aS-Cb@_7|ty>ysU0DJepx(mNU$gY6Z)vStdih zM8RZgRg3~CK^9m7R6q@~K@QLWEm#Vc0UgK%%fSku2L_M_E&)c64_1OpfeEYvmx0y5 z3<`h+Tn-9B5wL<{Py*I~E5KT?4wQoRU;`)v<-i6iKqaUG8^M*J8q|PI;3`lHt_IhD zYe5~b0|%%FPS5}vK@(^OF3n>Dt1eq@F0fo)SY$0OS#!nOb*1Y!l$F~mDyueLSzWW~s@kis zxwg*ksCPCrHZ{9iHn+C5Z|UfCcXjvl_Vo`8ZoO{XXSVO)JYF9^ubV#n2@5+BJaG{RnE><~scZy?SV<8pxxM$BF4RWuOc 
z#|&Bx+8kiNRoE{0qzD&!>z)br%gUuH0Sziu26 zYPiq{ul#pYM??tnT$pE%n44fjQHTY2Hf*#COo`y)YQ#~Vw_i!l$WZtnn;`%*{%8}Uh>6#%4bYX z5pEPCF;Z}y23_Hm`!pR=DA9SZJ}mf1#{bZFiPBhDQctoPl_w&xZ7ia&Tg6Ea>Dx7goE6utX7fHm~o7rqdP*`(XA6aj16M_w*zRR1yAS!Cmv?M~~ ztXPz*<2>jvoz zeb4Hh0=!>Y5K!@>yz(Y}F9Jz~SN~4a0;_3rRE;8+3pbG>3BS5Pw*}EE@#RwbjmFfs z={BS8*uF>9*@|$K{YSH%hjBS?EYa}3u`47+L~fitskcWEO5(0%`av#?x(k-;+i>p) zugozzG1!HWQ1FTwM;H)rcnBiIc=pw@HhwHNCdnfMVI+N;+Q^hBWmh*U$W<_c{x;^K zH;PRLO=>Puv~xi|IUC)6S>EeL>)Ar<-esoqySCt*b27tlB%{@EnY>;+#MV({hWkdN z6g(P*2+!tr_t6|dCUnI86cO7c%r_#N?d-rLB86x=YLItyMyWwW{yVcuMxwCz_E zc@HV9C)ZkkQzF}TPIDGC?MtlTdu*#APd06!kSv!p7@Mtc6caQ-WmhMyMOIImOAY(M z`YLw0Y1(XTtv+QnbGR$U^CI;d_9BJIigdBu7MN`>YdA2^Y!9*1c2mj~psf8|-6iHI&4LB%y-nq3g-J*i{7R+Gyj8xZQ$oP|6i?+{|#~h?G4@rvn>LA8!Jc zb_eril&5{h0H8h6_eyoxSCz^8Eid}(ME(O>U2Q+D8u583^$Ydy%`iMLe+@pJX;ghD zS(>3WczP-_?_kdzN_o_h;)#hJIq^|_uEec7{^9KAjrBitR{iv=`yTn#aLMG9 zq3)}(dj~_`edG_;r#k|FSi!D2akymsWdDsOiZ7wLWy!?1Ed_En&%)9Aj^2@=mL)w^Uo-wPDen-kU}GxbWap8h(cW*qt1E8H!Tc zFxS$CnKpkAK(UEZ+9`#G56MW<(w4LY(DstHnqA;BK-<#G0c|H60c{P}1KO5$0@@Z@ zfeS&Q=psiaOm;!6SR&2k9Q(F2`zmWcY*6`t6$2Y1zNM#yT zv`L^3nI%0H8N1KzN-3XJTxZreIg{#$BKxtD*0*cVW+*$3I1g8L9xvL~|6s61p~%%M zmM>Ye*;4XC#hMphU30jw^?6Il-gi~c+qP|c%l>Vf`U^*^>lo$KdnNX8k$tlDtmXP& zyj!!(T)oFq`oiPRx2h)p<;it9g*AIR+F$YY7N5+pzVDcoIitBFsqtx5-oY&E2iIEP zsfEVr&0eH43u^qP(r_@d)o`7x@wp5|IVpM#DM}mrNh^dO)HkaG_K8VRIcE1;1>qlQKTsZE=GQkg~*ZRlM9i(ok@ z(Pkc_qc%$V;j=&aL)*r5#iuikDw-1LLqf&8OD2zP)al-gWrB)?-}*eG~jo zPKWRNMmXp9dvuq$&C1f(&2^S5Z`|=v*=paN$1gu&(w{u4?0qq_{p~Y$i_y0FN#mEA zt=sc&JhH|;RdHvD;lz%ErLWm;Y<_6)kxlt})#R4{uKP-L`yCDX-v(IgPg-Ub&$45) z=p9L_{=O>j!7S?=P1YxzP(8iri*#r~)t^xs{tVCVmsNi=rTVp`I{7L>{T$CGJxEj1 zkF+K|Nhk7X(uK4o-6;Z45GTz^HyS(jpbwDVZGgrQX*fN0NP8MPr1vsFUQc>I0m!Rq zOjQAjDx^QOuf0K*k70z=?}Bo{LtoqJ z`ciB%vhMIV3b*tYE*qzB}v%Ov)*_OZlX9YJL zO>A2I!TrK`?C0K+$(@D&y6^h43EvCv)vh{q;t|g5em;Nut&H_4_pIJaocs`vElpDJ zAF1+AW?7%!Vx4M*;OT8A!P8Tj1qGi{;PF$d!6^%VBtu~*H}4?9Nmi1N=6sTsLcaN&pu$(>++A=YM*!4GUn3E 
z%WIY1Z9jH1w-@NyFV@waQ@wn=^yV|Q6CTx18h5U8*M0lVtDk*bbF+QTZRg5f&M4c< z2$Rg!Ki@IY(&K78Wq*cg{(N7VqlB@(I50=ZuJC%ZJ(lv^(@XNsDy=W|TK9HC$n>@^ z(xU~1d@0kgA3|=Dh5R@6JRzqAM9fwQB(a{1Pg0WS&XtgsW8_uS^YXMOlgE+dBr6RI zlJXiro(XwpM4R?Pcpnz?Kbk+kPG=fb`Z=Pdry?_O&YzlJmBr=*Y_s~;Q{j&0`l7G* zM=!567TnsgSFKptnNh<{Og^k?oEpZ;rn_q3-G`R{v6FFVI)35nS(WiaMbUlvT4RaP zKH_@q@2pwQriowfZ#m#@dR068+VU$Ne&E+fHAN3H)+0O4QwZ*sbhT%)@;+2ppW0?U zwiODcH-J6nY!I*k`Ds#Rorl+FVd2?t+r*X+d z;n_2lKgeuQ7pPbCXnUh;GEXZq?_^)TrTgH1=9TB#?z^RE>tk(Qw#4fh&Bu=wI!`l4 z|5#AByi9Q<&~jXL>G6sC&*(fwrO%8PY@X_ADN`B;JU{zo@bkBJp2?qj`N-1OEH1~v z9sm45p`(SdzU7}4T8YQ2eCmO8a^duApY_KaWKM56$()|Lu*^DxzSWR?m!vXHKgEaL z@(e5I?9FE%@XsDmEz=jwrow7DB`H^6AI)H_?+nj&A}cDzBi_`!@HCtE9BciNV0}

7N6% zU@4%1tOL1VIiOj}0P=tlP?%c@E(IpA3Xo&124+wIEPx!W5EKC`z(!`qnaELSo}$GJ zg~9b;1E7UVIUonE0F{6iCKN`=Dal!Dz$QT3APSu{Q&Q+8XQM?lxxNFCBa-tq0CIYA zZWm|)YZTUT`1yJoo~b0{0`Box;GS&li(FH2l)vz)fcw z=@Fg$A-73SMb_WOQ_r)zXT!yc?b}0fQQVF<3M1PEy#5;BzCG!rcqR9g);ck6JrY9f zNbl|c^zqTT=>C~tEVB{j~!Y{_g?_%spF2<_*(_heCgw@txwRW;z z3S=-WO&Em_s}7~U|Upd}s>ypoT%pmZo63@2tXB+oG3 zH&c2Lf3)D^VjPO^w(ubo$2%mtwx_$XwAymR4YQ4z-wKQ1qwD9jB9(StD}+#NV`X|9 zW|HSLfx33XI{y_> z>E!cjo6%?f(3wg8DA}4j((6g4UBF6`&v?@-noYlem8Bjqrq?zn;{sNfTAig=Hz(r) zSBH))TwN;Tf>wto3ky}2O1^+qCHMd7b@TV@31Wg@b{4EduR*8%-N`HtZ zg($VWe0|yaG7G}HB~|sDOdImbEjKVmjot0u(zhwOIWoR;=?41P{G07|dncd|jw|gB zzrE3J_uK3Ja+t*QSH8)Z1IakI+v{B@@3_(x z6yt+Uy_Nm8_EArBZ_E>F6SfErk4vl>@`QS$p2nJyf&TVLzb#d+yjtk*8uPn?sq$h? z*b{V%d{gf(*RbU8tB4AoU}J2s%eB$fR8{V28jHH>TY_9)Z*;J+CO*(tE>b!9-u1Xo z$PvKx0dILt8Q<@a^IW0cIM>%cI@r`BwAR}Nl#O|t#UWp_I40k>zGZYE7#0UAI+OW* zy=4P^?trhUf!dBf^~MKmH6vVKRfy_xiM>5;p{&vsDzBN>_EcZ_M9WOOV2=@(K>ZsG z5B80Z^xLWep1vO3*Di2<$@;uajj&ncuKo&ll^1PH=2exo52xCb-rle$)M;z4cabe( zgZ=IRt}h#yvC()OX8n*T;Q*Wll{VY?RQko)zxEb**VzPUY5)+7ky2QxQ9D1GlrYXqORsSW4^wn$5Y`B zc#^hVs7<4|M^wJwLUw{b`kVu)`qJBjF+DrZ=8yY@Z8Q?-TuigBM!W1k3yq<9^<04Q z+}ZDjkG4k^Ys2$ z36*25!wtd^)@oSi+3n4#br9BPPCDQ0uSbqV%eD?$zB%kTcY0}U7olZevmeViM+2QV z(0Mzp_v$5l*?siYATI)uVYf>i6&p5CcBe~DY@x)Nr6!6}UgM&+p1q!yqxe+J+;2d> zA1yrR-mjig8|I8bxzw(;8#YwCHjHlAfLws5??n+OEJ??72{NQ(Z=U-nx@X4=lEaS%N+vZ>I z{J8$h^;0Yz?5o-}*jZnF-t`~Xf7yQ1b1Ce1UjOY9Z8cz*Y5U7oonwEDF}oe>0+bj= z$!a=v$nrIIr_LMd&0OEO@bxql96jO_5O zqxh2B>u}Re7alLuYftodfXbJ5;7cCwJqwOcxxWqa8d|<7&e0^fXead+Mt~i?b|>W~ zGv#vA{&W6#m$z0;;s^@K`z>6LB5>T~CuzUr@zspc9OfeUXIHUOZ#Y~kZz9d9O`3a)_H zz*QiC2zVVtK@7YB-UM;*7I+)H0}|j}AcFV6HEiVMM5evO1;(SV@|fe4E{3`f3ST7SB@ z9{w){#FDrkA_NGgH7tZ6a57J#p^@6tErlz=>j9pHL|7~yi~q05`7YQx-^o=%V##R8 zQEW{aMSl&Ph=#4{(*t#2jxBp6-b#ES8k%@NnGk)`fv4-9aw6J_rNhS_nED?zrsP#UpK@+{E~R0+ zH*^0?d2W7T#-XmbgCUAI5lw>)mEnw6@643pa4_M)j`iWl7}(sJ$)wV$w0H{ZJoU3; zDxcaCPf2uzHkBCpij8*?zvv0 zkoji}FN&vEDV=Zus}}_8nNtT(#Yv}~Kk7(zO3Pii6+f<8=2)(@a#FT&%(h!Dv+Kq^ 
zqsU-Cs+#FhwN&e~>@>sL`9lTo%2Ll%XVsh4vWs`~J=Xr5`;WEubhJ~VhHRN8t27Up zRd(fl_*L>5R+;cP^W2R?ZQGo9TB&9-shHOHt6WEn%tUV$(z;YMtNlT@Y84E{YW1~} zr@OM$?8%jKx^o_PT9Ec9@Vi*MKC6v8?aVv|_F-@z*^%4RBPn}UGc}a%-+j3{=Kf5+%BitR+XO-&THy)>{?H?kGvd+ ziRap}HuLtEm8IGPrCk_%d+*l7F6^}$)c6={!KsLChrCH_Du?@5Yt!Q7>HygpKWBNzP0ltdGbGW>e1 fyPK-xYo_k02qt_=EZjHb$F@ diff --git a/tests/examples/example_data/subsets_input/index/part.1.parquet b/tests/examples/example_data/subsets_input/index/part.1.parquet deleted file mode 100644 index be1028aaaf00e22c04ab54f275a1f5b9da8678ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1707 zcmcJQ&u6vrnR6C&BFRdo?a9MGz@av&~X)+w&rX21q70c_)hT{dya{xU4yUAC|o zh~R%95J@n8+)MHOo5B(de-g~Ij{uBAmg55ZcROQf-J~PYvzJ2e_%mWT-Gs}g! zZzJ5xr5a~&a!`sZ`Q|7S)%V*$o@CA4P zC@6vw*adr_3@TtBh@c7{f*PoU1~>qR;0QFqF?a+XgBFm04B9{ePry@f0=@(t@C>M{ z(H)JYvRM${P@!vxiyyY6^w4)E~vEz>T?bH+AGWe5pil3W0fo=xM1@Eek2_JWC+weHG zOxx((w9O-T{F%obdDHRy84q7~?0B8pn7tBa_Y|756m8hjSwyjl~ zr==tc!h%$^GYh5#8pbf0g%@U^3!$f*Rwzm*ZM~ohs)`yY z8t4|^@5lJooiQ$4xbP!%df~#ITlXfK=$G(+3%VV~NsJ3`^1HV<|5N9jd+z~uX+6ya zxziALvQp&?jyyuhH!DGnT)aT^iw^}!do4^lzyK!bf*!Dd4II!1F1P~*V7L|zdMv_- zML2|I3?^U-j=(W^5zN3#;AQX%con<`UI%Z0H^E!rZSW3w7rY1F2OoeB!CmkX_!xWw zJ_VnF&%qaK;p< zap{H}-~KjP{q_3pW3^}W+v0cj^S{qtxO`VLzc#-A!TR&WP5un;jh|NITriefC!tt0 zU}{_-Vnj~Dk<-4|0Kv45g%AWz=14R&R1MvhnG$?Fz_O5t5sSy- z|7&u#3uA-r)0ZS%+n-qdA|yl%VRl!vb~vfb&qxvz9})nj&B_he1>Ba={0!Db>U+h_+~yWPBhgE=Ucc46Hox!a>FN_~j5|e2=^kL?X-c!S(4i$t zJG9vGZH2;;K4!+7=X!-gW}h+qD84RGI%Wb^KM2+{rxuY|B#0^|oS{JZT?(7csC8lle%l)SUFh z%vsIUl$(1dt643Va0aC#ajTwF%WOYP%cXX@b6s&7p1N(_Teqt?dzqbouWKqbODSbb zU}uE$nz${eVyI5f&w-eDt{iGpe}5TCtQ^VB{K(&X_vWP9rK4K0l3%V-vQ%fsRWeyW zMr}Wvu6Ao?d)`T`Cs-5VYW)@W-@>g^T)5u{XZ=>72D4C=P5w*lfl}6!6z77uLvQ$f uq|S{@m`A20nsDX8ossTcQC;_n|HViw8b2}rO2p!E{L=jqBxDzVSAPN5#JEWS diff --git a/tests/examples/example_data/subsets_input/properties/part.0.parquet b/tests/examples/example_data/subsets_input/properties/part.0.parquet deleted file mode 100644 index 
5217045981270e26d7f66f5757130739b23b27b6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3542 zcmcInUu+{s8J|sTXOpvYz9Vba;48-5IoCExZC`wCjvRFD#Q!dK9Oq&?hAPZ@*Y?Eg zUEg{)j_(u+RXkJ$RXox@AXJ^8AcT;3LV~J7hzG<2FT5fINFd+=@l+vH@y)FBuQ?u| zoR#0~>^I+h|9*DnjqNHl`6mDO;2A0NdsiyyOo?+@%l? zv#P4woHsTD({u-$J*+EgaMrJCs-bf)b0I%@h)sn{hPEHr#I!V(3;Ju=w2Ujha7x1i z%OC@li-ywJQq4Y5Ia4gE7VaAg*UF`A3#2VmRXt$8rmBkOo|(IcyS9$Gl~4)xYj%izjBA9rc4f-MVM?iCRxqzN-q-0rK`G=|(7%D893ZZNyv#rlAo z_DiO0bh*!oWzxmXj?K-(c0woJ@HS*M=$PtI@OdHT4)@dRC88K6)_?!zm$~n{XL$kv z&2wJ=16x(~#5_W;UYke86|c8T^fu=Wr8TvMi8&IyNPc!^u4-$2ayY#s>X56Z4!E7` z1=4P7qrTm0akHrAp7mOE`s;;;En|zOpEWe!%-p7;5D4I!KdTu=kJ}7aiKg~7yQgvi ze;J#2z&&zLyVT~5Vh*@5bZm`jGJ1y95s;t=oduxQFL(q`=r+3PW*$U0%OCtCem()C z%y}-q9QLl^Q10!8jX$iGuNUvDuhhTY`{JLDzI5{qg?zK~y&q_QeK!WW2u1(ShXijh z=|`TR=qll!Doh0Az%OP3f99hhzQgm3=&#_wlYfR8pdn1(19eun9;2ZUQ90=K}AF>VRc}(AUOKh^s0zPD>^7$15WL%vdLYZazT@i_gE~6w!S-{F?Oo3jg?9 z?^VA4ix|Z5?tfzP2|d~c0Y6n{xKQ-hBx}p z>Ck`S`iW)zx<`0>^0=R0n{;ush!eDklaRa%;0AaAQve1etLg0((aIa~=RjCgmcfwzY5GC@; zLO9}x_Sm}0RvtxkxWPb2w>1qkT13^9z4J21@}4#7Q^km_lcu4l5zyA{UVn75V95Jw z)1vYoT=fdJFsMI@s5+>_R|DOIMCg4=Eu4$vMEa2;p&1@6KtSW{ zBDwNKPmb(Eeu!?}UcQ*`iTF&q7o0%lFQnthKB5y=LQ}<-VX`u7L^iQGqE+sXunAxM z)bW984v3*sX*|9ZTZ%=X1d*}pvx`f~rCX6BBzmPvWwUn2=}7T?{ssDpOKXxOZ2{=# zo+K?xskW4nq_&i9Gg&HiZhc6F>nH2Et+ewFBeTNR^O7`_U>`i05qPpqFzy0U!`)i4 zyIU_x(%KOAr(VqWG`o?lF4q&K136cNlFHJH?PG8`E+4lP8`PNTC#;GP znxp*fe8z&k$&Osxq58H`I~;7JCDQ0=nw+h)@b)KYTVkgcT) zak-~~{fd?+!#3E9%k@;UeyfsfW~p6?$ce&(My(Wc*5jIz%|QH>PJZ8jUu*?U?v)N2 zS;&FLlrB`O>$}!wy0Fr!o{Rk<)vw(ekdwI5h0WbseC)T;Z`2OE^~7;L61nfh|5W0Z z_3A**L43J$@#J^XdFro+Yt=y`vtqOOAnts(v&Hh)QysZi=``ajF}0r3Xik(rWo>s{ zpNsVa&SItgEke1GYt`6ptnEJGwuiKir@kofjEjpk!8ONDGn;`klX-ZSr*nSv zs@2SCXL;_+`c|UkV-~I@3ON1 zHep^kP)4jyiAKHf5Z6|0R)_ud!;O8(E&}&7*V7mqT2n9e<-{_xQQTgRZ|oOI%jrDY zgHnqQYfJ$BB}qmZ`jfWPR-Or=Dd`Iv*+5$>t94=14sB&7+o9uHar~Z=I!xN6BlP3( zhxV+r%?!5b1pxiol$eldIG!K~u-2ycL5}t_e6~)FtY=t|spI>AZDcCoiJfUj!nQC~ 
t=l$+oB!>Igg#R(D#b&?1$S$<-_`(W1S6Ofsejzx2r~tqG%kUrFzX2YBUC00c diff --git a/tests/examples/example_data/subsets_input/properties/part.1.parquet b/tests/examples/example_data/subsets_input/properties/part.1.parquet deleted file mode 100644 index ac842a070982a6f00c7fd496ab555a5916e220db..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3526 zcmcInO>84c74FX1iJdsJ*;U$V3xhmn#sqCL*iM|?3>iqdJt9tAKycv70dav>-Oe8w4zNtg zUsd<3SMR<0UcIti;c79)#r)pGJew*qB164GQNMI8Oi>h-mb(hJEceX%plg}3J_)iC z)`ylvGi(K$eHm%)nfd-m90)yn#*@_*ddB^#g|r?Xc|f^h$|^<%_QG@4)Q5^}Ntg}< zIa5HItYP{NThSGHh`RJm_l!@HO@*0VG?68_Ui zcYnz{>oW&fq3_LSu`Uio_s0ua6CGj&n{JAt|3P8*_vb3QHnQA54%Uz%i>NQr@6Ycj z`q0q5Kcsyg-qh(ogi5HZVO6#;Y-HAB=^Fj+%?vU-*b=0#iS&J68qD8%YaRw5^2Us3 z2V?iUY#I-{y4I!tNo8+HF7w)r_g#Y?T@CI?5VeR++80bI`lxR~JYSzfeM=e9pZRiV zpuNn@VXccy`krrFlaJ^xvuPms7-RZvUv6Y*?zcC#OvLjelE^JtgG`zX1^Ja0p_^kS znaSWPwJ?2766+#_gLeEk6VQwFZ z^uaUbuOAscP^Kb3@CF%QATdWx23S`ashFAYi~bL${onIOem52IJl|Ph5sn_kOi<9x z!9Y>){$zrAk1jG{`zSu5UDK48roQT(a#0Jj9*P6R0TsY&fHoii$N=^MD}eP`k4q-2 zzT%zQpq5C0(Ds~x_^LDQa~fEWJf2}ax?vUetDfEbYhyC-T2T!8E z2*%%8_+&o*I4QbU%ktw7e>?Hr&xw`DhqFP(HD+ZZSFt&H%jN&kwaA}lB2TA-|Ap)4 zQuUKb=GnE=s(o_J6(^fsCR7m`5`a|zq3LzN8qjiwXnd1s07=wcz_$S325bNbxk*3@ zkOpi52(ei}4v+^F09$|}padubwgDs|RX`1}1K0)Z0qTGTpb2OJNW$&`-T>SOdJNDVbYTF$gg;E)c+J89&G(TU!1(1xV;=cp;#YRt8F*#I9191-SZ97VqBWzxGI3O zEsk5^l3gytaa}IewPnfVrTHNdZl2BOsww9kdfSRUpXazC2kT(8jlgIxg1!Yv4qJ^x zuhlGZTzUxWldt7fWzf#nSDNwiA)l*TyjGI8H^`)wQT#!;^sKiv-}?>h0nkhpoR z-sf`=U+z*o`Mp%0_)}4%-fw5t26lW9cfKc8?fq*>iC1fq5L=7lW>O)0BKZ^3tua4W z`GcHA%Li4NDc-lzK zayz!cF1Y}pJ_XJe+8T~02m-8i$$gL`^$e|gZe%lK2bny*5A21u3V5>5v?H;X*s9L^ q-MbVUGLZRTfUPCLFqZ5KEi}HcLe5ndSchK;&W9?)->eG!gZmecoorbE diff --git a/tests/examples/example_data/subsets_input/properties/part.2.parquet b/tests/examples/example_data/subsets_input/properties/part.2.parquet deleted file mode 100644 index 1d7df89dbdd9ac5c06af6e7be741f0c26d523b58..0000000000000000000000000000000000000000 GIT binary patch literal 0 
HcmV?d00001 literal 3584 zcmcInTWlj&8JfE5aIz4tX8P-21pf8JRl?59M&x~_xwhvHt zRR!(-liM_y9~-6)aeWE@E$p z?c;V^u&r2V!3MSuxEY(*HB2wFWn}0*)S;iA&dZjnYYN6=6cq{u1qo^N2jV4J=*X6> zwdwirn%-*3f$?A+V6Upg{h_N@r0;MBsZErkmA!A_E@s%x@s516LxMgEDUY1qw zZ?A6&153h&2Z+Ui)~A)p$HIuW4-e^dxQyB=nj8sNWX+VlW07rS8@lOTV^Le~AfrnM zCvpR1AW=8zYu8JtYar7LhE`IM${Jy?WnhQ|F_1w9rGS9D|0 z4qSGU@vP1Ge3T#1yyzJa~t?23UbS(C5)G!C=2t|8foh5WLG=^M#1?pZy3lq3Q77>W@Y zX&Q|;=bB^3PW^)H+?yjXO0@6thvUH&G?D+#bm=e4l_$#E__@aS)LVZ& zdj9HNQU12{{U0iSyWgGuJxEXeC@{eUV~b(R7h^r^NX5)ZREqw2Ec!-Z>R*?p{O{&2 zHVxa6Vn!$!!f>M~cz$t|`4e4YlFs&gKzpuI0h(G4T=GzRAwM+-XaYDuCFJ+Wg!?w( zeh?_n0A>JLz#1R|_yiyc_z>WAz>|QRfM-Mg<$Z8TF44V_IBHDuG)=7k?IDNtM~1T~ zb@d@sU_SMRn>z31xF_{1))O%+ONE)&MxOufy|=#Aul%xF6rW@Z{%`!`kK>TDufK1C z@eFe|lCL-!`-dm`*67rm!KvSjP5c+GADYGoKIZ=DahJb7>JdqzFiBJk7T*R)02#0k z=l~Re3eW&LpbNMSFaRc`+9DQgV&MQNJwP8Y02~7D06q#h0(=bc0^mi!#{r}Sp9Fjg z@M*wj0G|bX4)A%vOMsUFUjTd&a2N0;z$<`P0bd5Z2Dk_K3Y6`u!%}>0cz@^aX8p$; z)_+`DQnrg!Fz`R^_dB&0z5D*?1Mk$2-#_(-%TT?GzTPE*hgC0DTBKN?DM(mF zjE`c1lOt|pO;V)^h?(&K0d~!BB8ya1icQ3p|Bs#cXgBE8BTZoRE*q7m{L!C34ow@I zW_?|xiO6Js;klELH4*ps1YNOJ%}m^gKbw%n#B&e;Dn*``|w&NbQN1jL>2<^I|;0TIJ~`x?#_00 z=6d4j=row~Q6$L8N-Ug<)OJ_^`4N=Ecb#F z$oz%nxU~1^#7Uun5qz{bX;x(3fR6)`l&e^QnSg~WCBu<7`i^o zoL!uqPaIKfkgHZV>NniJWdH4dnp`Pvh2ywQ0J+vUZh=d;xh%)ExlG%krPJs5JwjYP z;paCq?lX*z6^CEoxE=?3aCeNr-BE&Z7m)7l))zawjWWlr^guuTOhHxbX0Eo-m|H*K z^EHdt%JPPs=7HWRC`q%BDem)g{S;qcY$4EXHY7zD6s{Gr7U(WYe0_(=+hV&GU_?+e>hboW}`tkT8J1nInd(6 Y11s(xWwB-WPH=xz1^x+D;J@F00P_QnLjV8( diff --git a/tests/examples/example_data/subsets_input/types/part.0.parquet b/tests/examples/example_data/subsets_input/types/part.0.parquet deleted file mode 100644 index 6074b2fc3bd605b27f527b0d1b07e7ebc07c0fd2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3018 zcmcImOKcif6df=o*l`-$%}_{bOQ}QL)^_+9*N(i%00u*V*u*eE6FD=?7zWP_gpVx+Dna{oVop@~2m-Vw2_Dd7{#FS#ES!SAHwyupb3=?k(t(FB3F{LS+rnY#YEjLH-9`01ExMk{1 zO;Ana(T%#&s_QnG)>OINZYpJyeL3E&cXWyH!~~_;V;OdxP0g_)JIPvEt80mw9i@bh 
z5zrxO=2J7SqRDcenVj+jg26~M8qMu=bGh7usc4Sz1eyq<>$d`-%XoF#vp29TDi_Vg4O$U^mqvY zTEcUa{V$XEsm1&Ih->Kt>8WKrva*jyQV^A_+Jg~`i5W97=CLytX5^fWu>hlh6*vRf zfH7bkI15Yw=YZFM^S~r91xy340~Y`mumf)Z4!{Xq1l|N(z+1rEz&n5&co*;h?*TKw zCBO?@24;cxfe(NWfjM9vSOBh^v#t7|4*B_w8(A=kt-es&VLS9VY^HvUotZtw3TNPZ zV?SBXUvp1H1=srHxzul~8#C!G`F7!ncI%IWJF^cZ<@4%SU#ow9YfSC^Y{JD_`n$`l zNOV~q+U#G9dcPU>KC`<1MFyv>mopV&9M)F3Dr*93W!Q;J=EF^ViAlMT^^hrWXtE#* z$uM@&>2j{Vk`8!fd+`5N%>K)SUG&=Q&&Dn=jC0OmZ3wy~v>ZQJ?t2tnlJ~dEb+x1E zEzec=#~wxU+@5oL4AVCC%hYqyah}cnNRoXskGnlvAI8hMj>F7$iF3BE5 z>m9Ap8#wA>4M!ZYYjRtVgtmb2`yN?GIL-|+Hn$aFBSrsJ#jtX7WK4pLkWwp?0G#+C|IwcJZAC1P!i4OK-xN8&pYzu!tlIHjbisu<7ih+3>&x{0`W$S=mbgr`4u zW=C3AyVMs+`bA9ze^yN>-3s{pVj&zV1hb)XobYmr7)ahN@fn}t_p4GohWxYDM6F(t zdr4K)GJB;sY9Kj9lKK2vv3(agZ}MXjM0gPVq* z&-;pewkpNf2J5(UvrDw?6nVc&bfbHLuFuJQ1m55|O&{zRU#8Zd=VTrYQQr(?{z01p z@$`M+pf)@w_oN=xI8Z7i?+N@$hvr>d+fUWFP8$1x ztnrH+OKMrqG{nFnMpLd|IkbzXfDJYbkfhGMfWb4v1O{Uq zS$(XcDypLHx{0bP$|8%Xt1i3hqH3$E%c_g2Y8PGf4|G-a+&h@ygjA}Uj{MEzoO93p z&Uf#*!dpxsOuEP)9pn#=4Cy6_^8}&Iy9t6|+pVr4J5Fo4R$CXeNigw(rL;Zpl2q#r zOEv1Q$>~sAnF7mJr!O^B$+0lYwuMf|5m+y@q&*ay&=kpPgKKkjtEF~Y2Fk=XS{;id z$P}4bBxQ1zbd&DD3b8PWr|@}>2SMoky$QI>4%b=Y92ME13dM+2qk8Q82flOEC)DG^ zA_e~7eP@82_D5$4x8LX9>6^-2n4Iv+-g{2(U1#u5N6<4izkd;`C&&o`ZnN+p2>AZc zLEZ&5UcqW4c19~k%GhN%PWx<$rW_Ym?dnlw+m@nej@Lp-tq%e@!NN%jBwr%wkxEw= zCUKz8b1WqUzFI~z`G_;`zx&s(PUlqW2j* zU}a3svDJO*CZ9MnpgLOX`%V{4oN^G6Q&TQtdB#IT0V{yZfK|ZSjK_5aMZAF`*1_^7 z;4Q%0fU5uo5Cg;k8-N5L319&!00&3|t^qQDEFcHi1Z)BFfC69}umdOpN&p^E22=o5 zz;(b4z)iqAfL*{XfH31(7eP35N%jM+rc5@T3hwh9dwd>x5aT5l##q5ge`W7D_t~p+ z(;Gq{_1R+PxAm?0?2dA?{7Ap?`_Zk1k7V^hlrMOHCP2Cd`)@*(@$x+I zct4*EJ~rvredM@8 zN|mXbi*uB1+Qohad+tz%peqn#bTthkYO12iyC-dS=3NVp9tfnmhAOpWg@S0KtDF6y zqa`-sKty(3u>@JL1PH%FDF%eYiGjuzcaoP@siUK(WKOc6?DGxfIKgDZ{-4;ODvuU{ z)SyB;Fa3h~u>IxE2Mh6BZMN;1c~rw%Sw^uZH6EHrB*sNyMcayCwc1#9O%>ZhyN^US z`$D_j>Y-%&O1q=B3}g+5mqJS+3e=zmQ;#~AqDv9#i11A_g~BF(#cogEd#7GUSGc~x zFiakRu4@dl%*5(Ul40shypC<2{mpNVQR@u^Wna=HtsgflcS#?v5iD2L3v`~kOr<)>u5El7pO@Oi0 
zazm5)+zOYpU~IG@@9DA4kiVdDYOfCY!eTiVEk_DbiA8xCRix9mt9&kG`@@>dCV_vU!8KcTrJvSBJ-1(F zp$5b$o-UOV6>Bq|UaOT(a6d%xe59iedBxL3r5w|^jf6Uu_cMNpO@dCz!%=QBA2loV z+8)$`>H`hJJ*Bi3!g*mnPG3_6z0{QB*l$&VH{=LahPoWAW#VWJj&)?z64r+4+x1q> zDu1w7reEMUVhvF*G^FszoJOfD@JN#)u1hpZmXy*)HQ%r3rdEmMZ9iWMRro?fW)s78 ztZny@w%rOJ){t&^FW~igo{zxWdB*7j`z4fX4(55DhbbC)I`UNKcF^vPw&fptTA0Jrw;{wt&4d#69<_l)6K$uKx_PzV^x!koGH@F zI7fDSIh<)`)molGJ;^BOwsG8V3}bFVzoO4PhdV8W{KM9TSJY~(wg-3If6!PEVmC3^ zVt7!l2}xGrGm(TQ1}paCSM)jP&%vIAG+7*&4fY?g3%@(J;2(j%0ku*TeEKYqaB3iXp4{5NSI5brbT|2R}uI(mn;df>p3N>!0K0TNe4FGw6f;?4na;GbE?bLmzbQI!PCRD=Ns$Cxv8|8x?{S;a+YL{g4c4ZC&-H6g4aa5<`_M~2$CBnDa9P| z&J>zL!vnJgldCm#k|3AJ>?A3Zhe!|U3C$A6MsT2j3w*)s$Q37yvcr9lc$}JEJ({4a z)#|0O%VKrGlevBS^w_uGcL*YU!xbWZ!T2G<6AXA(dxkU-C7piRf79i^?h0RXhP``3 z2PdI^f^-rv9D)Zyz+5HuEa}5+%o;aQIms=#p9E3DB>L#k=UZMt&0ScfFXaH1z2G9XbfB`rUFafP` zuiHWq+9<*XNIHNnpa<9lYyn;bYy(~bybQPicm;3~@G9Un!0UiF0G9x70^R~#2D}Zp z0(b{-74R_=#itNI@C< z#y(%3gQq5Z3tVXF)5+{F^UFtatJ1mBXWH|>ZohEss;JzMZ+)Tue#e;n0er%L`a-0; zf7+aii1+A@*MD~;{K;eCuSP@vB7^;QFPM!J0Z&ViC5zv@penZ~A*p_acHG!Q*2D5gyB=)B1GU+*J?kjCH9LcBcXK=t4^fPZ!i<^{XEjaC zx~lLdXZ8^3MvpViW(Nh^kj$3S)R8nAor+9FC}4x?FFmZBicd{b+eE-e7Yb+ClXhDM zE{r{ce$m-iNbJ&gkZ^IJXXvCohY?4?dWNPuH2A?2 z2Z1Mcg0T!pbjobJQ7+{`*aH8=vy7&;E9v4)DVE>hmx>mz=ae%_f(LnnQ6nuTnW^)! 
zZj@)^RStYBC0P}E%q)|#z&9@Q>>9Ffi)^=*P0~t5Q&m1)sPbCMs+Eq zBdruMH+0Xgw_;S-Zle@?h+a>~yjGBf=+K&avCXlFlQmoyuNN(0Ngd{VqpTTfd3wdx zv&Bf6Ey!Yeaj=iovmL~3yUa#a#2Y>fcz+(uhhyzH`|$(kC6cfA*Lg4xgV$GL`G4T1 z1bHx8zE@#W5q_FM`2km5qpW9SL2uym0ky$m>jlWLEaEAP1Ay%95na9 zX=QESd~oO#?x9)^ENmBSidP~669#9VZ%fv3!7z^Fi*2HRaZT7)T5ujd7VJO4!bj&a HeD(hg6`~ko diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml deleted file mode 100644 index 389da55a1..000000000 --- a/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: Second component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - caption: - fields: - data: - type: string - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml deleted file mode 100644 index 3c996e9d6..000000000 --- a/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: Second component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - description: - type: binary - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git 
a/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml deleted file mode 100644 index c02abbaa1..000000000 --- a/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: Second component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: string - - captions: - fields: - data: - type: string - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml deleted file mode 100644 index 3cda0cc6c..000000000 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: Fourth component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - images: - fields: - data: - type: binary - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str - some_list: - description: Some list - type: list - items: - type: int \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml deleted file mode 100644 index 091a7d9d5..000000000 --- 
a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Third component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - images: - fields: - data: - type: binary - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_specs/components/arguments/component.yaml b/tests/examples/example_specs/components/arguments/component.yaml deleted file mode 100644 index 659ed0026..000000000 --- a/tests/examples/example_specs/components/arguments/component.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -args: - string_default_arg: - description: default string argument - type: str - default: foo - integer_default_arg: - description: default integer argument - type: int - default: 0 - float_default_arg: - description: default float argument - type: float - default: 3.14 - bool_false_default_arg: - description: default bool argument - type: bool - default: False - bool_true_default_arg: - description: default bool argument - type: bool - default: True - list_default_arg: - description: default list argument - type: list - default: ["foo", "bar"] - dict_default_arg: - description: default dict argument - type: dict - default: {"foo":1, "bar":2} - string_default_arg_none: - description: default string argument - type: str - default: None - integer_default_arg_none: - description: default integer argument - type: int - default: 0 - float_default_arg_none: - description: default float argument - type: float - default: 0.0 - bool_default_arg_none: - description: default bool argument - type: bool - default: False - 
list_default_arg_none: - description: default list argument - type: list - default: [] - dict_default_arg_none: - description: default dict argument - type: dict - default: {} - override_default_arg: - description: argument with default python value type that can be overriden - type: str - default: foo - override_default_arg_with_none: - description: argument with default python type that can be overriden with None - type: str - optional_arg: - description: optional argument - type: str - default: None diff --git a/tests/examples/example_specs/components/arguments/component_default_args.yaml b/tests/examples/example_specs/components/arguments/component_default_args.yaml deleted file mode 100644 index 816211c04..000000000 --- a/tests/examples/example_specs/components/arguments/component_default_args.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -args: - string_default_arg: - description: default string argument - type: str - default: foo - integer_default_arg: - description: default integer argument - type: int - default: 1 - float_default_arg: - description: default float argument - type: float - default: 3.14 - bool_false_default_arg: - description: default bool argument - type: bool - default: False - bool_true_default_arg: - description: default bool argument - type: bool - default: True - list_default_arg: - description: default list argument - type: list - default: ["foo", "bar"] - dict_default_arg: - description: default dict argument - type: dict - default: {"foo":1, "bar":2} - string_default_arg_none: - description: default string argument - type: str - default: None - integer_default_arg_none: - description: default integer argument - type: int - default: None - float_default_arg_none: - description: default float argument - type: float - default: None - bool_default_arg_none: - description: default bool argument - type: bool - default: None - 
list_default_arg_none: - description: default list argument - type: list - default: None - dict_default_arg_none: - description: default dict argument - type: dict - default: None - override_default_arg: - description: argument with default python value type that can be overriden - type: str - default: foo - override_default_none_arg: - description: argument with default None value type that can be overriden with a valid python type - type: float - default: None - override_default_arg_with_none: - description: argument with default python type that can be overriden with None - type: str - diff --git a/tests/examples/example_specs/components/arguments/input_manifest.json b/tests/examples/example_specs/components/arguments/input_manifest.json deleted file mode 100644 index 9ee2494f9..000000000 --- a/tests/examples/example_specs/components/arguments/input_manifest.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_123", - "component_id": "component_1", - "cache_key": "00" - }, - "index": { - "location": "/component_1" - }, - "fields": { - "data": { - "type": "binary", - "location": "/component_1" - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/components/input_manifest.json b/tests/examples/example_specs/components/input_manifest.json deleted file mode 100644 index 80fa0b91d..000000000 --- a/tests/examples/example_specs/components/input_manifest.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "/bucket", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/example_component" - }, - "fields": { - "data": { - "location": "/example_component", - "type": "binary" - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt 
b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt deleted file mode 100644 index 768ddfb21..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt +++ /dev/null @@ -1 +0,0 @@ -tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json deleted file mode 100644 index 47c2fe949..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2023", - "component_id": "component_1", - "cache_key": "42" - }, - "index": { - "location": "/component_1" - }, - "fields": - { - "data": { - "type": "binary", - "location": "/component_1" - }, - "height": { - "type": "int32", - "location": "/component_1" - }, - "width": { - "type": "int32", - "location": "/component_1" - }, - "captions": { - "type": "string", - "location": "/component_1" - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json deleted file mode 100644 index 78cfec59a..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2023", - "component_id": 
"component_2", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json deleted file mode 100644 index f00c64aac..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2024", - "component_id": "component_1", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json deleted file mode 100644 index f7a6f429d..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": 
"example_pipeline_2024", - "component_id": "component_2", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/sample_pipeline_test/components/dummy_component/Dockerfile b/tests/integration_tests/sample_pipeline_test/components/dummy_component/Dockerfile similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/Dockerfile rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/Dockerfile diff --git a/tests/sample_pipeline_test/components/dummy_component/README.md b/tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/README.md rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md diff --git a/tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml b/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml similarity index 73% rename from tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml index 1091703eb..0a041fa3d 100644 --- a/tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml +++ b/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml @@ -4,13 +4,9 @@ description: Dummy component for testing custom components image: fndnt/dummy_component:dev consumes: - text: - fields: - data: + text_data: type: string produces: - text: - fields: - data: + text_data: type: 
string \ No newline at end of file diff --git a/tests/sample_pipeline_test/components/dummy_component/requirements.txt b/tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/requirements.txt rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt diff --git a/tests/sample_pipeline_test/components/dummy_component/src/main.py b/tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/src/main.py rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py diff --git a/tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml b/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml similarity index 95% rename from tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml rename to tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml index 35c43aadb..eddb6e580 100644 --- a/tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml +++ b/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml @@ -3,9 +3,7 @@ description: Component that loads a dataset from a parquet uri image: fndnt/load_from_parquet:dev produces: - text: - fields: - data: + text_data: type: string args: diff --git a/tests/sample_pipeline_test/data/sample.parquet b/tests/integration_tests/sample_pipeline_test/data/sample.parquet similarity index 100% rename from tests/sample_pipeline_test/data/sample.parquet rename to tests/integration_tests/sample_pipeline_test/data/sample.parquet diff --git a/tests/test_sample_pipeline.py b/tests/integration_tests/test_sample_pipeline.py similarity index 91% rename from 
tests/test_sample_pipeline.py rename to tests/integration_tests/test_sample_pipeline.py index fefc65531..8e7f6fbda 100644 --- a/tests/test_sample_pipeline.py +++ b/tests/integration_tests/test_sample_pipeline.py @@ -17,7 +17,7 @@ # work around to make test executable on M1 Macbooks os.environ["DOCKER_DEFAULT_PLATFORM"] = "linux/amd64" -BASE_PATH = Path("./tests/sample_pipeline_test") +BASE_PATH = Path("./tests/integration_tests/sample_pipeline_test") NUMBER_OF_COMPONENTS = 3 @@ -57,6 +57,7 @@ def sample_pipeline(data_dir="./data") -> Pipeline: return pipeline +@pytest.mark.skip(reason="Skipping due to random failure.") def test_local_runner(sample_pipeline, tmp_path_factory): with tmp_path_factory.mktemp("temp") as data_dir: sample_pipeline.base_path = str(data_dir) @@ -64,7 +65,8 @@ def test_local_runner(sample_pipeline, tmp_path_factory): sample_pipeline, output_path="docker-compose.yaml", extra_volumes=[ - str(Path("tests/sample_pipeline_test/data").resolve()) + ":/data", + str(Path("tests/integration_tests/sample_pipeline_test/data").resolve()) + + ":/data", ], ) DockerRunner().run("docker-compose.yaml") diff --git a/tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml similarity index 100% rename from tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml rename to tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml similarity index 62% rename from tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml index abe5091ea..066519825 100644 --- 
a/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml @@ -3,16 +3,12 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - data: - type: string + captions_data: + type: string args: storage_args: diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml similarity index 55% rename from tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml index 2f9907df1..e9b67d68e 100644 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml @@ -3,18 +3,17 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary + + caption_data: + type: string produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: storage_args: diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml similarity index 61% rename from tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml rename to 
tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml index 18ea49b2c..053b4c5b5 100644 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml @@ -2,17 +2,16 @@ name: First component description: This is an example component image: example_component:latest -produces: - images: - fields: - data: - type: binary +consumes: + images_data: + type: binary - captions: - fields: - data: - type: string +produces: + captions_data: + type: string + images_data: + type: binary args: storage_args: description: Storage arguments diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml new file mode 100644 index 000000000..a1a7995a2 --- /dev/null +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml @@ -0,0 +1,24 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + captions_description: + type: binary + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml similarity index 53% rename from tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml index 45964a8c6..053b4c5b5 
100644 --- a/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml @@ -3,21 +3,15 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - data: - type: string + captions_data: + type: string - images: - fields: - data: - type: binary + images_data: + type: binary args: storage_args: description: Storage arguments diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml new file mode 100644 index 000000000..8e0517f0a --- /dev/null +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml @@ -0,0 +1,21 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: string + + captions_data: + type: string + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml similarity index 50% rename from 
tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml index 45964a8c6..0841688e9 100644 --- a/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml @@ -2,22 +2,13 @@ name: First component description: This is an example component image: example_component:latest -consumes: - images: - fields: - data: - type: binary - produces: - captions: - fields: - data: - type: string + images_data: + type: binary + + captions_data: + type: string - images: - fields: - data: - type: binary args: storage_args: description: Storage arguments diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml new file mode 100644 index 000000000..1cef340bd --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml @@ -0,0 +1,29 @@ +name: Fourth component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + embeddings_data: + type: array + items: + type: float32 + +produces: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str + some_list: + 
description: Some list + type: list + items: + type: int \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml new file mode 100644 index 000000000..fa328ae01 --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml @@ -0,0 +1,18 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile diff --git a/tests/examples/example_specs/components/component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml similarity index 59% rename from tests/examples/example_specs/components/component.yaml rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml index 973cc3e6b..fb6ebbaa0 100644 
--- a/tests/examples/example_specs/components/component.yaml +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml @@ -1,4 +1,4 @@ -name: Example component +name: Third component description: This is an example component image: example_component:latest @@ -6,18 +6,19 @@ consumes: images_data: type: binary -produces: - images_data: + captions_data: + type: string + + embeddings_data: type: array items: type: float32 -additionalFields: false +produces: + images_data: + type: binary args: - flag: - description: user argument + storage_args: + description: Storage arguments type: str - value: - description: integer value - type: int diff --git a/tests/test_compiler.py b/tests/pipeline/test_compiler.py similarity index 99% rename from tests/test_compiler.py rename to tests/pipeline/test_compiler.py index 903c7963c..2c34f7f4e 100644 --- a/tests/test_compiler.py +++ b/tests/pipeline/test_compiler.py @@ -20,9 +20,9 @@ VertexPipelineConfigs, ) -COMPONENTS_PATH = Path("./tests/example_pipelines/valid_pipeline") +COMPONENTS_PATH = Path("./tests/pipeline/examples/pipelines/valid_pipeline") -VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") +VALID_PIPELINE = Path("./tests/pipeline/examples/pipelines/compiled_pipeline/") TEST_PIPELINES = [ ( diff --git a/tests/test_pipeline.py b/tests/pipeline/test_pipeline.py similarity index 98% rename from tests/test_pipeline.py rename to tests/pipeline/test_pipeline.py index 37d421ef6..b4deebc97 100644 --- a/tests/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -8,8 +8,8 @@ from fondant.core.exceptions import InvalidPipelineDefinition from fondant.pipeline import ComponentOp, Pipeline, Resources -valid_pipeline_path = Path(__file__).parent / "example_pipelines/valid_pipeline" -invalid_pipeline_path = Path(__file__).parent / "example_pipelines/invalid_pipeline" +valid_pipeline_path = Path(__file__).parent / "examples/pipelines/valid_pipeline" 
+invalid_pipeline_path = Path(__file__).parent / "examples/pipelines/invalid_pipeline" def yaml_file_to_dict(file_path): diff --git a/tests/test_runner.py b/tests/pipeline/test_runner.py similarity index 98% rename from tests/test_runner.py rename to tests/pipeline/test_runner.py index 84ad63304..011f65e55 100644 --- a/tests/test_runner.py +++ b/tests/pipeline/test_runner.py @@ -11,7 +11,7 @@ VertexRunner, ) -VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") +VALID_PIPELINE = Path("./tests/pipeline/examples/pipelines/compiled_pipeline/") def test_docker_runner(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 7897719aa..61fa8630f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -55,16 +55,16 @@ def test_basic_invocation(command): @pytest.mark.parametrize( "module_str", [ - "example_modules.component", - "example_modules/component", - "example_modules.component.py", - "example_modules/component.py", + "examples.example_modules.component", + "examples.example_modules/component", + "examples.example_modules.component.py", + "examples.example_modules/component.py", ], ) def test_get_module(module_str): """Test get module method.""" module = get_module(module_str) - assert module.__name__ == "example_modules.component" + assert module.__name__ == "examples.example_modules.component" def test_get_module_error(): @@ -77,7 +77,7 @@ def test_get_module_error(): "module_str", [ __name__, # cannot be split - "example_modules.component", # module does not exist + "examples.example_modules.component", # module does not exist ], ) def test_component_from_module(module_str): @@ -89,8 +89,10 @@ def test_component_from_module(module_str): @pytest.mark.parametrize( "module_str", [ - "example_modules.invalid_component", # module contains more than one component class - "example_modules.invalid_double_components", # module does not contain a component class + # module contains more than one component class + 
"examples.example_modules.invalid_component", + # module does not contain a component class + "examples.example_modules.invalid_double_components", ], ) def test_component_from_module_error(module_str): @@ -103,7 +105,7 @@ def test_component_from_module_error(module_str): "module_str", [ __name__, - "example_modules.pipeline", + "examples.example_modules.pipeline", ], ) def test_pipeline_from_module(module_str): @@ -115,8 +117,10 @@ def test_pipeline_from_module(module_str): @pytest.mark.parametrize( "module_str", [ - "example_modules.component", # module does not contain a pipeline instance - "example_modules.invalid_double_pipeline", # module contains many pipeline instances + # module does not contain a pipeline instance + "examples.example_modules.component", + # module contains many pipeline instances + "examples.example_modules.invalid_double_pipeline", ], ) def test_pipeline_from_module_error(module_str): @@ -417,7 +421,7 @@ def test_vertex_run(tmp_path_factory): def test_component_build(mock_build, mock_push): """Test that the build command works as expected.""" args = argparse.Namespace( - component_dir=Path(__file__).parent / "example_component", + component_dir=Path(__file__).parent / "examples/example_component", tag="image:test", build_arg=["key=value"], nocache=True, @@ -435,7 +439,7 @@ def test_component_build(mock_build, mock_push): # Check that docker build and push were executed correctly mock_build.assert_called_with( - path=str(Path(__file__).parent / "example_component"), + path=str(Path(__file__).parent / "examples/example_component"), tag="image:test", buildargs={"key": "value"}, nocache=True, @@ -449,7 +453,7 @@ def test_component_build(mock_build, mock_push): # Check that the component specification file was updated correctly with open( - Path(__file__).parent / "example_component" / "fondant_component.yaml", + Path(__file__).parent / "examples/example_component" / "fondant_component.yaml", "r+", ) as f: content = f.read() diff --git 
a/tox.ini b/tox.ini index acd58f104..d22216b49 100644 --- a/tox.ini +++ b/tox.ini @@ -48,6 +48,6 @@ commands_pre= poetry install --all-extras poetry show commands= - poetry run python -m pytest tests -vv --cov fondant --cov-report term-missing + poetry run python -m pytest tests -vv --cov fondant --cov-report term-missing --ignore=tests/integration_tests commands_post= bash ./scripts/post-build.sh From 521578fa891fc8cb6a677bc3b8b5406dbb1b03ec Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Mon, 27 Nov 2023 10:21:30 +0100 Subject: [PATCH 4/4] Implement `previous_index` field (#668) #656 We might want to validate this by checking that the field mentioned in `previous_index` is also defined in the `consumes` section. --- .../download_images/fondant_component.yaml | 1 - .../Dockerfile | 15 +++-- .../embedding_based_laion_retrieval/README.md | 7 ++ .../fondant_component.yaml | 5 +- .../src/main.py | 20 +++--- .../test_requirements.txt | 1 + .../tests/pytest.ini | 2 + .../tests/test_component.py | 66 +++++++++++++++++++ .../index_qdrant/fondant_component.yaml | 14 ++-- .../prompt_based_laion_retrieval/Dockerfile | 15 +++-- .../prompt_based_laion_retrieval/README.md | 7 ++ .../fondant_component.yaml | 5 +- .../prompt_based_laion_retrieval/src/main.py | 17 +++-- .../test_requirements.txt | 1 + .../tests/pytest.ini | 2 + .../tests/test_component.py | 66 +++++++++++++++++++ src/fondant/component/executor.py | 19 +----- src/fondant/core/component_spec.py | 4 ++ src/fondant/core/manifest.py | 5 +- src/fondant/core/schemas/component_spec.json | 3 + .../examples/component_specs/component.yaml | 2 - .../evolution_examples/2/component.yaml | 6 +- .../evolution_examples/2/output_manifest.json | 16 ----- 23 files changed, 224 insertions(+), 75 deletions(-) create mode 100644 components/embedding_based_laion_retrieval/test_requirements.txt create mode 100644 components/embedding_based_laion_retrieval/tests/pytest.ini create mode 100644 
components/embedding_based_laion_retrieval/tests/test_component.py create mode 100644 components/prompt_based_laion_retrieval/test_requirements.txt create mode 100644 components/prompt_based_laion_retrieval/tests/pytest.ini create mode 100644 components/prompt_based_laion_retrieval/tests/test_component.py diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index abe19c653..91efeca15 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -23,7 +23,6 @@ produces: type: int32 images_height: type: int32 -# additionalFields: false args: timeout: diff --git a/components/embedding_based_laion_retrieval/Dockerfile b/components/embedding_based_laion_retrieval/Dockerfile index 72525d884..0cdcde81a 100644 --- a/components/embedding_based_laion_retrieval/Dockerfile +++ b/components/embedding_based_laion_retrieval/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 python:3.8-slim as base # System dependencies RUN apt-get update && \ @@ -16,8 +16,15 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team # Set the working directory to the component folder WORKDIR /component/src +COPY src/ src/ +ENV PYTHONPATH "${PYTHONPATH}:./src" -# Copy over src-files -COPY src/ . +FROM base as test +COPY test_requirements.txt . 
+RUN pip3 install --no-cache-dir -r test_requirements.txt +COPY tests/ tests/ +RUN python -m pytest tests -ENTRYPOINT ["fondant", "execute", "main"] \ No newline at end of file +FROM base +WORKDIR /component/src +ENTRYPOINT ["fondant", "execute", "main"] diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index f19d55b03..97e0866a5 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -14,6 +14,7 @@ used to find images similar to the embedded images / captions. **This component produces:** - images_url: string +- embedding_id: string ### Arguments @@ -45,3 +46,9 @@ embedding_based_laion_retrieval_op = ComponentOp.from_registry( pipeline.add_op(embedding_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` diff --git a/components/embedding_based_laion_retrieval/fondant_component.yaml b/components/embedding_based_laion_retrieval/fondant_component.yaml index af147c158..d7616cfbd 100644 --- a/components/embedding_based_laion_retrieval/fondant_component.yaml +++ b/components/embedding_based_laion_retrieval/fondant_component.yaml @@ -15,7 +15,10 @@ consumes: produces: images_url: type: string -# additionalFields: false + embedding_id: + type: string + +previous_index: embedding_id args: num_images: diff --git a/components/embedding_based_laion_retrieval/src/main.py b/components/embedding_based_laion_retrieval/src/main.py index 0f7697dc3..4d730f24c 100644 --- a/components/embedding_based_laion_retrieval/src/main.py +++ b/components/embedding_based_laion_retrieval/src/main.py @@ -1,7 +1,6 @@ """This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings.""" import asyncio import concurrent.futures -import functools import logging import typing as t @@ -40,6 +39,10 @@ def __init__( modality=Modality.IMAGE, ) + def query(self, id_: t.Any, embedding: t.List[float]) -> t.List[t.Dict]: + results = self.client.query(embedding_input=embedding) + return [dict(d, embedding_id=id_) for d in results] + def transform( self, dataframe: pd.DataFrame, @@ -53,23 +56,20 @@ async def async_query(): futures = [ loop.run_in_executor( executor, - functools.partial( - self.client.query, - embedding_input=embedding.tolist(), - ), + self.query, + row.id, + row.embeddings_data.tolist(), ) - for embedding in dataframe["embeddings_data"] + for row in dataframe.itertuples() ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)["id", "url"] + results_df = pd.DataFrame(results)[["id", "url", "embedding_id"]] results_df = results_df.set_index("id") - # Cast the index to string - results_df.index = results_df.index.astype(str) - results_df.columns = ["images_url"] + 
results_df.rename(columns={"url": "images_url"}) return results_df diff --git a/components/embedding_based_laion_retrieval/test_requirements.txt b/components/embedding_based_laion_retrieval/test_requirements.txt new file mode 100644 index 000000000..2a929edcc --- /dev/null +++ b/components/embedding_based_laion_retrieval/test_requirements.txt @@ -0,0 +1 @@ +pytest==7.4.2 diff --git a/components/embedding_based_laion_retrieval/tests/pytest.ini b/components/embedding_based_laion_retrieval/tests/pytest.ini new file mode 100644 index 000000000..bf6a8a517 --- /dev/null +++ b/components/embedding_based_laion_retrieval/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = ../src \ No newline at end of file diff --git a/components/embedding_based_laion_retrieval/tests/test_component.py b/components/embedding_based_laion_retrieval/tests/test_component.py new file mode 100644 index 000000000..ba59028bf --- /dev/null +++ b/components/embedding_based_laion_retrieval/tests/test_component.py @@ -0,0 +1,66 @@ +import typing as t + +import numpy as np +import pandas as pd + +from src.main import LAIONRetrievalComponent + + +def test_component(monkeypatch): + def mocked_client_query(embedding_input: t.List[float]) -> t.List[dict]: + if embedding_input == [1, 2]: + return [ + { + "id": "a", + "url": "http://a", + }, + { + "id": "b", + "url": "http://b", + }, + ] + if embedding_input == [2, 3]: + return [ + { + "id": "c", + "url": "http://c", + }, + { + "id": "d", + "url": "http://d", + }, + ] + msg = f"Unexpected value: `embeddings_input` was {embedding_input}" + raise ValueError(msg) + + input_dataframe = pd.DataFrame.from_dict( + { + "id": ["1", "2"], + "embeddings_data": [np.array([1, 2]), np.array([2, 3])], + }, + ) + + expected_output_dataframe = pd.DataFrame.from_dict( + { + "id": ["a", "b", "c", "d"], + "url": ["http://a", "http://b", "http://c", "http://d"], + "embedding_id": ["1", "1", "2", "2"], + }, + ) + expected_output_dataframe = 
expected_output_dataframe.set_index("id") + + component = LAIONRetrievalComponent( + num_images=2, + aesthetic_score=9, + aesthetic_weight=0.5, + ) + + monkeypatch.setattr(component.client, "query", mocked_client_query) + + output_dataframe = component.transform(input_dataframe) + + pd.testing.assert_frame_equal( + left=expected_output_dataframe, + right=output_dataframe, + check_dtype=False, + ) diff --git a/components/index_qdrant/fondant_component.yaml b/components/index_qdrant/fondant_component.yaml index 6feb3b257..68ea33847 100644 --- a/components/index_qdrant/fondant_component.yaml +++ b/components/index_qdrant/fondant_component.yaml @@ -7,14 +7,12 @@ image: 'fndnt/index_qdrant:dev' tags: - Data writing consumes: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 + text_data: + type: string + embeddings_data: + type: array + items: + type: float32 args: collection_name: description: The name of the Qdrant collection to upsert data into. diff --git a/components/prompt_based_laion_retrieval/Dockerfile b/components/prompt_based_laion_retrieval/Dockerfile index 72525d884..0cdcde81a 100644 --- a/components/prompt_based_laion_retrieval/Dockerfile +++ b/components/prompt_based_laion_retrieval/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 python:3.8-slim as base # System dependencies RUN apt-get update && \ @@ -16,8 +16,15 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team # Set the working directory to the component folder WORKDIR /component/src +COPY src/ src/ +ENV PYTHONPATH "${PYTHONPATH}:./src" -# Copy over src-files -COPY src/ . +FROM base as test +COPY test_requirements.txt . 
+RUN pip3 install --no-cache-dir -r test_requirements.txt +COPY tests/ tests/ +RUN python -m pytest tests -ENTRYPOINT ["fondant", "execute", "main"] \ No newline at end of file +FROM base +WORKDIR /component/src +ENTRYPOINT ["fondant", "execute", "main"] diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index 8d7ffcf70..0551730d9 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -17,6 +17,7 @@ This component doesn’t return the actual images, only URLs. **This component produces:** - images_url: string +- prompt_id: string ### Arguments @@ -50,3 +51,9 @@ prompt_based_laion_retrieval_op = ComponentOp.from_registry( pipeline.add_op(prompt_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml index 02ea08349..3ac3604ac 100644 --- a/components/prompt_based_laion_retrieval/fondant_component.yaml +++ b/components/prompt_based_laion_retrieval/fondant_component.yaml @@ -16,7 +16,10 @@ consumes: produces: images_url: type: string -# additionalFields: false + prompt_id: + type: string + +previous_index: prompt_id args: num_images: diff --git a/components/prompt_based_laion_retrieval/src/main.py b/components/prompt_based_laion_retrieval/src/main.py index 2168f5ef0..bd3cee783 100644 --- a/components/prompt_based_laion_retrieval/src/main.py +++ b/components/prompt_based_laion_retrieval/src/main.py @@ -41,6 +41,10 @@ def __init__( modality=Modality.IMAGE, ) + def query(self, id_: t.Any, prompt: str) -> t.List[t.Dict]: + results = self.client.query(text=prompt) + return [dict(d, prompt_id=id_) for d in results] + def transform( self, dataframe: 
pd.DataFrame, @@ -53,21 +57,20 @@ async def async_query(): futures = [ loop.run_in_executor( executor, - self.client.query, - prompt, + self.query, + row.id, + row.prompts_text, ) - for prompt in dataframe["prompts_text"] + for row in dataframe.itertuples() ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)["id", "url"] + results_df = pd.DataFrame(results)[["id", "url", "prompt_id"]] results_df = results_df.set_index("id") - # Cast the index to string - results_df.index = results_df.index.astype(str) - results_df.columns = ["images_url"] + results_df.rename(columns={"url": "images_url"}) return results_df diff --git a/components/prompt_based_laion_retrieval/test_requirements.txt b/components/prompt_based_laion_retrieval/test_requirements.txt new file mode 100644 index 000000000..2a929edcc --- /dev/null +++ b/components/prompt_based_laion_retrieval/test_requirements.txt @@ -0,0 +1 @@ +pytest==7.4.2 diff --git a/components/prompt_based_laion_retrieval/tests/pytest.ini b/components/prompt_based_laion_retrieval/tests/pytest.ini new file mode 100644 index 000000000..bf6a8a517 --- /dev/null +++ b/components/prompt_based_laion_retrieval/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = ../src \ No newline at end of file diff --git a/components/prompt_based_laion_retrieval/tests/test_component.py b/components/prompt_based_laion_retrieval/tests/test_component.py new file mode 100644 index 000000000..7a3a268e6 --- /dev/null +++ b/components/prompt_based_laion_retrieval/tests/test_component.py @@ -0,0 +1,66 @@ +import typing as t + +import pandas as pd + +from src.main import LAIONRetrievalComponent + + +def test_component(monkeypatch): + def mocked_client_query(text: str) -> t.List[dict]: + if text == "first prompt": + return [ + { + "id": "a", + "url": "http://a", + }, + { + "id": "b", + "url": "http://b", + }, + ] + if text == "second prompt": + return [ + { + "id": 
"c", + "url": "http://c", + }, + { + "id": "d", + "url": "http://d", + }, + ] + msg = f"Unexpected value: `text` was {text}" + raise ValueError(msg) + + input_dataframe = pd.DataFrame.from_dict( + { + "id": ["1", "2"], + "prompts_text": ["first prompt", "second prompt"], + }, + ) + + expected_output_dataframe = pd.DataFrame.from_dict( + { + "id": ["a", "b", "c", "d"], + "url": ["http://a", "http://b", "http://c", "http://d"], + "prompt_id": ["1", "1", "2", "2"], + }, + ) + expected_output_dataframe = expected_output_dataframe.set_index("id") + + component = LAIONRetrievalComponent( + num_images=2, + aesthetic_score=9, + aesthetic_weight=0.5, + url="", + ) + + monkeypatch.setattr(component.client, "query", mocked_client_query) + + output_dataframe = component.transform(input_dataframe) + + pd.testing.assert_frame_equal( + left=expected_output_dataframe, + right=output_dataframe, + check_dtype=False, + ) diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index d77200da8..571bc60bb 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -548,28 +548,11 @@ def _execute_component( ) # Clear divisions if component spec indicates that the index is changed - if self._infer_index_change(): + if self.spec.previous_index is not None: dataframe.clear_divisions() return dataframe - # TODO: fix in #244 - def _infer_index_change(self) -> bool: - """Infer if this component changes the index based on its component spec.""" - """ - if not self.spec.accepts_additional_subsets: - return True - if not self.spec.outputs_additional_subsets: - return True - for subset in self.spec.consumes.values(): - if not subset.additional_fields: - return True - return any( - not subset.additional_fields for subset in self.spec.produces.values() - ) - """ - return False - class DaskWriteExecutor(Executor[DaskWriteComponent]): """Base class for a Fondant write component.""" diff --git a/src/fondant/core/component_spec.py 
b/src/fondant/core/component_spec.py index 4dd945568..1700e10a1 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -181,6 +181,10 @@ def produces(self) -> t.Mapping[str, Field]: }, ) + @property + def previous_index(self) -> t.Optional[str]: + return self._specification.get("previous_index") + @property def args(self) -> t.Mapping[str, Argument]: args = self.default_arguments diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 58c8ab045..4f0aab480 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -267,7 +267,10 @@ def evolve( # : PLR0912 (too many branches) Field(name="index", location=component_spec.component_folder_name), ) - # TODO handle additionalFields + # Remove all previous fields if the component changes the index + if component_spec.previous_index: + for field_name in evolved_manifest.fields: + evolved_manifest.remove_field(field_name) # Add or update all produced fields defined in the component spec for name, field in component_spec.produces.items(): diff --git a/src/fondant/core/schemas/component_spec.json b/src/fondant/core/schemas/component_spec.json index 064ea027d..dfa6bf68c 100644 --- a/src/fondant/core/schemas/component_spec.json +++ b/src/fondant/core/schemas/component_spec.json @@ -33,6 +33,9 @@ "produces": { "$ref": "common.json#/definitions/fields" }, + "previous_index": { + "type": "string" + }, "args": { "$ref": "#/definitions/args" } diff --git a/tests/component/examples/component_specs/component.yaml b/tests/component/examples/component_specs/component.yaml index 973cc3e6b..d1f28b76e 100644 --- a/tests/component/examples/component_specs/component.yaml +++ b/tests/component/examples/component_specs/component.yaml @@ -11,8 +11,6 @@ produces: type: array items: type: float32 -additionalFields: false - args: flag: diff --git a/tests/core/examples/evolution_examples/2/component.yaml b/tests/core/examples/evolution_examples/2/component.yaml index 
2352adcb5..95d9300d1 100644 --- a/tests/core/examples/evolution_examples/2/component.yaml +++ b/tests/core/examples/evolution_examples/2/component.yaml @@ -7,8 +7,10 @@ consumes: type: binary produces: - images_encoding: - type: string + images_data: + type: binary + +previous_index: "true" # Only used to remove old fields for now args: storage_args: diff --git a/tests/core/examples/evolution_examples/2/output_manifest.json b/tests/core/examples/evolution_examples/2/output_manifest.json index ca1f6f361..db62fda15 100644 --- a/tests/core/examples/evolution_examples/2/output_manifest.json +++ b/tests/core/examples/evolution_examples/2/output_manifest.json @@ -9,25 +9,9 @@ "location":"/example_component" }, "fields": { - "images_width": { - "type": "int32", - "location":"/example_component" - }, - "images_height": { - "type": "int32", - "location":"/example_component" - }, "images_data": { "type": "binary", "location":"/example_component" - }, - "captions_data": { - "type": "binary", - "location":"/example_component" - }, - "images_encoding": { - "type": "string", - "location":"/example_component" } } } \ No newline at end of file