From 35338b6e792368309c10c8522d130c84412d9af0 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 16 Nov 2023 14:15:48 +0100 Subject: [PATCH 01/34] Update component spec schema validation --- src/fondant/core/schemas/component_spec.json | 33 +++---------------- .../component_specs/invalid_component.yaml | 8 +++-- .../component_specs/valid_component.yaml | 18 ++++------ .../valid_component_no_args.yaml | 13 ++++---- 4 files changed, 23 insertions(+), 49 deletions(-) diff --git a/src/fondant/core/schemas/component_spec.json b/src/fondant/core/schemas/component_spec.json index 8d684a3e5..418c5cb2b 100644 --- a/src/fondant/core/schemas/component_spec.json +++ b/src/fondant/core/schemas/component_spec.json @@ -28,43 +28,18 @@ } }, "consumes": { - "$ref": "#/definitions/subsets" + "$ref": "#/definitions/field" }, "produces": { - "$ref": "#/definitions/subsets" + "$ref": "#/definitions/field" }, "args": { "$ref": "#/definitions/args" } }, "definitions": { - "subset": { - "type": "object", - "properties": { - "fields": { - "$ref": "common.json#/definitions/fields" - }, - "additionalFields": { - "type": "boolean", - "default": true - } - }, - "required": [ - "fields" - ] - }, - "subsets": { - "type": "object", - "properties": { - "additionalSubsets": { - "type": "boolean", - "default": true - } - }, - "minProperties": 1, - "additionalProperties": { - "$ref": "#/definitions/subset" - } + "field": { + "$ref": "common.json#/definitions/fields" }, "args": { "type": "object", diff --git a/tests/example_specs/component_specs/invalid_component.yaml b/tests/example_specs/component_specs/invalid_component.yaml index 3fc8128b5..d1c88c444 100644 --- a/tests/example_specs/component_specs/invalid_component.yaml +++ b/tests/example_specs/component_specs/invalid_component.yaml @@ -4,11 +4,15 @@ image: example_component:latest consumes: images: - data: binary + fields: + data: + type: binary produces: captions: - data: string + fields: + data: + type: string Arguments: storage_args: 
diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/example_specs/component_specs/valid_component.yaml index c4b99e837..1215af1bd 100644 --- a/tests/example_specs/component_specs/valid_component.yaml +++ b/tests/example_specs/component_specs/valid_component.yaml @@ -6,24 +6,18 @@ tags: consumes: images: - fields: - data: - type: binary + type: binary embeddings: - fields: - data: - type: array - items: - type: float32 + type: array + items: + type: float32 produces: captions: - fields: - data: - type: string + type: string args: storage_args: description: Storage arguments - type: str \ No newline at end of file + type: str diff --git a/tests/example_specs/component_specs/valid_component_no_args.yaml b/tests/example_specs/component_specs/valid_component_no_args.yaml index c3adfa6aa..de11cb2ee 100644 --- a/tests/example_specs/component_specs/valid_component_no_args.yaml +++ b/tests/example_specs/component_specs/valid_component_no_args.yaml @@ -4,12 +4,13 @@ image: example_component:latest consumes: images: - fields: - data: - type: binary + type: binary + + embeddings: + type: array + items: + type: float32 produces: captions: - fields: - data: - type: string \ No newline at end of file + type: string From a269e3cf49b8cc9ac4af6247db7a229617f0b84c Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 16 Nov 2023 14:46:38 +0100 Subject: [PATCH 02/34] Update component spec tests to validate new component spec --- src/fondant/core/component_spec.py | 43 ++++++------------------------ tests/test_component_specs.py | 17 ++---------- 2 files changed, 10 insertions(+), 50 deletions(-) diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index cf177e07c..1c1f24cfa 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -66,34 +66,6 @@ def kubeflow_type(self) -> str: return lookup[self.type] -class ComponentSubset: - """ - Class representing a Fondant Component subset. 
- - Args: - specification: the part of the component json representing the subset - """ - - def __init__(self, specification: t.Dict[str, t.Any]) -> None: - self._specification = specification - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - @property - def fields(self) -> t.Mapping[str, Field]: - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - @property - def additional_fields(self) -> bool: - return self._specification.get("additionalFields", True) - - class ComponentSpec: """ Class representing a Fondant component specification. @@ -191,26 +163,27 @@ def tags(self) -> t.List[str]: @property def index(self): - return ComponentSubset({"fields": {}}) + # TODO: check usage + return {"fields": {}} @property - def consumes(self) -> t.Mapping[str, ComponentSubset]: + def consumes(self) -> t.Mapping[str, Field]: """The subsets consumed by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in self._specification.get("consumes", {}).items() + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification["consumes"].items() if name != "additionalSubsets" }, ) @property - def produces(self) -> t.Mapping[str, ComponentSubset]: + def produces(self) -> t.Mapping[str, Field]: """The subsets produced by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in self._specification.get("produces", {}).items() + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification["produces"].items() if name != "additionalSubsets" }, ) diff --git a/tests/test_component_specs.py b/tests/test_component_specs.py index caf0344de..e403180bb 100644 --- a/tests/test_component_specs.py +++ b/tests/test_component_specs.py @@ -8,7 +8,6 @@ 
import yaml from fondant.core.component_spec import ( ComponentSpec, - ComponentSubset, KubeflowComponentSpec, ) from fondant.core.exceptions import InvalidComponentSpec @@ -65,8 +64,8 @@ def test_attribute_access(valid_fondant_schema): assert fondant_component.name == "Example component" assert fondant_component.description == "This is an example component" - assert fondant_component.consumes["images"].fields["data"].type == Type("binary") - assert fondant_component.consumes["embeddings"].fields["data"].type == Type.list( + assert fondant_component.consumes["images"].type == Type("binary") + assert fondant_component.consumes["embeddings"].type == Type.list( Type("float32"), ) @@ -129,15 +128,3 @@ def test_kubeflow_component_spec_repr(valid_kubeflow_schema): kubeflow_component_spec = KubeflowComponentSpec(valid_kubeflow_schema) expected_repr = f"KubeflowComponentSpec({valid_kubeflow_schema!r})" assert repr(kubeflow_component_spec) == expected_repr - - -def test_component_subset_repr(): - """Test that the __repr__ method of ComponentSubset returns the expected string.""" - component_subset_schema = { - "name": "Example subset", - "description": "This is an example subset", - } - - component_subset = ComponentSubset(component_subset_schema) - expected_repr = f"ComponentSubset({component_subset_schema!r})" - assert repr(component_subset) == expected_repr From ad0dab64c67004b1e22909ce7fd69427fdb036b1 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 16 Nov 2023 15:03:13 +0100 Subject: [PATCH 03/34] Add additional fields to json schema --- src/fondant/core/schemas/common.json | 5 +++++ tests/example_specs/component_specs/valid_component.yaml | 1 + 2 files changed, 6 insertions(+) diff --git a/src/fondant/core/schemas/common.json b/src/fondant/core/schemas/common.json index 11df4e988..e64ffd57e 100644 --- a/src/fondant/core/schemas/common.json +++ b/src/fondant/core/schemas/common.json @@ -57,6 +57,11 @@ "fields": { "type": "object", "minProperties": 1, + 
"properties": { + "additionalFields": { + "type": "boolean" + } + }, "additionalProperties": { "$ref": "#/definitions/field" } diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/example_specs/component_specs/valid_component.yaml index 1215af1bd..1df326e87 100644 --- a/tests/example_specs/component_specs/valid_component.yaml +++ b/tests/example_specs/component_specs/valid_component.yaml @@ -16,6 +16,7 @@ consumes: produces: captions: type: string + additionalFields: true args: storage_args: From 7b9153599129b282aad66cf49dd2bd6023caad01 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 16 Nov 2023 15:38:02 +0100 Subject: [PATCH 04/34] Update manifest json schema for validation --- src/fondant/core/schemas/manifest.json | 14 ++++---- .../manifests/invalid_manifest.json | 14 ++++---- .../manifests/valid_manifest.json | 36 ++++++++----------- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/fondant/core/schemas/manifest.json b/src/fondant/core/schemas/manifest.json index 00ad6d1cc..3756f48e1 100644 --- a/src/fondant/core/schemas/manifest.json +++ b/src/fondant/core/schemas/manifest.json @@ -37,17 +37,17 @@ "location" ] }, - "subsets": { - "$ref": "#/definitions/subsets" + "fields": { + "$ref": "#/definitions/fields" } }, "required": [ "metadata", "index", - "subsets" + "fields" ], "definitions": { - "subset": { + "field": { "type": "object", "properties": { "location": { @@ -60,13 +60,13 @@ }, "required": [ "location", - "fields" + "type" ] }, - "subsets": { + "fields": { "type": "object", "additionalProperties": { - "$ref": "#/definitions/subset" + "$ref": "#/definitions/field" } } } diff --git a/tests/example_specs/manifests/invalid_manifest.json b/tests/example_specs/manifests/invalid_manifest.json index 3fe8b1097..51ec6c5e5 100644 --- a/tests/example_specs/manifests/invalid_manifest.json +++ b/tests/example_specs/manifests/invalid_manifest.json @@ -1,14 +1,14 @@ { "metadata": { - "base_path": "gs://bucket" + 
"pipeline_name": "test_pipeline", + "base_path": "gs://bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" }, "index": { - "location": "/index" + "location": "/component1" }, - "subsets": { - "images": { - "location": "/images", - "fields": [] - } + "fields": { + "images": {} } } \ No newline at end of file diff --git a/tests/example_specs/manifests/valid_manifest.json b/tests/example_specs/manifests/valid_manifest.json index 9bc00c512..0f7c58126 100644 --- a/tests/example_specs/manifests/valid_manifest.json +++ b/tests/example_specs/manifests/valid_manifest.json @@ -6,30 +6,24 @@ "component_id": "67890" }, "index": { - "location": "/index" + "location": "/component1" }, - "subsets": { + "fields":{ "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } + "location": "/component1", + "type": "binary" }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } + "height": { + "location": "/component2", + "type": "int32" + }, + "width": { + "location": "/component2", + "type": "int32" + }, + "caption": { + "location": "/component3", + "type": "string" } } } \ No newline at end of file From 5d1bf5e9c6cff613ec4e4902c136a696977ac97a Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 17 Nov 2023 14:30:25 +0100 Subject: [PATCH 05/34] Update manifest creation --- src/fondant/core/manifest.py | 45 +++++++++++++--------- src/fondant/core/schema.py | 30 +++++++++++++-- tests/test_manifest.py | 74 ++++++++++++++++++++---------------- 3 files changed, 95 insertions(+), 54 deletions(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 692c4e7cd..76d486d4a 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -172,7 +172,7 @@ def create( specification = { "metadata": metadata.to_dict(), "index": {"location": 
f"/{pipeline_name}/{run_id}/{component_id}/index"}, - "subsets": {}, + "fields": {}, } return cls(specification) @@ -224,35 +224,46 @@ def index(self) -> Index: return Index(self._specification["index"], base_path=self.base_path) @property - def subsets(self) -> t.Mapping[str, Subset]: + def fields(self) -> t.Mapping[str, Field]: """The subsets of the manifest as an immutable mapping.""" + # e.g. ('images', {'location': '/component1', 'type': 'binary'}) return types.MappingProxyType( { - name: Subset(subset, base_path=self.base_path) - for name, subset in self._specification["subsets"].items() + name: Field( + name=name, + type=Type(field["type"]), + base_path=self.base_path, + location=field["location"], + ) + for name, field in self._specification["fields"].items() }, ) - def add_subset( + def add_fields( self, - name: str, fields: t.Iterable[t.Union[Field, t.Tuple[str, Type]]], ) -> None: - if name in self._specification["subsets"]: - msg = f"A subset with name {name} already exists" - raise ValueError(msg) - - self._specification["subsets"][name] = { - "location": f"/{self.pipeline_name}/{self.run_id}/{self.component_id}/{name}", - "fields": {name: type_.to_json() for name, type_ in fields}, + """Add fields to manifest.""" + for field in fields: + if field.name in self._specification["fields"]: + msg = f"A field with name {field.name} already exists" + raise ValueError(msg) + + self.add_field(field) + + def add_field(self, field: Field): + """Add field to manifest.""" + self._specification["fields"][field.name] = { + "location": f"/{self.component_id}", + "type": field.type.name, } - def remove_subset(self, name: str) -> None: - if name not in self._specification["subsets"]: - msg = f"Subset {name} not found in specification" + def remove_field(self, name: str) -> None: + if name not in self._specification["fields"]: + msg = f"Field {name} not found in specification" raise ValueError(msg) - del self._specification["subsets"][name] + del 
self._specification["fields"][name] def evolve( # noqa : PLR0912 (too many branches) self, diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index ca9bb0944..6f23f642e 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -5,6 +5,7 @@ import os import re import typing as t +from dataclasses import dataclass from enum import Enum import pyarrow as pa @@ -161,11 +162,32 @@ def __eq__(self, other): return False -class Field(t.NamedTuple): - """Class representing a single field or column in a Fondant subset.""" +@dataclass +class Field: + """Class representing a single field or column in a Fondant dataset.""" - name: str - type: Type + def __init__( + self, name: str, type: Type, base_path: str = None, location: str = None + ) -> None: + self._name = name + self._type = type + self._base_path = base_path + self._location = location + + @property + def name(self) -> str: + """The absolute location of the field.""" + return self._name + + @property + def type(self) -> Type: + """The absolute location of the field.""" + return self._type + + @property + def location(self) -> str: + """The absolute location of the field.""" + return self._base_path + self._location def validate_partition_size(arg_value): diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 3af3ea425..cafa561b6 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -108,9 +108,9 @@ def test_attribute_access(valid_manifest): manifest = Manifest(valid_manifest) assert manifest.metadata == valid_manifest["metadata"] - assert manifest.index.location == "gs://bucket/index" - assert manifest.subsets["images"].location == "gs://bucket/images" - assert manifest.subsets["images"].fields["data"].type == Type("binary") + assert manifest.index.location == "gs://bucket/component1" + assert manifest.fields["images"].location == "gs://bucket/component1" + assert manifest.fields["images"].type == Type("binary") def test_manifest_creation(): @@ -129,8 
+129,13 @@ def test_manifest_creation(): cache_key=cache_key, ) - manifest.add_subset("images", [("width", Type("int32")), ("height", Type("int32"))]) - manifest.subsets["images"].add_field("data", Type("binary")) + manifest.add_fields( + [ + Field(name="width", type=Type("int32")), + Field(name="height", type=Type("int32")), + ] + ) + manifest.add_field(Field(name="data", type=Type("binary"))) assert manifest._specification == { "metadata": { @@ -141,20 +146,18 @@ def test_manifest_creation(): "cache_key": cache_key, }, "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, - "subsets": { - "images": { - "location": f"/{pipeline_name}/{run_id}/{component_id}/images", - "fields": { - "width": { - "type": "int32", - }, - "height": { - "type": "int32", - }, - "data": { - "type": "binary", - }, - }, + "fields": { + "width": { + "type": "int32", + "location": f"/{component_id}", + }, + "height": { + "type": "int32", + "location": f"/{component_id}", + }, + "data": { + "type": "binary", + "location": f"/{component_id}", }, }, } @@ -172,7 +175,7 @@ def test_manifest_repr(): manifest.__repr__() == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A'," " 'component_id': '1', 'cache_key': '42'}," - " 'index': {'location': '/NAME/A/1/index'}, 'subsets': {}})" + " 'index': {'location': '/NAME/A/1/index'}, 'fields': {}})" ) @@ -181,33 +184,38 @@ def test_manifest_alteration(valid_manifest): manifest = Manifest(valid_manifest) # test adding a subset - manifest.add_subset( - "images2", - [("width", Type("int32")), ("height", Type("int32"))], + manifest.add_fields( + [ + Field(name="width2", type=Type("int32")), + Field(name="height2", type=Type("int32")), + ], ) - assert "images2" in manifest.subsets + + assert "width2" in manifest.fields + assert "height2" in manifest.fields # test adding a duplicate subset - with pytest.raises(ValueError, match="A subset with name images2 already exists"): - manifest.add_subset( - "images2", - 
[("width", Type("int32")), ("height", Type("int32"))], + with pytest.raises(ValueError, match="A field with name width2 already exists"): + manifest.add_fields( + [ + Field(name="width2", type=Type("int32")), + ], ) # test removing a subset - manifest.remove_subset("images2") - assert "images2" not in manifest.subsets + manifest.remove_field("width2") + assert "images2" not in manifest.fields # test removing a nonexistant subset - with pytest.raises(ValueError, match="Subset pictures not found in specification"): - manifest.remove_subset("pictures") + with pytest.raises(ValueError, match="Field pictures not found in specification"): + manifest.remove_field("pictures") def test_manifest_copy_and_adapt(valid_manifest): """Test that a manifest can be copied and adapted without changing the original.""" manifest = Manifest(valid_manifest) new_manifest = manifest.copy() - new_manifest.remove_subset("images") + new_manifest.remove_field("images") assert manifest._specification == valid_manifest assert new_manifest._specification != valid_manifest From d8ecd01c58a53f689a0fb5eb34930353e145470f Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 09:25:54 +0100 Subject: [PATCH 06/34] Reduce PR to core module --- src/fondant/core/component_spec.py | 4 +- src/fondant/core/manifest.py | 170 +++++++++-------------------- src/fondant/core/schema.py | 8 +- tests/test_component.py | 44 +++----- tests/test_component_specs.py | 9 +- tests/test_manifest.py | 132 +++++++++++----------- 6 files changed, 142 insertions(+), 225 deletions(-) diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index 1c1f24cfa..6a9399ba4 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -173,7 +173,7 @@ def consumes(self) -> t.Mapping[str, Field]: { name: Field(name=name, type=Type.from_json(field)) for name, field in self._specification["consumes"].items() - if name != "additionalSubsets" + if name != 
"additionalFields" }, ) @@ -184,7 +184,7 @@ def produces(self) -> t.Mapping[str, Field]: { name: Field(name=name, type=Type.from_json(field)) for name, field in self._specification["produces"].items() - if name != "additionalSubsets" + if name != "additionalFields" }, ) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 76d486d4a..7dd2f7dad 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -18,59 +18,6 @@ from fondant.core.schema import Field, Type -class Subset: - """ - Class representing a Fondant subset. - - Args: - specification: The part of the manifest json representing the subset - base_path: The base path which the subset location is defined relative to - """ - - def __init__(self, specification: dict, *, base_path: str) -> None: - self._specification = specification - self._base_path = base_path - - @property - def location(self) -> str: - """The absolute location of the subset.""" - return self._base_path + self._specification["location"] - - @property - def fields(self) -> t.Mapping[str, Field]: - """The fields of the subset returned as an immutable mapping.""" - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - def add_field(self, name: str, type_: Type, *, overwrite: bool = False) -> None: - if not overwrite and name in self._specification["fields"]: - msg = f"A field with name {name} already exists" - raise ValueError(msg) - - self._specification["fields"][name] = type_.to_json() - - def remove_field(self, name: str) -> None: - del self._specification["fields"][name] - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - -class Index(Subset): - """Special case of a subset for the index, which has fixed fields.""" - - @property - def fields(self) -> t.Dict[str, Field]: - return { - "id": Field(name="id", type=Type("string")), - "source": 
Field(name="source", type=Type("string")), - } - - @dataclass class Metadata: """ @@ -171,7 +118,7 @@ def create( specification = { "metadata": metadata.to_dict(), - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, + "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}"}, "fields": {}, } return cls(specification) @@ -196,6 +143,10 @@ def copy(self) -> "Manifest": def metadata(self) -> t.Dict[str, t.Any]: return self._specification["metadata"] + @property + def index(self) -> t.Dict[str, t.Any]: + return self._specification["index"] + def update_metadata(self, key: str, value: t.Any) -> None: self.metadata[key] = value @@ -203,6 +154,30 @@ def update_metadata(self, key: str, value: t.Any) -> None: def base_path(self) -> str: return self.metadata["base_path"] + def get_dataset_location(self, spec: ComponentSpec) -> str: + """Determines dataset location using the base_path and component spec.""" + return self.base_path + "/" + spec.component_folder_name + + def retrieve_field_mapping(self): + """ + Retrieve a mapping of field locations to corresponding field names. + A dictionary where keys are field locations and values are lists + of column names. 
+ + Example: + { + "/base_path/component_1": ["Name", "HP"], + "/base_path/component_2": ["Type 1", "Type 2"], + } + """ + field_mapping = {} + for field_name, field in self.fields.items(): + if field.location in field_mapping: + field_mapping[field.location].append(field_name) + else: + field_mapping[field.location] = [field_name] + return field_mapping + @property def run_id(self) -> str: return self.metadata["run_id"] @@ -219,10 +194,6 @@ def pipeline_name(self) -> str: def cache_key(self) -> str: return self.metadata["cache_key"] - @property - def index(self) -> Index: - return Index(self._specification["index"], base_path=self.base_path) - @property def fields(self) -> t.Mapping[str, Field]: """The subsets of the manifest as an immutable mapping.""" @@ -232,7 +203,6 @@ def fields(self) -> t.Mapping[str, Field]: name: Field( name=name, type=Type(field["type"]), - base_path=self.base_path, location=field["location"], ) for name, field in self._specification["fields"].items() @@ -241,7 +211,7 @@ def fields(self) -> t.Mapping[str, Field]: def add_fields( self, - fields: t.Iterable[t.Union[Field, t.Tuple[str, Type]]], + fields: t.Iterable[Field], ) -> None: """Add fields to manifest.""" for field in fields: @@ -249,10 +219,17 @@ def add_fields( msg = f"A field with name {field.name} already exists" raise ValueError(msg) - self.add_field(field) + self.add_or_update_field(field, overwrite=False) + + def add_or_update_field(self, field: Field, overwrite: bool = False): + """Add or update field to manifest.""" + if overwrite is False and field.name in self._specification["fields"]: + msg = ( + f"A field with name {field.name} already exists. Set overwrite to true, " + f"if you want to update the field." 
+ ) + raise ValueError(msg) - def add_field(self, field: Field): - """Add field to manifest.""" self._specification["fields"][field.name] = { "location": f"/{self.component_id}", "type": field.type.name, @@ -265,7 +242,7 @@ def remove_field(self, name: str) -> None: del self._specification["fields"][name] - def evolve( # noqa : PLR0912 (too many branches) + def evolve( # : PLR0912 (too many branches) self, component_spec: ComponentSpec, *, @@ -285,68 +262,23 @@ def evolve( # noqa : PLR0912 (too many branches) # Update `component_id` of the metadata component_id = component_spec.component_folder_name evolved_manifest.update_metadata(key="component_id", value=component_id) + if run_id is not None: evolved_manifest.update_metadata(key="run_id", value=run_id) # Update index location as this is currently always rewritten - evolved_manifest.index._specification[ + evolved_manifest._specification["index"][ "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/index" - - # If additionalSubsets is False in consumes, - # Remove all subsets from the manifest that are not listed - if not component_spec.accepts_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.consumes: - evolved_manifest.remove_subset(subset_name) - - # If additionalSubsets is False in produces, - # Remove all subsets from the manifest that are not listed - if not component_spec.outputs_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.produces: - evolved_manifest.remove_subset(subset_name) - - # If additionalFields is False for a consumed subset, - # Remove all fields from that subset that are not listed - for subset_name, subset in component_spec.consumes.items(): - if subset_name in evolved_manifest.subsets and not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - 
evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) + ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}" + + # TODO handle additionalFields # For each output subset defined in the component, add or update it - for subset_name, subset in component_spec.produces.items(): - # Subset is already in manifest, update it - if subset_name in evolved_manifest.subsets: - # If additional fields are not allowed, remove the fields not defined in the - # component spec produces section - if not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) - - # Add fields defined in the component spec produces section - # Overwrite to persist changes to the field (eg. type of column) - for field in subset.fields.values(): - evolved_manifest.subsets[subset_name].add_field( - field.name, - field.type, - overwrite=True, - ) - - # Update subset location as this is currently always rewritten - evolved_manifest.subsets[subset_name]._specification[ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/{subset_name}" - - # Subset is not yet in manifest, add it - else: - evolved_manifest.add_subset(subset_name, subset.fields.values()) + for name, field in component_spec.produces.items(): + # If field was part not part of the input manifest, add field to output manifest. + # If field was part of the input manifest and got produced by the component, update + # the manifest field. 
+ evolved_manifest.add_or_update_field(field, overwrite=True) return evolved_manifest diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index 6f23f642e..d6ca6427f 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -167,11 +167,13 @@ class Field: """Class representing a single field or column in a Fondant dataset.""" def __init__( - self, name: str, type: Type, base_path: str = None, location: str = None + self, + name: str, + type: Type, + location: str = "", ) -> None: self._name = name self._type = type - self._base_path = base_path self._location = location @property @@ -187,7 +189,7 @@ def type(self) -> Type: @property def location(self) -> str: """The absolute location of the field.""" - return self._base_path + self._location + return self._location def validate_partition_size(arg_value): diff --git a/tests/test_component.py b/tests/test_component.py index e759bd367..e5dcb3bc3 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -377,38 +377,22 @@ def test_wrap_transform(): "description": "Component for testing", "image": "component:test", "consumes": { - "image": { - "fields": { - "height": { - "type": "int16", - }, - "width": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "image_width": { + "type": "int16", + }, + "caption_text": { + "type": "string", }, }, "produces": { - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "caption_text": { + "type": "string", }, - "image": { - "fields": { - "height": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, }, }, @@ -425,9 +409,9 @@ def test_wrap_transform(): def transform(dataframe: pd.DataFrame) -> pd.DataFrame: # Check hierarchical columns assert dataframe.columns.tolist() == [ - ("image", "height"), - ("image", "width"), - ("caption", "text"), + "image_height", + "image_width", + "caption_text", ] return 
dataframe diff --git a/tests/test_component_specs.py b/tests/test_component_specs.py index e403180bb..38a51ea8d 100644 --- a/tests/test_component_specs.py +++ b/tests/test_component_specs.py @@ -48,12 +48,19 @@ def test_component_spec_pkgutil_error(mock_get_data): def test_component_spec_validation(valid_fondant_schema, invalid_fondant_schema): - """Test that the manifest is validated correctly on instantiation.""" + """Test that the comp is validated correctly on instantiation.""" ComponentSpec(valid_fondant_schema) with pytest.raises(InvalidComponentSpec): ComponentSpec(invalid_fondant_schema) +def test_component_spec_load_from_file(valid_fondant_schema, invalid_fondant_schema): + """Test that the component spec is validated correctly on instantiation.""" + ComponentSpec.from_file(component_specs_path / "valid_component.yaml") + with pytest.raises(InvalidComponentSpec): + ComponentSpec.from_file(component_specs_path / "invalid_component.yaml") + + def test_attribute_access(valid_fondant_schema): """ Test that attributes can be accessed as expected: diff --git a/tests/test_manifest.py b/tests/test_manifest.py index cafa561b6..74587ae5a 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -3,10 +3,12 @@ from pathlib import Path import pytest +from fondant.core.component_spec import ComponentSpec from fondant.core.exceptions import InvalidManifest -from fondant.core.manifest import Field, Index, Manifest, Subset, Type +from fondant.core.manifest import Field, Manifest, Type manifest_path = Path(__file__).parent / "example_specs/manifests" +component_specs_path = Path(__file__).parent / "example_specs/component_specs" @pytest.fixture() @@ -28,53 +30,6 @@ def test_manifest_validation(valid_manifest, invalid_manifest): Manifest(invalid_manifest) -def test_subset_init(): - """Test initializing a subset.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = 
Subset(specification=subset_spec, base_path="/tmp") - assert subset.location == "/tmp/images/ABC/123" - assert ( - subset.__repr__() - == "Subset({'location': '/images/ABC/123', 'fields': {'data': {'type': 'binary'}}})" - ) - - -def test_subset_fields(): - """Test manipulating subset fields.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = Subset(specification=subset_spec, base_path="/tmp") - - # add a field - subset.add_field(name="data2", type_=Type("binary")) - assert "data2" in subset.fields - - # add a duplicate field - with pytest.raises(ValueError, match="A field with name data2 already exists"): - subset.add_field(name="data2", type_=Type("binary")) - - # add a duplicate field but overwrite - subset.add_field(name="data2", type_=Type("string"), overwrite=True) - assert subset.fields["data2"].type == Type("string") - - # remove a field - subset.remove_field(name="data2") - assert "data2" not in subset.fields - - def test_set_base_path(valid_manifest): """Test altering the base path in the manifest.""" manifest = Manifest(valid_manifest) @@ -108,8 +63,8 @@ def test_attribute_access(valid_manifest): manifest = Manifest(valid_manifest) assert manifest.metadata == valid_manifest["metadata"] - assert manifest.index.location == "gs://bucket/component1" - assert manifest.fields["images"].location == "gs://bucket/component1" + assert manifest.index["location"] == "/component1" + assert manifest.fields["images"].location == "/component1" assert manifest.fields["images"].type == Type("binary") @@ -133,9 +88,9 @@ def test_manifest_creation(): [ Field(name="width", type=Type("int32")), Field(name="height", type=Type("int32")), - ] + ], ) - manifest.add_field(Field(name="data", type=Type("binary"))) + manifest.add_or_update_field(Field(name="data", type=Type("binary"))) assert manifest._specification == { "metadata": { @@ -145,7 +100,7 @@ def test_manifest_creation(): "component_id": 
component_id, "cache_key": cache_key, }, - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, + "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}"}, "fields": { "width": { "type": "int32", @@ -175,7 +130,7 @@ def test_manifest_repr(): manifest.__repr__() == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A'," " 'component_id': '1', 'cache_key': '42'}," - " 'index': {'location': '/NAME/A/1/index'}, 'fields': {}})" + " 'index': {'location': '/NAME/A/1'}, 'fields': {}})" ) @@ -226,22 +181,59 @@ def test_no_validate_schema(monkeypatch, valid_manifest): Manifest(valid_manifest) -def test_index_fields(): - """Test that the fields property of Index returns the expected fields.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } +def test_evolve_manifest(): + """Test that the fields are evolved as expected.""" + run_id = "A" + spec = ComponentSpec.from_file(component_specs_path / "valid_component.yaml") + input_manifest = Manifest.create( + pipeline_name="NAME", + base_path="/base_path", + run_id=run_id, + component_id="component_1", + cache_key="42", + ) - index = Index(specification=subset_spec, base_path="/tmp") + output_manifest = input_manifest.evolve(component_spec=spec, run_id=run_id) - expected_fields = { - "id": Field(name="id", type=Type("string")), - "source": Field(name="source", type=Type("string")), - } + assert output_manifest.base_path == input_manifest.base_path + assert output_manifest.run_id == run_id + assert output_manifest.index["location"] == "/NAME/A/" + spec.component_folder_name + assert output_manifest.fields["captions"].type.name == "string" + + +def test_fields(): + """Test that the fields can added and updated as expected.""" + run_id = "A" + manifest = Manifest.create( + pipeline_name="NAME", + base_path="/base_path", + run_id=run_id, + component_id="component_1", + cache_key="42", + ) + + # add a field + 
manifest.add_or_update_field(Field(name="field_1", type=Type("int32"))) + assert "field_1" in manifest.fields + + # add a duplicate field, but overwrite (update) + manifest.add_or_update_field( + Field(name="field_1", type=Type("string")), + overwrite=True, + ) + assert manifest.fields["field_1"].type.name == "string" + + # add duplicate field + with pytest.raises( + ValueError, + match="A field with name field_1 already exists. Set overwrite to true, " + "if you want to update the field.", + ): + manifest.add_or_update_field( + Field(name="field_1", type=Type("string")), + overwrite=False, + ) - assert index.fields == expected_fields + # delete a field + manifest.remove_field(name="field_1") + assert "field_1" not in manifest.fields From 12c78ca35ec74522d241379e1959a77ebadb3a9a Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 10:39:57 +0100 Subject: [PATCH 07/34] Addresses comments --- src/fondant/core/component_spec.py | 13 +---- src/fondant/core/manifest.py | 51 ++++++++++++++----- src/fondant/core/schema.py | 4 +- src/fondant/core/schemas/manifest.json | 3 -- .../component_specs/valid_component.yaml | 1 - tests/test_manifest.py | 34 +++++++++++-- 6 files changed, 71 insertions(+), 35 deletions(-) diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index 6a9399ba4..a3d19178d 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -163,8 +163,7 @@ def tags(self) -> t.List[str]: @property def index(self): - # TODO: check usage - return {"fields": {}} + return Field(name="index", location=self._specification["index"].location) @property def consumes(self) -> t.Mapping[str, Field]: @@ -173,7 +172,6 @@ def consumes(self) -> t.Mapping[str, Field]: { name: Field(name=name, type=Type.from_json(field)) for name, field in self._specification["consumes"].items() - if name != "additionalFields" }, ) @@ -184,18 +182,9 @@ def produces(self) -> t.Mapping[str, Field]: { name: 
Field(name=name, type=Type.from_json(field)) for name, field in self._specification["produces"].items() - if name != "additionalFields" }, ) - @property - def accepts_additional_subsets(self) -> bool: - return self._specification.get("consumes", {}).get("additionalSubsets", True) - - @property - def outputs_additional_subsets(self) -> bool: - return self._specification.get("produces", {}).get("additionalSubsets", True) - @property def args(self) -> t.Mapping[str, Argument]: args = self.default_arguments diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 7dd2f7dad..6198bdecd 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -118,7 +118,7 @@ def create( specification = { "metadata": metadata.to_dict(), - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}"}, + "index": {"location": f"/{component_id}"}, "fields": {}, } return cls(specification) @@ -154,11 +154,8 @@ def update_metadata(self, key: str, value: t.Any) -> None: def base_path(self) -> str: return self.metadata["base_path"] - def get_dataset_location(self, spec: ComponentSpec) -> str: - """Determines dataset location using the base_path and component spec.""" - return self.base_path + "/" + spec.component_folder_name - - def retrieve_field_mapping(self): + @property + def field_mapping(self): """ Retrieve a mapping of field locations to corresponding field names. 
A dictionary where keys are field locations and values are lists @@ -172,10 +169,13 @@ def retrieve_field_mapping(self): """ field_mapping = {} for field_name, field in self.fields.items(): + location = ( + f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" + ) if field.location in field_mapping: - field_mapping[field.location].append(field_name) + field_mapping[location].append(field_name) else: - field_mapping[field.location] = [field_name] + field_mapping[location] = [field_name] return field_mapping @property @@ -223,16 +223,38 @@ def add_fields( def add_or_update_field(self, field: Field, overwrite: bool = False): """Add or update field to manifest.""" - if overwrite is False and field.name in self._specification["fields"]: + if field.name == "index": + self._add_or_update_index(field, overwrite=True) + elif overwrite is False and field.name in self._specification["fields"]: msg = ( f"A field with name {field.name} already exists. Set overwrite to true, " f"if you want to update the field." ) raise ValueError(msg) + else: + self._specification["fields"][field.name] = { + "location": f"/{self.component_id}", + "type": field.type.name, + } + + def _add_or_update_index(self, field: Field, overwrite: bool = True): + """Add or update the manifest index.""" + if overwrite is False: + msg = ( + "The index already exists. Set overwrite to true, " + "if you want to update the index." + ) + raise ValueError(msg) + + if field.name != "index": + msg = ( + f"The field name is {field.name}. If you try to update the index, set the field" + f"name to `index`." 
+ ) + raise ValueError(msg) - self._specification["fields"][field.name] = { + self._specification["index"] = { "location": f"/{self.component_id}", - "type": field.type.name, } def remove_field(self, name: str) -> None: @@ -267,9 +289,10 @@ def evolve( # : PLR0912 (too many branches) evolved_manifest.update_metadata(key="run_id", value=run_id) # Update index location as this is currently always rewritten - evolved_manifest._specification["index"][ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}" + evolved_manifest.add_or_update_field(Field(name="index")) + # evolved_manifest._specification["index"][ + # "location" + # ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}" # TODO handle additionalFields diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index d6ca6427f..b775a2c0f 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -169,7 +169,7 @@ class Field: def __init__( self, name: str, - type: Type, + type: Type = None, location: str = "", ) -> None: self._name = name @@ -178,7 +178,7 @@ def __init__( @property def name(self) -> str: - """The absolute location of the field.""" + """The name of the field.""" return self._name @property diff --git a/src/fondant/core/schemas/manifest.json b/src/fondant/core/schemas/manifest.json index 3756f48e1..77365dd5f 100644 --- a/src/fondant/core/schemas/manifest.json +++ b/src/fondant/core/schemas/manifest.json @@ -53,9 +53,6 @@ "location": { "type": "string", "pattern": "/.*" - }, - "fields": { - "$ref": "common.json#/definitions/fields" } }, "required": [ diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/example_specs/component_specs/valid_component.yaml index 1df326e87..1215af1bd 100644 --- a/tests/example_specs/component_specs/valid_component.yaml +++ b/tests/example_specs/component_specs/valid_component.yaml @@ -16,7 +16,6 @@ consumes: produces: captions: type: string - additionalFields: true args: 
storage_args: diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 74587ae5a..37897f67e 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -100,7 +100,7 @@ def test_manifest_creation(): "component_id": component_id, "cache_key": cache_key, }, - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}"}, + "index": {"location": f"/{component_id}"}, "fields": { "width": { "type": "int32", @@ -130,7 +130,7 @@ def test_manifest_repr(): manifest.__repr__() == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A'," " 'component_id': '1', 'cache_key': '42'}," - " 'index': {'location': '/NAME/A/1'}, 'fields': {}})" + " 'index': {'location': '/1'}, 'fields': {}})" ) @@ -197,7 +197,7 @@ def test_evolve_manifest(): assert output_manifest.base_path == input_manifest.base_path assert output_manifest.run_id == run_id - assert output_manifest.index["location"] == "/NAME/A/" + spec.component_folder_name + assert output_manifest.index["location"] == "/" + spec.component_folder_name assert output_manifest.fields["captions"].type.name == "string" @@ -237,3 +237,31 @@ def test_fields(): # delete a field manifest.remove_field(name="field_1") assert "field_1" not in manifest.fields + + +def test_accessing_the_index(): + """Test that test the index access.""" + run_id = "A" + manifest = Manifest.create( + pipeline_name="NAME", + base_path="/base_path", + run_id=run_id, + component_id="component_1", + cache_key="42", + ) + + # Add index field + manifest.metadata["component_id"] = "component_2" + manifest.add_or_update_field(Field(name="index", type=Type("int32"))) + assert manifest.index["location"] == "/component_2" + + +def test_field_mapping(valid_manifest): + """Test field mapping generation.""" + manifest = Manifest(valid_manifest) + field_mapping = manifest.field_mapping + assert field_mapping == { + "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], + 
"gs://bucket/test_pipeline/test_pipeline_12345/component2": ["width"], + "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], + } From c1cad603ceb6d88a136c1c417e42f0c303e1bcbe Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 10:43:01 +0100 Subject: [PATCH 08/34] Restructure test directory --- .../component_specs/invalid_component.yaml | 0 .../component_specs/kubeflow_component.yaml | 0 .../component_specs/valid_component.yaml | 0 .../valid_component_no_args.yaml | 0 .../components/arguments/component.yaml | 0 .../arguments/component_default_args.yaml | 0 .../components/arguments/input_manifest.json | 14 ++---- .../example_specs/components/component.yaml | 18 +++----- .../components/input_manifest.json | 17 +++++++ .../evolution_examples/1/component.yaml | 18 +++----- .../evolution_examples/1/output_manifest.json | 33 +++++++++++++ .../evolution_examples/2/component.yaml | 0 .../evolution_examples/2/output_manifest.json | 0 .../evolution_examples/3/component.yaml | 0 .../evolution_examples/3/output_manifest.json | 0 .../evolution_examples/4/component.yaml | 0 .../evolution_examples/4/output_manifest.json | 0 .../evolution_examples/5/component.yaml | 0 .../evolution_examples/5/output_manifest.json | 0 .../evolution_examples/6/component.yaml | 0 .../evolution_examples/6/output_manifest.json | 0 .../evolution_examples/7/component.yaml | 0 .../evolution_examples/7/output_manifest.json | 0 .../evolution_examples/8/component.yaml | 0 .../evolution_examples/8/output_manifest.json | 0 .../evolution_examples/input_manifest.json | 29 ++++++++++++ .../manifests/invalid_manifest.json | 0 .../manifests/valid_manifest.json | 0 .../example_pipeline/cache/42.txt | 0 .../component_1/manifest.json | 31 +++++++++++++ .../component_2/manifest.json | 0 .../component_1/manifest.json | 0 .../component_2/manifest.json | 0 tests/{ => core}/test_component_specs.py | 0 tests/{ => core}/test_manifest.py | 0 tests/{ => core}/test_manifest_evolution.py | 
0 tests/{ => core}/test_schema.py | 0 .../components/input_manifest.json | 22 --------- .../evolution_examples/1/output_manifest.json | 46 ------------------- .../evolution_examples/input_manifest.json | 35 -------------- .../component_1/manifest.json | 36 --------------- 41 files changed, 129 insertions(+), 170 deletions(-) rename tests/{ => core}/example_specs/component_specs/invalid_component.yaml (100%) rename tests/{ => core}/example_specs/component_specs/kubeflow_component.yaml (100%) rename tests/{ => core}/example_specs/component_specs/valid_component.yaml (100%) rename tests/{ => core}/example_specs/component_specs/valid_component_no_args.yaml (100%) rename tests/{ => core}/example_specs/components/arguments/component.yaml (100%) rename tests/{ => core}/example_specs/components/arguments/component_default_args.yaml (100%) rename tests/{ => core}/example_specs/components/arguments/input_manifest.json (60%) rename tests/{ => core}/example_specs/components/component.yaml (56%) create mode 100644 tests/core/example_specs/components/input_manifest.json rename tests/{ => core}/example_specs/evolution_examples/1/component.yaml (55%) create mode 100644 tests/core/example_specs/evolution_examples/1/output_manifest.json rename tests/{ => core}/example_specs/evolution_examples/2/component.yaml (100%) rename tests/{ => core}/example_specs/evolution_examples/2/output_manifest.json (100%) rename tests/{ => core}/example_specs/evolution_examples/3/component.yaml (100%) rename tests/{ => core}/example_specs/evolution_examples/3/output_manifest.json (100%) rename tests/{ => core}/example_specs/evolution_examples/4/component.yaml (100%) rename tests/{ => core}/example_specs/evolution_examples/4/output_manifest.json (100%) rename tests/{ => core}/example_specs/evolution_examples/5/component.yaml (100%) rename tests/{ => core}/example_specs/evolution_examples/5/output_manifest.json (100%) rename tests/{ => core}/example_specs/evolution_examples/6/component.yaml (100%) rename 
tests/{ => core}/example_specs/evolution_examples/6/output_manifest.json (100%) rename tests/{ => core}/example_specs/evolution_examples/7/component.yaml (100%) rename tests/{ => core}/example_specs/evolution_examples/7/output_manifest.json (100%) rename tests/{ => core}/example_specs/evolution_examples/8/component.yaml (100%) rename tests/{ => core}/example_specs/evolution_examples/8/output_manifest.json (100%) create mode 100644 tests/core/example_specs/evolution_examples/input_manifest.json rename tests/{ => core}/example_specs/manifests/invalid_manifest.json (100%) rename tests/{ => core}/example_specs/manifests/valid_manifest.json (100%) rename tests/{ => core}/example_specs/mock_base_path/example_pipeline/cache/42.txt (100%) create mode 100644 tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename tests/{ => core}/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json (100%) rename tests/{ => core}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json (100%) rename tests/{ => core}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json (100%) rename tests/{ => core}/test_component_specs.py (100%) rename tests/{ => core}/test_manifest.py (100%) rename tests/{ => core}/test_manifest_evolution.py (100%) rename tests/{ => core}/test_schema.py (100%) delete mode 100644 tests/example_specs/components/input_manifest.json delete mode 100644 tests/example_specs/evolution_examples/1/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/input_manifest.json delete mode 100644 tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json diff --git a/tests/example_specs/component_specs/invalid_component.yaml b/tests/core/example_specs/component_specs/invalid_component.yaml similarity index 100% rename from 
tests/example_specs/component_specs/invalid_component.yaml rename to tests/core/example_specs/component_specs/invalid_component.yaml diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/core/example_specs/component_specs/kubeflow_component.yaml similarity index 100% rename from tests/example_specs/component_specs/kubeflow_component.yaml rename to tests/core/example_specs/component_specs/kubeflow_component.yaml diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/core/example_specs/component_specs/valid_component.yaml similarity index 100% rename from tests/example_specs/component_specs/valid_component.yaml rename to tests/core/example_specs/component_specs/valid_component.yaml diff --git a/tests/example_specs/component_specs/valid_component_no_args.yaml b/tests/core/example_specs/component_specs/valid_component_no_args.yaml similarity index 100% rename from tests/example_specs/component_specs/valid_component_no_args.yaml rename to tests/core/example_specs/component_specs/valid_component_no_args.yaml diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/core/example_specs/components/arguments/component.yaml similarity index 100% rename from tests/example_specs/components/arguments/component.yaml rename to tests/core/example_specs/components/arguments/component.yaml diff --git a/tests/example_specs/components/arguments/component_default_args.yaml b/tests/core/example_specs/components/arguments/component_default_args.yaml similarity index 100% rename from tests/example_specs/components/arguments/component_default_args.yaml rename to tests/core/example_specs/components/arguments/component_default_args.yaml diff --git a/tests/example_specs/components/arguments/input_manifest.json b/tests/core/example_specs/components/arguments/input_manifest.json similarity index 60% rename from tests/example_specs/components/arguments/input_manifest.json rename to 
tests/core/example_specs/components/arguments/input_manifest.json index d98ddd95b..9ee2494f9 100644 --- a/tests/example_specs/components/arguments/input_manifest.json +++ b/tests/core/example_specs/components/arguments/input_manifest.json @@ -7,16 +7,12 @@ "cache_key": "00" }, "index": { - "location": "/index" + "location": "/component_1" }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - } - } + "fields": { + "data": { + "type": "binary", + "location": "/component_1" } } } \ No newline at end of file diff --git a/tests/example_specs/components/component.yaml b/tests/core/example_specs/components/component.yaml similarity index 56% rename from tests/example_specs/components/component.yaml rename to tests/core/example_specs/components/component.yaml index 19c8d5856..973cc3e6b 100644 --- a/tests/example_specs/components/component.yaml +++ b/tests/core/example_specs/components/component.yaml @@ -3,19 +3,15 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - additionalFields: false + images_data: + type: array + items: + type: float32 +additionalFields: false args: diff --git a/tests/core/example_specs/components/input_manifest.json b/tests/core/example_specs/components/input_manifest.json new file mode 100644 index 000000000..80fa0b91d --- /dev/null +++ b/tests/core/example_specs/components/input_manifest.json @@ -0,0 +1,17 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "/bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/example_component" + }, + "fields": { + "data": { + "location": "/example_component", + "type": "binary" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/1/component.yaml 
b/tests/core/example_specs/evolution_examples/1/component.yaml similarity index 55% rename from tests/example_specs/evolution_examples/1/component.yaml rename to tests/core/example_specs/evolution_examples/1/component.yaml index 22ae0feb1..e91ae6f46 100644 --- a/tests/example_specs/evolution_examples/1/component.yaml +++ b/tests/core/example_specs/evolution_examples/1/component.yaml @@ -3,18 +3,14 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary - + images_data: + type: binary + produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: storage_args: diff --git a/tests/core/example_specs/evolution_examples/1/output_manifest.json b/tests/core/example_specs/evolution_examples/1/output_manifest.json new file mode 100644 index 000000000..985f99625 --- /dev/null +++ b/tests/core/example_specs/evolution_examples/1/output_manifest.json @@ -0,0 +1,33 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/test_pipeline/custom_run_id/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/test_pipeline/12345/example_component" + }, + "images_height": { + "type": "int32", + "location":"/test_pipeline/12345/example_component" + }, + "images_data": { + "type": "binary", + "location":"/test_pipeline/12345/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/test_pipeline/12345/example_component" + }, + "embeddings_data": { + "type": "ListType(list)", + "location":"/test_pipeline/12345/example_component" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/2/component.yaml b/tests/core/example_specs/evolution_examples/2/component.yaml similarity index 100% rename from 
tests/example_specs/evolution_examples/2/component.yaml rename to tests/core/example_specs/evolution_examples/2/component.yaml diff --git a/tests/example_specs/evolution_examples/2/output_manifest.json b/tests/core/example_specs/evolution_examples/2/output_manifest.json similarity index 100% rename from tests/example_specs/evolution_examples/2/output_manifest.json rename to tests/core/example_specs/evolution_examples/2/output_manifest.json diff --git a/tests/example_specs/evolution_examples/3/component.yaml b/tests/core/example_specs/evolution_examples/3/component.yaml similarity index 100% rename from tests/example_specs/evolution_examples/3/component.yaml rename to tests/core/example_specs/evolution_examples/3/component.yaml diff --git a/tests/example_specs/evolution_examples/3/output_manifest.json b/tests/core/example_specs/evolution_examples/3/output_manifest.json similarity index 100% rename from tests/example_specs/evolution_examples/3/output_manifest.json rename to tests/core/example_specs/evolution_examples/3/output_manifest.json diff --git a/tests/example_specs/evolution_examples/4/component.yaml b/tests/core/example_specs/evolution_examples/4/component.yaml similarity index 100% rename from tests/example_specs/evolution_examples/4/component.yaml rename to tests/core/example_specs/evolution_examples/4/component.yaml diff --git a/tests/example_specs/evolution_examples/4/output_manifest.json b/tests/core/example_specs/evolution_examples/4/output_manifest.json similarity index 100% rename from tests/example_specs/evolution_examples/4/output_manifest.json rename to tests/core/example_specs/evolution_examples/4/output_manifest.json diff --git a/tests/example_specs/evolution_examples/5/component.yaml b/tests/core/example_specs/evolution_examples/5/component.yaml similarity index 100% rename from tests/example_specs/evolution_examples/5/component.yaml rename to tests/core/example_specs/evolution_examples/5/component.yaml diff --git 
a/tests/example_specs/evolution_examples/5/output_manifest.json b/tests/core/example_specs/evolution_examples/5/output_manifest.json similarity index 100% rename from tests/example_specs/evolution_examples/5/output_manifest.json rename to tests/core/example_specs/evolution_examples/5/output_manifest.json diff --git a/tests/example_specs/evolution_examples/6/component.yaml b/tests/core/example_specs/evolution_examples/6/component.yaml similarity index 100% rename from tests/example_specs/evolution_examples/6/component.yaml rename to tests/core/example_specs/evolution_examples/6/component.yaml diff --git a/tests/example_specs/evolution_examples/6/output_manifest.json b/tests/core/example_specs/evolution_examples/6/output_manifest.json similarity index 100% rename from tests/example_specs/evolution_examples/6/output_manifest.json rename to tests/core/example_specs/evolution_examples/6/output_manifest.json diff --git a/tests/example_specs/evolution_examples/7/component.yaml b/tests/core/example_specs/evolution_examples/7/component.yaml similarity index 100% rename from tests/example_specs/evolution_examples/7/component.yaml rename to tests/core/example_specs/evolution_examples/7/component.yaml diff --git a/tests/example_specs/evolution_examples/7/output_manifest.json b/tests/core/example_specs/evolution_examples/7/output_manifest.json similarity index 100% rename from tests/example_specs/evolution_examples/7/output_manifest.json rename to tests/core/example_specs/evolution_examples/7/output_manifest.json diff --git a/tests/example_specs/evolution_examples/8/component.yaml b/tests/core/example_specs/evolution_examples/8/component.yaml similarity index 100% rename from tests/example_specs/evolution_examples/8/component.yaml rename to tests/core/example_specs/evolution_examples/8/component.yaml diff --git a/tests/example_specs/evolution_examples/8/output_manifest.json b/tests/core/example_specs/evolution_examples/8/output_manifest.json similarity index 100% rename from 
tests/example_specs/evolution_examples/8/output_manifest.json rename to tests/core/example_specs/evolution_examples/8/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/input_manifest.json b/tests/core/example_specs/evolution_examples/input_manifest.json new file mode 100644 index 000000000..a1ae120b0 --- /dev/null +++ b/tests/core/example_specs/evolution_examples/input_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"12345", + "component_id":"example_component" + }, + "index":{ + "location":"/test_pipeline/12345/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/test_pipeline/12345/example_component" + }, + "images_height": { + "type": "int32", + "location":"/test_pipeline/12345/example_component" + }, + "images_data": { + "type": "binary", + "location":"/test_pipeline/12345/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/test_pipeline/12345/example_component" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/manifests/invalid_manifest.json b/tests/core/example_specs/manifests/invalid_manifest.json similarity index 100% rename from tests/example_specs/manifests/invalid_manifest.json rename to tests/core/example_specs/manifests/invalid_manifest.json diff --git a/tests/example_specs/manifests/valid_manifest.json b/tests/core/example_specs/manifests/valid_manifest.json similarity index 100% rename from tests/example_specs/manifests/valid_manifest.json rename to tests/core/example_specs/manifests/valid_manifest.json diff --git a/tests/example_specs/mock_base_path/example_pipeline/cache/42.txt b/tests/core/example_specs/mock_base_path/example_pipeline/cache/42.txt similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/cache/42.txt rename to tests/core/example_specs/mock_base_path/example_pipeline/cache/42.txt diff --git 
a/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json new file mode 100644 index 000000000..47c2fe949 --- /dev/null +++ b/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_2023", + "component_id": "component_1", + "cache_key": "42" + }, + "index": { + "location": "/component_1" + }, + "fields": + { + "data": { + "type": "binary", + "location": "/component_1" + }, + "height": { + "type": "int32", + "location": "/component_1" + }, + "width": { + "type": "int32", + "location": "/component_1" + }, + "captions": { + "type": "string", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json rename to tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json b/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json rename to tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json diff --git 
a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json b/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json rename to tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json diff --git a/tests/test_component_specs.py b/tests/core/test_component_specs.py similarity index 100% rename from tests/test_component_specs.py rename to tests/core/test_component_specs.py diff --git a/tests/test_manifest.py b/tests/core/test_manifest.py similarity index 100% rename from tests/test_manifest.py rename to tests/core/test_manifest.py diff --git a/tests/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py similarity index 100% rename from tests/test_manifest_evolution.py rename to tests/core/test_manifest_evolution.py diff --git a/tests/test_schema.py b/tests/core/test_schema.py similarity index 100% rename from tests/test_schema.py rename to tests/core/test_schema.py diff --git a/tests/example_specs/components/input_manifest.json b/tests/example_specs/components/input_manifest.json deleted file mode 100644 index 7af13d599..000000000 --- a/tests/example_specs/components/input_manifest.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "/bucket", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/index/12345/example_component" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - } - } - } - - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/1/output_manifest.json b/tests/example_specs/evolution_examples/1/output_manifest.json deleted file mode 100644 index 17b94c0b0..000000000 --- 
a/tests/example_specs/evolution_examples/1/output_manifest.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ - "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/input_manifest.json b/tests/example_specs/evolution_examples/input_manifest.json deleted file mode 100644 index 2ecf37243..000000000 --- a/tests/example_specs/evolution_examples/input_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"12345", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/12345/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json deleted 
file mode 100644 index 541775f84..000000000 --- a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2023", - "component_id": "component_1", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file From fd0699c7f6b987730e17849bab19690db20ef8f2 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 10:54:07 +0100 Subject: [PATCH 09/34] Remove additional fields in common.json --- src/fondant/core/schemas/common.json | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/fondant/core/schemas/common.json b/src/fondant/core/schemas/common.json index e64ffd57e..11df4e988 100644 --- a/src/fondant/core/schemas/common.json +++ b/src/fondant/core/schemas/common.json @@ -57,11 +57,6 @@ "fields": { "type": "object", "minProperties": 1, - "properties": { - "additionalFields": { - "type": "boolean" - } - }, "additionalProperties": { "$ref": "#/definitions/field" } From 0f8117f2aae462f65aacb778143ed5fe4b286a3f Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 11:37:11 +0100 Subject: [PATCH 10/34] Test structure --- src/fondant/core/manifest.py | 2 +- tests/core/test_component_specs.py | 2 +- tests/core/test_manifest.py | 7 ++++--- tests/core/test_manifest_evolution.py | 2 +- tests/{ => examples}/example_component/Dockerfile | 0 .../example_component/fondant_component.yaml | 0 tests/{ => examples}/example_data/components/1.yaml | 0 tests/{ => examples}/example_data/manifest.json | 0 
tests/{ => examples}/example_data/raw/split.py | 0 .../{ => examples}/example_data/raw/testset.parquet | Bin .../example_data/subsets_input/index/part.0.parquet | Bin .../example_data/subsets_input/index/part.1.parquet | Bin .../example_data/subsets_input/index/part.2.parquet | Bin .../subsets_input/properties/part.0.parquet | Bin .../subsets_input/properties/part.1.parquet | Bin .../subsets_input/properties/part.2.parquet | Bin .../example_data/subsets_input/types/part.0.parquet | Bin .../example_data/subsets_input/types/part.1.parquet | Bin .../example_data/subsets_input/types/part.2.parquet | Bin tests/{ => examples}/example_modules/component.py | 0 .../example_modules/invalid_component.py | 0 .../example_modules/invalid_double_components.py | 0 .../example_modules/invalid_double_pipeline.py | 0 tests/{ => examples}/example_modules/pipeline.py | 0 .../compiled_pipeline/kubeflow_pipeline.yml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../example_1/first_component/Dockerfile | 0 .../first_component/fondant_component.yaml | 0 .../example_1/fourth_component/Dockerfile | 0 .../fourth_component/fondant_component.yaml | 0 .../example_1/second_component/Dockerfile | 0 .../second_component/fondant_component.yaml | 0 .../example_1/third_component/Dockerfile | 0 .../third_component/fondant_component.yaml | 0 .../component_specs/invalid_component.yaml | 0 .../component_specs/kubeflow_component.yaml | 0 .../component_specs/valid_component.yaml | 0 .../component_specs/valid_component_no_args.yaml | 0 .../components/arguments/component.yaml | 0 .../arguments/component_default_args.yaml | 0 .../components/arguments/input_manifest.json | 0 .../example_specs/components/component.yaml | 0 .../example_specs/components/input_manifest.json | 
0 .../evolution_examples/1/component.yaml | 0 .../evolution_examples/1/output_manifest.json | 0 .../evolution_examples/2/component.yaml | 0 .../evolution_examples/2/output_manifest.json | 0 .../evolution_examples/3/component.yaml | 0 .../evolution_examples/3/output_manifest.json | 0 .../evolution_examples/4/component.yaml | 0 .../evolution_examples/4/output_manifest.json | 0 .../evolution_examples/5/component.yaml | 0 .../evolution_examples/5/output_manifest.json | 0 .../evolution_examples/6/component.yaml | 0 .../evolution_examples/6/output_manifest.json | 0 .../evolution_examples/7/component.yaml | 0 .../evolution_examples/7/output_manifest.json | 0 .../evolution_examples/8/component.yaml | 0 .../evolution_examples/8/output_manifest.json | 0 .../evolution_examples/input_manifest.json | 0 .../example_specs/manifests/invalid_manifest.json | 0 .../example_specs/manifests/valid_manifest.json | 0 .../mock_base_path/example_pipeline/cache/42.txt | 0 .../example_pipeline_2023/component_1/manifest.json | 0 .../example_pipeline_2023/component_2/manifest.json | 0 .../example_pipeline_2024/component_1/manifest.json | 0 .../example_pipeline_2024/component_2/manifest.json | 0 72 files changed, 7 insertions(+), 6 deletions(-) rename tests/{ => examples}/example_component/Dockerfile (100%) rename tests/{ => examples}/example_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_data/components/1.yaml (100%) rename tests/{ => examples}/example_data/manifest.json (100%) rename tests/{ => examples}/example_data/raw/split.py (100%) rename tests/{ => examples}/example_data/raw/testset.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.0.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.2.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.0.parquet (100%) rename tests/{ => 
examples}/example_data/subsets_input/properties/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.2.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.0.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.2.parquet (100%) rename tests/{ => examples}/example_modules/component.py (100%) rename tests/{ => examples}/example_modules/invalid_component.py (100%) rename tests/{ => examples}/example_modules/invalid_double_components.py (100%) rename tests/{ => examples}/example_modules/invalid_double_pipeline.py (100%) rename tests/{ => examples}/example_modules/pipeline.py (100%) rename tests/{ => examples}/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile (100%) rename tests/{ => 
examples}/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml (100%) rename tests/{core => examples}/example_specs/component_specs/invalid_component.yaml (100%) rename tests/{core => examples}/example_specs/component_specs/kubeflow_component.yaml (100%) rename tests/{core => examples}/example_specs/component_specs/valid_component.yaml (100%) rename tests/{core => examples}/example_specs/component_specs/valid_component_no_args.yaml (100%) rename tests/{core => examples}/example_specs/components/arguments/component.yaml (100%) rename tests/{core => examples}/example_specs/components/arguments/component_default_args.yaml (100%) rename tests/{core => examples}/example_specs/components/arguments/input_manifest.json (100%) rename tests/{core => examples}/example_specs/components/component.yaml (100%) rename tests/{core => examples}/example_specs/components/input_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/1/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/1/output_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/2/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/2/output_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/3/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/3/output_manifest.json (100%) rename tests/{core => 
examples}/example_specs/evolution_examples/4/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/4/output_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/5/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/5/output_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/6/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/6/output_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/7/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/7/output_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/8/component.yaml (100%) rename tests/{core => examples}/example_specs/evolution_examples/8/output_manifest.json (100%) rename tests/{core => examples}/example_specs/evolution_examples/input_manifest.json (100%) rename tests/{core => examples}/example_specs/manifests/invalid_manifest.json (100%) rename tests/{core => examples}/example_specs/manifests/valid_manifest.json (100%) rename tests/{core => examples}/example_specs/mock_base_path/example_pipeline/cache/42.txt (100%) rename tests/{core => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json (100%) rename tests/{core => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json (100%) rename tests/{core => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json (100%) rename tests/{core => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json (100%) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 6198bdecd..bd7889ff3 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -172,7 +172,7 @@ def 
field_mapping(self): location = ( f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" ) - if field.location in field_mapping: + if location in field_mapping: field_mapping[location].append(field_name) else: field_mapping[location] = [field_name] diff --git a/tests/core/test_component_specs.py b/tests/core/test_component_specs.py index 38a51ea8d..48b10ce2b 100644 --- a/tests/core/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -13,7 +13,7 @@ from fondant.core.exceptions import InvalidComponentSpec from fondant.core.schema import Type -component_specs_path = Path(__file__).parent / "example_specs/component_specs" +component_specs_path = Path(__file__).parent.parent / "examples/example_specs/component_specs" @pytest.fixture() diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index 37897f67e..a978f66c6 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -7,8 +7,8 @@ from fondant.core.exceptions import InvalidManifest from fondant.core.manifest import Field, Manifest, Type -manifest_path = Path(__file__).parent / "example_specs/manifests" -component_specs_path = Path(__file__).parent / "example_specs/component_specs" +manifest_path = Path(__file__).parent.parent / "examples" / "example_specs/manifests" +component_specs_path = Path(__file__).parent.parent / "examples" / "example_specs/component_specs" @pytest.fixture() @@ -258,10 +258,11 @@ def test_accessing_the_index(): def test_field_mapping(valid_manifest): """Test field mapping generation.""" + manifest_path = Path(__file__).parent / "example_specs/manifests" manifest = Manifest(valid_manifest) field_mapping = manifest.field_mapping assert field_mapping == { "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], - "gs://bucket/test_pipeline/test_pipeline_12345/component2": ["width"], + "gs://bucket/test_pipeline/test_pipeline_12345/component2": ["height", "width"], 
"gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], } diff --git a/tests/core/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py index c79b76aaf..5dce88885 100644 --- a/tests/core/test_manifest_evolution.py +++ b/tests/core/test_manifest_evolution.py @@ -6,7 +6,7 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -examples_path = Path(__file__).parent / "example_specs/evolution_examples" +examples_path = Path(__file__).parent / "examples/example_specs/evolution_examples" @pytest.fixture() diff --git a/tests/example_component/Dockerfile b/tests/examples/example_component/Dockerfile similarity index 100% rename from tests/example_component/Dockerfile rename to tests/examples/example_component/Dockerfile diff --git a/tests/example_component/fondant_component.yaml b/tests/examples/example_component/fondant_component.yaml similarity index 100% rename from tests/example_component/fondant_component.yaml rename to tests/examples/example_component/fondant_component.yaml diff --git a/tests/example_data/components/1.yaml b/tests/examples/example_data/components/1.yaml similarity index 100% rename from tests/example_data/components/1.yaml rename to tests/examples/example_data/components/1.yaml diff --git a/tests/example_data/manifest.json b/tests/examples/example_data/manifest.json similarity index 100% rename from tests/example_data/manifest.json rename to tests/examples/example_data/manifest.json diff --git a/tests/example_data/raw/split.py b/tests/examples/example_data/raw/split.py similarity index 100% rename from tests/example_data/raw/split.py rename to tests/examples/example_data/raw/split.py diff --git a/tests/example_data/raw/testset.parquet b/tests/examples/example_data/raw/testset.parquet similarity index 100% rename from tests/example_data/raw/testset.parquet rename to tests/examples/example_data/raw/testset.parquet diff --git 
a/tests/example_data/subsets_input/index/part.0.parquet b/tests/examples/example_data/subsets_input/index/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.0.parquet rename to tests/examples/example_data/subsets_input/index/part.0.parquet diff --git a/tests/example_data/subsets_input/index/part.1.parquet b/tests/examples/example_data/subsets_input/index/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.1.parquet rename to tests/examples/example_data/subsets_input/index/part.1.parquet diff --git a/tests/example_data/subsets_input/index/part.2.parquet b/tests/examples/example_data/subsets_input/index/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.2.parquet rename to tests/examples/example_data/subsets_input/index/part.2.parquet diff --git a/tests/example_data/subsets_input/properties/part.0.parquet b/tests/examples/example_data/subsets_input/properties/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.0.parquet rename to tests/examples/example_data/subsets_input/properties/part.0.parquet diff --git a/tests/example_data/subsets_input/properties/part.1.parquet b/tests/examples/example_data/subsets_input/properties/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.1.parquet rename to tests/examples/example_data/subsets_input/properties/part.1.parquet diff --git a/tests/example_data/subsets_input/properties/part.2.parquet b/tests/examples/example_data/subsets_input/properties/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.2.parquet rename to tests/examples/example_data/subsets_input/properties/part.2.parquet diff --git a/tests/example_data/subsets_input/types/part.0.parquet b/tests/examples/example_data/subsets_input/types/part.0.parquet similarity index 100% rename from 
tests/example_data/subsets_input/types/part.0.parquet rename to tests/examples/example_data/subsets_input/types/part.0.parquet diff --git a/tests/example_data/subsets_input/types/part.1.parquet b/tests/examples/example_data/subsets_input/types/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.1.parquet rename to tests/examples/example_data/subsets_input/types/part.1.parquet diff --git a/tests/example_data/subsets_input/types/part.2.parquet b/tests/examples/example_data/subsets_input/types/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.2.parquet rename to tests/examples/example_data/subsets_input/types/part.2.parquet diff --git a/tests/example_modules/component.py b/tests/examples/example_modules/component.py similarity index 100% rename from tests/example_modules/component.py rename to tests/examples/example_modules/component.py diff --git a/tests/example_modules/invalid_component.py b/tests/examples/example_modules/invalid_component.py similarity index 100% rename from tests/example_modules/invalid_component.py rename to tests/examples/example_modules/invalid_component.py diff --git a/tests/example_modules/invalid_double_components.py b/tests/examples/example_modules/invalid_double_components.py similarity index 100% rename from tests/example_modules/invalid_double_components.py rename to tests/examples/example_modules/invalid_double_components.py diff --git a/tests/example_modules/invalid_double_pipeline.py b/tests/examples/example_modules/invalid_double_pipeline.py similarity index 100% rename from tests/example_modules/invalid_double_pipeline.py rename to tests/examples/example_modules/invalid_double_pipeline.py diff --git a/tests/example_modules/pipeline.py b/tests/examples/example_modules/pipeline.py similarity index 100% rename from tests/example_modules/pipeline.py rename to tests/examples/example_modules/pipeline.py diff --git 
a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml similarity index 100% rename from tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml rename to tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml diff --git a/tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml rename to 
tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile similarity index 100% rename from 
tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml 
b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml diff --git a/tests/core/example_specs/component_specs/invalid_component.yaml b/tests/examples/example_specs/component_specs/invalid_component.yaml similarity index 100% rename from tests/core/example_specs/component_specs/invalid_component.yaml rename to tests/examples/example_specs/component_specs/invalid_component.yaml diff --git a/tests/core/example_specs/component_specs/kubeflow_component.yaml b/tests/examples/example_specs/component_specs/kubeflow_component.yaml similarity index 100% rename from tests/core/example_specs/component_specs/kubeflow_component.yaml rename to tests/examples/example_specs/component_specs/kubeflow_component.yaml diff --git a/tests/core/example_specs/component_specs/valid_component.yaml b/tests/examples/example_specs/component_specs/valid_component.yaml similarity index 100% rename from tests/core/example_specs/component_specs/valid_component.yaml rename to tests/examples/example_specs/component_specs/valid_component.yaml diff --git a/tests/core/example_specs/component_specs/valid_component_no_args.yaml b/tests/examples/example_specs/component_specs/valid_component_no_args.yaml similarity index 100% rename from tests/core/example_specs/component_specs/valid_component_no_args.yaml rename to tests/examples/example_specs/component_specs/valid_component_no_args.yaml diff --git a/tests/core/example_specs/components/arguments/component.yaml b/tests/examples/example_specs/components/arguments/component.yaml similarity index 100% rename from tests/core/example_specs/components/arguments/component.yaml rename to tests/examples/example_specs/components/arguments/component.yaml diff --git 
a/tests/core/example_specs/components/arguments/component_default_args.yaml b/tests/examples/example_specs/components/arguments/component_default_args.yaml similarity index 100% rename from tests/core/example_specs/components/arguments/component_default_args.yaml rename to tests/examples/example_specs/components/arguments/component_default_args.yaml diff --git a/tests/core/example_specs/components/arguments/input_manifest.json b/tests/examples/example_specs/components/arguments/input_manifest.json similarity index 100% rename from tests/core/example_specs/components/arguments/input_manifest.json rename to tests/examples/example_specs/components/arguments/input_manifest.json diff --git a/tests/core/example_specs/components/component.yaml b/tests/examples/example_specs/components/component.yaml similarity index 100% rename from tests/core/example_specs/components/component.yaml rename to tests/examples/example_specs/components/component.yaml diff --git a/tests/core/example_specs/components/input_manifest.json b/tests/examples/example_specs/components/input_manifest.json similarity index 100% rename from tests/core/example_specs/components/input_manifest.json rename to tests/examples/example_specs/components/input_manifest.json diff --git a/tests/core/example_specs/evolution_examples/1/component.yaml b/tests/examples/example_specs/evolution_examples/1/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/1/component.yaml rename to tests/examples/example_specs/evolution_examples/1/component.yaml diff --git a/tests/core/example_specs/evolution_examples/1/output_manifest.json b/tests/examples/example_specs/evolution_examples/1/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/1/output_manifest.json rename to tests/examples/example_specs/evolution_examples/1/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/2/component.yaml 
b/tests/examples/example_specs/evolution_examples/2/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/2/component.yaml rename to tests/examples/example_specs/evolution_examples/2/component.yaml diff --git a/tests/core/example_specs/evolution_examples/2/output_manifest.json b/tests/examples/example_specs/evolution_examples/2/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/2/output_manifest.json rename to tests/examples/example_specs/evolution_examples/2/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/3/component.yaml b/tests/examples/example_specs/evolution_examples/3/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/3/component.yaml rename to tests/examples/example_specs/evolution_examples/3/component.yaml diff --git a/tests/core/example_specs/evolution_examples/3/output_manifest.json b/tests/examples/example_specs/evolution_examples/3/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/3/output_manifest.json rename to tests/examples/example_specs/evolution_examples/3/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/4/component.yaml b/tests/examples/example_specs/evolution_examples/4/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/4/component.yaml rename to tests/examples/example_specs/evolution_examples/4/component.yaml diff --git a/tests/core/example_specs/evolution_examples/4/output_manifest.json b/tests/examples/example_specs/evolution_examples/4/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/4/output_manifest.json rename to tests/examples/example_specs/evolution_examples/4/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/5/component.yaml 
b/tests/examples/example_specs/evolution_examples/5/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/5/component.yaml rename to tests/examples/example_specs/evolution_examples/5/component.yaml diff --git a/tests/core/example_specs/evolution_examples/5/output_manifest.json b/tests/examples/example_specs/evolution_examples/5/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/5/output_manifest.json rename to tests/examples/example_specs/evolution_examples/5/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/6/component.yaml b/tests/examples/example_specs/evolution_examples/6/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/6/component.yaml rename to tests/examples/example_specs/evolution_examples/6/component.yaml diff --git a/tests/core/example_specs/evolution_examples/6/output_manifest.json b/tests/examples/example_specs/evolution_examples/6/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/6/output_manifest.json rename to tests/examples/example_specs/evolution_examples/6/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/7/component.yaml b/tests/examples/example_specs/evolution_examples/7/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/7/component.yaml rename to tests/examples/example_specs/evolution_examples/7/component.yaml diff --git a/tests/core/example_specs/evolution_examples/7/output_manifest.json b/tests/examples/example_specs/evolution_examples/7/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/7/output_manifest.json rename to tests/examples/example_specs/evolution_examples/7/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/8/component.yaml 
b/tests/examples/example_specs/evolution_examples/8/component.yaml similarity index 100% rename from tests/core/example_specs/evolution_examples/8/component.yaml rename to tests/examples/example_specs/evolution_examples/8/component.yaml diff --git a/tests/core/example_specs/evolution_examples/8/output_manifest.json b/tests/examples/example_specs/evolution_examples/8/output_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/8/output_manifest.json rename to tests/examples/example_specs/evolution_examples/8/output_manifest.json diff --git a/tests/core/example_specs/evolution_examples/input_manifest.json b/tests/examples/example_specs/evolution_examples/input_manifest.json similarity index 100% rename from tests/core/example_specs/evolution_examples/input_manifest.json rename to tests/examples/example_specs/evolution_examples/input_manifest.json diff --git a/tests/core/example_specs/manifests/invalid_manifest.json b/tests/examples/example_specs/manifests/invalid_manifest.json similarity index 100% rename from tests/core/example_specs/manifests/invalid_manifest.json rename to tests/examples/example_specs/manifests/invalid_manifest.json diff --git a/tests/core/example_specs/manifests/valid_manifest.json b/tests/examples/example_specs/manifests/valid_manifest.json similarity index 100% rename from tests/core/example_specs/manifests/valid_manifest.json rename to tests/examples/example_specs/manifests/valid_manifest.json diff --git a/tests/core/example_specs/mock_base_path/example_pipeline/cache/42.txt b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt similarity index 100% rename from tests/core/example_specs/mock_base_path/example_pipeline/cache/42.txt rename to tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt diff --git a/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json 
b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json similarity index 100% rename from tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json diff --git a/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json similarity index 100% rename from tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json diff --git a/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json similarity index 100% rename from tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json diff --git a/tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json similarity index 100% rename from tests/core/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json From 7e8a1d6d32f59c9e8787bb438aa334fdb4567700 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 14:50:09 +0100 Subject: [PATCH 11/34] Refactor 
component package --- src/fondant/component/data_io.py | 156 ++++++------------ src/fondant/component/executor.py | 36 ++-- tests/{ => component}/test_component.py | 4 +- tests/{ => component}/test_data_io.py | 130 ++++++--------- tests/examples/example_data/components/1.yaml | 42 ++--- tests/examples/example_data/manifest.json | 56 +++---- tests/examples/example_data/raw/split.py | 10 +- .../example_pipeline/cache/42.txt | 2 +- 8 files changed, 168 insertions(+), 268 deletions(-) rename tests/{ => component}/test_component.py (98%) rename tests/{ => component}/test_data_io.py (61%) diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 7023c1ee2..5d947ce76 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -6,7 +6,7 @@ from dask.diagnostics import ProgressBar from dask.distributed import Client -from fondant.core.component_spec import ComponentSpec, ComponentSubset +from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest logger = logging.getLogger(__name__) @@ -82,35 +82,7 @@ def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: return dataframe - def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: - """ - Function that loads a subset from the manifest as a Dask dataframe. - - Args: - subset_name: the name of the subset to load - fields: the fields to load from the subset - - Returns: - The subset as a dask dataframe - """ - subset = self.manifest.subsets[subset_name] - remote_path = subset.location - - logger.info(f"Loading subset {subset_name} with fields {fields}...") - - subset_df = dd.read_parquet( - remote_path, - columns=fields, - calculate_divisions=True, - ) - - # add subset prefix to columns - subset_df = subset_df.rename( - columns={col: subset_name + "_" + col for col in subset_df.columns}, - ) - - return subset_df - + # TODO: probably not needed anymore! 
def _load_index(self) -> dd.DataFrame: """ Function that loads the index from the manifest as a Dask dataframe. @@ -121,9 +93,10 @@ def _load_index(self) -> dd.DataFrame: # get index subset from the manifest index = self.manifest.index # get remote path - remote_path = index.location + remote_path = index["location"] # load index from parquet, expecting id and source columns + # TODO: reduce dataframe to index loading? .loc[:, []]? return dd.read_parquet(remote_path, calculate_divisions=True) def load_dataframe(self) -> dd.DataFrame: @@ -135,20 +108,34 @@ def load_dataframe(self) -> dd.DataFrame: The Dask dataframe with the field columns in the format (_) as well as the index columns. """ - # load index into dataframe - dataframe = self._load_index() - for name, subset in self.component_spec.consumes.items(): - fields = list(subset.fields.keys()) - subset_df = self._load_subset(name, fields) - # left joins -> filter on index - dataframe = dd.merge( - dataframe, - subset_df, - left_index=True, - right_index=True, - how="left", + dataframe = None + field_mapping = self.manifest.field_mapping + for location, fields in field_mapping.items(): + partial_df = dd.read_parquet( + location, + columns=fields, + index="id", + calculate_divisions=True, ) + if dataframe is None: + # ensure that the index is set correctly and divisions are known. 
+ dataframe = partial_df + else: + dask_divisions = partial_df.set_index("id").divisions + unique_divisions = list(dict.fromkeys(list(dask_divisions))) + + # apply set index to both dataframes + partial_df = partial_df.set_index("id", divisions=unique_divisions) + dataframe = dataframe.set_index("id", divisions=unique_divisions) + + dataframe = dataframe.merge( + partial_df, + how="left", + left_index=True, + right_index=True, + ) + dataframe = self.partition_loaded_dataframe(dataframe) logging.info(f"Columns of dataframe: {list(dataframe.columns)}") @@ -170,79 +157,46 @@ def write_dataframe( dataframe: dd.DataFrame, dask_client: t.Optional[Client] = None, ) -> None: - write_tasks = [] + columns_to_produce = [ + column_name for column_name, field in self.component_spec.produces.items() + ] - dataframe.index = dataframe.index.rename("id") + # validation that all columns are in the dataframe + self.validate_dataframe_columns(dataframe, columns_to_produce) - # Turn index into an empty dataframe so we can write it - index_df = dataframe.index.to_frame().drop(columns=["id"]) - write_index_task = self._write_subset( - index_df, - subset_name="index", - subset_spec=self.component_spec.index, - ) - write_tasks.append(write_index_task) - - for subset_name, subset_spec in self.component_spec.produces.items(): - subset_df = self._extract_subset_dataframe( - dataframe, - subset_name=subset_name, - subset_spec=subset_spec, - ) - write_subset_task = self._write_subset( - subset_df, - subset_name=subset_name, - subset_spec=subset_spec, - ) - write_tasks.append(write_subset_task) + dataframe = dataframe[columns_to_produce] + write_task = self._write_dataframe(dataframe) with ProgressBar(): logging.info("Writing data...") - # alternative implementation possible: futures = client.compute(...) 
- dd.compute(*write_tasks, scheduler=dask_client) + dd.compute(write_task, scheduler=dask_client) @staticmethod - def _extract_subset_dataframe( - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.DataFrame: - """Create subset dataframe to save with the original field name as the column name.""" - # Create a new dataframe with only the columns needed for the output subset - subset_columns = [f"{subset_name}_{field}" for field in subset_spec.fields] - try: - subset_df = dataframe[subset_columns] - except KeyError as e: + def validate_dataframe_columns(dataframe: dd.DataFrame, columns: t.List[str]): + """Validates that all columns are available in the dataset.""" + missing_fields = [] + for col in columns: + if col not in dataframe.columns: + missing_fields.append(col) + + if missing_fields: msg = ( - f"Field {e.args[0]} defined in output subset {subset_name} " + f"Fields {missing_fields} defined in output dataset " f"but not found in dataframe" ) raise ValueError( msg, ) - # Remove the subset prefix from the column names - subset_df = subset_df.rename( - columns={col: col[(len(f"{subset_name}_")) :] for col in subset_columns}, + def _write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar: + """Create dataframe writing task.""" + location = ( + self.manifest.base_path + "/" + self.component_spec.component_folder_name ) - - return subset_df - - def _write_subset( - self, - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.core.Scalar: - if subset_name == "index": - location = self.manifest.index.location - else: - location = self.manifest.subsets[subset_name].location - - schema = {field.name: field.type.value for field in subset_spec.fields.values()} - + schema = { + field.name: field.type.value + for field in self.component_spec.produces.values() + } return self._create_write_task(dataframe, location=location, schema=schema) @staticmethod diff --git 
a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 3d4d6097f..bed4df80d 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -491,14 +491,11 @@ def optional_fondant_arguments() -> t.List[str]: @staticmethod def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """Factory that creates a function to wrap the component transform function. The wrapper: - - Converts the columns to hierarchical format before passing the dataframe to the - transform function - Removes extra columns from the returned dataframe which are not defined in the component spec `produces` section - Sorts the columns from the returned dataframe according to the order in the component spec `produces` section to match the order in the `meta` argument passed to Dask's `map_partitions`. - - Flattens the returned dataframe columns. Args: transform: Transform method to wrap @@ -506,27 +503,13 @@ def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """ def wrapped_transform(dataframe: pd.DataFrame) -> pd.DataFrame: - # Switch to hierarchical columns - dataframe.columns = pd.MultiIndex.from_tuples( - tuple(column.split("_")) for column in dataframe.columns - ) - # Call transform method dataframe = transform(dataframe) # Drop columns not in specification - columns = [ - (subset_name, field) - for subset_name, subset in spec.produces.items() - for field in subset.fields - ] - dataframe = dataframe[columns] - - # Switch to flattened columns - dataframe.columns = [ - "_".join(column) for column in dataframe.columns.to_flat_index() - ] - return dataframe + columns = [name for name, field in spec.produces.items()] + + return dataframe[columns] return wrapped_transform @@ -552,11 +535,8 @@ def _execute_component( # Create meta dataframe with expected format meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in 
subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series(dtype=pd.ArrowDtype(field.type.value)) meta_df = pd.DataFrame(meta_dict).set_index("id") wrapped_transform = self.wrap_transform(component.transform, spec=self.spec) @@ -569,12 +549,16 @@ def _execute_component( # Clear divisions if component spec indicates that the index is changed if self._infer_index_change(): + # TODO: might causing issues for merging components + # to guarantee fast merging of large dataframes we need to keep the division information dataframe.clear_divisions() return dataframe + # TODO: fix in #244 def _infer_index_change(self) -> bool: """Infer if this component changes the index based on its component spec.""" + """ if not self.spec.accepts_additional_subsets: return True if not self.spec.outputs_additional_subsets: @@ -585,6 +569,8 @@ def _infer_index_change(self) -> bool: return any( not subset.additional_fields for subset in self.spec.produces.values() ) + """ + return False class DaskWriteExecutor(Executor[DaskWriteComponent]): diff --git a/tests/test_component.py b/tests/component/test_component.py similarity index 98% rename from tests/test_component.py rename to tests/component/test_component.py index e5dcb3bc3..9f910dbcd 100644 --- a/tests/test_component.py +++ b/tests/component/test_component.py @@ -23,8 +23,8 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest, Metadata -components_path = Path(__file__).parent / "example_specs/components" -base_path = Path(__file__).parent / "example_specs/mock_base_path" +components_path = Path(__file__).parent.parent / "examples/example_specs/components" +base_path = Path(__file__).parent.parent / "examples/example_specs/mock_base_path" N_PARTITIONS = 2 diff --git a/tests/test_data_io.py b/tests/component/test_data_io.py similarity index 
61% rename from tests/test_data_io.py rename to tests/component/test_data_io.py index 9ade4a329..89eadddb9 100644 --- a/tests/test_data_io.py +++ b/tests/component/test_data_io.py @@ -8,10 +8,13 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -manifest_path = Path(__file__).parent / "example_data/manifest.json" -component_spec_path = Path(__file__).parent / "example_data/components/1.yaml" +manifest_path = Path(__file__).parent.parent / "examples/example_data/manifest.json" +component_spec_path = ( + Path(__file__).parent.parent / "examples/example_data/components/1.yaml" +) NUMBER_OF_TEST_ROWS = 151 +NUMBER_OF_TEST_ROWS = 151 @pytest.fixture() @@ -37,33 +40,16 @@ def dataframe(manifest, component_spec): return data_loader.load_dataframe() -def test_load_index(manifest, component_spec): - """Test the loading of just the index.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - index_df = data_loader._load_index() - assert len(index_df) == NUMBER_OF_TEST_ROWS - assert index_df.index.name == "id" - - -def test_load_subset(manifest, component_spec): - """Test the loading of one field of a subset.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - subset_df = data_loader._load_subset(subset_name="types", fields=["Type 1"]) - assert len(subset_df) == NUMBER_OF_TEST_ROWS - assert list(subset_df.columns) == ["types_Type 1"] - assert subset_df.index.name == "id" - - def test_load_dataframe(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader(manifest=manifest, component_spec=component_spec) dataframe = dl.load_dataframe() assert len(dataframe) == NUMBER_OF_TEST_ROWS assert list(dataframe.columns) == [ - "properties_Name", - "properties_HP", - "types_Type 1", - "types_Type 2", + "Name", + "HP", + "Type 1", + "Type 
2", ] assert dataframe.index.name == "id" @@ -78,7 +64,7 @@ def test_load_dataframe_default(manifest, component_spec): def test_load_dataframe_rows(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader( manifest=manifest, component_spec=component_spec, @@ -89,34 +75,7 @@ def test_load_dataframe_rows(manifest, component_spec): assert dataframe.npartitions == expected_partitions -def test_write_index( - tmp_path_factory, - dataframe, - manifest, - component_spec, - dask_client, -): - """Test writing out the index.""" - with tmp_path_factory.mktemp("temp") as fn: - # override the base path of the manifest with the temp dir - manifest.update_metadata("base_path", str(fn)) - data_writer = DaskDataWriter( - manifest=manifest, - component_spec=component_spec, - ) - # write out index to temp dir - data_writer.write_dataframe(dataframe, dask_client) - number_workers = os.cpu_count() - # read written data and assert - dataframe = dd.read_parquet(fn / "index") - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert dataframe.index.name == "id" - assert dataframe.npartitions in list( - range(number_workers - 1, number_workers + 2), - ) - - -def test_write_subsets( +def test_write_dataset( tmp_path_factory, dataframe, manifest, @@ -125,11 +84,7 @@ def test_write_subsets( ): """Test writing out subsets.""" # Dictionary specifying the expected subsets to write and their column names - subset_columns_dict = { - "index": [], - "properties": ["Name", "HP"], - "types": ["Type 1", "Type 2"], - } + columns = ["Name", "HP", "Type 1", "Type 2"] with tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) @@ -137,13 +92,14 @@ def test_write_subsets( # write dataframe to temp dir data_writer.write_dataframe(dataframe, dask_client) # read written data and assert - 
for subset, subset_columns in subset_columns_dict.items(): - dataframe = dd.read_parquet(fn / subset) - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert list(dataframe.columns) == subset_columns - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert len(dataframe) == NUMBER_OF_TEST_ROWS + assert list(dataframe.columns) == columns + assert dataframe.index.name == "id" +# TODO: check if this is still needed? +# Change this to a validation instead? def test_write_reset_index( tmp_path_factory, dataframe, @@ -151,7 +107,7 @@ def test_write_reset_index( component_spec, dask_client, ): - """Test writing out the index and subsets that have no dask index and checking + """Test writing out the index and fields that have no dask index and checking if the id index was created. """ dataframe = dataframe.reset_index(drop=True) @@ -160,10 +116,8 @@ def test_write_reset_index( data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) data_writer.write_dataframe(dataframe, dask_client) - - for subset in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / subset) - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" @pytest.mark.parametrize("partitions", list(range(1, 5))) @@ -189,29 +143,51 @@ def test_write_divisions( # noqa: PLR0913 data_writer.write_dataframe(dataframe, dask_client) - for target in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / target) - assert dataframe.index.name == "id" - assert dataframe.npartitions == partitions + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" + assert dataframe.npartitions == partitions + + +def test_write_fields_invalid( + tmp_path_factory, + dataframe, + manifest, + component_spec, + dask_client, +): + """Test writing out fields but the dataframe columns are incomplete.""" + with tmp_path_factory.mktemp("temp") as fn: + # override the base path of the manifest with the 
temp dir + manifest.update_metadata("base_path", str(fn)) + # Drop one of the columns required in the output + dataframe = dataframe.drop(["Type 2"], axis=1) + data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) + expected_error_msg = ( + r"Fields \['Type 2'\] defined in output dataset " + r"but not found in dataframe" + ) + with pytest.raises(ValueError, match=expected_error_msg): + data_writer.write_dataframe(dataframe, dask_client) -def test_write_subsets_invalid( +def test_write_fields_invalid_several_fields_missing( tmp_path_factory, dataframe, manifest, component_spec, dask_client, ): - """Test writing out subsets but the dataframe columns are incomplete.""" + """Test writing out fields but the dataframe columns are incomplete.""" with tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) # Drop one of the columns required in the output - dataframe = dataframe.drop(["types_Type 2"], axis=1) + dataframe = dataframe.drop(["Type 1"], axis=1) + dataframe = dataframe.drop(["Type 2"], axis=1) data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) expected_error_msg = ( - r"Field \['types_Type 2'\] not in index defined in output subset " - r"types but not found in dataframe" + r"Fields \['Type 1', 'Type 2'\] defined in output dataset " + r"but not found in dataframe" ) with pytest.raises(ValueError, match=expected_error_msg): data_writer.write_dataframe(dataframe, dask_client) diff --git a/tests/examples/example_data/components/1.yaml b/tests/examples/example_data/components/1.yaml index 0c245a512..95e5e578f 100644 --- a/tests/examples/example_data/components/1.yaml +++ b/tests/examples/example_data/components/1.yaml @@ -3,32 +3,26 @@ description: This is an example component image: example_component:latest consumes: - properties: - fields: - Name: - type: "string" - HP: - type: "int32" - types: - fields: - Type 1: - type: 
"string" - Type 2: - type: "string" + Name: + type: "string" + HP: + type: "int32" + + Type 1: + type: "string" + Type 2: + type: "string" produces: - properties: - fields: - Name: - type: "string" - HP: - type: "int32" - types: - fields: - Type 1: - type: "string" - Type 2: - type: "string" + Name: + type: "string" + HP: + type: "int32" + Type 1: + type: "string" + Type 2: + type: "string" + args: storage_args: description: Storage arguments diff --git a/tests/examples/example_data/manifest.json b/tests/examples/example_data/manifest.json index 8fe4ef16b..14366fbbd 100644 --- a/tests/examples/example_data/manifest.json +++ b/tests/examples/example_data/manifest.json @@ -1,35 +1,29 @@ { - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "tests/example_data/subsets_input", - "run_id": "test_pipeline_12345", - "component_id": "67890" + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "tests/examples/example_data", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component_1" + }, + "fields": { + "Name": { + "type": "string", + "location": "/component_1" }, - "index": { - "location": "/index" + "HP": { + "type": "int32", + "location": "/component_1" }, - "subsets": { - "properties": { - "location": "/properties", - "fields": { - "Name": { - "type": "string" - }, - "HP": { - "type": "int32" - } - } - }, - "types": { - "location": "/types", - "fields": { - "Type 1": { - "type": "string" - }, - "Type 2": { - "type": "string" - } - } - } + "Type 1": { + "type": "string", + "location": "/component_2" + }, + "Type 2": { + "type": "string", + "location": "/component_2" } - } \ No newline at end of file + } +} \ No newline at end of file diff --git a/tests/examples/example_data/raw/split.py b/tests/examples/example_data/raw/split.py index 6800ee323..ade466125 100644 --- a/tests/examples/example_data/raw/split.py +++ b/tests/examples/example_data/raw/split.py @@ -13,7 +13,7 @@ import dask.dataframe 
as dd data_path = Path(__file__).parent -output_path = Path(__file__).parent.parent / "subsets_input/" +output_path = Path(__file__).parent.parent def split_into_subsets(): @@ -22,17 +22,13 @@ def split_into_subsets(): master_df = master_df.set_index("id", sorted=True) master_df = master_df.repartition(divisions=[0, 50, 100, 151], force=True) - # create index subset - index_df = master_df.index.to_frame().drop(columns=["id"]) - index_df.to_parquet(output_path / "index") - # create properties subset properties_df = master_df[["Name", "HP"]] - properties_df.to_parquet(output_path / "properties") + properties_df.to_parquet(output_path / "component_1") # create types subset types_df = master_df[["Type 1", "Type 2"]] - types_df.to_parquet(output_path / "types") + types_df.to_parquet(output_path / "component_2") if __name__ == "__main__": diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt index 768ddfb21..614074264 100644 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt +++ b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt @@ -1 +1 @@ -tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file +tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file From 9f67c6171fd0b5421f72ba2424476167439e0929 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 14:57:54 +0100 Subject: [PATCH 12/34] Update src/fondant/core/component_spec.py Co-authored-by: Robbe Sneyders --- src/fondant/core/component_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index a3d19178d..b3305eeb6 100644 --- a/src/fondant/core/component_spec.py +++ 
b/src/fondant/core/component_spec.py @@ -167,7 +167,7 @@ def index(self): @property def consumes(self) -> t.Mapping[str, Field]: - """The subsets consumed by the component as an immutable mapping.""" + """The fields consumed by the component as an immutable mapping.""" return types.MappingProxyType( { name: Field(name=name, type=Type.from_json(field)) From 40955bf95a2dae5c7e881d2c0eedce3ea2e8a62f Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 14:58:09 +0100 Subject: [PATCH 13/34] Update src/fondant/core/manifest.py Co-authored-by: Robbe Sneyders --- src/fondant/core/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index bd7889ff3..56b6bd1e5 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -196,7 +196,7 @@ def cache_key(self) -> str: @property def fields(self) -> t.Mapping[str, Field]: - """The subsets of the manifest as an immutable mapping.""" + """The fields of the manifest as an immutable mapping.""" # e.g. 
('images', {'location': '/component1', 'type': 'binary'}) return types.MappingProxyType( { From 6b246a4618892ae1afad221936efcbed2033e49d Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 14:58:20 +0100 Subject: [PATCH 14/34] Update src/fondant/core/component_spec.py Co-authored-by: Robbe Sneyders --- src/fondant/core/component_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index b3305eeb6..5ecfa981b 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -177,7 +177,7 @@ def consumes(self) -> t.Mapping[str, Field]: @property def produces(self) -> t.Mapping[str, Field]: - """The subsets produced by the component as an immutable mapping.""" + """The fields produced by the component as an immutable mapping.""" return types.MappingProxyType( { name: Field(name=name, type=Type.from_json(field)) From 8ef38d9d39415d07874610de5de99834810f6123 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 15:02:54 +0100 Subject: [PATCH 15/34] Update src/fondant/core/manifest.py Co-authored-by: Robbe Sneyders --- src/fondant/core/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 56b6bd1e5..31872a6ad 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -288,7 +288,7 @@ def evolve( # : PLR0912 (too many branches) if run_id is not None: evolved_manifest.update_metadata(key="run_id", value=run_id) - # Update index location as this is currently always rewritten + # Update index location as this is always rewritten evolved_manifest.add_or_update_field(Field(name="index")) # evolved_manifest._specification["index"][ # "location" From e8c8135f588cf1092488e58c2ae468289d023c19 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 15:03:38 +0100 Subject: [PATCH 16/34] Update 
src/fondant/core/schema.py Co-authored-by: Robbe Sneyders --- src/fondant/core/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index b775a2c0f..b8549d0d5 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -188,7 +188,7 @@ def type(self) -> Type: @property def location(self) -> str: - """The absolute location of the field.""" + """The relative location of the field.""" return self._location From df9a60ece6ff8c6ab4f5ca625d616e20daa09e97 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 15:09:36 +0100 Subject: [PATCH 17/34] Addresses comments --- src/fondant/core/manifest.py | 12 ------------ tests/core/test_manifest.py | 22 +++++----------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 31872a6ad..a239159f3 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -209,18 +209,6 @@ def fields(self) -> t.Mapping[str, Field]: }, ) - def add_fields( - self, - fields: t.Iterable[Field], - ) -> None: - """Add fields to manifest.""" - for field in fields: - if field.name in self._specification["fields"]: - msg = f"A field with name {field.name} already exists" - raise ValueError(msg) - - self.add_or_update_field(field, overwrite=False) - def add_or_update_field(self, field: Field, overwrite: bool = False): """Add or update field to manifest.""" if field.name == "index": diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index a978f66c6..3174dcd60 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -84,12 +84,8 @@ def test_manifest_creation(): cache_key=cache_key, ) - manifest.add_fields( - [ - Field(name="width", type=Type("int32")), - Field(name="height", type=Type("int32")), - ], - ) + manifest.add_or_update_field(Field(name="width", type=Type("int32"))) + 
manifest.add_or_update_field(Field(name="height", type=Type("int32"))) manifest.add_or_update_field(Field(name="data", type=Type("binary"))) assert manifest._specification == { @@ -139,23 +135,15 @@ def test_manifest_alteration(valid_manifest): manifest = Manifest(valid_manifest) # test adding a subset - manifest.add_fields( - [ - Field(name="width2", type=Type("int32")), - Field(name="height2", type=Type("int32")), - ], - ) + manifest.add_or_update_field(Field(name="width2", type=Type("int32"))) + manifest.add_or_update_field(Field(name="height2", type=Type("int32"))) assert "width2" in manifest.fields assert "height2" in manifest.fields # test adding a duplicate subset with pytest.raises(ValueError, match="A field with name width2 already exists"): - manifest.add_fields( - [ - Field(name="width2", type=Type("int32")), - ], - ) + manifest.add_or_update_field(Field(name="width2", type=Type("int32"))) # test removing a subset manifest.remove_field("width2") From 2256118312aa0b7e1435a8a1edd3dbf34485cca1 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 15:22:16 +0100 Subject: [PATCH 18/34] Addresses comments --- src/fondant/core/manifest.py | 7 +++++-- tests/core/test_manifest.py | 23 ++++------------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index a239159f3..670b9928c 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -242,7 +242,7 @@ def _add_or_update_index(self, field: Field, overwrite: bool = True): raise ValueError(msg) self._specification["index"] = { - "location": f"/{self.component_id}", + "location": f"/{field.location}", } def remove_field(self, name: str) -> None: @@ -277,7 +277,10 @@ def evolve( # : PLR0912 (too many branches) evolved_manifest.update_metadata(key="run_id", value=run_id) # Update index location as this is always rewritten - evolved_manifest.add_or_update_field(Field(name="index")) + 
evolved_manifest.add_or_update_field( + Field(name="index", location=component_spec.component_folder_name) + ) + # evolved_manifest._specification["index"][ # "location" # ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}" diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index 3174dcd60..676d664a7 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -8,7 +8,9 @@ from fondant.core.manifest import Field, Manifest, Type manifest_path = Path(__file__).parent.parent / "examples" / "example_specs/manifests" -component_specs_path = Path(__file__).parent.parent / "examples" / "example_specs/component_specs" +component_specs_path = ( + Path(__file__).parent.parent / "examples" / "example_specs/component_specs" +) @pytest.fixture() @@ -227,26 +229,9 @@ def test_fields(): assert "field_1" not in manifest.fields -def test_accessing_the_index(): - """Test that test the index access.""" - run_id = "A" - manifest = Manifest.create( - pipeline_name="NAME", - base_path="/base_path", - run_id=run_id, - component_id="component_1", - cache_key="42", - ) - - # Add index field - manifest.metadata["component_id"] = "component_2" - manifest.add_or_update_field(Field(name="index", type=Type("int32"))) - assert manifest.index["location"] == "/component_2" - - def test_field_mapping(valid_manifest): """Test field mapping generation.""" - manifest_path = Path(__file__).parent / "example_specs/manifests" + Path(__file__).parent / "example_specs/manifests" manifest = Manifest(valid_manifest) field_mapping = manifest.field_mapping assert field_mapping == { From 3042fb549029d06fcbde56a50b7739aff1a21e82 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 15:23:45 +0100 Subject: [PATCH 19/34] Addresses comments --- src/fondant/core/manifest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 670b9928c..e962f1825 100644 --- 
a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -197,7 +197,6 @@ def cache_key(self) -> str: @property def fields(self) -> t.Mapping[str, Field]: """The fields of the manifest as an immutable mapping.""" - # e.g. ('images', {'location': '/component1', 'type': 'binary'}) return types.MappingProxyType( { name: Field( From 8fa8be72eeea6515765bef5166a23d03c225bfcb Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Tue, 21 Nov 2023 15:24:27 +0100 Subject: [PATCH 20/34] Update src/fondant/core/manifest.py Co-authored-by: Robbe Sneyders --- src/fondant/core/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index e962f1825..1d22b7e73 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -286,7 +286,7 @@ def evolve( # : PLR0912 (too many branches) # TODO handle additionalFields - # For each output subset defined in the component, add or update it + # Add or update all produced fields defined in the component spec for name, field in component_spec.produces.items(): # If field was part not part of the input manifest, add field to output manifest. 
# If field was part of the input manifest and got produced by the component, update From 25eb49270e6a2a8b9da05cf9619affe4e07e4f2a Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 22 Nov 2023 08:16:14 +0100 Subject: [PATCH 21/34] Addresses comments --- src/fondant/core/component_spec.py | 4 +- src/fondant/core/manifest.py | 8 +--- src/fondant/core/schemas/component_spec.json | 7 +--- .../component_specs/invalid_component.yaml | 0 .../component_specs/kubeflow_component.yaml | 0 .../component_specs/valid_component.yaml | 0 .../valid_component_no_args.yaml | 0 .../evolution_examples/1/component.yaml | 0 .../evolution_examples/1/output_manifest.json | 14 +++---- .../evolution_examples/2}/component.yaml | 10 +++-- .../evolution_examples/2/output_manifest.json | 33 ++++++++++++++++ .../evolution_examples/3/component.yaml | 16 ++++++++ .../evolution_examples/3/output_manifest.json | 29 ++++++++++++++ .../evolution_examples/4/component.yaml | 12 ++++++ .../evolution_examples/4/output_manifest.json | 29 ++++++++++++++ .../evolution_examples/input_manifest.json | 10 ++--- .../examples}/manifests/invalid_manifest.json | 0 .../examples}/manifests/valid_manifest.json | 0 tests/core/test_component_specs.py | 2 +- tests/core/test_manifest.py | 6 +-- tests/core/test_manifest_evolution.py | 21 +--------- .../evolution_examples/2/component.yaml | 23 ----------- .../evolution_examples/2/output_manifest.json | 38 ------------------- .../evolution_examples/3/component.yaml | 24 ------------ .../evolution_examples/3/output_manifest.json | 32 ---------------- .../evolution_examples/4/component.yaml | 20 ---------- .../evolution_examples/4/output_manifest.json | 38 ------------------- .../evolution_examples/5/component.yaml | 21 ---------- .../evolution_examples/5/output_manifest.json | 29 -------------- .../evolution_examples/6/component.yaml | 22 ----------- .../evolution_examples/6/output_manifest.json | 21 ---------- .../evolution_examples/7/component.yaml | 22 ----------- 
.../evolution_examples/7/output_manifest.json | 21 ---------- .../evolution_examples/8/output_manifest.json | 35 ----------------- 34 files changed, 147 insertions(+), 400 deletions(-) rename tests/{examples/example_specs => core/examples}/component_specs/invalid_component.yaml (100%) rename tests/{examples/example_specs => core/examples}/component_specs/kubeflow_component.yaml (100%) rename tests/{examples/example_specs => core/examples}/component_specs/valid_component.yaml (100%) rename tests/{examples/example_specs => core/examples}/component_specs/valid_component_no_args.yaml (100%) rename tests/{examples/example_specs => core/examples}/evolution_examples/1/component.yaml (100%) rename tests/{examples/example_specs => core/examples}/evolution_examples/1/output_manifest.json (53%) rename tests/{examples/example_specs/evolution_examples/8 => core/examples/evolution_examples/2}/component.yaml (69%) create mode 100644 tests/core/examples/evolution_examples/2/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/3/component.yaml create mode 100644 tests/core/examples/evolution_examples/3/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/4/component.yaml create mode 100644 tests/core/examples/evolution_examples/4/output_manifest.json rename tests/{examples/example_specs => core/examples}/evolution_examples/input_manifest.json (59%) rename tests/{examples/example_specs => core/examples}/manifests/invalid_manifest.json (100%) rename tests/{examples/example_specs => core/examples}/manifests/valid_manifest.json (100%) delete mode 100644 tests/examples/example_specs/evolution_examples/2/component.yaml delete mode 100644 tests/examples/example_specs/evolution_examples/2/output_manifest.json delete mode 100644 tests/examples/example_specs/evolution_examples/3/component.yaml delete mode 100644 tests/examples/example_specs/evolution_examples/3/output_manifest.json delete mode 100644 
tests/examples/example_specs/evolution_examples/4/component.yaml delete mode 100644 tests/examples/example_specs/evolution_examples/4/output_manifest.json delete mode 100644 tests/examples/example_specs/evolution_examples/5/component.yaml delete mode 100644 tests/examples/example_specs/evolution_examples/5/output_manifest.json delete mode 100644 tests/examples/example_specs/evolution_examples/6/component.yaml delete mode 100644 tests/examples/example_specs/evolution_examples/6/output_manifest.json delete mode 100644 tests/examples/example_specs/evolution_examples/7/component.yaml delete mode 100644 tests/examples/example_specs/evolution_examples/7/output_manifest.json delete mode 100644 tests/examples/example_specs/evolution_examples/8/output_manifest.json diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index 5ecfa981b..fa5bb6ac6 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -171,7 +171,7 @@ def consumes(self) -> t.Mapping[str, Field]: return types.MappingProxyType( { name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["consumes"].items() + for name, field in self._specification.get("consumes", {}).items() }, ) @@ -181,7 +181,7 @@ def produces(self) -> t.Mapping[str, Field]: return types.MappingProxyType( { name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["produces"].items() + for name, field in self._specification.get("produces", {}).items() }, ) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 1d22b7e73..04e1dfd0b 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -251,7 +251,7 @@ def remove_field(self, name: str) -> None: del self._specification["fields"][name] - def evolve( # : PLR0912 (too many branches) + def evolve( # noqa : PLR0912 (too many branches) self, component_spec: ComponentSpec, *, @@ -277,13 +277,9 @@ def evolve( # : PLR0912 
(too many branches) # Update index location as this is always rewritten evolved_manifest.add_or_update_field( - Field(name="index", location=component_spec.component_folder_name) + Field(name="index", location=component_spec.component_folder_name), ) - # evolved_manifest._specification["index"][ - # "location" - # ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}" - # TODO handle additionalFields # Add or update all produced fields defined in the component spec diff --git a/src/fondant/core/schemas/component_spec.json b/src/fondant/core/schemas/component_spec.json index 418c5cb2b..064ea027d 100644 --- a/src/fondant/core/schemas/component_spec.json +++ b/src/fondant/core/schemas/component_spec.json @@ -28,19 +28,16 @@ } }, "consumes": { - "$ref": "#/definitions/field" + "$ref": "common.json#/definitions/fields" }, "produces": { - "$ref": "#/definitions/field" + "$ref": "common.json#/definitions/fields" }, "args": { "$ref": "#/definitions/args" } }, "definitions": { - "field": { - "$ref": "common.json#/definitions/fields" - }, "args": { "type": "object", "minProperties": 1, diff --git a/tests/examples/example_specs/component_specs/invalid_component.yaml b/tests/core/examples/component_specs/invalid_component.yaml similarity index 100% rename from tests/examples/example_specs/component_specs/invalid_component.yaml rename to tests/core/examples/component_specs/invalid_component.yaml diff --git a/tests/examples/example_specs/component_specs/kubeflow_component.yaml b/tests/core/examples/component_specs/kubeflow_component.yaml similarity index 100% rename from tests/examples/example_specs/component_specs/kubeflow_component.yaml rename to tests/core/examples/component_specs/kubeflow_component.yaml diff --git a/tests/examples/example_specs/component_specs/valid_component.yaml b/tests/core/examples/component_specs/valid_component.yaml similarity index 100% rename from tests/examples/example_specs/component_specs/valid_component.yaml rename to 
tests/core/examples/component_specs/valid_component.yaml diff --git a/tests/examples/example_specs/component_specs/valid_component_no_args.yaml b/tests/core/examples/component_specs/valid_component_no_args.yaml similarity index 100% rename from tests/examples/example_specs/component_specs/valid_component_no_args.yaml rename to tests/core/examples/component_specs/valid_component_no_args.yaml diff --git a/tests/examples/example_specs/evolution_examples/1/component.yaml b/tests/core/examples/evolution_examples/1/component.yaml similarity index 100% rename from tests/examples/example_specs/evolution_examples/1/component.yaml rename to tests/core/examples/evolution_examples/1/component.yaml diff --git a/tests/examples/example_specs/evolution_examples/1/output_manifest.json b/tests/core/examples/evolution_examples/1/output_manifest.json similarity index 53% rename from tests/examples/example_specs/evolution_examples/1/output_manifest.json rename to tests/core/examples/evolution_examples/1/output_manifest.json index 985f99625..e5a2d8aa0 100644 --- a/tests/examples/example_specs/evolution_examples/1/output_manifest.json +++ b/tests/core/examples/evolution_examples/1/output_manifest.json @@ -6,28 +6,28 @@ "component_id":"example_component" }, "index":{ - "location":"/test_pipeline/custom_run_id/example_component" + "location":"/example_component" }, "fields": { "images_width": { "type": "int32", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "images_height": { "type": "int32", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "images_data": { "type": "binary", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "captions_data": { "type": "binary", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "embeddings_data": { - "type": "ListType(list)", - "location":"/test_pipeline/12345/example_component" + "type": 
"list", + "location":"/example_component" } } } \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/8/component.yaml b/tests/core/examples/evolution_examples/2/component.yaml similarity index 69% rename from tests/examples/example_specs/evolution_examples/8/component.yaml rename to tests/core/examples/evolution_examples/2/component.yaml index 5c204b9c2..2352adcb5 100644 --- a/tests/examples/example_specs/evolution_examples/8/component.yaml +++ b/tests/core/examples/evolution_examples/2/component.yaml @@ -3,10 +3,12 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary + +produces: + images_encoding: + type: string args: storage_args: diff --git a/tests/core/examples/evolution_examples/2/output_manifest.json b/tests/core/examples/evolution_examples/2/output_manifest.json new file mode 100644 index 000000000..ca1f6f361 --- /dev/null +++ b/tests/core/examples/evolution_examples/2/output_manifest.json @@ -0,0 +1,33 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + }, + "images_encoding": { + "type": "string", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/3/component.yaml b/tests/core/examples/evolution_examples/3/component.yaml new file mode 100644 index 000000000..13b1427b3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/component.yaml @@ -0,0 +1,16 @@ +name: 
Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +produces: + images_data: + type: string + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/3/output_manifest.json b/tests/core/examples/evolution_examples/3/output_manifest.json new file mode 100644 index 000000000..b11f7d8a3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component_1" + }, + "index":{ + "location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "string", + "location":"/example_component_1" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/4/component.yaml b/tests/core/examples/evolution_examples/4/component.yaml new file mode 100644 index 000000000..1b766036d --- /dev/null +++ b/tests/core/examples/evolution_examples/4/component.yaml @@ -0,0 +1,12 @@ +name: Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/4/output_manifest.json b/tests/core/examples/evolution_examples/4/output_manifest.json new file mode 100644 index 000000000..929c380ab --- /dev/null +++ b/tests/core/examples/evolution_examples/4/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + 
"component_id":"example_component_1" + }, + "index":{ + "location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/input_manifest.json b/tests/core/examples/evolution_examples/input_manifest.json similarity index 59% rename from tests/examples/example_specs/evolution_examples/input_manifest.json rename to tests/core/examples/evolution_examples/input_manifest.json index a1ae120b0..664367cc2 100644 --- a/tests/examples/example_specs/evolution_examples/input_manifest.json +++ b/tests/core/examples/evolution_examples/input_manifest.json @@ -6,24 +6,24 @@ "component_id":"example_component" }, "index":{ - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "fields": { "images_width": { "type": "int32", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "images_height": { "type": "int32", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "images_data": { "type": "binary", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" }, "captions_data": { "type": "binary", - "location":"/test_pipeline/12345/example_component" + "location":"/example_component" } } } \ No newline at end of file diff --git a/tests/examples/example_specs/manifests/invalid_manifest.json b/tests/core/examples/manifests/invalid_manifest.json similarity index 100% rename from tests/examples/example_specs/manifests/invalid_manifest.json rename to tests/core/examples/manifests/invalid_manifest.json diff --git 
a/tests/examples/example_specs/manifests/valid_manifest.json b/tests/core/examples/manifests/valid_manifest.json similarity index 100% rename from tests/examples/example_specs/manifests/valid_manifest.json rename to tests/core/examples/manifests/valid_manifest.json diff --git a/tests/core/test_component_specs.py b/tests/core/test_component_specs.py index 48b10ce2b..7130c890e 100644 --- a/tests/core/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -13,7 +13,7 @@ from fondant.core.exceptions import InvalidComponentSpec from fondant.core.schema import Type -component_specs_path = Path(__file__).parent.parent / "examples/example_specs/component_specs" +component_specs_path = Path(__file__).parent / "examples/component_specs" @pytest.fixture() diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index 676d664a7..2cbfa54b9 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -7,10 +7,8 @@ from fondant.core.exceptions import InvalidManifest from fondant.core.manifest import Field, Manifest, Type -manifest_path = Path(__file__).parent.parent / "examples" / "example_specs/manifests" -component_specs_path = ( - Path(__file__).parent.parent / "examples" / "example_specs/component_specs" -) +manifest_path = Path(__file__).parent / "examples" / "manifests" +component_specs_path = Path(__file__).parent / "examples" / "component_specs" @pytest.fixture() diff --git a/tests/core/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py index 5dce88885..c7b8ab130 100644 --- a/tests/core/test_manifest_evolution.py +++ b/tests/core/test_manifest_evolution.py @@ -6,7 +6,7 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -examples_path = Path(__file__).parent / "examples/example_specs/evolution_examples" +examples_path = Path(__file__).parent / "examples/evolution_examples" @pytest.fixture() @@ -35,22 +35,3 @@ def test_evolution(input_manifest, component_spec, 
output_manifest): ) assert evolved_manifest._specification == output_manifest - - -def test_component_spec_location_update(): - with open(examples_path / "input_manifest.json") as f: - input_manifest = json.load(f) - - with open(examples_path / "7/component.yaml") as f: - specification = yaml.safe_load(f) - - manifest = Manifest(input_manifest) - component_spec = ComponentSpec(specification) - evolved_manifest = manifest.evolve( - component_spec=component_spec, - ) - - assert ( - evolved_manifest._specification["subsets"]["images"]["location"] - == "/test_pipeline/12345/example_component/images" - ) diff --git a/tests/examples/example_specs/evolution_examples/2/component.yaml b/tests/examples/example_specs/evolution_examples/2/component.yaml deleted file mode 100644 index f37ff99d1..000000000 --- a/tests/examples/example_specs/evolution_examples/2/component.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - additionalSubsets: false - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_specs/evolution_examples/2/output_manifest.json b/tests/examples/example_specs/evolution_examples/2/output_manifest.json deleted file mode 100644 index 3a40b1c9d..000000000 --- a/tests/examples/example_specs/evolution_examples/2/output_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - 
} - } - }, - "embeddings":{ - "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/3/component.yaml b/tests/examples/example_specs/evolution_examples/3/component.yaml deleted file mode 100644 index 6753a083b..000000000 --- a/tests/examples/example_specs/evolution_examples/3/component.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - additionalFields: false - additionalSubsets: false - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_specs/evolution_examples/3/output_manifest.json b/tests/examples/example_specs/evolution_examples/3/output_manifest.json deleted file mode 100644 index a9abda6d0..000000000 --- a/tests/examples/example_specs/evolution_examples/3/output_manifest.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ - "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/4/component.yaml b/tests/examples/example_specs/evolution_examples/4/component.yaml deleted file mode 100644 index 067b06da0..000000000 
--- a/tests/examples/example_specs/evolution_examples/4/component.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - encoding: - type: string - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_specs/evolution_examples/4/output_manifest.json b/tests/examples/example_specs/evolution_examples/4/output_manifest.json deleted file mode 100644 index 24af4f2ac..000000000 --- a/tests/examples/example_specs/evolution_examples/4/output_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - }, - "encoding":{ - "type":"string" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/5/component.yaml b/tests/examples/example_specs/evolution_examples/5/component.yaml deleted file mode 100644 index 93aaf68b3..000000000 --- a/tests/examples/example_specs/evolution_examples/5/component.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - encoding: - type: string - additionalFields: false - -args: - storage_args: - description: Storage arguments - type: str diff --git 
a/tests/examples/example_specs/evolution_examples/5/output_manifest.json b/tests/examples/example_specs/evolution_examples/5/output_manifest.json deleted file mode 100644 index 8bcf6141d..000000000 --- a/tests/examples/example_specs/evolution_examples/5/output_manifest.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "encoding":{ - "type":"string" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/6/component.yaml b/tests/examples/example_specs/evolution_examples/6/component.yaml deleted file mode 100644 index 065061791..000000000 --- a/tests/examples/example_specs/evolution_examples/6/component.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - encoding: - type: string - additionalFields: false - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_specs/evolution_examples/6/output_manifest.json b/tests/examples/example_specs/evolution_examples/6/output_manifest.json deleted file mode 100644 index b7521bf66..000000000 --- a/tests/examples/example_specs/evolution_examples/6/output_manifest.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - 
"location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "encoding":{ - "type":"string" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/7/component.yaml b/tests/examples/example_specs/evolution_examples/7/component.yaml deleted file mode 100644 index 5746ffa4d..000000000 --- a/tests/examples/example_specs/evolution_examples/7/component.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - data: - type: string - additionalFields: false - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_specs/evolution_examples/7/output_manifest.json b/tests/examples/example_specs/evolution_examples/7/output_manifest.json deleted file mode 100644 index a9eb8a308..000000000 --- a/tests/examples/example_specs/evolution_examples/7/output_manifest.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "data":{ - "type":"string" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/evolution_examples/8/output_manifest.json b/tests/examples/example_specs/evolution_examples/8/output_manifest.json deleted file mode 100644 index de2621c49..000000000 --- a/tests/examples/example_specs/evolution_examples/8/output_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata": { - "pipeline_name": 
"test_pipeline", - "base_path": "gs://bucket", - "run_id": "custom_run_id", - "component_id": "example_component" - }, - "index": { - "location": "/test_pipeline/custom_run_id/example_component/index" - }, - "subsets": { - "images": { - "location": "/test_pipeline/12345/example_component/images", - "fields": { - "width": { - "type": "int32" - }, - "height": { - "type": "int32" - }, - "data": { - "type": "binary" - } - } - }, - "captions": { - "location": "/test_pipeline/12345/example_component/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} From 07016622b966eb4c2c3f3cd174c73165a2aa7707 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 22 Nov 2023 10:32:09 +0100 Subject: [PATCH 22/34] Addresses comments --- src/fondant/component/data_io.py | 38 ++++++++++++-------------------- src/fondant/core/manifest.py | 25 +++++++++++++++++---- tests/component/test_data_io.py | 2 -- tests/core/test_manifest.py | 18 ++++++++++----- 4 files changed, 47 insertions(+), 36 deletions(-) diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 5d947ce76..f22072d7e 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -11,6 +11,8 @@ logger = logging.getLogger(__name__) +DEFAULT_INDEX_NAME = "id" + class DataIO: def __init__(self, *, manifest: Manifest, component_spec: ComponentSpec) -> None: @@ -82,31 +84,13 @@ def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: return dataframe - # TODO: probably not needed anymore! - def _load_index(self) -> dd.DataFrame: - """ - Function that loads the index from the manifest as a Dask dataframe. - - Returns: - The index as a dask dataframe - """ - # get index subset from the manifest - index = self.manifest.index - # get remote path - remote_path = index["location"] - - # load index from parquet, expecting id and source columns - # TODO: reduce dataframe to index loading? .loc[:, []]? 
- return dd.read_parquet(remote_path, calculate_divisions=True) - def load_dataframe(self) -> dd.DataFrame: """ Function that loads the subsets defined in the component spec as a single Dask dataframe for the user. Returns: - The Dask dataframe with the field columns in the format (_) - as well as the index columns. + The Dask dataframe with all columns defined in the manifest field mapping """ dataframe = None field_mapping = self.manifest.field_mapping @@ -114,7 +98,7 @@ def load_dataframe(self) -> dd.DataFrame: partial_df = dd.read_parquet( location, columns=fields, - index="id", + index=DEFAULT_INDEX_NAME, calculate_divisions=True, ) @@ -122,12 +106,16 @@ def load_dataframe(self) -> dd.DataFrame: # ensure that the index is set correctly and divisions are known. dataframe = partial_df else: - dask_divisions = partial_df.set_index("id").divisions - unique_divisions = list(dict.fromkeys(list(dask_divisions))) + dask_divisions = dataframe.divisions + unique_divisions = list(set(dask_divisions)) # apply set index to both dataframes - partial_df = partial_df.set_index("id", divisions=unique_divisions) - dataframe = dataframe.set_index("id", divisions=unique_divisions) + partial_df = partial_df.set_index( + DEFAULT_INDEX_NAME, divisions=unique_divisions + ) + dataframe = dataframe.set_index( + DEFAULT_INDEX_NAME, divisions=unique_divisions + ) dataframe = dataframe.merge( partial_df, @@ -161,6 +149,8 @@ def write_dataframe( column_name for column_name, field in self.component_spec.produces.items() ] + dataframe.index = dataframe.index.rename(DEFAULT_INDEX_NAME) + # validation that all columns are in the dataframe self.validate_dataframe_columns(dataframe, columns_to_produce) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 04e1dfd0b..629871d17 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -4,8 +4,10 @@ import pkgutil import types import typing as t +from collections import OrderedDict from dataclasses 
import asdict, dataclass from pathlib import Path +from types import MappingProxyType import jsonschema.exceptions from fsspec import open as fs_open @@ -144,8 +146,8 @@ def metadata(self) -> t.Dict[str, t.Any]: return self._specification["metadata"] @property - def index(self) -> t.Dict[str, t.Any]: - return self._specification["index"] + def index(self) -> Field: + return Field(name="Index", location=self._specification["index"]["location"]) def update_metadata(self, key: str, value: t.Any) -> None: self.metadata[key] = value @@ -155,12 +157,15 @@ def base_path(self) -> str: return self.metadata["base_path"] @property - def field_mapping(self): + def field_mapping(self) -> t.Mapping[str, t.List[str]]: """ Retrieve a mapping of field locations to corresponding field names. A dictionary where keys are field locations and values are lists of column names. + The method returns an immutable OrderedDict where the first dict element contains the + location of the dataframe with the index. This allows an efficient left join operation. 
+ Example: { "/base_path/component_1": ["Name", "HP"], @@ -176,7 +181,19 @@ def field_mapping(self): field_mapping[location].append(field_name) else: field_mapping[location] = [field_name] - return field_mapping + + # Sort field mapping that the first dataset contains the index + index_location = ( + f"{self.base_path}/{self.pipeline_name}/{self.run_id}{self.index.location}" + ) + sorted_keys = sorted( + field_mapping.keys(), key=lambda key: index_location == key, reverse=True + ) + sorted_field_mapping = OrderedDict( + (key, field_mapping[key]) for key in sorted_keys + ) + + return MappingProxyType(sorted_field_mapping) @property def run_id(self) -> str: diff --git a/tests/component/test_data_io.py b/tests/component/test_data_io.py index 89eadddb9..ed3e9169f 100644 --- a/tests/component/test_data_io.py +++ b/tests/component/test_data_io.py @@ -14,7 +14,6 @@ ) NUMBER_OF_TEST_ROWS = 151 -NUMBER_OF_TEST_ROWS = 151 @pytest.fixture() @@ -99,7 +98,6 @@ def test_write_dataset( # TODO: check if this is still needed? -# Change this to a validation instead? 
def test_write_reset_index( tmp_path_factory, dataframe, diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index 2cbfa54b9..6af69a387 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -1,5 +1,6 @@ import json import pkgutil +from collections import OrderedDict from pathlib import Path import pytest @@ -229,11 +230,16 @@ def test_fields(): def test_field_mapping(valid_manifest): """Test field mapping generation.""" - Path(__file__).parent / "example_specs/manifests" manifest = Manifest(valid_manifest) + manifest.add_or_update_field(Field(name="index", location="component2")) field_mapping = manifest.field_mapping - assert field_mapping == { - "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], - "gs://bucket/test_pipeline/test_pipeline_12345/component2": ["height", "width"], - "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], - } + assert field_mapping == OrderedDict( + { + "gs://bucket/test_pipeline/test_pipeline_12345/component2": [ + "height", + "width", + ], + "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], + "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], + } + ) From 365ca6dc6cdc51e8159a5d1d2a8dca575324b3c8 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 22 Nov 2023 11:00:06 +0100 Subject: [PATCH 23/34] Update test examples --- src/fondant/component/executor.py | 2 -- .../component_specs}/arguments/component.yaml | 0 .../arguments/component_default_args.yaml | 0 .../component_specs}/arguments/input_manifest.json | 2 +- .../examples/component_specs}/component.yaml | 0 .../examples/component_specs}/input_manifest.json | 0 .../examples/data}/components/1.yaml | 0 .../examples/data}/manifest.json | 2 +- .../examples/data}/raw/split.py | 0 .../examples/data}/raw/testset.parquet | Bin .../data}/subsets_input/index/part.0.parquet | Bin .../data}/subsets_input/index/part.1.parquet | Bin 
.../data}/subsets_input/index/part.2.parquet | Bin .../data}/subsets_input/properties/part.0.parquet | Bin .../data}/subsets_input/properties/part.1.parquet | Bin .../data}/subsets_input/properties/part.2.parquet | Bin .../data}/subsets_input/types/part.0.parquet | Bin .../data}/subsets_input/types/part.1.parquet | Bin .../data}/subsets_input/types/part.2.parquet | Bin .../mock_base_path/example_pipeline/cache/42.txt | 1 + .../example_pipeline_2023/component_1/manifest.json | 0 .../example_pipeline_2023/component_2/manifest.json | 0 .../example_pipeline_2024/component_1/manifest.json | 0 .../example_pipeline_2024/component_2/manifest.json | 0 tests/component/test_component.py | 4 ++-- tests/component/test_data_io.py | 4 ++-- .../mock_base_path/example_pipeline/cache/42.txt | 1 - 27 files changed, 7 insertions(+), 9 deletions(-) rename tests/{examples/example_specs/components => component/examples/component_specs}/arguments/component.yaml (100%) rename tests/{examples/example_specs/components => component/examples/component_specs}/arguments/component_default_args.yaml (100%) rename tests/{examples/example_specs/components => component/examples/component_specs}/arguments/input_manifest.json (81%) rename tests/{examples/example_specs/components => component/examples/component_specs}/component.yaml (100%) rename tests/{examples/example_specs/components => component/examples/component_specs}/input_manifest.json (100%) rename tests/{examples/example_data => component/examples/data}/components/1.yaml (100%) rename tests/{examples/example_data => component/examples/data}/manifest.json (91%) rename tests/{examples/example_data => component/examples/data}/raw/split.py (100%) rename tests/{examples/example_data => component/examples/data}/raw/testset.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/index/part.0.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/index/part.1.parquet (100%) 
rename tests/{examples/example_data => component/examples/data}/subsets_input/index/part.2.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/properties/part.0.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/properties/part.1.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/properties/part.2.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/types/part.0.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/types/part.1.parquet (100%) rename tests/{examples/example_data => component/examples/data}/subsets_input/types/part.2.parquet (100%) create mode 100644 tests/component/examples/mock_base_path/example_pipeline/cache/42.txt rename tests/{examples/example_specs => component/examples}/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json (100%) rename tests/{examples/example_specs => component/examples}/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json (100%) rename tests/{examples/example_specs => component/examples}/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json (100%) rename tests/{examples/example_specs => component/examples}/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json (100%) delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index bed4df80d..d77200da8 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -549,8 +549,6 @@ def _execute_component( # Clear divisions if component spec indicates that the index is changed if self._infer_index_change(): - # TODO: might causing issues for merging components - # to guarantee fast merging of large dataframes we need to keep the 
division information dataframe.clear_divisions() return dataframe diff --git a/tests/examples/example_specs/components/arguments/component.yaml b/tests/component/examples/component_specs/arguments/component.yaml similarity index 100% rename from tests/examples/example_specs/components/arguments/component.yaml rename to tests/component/examples/component_specs/arguments/component.yaml diff --git a/tests/examples/example_specs/components/arguments/component_default_args.yaml b/tests/component/examples/component_specs/arguments/component_default_args.yaml similarity index 100% rename from tests/examples/example_specs/components/arguments/component_default_args.yaml rename to tests/component/examples/component_specs/arguments/component_default_args.yaml diff --git a/tests/examples/example_specs/components/arguments/input_manifest.json b/tests/component/examples/component_specs/arguments/input_manifest.json similarity index 81% rename from tests/examples/example_specs/components/arguments/input_manifest.json rename to tests/component/examples/component_specs/arguments/input_manifest.json index 9ee2494f9..526faf72f 100644 --- a/tests/examples/example_specs/components/arguments/input_manifest.json +++ b/tests/component/examples/component_specs/arguments/input_manifest.json @@ -1,7 +1,7 @@ { "metadata": { "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", + "base_path": "tests/component/examples/mock_base_path", "run_id": "example_pipeline_123", "component_id": "component_1", "cache_key": "00" diff --git a/tests/examples/example_specs/components/component.yaml b/tests/component/examples/component_specs/component.yaml similarity index 100% rename from tests/examples/example_specs/components/component.yaml rename to tests/component/examples/component_specs/component.yaml diff --git a/tests/examples/example_specs/components/input_manifest.json b/tests/component/examples/component_specs/input_manifest.json similarity index 
100% rename from tests/examples/example_specs/components/input_manifest.json rename to tests/component/examples/component_specs/input_manifest.json diff --git a/tests/examples/example_data/components/1.yaml b/tests/component/examples/data/components/1.yaml similarity index 100% rename from tests/examples/example_data/components/1.yaml rename to tests/component/examples/data/components/1.yaml diff --git a/tests/examples/example_data/manifest.json b/tests/component/examples/data/manifest.json similarity index 91% rename from tests/examples/example_data/manifest.json rename to tests/component/examples/data/manifest.json index 14366fbbd..cc579fef1 100644 --- a/tests/examples/example_data/manifest.json +++ b/tests/component/examples/data/manifest.json @@ -1,7 +1,7 @@ { "metadata": { "pipeline_name": "test_pipeline", - "base_path": "tests/examples/example_data", + "base_path": "tests/component/examples/data", "run_id": "test_pipeline_12345", "component_id": "67890" }, diff --git a/tests/examples/example_data/raw/split.py b/tests/component/examples/data/raw/split.py similarity index 100% rename from tests/examples/example_data/raw/split.py rename to tests/component/examples/data/raw/split.py diff --git a/tests/examples/example_data/raw/testset.parquet b/tests/component/examples/data/raw/testset.parquet similarity index 100% rename from tests/examples/example_data/raw/testset.parquet rename to tests/component/examples/data/raw/testset.parquet diff --git a/tests/examples/example_data/subsets_input/index/part.0.parquet b/tests/component/examples/data/subsets_input/index/part.0.parquet similarity index 100% rename from tests/examples/example_data/subsets_input/index/part.0.parquet rename to tests/component/examples/data/subsets_input/index/part.0.parquet diff --git a/tests/examples/example_data/subsets_input/index/part.1.parquet b/tests/component/examples/data/subsets_input/index/part.1.parquet similarity index 100% rename from 
tests/examples/example_data/subsets_input/index/part.1.parquet rename to tests/component/examples/data/subsets_input/index/part.1.parquet diff --git a/tests/examples/example_data/subsets_input/index/part.2.parquet b/tests/component/examples/data/subsets_input/index/part.2.parquet similarity index 100% rename from tests/examples/example_data/subsets_input/index/part.2.parquet rename to tests/component/examples/data/subsets_input/index/part.2.parquet diff --git a/tests/examples/example_data/subsets_input/properties/part.0.parquet b/tests/component/examples/data/subsets_input/properties/part.0.parquet similarity index 100% rename from tests/examples/example_data/subsets_input/properties/part.0.parquet rename to tests/component/examples/data/subsets_input/properties/part.0.parquet diff --git a/tests/examples/example_data/subsets_input/properties/part.1.parquet b/tests/component/examples/data/subsets_input/properties/part.1.parquet similarity index 100% rename from tests/examples/example_data/subsets_input/properties/part.1.parquet rename to tests/component/examples/data/subsets_input/properties/part.1.parquet diff --git a/tests/examples/example_data/subsets_input/properties/part.2.parquet b/tests/component/examples/data/subsets_input/properties/part.2.parquet similarity index 100% rename from tests/examples/example_data/subsets_input/properties/part.2.parquet rename to tests/component/examples/data/subsets_input/properties/part.2.parquet diff --git a/tests/examples/example_data/subsets_input/types/part.0.parquet b/tests/component/examples/data/subsets_input/types/part.0.parquet similarity index 100% rename from tests/examples/example_data/subsets_input/types/part.0.parquet rename to tests/component/examples/data/subsets_input/types/part.0.parquet diff --git a/tests/examples/example_data/subsets_input/types/part.1.parquet b/tests/component/examples/data/subsets_input/types/part.1.parquet similarity index 100% rename from 
tests/examples/example_data/subsets_input/types/part.1.parquet rename to tests/component/examples/data/subsets_input/types/part.1.parquet diff --git a/tests/examples/example_data/subsets_input/types/part.2.parquet b/tests/component/examples/data/subsets_input/types/part.2.parquet similarity index 100% rename from tests/examples/example_data/subsets_input/types/part.2.parquet rename to tests/component/examples/data/subsets_input/types/part.2.parquet diff --git a/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt new file mode 100644 index 000000000..4a9ff8afc --- /dev/null +++ b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt @@ -0,0 +1 @@ +tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json similarity index 100% rename from tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename to tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json similarity index 100% rename from tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json rename to tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json 
b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json similarity index 100% rename from tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json rename to tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json similarity index 100% rename from tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json rename to tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json diff --git a/tests/component/test_component.py b/tests/component/test_component.py index 9f910dbcd..830ce2963 100644 --- a/tests/component/test_component.py +++ b/tests/component/test_component.py @@ -23,8 +23,8 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest, Metadata -components_path = Path(__file__).parent.parent / "examples/example_specs/components" -base_path = Path(__file__).parent.parent / "examples/example_specs/mock_base_path" +components_path = Path(__file__).parent / "examples/component_specs" +base_path = Path(__file__).parent / "examples/mock_base_path" N_PARTITIONS = 2 diff --git a/tests/component/test_data_io.py b/tests/component/test_data_io.py index ed3e9169f..30a4b7c10 100644 --- a/tests/component/test_data_io.py +++ b/tests/component/test_data_io.py @@ -8,9 +8,9 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -manifest_path = Path(__file__).parent.parent / "examples/example_data/manifest.json" +manifest_path = Path(__file__).parent / "examples/data/manifest.json" component_spec_path = ( - 
Path(__file__).parent.parent / "examples/example_data/components/1.yaml" + Path(__file__).parent / "examples/data/components/1.yaml" ) NUMBER_OF_TEST_ROWS = 151 diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt deleted file mode 100644 index 614074264..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt +++ /dev/null @@ -1 +0,0 @@ -tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file From 4dc7dc715f0f0306d8e47bd14ea75f02fe7e887d Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 22 Nov 2023 13:23:29 +0100 Subject: [PATCH 24/34] Update src/fondant/core/manifest.py Co-authored-by: Philippe Moussalli --- src/fondant/core/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 04e1dfd0b..47eeee897 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -284,7 +284,7 @@ def evolve( # noqa : PLR0912 (too many branches) # Add or update all produced fields defined in the component spec for name, field in component_spec.produces.items(): - # If field was part not part of the input manifest, add field to output manifest. + # If field was not part of the input manifest, add field to output manifest. # If field was part of the input manifest and got produced by the component, update # the manifest field. 
evolved_manifest.add_or_update_field(field, overwrite=True) From a60ca3e3e5db5aadf68e92d2be7949caf56ecc4b Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 22 Nov 2023 13:50:41 +0100 Subject: [PATCH 25/34] addresses comments --- src/fondant/core/component_spec.py | 6 +---- src/fondant/core/manifest.py | 24 ++++++++++++++----- src/fondant/core/schema.py | 1 - .../evolution_examples/1/output_manifest.json | 7 +++++- tests/core/test_manifest.py | 19 ++++++++++----- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index fa5bb6ac6..4dd945568 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -161,17 +161,13 @@ def image(self, value: str) -> None: def tags(self) -> t.List[str]: return self._specification.get("tags", None) - @property - def index(self): - return Field(name="index", location=self._specification["index"].location) - @property def consumes(self) -> t.Mapping[str, Field]: """The fields consumed by the component as an immutable mapping.""" return types.MappingProxyType( { name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification.get("produces", {}).items() + for name, field in self._specification.get("consumes", {}).items() }, ) diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 47eeee897..f1c9f76c2 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -4,6 +4,7 @@ import pkgutil import types import typing as t +from collections import OrderedDict from dataclasses import asdict, dataclass from pathlib import Path @@ -144,8 +145,8 @@ def metadata(self) -> t.Dict[str, t.Any]: return self._specification["metadata"] @property - def index(self) -> t.Dict[str, t.Any]: - return self._specification["index"] + def index(self) -> Field: + return Field(name="Index", location=self._specification["index"]["location"]) def update_metadata(self, key: 
str, value: t.Any) -> None: self.metadata[key] = value @@ -155,12 +156,15 @@ def base_path(self) -> str: return self.metadata["base_path"] @property - def field_mapping(self): + def field_mapping(self) -> t.Mapping[str, t.List[str]]: """ Retrieve a mapping of field locations to corresponding field names. A dictionary where keys are field locations and values are lists of column names. + The method returns an immutable OrderedDict where the first dict element contains the + location of the dataframe with the index. This allows an efficient left join operation. + Example: { "/base_path/component_1": ["Name", "HP"], @@ -168,7 +172,7 @@ def field_mapping(self): } """ field_mapping = {} - for field_name, field in self.fields.items(): + for field_name, field in {"Index": self.index, **self.fields}.items(): location = ( f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" ) @@ -176,7 +180,15 @@ def field_mapping(self): field_mapping[location].append(field_name) else: field_mapping[location] = [field_name] - return field_mapping + + + # Sort field mapping that the first dataset contains the index + sorted_keys = sorted(field_mapping.keys(), key=lambda key: "Index" in field_mapping[key], reverse=True) + sorted_field_mapping = OrderedDict( + (key, field_mapping[key]) for key in sorted_keys + ) + + return types.MappingProxyType(sorted_field_mapping) @property def run_id(self) -> str: @@ -221,7 +233,7 @@ def add_or_update_field(self, field: Field, overwrite: bool = False): else: self._specification["fields"][field.name] = { "location": f"/{self.component_id}", - "type": field.type.name, + "type": field.type.to_json(), } def _add_or_update_index(self, field: Field, overwrite: bool = True): diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index b8549d0d5..dc940b5f7 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -162,7 +162,6 @@ def __eq__(self, other): return False -@dataclass class Field: """Class representing a 
single field or column in a Fondant dataset.""" diff --git a/tests/core/examples/evolution_examples/1/output_manifest.json b/tests/core/examples/evolution_examples/1/output_manifest.json index e5a2d8aa0..9735cc50d 100644 --- a/tests/core/examples/evolution_examples/1/output_manifest.json +++ b/tests/core/examples/evolution_examples/1/output_manifest.json @@ -26,7 +26,12 @@ "location":"/example_component" }, "embeddings_data": { - "type": "list", + "type": { + "type": "array", + "items": { + "type": "float32" + } + }, "location":"/example_component" } } diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index 2cbfa54b9..295faf290 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -1,5 +1,6 @@ import json import pkgutil +from collections import OrderedDict from pathlib import Path import pytest @@ -229,11 +230,17 @@ def test_fields(): def test_field_mapping(valid_manifest): """Test field mapping generation.""" - Path(__file__).parent / "example_specs/manifests" manifest = Manifest(valid_manifest) + manifest.add_or_update_field(Field(name="index", location="component2")) field_mapping = manifest.field_mapping - assert field_mapping == { - "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], - "gs://bucket/test_pipeline/test_pipeline_12345/component2": ["height", "width"], - "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], - } + assert field_mapping == OrderedDict( + { + "gs://bucket/test_pipeline/test_pipeline_12345/component2": [ + "Index", + "height", + "width", + ], + "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], + "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], + } + ) From e141231438d87838efa4d7934c90096b8318e282 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 22 Nov 2023 16:16:49 +0100 Subject: [PATCH 26/34] Adjust interface for usage of produces and consumes --- src/fondant/component/data_io.py | 17 +++ 
src/fondant/component/executor.py | 15 ++- src/fondant/pipeline/pipeline.py | 59 ++++++++++- .../fourth_component/fondant_component.yaml | 38 ------- .../third_component/fondant_component.yaml | 33 ------ .../compiled_pipeline/kubeflow_pipeline.yml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../example_1/first_component/Dockerfile | 0 .../first_component/fondant_component.yaml | 12 +-- .../example_1/fourth_component/Dockerfile | 0 .../fourth_component/fondant_component.yaml | 28 +++++ .../example_1/second_component/Dockerfile | 0 .../second_component/fondant_component.yaml | 16 ++- .../example_1/third_component/Dockerfile | 0 .../third_component/fondant_component.yaml | 25 +++++ tests/{ => pipeline}/test_compiler.py | 4 +- tests/{ => pipeline}/test_pipeline.py | 100 +++++++++++++----- tests/{ => pipeline}/test_runner.py | 0 23 files changed, 223 insertions(+), 124 deletions(-) delete mode 100644 tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/compiled_pipeline/kubeflow_pipeline.yml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_1/first_component/fondant_component.yaml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_1/second_component/fondant_component.yaml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_2/first_component/fondant_component.yaml (100%) rename tests/{examples/example_pipelines => 
pipeline/examples/pipelines}/invalid_pipeline/example_2/second_component/fondant_component.yaml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_3/first_component/fondant_component.yaml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_3/second_component/fondant_component.yaml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/first_component/Dockerfile (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/first_component/fondant_component.yaml (61%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/fourth_component/Dockerfile (100%) create mode 100644 tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/second_component/Dockerfile (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/second_component/fondant_component.yaml (55%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/third_component/Dockerfile (100%) create mode 100644 tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml rename tests/{ => pipeline}/test_compiler.py (99%) rename tests/{ => pipeline}/test_pipeline.py (80%) rename tests/{ => pipeline}/test_runner.py (100%) diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index f22072d7e..6be285bea 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -27,9 +27,11 @@ def __init__( manifest: Manifest, component_spec: ComponentSpec, input_partition_rows: t.Optional[int] = None, + consumes: t.Optional[t.Dict[str, str]] = None ): 
super().__init__(manifest=manifest, component_spec=component_spec) self.input_partition_rows = input_partition_rows + self.consumes = consumes def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: """ @@ -126,8 +128,15 @@ def load_dataframe(self) -> dd.DataFrame: dataframe = self.partition_loaded_dataframe(dataframe) + logging.info(f"Columns of dataframe: {list(dataframe.columns)}") + # rename columns accordingly to the consumes + if self.consumes: + reverted_consumes = {v: k for k, v in self.consumes.items()} + dataframe = dataframe.rename(columns=reverted_consumes) + + logging.info(f"Columns of components dataframe: {list(dataframe.columns)}") return dataframe @@ -137,8 +146,10 @@ def __init__( self, *, manifest: Manifest, component_spec: ComponentSpec, + produces: t.Optional[t.Dict[str, str]] = None ): super().__init__(manifest=manifest, component_spec=component_spec) + self.produces = produces def write_dataframe( self, @@ -155,6 +166,12 @@ def write_dataframe( self.validate_dataframe_columns(dataframe, columns_to_produce) dataframe = dataframe[columns_to_produce] + + # Rename produces accordingly + if self.produces: + reverted_produces = {v: k for k, v in self.produces.items()} + dataframe = dataframe.rename(columns=reverted_produces) + write_task = self._write_dataframe(dataframe) with ProgressBar(): diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index d77200da8..f92dee6b3 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -67,6 +67,8 @@ def __init__( input_partition_rows: int, cluster_type: t.Optional[str] = None, client_kwargs: t.Optional[dict] = None, + consumes: t.Optional[t.Dict[str, str]] = None, + produces: t.Optional[t.Dict[str, str]] = None ) -> None: self.spec = spec self.cache = cache @@ -75,6 +77,8 @@ self.metadata = Metadata.from_dict(metadata) self.user_arguments = user_arguments self.input_partition_rows = input_partition_rows + 
self.consumes = consumes + self.produces = produces if cluster_type == "local": client_kwargs = client_kwargs or { @@ -112,6 +116,9 @@ def from_args(cls) -> "Executor": parser.add_argument("--input_partition_rows", type=int) parser.add_argument("--cluster_type", type=str) parser.add_argument("--client_kwargs", type=json.loads) + parser.add_argument("--consumes", type=json.loads) + parser.add_argument("--produces", type=json.loads) + args, _ = parser.parse_known_args() if "component_spec" not in args: @@ -251,11 +258,12 @@ def _execute_component( A Dask DataFrame containing the output data """ - def _write_data(self, dataframe: dd.DataFrame, *, manifest: Manifest): + def _write_data(self, dataframe: dd.DataFrame, *, manifest: Manifest, produces: t.Optional[t.Dict[str, str]]): """Create a data writer given a manifest and writes out the index and subsets.""" data_writer = DaskDataWriter( manifest=manifest, component_spec=self.spec, + produces=produces ) data_writer.write_dataframe(dataframe, self.client) @@ -340,7 +348,7 @@ def _run_execution( component_spec=self.spec, run_id=self.metadata.run_id, ) - self._write_data(dataframe=output_df, manifest=output_manifest) + self._write_data(dataframe=output_df, manifest=output_manifest, produces=self.produces) return output_manifest @@ -478,6 +486,7 @@ def _execute_component( manifest=manifest, component_spec=self.spec, input_partition_rows=self.input_partition_rows, + consumes=self.consumes ) dataframe = data_loader.load_dataframe() return component.transform(dataframe) @@ -530,6 +539,7 @@ def _execute_component( manifest=manifest, component_spec=self.spec, input_partition_rows=self.input_partition_rows, + consumes=self.consumes ) dataframe = data_loader.load_dataframe() @@ -591,6 +601,7 @@ def _execute_component( manifest=manifest, component_spec=self.spec, input_partition_rows=self.input_partition_rows, + consumes=self.consumes ) dataframe = data_loader.load_dataframe() component.write(dataframe) diff --git 
a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index 36f81b7db..c8a236fe3 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -138,6 +138,8 @@ def __init__( cluster_type: t.Optional[str] = "default", client_kwargs: t.Optional[dict] = None, resources: t.Optional[Resources] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, + produces: t.Optional[t.Dict[str, t.Any]] = None ) -> None: self.component_dir = Path(component_dir) self.input_partition_rows = input_partition_rows @@ -154,6 +156,8 @@ def __init__( self._add_component_argument("cache", self.cache) self._add_component_argument("cluster_type", cluster_type) self._add_component_argument("client_kwargs", client_kwargs) + self._add_component_argument("consumes", consumes) + self._add_component_argument("produces", produces) self.arguments.setdefault("component_spec", self.component_spec.specification) @@ -221,6 +225,8 @@ def from_registry( cache: t.Optional[bool] = True, cluster_type: t.Optional[str] = "default", client_kwargs: t.Optional[dict] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, + produces: t.Optional[t.Dict[str, t.Any]] = None ) -> "ComponentOp": """Load a reusable component by its name. 
@@ -248,6 +254,8 @@ def from_registry( cache=cache, cluster_type=cluster_type, client_kwargs=client_kwargs, + consumes=consumes, + produces=produces ) def get_component_cache_key( @@ -319,11 +327,54 @@ def __init__( self._graph: t.OrderedDict[str, t.Any] = OrderedDict() self.task_without_dependencies_added = False + + def apply(self, name, consumes, produces) -> "Pipeline": + """ + Args: + name: Either component name from registry or local path + consumes: + produces: + + Returns: + + """ + for field in consumes: + setattr(self, field, field) + + for field in produces: + setattr(self, field, field) + + component_op = self._get_component_op(name, consumes, produces) + previous_component = list(self._graph.items())[-1] + + if previous_component is None: + msg = f"No previous component found." + raise ValueError(msg) + + + self.add_op(component_op, dependencies=previous_component) + return self + + + + @staticmethod + def _get_component_op(path_or_name, consumes, produces): + """Return either a custom component when name is a path, or a component from registry.""" + components_dir: Path = Path(path_or_name) + if (components_dir.exists() and components_dir.is_dir()): + return ComponentOp(component_dir=path_or_name, consumes=consumes, produces=produces) + else: + components_dir: Path = t.cast(Path, files("fondant") / f"components/{path_or_name}") + if not (components_dir.exists() and components_dir.is_dir()): + msg = f"No reusable component with name {path_or_name} found." + raise ValueError(msg) + return ComponentOp.from_registry(path_or_name, consumes=consumes, produces=produces) + def add_op( self, task: ComponentOp, dependencies: t.Optional[t.Union[ComponentOp, t.List[ComponentOp]]] = None, - ): + ) -> "Pipeline": """ Add a task to the pipeline with an optional dependency. @@ -332,6 +383,8 @@ def add_op( dependencies: Optional task dependencies that needs to be completed before the task can run. 
""" + + if dependencies is None: if self.task_without_dependencies_added: msg = "At most one task can be defined without dependencies." @@ -361,6 +414,8 @@ def add_op( "dependencies": dependencies_names, } + return self + def sort_graph(self): """Sort the graph topologically based on task dependencies.""" logger.info("Sorting pipeline component graph topologically.") @@ -494,4 +549,4 @@ def _validate_pipeline_definition(self, run_id: str): def __repr__(self) -> str: """Return a string representation of the FondantPipeline object.""" - return f"{self.__class__.__name__}({self._graph!r}" + return f"{self.__class__.__name__}({self._graph!r}" \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml deleted file mode 100644 index 3cda0cc6c..000000000 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: Fourth component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - images: - fields: - data: - type: binary - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str - some_list: - description: Some list - type: list - items: - type: int \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml deleted file mode 100644 index 091a7d9d5..000000000 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml +++ /dev/null @@ -1,33 +0,0 @@ 
-name: Third component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - images: - fields: - data: - type: binary - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml similarity index 100% rename from tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml rename to tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml similarity index 100% rename from tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml similarity index 100% rename from tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml similarity index 100% rename from 
tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml similarity index 100% rename from tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml similarity index 100% rename from tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml similarity index 100% rename from tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile diff --git 
a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml similarity index 61% rename from tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml index 18ea49b2c..0841688e9 100644 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml @@ -3,15 +3,11 @@ description: This is an example component image: example_component:latest produces: - images: - fields: - data: - type: binary + images_data: + type: binary - captions: - fields: - data: - type: string + captions_data: + type: string args: storage_args: diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml new file mode 100644 index 000000000..42ce65084 --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml @@ -0,0 +1,28 @@ +name: Fourth component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + embeddings_data: + type: array + items: + type: float32 + 
+produces_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str + some_list: + description: Some list + type: list + items: + type: int \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml similarity index 55% rename from tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml index 2f9907df1..fa328ae01 100644 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml @@ -3,18 +3,14 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: storage_args: diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile rename to 
tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml new file mode 100644 index 000000000..e2b3082d8 --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml @@ -0,0 +1,25 @@ +name: Third component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + embeddings_data: + type: array + items: + type: float32 + +produces: + images_data: + type: binary + + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/test_compiler.py b/tests/pipeline/test_compiler.py similarity index 99% rename from tests/test_compiler.py rename to tests/pipeline/test_compiler.py index 429acbb05..d891fd31a 100644 --- a/tests/test_compiler.py +++ b/tests/pipeline/test_compiler.py @@ -18,9 +18,9 @@ VertexPipelineConfigs, ) -COMPONENTS_PATH = Path("./tests/example_pipelines/valid_pipeline") +COMPONENTS_PATH = Path("./tests/pipeline/examples/pipelines/valid_pipeline") -VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") +VALID_PIPELINE = Path("./tests/pipeline/examples/pipelines/compiled_pipeline/") TEST_PIPELINES = [ ( diff --git a/tests/test_pipeline.py b/tests/pipeline/test_pipeline.py similarity index 80% rename from tests/test_pipeline.py rename to tests/pipeline/test_pipeline.py index 37d421ef6..5409bad17 100644 --- a/tests/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -8,8 +8,8 @@ from fondant.core.exceptions import InvalidPipelineDefinition from fondant.pipeline import ComponentOp, Pipeline, Resources -valid_pipeline_path = Path(__file__).parent / "example_pipelines/valid_pipeline" -invalid_pipeline_path = Path(__file__).parent 
/ "example_pipelines/invalid_pipeline" +valid_pipeline_path = Path(__file__).parent / "examples/pipelines/valid_pipeline" +invalid_pipeline_path = Path(__file__).parent / "examples/pipelines/invalid_pipeline" def yaml_file_to_dict(file_path): @@ -29,13 +29,13 @@ def default_pipeline_args(): "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) def test_component_op( - valid_pipeline_example, + valid_pipeline_example, ): component_args = {"storage_args": "a dummy string arg"} example_dir, component_names = valid_pipeline_example @@ -65,18 +65,60 @@ def test_component_op( ) +def test_new_pipeline_interface(): + pipeline = Pipeline( + pipeline_name="my_pipeline", + pipeline_description="description of my pipeline", + base_path="/foo/bar", + ) + + dataset = pipeline.read( + name="load_images", + schema={ + "image": "binary" # or pa.binary() + } + ) + + dataset = dataset.apply( + name="caption_images", + consumes={ + "images_data": "image" + }, + produces={ + "captions": "text" + } + ) + + dataset = dataset.apply( + component_dir="embed_text", + consumes={ + "text_data": "text" + } + ) + + dataset.write( + name="write_data", + schema={ + "image": "image", + "caption": "text" + } + ) + + assert True + + @pytest.mark.parametrize( "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) def test_component_op_hash( - valid_pipeline_example, - monkeypatch, + valid_pipeline_example, + monkeypatch, ): example_dir, component_names = valid_pipeline_example components_path = Path(valid_pipeline_path / example_dir) @@ -99,16 +141,16 @@ def test_component_op_hash( comp_0_op_spec_0_copy = copy.deepcopy(comp_0_op_spec_0) assert ( - comp_0_op_spec_0.get_component_cache_key() - != 
comp_0_op_spec_1.get_component_cache_key() + comp_0_op_spec_0.get_component_cache_key() + != comp_0_op_spec_1.get_component_cache_key() ) assert ( - comp_0_op_spec_0.get_component_cache_key() - == comp_0_op_spec_0_copy.get_component_cache_key() + comp_0_op_spec_0.get_component_cache_key() + == comp_0_op_spec_0_copy.get_component_cache_key() ) assert ( - comp_0_op_spec_0.get_component_cache_key() - != comp_1_op_spec_0.get_component_cache_key() + comp_0_op_spec_0.get_component_cache_key() + != comp_1_op_spec_0.get_component_cache_key() ) @@ -135,16 +177,16 @@ def test_component_op_caching_strategy(monkeypatch): "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) def test_valid_pipeline( - default_pipeline_args, - valid_pipeline_example, - tmp_path, - monkeypatch, + default_pipeline_args, + valid_pipeline_example, + tmp_path, + monkeypatch, ): """Test that a valid pipeline definition can be compiled without errors.""" example_dir, component_names = valid_pipeline_example @@ -190,8 +232,8 @@ def test_valid_pipeline( "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) @@ -234,8 +276,8 @@ def test_invalid_pipeline_dependencies(default_pipeline_args, valid_pipeline_exa ], ) def test_invalid_pipeline_declaration( - default_pipeline_args, - invalid_pipeline_example, + default_pipeline_args, + invalid_pipeline_example, ): """Test that an InvalidPipelineDefinition exception is raised when attempting to register invalid components combinations. 
@@ -304,8 +346,8 @@ def test_reusable_component_op(): component_name = "this_component_does_not_exist" with pytest.raises( - ValueError, - match=f"No reusable component with name {component_name} " "found.", + ValueError, + match=f"No reusable component with name {component_name} " "found.", ): ComponentOp.from_registry( name=component_name, @@ -332,8 +374,8 @@ def test_defining_reusable_component_op_with_custom_spec(): ) assert ( - load_from_hub_custom_op.component_spec - == load_from_hub_default_op.component_spec + load_from_hub_custom_op.component_spec + == load_from_hub_default_op.component_spec ) diff --git a/tests/test_runner.py b/tests/pipeline/test_runner.py similarity index 100% rename from tests/test_runner.py rename to tests/pipeline/test_runner.py From f3e0a6a16814d2d163e46d312088da9cf00f2f3a Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 22 Nov 2023 17:00:19 +0100 Subject: [PATCH 27/34] Adjust interface for usage of schema, consumes, and produces --- .../caption_images/fondant_component.yaml | 12 +- components/embed_text/fondant_component.yaml | 26 +-- .../load_from_hf_hub/fondant_component.yaml | 12 +- src/fondant/component/component.py | 2 +- src/fondant/component/executor.py | 11 +- src/fondant/pipeline/pipeline.py | 220 ++++++++++++------ tests/pipeline/test_pipeline.py | 4 +- 7 files changed, 181 insertions(+), 106 deletions(-) diff --git a/components/caption_images/fondant_component.yaml b/components/caption_images/fondant_component.yaml index 7a72cd815..3da8e4720 100644 --- a/components/caption_images/fondant_component.yaml +++ b/components/caption_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - text: - type: utf8 + captions_text: + type: utf8 args: model_id: diff --git a/components/embed_text/fondant_component.yaml b/components/embed_text/fondant_component.yaml index 2e34c5c0a..2866a2beb 
100644 --- a/components/embed_text/fondant_component.yaml +++ b/components/embed_text/fondant_component.yaml @@ -5,21 +5,17 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 - + text_data: + type: string + embedding: + type: array + items: + type: float32 + args: model_provider: description: | @@ -40,12 +36,12 @@ args: Pass only the keys required by the model provider or conveniently pass all keys you will ever need. Pay attention how to name the dictionary keys so that they can be used by the model provider. type: dict - default: {} + default: { } auth_kwargs: description: | Additional keyword arguments required for api initialization/authentication. type: dict - default: {} + default: { } \ No newline at end of file diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index d6a625971..c3780ac5b 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data loading produces: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: + type: binary args: dataset_name: @@ -17,12 +15,12 @@ args: column_name_mapping: description: Mapping of the consumed hub dataset to fondant column names type: dict - default: {} + default: { } image_column_names: - description: Optional argument, a list containing the original image column names in case the + description: Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. type: list - default: [] + default: [ ] n_rows_to_load: description: Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale type: int diff --git a/src/fondant/component/component.py b/src/fondant/component/component.py index dd094ff94..5b33a3876 100644 --- a/src/fondant/component/component.py +++ b/src/fondant/component/component.py @@ -16,7 +16,7 @@ class BaseComponent: **kwargs: The provided user arguments are passed in as keyword arguments """ - def __init__(self, spec: ComponentSpec, **kwargs): + def __init__(self, spec: ComponentSpec, schema: t.Optional[t.Dict[str, str]] = None, **kwargs): pass diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index f92dee6b3..31ef923c8 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -67,6 +67,7 @@ def __init__( input_partition_rows: int, cluster_type: t.Optional[str] = None, client_kwargs: t.Optional[dict] = None, + schema: t.Optional[t.Dict[str, str]] = None, consumes: t.Optional[t.Dict[str, str]] = None, produces: t.Optional[t.Dict[str, str]] = None ) -> None: @@ -77,6 +78,7 @@ def __init__( self.metadata = Metadata.from_dict(metadata) self.user_arguments = user_arguments self.input_partition_rows = input_partition_rows + self.schema = schema self.consumes = consumes self.produces = produces @@ -116,6 +118,7 @@ def from_args(cls) -> "Executor": parser.add_argument("--input_partition_rows", type=int) parser.add_argument("--cluster_type", type=str) parser.add_argument("--client_kwargs", type=json.loads) + parser.add_argument("--schema", type=json.loads) parser.add_argument("--consumes", type=json.loads) parser.add_argument("--produces", type=json.loads) @@ -136,7 +139,7 @@ def from_args(cls) -> "Executor": cache=cache, input_partition_rows=input_partition_rows, cluster_type=cluster_type, - client_kwargs=client_kwargs, + client_kwargs=client_kwargs ) @classmethod @@ -147,7 +150,7 @@ def from_spec( cache: bool, input_partition_rows: int, cluster_type: t.Optional[str], - client_kwargs: t.Optional[dict], + client_kwargs: 
t.Optional[dict] ) -> "Executor": """Create an executor from a component spec.""" args_dict = vars(cls._add_and_parse_args(component_spec)) @@ -181,7 +184,7 @@ def from_spec( user_arguments=args_dict, input_partition_rows=input_partition_rows, cluster_type=cluster_type, - client_kwargs=client_kwargs, + client_kwargs=client_kwargs ) @classmethod @@ -339,7 +342,7 @@ def _run_execution( input_manifest: Manifest, ) -> Manifest: logging.info("Executing component") - component = component_cls(self.spec, **self.user_arguments) + component = component_cls(self.spec, self.schema, **self.user_arguments) output_df = self._execute_component( component, manifest=input_manifest, diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index c8a236fe3..a04ce9c66 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -129,17 +129,18 @@ class ComponentOp: COMPONENT_SPEC_NAME = "fondant_component.yaml" def __init__( - self, - component_dir: t.Union[str, Path], - *, - arguments: t.Optional[t.Dict[str, t.Any]] = None, - input_partition_rows: t.Optional[t.Union[str, int]] = None, - cache: t.Optional[bool] = True, - cluster_type: t.Optional[str] = "default", - client_kwargs: t.Optional[dict] = None, - resources: t.Optional[Resources] = None, - consumes: t.Optional[t.Dict[str, t.Any]] = None, - produces: t.Optional[t.Dict[str, t.Any]] = None + self, + component_dir: t.Union[str, Path], + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + schema: t.Optional[t.Dict[str, t.Any]] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, + produces: t.Optional[t.Dict[str, t.Any]] = None ) -> None: self.component_dir = Path(component_dir) self.input_partition_rows = input_partition_rows @@ -156,16 +157,16 @@ 
def __init__( self._add_component_argument("cache", self.cache) self._add_component_argument("cluster_type", cluster_type) self._add_component_argument("client_kwargs", client_kwargs) + self._add_component_argument("schema", schema) self._add_component_argument("consumes", consumes) self._add_component_argument("produces", produces) - self.arguments.setdefault("component_spec", self.component_spec.specification) self.resources = resources or Resources() def _configure_caching_from_image_tag( - self, - cache: t.Optional[bool], + self, + cache: t.Optional[bool], ) -> t.Optional[bool]: """ Adjusts the caching setting based on the image tag of the component. @@ -196,10 +197,10 @@ def _configure_caching_from_image_tag( return cache def _add_component_argument( - self, - argument_name: str, - argument_value: t.Any, - validator: t.Optional[t.Callable] = None, + self, + argument_name: str, + argument_value: t.Any, + validator: t.Optional[t.Callable] = None, ): """Register component argument to arguments dict as well as component attributes.""" if hasattr(self, "arguments") is False: @@ -216,17 +217,17 @@ def dockerfile_path(self) -> t.Optional[Path]: @classmethod def from_registry( - cls, - name: str, - *, - arguments: t.Optional[t.Dict[str, t.Any]] = None, - input_partition_rows: t.Optional[t.Union[int, str]] = None, - resources: t.Optional[Resources] = None, - cache: t.Optional[bool] = True, - cluster_type: t.Optional[str] = "default", - client_kwargs: t.Optional[dict] = None, - consumes: t.Optional[t.Dict[str, t.Any]] = None, - produces: t.Optional[t.Dict[str, t.Any]] = None + cls, + name: str, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[int, str]] = None, + resources: t.Optional[Resources] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, + produces: t.Optional[t.Dict[str, t.Any]] = 
None ) -> "ComponentOp": """Load a reusable component by its name. @@ -259,8 +260,8 @@ def from_registry( ) def get_component_cache_key( - self, - previous_component_cache: t.Optional[str] = None, + self, + previous_component_cache: t.Optional[str] = None, ) -> str: """Calculate a cache key representing the unique identity of this ComponentOp. @@ -309,10 +310,10 @@ class Pipeline: """Class representing a Fondant Pipeline.""" def __init__( - self, - base_path: str, - pipeline_name: str, - pipeline_description: t.Optional[str] = None, + self, + base_path: str, + pipeline_name: str, + pipeline_description: t.Optional[str] = None, ): """ Args: @@ -327,53 +328,135 @@ def __init__( self._graph: t.OrderedDict[str, t.Any] = OrderedDict() self.task_without_dependencies_added = False + def read(self, + name, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + schema: t.Optional[t.Dict[str, str]] = None) -> "Pipeline": + + component_op = self._build_component_op(name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + schema=schema) + + self.add_op(component_op) + return self - def apply(self, name, consumes, produces) -> "Pipeline": - """ - Args: - name: Either component name from registry or local path - consumes: - produces: - - Returns: - """ - for field in consumes: - setattr(self, field, field) + def apply(self, + name, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + consumes: 
t.Optional[t.Dict[str, str]] = None, + produces: t.Optional[t.Dict[str, str]] = None) -> "Pipeline": + + component_op = self._build_component_op(name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + consumes=consumes, + produces=produces) + + # Get previous component + previous_component = list(self._graph.items())[-1][1]["fondant_component_op"] + if previous_component is None: + msg = f"No previous component found." + raise ValueError(msg) - for field in produces: - setattr(self, field, field) + self.add_op(component_op, dependencies=previous_component) + return self - component_op = self._get_component_op(name, consumes, produces) + def write(self, + name, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + consumes: t.Optional[t.Dict[str, str]] = None, + schema: t.Optional[t.Dict[str, str]] = None) -> "Pipeline": + + component_op = self._build_component_op(name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + consumes=consumes, + schema=schema) + + # Get previous component previous_component = list(self._graph.items())[-1] - if previous_component is None: msg = f"No previous component found." 
raise ValueError(msg) - self.add_op(component_op, dependencies=previous_component) return self - + def _build_component_op(self, name, *, arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + schema: t.Optional[t.Dict[str, t.Any]] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, + produces: t.Optional[t.Dict[str, t.Any]] = None): + + if not self._is_custom_component(path_or_name=name): + name = self._get_registry_path(name) + component_op = ComponentOp(name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + schema=schema, + consumes=consumes, + produces=produces) + return component_op @staticmethod - def _get_component_op(path_or_name, consumes, produces): - """Return either a custom component when name is a path, or a component from registry.""" + def _is_custom_component(path_or_name): + """Checks if name is a local path and a custom component.""" components_dir: Path = Path(path_or_name) - if (components_dir.exists() and components_dir.is_dir()): - return ComponentOp(component_dir=path_or_name, consumes=consumes, produces=produces) - else: - components_dir: Path = t.cast(Path, files("fondant") / f"components/{path_or_name}") - if not (components_dir.exists() and components_dir.is_dir()): - msg = f"No reusable component with name {path_or_name} found." 
- raise ValueError(msg) - return ComponentOp.from_registry(path_or_name, consumes=consumes, produces=produces) + return components_dir.exists() and components_dir.is_dir() + + @ staticmethod + def _get_registry_path(name): + """Checks if name is a local path and a custom component.""" + components_dir: Path = t.cast(Path, files("fondant") / f"components/{name}") + if not (components_dir.exists() and components_dir.is_dir()): + msg = f"No reusable component with name {name} found." + raise ValueError(msg) + return components_dir def add_op( - self, - task: ComponentOp, - dependencies: t.Optional[t.Union[ComponentOp, t.List[ComponentOp]]] = None, + self, + task: ComponentOp, + dependencies: t.Optional[t.Union[ComponentOp, t.List[ComponentOp]]] = None, ) -> "Pipeline": """ Add a task to the pipeline with an optional dependency. @@ -384,7 +467,6 @@ def add_op( can run. """ - if dependencies is None: if self.task_without_dependencies_added: msg = "At most one task can be defined without dependencies." 
@@ -498,8 +580,8 @@ def _validate_pipeline_definition(self, run_id: str): if not load_component: # Check subset exists for ( - component_subset_name, - component_subset, + component_subset_name, + component_subset, ) in component_spec.consumes.items(): if component_subset_name not in manifest.subsets: msg = ( @@ -549,4 +631,4 @@ def _validate_pipeline_definition(self, run_id: str): def __repr__(self) -> str: """Return a string representation of the FondantPipeline object.""" - return f"{self.__class__.__name__}({self._graph!r}" \ No newline at end of file + return f"{self.__class__.__name__}({self._graph!r}" diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 5409bad17..30c4cdb1b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -73,7 +73,7 @@ def test_new_pipeline_interface(): ) dataset = pipeline.read( - name="load_images", + name="load_from_hf_hub", schema={ "image": "binary" # or pa.binary() } @@ -90,7 +90,7 @@ def test_new_pipeline_interface(): ) dataset = dataset.apply( - component_dir="embed_text", + name="embed_text", consumes={ "text_data": "text" } From b4fe222601fa692a1c5e3779ef9702f4eb1f2728 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 23 Nov 2023 10:37:32 +0100 Subject: [PATCH 28/34] Update core package (#653) First PR related to the data structure redesign. Implements the following: - New manifest structure (including validation, and evolution) - New ComponentSpec structure (including validation) - Removes `Subsets` and `Index` Not all tests are running successfully. But this are already quite a few changes. Therefore, I've created PR on feature branch `feature/redesign-dataset-format-and-interface`, to have quicker feedback loops. 
--------- Co-authored-by: Robbe Sneyders Co-authored-by: Philippe Moussalli --- src/fondant/core/component_spec.py | 58 +---- src/fondant/core/manifest.py | 239 ++++++++--------- src/fondant/core/schema.py | 31 ++- src/fondant/core/schemas/component_spec.json | 32 +-- src/fondant/core/schemas/manifest.json | 17 +- .../component_specs/invalid_component.yaml} | 10 +- .../component_specs/kubeflow_component.yaml | 0 .../component_specs/valid_component.yaml} | 21 +- .../valid_component_no_args.yaml | 13 +- .../evolution_examples/1/component.yaml} | 14 +- .../evolution_examples/1/output_manifest.json | 36 +++ .../evolution_examples/2}/component.yaml | 10 +- .../evolution_examples/2/output_manifest.json | 33 +++ .../evolution_examples/3/component.yaml | 16 ++ .../evolution_examples/3/output_manifest.json | 29 +++ .../evolution_examples/4/component.yaml | 12 + .../evolution_examples/4/output_manifest.json | 29 +++ .../evolution_examples/input_manifest.json | 29 +++ .../examples/manifests/invalid_manifest.json | 14 + .../examples/manifests/valid_manifest.json | 29 +++ tests/{ => core}/test_component_specs.py | 28 +- tests/core/test_manifest.py | 246 ++++++++++++++++++ tests/{ => core}/test_manifest_evolution.py | 9 +- tests/{ => core}/test_schema.py | 0 .../component_specs/valid_component.yaml | 29 --- .../components/input_manifest.json | 22 -- .../evolution_examples/1/output_manifest.json | 46 ---- .../evolution_examples/2/component.yaml | 23 -- .../evolution_examples/2/output_manifest.json | 38 --- .../evolution_examples/3/component.yaml | 24 -- .../evolution_examples/3/output_manifest.json | 32 --- .../evolution_examples/4/output_manifest.json | 38 --- .../evolution_examples/5/component.yaml | 21 -- .../evolution_examples/5/output_manifest.json | 29 --- .../evolution_examples/6/component.yaml | 22 -- .../evolution_examples/6/output_manifest.json | 21 -- .../evolution_examples/7/component.yaml | 22 -- .../evolution_examples/7/output_manifest.json | 21 -- 
.../evolution_examples/8/output_manifest.json | 35 --- .../evolution_examples/input_manifest.json | 35 --- .../manifests/invalid_manifest.json | 14 - .../manifests/valid_manifest.json | 35 --- .../component_1/manifest.json | 36 --- .../example_component/Dockerfile | 0 .../example_component/fondant_component.yaml | 0 .../example_data/components/1.yaml | 0 .../{ => examples}/example_data/manifest.json | 0 .../{ => examples}/example_data/raw/split.py | 0 .../example_data/raw/testset.parquet | Bin .../subsets_input/index/part.0.parquet | Bin .../subsets_input/index/part.1.parquet | Bin .../subsets_input/index/part.2.parquet | Bin .../subsets_input/properties/part.0.parquet | Bin .../subsets_input/properties/part.1.parquet | Bin .../subsets_input/properties/part.2.parquet | Bin .../subsets_input/types/part.0.parquet | Bin .../subsets_input/types/part.1.parquet | Bin .../subsets_input/types/part.2.parquet | Bin .../example_modules/component.py | 0 .../example_modules/invalid_component.py | 0 .../invalid_double_components.py | 0 .../invalid_double_pipeline.py | 0 .../example_modules/pipeline.py | 0 .../compiled_pipeline/kubeflow_pipeline.yml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../first_component/fondant_component.yaml | 0 .../second_component/fondant_component.yaml | 0 .../example_1/first_component/Dockerfile | 0 .../first_component/fondant_component.yaml | 0 .../example_1/fourth_component/Dockerfile | 0 .../fourth_component/fondant_component.yaml | 0 .../example_1/second_component/Dockerfile | 0 .../second_component/fondant_component.yaml | 0 .../example_1/third_component/Dockerfile | 0 .../third_component/fondant_component.yaml | 0 .../components/arguments/component.yaml | 0 .../arguments/component_default_args.yaml | 0 .../components/arguments/input_manifest.json | 14 +- 
.../example_specs/components/component.yaml | 18 +- .../components/input_manifest.json | 17 ++ .../example_pipeline/cache/42.txt | 0 .../component_1/manifest.json | 31 +++ .../component_2/manifest.json | 0 .../component_1/manifest.json | 0 .../component_2/manifest.json | 0 tests/test_component.py | 44 +--- tests/test_manifest.py | 239 ----------------- 90 files changed, 745 insertions(+), 1116 deletions(-) rename tests/{example_specs/evolution_examples/4/component.yaml => core/examples/component_specs/invalid_component.yaml} (84%) rename tests/{example_specs => core/examples}/component_specs/kubeflow_component.yaml (100%) rename tests/{example_specs/evolution_examples/1/component.yaml => core/examples/component_specs/valid_component.yaml} (62%) rename tests/{example_specs => core/examples}/component_specs/valid_component_no_args.yaml (59%) rename tests/{example_specs/component_specs/invalid_component.yaml => core/examples/evolution_examples/1/component.yaml} (59%) create mode 100644 tests/core/examples/evolution_examples/1/output_manifest.json rename tests/{example_specs/evolution_examples/8 => core/examples/evolution_examples/2}/component.yaml (69%) create mode 100644 tests/core/examples/evolution_examples/2/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/3/component.yaml create mode 100644 tests/core/examples/evolution_examples/3/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/4/component.yaml create mode 100644 tests/core/examples/evolution_examples/4/output_manifest.json create mode 100644 tests/core/examples/evolution_examples/input_manifest.json create mode 100644 tests/core/examples/manifests/invalid_manifest.json create mode 100644 tests/core/examples/manifests/valid_manifest.json rename tests/{ => core}/test_component_specs.py (85%) create mode 100644 tests/core/test_manifest.py rename tests/{ => core}/test_manifest_evolution.py (83%) rename tests/{ => core}/test_schema.py (100%) delete mode 
100644 tests/example_specs/component_specs/valid_component.yaml delete mode 100644 tests/example_specs/components/input_manifest.json delete mode 100644 tests/example_specs/evolution_examples/1/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/2/component.yaml delete mode 100644 tests/example_specs/evolution_examples/2/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/3/component.yaml delete mode 100644 tests/example_specs/evolution_examples/3/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/4/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/5/component.yaml delete mode 100644 tests/example_specs/evolution_examples/5/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/6/component.yaml delete mode 100644 tests/example_specs/evolution_examples/6/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/7/component.yaml delete mode 100644 tests/example_specs/evolution_examples/7/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/8/output_manifest.json delete mode 100644 tests/example_specs/evolution_examples/input_manifest.json delete mode 100644 tests/example_specs/manifests/invalid_manifest.json delete mode 100644 tests/example_specs/manifests/valid_manifest.json delete mode 100644 tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename tests/{ => examples}/example_component/Dockerfile (100%) rename tests/{ => examples}/example_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_data/components/1.yaml (100%) rename tests/{ => examples}/example_data/manifest.json (100%) rename tests/{ => examples}/example_data/raw/split.py (100%) rename tests/{ => examples}/example_data/raw/testset.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.0.parquet (100%) rename tests/{ => 
examples}/example_data/subsets_input/index/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/index/part.2.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.0.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/properties/part.2.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.0.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.1.parquet (100%) rename tests/{ => examples}/example_data/subsets_input/types/part.2.parquet (100%) rename tests/{ => examples}/example_modules/component.py (100%) rename tests/{ => examples}/example_modules/invalid_component.py (100%) rename tests/{ => examples}/example_modules/invalid_double_components.py (100%) rename tests/{ => examples}/example_modules/invalid_double_pipeline.py (100%) rename tests/{ => examples}/example_modules/pipeline.py (100%) rename tests/{ => examples}/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile (100%) rename tests/{ => 
examples}/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile (100%) rename tests/{ => examples}/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml (100%) rename tests/{ => examples}/example_specs/components/arguments/component.yaml (100%) rename tests/{ => examples}/example_specs/components/arguments/component_default_args.yaml (100%) rename tests/{ => examples}/example_specs/components/arguments/input_manifest.json (60%) rename tests/{ => examples}/example_specs/components/component.yaml (56%) create mode 100644 tests/examples/example_specs/components/input_manifest.json rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/cache/42.txt (100%) create mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json (100%) rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json (100%) rename tests/{ => examples}/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json (100%) delete mode 100644 tests/test_manifest.py diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index cf177e07c..4dd945568 100644 --- 
a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -66,34 +66,6 @@ def kubeflow_type(self) -> str: return lookup[self.type] -class ComponentSubset: - """ - Class representing a Fondant Component subset. - - Args: - specification: the part of the component json representing the subset - """ - - def __init__(self, specification: t.Dict[str, t.Any]) -> None: - self._specification = specification - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - @property - def fields(self) -> t.Mapping[str, Field]: - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - @property - def additional_fields(self) -> bool: - return self._specification.get("additionalFields", True) - - class ComponentSpec: """ Class representing a Fondant component specification. @@ -190,39 +162,25 @@ def tags(self) -> t.List[str]: return self._specification.get("tags", None) @property - def index(self): - return ComponentSubset({"fields": {}}) - - @property - def consumes(self) -> t.Mapping[str, ComponentSubset]: - """The subsets consumed by the component as an immutable mapping.""" + def consumes(self) -> t.Mapping[str, Field]: + """The fields consumed by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in self._specification.get("consumes", {}).items() - if name != "additionalSubsets" + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification.get("consumes", {}).items() }, ) @property - def produces(self) -> t.Mapping[str, ComponentSubset]: - """The subsets produced by the component as an immutable mapping.""" + def produces(self) -> t.Mapping[str, Field]: + """The fields produced by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in 
self._specification.get("produces", {}).items() - if name != "additionalSubsets" + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification.get("produces", {}).items() }, ) - @property - def accepts_additional_subsets(self) -> bool: - return self._specification.get("consumes", {}).get("additionalSubsets", True) - - @property - def outputs_additional_subsets(self) -> bool: - return self._specification.get("produces", {}).get("additionalSubsets", True) - @property def args(self) -> t.Mapping[str, Argument]: args = self.default_arguments diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 692c4e7cd..fc750620d 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -4,6 +4,7 @@ import pkgutil import types import typing as t +from collections import OrderedDict from dataclasses import asdict, dataclass from pathlib import Path @@ -18,59 +19,6 @@ from fondant.core.schema import Field, Type -class Subset: - """ - Class representing a Fondant subset. 
- - Args: - specification: The part of the manifest json representing the subset - base_path: The base path which the subset location is defined relative to - """ - - def __init__(self, specification: dict, *, base_path: str) -> None: - self._specification = specification - self._base_path = base_path - - @property - def location(self) -> str: - """The absolute location of the subset.""" - return self._base_path + self._specification["location"] - - @property - def fields(self) -> t.Mapping[str, Field]: - """The fields of the subset returned as an immutable mapping.""" - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - def add_field(self, name: str, type_: Type, *, overwrite: bool = False) -> None: - if not overwrite and name in self._specification["fields"]: - msg = f"A field with name {name} already exists" - raise ValueError(msg) - - self._specification["fields"][name] = type_.to_json() - - def remove_field(self, name: str) -> None: - del self._specification["fields"][name] - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - -class Index(Subset): - """Special case of a subset for the index, which has fixed fields.""" - - @property - def fields(self) -> t.Dict[str, Field]: - return { - "id": Field(name="id", type=Type("string")), - "source": Field(name="source", type=Type("string")), - } - - @dataclass class Metadata: """ @@ -171,8 +119,8 @@ def create( specification = { "metadata": metadata.to_dict(), - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, - "subsets": {}, + "index": {"location": f"/{component_id}"}, + "fields": {}, } return cls(specification) @@ -196,6 +144,10 @@ def copy(self) -> "Manifest": def metadata(self) -> t.Dict[str, t.Any]: return self._specification["metadata"] + @property + def index(self) -> Field: + return Field(name="Index", 
location=self._specification["index"]["location"]) + def update_metadata(self, key: str, value: t.Any) -> None: self.metadata[key] = value @@ -203,6 +155,44 @@ def update_metadata(self, key: str, value: t.Any) -> None: def base_path(self) -> str: return self.metadata["base_path"] + @property + def field_mapping(self) -> t.Mapping[str, t.List[str]]: + """ + Retrieve a mapping of field locations to corresponding field names. + A dictionary where keys are field locations and values are lists + of column names. + + The method returns an immutable OrderedDict where the first dict element contains the + location of the dataframe with the index. This allows an efficient left join operation. + + Example: + { + "/base_path/component_1": ["Name", "HP"], + "/base_path/component_2": ["Type 1", "Type 2"], + } + """ + field_mapping = {} + for field_name, field in {"id": self.index, **self.fields}.items(): + location = ( + f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" + ) + if location in field_mapping: + field_mapping[location].append(field_name) + else: + field_mapping[location] = [field_name] + + # Sort field mapping that the first dataset contains the index + sorted_keys = sorted( + field_mapping.keys(), + key=lambda key: "id" in field_mapping[key], + reverse=True, + ) + sorted_field_mapping = OrderedDict( + (key, field_mapping[key]) for key in sorted_keys + ) + + return types.MappingProxyType(sorted_field_mapping) + @property def run_id(self) -> str: return self.metadata["run_id"] @@ -220,39 +210,61 @@ def cache_key(self) -> str: return self.metadata["cache_key"] @property - def index(self) -> Index: - return Index(self._specification["index"], base_path=self.base_path) - - @property - def subsets(self) -> t.Mapping[str, Subset]: - """The subsets of the manifest as an immutable mapping.""" + def fields(self) -> t.Mapping[str, Field]: + """The fields of the manifest as an immutable mapping.""" return types.MappingProxyType( { - name: Subset(subset, 
base_path=self.base_path) - for name, subset in self._specification["subsets"].items() + name: Field( + name=name, + type=Type(field["type"]), + location=field["location"], + ) + for name, field in self._specification["fields"].items() }, ) - def add_subset( - self, - name: str, - fields: t.Iterable[t.Union[Field, t.Tuple[str, Type]]], - ) -> None: - if name in self._specification["subsets"]: - msg = f"A subset with name {name} already exists" + def add_or_update_field(self, field: Field, overwrite: bool = False): + """Add or update field to manifest.""" + if field.name == "index": + self._add_or_update_index(field, overwrite=True) + elif overwrite is False and field.name in self._specification["fields"]: + msg = ( + f"A field with name {field.name} already exists. Set overwrite to true, " + f"if you want to update the field." + ) + raise ValueError(msg) + else: + self._specification["fields"][field.name] = { + "location": f"/{self.component_id}", + **field.type.to_json(), + } + + def _add_or_update_index(self, field: Field, overwrite: bool = True): + """Add or update the manifest index.""" + if overwrite is False: + msg = ( + "The index already exists. Set overwrite to true, " + "if you want to update the index." + ) + raise ValueError(msg) + + if field.name != "index": + msg = ( + f"The field name is {field.name}. If you try to update the index, set the field" + f"name to `index`." 
+ ) raise ValueError(msg) - self._specification["subsets"][name] = { - "location": f"/{self.pipeline_name}/{self.run_id}/{self.component_id}/{name}", - "fields": {name: type_.to_json() for name, type_ in fields}, + self._specification["index"] = { + "location": f"/{field.location}", } - def remove_subset(self, name: str) -> None: - if name not in self._specification["subsets"]: - msg = f"Subset {name} not found in specification" + def remove_field(self, name: str) -> None: + if name not in self._specification["fields"]: + msg = f"Field {name} not found in specification" raise ValueError(msg) - del self._specification["subsets"][name] + del self._specification["fields"][name] def evolve( # noqa : PLR0912 (too many branches) self, @@ -274,68 +286,23 @@ def evolve( # noqa : PLR0912 (too many branches) # Update `component_id` of the metadata component_id = component_spec.component_folder_name evolved_manifest.update_metadata(key="component_id", value=component_id) + if run_id is not None: evolved_manifest.update_metadata(key="run_id", value=run_id) - # Update index location as this is currently always rewritten - evolved_manifest.index._specification[ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/index" - - # If additionalSubsets is False in consumes, - # Remove all subsets from the manifest that are not listed - if not component_spec.accepts_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.consumes: - evolved_manifest.remove_subset(subset_name) - - # If additionalSubsets is False in produces, - # Remove all subsets from the manifest that are not listed - if not component_spec.outputs_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.produces: - evolved_manifest.remove_subset(subset_name) - - # If additionalFields is False for a consumed subset, - # Remove all fields from that subset that are not listed - for 
subset_name, subset in component_spec.consumes.items(): - if subset_name in evolved_manifest.subsets and not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) - - # For each output subset defined in the component, add or update it - for subset_name, subset in component_spec.produces.items(): - # Subset is already in manifest, update it - if subset_name in evolved_manifest.subsets: - # If additional fields are not allowed, remove the fields not defined in the - # component spec produces section - if not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) - - # Add fields defined in the component spec produces section - # Overwrite to persist changes to the field (eg. type of column) - for field in subset.fields.values(): - evolved_manifest.subsets[subset_name].add_field( - field.name, - field.type, - overwrite=True, - ) - - # Update subset location as this is currently always rewritten - evolved_manifest.subsets[subset_name]._specification[ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/{subset_name}" - - # Subset is not yet in manifest, add it - else: - evolved_manifest.add_subset(subset_name, subset.fields.values()) + # Update index location as this is always rewritten + evolved_manifest.add_or_update_field( + Field(name="index", location=component_spec.component_folder_name), + ) + + # TODO handle additionalFields + + # Add or update all produced fields defined in the component spec + for name, field in component_spec.produces.items(): + # If field was not part of the input manifest, add field to output manifest. + # If field was part of the input manifest and got produced by the component, update + # the manifest field. 
+ evolved_manifest.add_or_update_field(field, overwrite=True) return evolved_manifest diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index ca9bb0944..dc940b5f7 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -5,6 +5,7 @@ import os import re import typing as t +from dataclasses import dataclass from enum import Enum import pyarrow as pa @@ -161,11 +162,33 @@ def __eq__(self, other): return False -class Field(t.NamedTuple): - """Class representing a single field or column in a Fondant subset.""" +class Field: + """Class representing a single field or column in a Fondant dataset.""" - name: str - type: Type + def __init__( + self, + name: str, + type: Type = None, + location: str = "", + ) -> None: + self._name = name + self._type = type + self._location = location + + @property + def name(self) -> str: + """The name of the field.""" + return self._name + + @property + def type(self) -> Type: + """The type of the field.""" + return self._type + + @property + def location(self) -> str: + """The relative location of the field.""" + return self._location def validate_partition_size(arg_value): diff --git a/src/fondant/core/schemas/component_spec.json b/src/fondant/core/schemas/component_spec.json index 8d684a3e5..064ea027d 100644 --- a/src/fondant/core/schemas/component_spec.json +++ b/src/fondant/core/schemas/component_spec.json @@ -28,44 +28,16 @@ } }, "consumes": { - "$ref": "#/definitions/subsets" + "$ref": "common.json#/definitions/fields" }, "produces": { - "$ref": "#/definitions/subsets" + "$ref": "common.json#/definitions/fields" }, "args": { "$ref": "#/definitions/args" } }, "definitions": { - "subset": { - "type": "object", - "properties": { - "fields": { - "$ref": "common.json#/definitions/fields" - }, - "additionalFields": { - "type": "boolean", - "default": true - } - }, - "required": [ - "fields" - ] - }, - "subsets": { - "type": "object", - "properties": { - "additionalSubsets": { - "type": 
"boolean", - "default": true - } - }, - "minProperties": 1, - "additionalProperties": { - "$ref": "#/definitions/subset" - } - }, "args": { "type": "object", "minProperties": 1, diff --git a/src/fondant/core/schemas/manifest.json b/src/fondant/core/schemas/manifest.json index 00ad6d1cc..77365dd5f 100644 --- a/src/fondant/core/schemas/manifest.json +++ b/src/fondant/core/schemas/manifest.json @@ -37,36 +37,33 @@ "location" ] }, - "subsets": { - "$ref": "#/definitions/subsets" + "fields": { + "$ref": "#/definitions/fields" } }, "required": [ "metadata", "index", - "subsets" + "fields" ], "definitions": { - "subset": { + "field": { "type": "object", "properties": { "location": { "type": "string", "pattern": "/.*" - }, - "fields": { - "$ref": "common.json#/definitions/fields" } }, "required": [ "location", - "fields" + "type" ] }, - "subsets": { + "fields": { "type": "object", "additionalProperties": { - "$ref": "#/definitions/subset" + "$ref": "#/definitions/field" } } } diff --git a/tests/example_specs/evolution_examples/4/component.yaml b/tests/core/examples/component_specs/invalid_component.yaml similarity index 84% rename from tests/example_specs/evolution_examples/4/component.yaml rename to tests/core/examples/component_specs/invalid_component.yaml index 067b06da0..d1c88c444 100644 --- a/tests/example_specs/evolution_examples/4/component.yaml +++ b/tests/core/examples/component_specs/invalid_component.yaml @@ -7,14 +7,14 @@ consumes: fields: data: type: binary - + produces: - images: + captions: fields: - encoding: + data: type: string -args: +Arguments: storage_args: description: Storage arguments - type: str + type: str \ No newline at end of file diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/core/examples/component_specs/kubeflow_component.yaml similarity index 100% rename from tests/example_specs/component_specs/kubeflow_component.yaml rename to tests/core/examples/component_specs/kubeflow_component.yaml diff --git 
a/tests/example_specs/evolution_examples/1/component.yaml b/tests/core/examples/component_specs/valid_component.yaml similarity index 62% rename from tests/example_specs/evolution_examples/1/component.yaml rename to tests/core/examples/component_specs/valid_component.yaml index 22ae0feb1..1215af1bd 100644 --- a/tests/example_specs/evolution_examples/1/component.yaml +++ b/tests/core/examples/component_specs/valid_component.yaml @@ -1,20 +1,21 @@ name: Example component description: This is an example component image: example_component:latest +tags: + - Data loading consumes: images: - fields: - data: - type: binary - -produces: + type: binary + embeddings: - fields: - data: - type: array - items: - type: float32 + type: array + items: + type: float32 + +produces: + captions: + type: string args: storage_args: diff --git a/tests/example_specs/component_specs/valid_component_no_args.yaml b/tests/core/examples/component_specs/valid_component_no_args.yaml similarity index 59% rename from tests/example_specs/component_specs/valid_component_no_args.yaml rename to tests/core/examples/component_specs/valid_component_no_args.yaml index c3adfa6aa..de11cb2ee 100644 --- a/tests/example_specs/component_specs/valid_component_no_args.yaml +++ b/tests/core/examples/component_specs/valid_component_no_args.yaml @@ -4,12 +4,13 @@ image: example_component:latest consumes: images: - fields: - data: - type: binary + type: binary + + embeddings: + type: array + items: + type: float32 produces: captions: - fields: - data: - type: string \ No newline at end of file + type: string diff --git a/tests/example_specs/component_specs/invalid_component.yaml b/tests/core/examples/evolution_examples/1/component.yaml similarity index 59% rename from tests/example_specs/component_specs/invalid_component.yaml rename to tests/core/examples/evolution_examples/1/component.yaml index 3fc8128b5..e91ae6f46 100644 --- a/tests/example_specs/component_specs/invalid_component.yaml +++ 
b/tests/core/examples/evolution_examples/1/component.yaml @@ -3,14 +3,16 @@ description: This is an example component image: example_component:latest consumes: - images: - data: binary + images_data: + type: binary produces: - captions: - data: string + embeddings_data: + type: array + items: + type: float32 -Arguments: +args: storage_args: description: Storage arguments - type: str \ No newline at end of file + type: str diff --git a/tests/core/examples/evolution_examples/1/output_manifest.json b/tests/core/examples/evolution_examples/1/output_manifest.json new file mode 100644 index 000000000..2a73e5f29 --- /dev/null +++ b/tests/core/examples/evolution_examples/1/output_manifest.json @@ -0,0 +1,36 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + }, + "embeddings_data": { + "type": "array", + "items": { + "type": "float32" + }, + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/8/component.yaml b/tests/core/examples/evolution_examples/2/component.yaml similarity index 69% rename from tests/example_specs/evolution_examples/8/component.yaml rename to tests/core/examples/evolution_examples/2/component.yaml index 5c204b9c2..2352adcb5 100644 --- a/tests/example_specs/evolution_examples/8/component.yaml +++ b/tests/core/examples/evolution_examples/2/component.yaml @@ -3,10 +3,12 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + 
type: binary + +produces: + images_encoding: + type: string args: storage_args: diff --git a/tests/core/examples/evolution_examples/2/output_manifest.json b/tests/core/examples/evolution_examples/2/output_manifest.json new file mode 100644 index 000000000..ca1f6f361 --- /dev/null +++ b/tests/core/examples/evolution_examples/2/output_manifest.json @@ -0,0 +1,33 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + }, + "images_encoding": { + "type": "string", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/3/component.yaml b/tests/core/examples/evolution_examples/3/component.yaml new file mode 100644 index 000000000..13b1427b3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/component.yaml @@ -0,0 +1,16 @@ +name: Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +produces: + images_data: + type: string + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/3/output_manifest.json b/tests/core/examples/evolution_examples/3/output_manifest.json new file mode 100644 index 000000000..b11f7d8a3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component_1" + }, + "index":{ + 
"location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "string", + "location":"/example_component_1" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/4/component.yaml b/tests/core/examples/evolution_examples/4/component.yaml new file mode 100644 index 000000000..1b766036d --- /dev/null +++ b/tests/core/examples/evolution_examples/4/component.yaml @@ -0,0 +1,12 @@ +name: Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/4/output_manifest.json b/tests/core/examples/evolution_examples/4/output_manifest.json new file mode 100644 index 000000000..929c380ab --- /dev/null +++ b/tests/core/examples/evolution_examples/4/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component_1" + }, + "index":{ + "location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/input_manifest.json b/tests/core/examples/evolution_examples/input_manifest.json new file mode 100644 index 000000000..664367cc2 --- /dev/null +++ 
b/tests/core/examples/evolution_examples/input_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"12345", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/manifests/invalid_manifest.json b/tests/core/examples/manifests/invalid_manifest.json new file mode 100644 index 000000000..51ec6c5e5 --- /dev/null +++ b/tests/core/examples/manifests/invalid_manifest.json @@ -0,0 +1,14 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "gs://bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component1" + }, + "fields": { + "images": {} + } +} \ No newline at end of file diff --git a/tests/core/examples/manifests/valid_manifest.json b/tests/core/examples/manifests/valid_manifest.json new file mode 100644 index 000000000..0f7c58126 --- /dev/null +++ b/tests/core/examples/manifests/valid_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "gs://bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component1" + }, + "fields":{ + "images": { + "location": "/component1", + "type": "binary" + }, + "height": { + "location": "/component2", + "type": "int32" + }, + "width": { + "location": "/component2", + "type": "int32" + }, + "caption": { + "location": "/component3", + "type": "string" + } + } +} \ No newline at end of file diff --git a/tests/test_component_specs.py b/tests/core/test_component_specs.py 
similarity index 85% rename from tests/test_component_specs.py rename to tests/core/test_component_specs.py index caf0344de..dcbf4c2ed 100644 --- a/tests/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -8,13 +8,12 @@ import yaml from fondant.core.component_spec import ( ComponentSpec, - ComponentSubset, KubeflowComponentSpec, ) from fondant.core.exceptions import InvalidComponentSpec from fondant.core.schema import Type -component_specs_path = Path(__file__).parent / "example_specs/component_specs" +component_specs_path = Path(__file__).parent / "examples/component_specs" @pytest.fixture() @@ -49,12 +48,19 @@ def test_component_spec_pkgutil_error(mock_get_data): def test_component_spec_validation(valid_fondant_schema, invalid_fondant_schema): - """Test that the manifest is validated correctly on instantiation.""" + """Test that the component spec is validated correctly on instantiation.""" ComponentSpec(valid_fondant_schema) with pytest.raises(InvalidComponentSpec): ComponentSpec(invalid_fondant_schema) +def test_component_spec_load_from_file(valid_fondant_schema, invalid_fondant_schema): + """Test that the component spec is validated correctly on instantiation.""" + ComponentSpec.from_file(component_specs_path / "valid_component.yaml") + with pytest.raises(InvalidComponentSpec): + ComponentSpec.from_file(component_specs_path / "invalid_component.yaml") + + def test_attribute_access(valid_fondant_schema): """ Test that attributes can be accessed as expected: @@ -65,8 +71,8 @@ def test_attribute_access(valid_fondant_schema): assert fondant_component.name == "Example component" assert fondant_component.description == "This is an example component" - assert fondant_component.consumes["images"].fields["data"].type == Type("binary") - assert fondant_component.consumes["embeddings"].fields["data"].type == Type.list( + assert fondant_component.consumes["images"].type == Type("binary") + assert fondant_component.consumes["embeddings"].type == Type.list( 
Type("float32"), ) @@ -129,15 +135,3 @@ def test_kubeflow_component_spec_repr(valid_kubeflow_schema): kubeflow_component_spec = KubeflowComponentSpec(valid_kubeflow_schema) expected_repr = f"KubeflowComponentSpec({valid_kubeflow_schema!r})" assert repr(kubeflow_component_spec) == expected_repr - - -def test_component_subset_repr(): - """Test that the __repr__ method of ComponentSubset returns the expected string.""" - component_subset_schema = { - "name": "Example subset", - "description": "This is an example subset", - } - - component_subset = ComponentSubset(component_subset_schema) - expected_repr = f"ComponentSubset({component_subset_schema!r})" - assert repr(component_subset) == expected_repr diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py new file mode 100644 index 000000000..0b255b9df --- /dev/null +++ b/tests/core/test_manifest.py @@ -0,0 +1,246 @@ +import json +import pkgutil +from collections import OrderedDict +from pathlib import Path + +import pytest +from fondant.core.component_spec import ComponentSpec +from fondant.core.exceptions import InvalidManifest +from fondant.core.manifest import Field, Manifest, Type + +manifest_path = Path(__file__).parent / "examples" / "manifests" +component_specs_path = Path(__file__).parent / "examples" / "component_specs" + + +@pytest.fixture() +def valid_manifest(): + with open(manifest_path / "valid_manifest.json") as f: + return json.load(f) + + +@pytest.fixture() +def invalid_manifest(): + with open(manifest_path / "invalid_manifest.json") as f: + return json.load(f) + + +def test_manifest_validation(valid_manifest, invalid_manifest): + """Test that the manifest is validated correctly on instantiation.""" + Manifest(valid_manifest) + with pytest.raises(InvalidManifest): + Manifest(invalid_manifest) + + +def test_set_base_path(valid_manifest): + """Test altering the base path in the manifest.""" + manifest = Manifest(valid_manifest) + tmp_path = "/tmp/base_path" + 
manifest.update_metadata(key="base_path", value=tmp_path) + + assert manifest.base_path == tmp_path + assert manifest._specification["metadata"]["base_path"] == tmp_path + + +def test_from_to_file(valid_manifest): + """Test reading from and writing to file.""" + tmp_path = "/tmp/manifest.json" + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(valid_manifest, f) + + manifest = Manifest.from_file(tmp_path) + assert manifest.metadata == valid_manifest["metadata"] + + manifest.to_file(tmp_path) + with open(tmp_path, encoding="utf-8") as f: + assert json.load(f) == valid_manifest + + +def test_attribute_access(valid_manifest): + """ + Test that attributes can be accessed as expected: + - Fixed properties should be accessible as an attribute + - Dynamic properties should be accessible by lookup. + """ + manifest = Manifest(valid_manifest) + + assert manifest.metadata == valid_manifest["metadata"] + assert manifest.index.location == "/component1" + assert manifest.fields["images"].location == "/component1" + assert manifest.fields["images"].type == Type("binary") + + +def test_manifest_creation(): + """Test the stepwise creation of a manifest via the Manifest class.""" + base_path = "gs://bucket" + run_id = "run_id" + pipeline_name = "pipeline_name" + component_id = "component_id" + cache_key = "42" + + manifest = Manifest.create( + pipeline_name=pipeline_name, + base_path=base_path, + run_id=run_id, + component_id=component_id, + cache_key=cache_key, + ) + + manifest.add_or_update_field(Field(name="width", type=Type("int32"))) + manifest.add_or_update_field(Field(name="height", type=Type("int32"))) + manifest.add_or_update_field(Field(name="data", type=Type("binary"))) + + assert manifest._specification == { + "metadata": { + "pipeline_name": pipeline_name, + "base_path": base_path, + "run_id": run_id, + "component_id": component_id, + "cache_key": cache_key, + }, + "index": {"location": f"/{component_id}"}, + "fields": { + "width": { + "type": "int32", + 
"location": f"/{component_id}", + }, + "height": { + "type": "int32", + "location": f"/{component_id}", + }, + "data": { + "type": "binary", + "location": f"/{component_id}", + }, + }, + } + + +def test_manifest_repr(): + manifest = Manifest.create( + pipeline_name="NAME", + base_path="/", + run_id="A", + component_id="1", + cache_key="42", + ) + assert ( + manifest.__repr__() + == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A'," + " 'component_id': '1', 'cache_key': '42'}," + " 'index': {'location': '/1'}, 'fields': {}})" + ) + + +def test_manifest_alteration(valid_manifest): + """Test alteration functionalities of a manifest via the Manifest class.""" + manifest = Manifest(valid_manifest) + + # test adding a subset + manifest.add_or_update_field(Field(name="width2", type=Type("int32"))) + manifest.add_or_update_field(Field(name="height2", type=Type("int32"))) + + assert "width2" in manifest.fields + assert "height2" in manifest.fields + + # test adding a duplicate subset + with pytest.raises(ValueError, match="A field with name width2 already exists"): + manifest.add_or_update_field(Field(name="width2", type=Type("int32"))) + + # test removing a subset + manifest.remove_field("width2") + assert "images2" not in manifest.fields + + # test removing a nonexistant subset + with pytest.raises(ValueError, match="Field pictures not found in specification"): + manifest.remove_field("pictures") + + +def test_manifest_copy_and_adapt(valid_manifest): + """Test that a manifest can be copied and adapted without changing the original.""" + manifest = Manifest(valid_manifest) + new_manifest = manifest.copy() + new_manifest.remove_field("images") + assert manifest._specification == valid_manifest + assert new_manifest._specification != valid_manifest + + +def test_no_validate_schema(monkeypatch, valid_manifest): + monkeypatch.setattr(pkgutil, "get_data", lambda package, resource: None) + with pytest.raises(FileNotFoundError): + 
Manifest(valid_manifest) + + +def test_evolve_manifest(): + """Test that the fields are evolved as expected.""" + run_id = "A" + spec = ComponentSpec.from_file(component_specs_path / "valid_component.yaml") + input_manifest = Manifest.create( + pipeline_name="NAME", + base_path="/base_path", + run_id=run_id, + component_id="component_1", + cache_key="42", + ) + + output_manifest = input_manifest.evolve(component_spec=spec, run_id=run_id) + + assert output_manifest.base_path == input_manifest.base_path + assert output_manifest.run_id == run_id + assert output_manifest.index.location == "/" + spec.component_folder_name + assert output_manifest.fields["captions"].type.name == "string" + + +def test_fields(): + """Test that the fields can added and updated as expected.""" + run_id = "A" + manifest = Manifest.create( + pipeline_name="NAME", + base_path="/base_path", + run_id=run_id, + component_id="component_1", + cache_key="42", + ) + + # add a field + manifest.add_or_update_field(Field(name="field_1", type=Type("int32"))) + assert "field_1" in manifest.fields + + # add a duplicate field, but overwrite (update) + manifest.add_or_update_field( + Field(name="field_1", type=Type("string")), + overwrite=True, + ) + assert manifest.fields["field_1"].type.name == "string" + + # add duplicate field + with pytest.raises( + ValueError, + match="A field with name field_1 already exists. 
Set overwrite to true, " + "if you want to update the field.", + ): + manifest.add_or_update_field( + Field(name="field_1", type=Type("string")), + overwrite=False, + ) + + # delete a field + manifest.remove_field(name="field_1") + assert "field_1" not in manifest.fields + + +def test_field_mapping(valid_manifest): + """Test field mapping generation.""" + manifest = Manifest(valid_manifest) + manifest.add_or_update_field(Field(name="index", location="component2")) + field_mapping = manifest.field_mapping + assert field_mapping == OrderedDict( + { + "gs://bucket/test_pipeline/test_pipeline_12345/component2": [ + "id", + "height", + "width", + ], + "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], + "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], + }, + ) diff --git a/tests/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py similarity index 83% rename from tests/test_manifest_evolution.py rename to tests/core/test_manifest_evolution.py index c79b76aaf..0d9181701 100644 --- a/tests/test_manifest_evolution.py +++ b/tests/core/test_manifest_evolution.py @@ -6,7 +6,7 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -examples_path = Path(__file__).parent / "example_specs/evolution_examples" +examples_path = Path(__file__).parent / "examples/evolution_examples" @pytest.fixture() @@ -41,7 +41,7 @@ def test_component_spec_location_update(): with open(examples_path / "input_manifest.json") as f: input_manifest = json.load(f) - with open(examples_path / "7/component.yaml") as f: + with open(examples_path / "4/component.yaml") as f: specification = yaml.safe_load(f) manifest = Manifest(input_manifest) @@ -50,7 +50,4 @@ def test_component_spec_location_update(): component_spec=component_spec, ) - assert ( - evolved_manifest._specification["subsets"]["images"]["location"] - == "/test_pipeline/12345/example_component/images" - ) + assert evolved_manifest.index.location 
== "/" + component_spec.component_folder_name diff --git a/tests/test_schema.py b/tests/core/test_schema.py similarity index 100% rename from tests/test_schema.py rename to tests/core/test_schema.py diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/example_specs/component_specs/valid_component.yaml deleted file mode 100644 index c4b99e837..000000000 --- a/tests/example_specs/component_specs/valid_component.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest -tags: - - Data loading - -consumes: - images: - fields: - data: - type: binary - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - captions: - fields: - data: - type: string - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/example_specs/components/input_manifest.json b/tests/example_specs/components/input_manifest.json deleted file mode 100644 index 7af13d599..000000000 --- a/tests/example_specs/components/input_manifest.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "/bucket", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/index/12345/example_component" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - } - } - } - - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/1/output_manifest.json b/tests/example_specs/evolution_examples/1/output_manifest.json deleted file mode 100644 index 17b94c0b0..000000000 --- a/tests/example_specs/evolution_examples/1/output_manifest.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - 
"location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ - "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/2/component.yaml b/tests/example_specs/evolution_examples/2/component.yaml deleted file mode 100644 index f37ff99d1..000000000 --- a/tests/example_specs/evolution_examples/2/component.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - additionalSubsets: false - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/2/output_manifest.json b/tests/example_specs/evolution_examples/2/output_manifest.json deleted file mode 100644 index 3a40b1c9d..000000000 --- a/tests/example_specs/evolution_examples/2/output_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ 
- "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/3/component.yaml b/tests/example_specs/evolution_examples/3/component.yaml deleted file mode 100644 index 6753a083b..000000000 --- a/tests/example_specs/evolution_examples/3/component.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - additionalFields: false - additionalSubsets: false - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/3/output_manifest.json b/tests/example_specs/evolution_examples/3/output_manifest.json deleted file mode 100644 index a9abda6d0..000000000 --- a/tests/example_specs/evolution_examples/3/output_manifest.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "data":{ - "type":"binary" - } - } - }, - "embeddings":{ - "location":"/test_pipeline/custom_run_id/example_component/embeddings", - "fields":{ - "data":{ - "type":"array", - "items":{ - "type":"float32" - } - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/4/output_manifest.json b/tests/example_specs/evolution_examples/4/output_manifest.json deleted file mode 100644 index 24af4f2ac..000000000 --- a/tests/example_specs/evolution_examples/4/output_manifest.json +++ /dev/null @@ 
-1,38 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - }, - "encoding":{ - "type":"string" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/5/component.yaml b/tests/example_specs/evolution_examples/5/component.yaml deleted file mode 100644 index 93aaf68b3..000000000 --- a/tests/example_specs/evolution_examples/5/component.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - encoding: - type: string - additionalFields: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/5/output_manifest.json b/tests/example_specs/evolution_examples/5/output_manifest.json deleted file mode 100644 index 8bcf6141d..000000000 --- a/tests/example_specs/evolution_examples/5/output_manifest.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "encoding":{ - "type":"string" - } - } - }, - "captions":{ - 
"location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/6/component.yaml b/tests/example_specs/evolution_examples/6/component.yaml deleted file mode 100644 index 065061791..000000000 --- a/tests/example_specs/evolution_examples/6/component.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - encoding: - type: string - additionalFields: false - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/6/output_manifest.json b/tests/example_specs/evolution_examples/6/output_manifest.json deleted file mode 100644 index b7521bf66..000000000 --- a/tests/example_specs/evolution_examples/6/output_manifest.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "encoding":{ - "type":"string" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/7/component.yaml b/tests/example_specs/evolution_examples/7/component.yaml deleted file mode 100644 index 5746ffa4d..000000000 --- a/tests/example_specs/evolution_examples/7/component.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - -produces: - images: - fields: - data: - type: string - additionalFields: false - 
additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/example_specs/evolution_examples/7/output_manifest.json b/tests/example_specs/evolution_examples/7/output_manifest.json deleted file mode 100644 index a9eb8a308..000000000 --- a/tests/example_specs/evolution_examples/7/output_manifest.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"custom_run_id", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/custom_run_id/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/custom_run_id/example_component/images", - "fields":{ - "data":{ - "type":"string" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/8/output_manifest.json b/tests/example_specs/evolution_examples/8/output_manifest.json deleted file mode 100644 index de2621c49..000000000 --- a/tests/example_specs/evolution_examples/8/output_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "gs://bucket", - "run_id": "custom_run_id", - "component_id": "example_component" - }, - "index": { - "location": "/test_pipeline/custom_run_id/example_component/index" - }, - "subsets": { - "images": { - "location": "/test_pipeline/12345/example_component/images", - "fields": { - "width": { - "type": "int32" - }, - "height": { - "type": "int32" - }, - "data": { - "type": "binary" - } - } - }, - "captions": { - "location": "/test_pipeline/12345/example_component/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} diff --git a/tests/example_specs/evolution_examples/input_manifest.json b/tests/example_specs/evolution_examples/input_manifest.json deleted file mode 100644 index 2ecf37243..000000000 --- a/tests/example_specs/evolution_examples/input_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - 
"metadata":{ - "pipeline_name":"test_pipeline", - "base_path":"gs://bucket", - "run_id":"12345", - "component_id":"example_component" - }, - "index":{ - "location":"/test_pipeline/12345/example_component/index" - }, - "subsets":{ - "images":{ - "location":"/test_pipeline/12345/example_component/images", - "fields":{ - "width":{ - "type":"int32" - }, - "height":{ - "type":"int32" - }, - "data":{ - "type":"binary" - } - } - }, - "captions":{ - "location":"/test_pipeline/12345/example_component/captions", - "fields":{ - "data":{ - "type":"binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/manifests/invalid_manifest.json b/tests/example_specs/manifests/invalid_manifest.json deleted file mode 100644 index 3fe8b1097..000000000 --- a/tests/example_specs/manifests/invalid_manifest.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "metadata": { - "base_path": "gs://bucket" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": [] - } - } -} \ No newline at end of file diff --git a/tests/example_specs/manifests/valid_manifest.json b/tests/example_specs/manifests/valid_manifest.json deleted file mode 100644 index 9bc00c512..000000000 --- a/tests/example_specs/manifests/valid_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "gs://bucket", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json 
b/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json deleted file mode 100644 index 541775f84..000000000 --- a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2023", - "component_id": "component_1", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_component/Dockerfile b/tests/examples/example_component/Dockerfile similarity index 100% rename from tests/example_component/Dockerfile rename to tests/examples/example_component/Dockerfile diff --git a/tests/example_component/fondant_component.yaml b/tests/examples/example_component/fondant_component.yaml similarity index 100% rename from tests/example_component/fondant_component.yaml rename to tests/examples/example_component/fondant_component.yaml diff --git a/tests/example_data/components/1.yaml b/tests/examples/example_data/components/1.yaml similarity index 100% rename from tests/example_data/components/1.yaml rename to tests/examples/example_data/components/1.yaml diff --git a/tests/example_data/manifest.json b/tests/examples/example_data/manifest.json similarity index 100% rename from tests/example_data/manifest.json rename to tests/examples/example_data/manifest.json diff --git a/tests/example_data/raw/split.py b/tests/examples/example_data/raw/split.py similarity index 100% rename from tests/example_data/raw/split.py rename to tests/examples/example_data/raw/split.py diff 
--git a/tests/example_data/raw/testset.parquet b/tests/examples/example_data/raw/testset.parquet similarity index 100% rename from tests/example_data/raw/testset.parquet rename to tests/examples/example_data/raw/testset.parquet diff --git a/tests/example_data/subsets_input/index/part.0.parquet b/tests/examples/example_data/subsets_input/index/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.0.parquet rename to tests/examples/example_data/subsets_input/index/part.0.parquet diff --git a/tests/example_data/subsets_input/index/part.1.parquet b/tests/examples/example_data/subsets_input/index/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.1.parquet rename to tests/examples/example_data/subsets_input/index/part.1.parquet diff --git a/tests/example_data/subsets_input/index/part.2.parquet b/tests/examples/example_data/subsets_input/index/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/index/part.2.parquet rename to tests/examples/example_data/subsets_input/index/part.2.parquet diff --git a/tests/example_data/subsets_input/properties/part.0.parquet b/tests/examples/example_data/subsets_input/properties/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.0.parquet rename to tests/examples/example_data/subsets_input/properties/part.0.parquet diff --git a/tests/example_data/subsets_input/properties/part.1.parquet b/tests/examples/example_data/subsets_input/properties/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.1.parquet rename to tests/examples/example_data/subsets_input/properties/part.1.parquet diff --git a/tests/example_data/subsets_input/properties/part.2.parquet b/tests/examples/example_data/subsets_input/properties/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/properties/part.2.parquet rename to 
tests/examples/example_data/subsets_input/properties/part.2.parquet diff --git a/tests/example_data/subsets_input/types/part.0.parquet b/tests/examples/example_data/subsets_input/types/part.0.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.0.parquet rename to tests/examples/example_data/subsets_input/types/part.0.parquet diff --git a/tests/example_data/subsets_input/types/part.1.parquet b/tests/examples/example_data/subsets_input/types/part.1.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.1.parquet rename to tests/examples/example_data/subsets_input/types/part.1.parquet diff --git a/tests/example_data/subsets_input/types/part.2.parquet b/tests/examples/example_data/subsets_input/types/part.2.parquet similarity index 100% rename from tests/example_data/subsets_input/types/part.2.parquet rename to tests/examples/example_data/subsets_input/types/part.2.parquet diff --git a/tests/example_modules/component.py b/tests/examples/example_modules/component.py similarity index 100% rename from tests/example_modules/component.py rename to tests/examples/example_modules/component.py diff --git a/tests/example_modules/invalid_component.py b/tests/examples/example_modules/invalid_component.py similarity index 100% rename from tests/example_modules/invalid_component.py rename to tests/examples/example_modules/invalid_component.py diff --git a/tests/example_modules/invalid_double_components.py b/tests/examples/example_modules/invalid_double_components.py similarity index 100% rename from tests/example_modules/invalid_double_components.py rename to tests/examples/example_modules/invalid_double_components.py diff --git a/tests/example_modules/invalid_double_pipeline.py b/tests/examples/example_modules/invalid_double_pipeline.py similarity index 100% rename from tests/example_modules/invalid_double_pipeline.py rename to tests/examples/example_modules/invalid_double_pipeline.py diff --git 
a/tests/example_modules/pipeline.py b/tests/examples/example_modules/pipeline.py similarity index 100% rename from tests/example_modules/pipeline.py rename to tests/examples/example_modules/pipeline.py diff --git a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml similarity index 100% rename from tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml rename to tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml diff --git a/tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml 
b/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml rename to 
tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile similarity index 100% rename from 
tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile rename to tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml rename to tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/examples/example_specs/components/arguments/component.yaml similarity index 100% rename from tests/example_specs/components/arguments/component.yaml rename to tests/examples/example_specs/components/arguments/component.yaml diff --git a/tests/example_specs/components/arguments/component_default_args.yaml b/tests/examples/example_specs/components/arguments/component_default_args.yaml similarity index 100% rename from tests/example_specs/components/arguments/component_default_args.yaml rename to tests/examples/example_specs/components/arguments/component_default_args.yaml diff --git a/tests/example_specs/components/arguments/input_manifest.json b/tests/examples/example_specs/components/arguments/input_manifest.json similarity index 60% rename from tests/example_specs/components/arguments/input_manifest.json rename to tests/examples/example_specs/components/arguments/input_manifest.json index d98ddd95b..9ee2494f9 100644 --- a/tests/example_specs/components/arguments/input_manifest.json +++ b/tests/examples/example_specs/components/arguments/input_manifest.json @@ -7,16 +7,12 @@ "cache_key": "00" }, "index": { - "location": "/index" + "location": "/component_1" }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - } - } + "fields": { + "data": { + "type": 
"binary", + "location": "/component_1" } } } \ No newline at end of file diff --git a/tests/example_specs/components/component.yaml b/tests/examples/example_specs/components/component.yaml similarity index 56% rename from tests/example_specs/components/component.yaml rename to tests/examples/example_specs/components/component.yaml index 19c8d5856..973cc3e6b 100644 --- a/tests/example_specs/components/component.yaml +++ b/tests/examples/example_specs/components/component.yaml @@ -3,19 +3,15 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - additionalFields: false + images_data: + type: array + items: + type: float32 +additionalFields: false args: diff --git a/tests/examples/example_specs/components/input_manifest.json b/tests/examples/example_specs/components/input_manifest.json new file mode 100644 index 000000000..80fa0b91d --- /dev/null +++ b/tests/examples/example_specs/components/input_manifest.json @@ -0,0 +1,17 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "/bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/example_component" + }, + "fields": { + "data": { + "location": "/example_component", + "type": "binary" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/cache/42.txt b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/cache/42.txt rename to tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json 
b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json new file mode 100644 index 000000000..47c2fe949 --- /dev/null +++ b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_2023", + "component_id": "component_1", + "cache_key": "42" + }, + "index": { + "location": "/component_1" + }, + "fields": + { + "data": { + "type": "binary", + "location": "/component_1" + }, + "height": { + "type": "int32", + "location": "/component_1" + }, + "width": { + "type": "int32", + "location": "/component_1" + }, + "captions": { + "type": "string", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json 
b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json similarity index 100% rename from tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json rename to tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json diff --git a/tests/test_component.py b/tests/test_component.py index e759bd367..e5dcb3bc3 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -377,38 +377,22 @@ def test_wrap_transform(): "description": "Component for testing", "image": "component:test", "consumes": { - "image": { - "fields": { - "height": { - "type": "int16", - }, - "width": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "image_width": { + "type": "int16", + }, + "caption_text": { + "type": "string", }, }, "produces": { - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "caption_text": { + "type": "string", }, - "image": { - "fields": { - "height": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, }, }, @@ -425,9 +409,9 @@ def test_wrap_transform(): def transform(dataframe: pd.DataFrame) -> pd.DataFrame: # Check hierarchical columns assert dataframe.columns.tolist() == [ - ("image", "height"), - ("image", "width"), - ("caption", "text"), + "image_height", + "image_width", + "caption_text", ] return dataframe diff --git a/tests/test_manifest.py b/tests/test_manifest.py deleted file mode 100644 index 3af3ea425..000000000 --- a/tests/test_manifest.py +++ /dev/null @@ -1,239 +0,0 @@ -import json -import pkgutil -from pathlib import Path - -import pytest -from fondant.core.exceptions import InvalidManifest -from fondant.core.manifest import Field, Index, Manifest, Subset, Type - -manifest_path = Path(__file__).parent / "example_specs/manifests" - - -@pytest.fixture() -def 
valid_manifest(): - with open(manifest_path / "valid_manifest.json") as f: - return json.load(f) - - -@pytest.fixture() -def invalid_manifest(): - with open(manifest_path / "invalid_manifest.json") as f: - return json.load(f) - - -def test_manifest_validation(valid_manifest, invalid_manifest): - """Test that the manifest is validated correctly on instantiation.""" - Manifest(valid_manifest) - with pytest.raises(InvalidManifest): - Manifest(invalid_manifest) - - -def test_subset_init(): - """Test initializing a subset.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = Subset(specification=subset_spec, base_path="/tmp") - assert subset.location == "/tmp/images/ABC/123" - assert ( - subset.__repr__() - == "Subset({'location': '/images/ABC/123', 'fields': {'data': {'type': 'binary'}}})" - ) - - -def test_subset_fields(): - """Test manipulating subset fields.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = Subset(specification=subset_spec, base_path="/tmp") - - # add a field - subset.add_field(name="data2", type_=Type("binary")) - assert "data2" in subset.fields - - # add a duplicate field - with pytest.raises(ValueError, match="A field with name data2 already exists"): - subset.add_field(name="data2", type_=Type("binary")) - - # add a duplicate field but overwrite - subset.add_field(name="data2", type_=Type("string"), overwrite=True) - assert subset.fields["data2"].type == Type("string") - - # remove a field - subset.remove_field(name="data2") - assert "data2" not in subset.fields - - -def test_set_base_path(valid_manifest): - """Test altering the base path in the manifest.""" - manifest = Manifest(valid_manifest) - tmp_path = "/tmp/base_path" - manifest.update_metadata(key="base_path", value=tmp_path) - - assert manifest.base_path == tmp_path - assert manifest._specification["metadata"]["base_path"] == tmp_path - - 
-def test_from_to_file(valid_manifest): - """Test reading from and writing to file.""" - tmp_path = "/tmp/manifest.json" - with open(tmp_path, "w", encoding="utf-8") as f: - json.dump(valid_manifest, f) - - manifest = Manifest.from_file(tmp_path) - assert manifest.metadata == valid_manifest["metadata"] - - manifest.to_file(tmp_path) - with open(tmp_path, encoding="utf-8") as f: - assert json.load(f) == valid_manifest - - -def test_attribute_access(valid_manifest): - """ - Test that attributes can be accessed as expected: - - Fixed properties should be accessible as an attribute - - Dynamic properties should be accessible by lookup. - """ - manifest = Manifest(valid_manifest) - - assert manifest.metadata == valid_manifest["metadata"] - assert manifest.index.location == "gs://bucket/index" - assert manifest.subsets["images"].location == "gs://bucket/images" - assert manifest.subsets["images"].fields["data"].type == Type("binary") - - -def test_manifest_creation(): - """Test the stepwise creation of a manifest via the Manifest class.""" - base_path = "gs://bucket" - run_id = "run_id" - pipeline_name = "pipeline_name" - component_id = "component_id" - cache_key = "42" - - manifest = Manifest.create( - pipeline_name=pipeline_name, - base_path=base_path, - run_id=run_id, - component_id=component_id, - cache_key=cache_key, - ) - - manifest.add_subset("images", [("width", Type("int32")), ("height", Type("int32"))]) - manifest.subsets["images"].add_field("data", Type("binary")) - - assert manifest._specification == { - "metadata": { - "pipeline_name": pipeline_name, - "base_path": base_path, - "run_id": run_id, - "component_id": component_id, - "cache_key": cache_key, - }, - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, - "subsets": { - "images": { - "location": f"/{pipeline_name}/{run_id}/{component_id}/images", - "fields": { - "width": { - "type": "int32", - }, - "height": { - "type": "int32", - }, - "data": { - "type": "binary", - }, - }, - }, 
- }, - } - - -def test_manifest_repr(): - manifest = Manifest.create( - pipeline_name="NAME", - base_path="/", - run_id="A", - component_id="1", - cache_key="42", - ) - assert ( - manifest.__repr__() - == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A'," - " 'component_id': '1', 'cache_key': '42'}," - " 'index': {'location': '/NAME/A/1/index'}, 'subsets': {}})" - ) - - -def test_manifest_alteration(valid_manifest): - """Test alteration functionalities of a manifest via the Manifest class.""" - manifest = Manifest(valid_manifest) - - # test adding a subset - manifest.add_subset( - "images2", - [("width", Type("int32")), ("height", Type("int32"))], - ) - assert "images2" in manifest.subsets - - # test adding a duplicate subset - with pytest.raises(ValueError, match="A subset with name images2 already exists"): - manifest.add_subset( - "images2", - [("width", Type("int32")), ("height", Type("int32"))], - ) - - # test removing a subset - manifest.remove_subset("images2") - assert "images2" not in manifest.subsets - - # test removing a nonexistant subset - with pytest.raises(ValueError, match="Subset pictures not found in specification"): - manifest.remove_subset("pictures") - - -def test_manifest_copy_and_adapt(valid_manifest): - """Test that a manifest can be copied and adapted without changing the original.""" - manifest = Manifest(valid_manifest) - new_manifest = manifest.copy() - new_manifest.remove_subset("images") - assert manifest._specification == valid_manifest - assert new_manifest._specification != valid_manifest - - -def test_no_validate_schema(monkeypatch, valid_manifest): - monkeypatch.setattr(pkgutil, "get_data", lambda package, resource: None) - with pytest.raises(FileNotFoundError): - Manifest(valid_manifest) - - -def test_index_fields(): - """Test that the fields property of Index returns the expected fields.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - 
} - - index = Index(specification=subset_spec, base_path="/tmp") - - expected_fields = { - "id": Field(name="id", type=Type("string")), - "source": Field(name="source", type=Type("string")), - } - - assert index.fields == expected_fields From bb3b623a5a587ef4523c8bf41292e84726e8e902 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Thu, 23 Nov 2023 14:47:54 +0100 Subject: [PATCH 29/34] Refactor component package (#654) Refactor component package as part of #643 --------- Co-authored-by: Robbe Sneyders Co-authored-by: Philippe Moussalli --- src/fondant/component/data_io.py | 175 ++++++------------ src/fondant/component/executor.py | 34 +--- src/fondant/core/manifest.py | 48 +---- .../component_specs/arguments/component.yaml | 68 +++++++ .../arguments/component_default_args.yaml | 69 +++++++ .../arguments/input_manifest.json | 18 ++ .../examples/component_specs/component.yaml | 23 +++ .../component_specs/input_manifest.json | 17 ++ .../component/examples/data/components/1.yaml | 29 +++ tests/component/examples/data/manifest.json | 29 +++ .../component_1/part.0.parquet | Bin 0 -> 3542 bytes .../component_1/part.1.parquet | Bin 0 -> 3526 bytes .../component_1/part.2.parquet | Bin 0 -> 3584 bytes .../component_2/part.0.parquet | Bin 0 -> 3018 bytes .../component_2/part.1.parquet | Bin 0 -> 3085 bytes .../component_2/part.2.parquet | Bin 0 -> 3066 bytes .../example_pipeline/cache/42.txt | 1 + .../component_1/manifest.json | 31 ++++ tests/{ => component}/test_component.py | 4 +- tests/{ => component}/test_data_io.py | 128 +++++-------- tests/core/test_manifest.py | 19 -- tests/examples/example_data/raw/split.py | 10 +- 22 files changed, 421 insertions(+), 282 deletions(-) create mode 100644 tests/component/examples/component_specs/arguments/component.yaml create mode 100644 tests/component/examples/component_specs/arguments/component_default_args.yaml create mode 100644 tests/component/examples/component_specs/arguments/input_manifest.json create mode 100644 
tests/component/examples/component_specs/component.yaml create mode 100644 tests/component/examples/component_specs/input_manifest.json create mode 100644 tests/component/examples/data/components/1.yaml create mode 100644 tests/component/examples/data/manifest.json create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.1.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.0.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.1.parquet create mode 100644 tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet create mode 100644 tests/component/examples/mock_base_path/example_pipeline/cache/42.txt create mode 100644 tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json rename tests/{ => component}/test_component.py (99%) rename tests/{ => component}/test_data_io.py (61%) diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 7023c1ee2..79a181f8d 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -1,16 +1,19 @@ import logging import os import typing as t +from collections import defaultdict import dask.dataframe as dd from dask.diagnostics import ProgressBar from dask.distributed import Client -from fondant.core.component_spec import ComponentSpec, ComponentSubset +from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest logger = logging.getLogger(__name__) +DEFAULT_INDEX_NAME = "id" + class DataIO: def __init__(self, *, manifest: Manifest, component_spec: ComponentSpec) -> None: @@ -82,73 +85,48 @@ def 
partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: return dataframe - def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: + def load_dataframe(self) -> dd.DataFrame: """ - Function that loads a subset from the manifest as a Dask dataframe. - - Args: - subset_name: the name of the subset to load - fields: the fields to load from the subset + Function that loads the subsets defined in the component spec as a single Dask dataframe for + the user. Returns: - The subset as a dask dataframe + The Dask dataframe with all columns defined in the manifest field mapping """ - subset = self.manifest.subsets[subset_name] - remote_path = subset.location - - logger.info(f"Loading subset {subset_name} with fields {fields}...") + dataframe = None + field_mapping = defaultdict(list) - subset_df = dd.read_parquet( - remote_path, - columns=fields, - calculate_divisions=True, + # Add index field to field mapping to guarantee start reading with the index dataframe + field_mapping[self.manifest.get_field_location(DEFAULT_INDEX_NAME)].append( + DEFAULT_INDEX_NAME, ) - # add subset prefix to columns - subset_df = subset_df.rename( - columns={col: subset_name + "_" + col for col in subset_df.columns}, - ) + for field_name in self.component_spec.consumes: + location = self.manifest.get_field_location(field_name) + field_mapping[location].append(field_name) - return subset_df - - def _load_index(self) -> dd.DataFrame: - """ - Function that loads the index from the manifest as a Dask dataframe. - - Returns: - The index as a dask dataframe - """ - # get index subset from the manifest - index = self.manifest.index - # get remote path - remote_path = index.location - - # load index from parquet, expecting id and source columns - return dd.read_parquet(remote_path, calculate_divisions=True) - - def load_dataframe(self) -> dd.DataFrame: - """ - Function that loads the subsets defined in the component spec as a single Dask dataframe for - the user. 
+ for location, fields in field_mapping.items(): + if DEFAULT_INDEX_NAME in fields: + fields.remove(DEFAULT_INDEX_NAME) - Returns: - The Dask dataframe with the field columns in the format (_) - as well as the index columns. - """ - # load index into dataframe - dataframe = self._load_index() - for name, subset in self.component_spec.consumes.items(): - fields = list(subset.fields.keys()) - subset_df = self._load_subset(name, fields) - # left joins -> filter on index - dataframe = dd.merge( - dataframe, - subset_df, - left_index=True, - right_index=True, - how="left", + partial_df = dd.read_parquet( + location, + columns=fields, + index=DEFAULT_INDEX_NAME, + calculate_divisions=True, ) + if dataframe is None: + # ensure that the index is set correctly and divisions are known. + dataframe = partial_df + else: + dataframe = dataframe.merge( + partial_df, + how="left", + left_index=True, + right_index=True, + ) + dataframe = self.partition_loaded_dataframe(dataframe) logging.info(f"Columns of dataframe: {list(dataframe.columns)}") @@ -170,79 +148,48 @@ def write_dataframe( dataframe: dd.DataFrame, dask_client: t.Optional[Client] = None, ) -> None: - write_tasks = [] + columns_to_produce = [ + column_name for column_name, field in self.component_spec.produces.items() + ] - dataframe.index = dataframe.index.rename("id") + dataframe.index = dataframe.index.rename(DEFAULT_INDEX_NAME) - # Turn index into an empty dataframe so we can write it - index_df = dataframe.index.to_frame().drop(columns=["id"]) - write_index_task = self._write_subset( - index_df, - subset_name="index", - subset_spec=self.component_spec.index, - ) - write_tasks.append(write_index_task) + # validation that all columns are in the dataframe + self.validate_dataframe_columns(dataframe, columns_to_produce) - for subset_name, subset_spec in self.component_spec.produces.items(): - subset_df = self._extract_subset_dataframe( - dataframe, - subset_name=subset_name, - subset_spec=subset_spec, - ) - 
write_subset_task = self._write_subset( - subset_df, - subset_name=subset_name, - subset_spec=subset_spec, - ) - write_tasks.append(write_subset_task) + dataframe = dataframe[columns_to_produce] + write_task = self._write_dataframe(dataframe) with ProgressBar(): logging.info("Writing data...") - # alternative implementation possible: futures = client.compute(...) - dd.compute(*write_tasks, scheduler=dask_client) + dd.compute(write_task, scheduler=dask_client) @staticmethod - def _extract_subset_dataframe( - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.DataFrame: - """Create subset dataframe to save with the original field name as the column name.""" - # Create a new dataframe with only the columns needed for the output subset - subset_columns = [f"{subset_name}_{field}" for field in subset_spec.fields] - try: - subset_df = dataframe[subset_columns] - except KeyError as e: + def validate_dataframe_columns(dataframe: dd.DataFrame, columns: t.List[str]): + """Validates that all columns are available in the dataset.""" + missing_fields = [] + for col in columns: + if col not in dataframe.columns: + missing_fields.append(col) + + if missing_fields: msg = ( - f"Field {e.args[0]} defined in output subset {subset_name} " + f"Fields {missing_fields} defined in output dataset " f"but not found in dataframe" ) raise ValueError( msg, ) - # Remove the subset prefix from the column names - subset_df = subset_df.rename( - columns={col: col[(len(f"{subset_name}_")) :] for col in subset_columns}, + def _write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar: + """Create dataframe writing task.""" + location = ( + self.manifest.base_path + "/" + self.component_spec.component_folder_name ) - - return subset_df - - def _write_subset( - self, - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.core.Scalar: - if subset_name == "index": - location = self.manifest.index.location - else: - 
location = self.manifest.subsets[subset_name].location - - schema = {field.name: field.type.value for field in subset_spec.fields.values()} - + schema = { + field.name: field.type.value + for field in self.component_spec.produces.values() + } return self._create_write_task(dataframe, location=location, schema=schema) @staticmethod diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 3d4d6097f..d77200da8 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -491,14 +491,11 @@ def optional_fondant_arguments() -> t.List[str]: @staticmethod def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """Factory that creates a function to wrap the component transform function. The wrapper: - - Converts the columns to hierarchical format before passing the dataframe to the - transform function - Removes extra columns from the returned dataframe which are not defined in the component spec `produces` section - Sorts the columns from the returned dataframe according to the order in the component spec `produces` section to match the order in the `meta` argument passed to Dask's `map_partitions`. - - Flattens the returned dataframe columns. 
Args: transform: Transform method to wrap @@ -506,27 +503,13 @@ def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """ def wrapped_transform(dataframe: pd.DataFrame) -> pd.DataFrame: - # Switch to hierarchical columns - dataframe.columns = pd.MultiIndex.from_tuples( - tuple(column.split("_")) for column in dataframe.columns - ) - # Call transform method dataframe = transform(dataframe) # Drop columns not in specification - columns = [ - (subset_name, field) - for subset_name, subset in spec.produces.items() - for field in subset.fields - ] - dataframe = dataframe[columns] - - # Switch to flattened columns - dataframe.columns = [ - "_".join(column) for column in dataframe.columns.to_flat_index() - ] - return dataframe + columns = [name for name, field in spec.produces.items()] + + return dataframe[columns] return wrapped_transform @@ -552,11 +535,8 @@ def _execute_component( # Create meta dataframe with expected format meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series(dtype=pd.ArrowDtype(field.type.value)) meta_df = pd.DataFrame(meta_dict).set_index("id") wrapped_transform = self.wrap_transform(component.transform, spec=self.spec) @@ -573,8 +553,10 @@ def _execute_component( return dataframe + # TODO: fix in #244 def _infer_index_change(self) -> bool: """Infer if this component changes the index based on its component spec.""" + """ if not self.spec.accepts_additional_subsets: return True if not self.spec.outputs_additional_subsets: @@ -585,6 +567,8 @@ def _infer_index_change(self) -> bool: return any( not subset.additional_fields for subset in self.spec.produces.values() ) + """ + return False class 
DaskWriteExecutor(Executor[DaskWriteComponent]): diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index fc750620d..013ce2b71 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -4,7 +4,6 @@ import pkgutil import types import typing as t -from collections import OrderedDict from dataclasses import asdict, dataclass from pathlib import Path @@ -146,7 +145,7 @@ def metadata(self) -> t.Dict[str, t.Any]: @property def index(self) -> Field: - return Field(name="Index", location=self._specification["index"]["location"]) + return Field(name="id", location=self._specification["index"]["location"]) def update_metadata(self, key: str, value: t.Any) -> None: self.metadata[key] = value @@ -155,43 +154,16 @@ def update_metadata(self, key: str, value: t.Any) -> None: def base_path(self) -> str: return self.metadata["base_path"] - @property - def field_mapping(self) -> t.Mapping[str, t.List[str]]: - """ - Retrieve a mapping of field locations to corresponding field names. - A dictionary where keys are field locations and values are lists - of column names. - - The method returns an immutable OrderedDict where the first dict element contains the - location of the dataframe with the index. This allows an efficient left join operation. 
- - Example: - { - "/base_path/component_1": ["Name", "HP"], - "/base_path/component_2": ["Type 1", "Type 2"], - } - """ - field_mapping = {} - for field_name, field in {"id": self.index, **self.fields}.items(): - location = ( - f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" - ) - if location in field_mapping: - field_mapping[location].append(field_name) - else: - field_mapping[location] = [field_name] - - # Sort field mapping that the first dataset contains the index - sorted_keys = sorted( - field_mapping.keys(), - key=lambda key: "id" in field_mapping[key], - reverse=True, - ) - sorted_field_mapping = OrderedDict( - (key, field_mapping[key]) for key in sorted_keys - ) + def get_field_location(self, field_name: str): + """Return absolute path to the field location.""" + if field_name == "id": + return f"{self.base_path}/{self.pipeline_name}/{self.run_id}{self.index.location}" + if field_name not in self.fields: + msg = f"Field {field_name} is not available in the manifest." 
+ raise ValueError(msg) - return types.MappingProxyType(sorted_field_mapping) + field = self.fields[field_name] + return f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" @property def run_id(self) -> str: diff --git a/tests/component/examples/component_specs/arguments/component.yaml b/tests/component/examples/component_specs/arguments/component.yaml new file mode 100644 index 000000000..659ed0026 --- /dev/null +++ b/tests/component/examples/component_specs/arguments/component.yaml @@ -0,0 +1,68 @@ +name: Example component +description: This is an example component +image: example_component:latest + +args: + string_default_arg: + description: default string argument + type: str + default: foo + integer_default_arg: + description: default integer argument + type: int + default: 0 + float_default_arg: + description: default float argument + type: float + default: 3.14 + bool_false_default_arg: + description: default bool argument + type: bool + default: False + bool_true_default_arg: + description: default bool argument + type: bool + default: True + list_default_arg: + description: default list argument + type: list + default: ["foo", "bar"] + dict_default_arg: + description: default dict argument + type: dict + default: {"foo":1, "bar":2} + string_default_arg_none: + description: default string argument + type: str + default: None + integer_default_arg_none: + description: default integer argument + type: int + default: 0 + float_default_arg_none: + description: default float argument + type: float + default: 0.0 + bool_default_arg_none: + description: default bool argument + type: bool + default: False + list_default_arg_none: + description: default list argument + type: list + default: [] + dict_default_arg_none: + description: default dict argument + type: dict + default: {} + override_default_arg: + description: argument with default python value type that can be overriden + type: str + default: foo + override_default_arg_with_none: + 
description: argument with default python type that can be overriden with None + type: str + optional_arg: + description: optional argument + type: str + default: None diff --git a/tests/component/examples/component_specs/arguments/component_default_args.yaml b/tests/component/examples/component_specs/arguments/component_default_args.yaml new file mode 100644 index 000000000..816211c04 --- /dev/null +++ b/tests/component/examples/component_specs/arguments/component_default_args.yaml @@ -0,0 +1,69 @@ +name: Example component +description: This is an example component +image: example_component:latest + +args: + string_default_arg: + description: default string argument + type: str + default: foo + integer_default_arg: + description: default integer argument + type: int + default: 1 + float_default_arg: + description: default float argument + type: float + default: 3.14 + bool_false_default_arg: + description: default bool argument + type: bool + default: False + bool_true_default_arg: + description: default bool argument + type: bool + default: True + list_default_arg: + description: default list argument + type: list + default: ["foo", "bar"] + dict_default_arg: + description: default dict argument + type: dict + default: {"foo":1, "bar":2} + string_default_arg_none: + description: default string argument + type: str + default: None + integer_default_arg_none: + description: default integer argument + type: int + default: None + float_default_arg_none: + description: default float argument + type: float + default: None + bool_default_arg_none: + description: default bool argument + type: bool + default: None + list_default_arg_none: + description: default list argument + type: list + default: None + dict_default_arg_none: + description: default dict argument + type: dict + default: None + override_default_arg: + description: argument with default python value type that can be overriden + type: str + default: foo + override_default_none_arg: + description: argument 
with default None value type that can be overriden with a valid python type + type: float + default: None + override_default_arg_with_none: + description: argument with default python type that can be overriden with None + type: str + diff --git a/tests/component/examples/component_specs/arguments/input_manifest.json b/tests/component/examples/component_specs/arguments/input_manifest.json new file mode 100644 index 000000000..9ee2494f9 --- /dev/null +++ b/tests/component/examples/component_specs/arguments/input_manifest.json @@ -0,0 +1,18 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_123", + "component_id": "component_1", + "cache_key": "00" + }, + "index": { + "location": "/component_1" + }, + "fields": { + "data": { + "type": "binary", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/component/examples/component_specs/component.yaml b/tests/component/examples/component_specs/component.yaml new file mode 100644 index 000000000..973cc3e6b --- /dev/null +++ b/tests/component/examples/component_specs/component.yaml @@ -0,0 +1,23 @@ +name: Example component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + +produces: + images_data: + type: array + items: + type: float32 +additionalFields: false + + +args: + flag: + description: user argument + type: str + value: + description: integer value + type: int diff --git a/tests/component/examples/component_specs/input_manifest.json b/tests/component/examples/component_specs/input_manifest.json new file mode 100644 index 000000000..80fa0b91d --- /dev/null +++ b/tests/component/examples/component_specs/input_manifest.json @@ -0,0 +1,17 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "/bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": 
"/example_component" + }, + "fields": { + "data": { + "location": "/example_component", + "type": "binary" + } + } +} \ No newline at end of file diff --git a/tests/component/examples/data/components/1.yaml b/tests/component/examples/data/components/1.yaml new file mode 100644 index 000000000..95e5e578f --- /dev/null +++ b/tests/component/examples/data/components/1.yaml @@ -0,0 +1,29 @@ +name: Test component 1 +description: This is an example component +image: example_component:latest + +consumes: + Name: + type: "string" + HP: + type: "int32" + + Type 1: + type: "string" + Type 2: + type: "string" + +produces: + Name: + type: "string" + HP: + type: "int32" + Type 1: + type: "string" + Type 2: + type: "string" + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/component/examples/data/manifest.json b/tests/component/examples/data/manifest.json new file mode 100644 index 000000000..cc579fef1 --- /dev/null +++ b/tests/component/examples/data/manifest.json @@ -0,0 +1,29 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "tests/component/examples/data", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component_1" + }, + "fields": { + "Name": { + "type": "string", + "location": "/component_1" + }, + "HP": { + "type": "int32", + "location": "/component_1" + }, + "Type 1": { + "type": "string", + "location": "/component_2" + }, + "Type 2": { + "type": "string", + "location": "/component_2" + } + } +} \ No newline at end of file diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fa5d96dad64c5e6291eb03909f8542a6af2faf7b GIT binary patch literal 3542 zcmcInUu+{s8J|sTXOpvYz9Vba;48-5IoCExZ706gM-IAn;(wPoj&rdcLltJdYkT7L 
zu5Y~?$9IZ^Djq686_3CJLe&WhLI{Z`B&aHcctAYx!Ye|61OgrqPZdHH-^@DynnQv@ zE&pa`zVDmw?|id%S*pYZm+((s`1;g_APeX^LT|Y?rV&DETa)ns+vXH33Wp=^N{EM9 zRn=|I8=HY?ri0BM))h54>sK|^(7Bhnke@ulrotsdI}dGQTAIoQ{WWY_#x-9!t>J-X zkb%lYLm6zTW}m2>DQ>D3?i&i%%BO7$s4Y`fJ)pm)s*2{GnY)j>wvM?}C=CRYXc`xu zD-)%yjyU3;@fwz8aAlv5yE0e7mW3^MN`wh%!pszR`YJY!A-9II?wPO~)UJ22KH#SP zk|`Tq?o(o!baAs|bMvsB&`CGE15ph+raBaSUhuiY{p4neD29pk-+k@#+;`lwJOPL1 zIj{ert*Uxr9-&un%p>EP*V`p}oAZV;n%cs|90^_|KRYv5wY5Gun%)(4h*eVu-0saH zX}7ge-)^nr8JcfqZc9-JIB>(C(+s1>ZH22uQ~R3TQ@Mb@ zj7>b?9=oSqYV+1+9%^Ig*cz9~=owZ=K!PH47NE3#!6SG=chGG&vmm-z{9q^Xvk4ew z&U5wUuy+lI@^36`{C>53bMt}vO8r~C&;IfFbGKht$TvFQ{l4~>w_+fRQ1ownNbm-e z$m0)+t`hF4!bCt0{Cp9(~?BmhAk9XarloF*Z0OgZ_EkFWr8z2Eb4cGxZ19%be3BY|o7SI4J0qy~| z0sDX`py1eAr zu6*xD-wt?VPu}^%lVAVhOF#V0bFca_5B&w%`>QV`xW?I=$d_5}e&7!LeKPu0Z}cD2 zq5s14Q`7ntkMPFiNk6|f>EdV-CukBUA$S+S4e$V_06btCFax*(@B(~*S-@3*A8-wD z9q=K*96$gB03Qa3fFR%_fR6$~fExf>kdFbvfR6(r09u@zfLj1s2q-Zt@0k|)6OPDpV;Neg%T!>#uhag^xpVa*3ai)aeBg;Y`seG> zH?KmyF5CJy9z3mH!HpymJ%eUP?P0-#1n;d0r>SnCjS%=OxPSoXWIQ1X3dq4waP|Mv zg&*u;jDDc03w%Kg$WcDjPp36!wm*9x~*v-(ITp*>|dlgn)j_yp9)57oiq(ajexXn_xht#h9MuQ zO^ecdaMdf=!XW-QqUsV?_CFe7DKu4V8751!Mr0G4 zBbw#@2%GTYr-lzzb3hE8QsXO2v88wfQVN$4K-_mC9D_uG5j?JNygu z6PMN`N!kX`&wWW+k5-N;o}>WR{!oUdB4zDc%7S_XQzpv4A-Oz}WY9L42YvW08OZlm7O znxn#xmD_VgJ+n_Hl*VD=RawXZ!QN0q86UB#(S}Er6Dof{se3aA3eMw4x^W zN{5Xc#6W$@6sy(sJ!>mdOtq>Pe1AmcYs&+2>Q|$_VnMKh1AhuMXrq_?N%nPhmGxp!RyWRvk35DVzBReiyo(ZKg+0cjR8B(~PHL zYCWydm}vc!wY_nEF6R$?h?Neu39XG>tH$QpA zd>!@o^kS!}ciCA0oiHyRDkGMsM5A7Oglj3A<>6pGws9cYn>)o6jrGjOhSt84c74FX1j-5EO*;U$V3xhmn#sqCL*iM|?3>iqO@ediCC`@71ez%3L+dIGEqLnHSRqMr5d0DC#GU#c7J7QgTPZrs;9ksFlDhTOwQ#~ygk7}`*gO$pQf z0A~nDl~qjNW6PQ%4^fA{?VR;WvY{|@O9nC}2dv^utWVbw%} zra3?7ne!S0tk4e@GFTG_qVpGv83P?*1shI^qW?}|=eOs}nmRI_-wRZcE{muy(H|`A zD%w!jJl~+tgJ0z1wMIw6Q5jUlQqu-V~U>^VR}9fXG|3 z?p=(X@3AR7>}YC-{s)!0B{_^6w?1$Tx^yM5D?!vEHfV1ksc56V3GsY+9`#LSME}N{ zMFaI^W*(~@WY7=1JF0w4f1XVN!6z8gZ+o*NU3I>_xosexACW|E%PM5jU?|A1v;^He 
z<6>NaHEMC@f+W^K2nX%hFQ=dvY1d1eL2n8LvOicV{9&VfySRrRHs0^O@#oVw7oUpq zbLqnu%3pr2dqJ5Bzv~GwUVof&&HGtL3H2~DBZY}qe`J&6d@1;Zdx!nglSj`Y zKMus+S^Rh*_B0_n*Glr!kA5}v%};m=x`+ypt4rl*@;E0_xIAvuZK=gFM^qjB1CRXVqqeVbLoE?Nguw_;BYb$ zvcuCQ2xz>!P_}&4lP&ufKeB4BtrFeI{ho_2ZTE^3VE)Q>Y}xmy{qN02x^BEL>Fl}MwjcO!SBA)p{Mc716wb~k!=C3H%$Uanf*so%4!lKq?e zE2Kra6vuHD0BPGCx5_0tT$nEH@FYeO zMe=|#$EAj#pLi|TQwFU}ZM6|A9r4+k$*VG3K zBt;nI)^ceR*2X2izEAWAV*R*ZNOE$krzm`;+U9#{v$YF)nRt}X42hj#6V^8yrM8$= zhSpp%ALV-r*sm%%dDsDaQNEFgH&&`~Aw%qPG9Sx7YSl{-dp@d&nKam_O1VR=gGYIV z@0E^P8Q24fDVeX;wwmTnGQZxgO~ige^y@2q`8=*a~6=y*{wwgSc~Dsbbkt6B6I6N+T;&g9h?EX1GRX~JuSJ^msw8H%&U4n@9TFZde386cxwy&+s@ryi zOd8Hi`s8w+Cj6*1Q^+cp^Q`k~C04TcXSN`Jus+f}KOe{I4TvMxD~($9bcA2Yk@(2t`Q-R&RBrv@%K;7VVQ# zwq*OHPZh`G1SeTif%I1WSwoK0EVpYJ>=5Trr-CiC=50?91X$~k`yfl|8CvDS$X414 zGI4evSPLx`@MN8FTVgG-RPFKo`xG10k@0AN&1FH?m#qseIKHrg_EqNJfL{srA60^X Kv&!%v+&=+gWo-8V literal 0 HcmV?d00001 diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d226a424901fa9ff3679ae88c43487366d1b2722 GIT binary patch literal 3584 zcmcInYiuJ|6`o1#tex1bTRhfSM96EJXsafv?IhdHDl0K|;#cg%aT42cx1@I-_S|?L z-g(4!meBGpEiANMo~&Yf{y%~pa! zNB-{PoO91P_ndp~j4NC<$#|F-{mgGKl^B_!9;K*Xd2U^zD5|C_mTvHV7_KUkU>Uk7 zd4p_UYPSX3iUt>KV2=SeWAnNu(aUTZ8F~+O=qG3L*iv;(k)&~o3I+p$j5PWKu@V+K z*s`@YJs(=rTP^IL2-GF%FgU?h<$qHfaHu9r~PK&BTItzgU2X@6ifXBbVEUXdTNq0 zEG*cXPJbiZL|x*+m2esK86ylN z)mwi#diLsF5r0em-Vc<&-tW%*4y31l=$~W)(M4)36lFc@NR^n8h#dLTc;t2e^gl06 z``*c1bOw$i#f(tUgrK7+m_I+t{E;p(3Fmk|pgmV9KTR$BFL|iFppQxcng9+^3Hm&k zaNi=_4+7;Wz$_pOSObIsp8zBP9|F7vcpPvO@O03(ybmVHBf1^&qsBB(!^HaD8gf`) zcsPnsS06$J<`b{Gq4Qpjd6K_mJz=x5RG58r2j~L^fJ49?z()Z`fR6#513VA-IDn+! 
zlYmbFJ`MN`;In`i0G|WA2zUwbdB7I{cL84ndpSMKzT?!Av|Xfvf_HV|?=fexheaoQ_kEEE-svB`fBN^AA$u2Xy@Ll2%U-m!NU>w4 zAWJG@#waE*HR2Z56jho8pBWDj;M5EUSfnCybTYd9f2_p!_JB^mrvZ%IWh3&mFY>cT zp=hHs>{u6RA~M;Zd+x-sCQAK1L04>5Gvha6Psgzse`Y2YcS(B=UvfBi;u_p#Afwre z0un7ODdOIFnyY!w8gz+Z+}5xFug*9~Yqr`QoG^5LA6~13u1XdXk%d6~PF&JJ9Nu1J z?ap>~_Imv2=+v3>Uc|}GN;I7F%@c)oJcl1qD=9jsZ` z7KrTm{hpA|Z1;i@i2Q}^xU~P#h!a9XLh#YzgjvD70Urk>%3XMo!|Pn)-^&+ zF3ct7l5q$@d}#U%Gc}i-n~xt+Y=Em)H|jUsx@7<5dy+g!ZiVBxO#pe;IBtPUx4A6G zwYf~&p{3L3_&q{gJ>lm!GwvK(M~lNRa9odrbj!+kX7O4XZ{Rc!^iDxZn1xJnpHKCZe0{Nn>WiIbLso=A;aVYU!P-TcukR3fTdeoZ zQij7#RZ;j{wZ*GhtGNYoxy2-(>k&P@U0A={SZ|4WrRVT6#U!sPpuegVaIX#el6)h* z*qE;_3OS;e!+ffEyIEgPxcsCd=CYupDi`+kwlpXzyt;nS%)uUrPnlw^wz_L=WQt3z z+Bx43iF|$D#3z1biW|H2+7DhtV_gw#kYWKSf2;>zwYJ{RK$J|x!n zH!;Z#U#}&0>s49It)AqpwbdhWJhesg?&RWPOiayHLh zUfWEqyZf`6kUvV^a~d^)8=UaCDUMLcg?M#x1wfa-!c}7LdV#}fEiLZK$7|shotJFV>@>2I&R{|9^14^=VyDI@!0hz zj#EWq*#)XtvSLx83bA0xk_BRi!~(Hk!Kz5GVB19(EI98?;yP(6fmB95^SSrF^X@(G z-6wDG*#K)}zp}88EopX|Wu_Ts>-s3eFo~AfZrkt@*IKG&YD*A1YHI}Fk#5C~N2bxL zi@Iezw$aeq4Z{J`s;A1lUx@2{&8S$$ zZ=Wb&pa^Id+2x^17G~|{XxAt$6>Jw3qQ=?F&qCL~pSFyQLDb~5c7yp0R^OA+lO+Uc z3C}Ffzb(EeHs2p3o~0K^Pc7p!JNsxP4N=LeJs7cBm@x}u9Xn%VM$S1H8!!skfir*u z7z4(Ev%my!4tNbX4@?46z%=kWZ~^sGIaOaFdtV`hCzy;FExzy0UI-PwnV_C@vUZ}h*uGpF`_KH*_)hr7$F zDRfyLI-FmQ`o10aJ+*uOLk6da5SVEjyPhks~u4hJ0imGbE<)GoEv0pZYw^&%pDv&Pv&?QoOx^!j$@e&+5c@e zZ5+0k^msO;%7Zy1by%Uv8+^%p)c$Jc!$Q1Jn`t{Sk7jh17D?=JjR)o-iD^+((UvN9 z8ZD~2u1PJi)hD7i`(mrr=n=AAwbj-d2C)Vf7yJtW4r*|RQx7?V3xS0YcfhzN_-yv3 zaMkQf*C%6F$Q7u>c%I(`$aRC~7x_qqkMn$mk5;HH5;^AY5##E>pX8(FH|~^$`jb50 z<1r3TLJ}#G2fi_~jrhn%$-3SxCGv}fU}jfZ%XcJWUAw78B-nS7y1$)_rfO1fFCYow zvM7XiN`cn4#{RY}N^_^u-CwbM~vE7f&fN@UAYJ>DtZLR=yokP@Jf1;SII4Q)NefB+(5sQ&&?$5v^C-% zv@Tvrt0nXu+-d)z(GWt6^or0ynHorzVOWi$iP4JGPhV*iUnIuEJqnt28o$Y7H29FG zY$k_h9t_&doth7&j?jZM!AGvZ z>X?{gtNYwdK6R!+b+p#^oi3U<9@iDbcndK$!16ZW z9l*PQs{jTN1;hZGfH)ulU;#-02S@>~0n&gBAPd+6@1^z<@<5ta|ju1X~M`|d#K)MNX`r^c3n_nc-0mxv)L=Q|ZU_mm3SM~?| 
z8?UZMUg3uPe}(h@Fyobj9`7%wFA#)((dRY=Ll!!|?_75%)sU5gU8$vYb)!RFnY&J@ zGIeutj1P%u0i~Q#;LtJ#0!h%Xr#Pch)67o%cvrqx?Sb7A=@9;WHtdY3JtE=sw@4JChFP! zDhoBBoMNd`DPFO*VyX37=|t{_h|e!~)L~w+R8c8MHEuJmj?Mcyzr-d$r^MlCZZaP+ zEA;vv)Pm{*4Z=O8v>w87VLr~th9snKAN#E;**L;gMp5WTEf~eeY@VO zS>+G*%JfV8MzkU7g@zO!owL*xc%(@Y*CiSyOG;{^njgx^lqu?x?dMCO3SVf*Y<#$m zwe23#wp-!D8qy8#1-w2l<|FWV9A}(9uwO#irtKe-hbbC)IvabrHf6|z)3g05p9qP| z9Lf*0Ixs6nsv#MBcz-}`@SfV2ZH?(_Idv%T>s_38GagDenQmq~1v)e8vmC3MWMoZ| zUcoUk>dHa7nNe$bhDEJ0;K3{ENk&1p%|X90jG2ReMW1;NcUltphph{*sMT0)5AL}C zpwWPWjQEP;LD?n*Sp!2n0Zk08i2e8#vzQ7p9`8{IlfnEv1EV1x{2TD2!Tuw5;df^a{t@^G DkXjW| literal 0 HcmV?d00001 diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1ae8001c02ac7c803353728bb9e7bb919d774ecc GIT binary patch literal 3066 zcmcImOKcle6dgN`;}l0NsAFtoiO6cLF4AB-acHV8I(A~muHz3_PcY2pl8a%OSzVD@4NV%vd$J)ko6f;ucfD=Mwx|6G zt)jLxqhj+M&ej_>LAM<^yinJgb;CAM;Dwgb7{aK9W>-=*X#lUWcEvVyBqb>hd?d8S zpldQ$msXwlnrR4?y1_E+9Ge+qRrV0;WS#yg=GY)j3RgdMqR$6d8K4l<7iCYFxI zxN^CCdH9N4o^hsc-##<^-47jx3Emj+vuQ#Gc{zFG)IuLPB7KqLV-QHimx!bW144t-*NcW6;sN$A$Zxs<*#osp1C42Cz!QdAT z|J2@u`xbK7$=(^rKvuG_HwPRtGi+m&;UNc89&s@hKm|0Q3akP;Py-C04w%3NpaC>T zT#gpW&?XtyVCewsKo{5mHh~v`E#M{KW#A(43UCQ{6?hGJ9e4w{47>@v1zZ8%2Cf3{ z0Ph0t0oQ={fe(NWfscTXM_i|_Lp^eRVm-(L%B*@{a<6O0<8{f~F&^gFK2|s*-&*_2 zdGPe8dq(ijeKwZ)_0+|I&Aq2BsZ|KpB1_9Mmw|8o0T$M$Zs zYcgGZR9L5$JXbwrdkMVD80+pNqhEgUci473eRs>@0M z(MG#wc6*MxSj8zy>@}q&$U;j%_+~&c5ROBGj9p$z96u4*+PY6>H;aIEijf?8yIbR)xldwR6jijL z2(5a9s;+BdLuhn~=w?@FH0m9aY)xr2wYov9;mPsPcsKwx0^3vfIV0oY@rl3|<8^bn z+*$sl)t26i!_SZ_T$$lGZV@2Y8IGIeVihjIaTP9Jp|)6Tm%l@dvps)`i(B8g(=61V z;u_Afk8zY;Dk3E~J>FrW7KUs)m*TlJeOEl)Svsz4qeKnF7#meZewoC#Wq!SxiE~=1rt4xdR~BoDR_Pq#lF_i3?2tU2 zBE}aB*|I#Rcc?F(4vRG%`Ezr{|0EEZzX!bC1AB}raR6C>#hB|aOn{9#>ACXgei zrmFRd(oO4PExT4qq6We#p3diIi>z4k5@dstQ2B;YGzj3m-imO 
zluSUU#6~|inU9*q$n+{|A@xCna7W2chiG2Rmol;;iRjy<{#FURkt0;>>9RhZiIX+h z(UH^gR2!~q)mt)4{Q7Dk@({lfQ^i_NmBRgV=Gy{KG+Cx~iAKI9&FTG`Z^(0LQ>;mr zpU;Pid`^{AAw(_dG^x>`z4gETK;`_n4+;5$;Kb9 zO#wVowQRSt@kNtCF!w_XldjdwNZ_G>()C=?#IOZd2aX*-)m+wdc>J zp);r5=UCDuBWsG0Nt$C`n_SOS=e6=8mn5yR6NgvQlbk|sGdumpapnU0m3%IyXs68) zf3J1%N?I+cui{So4~>QpVx(7$4$4-MJ<~xNy P3)YX}@uzbE|N8#{@S7Po literal 0 HcmV?d00001 diff --git a/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt new file mode 100644 index 000000000..4a9ff8afc --- /dev/null +++ b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt @@ -0,0 +1 @@ +tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file diff --git a/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json new file mode 100644 index 000000000..47c2fe949 --- /dev/null +++ b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_2023", + "component_id": "component_1", + "cache_key": "42" + }, + "index": { + "location": "/component_1" + }, + "fields": + { + "data": { + "type": "binary", + "location": "/component_1" + }, + "height": { + "type": "int32", + "location": "/component_1" + }, + "width": { + "type": "int32", + "location": "/component_1" + }, + "captions": { + "type": "string", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/test_component.py b/tests/component/test_component.py similarity index 99% rename from tests/test_component.py rename to tests/component/test_component.py index 
e5dcb3bc3..830ce2963 100644 --- a/tests/test_component.py +++ b/tests/component/test_component.py @@ -23,8 +23,8 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest, Metadata -components_path = Path(__file__).parent / "example_specs/components" -base_path = Path(__file__).parent / "example_specs/mock_base_path" +components_path = Path(__file__).parent / "examples/component_specs" +base_path = Path(__file__).parent / "examples/mock_base_path" N_PARTITIONS = 2 diff --git a/tests/test_data_io.py b/tests/component/test_data_io.py similarity index 61% rename from tests/test_data_io.py rename to tests/component/test_data_io.py index 9ade4a329..30a4b7c10 100644 --- a/tests/test_data_io.py +++ b/tests/component/test_data_io.py @@ -8,8 +8,10 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -manifest_path = Path(__file__).parent / "example_data/manifest.json" -component_spec_path = Path(__file__).parent / "example_data/components/1.yaml" +manifest_path = Path(__file__).parent / "examples/data/manifest.json" +component_spec_path = ( + Path(__file__).parent / "examples/data/components/1.yaml" +) NUMBER_OF_TEST_ROWS = 151 @@ -37,33 +39,16 @@ def dataframe(manifest, component_spec): return data_loader.load_dataframe() -def test_load_index(manifest, component_spec): - """Test the loading of just the index.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - index_df = data_loader._load_index() - assert len(index_df) == NUMBER_OF_TEST_ROWS - assert index_df.index.name == "id" - - -def test_load_subset(manifest, component_spec): - """Test the loading of one field of a subset.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - subset_df = data_loader._load_subset(subset_name="types", fields=["Type 1"]) - assert len(subset_df) == NUMBER_OF_TEST_ROWS - assert list(subset_df.columns) == ["types_Type 1"] - assert 
subset_df.index.name == "id" - - def test_load_dataframe(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader(manifest=manifest, component_spec=component_spec) dataframe = dl.load_dataframe() assert len(dataframe) == NUMBER_OF_TEST_ROWS assert list(dataframe.columns) == [ - "properties_Name", - "properties_HP", - "types_Type 1", - "types_Type 2", + "Name", + "HP", + "Type 1", + "Type 2", ] assert dataframe.index.name == "id" @@ -78,7 +63,7 @@ def test_load_dataframe_default(manifest, component_spec): def test_load_dataframe_rows(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader( manifest=manifest, component_spec=component_spec, @@ -89,34 +74,7 @@ def test_load_dataframe_rows(manifest, component_spec): assert dataframe.npartitions == expected_partitions -def test_write_index( - tmp_path_factory, - dataframe, - manifest, - component_spec, - dask_client, -): - """Test writing out the index.""" - with tmp_path_factory.mktemp("temp") as fn: - # override the base path of the manifest with the temp dir - manifest.update_metadata("base_path", str(fn)) - data_writer = DaskDataWriter( - manifest=manifest, - component_spec=component_spec, - ) - # write out index to temp dir - data_writer.write_dataframe(dataframe, dask_client) - number_workers = os.cpu_count() - # read written data and assert - dataframe = dd.read_parquet(fn / "index") - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert dataframe.index.name == "id" - assert dataframe.npartitions in list( - range(number_workers - 1, number_workers + 2), - ) - - -def test_write_subsets( +def test_write_dataset( tmp_path_factory, dataframe, manifest, @@ -125,11 +83,7 @@ def test_write_subsets( ): """Test writing out subsets.""" # Dictionary 
specifying the expected subsets to write and their column names - subset_columns_dict = { - "index": [], - "properties": ["Name", "HP"], - "types": ["Type 1", "Type 2"], - } + columns = ["Name", "HP", "Type 1", "Type 2"] with tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) @@ -137,13 +91,13 @@ def test_write_subsets( # write dataframe to temp dir data_writer.write_dataframe(dataframe, dask_client) # read written data and assert - for subset, subset_columns in subset_columns_dict.items(): - dataframe = dd.read_parquet(fn / subset) - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert list(dataframe.columns) == subset_columns - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert len(dataframe) == NUMBER_OF_TEST_ROWS + assert list(dataframe.columns) == columns + assert dataframe.index.name == "id" +# TODO: check if this is still needed? def test_write_reset_index( tmp_path_factory, dataframe, @@ -151,7 +105,7 @@ def test_write_reset_index( component_spec, dask_client, ): - """Test writing out the index and subsets that have no dask index and checking + """Test writing out the index and fields that have no dask index and checking if the id index was created. 
""" dataframe = dataframe.reset_index(drop=True) @@ -160,10 +114,8 @@ def test_write_reset_index( data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) data_writer.write_dataframe(dataframe, dask_client) - - for subset in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / subset) - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" @pytest.mark.parametrize("partitions", list(range(1, 5))) @@ -189,29 +141,51 @@ def test_write_divisions( # noqa: PLR0913 data_writer.write_dataframe(dataframe, dask_client) - for target in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / target) - assert dataframe.index.name == "id" - assert dataframe.npartitions == partitions + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" + assert dataframe.npartitions == partitions + + +def test_write_fields_invalid( + tmp_path_factory, + dataframe, + manifest, + component_spec, + dask_client, +): + """Test writing out fields but the dataframe columns are incomplete.""" + with tmp_path_factory.mktemp("temp") as fn: + # override the base path of the manifest with the temp dir + manifest.update_metadata("base_path", str(fn)) + # Drop one of the columns required in the output + dataframe = dataframe.drop(["Type 2"], axis=1) + data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) + expected_error_msg = ( + r"Fields \['Type 2'\] defined in output dataset " + r"but not found in dataframe" + ) + with pytest.raises(ValueError, match=expected_error_msg): + data_writer.write_dataframe(dataframe, dask_client) -def test_write_subsets_invalid( +def test_write_fields_invalid_several_fields_missing( tmp_path_factory, dataframe, manifest, component_spec, dask_client, ): - """Test writing out subsets but the dataframe columns are incomplete.""" + """Test writing out fields but the dataframe columns are incomplete.""" with 
tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) # Drop one of the columns required in the output - dataframe = dataframe.drop(["types_Type 2"], axis=1) + dataframe = dataframe.drop(["Type 1"], axis=1) + dataframe = dataframe.drop(["Type 2"], axis=1) data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) expected_error_msg = ( - r"Field \['types_Type 2'\] not in index defined in output subset " - r"types but not found in dataframe" + r"Fields \['Type 1', 'Type 2'\] defined in output dataset " + r"but not found in dataframe" ) with pytest.raises(ValueError, match=expected_error_msg): data_writer.write_dataframe(dataframe, dask_client) diff --git a/tests/core/test_manifest.py b/tests/core/test_manifest.py index 0b255b9df..c24d27c9c 100644 --- a/tests/core/test_manifest.py +++ b/tests/core/test_manifest.py @@ -1,6 +1,5 @@ import json import pkgutil -from collections import OrderedDict from pathlib import Path import pytest @@ -226,21 +225,3 @@ def test_fields(): # delete a field manifest.remove_field(name="field_1") assert "field_1" not in manifest.fields - - -def test_field_mapping(valid_manifest): - """Test field mapping generation.""" - manifest = Manifest(valid_manifest) - manifest.add_or_update_field(Field(name="index", location="component2")) - field_mapping = manifest.field_mapping - assert field_mapping == OrderedDict( - { - "gs://bucket/test_pipeline/test_pipeline_12345/component2": [ - "id", - "height", - "width", - ], - "gs://bucket/test_pipeline/test_pipeline_12345/component1": ["images"], - "gs://bucket/test_pipeline/test_pipeline_12345/component3": ["caption"], - }, - ) diff --git a/tests/examples/example_data/raw/split.py b/tests/examples/example_data/raw/split.py index 6800ee323..ade466125 100644 --- a/tests/examples/example_data/raw/split.py +++ b/tests/examples/example_data/raw/split.py @@ -13,7 +13,7 @@ import dask.dataframe as dd 
data_path = Path(__file__).parent -output_path = Path(__file__).parent.parent / "subsets_input/" +output_path = Path(__file__).parent.parent def split_into_subsets(): @@ -22,17 +22,13 @@ def split_into_subsets(): master_df = master_df.set_index("id", sorted=True) master_df = master_df.repartition(divisions=[0, 50, 100, 151], force=True) - # create index subset - index_df = master_df.index.to_frame().drop(columns=["id"]) - index_df.to_parquet(output_path / "index") - # create properties subset properties_df = master_df[["Name", "HP"]] - properties_df.to_parquet(output_path / "properties") + properties_df.to_parquet(output_path / "component_1") # create types subset types_df = master_df[["Type 1", "Type 2"]] - types_df.to_parquet(output_path / "types") + types_df.to_parquet(output_path / "component_2") if __name__ == "__main__": From e4eadf3ddb59925dd2d86c66abf2d70513ce3eb7 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 24 Nov 2023 08:46:50 +0100 Subject: [PATCH 30/34] Use new data format (#667) This PR applies the usage of the new data format: - fixes all tests - update component specifications and component code - remove subset field usage in `pipeline.py` --------- Co-authored-by: Robbe Sneyders --- components/caption_images/README.md | 6 +- .../caption_images/fondant_component.yaml | 12 +-- components/caption_images/src/main.py | 4 +- components/chunk_text/README.md | 8 +- components/chunk_text/fondant_component.yaml | 16 ++-- components/chunk_text/src/main.py | 7 +- .../chunk_text/tests/chunk_text_test.py | 6 +- components/download_images/README.md | 10 +-- .../download_images/fondant_component.yaml | 24 +++--- components/download_images/src/main.py | 5 +- .../download_images/tests/test_component.py | 8 +- components/embed_images/README.md | 6 +- .../embed_images/fondant_component.yaml | 18 ++--- components/embed_images/src/main.py | 4 +- components/embed_text/README.md | 8 +- components/embed_text/fondant_component.yaml | 22 +++--- 
components/embed_text/src/main.py | 4 +- .../embedding_based_laion_retrieval/README.md | 6 +- .../fondant_component.yaml | 18 ++--- .../src/main.py | 6 +- components/filter_image_resolution/README.md | 5 +- .../fondant_component.yaml | 10 +-- .../filter_image_resolution/src/main.py | 4 +- components/filter_text_length/README.md | 3 +- .../filter_text_length/fondant_component.yaml | 6 +- components/filter_text_length/src/main.py | 4 +- .../tests/text_length_filter_test.py | 2 +- components/image_cropping/README.md | 10 +-- .../image_cropping/fondant_component.yaml | 20 ++--- components/image_cropping/src/main.py | 4 +- .../image_resolution_extraction/README.md | 10 +-- .../fondant_component.yaml | 20 ++--- .../image_resolution_extraction/src/main.py | 7 +- components/index_weaviate/README.md | 5 +- .../index_weaviate/fondant_component.yaml | 14 ++-- components/language_filter/README.md | 3 +- .../language_filter/fondant_component.yaml | 6 +- components/language_filter/src/main.py | 2 +- components/load_from_files/README.md | 5 +- .../load_from_files/fondant_component.yaml | 12 ++- components/load_from_hf_hub/README.md | 3 +- .../load_from_hf_hub/fondant_component.yaml | 10 +-- components/load_from_hf_hub/src/main.py | 25 +++---- components/load_from_parquet/README.md | 3 +- .../load_from_parquet/fondant_component.yaml | 6 +- components/load_from_parquet/src/main.py | 25 +++---- components/minhash_generator/README.md | 6 +- .../minhash_generator/fondant_component.yaml | 16 ++-- components/minhash_generator/src/main.py | 4 +- components/normalize_text/README.md | 3 +- .../normalize_text/fondant_component.yaml | 6 +- components/normalize_text/src/main.py | 12 +-- .../prompt_based_laion_retrieval/README.md | 6 +- .../fondant_component.yaml | 14 ++-- .../prompt_based_laion_retrieval/src/main.py | 6 +- components/resize_images/README.md | 6 +- .../resize_images/fondant_component.yaml | 12 +-- components/resize_images/src/main.py | 2 +- components/segment_images/README.md 
| 6 +- .../segment_images/fondant_component.yaml | 12 +-- components/segment_images/src/main.py | 2 +- components/write_to_hf_hub/README.md | 3 +- .../write_to_hf_hub/fondant_component.yaml | 8 +- components/write_to_hf_hub/src/main.py | 19 +++-- scripts/component_readme/readme_template.md | 14 +--- src/fondant/core/manifest.py | 8 +- src/fondant/core/schema.py | 3 +- src/fondant/pipeline/pipeline.py | 56 ++++++-------- tests/component/test_data_io.py | 4 +- tests/examples/example_data/components/1.yaml | 35 --------- tests/examples/example_data/manifest.json | 35 --------- tests/examples/example_data/raw/split.py | 35 --------- .../examples/example_data/raw/testset.parquet | Bin 15048 -> 0 bytes .../subsets_input/index/part.0.parquet | Bin 1701 -> 0 bytes .../subsets_input/index/part.1.parquet | Bin 1707 -> 0 bytes .../subsets_input/index/part.2.parquet | Bin 1715 -> 0 bytes .../subsets_input/properties/part.0.parquet | Bin 3542 -> 0 bytes .../subsets_input/properties/part.1.parquet | Bin 3526 -> 0 bytes .../subsets_input/properties/part.2.parquet | Bin 3584 -> 0 bytes .../subsets_input/types/part.0.parquet | Bin 3018 -> 0 bytes .../subsets_input/types/part.1.parquet | Bin 3085 -> 0 bytes .../subsets_input/types/part.2.parquet | Bin 3066 -> 0 bytes .../second_component/fondant_component.yaml | 27 ------- .../second_component/fondant_component.yaml | 29 -------- .../second_component/fondant_component.yaml | 27 ------- .../fourth_component/fondant_component.yaml | 38 ---------- .../third_component/fondant_component.yaml | 33 --------- .../components/arguments/component.yaml | 68 ----------------- .../arguments/component_default_args.yaml | 69 ------------------ .../components/arguments/input_manifest.json | 18 ----- .../components/input_manifest.json | 17 ----- .../example_pipeline/cache/42.txt | 1 - .../component_1/manifest.json | 31 -------- .../component_2/manifest.json | 36 --------- .../component_1/manifest.json | 36 --------- .../component_2/manifest.json | 36 
--------- .../components/dummy_component/Dockerfile | 0 .../components/dummy_component/README.md | 0 .../dummy_component/fondant_component.yaml | 8 +- .../dummy_component/requirements.txt | 0 .../components/dummy_component/src/main.py | 0 .../load_from_parquet/fondant_component.yaml | 4 +- .../sample_pipeline_test/data/sample.parquet | Bin .../test_sample_pipeline.py | 6 +- .../compiled_pipeline/kubeflow_pipeline.yml | 0 .../first_component/fondant_component.yaml | 12 +-- .../second_component/fondant_component.yaml | 19 +++-- .../first_component/fondant_component.yaml | 17 ++--- .../second_component/fondant_component.yaml | 24 ++++++ .../first_component/fondant_component.yaml | 18 ++--- .../second_component/fondant_component.yaml | 21 ++++++ .../example_1/first_component/Dockerfile | 0 .../first_component/fondant_component.yaml | 19 ++--- .../example_1/fourth_component/Dockerfile | 0 .../fourth_component/fondant_component.yaml | 29 ++++++++ .../example_1/second_component/Dockerfile | 0 .../second_component/fondant_component.yaml | 18 +++++ .../example_1/third_component/Dockerfile | 0 .../third_component/fondant_component.yaml} | 19 ++--- tests/{ => pipeline}/test_compiler.py | 4 +- tests/{ => pipeline}/test_pipeline.py | 4 +- tests/{ => pipeline}/test_runner.py | 2 +- tests/test_cli.py | 32 ++++---- tox.ini | 2 +- 124 files changed, 420 insertions(+), 1059 deletions(-) delete mode 100644 tests/examples/example_data/components/1.yaml delete mode 100644 tests/examples/example_data/manifest.json delete mode 100644 tests/examples/example_data/raw/split.py delete mode 100644 tests/examples/example_data/raw/testset.parquet delete mode 100644 tests/examples/example_data/subsets_input/index/part.0.parquet delete mode 100644 tests/examples/example_data/subsets_input/index/part.1.parquet delete mode 100644 tests/examples/example_data/subsets_input/index/part.2.parquet delete mode 100644 tests/examples/example_data/subsets_input/properties/part.0.parquet delete mode 100644 
tests/examples/example_data/subsets_input/properties/part.1.parquet delete mode 100644 tests/examples/example_data/subsets_input/properties/part.2.parquet delete mode 100644 tests/examples/example_data/subsets_input/types/part.0.parquet delete mode 100644 tests/examples/example_data/subsets_input/types/part.1.parquet delete mode 100644 tests/examples/example_data/subsets_input/types/part.2.parquet delete mode 100644 tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml delete mode 100644 tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml delete mode 100644 tests/examples/example_specs/components/arguments/component.yaml delete mode 100644 tests/examples/example_specs/components/arguments/component_default_args.yaml delete mode 100644 tests/examples/example_specs/components/arguments/input_manifest.json delete mode 100644 tests/examples/example_specs/components/input_manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json delete mode 100644 tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json rename tests/{ => 
integration_tests}/sample_pipeline_test/components/dummy_component/Dockerfile (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/README.md (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/fondant_component.yaml (73%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/requirements.txt (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/dummy_component/src/main.py (100%) rename tests/{ => integration_tests}/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml (95%) rename tests/{ => integration_tests}/sample_pipeline_test/data/sample.parquet (100%) rename tests/{ => integration_tests}/test_sample_pipeline.py (91%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/compiled_pipeline/kubeflow_pipeline.yml (100%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_1/first_component/fondant_component.yaml (62%) rename tests/{examples/example_pipelines/valid_pipeline => pipeline/examples/pipelines/invalid_pipeline}/example_1/second_component/fondant_component.yaml (55%) rename tests/{examples/example_pipelines/valid_pipeline/example_1 => pipeline/examples/pipelines/invalid_pipeline/example_2}/first_component/fondant_component.yaml (61%) create mode 100644 tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/invalid_pipeline/example_3/first_component/fondant_component.yaml (53%) create mode 100644 tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/first_component/Dockerfile (100%) rename tests/{examples/example_pipelines/invalid_pipeline/example_2 => 
pipeline/examples/pipelines/valid_pipeline/example_1}/first_component/fondant_component.yaml (50%) rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/fourth_component/Dockerfile (100%) create mode 100644 tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/second_component/Dockerfile (100%) create mode 100644 tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename tests/{examples/example_pipelines => pipeline/examples/pipelines}/valid_pipeline/example_1/third_component/Dockerfile (100%) rename tests/{examples/example_specs/components/component.yaml => pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml} (59%) rename tests/{ => pipeline}/test_compiler.py (99%) rename tests/{ => pipeline}/test_pipeline.py (98%) rename tests/{ => pipeline}/test_runner.py (98%) diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 8bb38e996..401747cbb 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -7,13 +7,11 @@ This component captions images using a BLIP model from the Hugging Face hub **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- captions - - text: string +- captions_text: string ### Arguments diff --git a/components/caption_images/fondant_component.yaml b/components/caption_images/fondant_component.yaml index 7a72cd815..3da8e4720 100644 --- a/components/caption_images/fondant_component.yaml +++ b/components/caption_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - text: - type: utf8 + captions_text: + type: utf8 args: 
model_id: diff --git a/components/caption_images/src/main.py b/components/caption_images/src/main.py index 934ea09ce..86be52b40 100644 --- a/components/caption_images/src/main.py +++ b/components/caption_images/src/main.py @@ -90,7 +90,7 @@ def __init__( self.max_new_tokens = max_new_tokens def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - images = dataframe["images"]["data"] + images = dataframe["images_data"] results: t.List[pd.Series] = [] for batch in np.split( @@ -112,4 +112,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ).T results.append(captions) - return pd.concat(results).to_frame(name=("captions", "text")) + return pd.concat(results).to_frame(name=("captions_text")) diff --git a/components/chunk_text/README.md b/components/chunk_text/README.md index 97b3309e0..a12d74980 100644 --- a/components/chunk_text/README.md +++ b/components/chunk_text/README.md @@ -11,14 +11,12 @@ consists of the id of the original document followed by the chunk index. 
**This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - data: string - - original_document_id: string +- text_data: string +- text_original_document_id: string ### Arguments diff --git a/components/chunk_text/fondant_component.yaml b/components/chunk_text/fondant_component.yaml index d266b4dac..159e67556 100644 --- a/components/chunk_text/fondant_component.yaml +++ b/components/chunk_text/fondant_component.yaml @@ -10,18 +10,14 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - data: - type: string - original_document_id: - type: string + text_data: + type: string + text_original_document_id: + type: string args: chunk_size: diff --git a/components/chunk_text/src/main.py b/components/chunk_text/src/main.py index 8c41220d2..da46cbbd7 100644 --- a/components/chunk_text/src/main.py +++ b/components/chunk_text/src/main.py @@ -38,7 +38,7 @@ def __init__( def chunk_text(self, row) -> t.List[t.Tuple]: # Multi-index df has id under the name attribute doc_id = row.name - text_data = row[("text", "data")] + text_data = row[("text_data")] docs = self.text_splitter.create_documents([text_data]) return [ (doc_id, f"{doc_id}_{chunk_id}", chunk.page_content) @@ -63,9 +63,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ) results_df = results_df.set_index("id") - # Set multi-index column for the expected subset and field - results_df.columns = pd.MultiIndex.from_product( - [["text"], results_df.columns], - ) - return results_df diff --git a/components/chunk_text/tests/chunk_text_test.py b/components/chunk_text/tests/chunk_text_test.py index a47683ed3..f95180f98 100644 --- a/components/chunk_text/tests/chunk_text_test.py +++ b/components/chunk_text/tests/chunk_text_test.py @@ -7,7 +7,7 @@ def test_transform(): """Test chunk component method.""" input_dataframe = pd.DataFrame( { - ("text", "data"): [ + ("text_data"): [ 
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo", "ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis", "parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec,", @@ -25,8 +25,8 @@ def test_transform(): expected_output_dataframe = pd.DataFrame( { - ("text", "original_document_id"): ["a", "a", "a", "b", "b", "c", "c"], - ("text", "data"): [ + ("text_original_document_id"): ["a", "a", "a", "b", "b", "c", "c"], + ("text_data"): [ "Lorem ipsum dolor sit amet, consectetuer", "amet, consectetuer adipiscing elit. Aenean", "elit. Aenean commodo", diff --git a/components/download_images/README.md b/components/download_images/README.md index b491007b5..6ed54d66d 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -14,15 +14,13 @@ from the img2dataset library. **This component consumes:** -- images - - url: string +- images_url: string **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index 1982a96ba..abe19c653 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -13,21 +13,17 @@ tags: - Image processing consumes: - images: - fields: - url: - type: string + images_url: + type: string produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 - additionalFields: false + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 +# additionalFields: false args: timeout: @@ -53,7 +49,7 @@ args: description: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". 
type: str default: 'border' - resize_only_if_bigger: + resize_only_if_bigger: description: If True, resize only if image is bigger than image_size. type: bool default: False diff --git a/components/download_images/src/main.py b/components/download_images/src/main.py index 8a37b86eb..070859e07 100644 --- a/components/download_images/src/main.py +++ b/components/download_images/src/main.py @@ -119,7 +119,7 @@ async def download_dataframe() -> None: images = await asyncio.gather( *[ self.download_and_resize_image(id_, url, semaphore=semaphore) - for id_, url in zip(dataframe.index, dataframe["images"]["url"]) + for id_, url in zip(dataframe.index, dataframe["images_url"]) ], ) results.extend(images) @@ -134,8 +134,5 @@ async def download_dataframe() -> None: results_df = results_df.dropna() results_df = results_df.set_index("id", drop=True) - results_df.columns = pd.MultiIndex.from_product( - [["images"], results_df.columns], - ) return results_df diff --git a/components/download_images/tests/test_component.py b/components/download_images/tests/test_component.py index 1f690e6e5..d851ecd73 100644 --- a/components/download_images/tests/test_component.py +++ b/components/download_images/tests/test_component.py @@ -45,7 +45,7 @@ def test_transform(respx_mock): input_dataframe = pd.DataFrame( { - ("images", "url"): urls, + "images_url": urls, }, index=pd.Index(ids, name="id"), ) @@ -55,9 +55,9 @@ def test_transform(respx_mock): resized_images = [component.resizer(io.BytesIO(image))[0] for image in images] expected_dataframe = pd.DataFrame( { - ("images", "data"): resized_images, - ("images", "width"): [image_size] * len(ids), - ("images", "height"): [image_size] * len(ids), + "images_data": resized_images, + "images_width": [image_size] * len(ids), + "images_height": [image_size] * len(ids), }, index=pd.Index(ids, name="id"), ) diff --git a/components/embed_images/README.md b/components/embed_images/README.md index eec02f577..23e746136 100644 --- 
a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -7,13 +7,11 @@ Component that generates CLIP embeddings from images **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- embeddings - - data: list +- embeddings_data: list ### Arguments diff --git a/components/embed_images/fondant_component.yaml b/components/embed_images/fondant_component.yaml index a176b2f6b..86fdb53a4 100644 --- a/components/embed_images/fondant_component.yaml +++ b/components/embed_images/fondant_component.yaml @@ -2,21 +2,17 @@ name: Embed images description: Component that generates CLIP embeddings from images image: fndnt/embed_images:dev tags: - - Image processing + - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: model_id: diff --git a/components/embed_images/src/main.py b/components/embed_images/src/main.py index 03c647dc0..a0270b1e8 100644 --- a/components/embed_images/src/main.py +++ b/components/embed_images/src/main.py @@ -90,7 +90,7 @@ def __init__( self.batch_size = batch_size def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - images = dataframe["images"]["data"] + images = dataframe["images_data"] results: t.List[pd.Series] = [] for batch in np.split( @@ -110,4 +110,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ).T results.append(embeddings) - return pd.concat(results).to_frame(name=("embeddings", "data")) + return pd.concat(results).to_frame(name=("embeddings_data")) diff --git a/components/embed_text/README.md b/components/embed_text/README.md index a30a9ec4f..c53a779b9 100644 --- a/components/embed_text/README.md +++ b/components/embed_text/README.md @@ -7,14 +7,12 @@ Component that generates embeddings of text passages. 
**This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - data: string - - embedding: list +- text_data: string +- text_embedding: list ### Arguments diff --git a/components/embed_text/fondant_component.yaml b/components/embed_text/fondant_component.yaml index 2e34c5c0a..a1a3ca816 100644 --- a/components/embed_text/fondant_component.yaml +++ b/components/embed_text/fondant_component.yaml @@ -5,21 +5,17 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 - + text_data: + type: string + text_embedding: + type: array + items: + type: float32 + args: model_provider: description: | diff --git a/components/embed_text/src/main.py b/components/embed_text/src/main.py index c8c2acfde..3fdc08e47 100644 --- a/components/embed_text/src/main.py +++ b/components/embed_text/src/main.py @@ -65,7 +65,7 @@ def get_embeddings_vectors(self, texts): return self.embedding_model.embed_documents(texts.tolist()) def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - dataframe[("text", "embedding")] = self.get_embeddings_vectors( - dataframe[("text", "data")], + dataframe["text_embedding"] = self.get_embeddings_vectors( + dataframe["text_data"], ) return dataframe diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index 454253416..f19d55b03 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -9,13 +9,11 @@ used to find images similar to the embedded images / captions. 
**This component consumes:** -- embeddings - - data: list +- embeddings_data: list **This component produces:** -- images - - url: string +- images_url: string ### Arguments diff --git a/components/embedding_based_laion_retrieval/fondant_component.yaml b/components/embedding_based_laion_retrieval/fondant_component.yaml index d93e634a3..af147c158 100644 --- a/components/embedding_based_laion_retrieval/fondant_component.yaml +++ b/components/embedding_based_laion_retrieval/fondant_component.yaml @@ -7,19 +7,15 @@ tags: - Data retrieval consumes: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 produces: - images: - fields: - url: - type: string - additionalSubsets: false + images_url: + type: string +# additionalFields: false args: num_images: diff --git a/components/embedding_based_laion_retrieval/src/main.py b/components/embedding_based_laion_retrieval/src/main.py index b350e6142..0f7697dc3 100644 --- a/components/embedding_based_laion_retrieval/src/main.py +++ b/components/embedding_based_laion_retrieval/src/main.py @@ -58,18 +58,18 @@ async def async_query(): embedding_input=embedding.tolist(), ), ) - for embedding in dataframe["embeddings"]["data"] + for embedding in dataframe["embeddings_data"] ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)[["id", "url"]] + results_df = pd.DataFrame(results)[["id", "url"]] results_df = results_df.set_index("id") # Cast the index to string results_df.index = results_df.index.astype(str) - results_df.columns = [["images"], ["url"]] + results_df.columns = ["images_url"] return results_df diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 1bc0c27f5..e7093e680 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -7,9 +7,8 @@ Component
that filters images based on minimum size and max aspect ratio **This component consumes:** -- images - - width: int32 - - height: int32 +- images_width: int32 +- images_height: int32 **This component produces no data.** diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index 0512d87f9..b6ff8cbe7 100644 --- a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -5,12 +5,10 @@ tags: - Image processing consumes: - images: - fields: - width: - type: int32 - height: - type: int32 + images_width: + type: int32 + images_height: + type: int32 args: min_image_dim: diff --git a/components/filter_image_resolution/src/main.py b/components/filter_image_resolution/src/main.py index 8fbfdfa77..b169196ec 100644 --- a/components/filter_image_resolution/src/main.py +++ b/components/filter_image_resolution/src/main.py @@ -23,8 +23,8 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio: float) -> None: self.max_aspect_ratio = max_aspect_ratio def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - width = dataframe["images"]["width"] - height = dataframe["images"]["height"] + width = dataframe["images_width"] + height = dataframe["images_height"] min_image_dim = np.minimum(width, height) max_image_dim = np.maximum(width, height) aspect_ratio = max_image_dim / min_image_dim diff --git a/components/filter_text_length/README.md b/components/filter_text_length/README.md index ed89dd128..4c5730180 100644 --- a/components/filter_text_length/README.md +++ b/components/filter_text_length/README.md @@ -7,8 +7,7 @@ A component that filters out text based on their length **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/filter_text_length/fondant_component.yaml b/components/filter_text_length/fondant_component.yaml index fee0fb242..2451f5981 
100644 --- a/components/filter_text_length/fondant_component.yaml +++ b/components/filter_text_length/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: min_characters_length: diff --git a/components/filter_text_length/src/main.py b/components/filter_text_length/src/main.py index 3e2f472a4..e3a6b0d61 100644 --- a/components/filter_text_length/src/main.py +++ b/components/filter_text_length/src/main.py @@ -23,10 +23,10 @@ def __init__(self, *_, min_characters_length: int, min_words_length: int): def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: """Filter out text based on their length.""" - caption_num_words = dataframe["text"]["data"].apply( + caption_num_words = dataframe["text_data"].apply( lambda x: len(fasttext.tokenize(x)), ) - caption_num_chars = dataframe["text"]["data"].apply(len) + caption_num_chars = dataframe["text_data"].apply(len) mask = (caption_num_words >= self.min_words_length) & ( caption_num_chars >= self.min_characters_length diff --git a/components/filter_text_length/tests/text_length_filter_test.py b/components/filter_text_length/tests/text_length_filter_test.py index eea98864e..55c927e79 100644 --- a/components/filter_text_length/tests/text_length_filter_test.py +++ b/components/filter_text_length/tests/text_length_filter_test.py @@ -24,6 +24,6 @@ def test_run_component_test(): # Then: dataframe only contains one row assert len(dataframe) == 1 assert ( - dataframe.loc[2]["text"]["data"] + dataframe.loc[2]["text_data"] == "This a valid sentence which should be still there" ) diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index 5d679c457..e59af3af6 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -22,15 +22,13 @@ right side is border-cropped image. 
**This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/image_cropping/fondant_component.yaml b/components/image_cropping/fondant_component.yaml index 416bc2c1d..130b14324 100644 --- a/components/image_cropping/fondant_component.yaml +++ b/components/image_cropping/fondant_component.yaml @@ -20,20 +20,16 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 args: cropping_threshold: diff --git a/components/image_cropping/src/main.py b/components/image_cropping/src/main.py index c670fdeb8..6a62e309c 100644 --- a/components/image_cropping/src/main.py +++ b/components/image_cropping/src/main.py @@ -46,12 +46,12 @@ def __init__( def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: # crop images - dataframe["images"]["data"] = dataframe["images"]["data"].apply( + dataframe["images_data"] = dataframe["images_data"].apply( lambda image: remove_borders(image, self.cropping_threshold, self.padding), ) # extract width and height - dataframe["images"][["width", "height"]] = dataframe["images"]["data"].apply( + dataframe[["images_width", "images_height"]] = dataframe["images_data"].apply( extract_dimensions, axis=1, result_type="expand", diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index a69a4df4e..77e11742d 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -7,15 +7,13 @@ Component that extracts image resolution data from the images **This component consumes:** -- images -
- data: binary +- images_data: binary **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/image_resolution_extraction/fondant_component.yaml b/components/image_resolution_extraction/fondant_component.yaml index 1ddbf4afb..f840da680 100644 --- a/components/image_resolution_extraction/fondant_component.yaml +++ b/components/image_resolution_extraction/fondant_component.yaml @@ -5,17 +5,13 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 \ No newline at end of file + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 \ No newline at end of file diff --git a/components/image_resolution_extraction/src/main.py b/components/image_resolution_extraction/src/main.py index 823b7b70f..a8715d831 100644 --- a/components/image_resolution_extraction/src/main.py +++ b/components/image_resolution_extraction/src/main.py @@ -38,8 +38,9 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: """ logger.info("Filtering dataset...") - dataframe[[("images", "width"), ("images", "height")]] = dataframe[ - [("images", "data")] - ].apply(lambda x: extract_dimensions(x.images.data), axis=1) + dataframe[["images_width", "images_height"]] = dataframe[["images_data"]].apply( + lambda x: extract_dimensions(x.images_data), + axis=1, + ) return dataframe diff --git a/components/index_weaviate/README.md b/components/index_weaviate/README.md index ce4729c52..efa6286a0 100644 --- a/components/index_weaviate/README.md +++ b/components/index_weaviate/README.md @@ -7,9 +7,8 @@ Component that takes embeddings of text snippets and indexes them into a weaviat **This component consumes:** -- text - - data: string - - embedding: list +- text_data: string +-
text_embedding: list **This component produces no data.** diff --git a/components/index_weaviate/fondant_component.yaml b/components/index_weaviate/fondant_component.yaml index d20d168fd..cb06ad683 100644 --- a/components/index_weaviate/fondant_component.yaml +++ b/components/index_weaviate/fondant_component.yaml @@ -5,14 +5,12 @@ tags: - Data writing consumes: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 + text_data: + type: string + text_embedding: + type: array + items: + type: float32 args: weaviate_url: diff --git a/components/language_filter/README.md b/components/language_filter/README.md index c3afd6435..3aebe1e26 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -7,8 +7,7 @@ A component that filters text based on the provided language. **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/language_filter/fondant_component.yaml b/components/language_filter/fondant_component.yaml index ab59a58be..3a98f27f7 100644 --- a/components/language_filter/fondant_component.yaml +++ b/components/language_filter/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: language: diff --git a/components/language_filter/src/main.py b/components/language_filter/src/main.py index f306512e4..4c753d1b4 100644 --- a/components/language_filter/src/main.py +++ b/components/language_filter/src/main.py @@ -38,7 +38,7 @@ def predict_lang(self, text: str): def is_language(self, row): """Predict if text of a row is written in the defined language.""" - return self.language in self.predict_lang(row["text"]) + return self.language in self.predict_lang(row["text_data"]) class LanguageFilterComponent(PandasTransformComponent): diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index 
834f568e5..9a618f176 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -11,9 +11,8 @@ location. It supports the following formats: .zip, gzip, tar and tar.gz. **This component produces:** -- file - - filename: string - - content: binary +- file_filename: string +- file_content: binary ### Arguments diff --git a/components/load_from_files/fondant_component.yaml b/components/load_from_files/fondant_component.yaml index 11416e5b5..2e0167b9d 100644 --- a/components/load_from_files/fondant_component.yaml +++ b/components/load_from_files/fondant_component.yaml @@ -7,13 +7,11 @@ tags: - Data loading produces: - file: - fields: - filename: - type: string - content: - type: binary - + file_filename: + type: string + file_content: + type: binary + args: directory_uri: description: Local or remote path to the directory containing the files diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index 1faa0175a..e14e6f440 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -9,8 +9,7 @@ Component that loads a dataset from the hub **This component produces:** -- dummy_variable - - data: binary +- dummy_variable: binary ### Arguments diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index d6a625971..7e72f2b22 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data loading produces: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: #TODO: fill in here + type: binary args: dataset_name: @@ -19,10 +17,10 @@ args: type: dict default: {} image_column_names: - description: Optional argument, a list containing the original image column names in case the + description: Optional argument, a list containing the original image column names in case the dataset on the 
hub contains them. Used to format the image from HF hub format to a byte string. type: list - default: [] + default: [ ] n_rows_to_load: description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale type: int diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index b978a96af..ccb2dd2ab 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -54,16 +54,12 @@ def get_columns_to_keep(self) -> t.List[str]: else: invert_column_name_mapping = {} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - column_name = f"{subset_name}_{field_name}" - if ( - invert_column_name_mapping - and column_name in invert_column_name_mapping - ): - columns.append(invert_column_name_mapping[column_name]) - else: - columns.append(column_name) + for field_name, field in self.spec.produces.items(): + column_name = field_name + if invert_column_name_mapping and column_name in invert_column_name_mapping: + columns.append(invert_column_name_mapping[column_name]) + else: + columns.append(column_name) if self.index_column is not None: columns.append(self.index_column) @@ -99,11 +95,10 @@ def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): def _get_meta_df() -> pd.DataFrame: meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) return pd.DataFrame(meta_dict).set_index("id") meta = _get_meta_df() diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index c83f7e9e8..d6bda66c3 100644 --- 
a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -9,8 +9,7 @@ Component that loads a dataset from a parquet uri **This component produces:** -- dummy_variable - - data: binary +- dummy_variable: binary ### Arguments diff --git a/components/load_from_parquet/fondant_component.yaml b/components/load_from_parquet/fondant_component.yaml index 5cc5796fa..894069c59 100644 --- a/components/load_from_parquet/fondant_component.yaml +++ b/components/load_from_parquet/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data loading produces: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: + type: binary args: dataset_uri: diff --git a/components/load_from_parquet/src/main.py b/components/load_from_parquet/src/main.py index ddd338552..117ae10ce 100644 --- a/components/load_from_parquet/src/main.py +++ b/components/load_from_parquet/src/main.py @@ -50,16 +50,12 @@ def get_columns_to_keep(self) -> t.List[str]: else: invert_column_name_mapping = {} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - column_name = f"{subset_name}_{field_name}" - if ( - invert_column_name_mapping - and column_name in invert_column_name_mapping - ): - columns.append(invert_column_name_mapping[column_name]) - else: - columns.append(column_name) + for field_name, field in self.spec.produces.items(): + column_name = field_name + if invert_column_name_mapping and column_name in invert_column_name_mapping: + columns.append(invert_column_name_mapping[column_name]) + else: + columns.append(column_name) if self.index_column is not None: columns.append(self.index_column) @@ -85,11 +81,10 @@ def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): def _get_meta_df() -> pd.DataFrame: meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - 
meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) return pd.DataFrame(meta_dict).set_index("id") meta = _get_meta_df() diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 422fdc7af..5fc4cb86e 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -7,13 +7,11 @@ A component that generates minhashes of text. **This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - minhash: list +- text_minhash: list ### Arguments diff --git a/components/minhash_generator/fondant_component.yaml b/components/minhash_generator/fondant_component.yaml index 6528112ef..1747982f8 100644 --- a/components/minhash_generator/fondant_component.yaml +++ b/components/minhash_generator/fondant_component.yaml @@ -5,18 +5,14 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - minhash: - type: array - items: - type: uint64 + text_minhash: + type: array + items: + type: uint64 args: shingle_ngram_size: description: Define size of ngram used for the shingle generation diff --git a/components/minhash_generator/src/main.py b/components/minhash_generator/src/main.py index c8034334b..f61e34fcb 100644 --- a/components/minhash_generator/src/main.py +++ b/components/minhash_generator/src/main.py @@ -51,10 +51,10 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - dataframe[("text", "shingles")] = dataframe[("text", "data")].apply( + dataframe["text_shingles"] = dataframe["text_data"].apply( create_shingles, ) - dataframe[("text", "minhash")] = dataframe[("text", "shingles")].apply( + dataframe["text_minhash"] = dataframe["text_shingles"].apply( 
compute_minhash, ) diff --git a/components/normalize_text/README.md b/components/normalize_text/README.md index edc955a79..3609ba0de 100644 --- a/components/normalize_text/README.md +++ b/components/normalize_text/README.md @@ -19,8 +19,7 @@ the training of large language models. **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/normalize_text/fondant_component.yaml b/components/normalize_text/fondant_component.yaml index d6551f578..fd9cfc4cb 100644 --- a/components/normalize_text/fondant_component.yaml +++ b/components/normalize_text/fondant_component.yaml @@ -17,10 +17,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: remove_additional_whitespaces: diff --git a/components/normalize_text/src/main.py b/components/normalize_text/src/main.py index 47220fba4..a98b7b36b 100644 --- a/components/normalize_text/src/main.py +++ b/components/normalize_text/src/main.py @@ -89,31 +89,31 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Pandas dataframe """ if self.normalize_lines: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( normalize_lines, ) if self.do_lowercase: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( lambda x: x.lower(), ) if self.apply_nfc: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( self._do_nfc_normalization, ) if self.remove_punctuation: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( _remove_punctuation, ) if self.remove_additional_whitespaces: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( 
_remove_additional_whitespaces, ) # remove all empty rows - dataframe = dataframe[dataframe[("text", "data")].astype(bool)] + dataframe = dataframe[dataframe["text_data"].astype(bool)] return dataframe diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index af43a9826..8d7ffcf70 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -12,13 +12,11 @@ This component doesn’t return the actual images, only URLs. **This component consumes:** -- prompts - - text: string +- prompts_text: string **This component produces:** -- images - - url: string +- images_url: string ### Arguments diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml index fdd7589dc..02ea08349 100644 --- a/components/prompt_based_laion_retrieval/fondant_component.yaml +++ b/components/prompt_based_laion_retrieval/fondant_component.yaml @@ -10,17 +10,13 @@ tags: - Data retrieval consumes: - prompts: - fields: - text: - type: string + prompts_text: + type: string produces: - images: - fields: - url: - type: string - additionalSubsets: false + images_url: + type: string +# additionalFields: false args: num_images: diff --git a/components/prompt_based_laion_retrieval/src/main.py b/components/prompt_based_laion_retrieval/src/main.py index c9459060f..2168f5ef0 100644 --- a/components/prompt_based_laion_retrieval/src/main.py +++ b/components/prompt_based_laion_retrieval/src/main.py @@ -56,18 +56,18 @@ async def async_query(): self.client.query, prompt, ) - for prompt in dataframe["prompts"]["text"] + for prompt in dataframe["prompts_text"] ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)[["id", "url"]] + results_df = pd.DataFrame(results)[["id", "url"]] results_df = results_df.set_index("id") #
Cast the index to string results_df.index = results_df.index.astype(str) - results_df.columns = [["images"], ["url"]] + results_df.columns = ["images_url"] return results_df diff --git a/components/resize_images/README.md b/components/resize_images/README.md index 593b2ca76..89561e7a5 100644 --- a/components/resize_images/README.md +++ b/components/resize_images/README.md @@ -7,13 +7,11 @@ Component that resizes images based on given width and height **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- images - - data: binary +- images_data: binary ### Arguments diff --git a/components/resize_images/fondant_component.yaml b/components/resize_images/fondant_component.yaml index 6ab866d12..6112815c4 100644 --- a/components/resize_images/fondant_component.yaml +++ b/components/resize_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary + images_data: + type: binary args: resize_width: diff --git a/components/resize_images/src/main.py b/components/resize_images/src/main.py index 434dd29db..d5d4207bb 100644 --- a/components/resize_images/src/main.py +++ b/components/resize_images/src/main.py @@ -29,6 +29,6 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: axis=1, ) - dataframe[("images", "data")] = result + dataframe["images_data"] = result return dataframe diff --git a/components/segment_images/README.md b/components/segment_images/README.md index 9f475d516..418eacb13 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -7,13 +7,11 @@ Component that creates segmentation masks for images using a model from the Hugg **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- segmentations - - data: binary +- segmentations_data: binary ### Arguments diff 
--git a/components/segment_images/fondant_component.yaml b/components/segment_images/fondant_component.yaml index fca45e541..34fbd9fcd 100644 --- a/components/segment_images/fondant_component.yaml +++ b/components/segment_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - segmentations: - fields: - data: - type: binary + segmentations_data: + type: binary args: model_id: diff --git a/components/segment_images/src/main.py b/components/segment_images/src/main.py index 0f8f46faa..4e06c5d89 100644 --- a/components/segment_images/src/main.py +++ b/components/segment_images/src/main.py @@ -150,4 +150,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: results.append(segmentations) - return pd.concat(results).to_frame(name=("segmentations", "data")) + return pd.concat(results).to_frame(name=("segmentations_data")) diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 54978470a..ec80bf334 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -7,8 +7,7 @@ Component that writes a dataset to the hub **This component consumes:** -- dummy_variable - - data: binary +- dummy_variable: binary **This component produces no data.** diff --git a/components/write_to_hf_hub/fondant_component.yaml b/components/write_to_hf_hub/fondant_component.yaml index 363f2507c..b4391fbbc 100644 --- a/components/write_to_hf_hub/fondant_component.yaml +++ b/components/write_to_hf_hub/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data writing consumes: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: + type: binary args: hf_token: @@ -23,7 +21,7 @@ args: image_column_names: description: A list containing the image column names. 
Used to format to image to HF hub format type: list - default: [] + default: [ ] column_name_mapping: description: Mapping of the consumed fondant column names to the written hub column names type: dict diff --git a/components/write_to_hf_hub/src/main.py b/components/write_to_hf_hub/src/main.py index 0ed01b961..6d464f0f2 100644 --- a/components/write_to_hf_hub/src/main.py +++ b/components/write_to_hf_hub/src/main.py @@ -74,16 +74,15 @@ def write( # Get columns to write and schema write_columns = [] schema_dict = {} - for subset_name, subset in self.spec.consumes.items(): - for field in subset.fields.values(): - column_name = f"{subset_name}_{field.name}" - write_columns.append(column_name) - if self.image_column_names and column_name in self.image_column_names: - schema_dict[column_name] = datasets.Image() - else: - schema_dict[column_name] = generate_from_arrow_type( - field.type.value, - ) + for field_name, field in self.spec.consumes.items(): + column_name = field.name + write_columns.append(column_name) + if self.image_column_names and column_name in self.image_column_names: + schema_dict[column_name] = datasets.Image() + else: + schema_dict[column_name] = generate_from_arrow_type( + field.type.value, + ) schema = datasets.Features(schema_dict).arrow_schema dataframe = dataframe[write_columns] diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md index 1266b56d3..54ad2e417 100644 --- a/scripts/component_readme/readme_template.md +++ b/scripts/component_readme/readme_template.md @@ -8,11 +8,8 @@ {% if consumes %} **This component consumes:** -{% for subset_name, subset in consumes.items() %} -- {{ subset_name }} -{% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} -{% endfor %} +{% for field_name, field in consumes.items() %} +- {{ field.name }}: {{ field.type.value }} {% endfor %} {% else %} **This component consumes no data.** @@ -21,11 +18,8 @@ {% if produces %} **This 
component produces:** -{% for subset_name, subset in produces.items() %} -- {{ subset_name }} -{% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} -{% endfor %} +{% for field_name, field in produces.items() %} +- {{ field.name }}: {{ field.type.value }} {% endfor %} {% else %} **This component produces no data.** diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 013ce2b71..58c8ab045 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -188,7 +188,7 @@ def fields(self) -> t.Mapping[str, Field]: { name: Field( name=name, - type=Type(field["type"]), + type=Type.from_json(field), location=field["location"], ) for name, field in self._specification["fields"].items() @@ -222,8 +222,8 @@ def _add_or_update_index(self, field: Field, overwrite: bool = True): if field.name != "index": msg = ( - f"The field name is {field.name}. If you try to update the index, set the field" - f"name to `index`." + f"The field name is {field.name}. If you try to update the index, " # nosec B608 + f"set the field name to `index`." 
) raise ValueError(msg) @@ -238,7 +238,7 @@ def remove_field(self, name: str) -> None: del self._specification["fields"][name] - def evolve( # noqa : PLR0912 (too many branches) + def evolve( # : PLR0912 (too many branches) self, component_spec: ComponentSpec, *, diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index dc940b5f7..2599b5de1 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -5,7 +5,6 @@ import os import re import typing as t -from dataclasses import dataclass from enum import Enum import pyarrow as pa @@ -168,7 +167,7 @@ class Field: def __init__( self, name: str, - type: Type = None, + type: Type = Type("null"), location: str = "", ) -> None: self._name = name diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index 36f81b7db..05be61c17 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -443,13 +443,13 @@ def _validate_pipeline_definition(self, run_id: str): if not load_component: # Check subset exists for ( - component_subset_name, - component_subset, + component_field_name, + component_field, ) in component_spec.consumes.items(): - if component_subset_name not in manifest.subsets: + if component_field_name not in manifest.fields: msg = ( - f"Component '{component_spec.name}' is trying to invoke the subset " - f"'{component_subset_name}', which has not been defined or created " + f"Component '{component_spec.name}' is trying to invoke the field " + f"'{component_field_name}', which has not been defined or created " f"in the previous components." 
) raise InvalidPipelineDefinition( @@ -457,36 +457,22 @@ def _validate_pipeline_definition(self, run_id: str): ) # Get the corresponding manifest fields - manifest_fields = manifest.subsets[component_subset_name].fields - - # Check fields - for field_name, subset_field in component_subset.fields.items(): - # Check if invoked field exists - if field_name not in manifest_fields: - msg = ( - f"The invoked subset '{component_subset_name}' of the " - f"'{component_spec.name}' component does not match the " - f"previously created subset definition.\n The component is " - f"trying to invoke the field '{field_name}' which has not been " - f"previously defined. Current available fields are " - f"{manifest_fields}\n" - ) - raise InvalidPipelineDefinition( - msg, - ) - # Check if the invoked field schema matches the current schema - if subset_field != manifest_fields[field_name]: - msg = ( - f"The invoked subset '{component_subset_name}' of the " - f"'{component_spec.name}' component does not match the " - f"previously created subset definition.\n The '{field_name}' " - f"field is currently defined with the following schema:\n" - f"{manifest_fields[field_name]}\nThe current component to " - f"trying to invoke it with this schema:\n{subset_field}" - ) - raise InvalidPipelineDefinition( - msg, - ) + manifest_field = manifest.fields[component_field_name] + + # Check if the invoked field schema matches the current schema + if component_field.type != manifest_field.type: + msg = ( + f"The invoked field '{component_field_name}' of the " + f"'{component_spec.name}' component does not match the " + f"previously created field type.\n The '{manifest_field.name}' " + f"field is currently defined with the following type:\n" + f"{manifest_field.type}\nThe current component to " + f"trying to invoke it with this type:\n{component_field.type}" + ) + raise InvalidPipelineDefinition( + msg, + ) + manifest = manifest.evolve(component_spec, run_id=run_id) load_component = False diff --git 
a/tests/component/test_data_io.py b/tests/component/test_data_io.py index 30a4b7c10..d9dad121f 100644 --- a/tests/component/test_data_io.py +++ b/tests/component/test_data_io.py @@ -9,9 +9,7 @@ from fondant.core.manifest import Manifest manifest_path = Path(__file__).parent / "examples/data/manifest.json" -component_spec_path = ( - Path(__file__).parent / "examples/data/components/1.yaml" -) +component_spec_path = Path(__file__).parent / "examples/data/components/1.yaml" NUMBER_OF_TEST_ROWS = 151 diff --git a/tests/examples/example_data/components/1.yaml b/tests/examples/example_data/components/1.yaml deleted file mode 100644 index 0c245a512..000000000 --- a/tests/examples/example_data/components/1.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: Test component 1 -description: This is an example component -image: example_component:latest - -consumes: - properties: - fields: - Name: - type: "string" - HP: - type: "int32" - types: - fields: - Type 1: - type: "string" - Type 2: - type: "string" - -produces: - properties: - fields: - Name: - type: "string" - HP: - type: "int32" - types: - fields: - Type 1: - type: "string" - Type 2: - type: "string" -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/examples/example_data/manifest.json b/tests/examples/example_data/manifest.json deleted file mode 100644 index 8fe4ef16b..000000000 --- a/tests/examples/example_data/manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "tests/example_data/subsets_input", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/index" - }, - "subsets": { - "properties": { - "location": "/properties", - "fields": { - "Name": { - "type": "string" - }, - "HP": { - "type": "int32" - } - } - }, - "types": { - "location": "/types", - "fields": { - "Type 1": { - "type": "string" - }, - "Type 2": { - "type": "string" - } - } - } - } - } \ No 
newline at end of file diff --git a/tests/examples/example_data/raw/split.py b/tests/examples/example_data/raw/split.py deleted file mode 100644 index ade466125..000000000 --- a/tests/examples/example_data/raw/split.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -This is a small script to split the raw data into different subsets to be used while testing. - -The data is the 151 first pokemon and the following fields are available: - -'id', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', -'Sp. Atk', 'Sp. Def', 'Speed', 'source', 'Legendary' - - -""" -from pathlib import Path - -import dask.dataframe as dd - -data_path = Path(__file__).parent -output_path = Path(__file__).parent.parent - - -def split_into_subsets(): - # read in complete dataset - master_df = dd.read_parquet(path=data_path / "testset.parquet") - master_df = master_df.set_index("id", sorted=True) - master_df = master_df.repartition(divisions=[0, 50, 100, 151], force=True) - - # create properties subset - properties_df = master_df[["Name", "HP"]] - properties_df.to_parquet(output_path / "component_1") - - # create types subset - types_df = master_df[["Type 1", "Type 2"]] - types_df.to_parquet(output_path / "component_2") - - -if __name__ == "__main__": - split_into_subsets() diff --git a/tests/examples/example_data/raw/testset.parquet b/tests/examples/example_data/raw/testset.parquet deleted file mode 100644 index e7b9c625f0c104d9fb7c08137912df65d1915cd9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15048 zcmdU$3v?URna5{r$&qX+krTi0r726d9GWh)1lpxMx;-tW6xyEMF70NarCpZo?*ALf zugC^^8gfnrKWXOPJ9F>-{`dRty)#aS-Cb@_7|ty>ysU0DJepx(mNU$gY6Z)vStdih zM8RZgRg3~CK^9m7R6q@~K@QLWEm#Vc0UgK%%fSku2L_M_E&)c64_1OpfeEYvmx0y5 z3<`h+Tn-9B5wL<{Py*I~E5KT?4wQoRU;`)v<-i6iKqaUG8^M*J8q|PI;3`lHt_IhD zYe5~b0|%%FPS5}vK@(^OF3n>Dt1eq@F0fo)SY$0OS#!nOb*1Y!l$F~mDyueLSzWW~s@kis zxwg*ksCPCrHZ{9iHn+C5Z|UfCcXjvl_Vo`8ZoO{XXSVO)JYF9^ubV#n2@5+BJaG{RnE><~scZy?SV<8pxxM$BF4RWuOc 
z#|&Bx+8kiNRoE{0qzD&!>z)br%gUuH0Sziu26 zYPiq{ul#pYM??tnT$pE%n44fjQHTY2Hf*#COo`y)YQ#~Vw_i!l$WZtnn;`%*{%8}Uh>6#%4bYX z5pEPCF;Z}y23_Hm`!pR=DA9SZJ}mf1#{bZFiPBhDQctoPl_w&xZ7ia&Tg6Ea>Dx7goE6utX7fHm~o7rqdP*`(XA6aj16M_w*zRR1yAS!Cmv?M~~ ztXPz*<2>jvoz zeb4Hh0=!>Y5K!@>yz(Y}F9Jz~SN~4a0;_3rRE;8+3pbG>3BS5Pw*}EE@#RwbjmFfs z={BS8*uF>9*@|$K{YSH%hjBS?EYa}3u`47+L~fitskcWEO5(0%`av#?x(k-;+i>p) zugozzG1!HWQ1FTwM;H)rcnBiIc=pw@HhwHNCdnfMVI+N;+Q^hBWmh*U$W<_c{x;^K zH;PRLO=>Puv~xi|IUC)6S>EeL>)Ar<-esoqySCt*b27tlB%{@EnY>;+#MV({hWkdN z6g(P*2+!tr_t6|dCUnI86cO7c%r_#N?d-rLB86x=YLItyMyWwW{yVcuMxwCz_E zc@HV9C)ZkkQzF}TPIDGC?MtlTdu*#APd06!kSv!p7@Mtc6caQ-WmhMyMOIImOAY(M z`YLw0Y1(XTtv+QnbGR$U^CI;d_9BJIigdBu7MN`>YdA2^Y!9*1c2mj~psf8|-6iHI&4LB%y-nq3g-J*i{7R+Gyj8xZQ$oP|6i?+{|#~h?G4@rvn>LA8!Jc zb_eril&5{h0H8h6_eyoxSCz^8Eid}(ME(O>U2Q+D8u583^$Ydy%`iMLe+@pJX;ghD zS(>3WczP-_?_kdzN_o_h;)#hJIq^|_uEec7{^9KAjrBitR{iv=`yTn#aLMG9 zq3)}(dj~_`edG_;r#k|FSi!D2akymsWdDsOiZ7wLWy!?1Ed_En&%)9Aj^2@=mL)w^Uo-wPDen-kU}GxbWap8h(cW*qt1E8H!Tc zFxS$CnKpkAK(UEZ+9`#G56MW<(w4LY(DstHnqA;BK-<#G0c|H60c{P}1KO5$0@@Z@ zfeS&Q=psiaOm;!6SR&2k9Q(F2`zmWcY*6`t6$2Y1zNM#yT zv`L^3nI%0H8N1KzN-3XJTxZreIg{#$BKxtD*0*cVW+*$3I1g8L9xvL~|6s61p~%%M zmM>Ye*;4XC#hMphU30jw^?6Il-gi~c+qP|c%l>Vf`U^*^>lo$KdnNX8k$tlDtmXP& zyj!!(T)oFq`oiPRx2h)p<;it9g*AIR+F$YY7N5+pzVDcoIitBFsqtx5-oY&E2iIEP zsfEVr&0eH43u^qP(r_@d)o`7x@wp5|IVpM#DM}mrNh^dO)HkaG_K8VRIcE1;1>qlQKTsZE=GQkg~*ZRlM9i(ok@ z(Pkc_qc%$V;j=&aL)*r5#iuikDw-1LLqf&8OD2zP)al-gWrB)?-}*eG~jo zPKWRNMmXp9dvuq$&C1f(&2^S5Z`|=v*=paN$1gu&(w{u4?0qq_{p~Y$i_y0FN#mEA zt=sc&JhH|;RdHvD;lz%ErLWm;Y<_6)kxlt})#R4{uKP-L`yCDX-v(IgPg-Ub&$45) z=p9L_{=O>j!7S?=P1YxzP(8iri*#r~)t^xs{tVCVmsNi=rTVp`I{7L>{T$CGJxEj1 zkF+K|Nhk7X(uK4o-6;Z45GTz^HyS(jpbwDVZGgrQX*fN0NP8MPr1vsFUQc>I0m!Rq zOjQAjDx^QOuf0K*k70z=?}Bo{LtoqJ z`ciB%vhMIV3b*tYE*qzB}v%Ov)*_OZlX9YJL zO>A2I!TrK`?C0K+$(@D&y6^h43EvCv)vh{q;t|g5em;Nut&H_4_pIJaocs`vElpDJ zAF1+AW?7%!Vx4M*;OT8A!P8Tj1qGi{;PF$d!6^%VBtu~*H}4?9Nmi1N=6sTsLcaN&pu$(>++A=YM*!4GUn3E 
z%WIY1Z9jH1w-@NyFV@waQ@wn=^yV|Q6CTx18h5U8*M0lVtDk*bbF+QTZRg5f&M4c< z2$Rg!Ki@IY(&K78Wq*cg{(N7VqlB@(I50=ZuJC%ZJ(lv^(@XNsDy=W|TK9HC$n>@^ z(xU~1d@0kgA3|=Dh5R@6JRzqAM9fwQB(a{1Pg0WS&XtgsW8_uS^YXMOlgE+dBr6RI zlJXiro(XwpM4R?Pcpnz?Kbk+kPG=fb`Z=Pdry?_O&YzlJmBr=*Y_s~;Q{j&0`l7G* zM=!567TnsgSFKptnNh<{Og^k?oEpZ;rn_q3-G`R{v6FFVI)35nS(WiaMbUlvT4RaP zKH_@q@2pwQriowfZ#m#@dR068+VU$Ne&E+fHAN3H)+0O4QwZ*sbhT%)@;+2ppW0?U zwiODcH-J6nY!I*k`Ds#Rorl+FVd2?t+r*X+d z;n_2lKgeuQ7pPbCXnUh;GEXZq?_^)TrTgH1=9TB#?z^RE>tk(Qw#4fh&Bu=wI!`l4 z|5#AByi9Q<&~jXL>G6sC&*(fwrO%8PY@X_ADN`B;JU{zo@bkBJp2?qj`N-1OEH1~v z9sm45p`(SdzU7}4T8YQ2eCmO8a^duApY_KaWKM56$()|Lu*^DxzSWR?m!vXHKgEaL z@(e5I?9FE%@XsDmEz=jwrow7DB`H^6AI)H_?+nj&A}cDzBi_`!@HCtE9BciNV0}

7N6% zU@4%1tOL1VIiOj}0P=tlP?%c@E(IpA3Xo&124+wIEPx!W5EKC`z(!`qnaELSo}$GJ zg~9b;1E7UVIUonE0F{6iCKN`=Dal!Dz$QT3APSu{Q&Q+8XQM?lxxNFCBa-tq0CIYA zZWm|)YZTUT`1yJoo~b0{0`Box;GS&li(FH2l)vz)fcw z=@Fg$A-73SMb_WOQ_r)zXT!yc?b}0fQQVF<3M1PEy#5;BzCG!rcqR9g);ck6JrY9f zNbl|c^zqTT=>C~tEVB{j~!Y{_g?_%spF2<_*(_heCgw@txwRW;z z3S=-WO&Em_s}7~U|Upd}s>ypoT%pmZo63@2tXB+oG3 zH&c2Lf3)D^VjPO^w(ubo$2%mtwx_$XwAymR4YQ4z-wKQ1qwD9jB9(StD}+#NV`X|9 zW|HSLfx33XI{y_> z>E!cjo6%?f(3wg8DA}4j((6g4UBF6`&v?@-noYlem8Bjqrq?zn;{sNfTAig=Hz(r) zSBH))TwN;Tf>wto3ky}2O1^+qCHMd7b@TV@31Wg@b{4EduR*8%-N`HtZ zg($VWe0|yaG7G}HB~|sDOdImbEjKVmjot0u(zhwOIWoR;=?41P{G07|dncd|jw|gB zzrE3J_uK3Ja+t*QSH8)Z1IakI+v{B@@3_(x z6yt+Uy_Nm8_EArBZ_E>F6SfErk4vl>@`QS$p2nJyf&TVLzb#d+yjtk*8uPn?sq$h? z*b{V%d{gf(*RbU8tB4AoU}J2s%eB$fR8{V28jHH>TY_9)Z*;J+CO*(tE>b!9-u1Xo z$PvKx0dILt8Q<@a^IW0cIM>%cI@r`BwAR}Nl#O|t#UWp_I40k>zGZYE7#0UAI+OW* zy=4P^?trhUf!dBf^~MKmH6vVKRfy_xiM>5;p{&vsDzBN>_EcZ_M9WOOV2=@(K>ZsG z5B80Z^xLWep1vO3*Di2<$@;uajj&ncuKo&ll^1PH=2exo52xCb-rle$)M;z4cabe( zgZ=IRt}h#yvC()OX8n*T;Q*Wll{VY?RQko)zxEb**VzPUY5)+7ky2QxQ9D1GlrYXqORsSW4^wn$5Y`B zc#^hVs7<4|M^wJwLUw{b`kVu)`qJBjF+DrZ=8yY@Z8Q?-TuigBM!W1k3yq<9^<04Q z+}ZDjkG4k^Ys2$ z36*25!wtd^)@oSi+3n4#br9BPPCDQ0uSbqV%eD?$zB%kTcY0}U7olZevmeViM+2QV z(0Mzp_v$5l*?siYATI)uVYf>i6&p5CcBe~DY@x)Nr6!6}UgM&+p1q!yqxe+J+;2d> zA1yrR-mjig8|I8bxzw(;8#YwCHjHlAfLws5??n+OEJ??72{NQ(Z=U-nx@X4=lEaS%N+vZ>I z{J8$h^;0Yz?5o-}*jZnF-t`~Xf7yQ1b1Ce1UjOY9Z8cz*Y5U7oonwEDF}oe>0+bj= z$!a=v$nrIIr_LMd&0OEO@bxql96jO_5O zqxh2B>u}Re7alLuYftodfXbJ5;7cCwJqwOcxxWqa8d|<7&e0^fXead+Mt~i?b|>W~ zGv#vA{&W6#m$z0;;s^@K`z>6LB5>T~CuzUr@zspc9OfeUXIHUOZ#Y~kZz9d9O`3a)_H zz*QiC2zVVtK@7YB-UM;*7I+)H0}|j}AcFV6HEiVMM5evO1;(SV@|fe4E{3`f3ST7SB@ z9{w){#FDrkA_NGgH7tZ6a57J#p^@6tErlz=>j9pHL|7~yi~q05`7YQx-^o=%V##R8 zQEW{aMSl&Ph=#4{(*t#2jxBp6-b#ES8k%@NnGk)`fv4-9aw6J_rNhS_nED?zrsP#UpK@+{E~R0+ zH*^0?d2W7T#-XmbgCUAI5lw>)mEnw6@643pa4_M)j`iWl7}(sJ$)wV$w0H{ZJoU3; zDxcaCPf2uzHkBCpij8*?zvv0 zkoji}FN&vEDV=Zus}}_8nNtT(#Yv}~Kk7(zO3Pii6+f<8=2)(@a#FT&%(h!Dv+Kq^ 
zqsU-Cs+#FhwN&e~>@>sL`9lTo%2Ll%XVsh4vWs`~J=Xr5`;WEubhJ~VhHRN8t27Up zRd(fl_*L>5R+;cP^W2R?ZQGo9TB&9-shHOHt6WEn%tUV$(z;YMtNlT@Y84E{YW1~} zr@OM$?8%jKx^o_PT9Ec9@Vi*MKC6v8?aVv|_F-@z*^%4RBPn}UGc}a%-+j3{=Kf5+%BitR+XO-&THy)>{?H?kGvd+ ziRap}HuLtEm8IGPrCk_%d+*l7F6^}$)c6={!KsLChrCH_Du?@5Yt!Q7>HygpKWBNzP0ltdGbGW>e1 fyPK-xYo_k02qt_=EZjHb$F@ diff --git a/tests/examples/example_data/subsets_input/index/part.1.parquet b/tests/examples/example_data/subsets_input/index/part.1.parquet deleted file mode 100644 index be1028aaaf00e22c04ab54f275a1f5b9da8678ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1707 zcmcJQ&u6vrnR6C&BFRdo?a9MGz@av&~X)+w&rX21q70c_)hT{dya{xU4yUAC|o zh~R%95J@n8+)MHOo5B(de-g~Ij{uBAmg55ZcROQf-J~PYvzJ2e_%mWT-Gs}g! zZzJ5xr5a~&a!`sZ`Q|7S)%V*$o@CA4P zC@6vw*adr_3@TtBh@c7{f*PoU1~>qR;0QFqF?a+XgBFm04B9{ePry@f0=@(t@C>M{ z(H)JYvRM${P@!vxiyyY6^w4)E~vEz>T?bH+AGWe5pil3W0fo=xM1@Eek2_JWC+weHG zOxx((w9O-T{F%obdDHRy84q7~?0B8pn7tBa_Y|756m8hjSwyjl~ zr==tc!h%$^GYh5#8pbf0g%@U^3!$f*Rwzm*ZM~ohs)`yY z8t4|^@5lJooiQ$4xbP!%df~#ITlXfK=$G(+3%VV~NsJ3`^1HV<|5N9jd+z~uX+6ya zxziALvQp&?jyyuhH!DGnT)aT^iw^}!do4^lzyK!bf*!Dd4II!1F1P~*V7L|zdMv_- zML2|I3?^U-j=(W^5zN3#;AQX%con<`UI%Z0H^E!rZSW3w7rY1F2OoeB!CmkX_!xWw zJ_VnF&%qaK;p< zap{H}-~KjP{q_3pW3^}W+v0cj^S{qtxO`VLzc#-A!TR&WP5un;jh|NITriefC!tt0 zU}{_-Vnj~Dk<-4|0Kv45g%AWz=14R&R1MvhnG$?Fz_O5t5sSy- z|7&u#3uA-r)0ZS%+n-qdA|yl%VRl!vb~vfb&qxvz9})nj&B_he1>Ba={0!Db>U+h_+~yWPBhgE=Ucc46Hox!a>FN_~j5|e2=^kL?X-c!S(4i$t zJG9vGZH2;;K4!+7=X!-gW}h+qD84RGI%Wb^KM2+{rxuY|B#0^|oS{JZT?(7csC8lle%l)SUFh z%vsIUl$(1dt643Va0aC#ajTwF%WOYP%cXX@b6s&7p1N(_Teqt?dzqbouWKqbODSbb zU}uE$nz${eVyI5f&w-eDt{iGpe}5TCtQ^VB{K(&X_vWP9rK4K0l3%V-vQ%fsRWeyW zMr}Wvu6Ao?d)`T`Cs-5VYW)@W-@>g^T)5u{XZ=>72D4C=P5w*lfl}6!6z77uLvQ$f uq|S{@m`A20nsDX8ossTcQC;_n|HViw8b2}rO2p!E{L=jqBxDzVSAPN5#JEWS diff --git a/tests/examples/example_data/subsets_input/properties/part.0.parquet b/tests/examples/example_data/subsets_input/properties/part.0.parquet deleted file mode 100644 index 
5217045981270e26d7f66f5757130739b23b27b6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3542 zcmcInUu+{s8J|sTXOpvYz9Vba;48-5IoCExZC`wCjvRFD#Q!dK9Oq&?hAPZ@*Y?Eg zUEg{)j_(u+RXkJ$RXox@AXJ^8AcT;3LV~J7hzG<2FT5fINFd+=@l+vH@y)FBuQ?u| zoR#0~>^I+h|9*DnjqNHl`6mDO;2A0NdsiyyOo?+@%l? zv#P4woHsTD({u-$J*+EgaMrJCs-bf)b0I%@h)sn{hPEHr#I!V(3;Ju=w2Ujha7x1i z%OC@li-ywJQq4Y5Ia4gE7VaAg*UF`A3#2VmRXt$8rmBkOo|(IcyS9$Gl~4)xYj%izjBA9rc4f-MVM?iCRxqzN-q-0rK`G=|(7%D893ZZNyv#rlAo z_DiO0bh*!oWzxmXj?K-(c0woJ@HS*M=$PtI@OdHT4)@dRC88K6)_?!zm$~n{XL$kv z&2wJ=16x(~#5_W;UYke86|c8T^fu=Wr8TvMi8&IyNPc!^u4-$2ayY#s>X56Z4!E7` z1=4P7qrTm0akHrAp7mOE`s;;;En|zOpEWe!%-p7;5D4I!KdTu=kJ}7aiKg~7yQgvi ze;J#2z&&zLyVT~5Vh*@5bZm`jGJ1y95s;t=oduxQFL(q`=r+3PW*$U0%OCtCem()C z%y}-q9QLl^Q10!8jX$iGuNUvDuhhTY`{JLDzI5{qg?zK~y&q_QeK!WW2u1(ShXijh z=|`TR=qll!Doh0Az%OP3f99hhzQgm3=&#_wlYfR8pdn1(19eun9;2ZUQ90=K}AF>VRc}(AUOKh^s0zPD>^7$15WL%vdLYZazT@i_gE~6w!S-{F?Oo3jg?9 z?^VA4ix|Z5?tfzP2|d~c0Y6n{xKQ-hBx}p z>Ck`S`iW)zx<`0>^0=R0n{;ush!eDklaRa%;0AaAQve1etLg0((aIa~=RjCgmcfwzY5GC@; zLO9}x_Sm}0RvtxkxWPb2w>1qkT13^9z4J21@}4#7Q^km_lcu4l5zyA{UVn75V95Jw z)1vYoT=fdJFsMI@s5+>_R|DOIMCg4=Eu4$vMEa2;p&1@6KtSW{ zBDwNKPmb(Eeu!?}UcQ*`iTF&q7o0%lFQnthKB5y=LQ}<-VX`u7L^iQGqE+sXunAxM z)bW984v3*sX*|9ZTZ%=X1d*}pvx`f~rCX6BBzmPvWwUn2=}7T?{ssDpOKXxOZ2{=# zo+K?xskW4nq_&i9Gg&HiZhc6F>nH2Et+ewFBeTNR^O7`_U>`i05qPpqFzy0U!`)i4 zyIU_x(%KOAr(VqWG`o?lF4q&K136cNlFHJH?PG8`E+4lP8`PNTC#;GP znxp*fe8z&k$&Osxq58H`I~;7JCDQ0=nw+h)@b)KYTVkgcT) zak-~~{fd?+!#3E9%k@;UeyfsfW~p6?$ce&(My(Wc*5jIz%|QH>PJZ8jUu*?U?v)N2 zS;&FLlrB`O>$}!wy0Fr!o{Rk<)vw(ekdwI5h0WbseC)T;Z`2OE^~7;L61nfh|5W0Z z_3A**L43J$@#J^XdFro+Yt=y`vtqOOAnts(v&Hh)QysZi=``ajF}0r3Xik(rWo>s{ zpNsVa&SItgEke1GYt`6ptnEJGwuiKir@kofjEjpk!8ONDGn;`klX-ZSr*nSv zs@2SCXL;_+`c|UkV-~I@3ON1 zHep^kP)4jyiAKHf5Z6|0R)_ud!;O8(E&}&7*V7mqT2n9e<-{_xQQTgRZ|oOI%jrDY zgHnqQYfJ$BB}qmZ`jfWPR-Or=Dd`Iv*+5$>t94=14sB&7+o9uHar~Z=I!xN6BlP3( zhxV+r%?!5b1pxiol$eldIG!K~u-2ycL5}t_e6~)FtY=t|spI>AZDcCoiJfUj!nQC~ 
t=l$+oB!>Igg#R(D#b&?1$S$<-_`(W1S6Ofsejzx2r~tqG%kUrFzX2YBUC00c diff --git a/tests/examples/example_data/subsets_input/properties/part.1.parquet b/tests/examples/example_data/subsets_input/properties/part.1.parquet deleted file mode 100644 index ac842a070982a6f00c7fd496ab555a5916e220db..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3526 zcmcInO>84c74FX1iJdsJ*;U$V3xhmn#sqCL*iM|?3>iqdJt9tAKycv70dav>-Oe8w4zNtg zUsd<3SMR<0UcIti;c79)#r)pGJew*qB164GQNMI8Oi>h-mb(hJEceX%plg}3J_)iC z)`ylvGi(K$eHm%)nfd-m90)yn#*@_*ddB^#g|r?Xc|f^h$|^<%_QG@4)Q5^}Ntg}< zIa5HItYP{NThSGHh`RJm_l!@HO@*0VG?68_Ui zcYnz{>oW&fq3_LSu`Uio_s0ua6CGj&n{JAt|3P8*_vb3QHnQA54%Uz%i>NQr@6Ycj z`q0q5Kcsyg-qh(ogi5HZVO6#;Y-HAB=^Fj+%?vU-*b=0#iS&J68qD8%YaRw5^2Us3 z2V?iUY#I-{y4I!tNo8+HF7w)r_g#Y?T@CI?5VeR++80bI`lxR~JYSzfeM=e9pZRiV zpuNn@VXccy`krrFlaJ^xvuPms7-RZvUv6Y*?zcC#OvLjelE^JtgG`zX1^Ja0p_^kS znaSWPwJ?2766+#_gLeEk6VQwFZ z^uaUbuOAscP^Kb3@CF%QATdWx23S`ashFAYi~bL${onIOem52IJl|Ph5sn_kOi<9x z!9Y>){$zrAk1jG{`zSu5UDK48roQT(a#0Jj9*P6R0TsY&fHoii$N=^MD}eP`k4q-2 zzT%zQpq5C0(Ds~x_^LDQa~fEWJf2}ax?vUetDfEbYhyC-T2T!8E z2*%%8_+&o*I4QbU%ktw7e>?Hr&xw`DhqFP(HD+ZZSFt&H%jN&kwaA}lB2TA-|Ap)4 zQuUKb=GnE=s(o_J6(^fsCR7m`5`a|zq3LzN8qjiwXnd1s07=wcz_$S325bNbxk*3@ zkOpi52(ei}4v+^F09$|}padubwgDs|RX`1}1K0)Z0qTGTpb2OJNW$&`-T>SOdJNDVbYTF$gg;E)c+J89&G(TU!1(1xV;=cp;#YRt8F*#I9191-SZ97VqBWzxGI3O zEsk5^l3gytaa}IewPnfVrTHNdZl2BOsww9kdfSRUpXazC2kT(8jlgIxg1!Yv4qJ^x zuhlGZTzUxWldt7fWzf#nSDNwiA)l*TyjGI8H^`)wQT#!;^sKiv-}?>h0nkhpoR z-sf`=U+z*o`Mp%0_)}4%-fw5t26lW9cfKc8?fq*>iC1fq5L=7lW>O)0BKZ^3tua4W z`GcHA%Li4NDc-lzK zayz!cF1Y}pJ_XJe+8T~02m-8i$$gL`^$e|gZe%lK2bny*5A21u3V5>5v?H;X*s9L^ q-MbVUGLZRTfUPCLFqZ5KEi}HcLe5ndSchK;&W9?)->eG!gZmecoorbE diff --git a/tests/examples/example_data/subsets_input/properties/part.2.parquet b/tests/examples/example_data/subsets_input/properties/part.2.parquet deleted file mode 100644 index 1d7df89dbdd9ac5c06af6e7be741f0c26d523b58..0000000000000000000000000000000000000000 GIT binary patch literal 0 
HcmV?d00001 literal 3584 zcmcInTWlj&8JfE5aIz4tX8P-21pf8JRl?59M&x~_xwhvHt zRR!(-liM_y9~-6)aeWE@E$p z?c;V^u&r2V!3MSuxEY(*HB2wFWn}0*)S;iA&dZjnYYN6=6cq{u1qo^N2jV4J=*X6> zwdwirn%-*3f$?A+V6Upg{h_N@r0;MBsZErkmA!A_E@s%x@s516LxMgEDUY1qw zZ?A6&153h&2Z+Ui)~A)p$HIuW4-e^dxQyB=nj8sNWX+VlW07rS8@lOTV^Le~AfrnM zCvpR1AW=8zYu8JtYar7LhE`IM${Jy?WnhQ|F_1w9rGS9D|0 z4qSGU@vP1Ge3T#1yyzJa~t?23UbS(C5)G!C=2t|8foh5WLG=^M#1?pZy3lq3Q77>W@Y zX&Q|;=bB^3PW^)H+?yjXO0@6thvUH&G?D+#bm=e4l_$#E__@aS)LVZ& zdj9HNQU12{{U0iSyWgGuJxEXeC@{eUV~b(R7h^r^NX5)ZREqw2Ec!-Z>R*?p{O{&2 zHVxa6Vn!$!!f>M~cz$t|`4e4YlFs&gKzpuI0h(G4T=GzRAwM+-XaYDuCFJ+Wg!?w( zeh?_n0A>JLz#1R|_yiyc_z>WAz>|QRfM-Mg<$Z8TF44V_IBHDuG)=7k?IDNtM~1T~ zb@d@sU_SMRn>z31xF_{1))O%+ONE)&MxOufy|=#Aul%xF6rW@Z{%`!`kK>TDufK1C z@eFe|lCL-!`-dm`*67rm!KvSjP5c+GADYGoKIZ=DahJb7>JdqzFiBJk7T*R)02#0k z=l~Re3eW&LpbNMSFaRc`+9DQgV&MQNJwP8Y02~7D06q#h0(=bc0^mi!#{r}Sp9Fjg z@M*wj0G|bX4)A%vOMsUFUjTd&a2N0;z$<`P0bd5Z2Dk_K3Y6`u!%}>0cz@^aX8p$; z)_+`DQnrg!Fz`R^_dB&0z5D*?1Mk$2-#_(-%TT?GzTPE*hgC0DTBKN?DM(mF zjE`c1lOt|pO;V)^h?(&K0d~!BB8ya1icQ3p|Bs#cXgBE8BTZoRE*q7m{L!C34ow@I zW_?|xiO6Js;klELH4*ps1YNOJ%}m^gKbw%n#B&e;Dn*``|w&NbQN1jL>2<^I|;0TIJ~`x?#_00 z=6d4j=row~Q6$L8N-Ug<)OJ_^`4N=Ecb#F z$oz%nxU~1^#7Uun5qz{bX;x(3fR6)`l&e^QnSg~WCBu<7`i^o zoL!uqPaIKfkgHZV>NniJWdH4dnp`Pvh2ywQ0J+vUZh=d;xh%)ExlG%krPJs5JwjYP z;paCq?lX*z6^CEoxE=?3aCeNr-BE&Z7m)7l))zawjWWlr^guuTOhHxbX0Eo-m|H*K z^EHdt%JPPs=7HWRC`q%BDem)g{S;qcY$4EXHY7zD6s{Gr7U(WYe0_(=+hV&GU_?+e>hboW}`tkT8J1nInd(6 Y11s(xWwB-WPH=xz1^x+D;J@F00P_QnLjV8( diff --git a/tests/examples/example_data/subsets_input/types/part.0.parquet b/tests/examples/example_data/subsets_input/types/part.0.parquet deleted file mode 100644 index 6074b2fc3bd605b27f527b0d1b07e7ebc07c0fd2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3018 zcmcImOKcif6df=o*l`-$%}_{bOQ}QL)^_+9*N(i%00u*V*u*eE6FD=?7zWP_gpVx+Dna{oVop@~2m-Vw2_Dd7{#FS#ES!SAHwyupb3=?k(t(FB3F{LS+rnY#YEjLH-9`01ExMk{1 zO;Ana(T%#&s_QnG)>OINZYpJyeL3E&cXWyH!~~_;V;OdxP0g_)JIPvEt80mw9i@bh 
z5zrxO=2J7SqRDcenVj+jg26~M8qMu=bGh7usc4Sz1eyq<>$d`-%XoF#vp29TDi_Vg4O$U^mqvY zTEcUa{V$XEsm1&Ih->Kt>8WKrva*jyQV^A_+Jg~`i5W97=CLytX5^fWu>hlh6*vRf zfH7bkI15Yw=YZFM^S~r91xy340~Y`mumf)Z4!{Xq1l|N(z+1rEz&n5&co*;h?*TKw zCBO?@24;cxfe(NWfjM9vSOBh^v#t7|4*B_w8(A=kt-es&VLS9VY^HvUotZtw3TNPZ zV?SBXUvp1H1=srHxzul~8#C!G`F7!ncI%IWJF^cZ<@4%SU#ow9YfSC^Y{JD_`n$`l zNOV~q+U#G9dcPU>KC`<1MFyv>mopV&9M)F3Dr*93W!Q;J=EF^ViAlMT^^hrWXtE#* z$uM@&>2j{Vk`8!fd+`5N%>K)SUG&=Q&&Dn=jC0OmZ3wy~v>ZQJ?t2tnlJ~dEb+x1E zEzec=#~wxU+@5oL4AVCC%hYqyah}cnNRoXskGnlvAI8hMj>F7$iF3BE5 z>m9Ap8#wA>4M!ZYYjRtVgtmb2`yN?GIL-|+Hn$aFBSrsJ#jtX7WK4pLkWwp?0G#+C|IwcJZAC1P!i4OK-xN8&pYzu!tlIHjbisu<7ih+3>&x{0`W$S=mbgr`4u zW=C3AyVMs+`bA9ze^yN>-3s{pVj&zV1hb)XobYmr7)ahN@fn}t_p4GohWxYDM6F(t zdr4K)GJB;sY9Kj9lKK2vv3(agZ}MXjM0gPVq* z&-;pewkpNf2J5(UvrDw?6nVc&bfbHLuFuJQ1m55|O&{zRU#8Zd=VTrYQQr(?{z01p z@$`M+pf)@w_oN=xI8Z7i?+N@$hvr>d+fUWFP8$1x ztnrH+OKMrqG{nFnMpLd|IkbzXfDJYbkfhGMfWb4v1O{Uq zS$(XcDypLHx{0bP$|8%Xt1i3hqH3$E%c_g2Y8PGf4|G-a+&h@ygjA}Uj{MEzoO93p z&Uf#*!dpxsOuEP)9pn#=4Cy6_^8}&Iy9t6|+pVr4J5Fo4R$CXeNigw(rL;Zpl2q#r zOEv1Q$>~sAnF7mJr!O^B$+0lYwuMf|5m+y@q&*ay&=kpPgKKkjtEF~Y2Fk=XS{;id z$P}4bBxQ1zbd&DD3b8PWr|@}>2SMoky$QI>4%b=Y92ME13dM+2qk8Q82flOEC)DG^ zA_e~7eP@82_D5$4x8LX9>6^-2n4Iv+-g{2(U1#u5N6<4izkd;`C&&o`ZnN+p2>AZc zLEZ&5UcqW4c19~k%GhN%PWx<$rW_Ym?dnlw+m@nej@Lp-tq%e@!NN%jBwr%wkxEw= zCUKz8b1WqUzFI~z`G_;`zx&s(PUlqW2j* zU}a3svDJO*CZ9MnpgLOX`%V{4oN^G6Q&TQtdB#IT0V{yZfK|ZSjK_5aMZAF`*1_^7 z;4Q%0fU5uo5Cg;k8-N5L319&!00&3|t^qQDEFcHi1Z)BFfC69}umdOpN&p^E22=o5 zz;(b4z)iqAfL*{XfH31(7eP35N%jM+rc5@T3hwh9dwd>x5aT5l##q5ge`W7D_t~p+ z(;Gq{_1R+PxAm?0?2dA?{7Ap?`_Zk1k7V^hlrMOHCP2Cd`)@*(@$x+I zct4*EJ~rvredM@8 zN|mXbi*uB1+Qohad+tz%peqn#bTthkYO12iyC-dS=3NVp9tfnmhAOpWg@S0KtDF6y zqa`-sKty(3u>@JL1PH%FDF%eYiGjuzcaoP@siUK(WKOc6?DGxfIKgDZ{-4;ODvuU{ z)SyB;Fa3h~u>IxE2Mh6BZMN;1c~rw%Sw^uZH6EHrB*sNyMcayCwc1#9O%>ZhyN^US z`$D_j>Y-%&O1q=B3}g+5mqJS+3e=zmQ;#~AqDv9#i11A_g~BF(#cogEd#7GUSGc~x zFiakRu4@dl%*5(Ul40shypC<2{mpNVQR@u^Wna=HtsgflcS#?v5iD2L3v`~kOr<)>u5El7pO@Oi0 
zazm5)+zOYpU~IG@@9DA4kiVdDYOfCY!eTiVEk_DbiA8xCRix9mt9&kG`@@>dCV_vU!8KcTrJvSBJ-1(F zp$5b$o-UOV6>Bq|UaOT(a6d%xe59iedBxL3r5w|^jf6Uu_cMNpO@dCz!%=QBA2loV z+8)$`>H`hJJ*Bi3!g*mnPG3_6z0{QB*l$&VH{=LahPoWAW#VWJj&)?z64r+4+x1q> zDu1w7reEMUVhvF*G^FszoJOfD@JN#)u1hpZmXy*)HQ%r3rdEmMZ9iWMRro?fW)s78 ztZny@w%rOJ){t&^FW~igo{zxWdB*7j`z4fX4(55DhbbC)I`UNKcF^vPw&fptTA0Jrw;{wt&4d#69<_l)6K$uKx_PzV^x!koGH@F zI7fDSIh<)`)molGJ;^BOwsG8V3}bFVzoO4PhdV8W{KM9TSJY~(wg-3If6!PEVmC3^ zVt7!l2}xGrGm(TQ1}paCSM)jP&%vIAG+7*&4fY?g3%@(J;2(j%0ku*TeEKYqaB3iXp4{5NSI5brbT|2R}uI(mn;df>p3N>!0K0TNe4FGw6f;?4na;GbE?bLmzbQI!PCRD=Ns$Cxv8|8x?{S;a+YL{g4c4ZC&-H6g4aa5<`_M~2$CBnDa9P| z&J>zL!vnJgldCm#k|3AJ>?A3Zhe!|U3C$A6MsT2j3w*)s$Q37yvcr9lc$}JEJ({4a z)#|0O%VKrGlevBS^w_uGcL*YU!xbWZ!T2G<6AXA(dxkU-C7piRf79i^?h0RXhP``3 z2PdI^f^-rv9D)Zyz+5HuEa}5+%o;aQIms=#p9E3DB>L#k=UZMt&0ScfFXaH1z2G9XbfB`rUFafP` zuiHWq+9<*XNIHNnpa<9lYyn;bYy(~bybQPicm;3~@G9Un!0UiF0G9x70^R~#2D}Zp z0(b{-74R_=#itNI@C< z#y(%3gQq5Z3tVXF)5+{F^UFtatJ1mBXWH|>ZohEss;JzMZ+)Tue#e;n0er%L`a-0; zf7+aii1+A@*MD~;{K;eCuSP@vB7^;QFPM!J0Z&ViC5zv@penZ~A*p_acHG!Q*2D5gyB=)B1GU+*J?kjCH9LcBcXK=t4^fPZ!i<^{XEjaC zx~lLdXZ8^3MvpViW(Nh^kj$3S)R8nAor+9FC}4x?FFmZBicd{b+eE-e7Yb+ClXhDM zE{r{ce$m-iNbJ&gkZ^IJXXvCohY?4?dWNPuH2A?2 z2Z1Mcg0T!pbjobJQ7+{`*aH8=vy7&;E9v4)DVE>hmx>mz=ae%_f(LnnQ6nuTnW^)! 
zZj@)^RStYBC0P}E%q)|#z&9@Q>>9Ffi)^=*P0~t5Q&m1)sPbCMs+Eq zBdruMH+0Xgw_;S-Zle@?h+a>~yjGBf=+K&avCXlFlQmoyuNN(0Ngd{VqpTTfd3wdx zv&Bf6Ey!Yeaj=iovmL~3yUa#a#2Y>fcz+(uhhyzH`|$(kC6cfA*Lg4xgV$GL`G4T1 z1bHx8zE@#W5q_FM`2km5qpW9SL2uym0ky$m>jlWLEaEAP1Ay%95na9 zX=QESd~oO#?x9)^ENmBSidP~669#9VZ%fv3!7z^Fi*2HRaZT7)T5ujd7VJO4!bj&a HeD(hg6`~ko diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml deleted file mode 100644 index 389da55a1..000000000 --- a/tests/examples/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: Second component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - caption: - fields: - data: - type: string - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml deleted file mode 100644 index 3c996e9d6..000000000 --- a/tests/examples/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: Second component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - description: - type: binary - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git 
a/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml deleted file mode 100644 index c02abbaa1..000000000 --- a/tests/examples/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: Second component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: string - - captions: - fields: - data: - type: string - -produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - -args: - storage_args: - description: Storage arguments - type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml deleted file mode 100644 index 3cda0cc6c..000000000 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: Fourth component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - images: - fields: - data: - type: binary - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str - some_list: - description: Some list - type: list - items: - type: int \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml deleted file mode 100644 index 091a7d9d5..000000000 --- 
a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Third component -description: This is an example component -image: example_component:latest - -consumes: - images: - fields: - data: - type: binary - - captions: - fields: - data: - type: string - - embeddings: - fields: - data: - type: array - items: - type: float32 - -produces: - images: - fields: - data: - type: binary - additionalSubsets: false - -args: - storage_args: - description: Storage arguments - type: str diff --git a/tests/examples/example_specs/components/arguments/component.yaml b/tests/examples/example_specs/components/arguments/component.yaml deleted file mode 100644 index 659ed0026..000000000 --- a/tests/examples/example_specs/components/arguments/component.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -args: - string_default_arg: - description: default string argument - type: str - default: foo - integer_default_arg: - description: default integer argument - type: int - default: 0 - float_default_arg: - description: default float argument - type: float - default: 3.14 - bool_false_default_arg: - description: default bool argument - type: bool - default: False - bool_true_default_arg: - description: default bool argument - type: bool - default: True - list_default_arg: - description: default list argument - type: list - default: ["foo", "bar"] - dict_default_arg: - description: default dict argument - type: dict - default: {"foo":1, "bar":2} - string_default_arg_none: - description: default string argument - type: str - default: None - integer_default_arg_none: - description: default integer argument - type: int - default: 0 - float_default_arg_none: - description: default float argument - type: float - default: 0.0 - bool_default_arg_none: - description: default bool argument - type: bool - default: False - 
list_default_arg_none: - description: default list argument - type: list - default: [] - dict_default_arg_none: - description: default dict argument - type: dict - default: {} - override_default_arg: - description: argument with default python value type that can be overriden - type: str - default: foo - override_default_arg_with_none: - description: argument with default python type that can be overriden with None - type: str - optional_arg: - description: optional argument - type: str - default: None diff --git a/tests/examples/example_specs/components/arguments/component_default_args.yaml b/tests/examples/example_specs/components/arguments/component_default_args.yaml deleted file mode 100644 index 816211c04..000000000 --- a/tests/examples/example_specs/components/arguments/component_default_args.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: Example component -description: This is an example component -image: example_component:latest - -args: - string_default_arg: - description: default string argument - type: str - default: foo - integer_default_arg: - description: default integer argument - type: int - default: 1 - float_default_arg: - description: default float argument - type: float - default: 3.14 - bool_false_default_arg: - description: default bool argument - type: bool - default: False - bool_true_default_arg: - description: default bool argument - type: bool - default: True - list_default_arg: - description: default list argument - type: list - default: ["foo", "bar"] - dict_default_arg: - description: default dict argument - type: dict - default: {"foo":1, "bar":2} - string_default_arg_none: - description: default string argument - type: str - default: None - integer_default_arg_none: - description: default integer argument - type: int - default: None - float_default_arg_none: - description: default float argument - type: float - default: None - bool_default_arg_none: - description: default bool argument - type: bool - default: None - 
list_default_arg_none: - description: default list argument - type: list - default: None - dict_default_arg_none: - description: default dict argument - type: dict - default: None - override_default_arg: - description: argument with default python value type that can be overriden - type: str - default: foo - override_default_none_arg: - description: argument with default None value type that can be overriden with a valid python type - type: float - default: None - override_default_arg_with_none: - description: argument with default python type that can be overriden with None - type: str - diff --git a/tests/examples/example_specs/components/arguments/input_manifest.json b/tests/examples/example_specs/components/arguments/input_manifest.json deleted file mode 100644 index 9ee2494f9..000000000 --- a/tests/examples/example_specs/components/arguments/input_manifest.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_123", - "component_id": "component_1", - "cache_key": "00" - }, - "index": { - "location": "/component_1" - }, - "fields": { - "data": { - "type": "binary", - "location": "/component_1" - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/components/input_manifest.json b/tests/examples/example_specs/components/input_manifest.json deleted file mode 100644 index 80fa0b91d..000000000 --- a/tests/examples/example_specs/components/input_manifest.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "metadata": { - "pipeline_name": "test_pipeline", - "base_path": "/bucket", - "run_id": "test_pipeline_12345", - "component_id": "67890" - }, - "index": { - "location": "/example_component" - }, - "fields": { - "data": { - "location": "/example_component", - "type": "binary" - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt 
b/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt deleted file mode 100644 index 768ddfb21..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/cache/42.txt +++ /dev/null @@ -1 +0,0 @@ -tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json deleted file mode 100644 index 47c2fe949..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2023", - "component_id": "component_1", - "cache_key": "42" - }, - "index": { - "location": "/component_1" - }, - "fields": - { - "data": { - "type": "binary", - "location": "/component_1" - }, - "height": { - "type": "int32", - "location": "/component_1" - }, - "width": { - "type": "int32", - "location": "/component_1" - }, - "captions": { - "type": "string", - "location": "/component_1" - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json deleted file mode 100644 index 78cfec59a..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2023", - "component_id": 
"component_2", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json deleted file mode 100644 index f00c64aac..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2024", - "component_id": "component_1", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json b/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json deleted file mode 100644 index f7a6f429d..000000000 --- a/tests/examples/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "metadata": { - "pipeline_name": "example_pipeline", - "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": 
"example_pipeline_2024", - "component_id": "component_2", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/sample_pipeline_test/components/dummy_component/Dockerfile b/tests/integration_tests/sample_pipeline_test/components/dummy_component/Dockerfile similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/Dockerfile rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/Dockerfile diff --git a/tests/sample_pipeline_test/components/dummy_component/README.md b/tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/README.md rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md diff --git a/tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml b/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml similarity index 73% rename from tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml index 1091703eb..0a041fa3d 100644 --- a/tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml +++ b/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml @@ -4,13 +4,9 @@ description: Dummy component for testing custom components image: fndnt/dummy_component:dev consumes: - text: - fields: - data: + text_data: type: string produces: - text: - fields: - data: + text_data: type: 
string \ No newline at end of file diff --git a/tests/sample_pipeline_test/components/dummy_component/requirements.txt b/tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/requirements.txt rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt diff --git a/tests/sample_pipeline_test/components/dummy_component/src/main.py b/tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/src/main.py rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py diff --git a/tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml b/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml similarity index 95% rename from tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml rename to tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml index 35c43aadb..eddb6e580 100644 --- a/tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml +++ b/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml @@ -3,9 +3,7 @@ description: Component that loads a dataset from a parquet uri image: fndnt/load_from_parquet:dev produces: - text: - fields: - data: + text_data: type: string args: diff --git a/tests/sample_pipeline_test/data/sample.parquet b/tests/integration_tests/sample_pipeline_test/data/sample.parquet similarity index 100% rename from tests/sample_pipeline_test/data/sample.parquet rename to tests/integration_tests/sample_pipeline_test/data/sample.parquet diff --git a/tests/test_sample_pipeline.py b/tests/integration_tests/test_sample_pipeline.py similarity index 91% rename from 
tests/test_sample_pipeline.py rename to tests/integration_tests/test_sample_pipeline.py index fefc65531..8e7f6fbda 100644 --- a/tests/test_sample_pipeline.py +++ b/tests/integration_tests/test_sample_pipeline.py @@ -17,7 +17,7 @@ # work around to make test executable on M1 Macbooks os.environ["DOCKER_DEFAULT_PLATFORM"] = "linux/amd64" -BASE_PATH = Path("./tests/sample_pipeline_test") +BASE_PATH = Path("./tests/integration_tests/sample_pipeline_test") NUMBER_OF_COMPONENTS = 3 @@ -57,6 +57,7 @@ def sample_pipeline(data_dir="./data") -> Pipeline: return pipeline +@pytest.mark.skip(reason="Skipping due to random failure.") def test_local_runner(sample_pipeline, tmp_path_factory): with tmp_path_factory.mktemp("temp") as data_dir: sample_pipeline.base_path = str(data_dir) @@ -64,7 +65,8 @@ def test_local_runner(sample_pipeline, tmp_path_factory): sample_pipeline, output_path="docker-compose.yaml", extra_volumes=[ - str(Path("tests/sample_pipeline_test/data").resolve()) + ":/data", + str(Path("tests/integration_tests/sample_pipeline_test/data").resolve()) + + ":/data", ], ) DockerRunner().run("docker-compose.yaml") diff --git a/tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml b/tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml similarity index 100% rename from tests/examples/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml rename to tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml similarity index 62% rename from tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml index abe5091ea..066519825 100644 --- 
a/tests/examples/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml @@ -3,16 +3,12 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - data: - type: string + captions_data: + type: string args: storage_args: diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml similarity index 55% rename from tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml index 2f9907df1..e9b67d68e 100644 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml @@ -3,18 +3,17 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary + + caption_data: + type: string produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: storage_args: diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml similarity index 61% rename from tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml rename to 
tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml index 18ea49b2c..053b4c5b5 100644 --- a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml @@ -2,17 +2,16 @@ name: First component description: This is an example component image: example_component:latest -produces: - images: - fields: - data: - type: binary +consumes: + images_data: + type: binary - captions: - fields: - data: - type: string +produces: + captions_data: + type: string + images_data: + type: binary args: storage_args: description: Storage arguments diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml new file mode 100644 index 000000000..a1a7995a2 --- /dev/null +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml @@ -0,0 +1,24 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + captions_description: + type: binary + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml similarity index 53% rename from tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml index 45964a8c6..053b4c5b5 
100644 --- a/tests/examples/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml @@ -3,21 +3,15 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - data: - type: string + captions_data: + type: string - images: - fields: - data: - type: binary + images_data: + type: binary args: storage_args: description: Storage arguments diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml new file mode 100644 index 000000000..8e0517f0a --- /dev/null +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml @@ -0,0 +1,21 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: string + + captions_data: + type: string + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile diff --git a/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml similarity index 50% rename from 
tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml index 45964a8c6..0841688e9 100644 --- a/tests/examples/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml @@ -2,22 +2,13 @@ name: First component description: This is an example component image: example_component:latest -consumes: - images: - fields: - data: - type: binary - produces: - captions: - fields: - data: - type: string + images_data: + type: binary + + captions_data: + type: string - images: - fields: - data: - type: binary args: storage_args: description: Storage arguments diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml new file mode 100644 index 000000000..1cef340bd --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml @@ -0,0 +1,29 @@ +name: Fourth component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + embeddings_data: + type: array + items: + type: float32 + +produces: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str + some_list: + 
description: Some list + type: list + items: + type: int \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml new file mode 100644 index 000000000..fa328ae01 --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml @@ -0,0 +1,18 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile similarity index 100% rename from tests/examples/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile diff --git a/tests/examples/example_specs/components/component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml similarity index 59% rename from tests/examples/example_specs/components/component.yaml rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml index 973cc3e6b..fb6ebbaa0 100644 
--- a/tests/examples/example_specs/components/component.yaml +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml @@ -1,4 +1,4 @@ -name: Example component +name: Third component description: This is an example component image: example_component:latest @@ -6,18 +6,19 @@ consumes: images_data: type: binary -produces: - images_data: + captions_data: + type: string + + embeddings_data: type: array items: type: float32 -additionalFields: false +produces: + images_data: + type: binary args: - flag: - description: user argument + storage_args: + description: Storage arguments type: str - value: - description: integer value - type: int diff --git a/tests/test_compiler.py b/tests/pipeline/test_compiler.py similarity index 99% rename from tests/test_compiler.py rename to tests/pipeline/test_compiler.py index 903c7963c..2c34f7f4e 100644 --- a/tests/test_compiler.py +++ b/tests/pipeline/test_compiler.py @@ -20,9 +20,9 @@ VertexPipelineConfigs, ) -COMPONENTS_PATH = Path("./tests/example_pipelines/valid_pipeline") +COMPONENTS_PATH = Path("./tests/pipeline/examples/pipelines/valid_pipeline") -VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") +VALID_PIPELINE = Path("./tests/pipeline/examples/pipelines/compiled_pipeline/") TEST_PIPELINES = [ ( diff --git a/tests/test_pipeline.py b/tests/pipeline/test_pipeline.py similarity index 98% rename from tests/test_pipeline.py rename to tests/pipeline/test_pipeline.py index 37d421ef6..b4deebc97 100644 --- a/tests/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -8,8 +8,8 @@ from fondant.core.exceptions import InvalidPipelineDefinition from fondant.pipeline import ComponentOp, Pipeline, Resources -valid_pipeline_path = Path(__file__).parent / "example_pipelines/valid_pipeline" -invalid_pipeline_path = Path(__file__).parent / "example_pipelines/invalid_pipeline" +valid_pipeline_path = Path(__file__).parent / "examples/pipelines/valid_pipeline" 
+invalid_pipeline_path = Path(__file__).parent / "examples/pipelines/invalid_pipeline" def yaml_file_to_dict(file_path): diff --git a/tests/test_runner.py b/tests/pipeline/test_runner.py similarity index 98% rename from tests/test_runner.py rename to tests/pipeline/test_runner.py index 84ad63304..011f65e55 100644 --- a/tests/test_runner.py +++ b/tests/pipeline/test_runner.py @@ -11,7 +11,7 @@ VertexRunner, ) -VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") +VALID_PIPELINE = Path("./tests/pipeline/examples/pipelines/compiled_pipeline/") def test_docker_runner(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 7897719aa..61fa8630f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -55,16 +55,16 @@ def test_basic_invocation(command): @pytest.mark.parametrize( "module_str", [ - "example_modules.component", - "example_modules/component", - "example_modules.component.py", - "example_modules/component.py", + "examples.example_modules.component", + "examples.example_modules/component", + "examples.example_modules.component.py", + "examples.example_modules/component.py", ], ) def test_get_module(module_str): """Test get module method.""" module = get_module(module_str) - assert module.__name__ == "example_modules.component" + assert module.__name__ == "examples.example_modules.component" def test_get_module_error(): @@ -77,7 +77,7 @@ def test_get_module_error(): "module_str", [ __name__, # cannot be split - "example_modules.component", # module does not exist + "examples.example_modules.component", # module does not exist ], ) def test_component_from_module(module_str): @@ -89,8 +89,10 @@ def test_component_from_module(module_str): @pytest.mark.parametrize( "module_str", [ - "example_modules.invalid_component", # module contains more than one component class - "example_modules.invalid_double_components", # module does not contain a component class + # module contains more than one component class + 
"examples.example_modules.invalid_component", + # module does not contain a component class + "examples.example_modules.invalid_double_components", ], ) def test_component_from_module_error(module_str): @@ -103,7 +105,7 @@ def test_component_from_module_error(module_str): "module_str", [ __name__, - "example_modules.pipeline", + "examples.example_modules.pipeline", ], ) def test_pipeline_from_module(module_str): @@ -115,8 +117,10 @@ def test_pipeline_from_module(module_str): @pytest.mark.parametrize( "module_str", [ - "example_modules.component", # module does not contain a pipeline instance - "example_modules.invalid_double_pipeline", # module contains many pipeline instances + # module does not contain a pipeline instance + "examples.example_modules.component", + # module contains many pipeline instances + "examples.example_modules.invalid_double_pipeline", ], ) def test_pipeline_from_module_error(module_str): @@ -417,7 +421,7 @@ def test_vertex_run(tmp_path_factory): def test_component_build(mock_build, mock_push): """Test that the build command works as expected.""" args = argparse.Namespace( - component_dir=Path(__file__).parent / "example_component", + component_dir=Path(__file__).parent / "examples/example_component", tag="image:test", build_arg=["key=value"], nocache=True, @@ -435,7 +439,7 @@ def test_component_build(mock_build, mock_push): # Check that docker build and push were executed correctly mock_build.assert_called_with( - path=str(Path(__file__).parent / "example_component"), + path=str(Path(__file__).parent / "examples/example_component"), tag="image:test", buildargs={"key": "value"}, nocache=True, @@ -449,7 +453,7 @@ def test_component_build(mock_build, mock_push): # Check that the component specification file was updated correctly with open( - Path(__file__).parent / "example_component" / "fondant_component.yaml", + Path(__file__).parent / "examples/example_component" / "fondant_component.yaml", "r+", ) as f: content = f.read() diff --git 
a/tox.ini b/tox.ini index acd58f104..d22216b49 100644 --- a/tox.ini +++ b/tox.ini @@ -48,6 +48,6 @@ commands_pre= poetry install --all-extras poetry show commands= - poetry run python -m pytest tests -vv --cov fondant --cov-report term-missing + poetry run python -m pytest tests -vv --cov fondant --cov-report term-missing --ignore=tests/integration_tests commands_post= bash ./scripts/post-build.sh From f0344c81ce845c7cdba31f0bf51e3bb111373d31 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 24 Nov 2023 09:14:50 +0100 Subject: [PATCH 31/34] Resolve conflicts --- .../examples/evolution_examples/1/component.yaml | 13 ------------- .../first_component/fondant_component.yaml | 7 ------- .../second_component/fondant_component.yaml | 6 ------ .../first_component/fondant_component.yaml | 8 -------- 4 files changed, 34 deletions(-) diff --git a/tests/core/examples/evolution_examples/1/component.yaml b/tests/core/examples/evolution_examples/1/component.yaml index 457363d38..99123c767 100644 --- a/tests/core/examples/evolution_examples/1/component.yaml +++ b/tests/core/examples/evolution_examples/1/component.yaml @@ -3,18 +3,6 @@ description: This is an example component image: example_component:latest consumes: -<<<<<<<< HEAD:tests/core/examples/component_specs/invalid_component.yaml - images: - fields: - data: - type: binary - -produces: - captions: - fields: - data: - type: string -======== images_data: type: binary @@ -23,7 +11,6 @@ produces: type: array items: type: float32 ->>>>>>>> feature/redesign-dataset-format-and-interface:tests/core/examples/evolution_examples/1/component.yaml Arguments: storage_args: diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml index 763945e1b..f41ee8678 100644 --- a/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml +++ 
b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml @@ -7,15 +7,8 @@ consumes: type: binary produces: -<<<<<<<< HEAD:tests/core/examples/evolution_examples/1/component.yaml - embeddings_data: - type: array - items: - type: float32 -======== captions_data: type: string ->>>>>>>> feature/redesign-dataset-format-and-interface:tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml args: storage_args: diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml index 7791a969f..fa328ae01 100644 --- a/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml @@ -5,12 +5,6 @@ image: example_component:latest consumes: images_data: type: binary -<<<<<<<< HEAD:tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml -======== - - caption_data: - type: string ->>>>>>>> feature/redesign-dataset-format-and-interface:tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml produces: embeddings_data: diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml index 95e72f13c..78cae1b36 100644 --- a/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml @@ -7,17 +7,9 @@ consumes: type: binary produces: -<<<<<<<< 
HEAD:tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml images_data: type: binary -======== ->>>>>>>> feature/redesign-dataset-format-and-interface:tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml - captions_data: - type: string - - images_data: - type: binary args: storage_args: description: Storage arguments From 826f0618b703079cb6890bffa03eaaef108783eb Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 24 Nov 2023 10:49:27 +0100 Subject: [PATCH 32/34] Addressing comments --- src/fondant/pipeline/pipeline.py | 195 ++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 2 deletions(-) diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index 05be61c17..c25203487 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -138,6 +138,9 @@ def __init__( cluster_type: t.Optional[str] = "default", client_kwargs: t.Optional[dict] = None, resources: t.Optional[Resources] = None, + schema: t.Optional[t.Dict[str, t.Any]] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, + produces: t.Optional[t.Dict[str, t.Any]] = None, ) -> None: self.component_dir = Path(component_dir) self.input_partition_rows = input_partition_rows @@ -154,7 +157,9 @@ def __init__( self._add_component_argument("cache", self.cache) self._add_component_argument("cluster_type", cluster_type) self._add_component_argument("client_kwargs", client_kwargs) - + self._add_component_argument("schema", schema) + self._add_component_argument("consumes", consumes) + self._add_component_argument("produces", produces) self.arguments.setdefault("component_spec", self.component_spec.specification) self.resources = resources or Resources() @@ -221,6 +226,8 @@ def from_registry( cache: t.Optional[bool] = True, cluster_type: t.Optional[str] = "default", client_kwargs: t.Optional[dict] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, 
+ produces: t.Optional[t.Dict[str, t.Any]] = None, ) -> "ComponentOp": """Load a reusable component by its name. @@ -248,6 +255,8 @@ def from_registry( cache=cache, cluster_type=cluster_type, client_kwargs=client_kwargs, + consumes=consumes, + produces=produces, ) def get_component_cache_key( @@ -319,11 +328,193 @@ def __init__( self._graph: t.OrderedDict[str, t.Any] = OrderedDict() self.task_without_dependencies_added = False + def _build_component_op( + self, + name, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + schema: t.Optional[t.Dict[str, t.Any]] = None, + consumes: t.Optional[t.Dict[str, t.Any]] = None, + produces: t.Optional[t.Dict[str, t.Any]] = None, + ) -> ComponentOp: + """Building ComponentOp.""" + if not self._is_custom_component(path_or_name=name): + name = self._get_registry_path(name) + return ComponentOp( + name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + schema=schema, + consumes=consumes, + produces=produces, + ) + + def read( + self, + name, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + schema: t.Dict[str, str], + ) -> "Pipeline": + """ + Add a reading component to the pipeline. + + Args: + name: Name of the resuable component or a path to the component directory. + arguments: A dictionary containing the argument name and value for the operation. + input_partition_rows: The number of rows to load per partition. 
Set to override the + automatic partitioning. + cache: If true the cached results of previous components will be used, if available. + cluster_type: The type of cluster to use for distributed execution (default is "local"). + client_kwargs: Keyword arguments used to initialise the dask client. + resources: The resources to assign to the operation. + schema: Schema which will be used to initialise the dataset. + """ + component_op = self._build_component_op( + name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + schema=schema, + ) + + self.add_op(component_op) + return self + + def apply( + self, + name, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + consumes: t.Optional[t.Dict[str, str]] = None, + produces: t.Optional[t.Dict[str, str]] = None, + ) -> "Pipeline": + """ + Add a reading component to the pipeline. + + Args: + name: Name of the resuable component or a path to the component directory. + arguments: A dictionary containing the argument name and value for the operation. + input_partition_rows: The number of rows to load per partition. Set to override the + automatic partitioning. + cache: If true the cached results of previous components will be used, if available. + cluster_type: The type of cluster to use for distributed execution (default is "local"). + client_kwargs: Keyword arguments used to initialise the dask client. + resources: The resources to assign to the operation. + consumes: Dataframe columns that will be consumed by the component. + produces: Dataframe columns that will be produced by the component. 
+ """ + component_op = self._build_component_op( + name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + consumes=consumes, + produces=produces, + ) + + previous_component = self._get_previous_component() + self.add_op(component_op, dependencies=previous_component) + return self + + def write( + self, + name, + *, + arguments: t.Optional[t.Dict[str, t.Any]] = None, + input_partition_rows: t.Optional[t.Union[str, int]] = None, + cache: t.Optional[bool] = True, + cluster_type: t.Optional[str] = "default", + client_kwargs: t.Optional[dict] = None, + resources: t.Optional[Resources] = None, + consumes: t.Optional[t.Dict[str, str]] = None, + schema: t.Optional[t.Dict[str, str]] = None, + ): + """ + Add a reading component to the pipeline. + + Args: + name: Name of the resuable component or a path to the component directory. + arguments: A dictionary containing the argument name and value for the operation. + input_partition_rows: The number of rows to load per partition. Set to override the + automatic partitioning. + cache: If true the cached results of previous components will be used, if available. + cluster_type: The type of cluster to use for distributed execution (default is "local"). + client_kwargs: Keyword arguments used to initialise the dask client. + resources: The resources to assign to the operation. + consumes: Dataframe columns that will be consumed by the component. + schema: Schema which will be used to write the dataset. 
+ """ + component_op = self._build_component_op( + name, + arguments=arguments, + input_partition_rows=input_partition_rows, + cache=cache, + cluster_type=cluster_type, + client_kwargs=client_kwargs, + resources=resources, + consumes=consumes, + schema=schema, + ) + + # Get previous component + previous_component = self._get_previous_component() + self.add_op(component_op, dependencies=previous_component) + + @staticmethod + def _is_custom_component(path_or_name): + """Checks if name is a local path and a custom component.""" + components_dir: Path = Path(path_or_name) + return components_dir.exists() and components_dir.is_dir() + + @staticmethod + def _get_registry_path(name): + """Checks if name is a local path and a custom component.""" + components_dir: Path = t.cast(Path, files("fondant") / f"components/{name}") + if not (components_dir.exists() and components_dir.is_dir()): + msg = f"No reusable component with name {name} found." + raise ValueError(msg) + return components_dir + + def _get_previous_component(self) -> ComponentOp: + """Return previous component that was added to the task graph.""" + previous_component = list(self._graph.items())[-1][0] + if previous_component is None: + msg = "No previous component found." + raise ValueError(msg) + + return previous_component + def add_op( self, task: ComponentOp, dependencies: t.Optional[t.Union[ComponentOp, t.List[ComponentOp]]] = None, - ): + ) -> "Pipeline": """ Add a task to the pipeline with an optional dependency. 
From 4bb35a4de6e5ac22889c5201a90b1f51f5dbe138 Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 24 Nov 2023 11:51:44 +0100 Subject: [PATCH 33/34] Overwriting consumes and produces of component specification --- src/fondant/component/component.py | 7 ++- src/fondant/component/executor.py | 57 +++++++++++++++---- src/fondant/core/component_spec.py | 24 ++++++++ src/fondant/pipeline/pipeline.py | 9 ++- tests/component/test_component.py | 46 ++++++++++++++++ tests/core/test_component_specs.py | 35 ++++++++++++ tests/pipeline/test_pipeline.py | 88 +++++++++++++++++------------- 7 files changed, 214 insertions(+), 52 deletions(-) diff --git a/src/fondant/component/component.py b/src/fondant/component/component.py index 5b33a3876..919308938 100644 --- a/src/fondant/component/component.py +++ b/src/fondant/component/component.py @@ -16,7 +16,12 @@ class BaseComponent: **kwargs: The provided user arguments are passed in as keyword arguments """ - def __init__(self, spec: ComponentSpec, schema: t.Optional[t.Dict[str, str]] = None, **kwargs): + def __init__( + self, + spec: ComponentSpec, + schema: t.Optional[t.Dict[str, str]] = None, + **kwargs, + ): pass diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 31ef923c8..8518b8448 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -69,7 +69,7 @@ def __init__( client_kwargs: t.Optional[dict] = None, schema: t.Optional[t.Dict[str, str]] = None, consumes: t.Optional[t.Dict[str, str]] = None, - produces: t.Optional[t.Dict[str, str]] = None + produces: t.Optional[t.Dict[str, str]] = None, ) -> None: self.spec = spec self.cache = cache @@ -79,6 +79,12 @@ def __init__( self.user_arguments = user_arguments self.input_partition_rows = input_partition_rows self.schema = schema + + # TODO: either overwrite the component spec + self.spec.overwrite_field_mapping(consumes, section_to_overwrite="consumes") + self.spec.overwrite_field_mapping(produces, 
section_to_overwrite="produces") + + # TODO: or accessing attributes in data_io.py self.consumes = consumes self.produces = produces @@ -133,13 +139,19 @@ def from_args(cls) -> "Executor": cache = args.cache cluster_type = args.cluster_type client_kwargs = args.client_kwargs + schema = args.schema + consumes = args.consumes + produces = args.produces return cls.from_spec( component_spec, cache=cache, input_partition_rows=input_partition_rows, cluster_type=cluster_type, - client_kwargs=client_kwargs + client_kwargs=client_kwargs, + schema=schema, + consumes=consumes, + produces=produces, ) @classmethod @@ -150,7 +162,10 @@ def from_spec( cache: bool, input_partition_rows: int, cluster_type: t.Optional[str], - client_kwargs: t.Optional[dict] + client_kwargs: t.Optional[dict], + schema: t.Optional[dict], + consumes: t.Optional[dict], + produces: t.Optional[dict], ) -> "Executor": """Create an executor from a component spec.""" args_dict = vars(cls._add_and_parse_args(component_spec)) @@ -170,6 +185,15 @@ def from_spec( if "client_kwargs" in args_dict: args_dict.pop("client_kwargs") + if "schema" in args_dict: + args_dict.pop("schema") + + if "consumes" in args_dict: + args_dict.pop("consumes") + + if "produces" in args_dict: + args_dict.pop("produces") + input_manifest_path = args_dict.pop("input_manifest_path") output_manifest_path = args_dict.pop("output_manifest_path") metadata = args_dict.pop("metadata") @@ -184,7 +208,10 @@ def from_spec( user_arguments=args_dict, input_partition_rows=input_partition_rows, cluster_type=cluster_type, - client_kwargs=client_kwargs + client_kwargs=client_kwargs, + schema=schema, + consumes=consumes, + produces=produces, ) @classmethod @@ -261,12 +288,18 @@ def _execute_component( A Dask DataFrame containing the output data """ - def _write_data(self, dataframe: dd.DataFrame, *, manifest: Manifest, produces: t.Optional[t.Dict[str, str]]): + def _write_data( + self, + dataframe: dd.DataFrame, + *, + manifest: Manifest, + produces: 
t.Optional[t.Dict[str, str]], + ): """Create a data writer given a manifest and writes out the index and subsets.""" data_writer = DaskDataWriter( manifest=manifest, component_spec=self.spec, - produces=produces + produces=produces, ) data_writer.write_dataframe(dataframe, self.client) @@ -351,7 +384,11 @@ def _run_execution( component_spec=self.spec, run_id=self.metadata.run_id, ) - self._write_data(dataframe=output_df, manifest=output_manifest, produces=self.produces) + self._write_data( + dataframe=output_df, + manifest=output_manifest, + produces=self.produces, + ) return output_manifest @@ -489,7 +526,7 @@ def _execute_component( manifest=manifest, component_spec=self.spec, input_partition_rows=self.input_partition_rows, - consumes=self.consumes + consumes=self.consumes, ) dataframe = data_loader.load_dataframe() return component.transform(dataframe) @@ -542,7 +579,7 @@ def _execute_component( manifest=manifest, component_spec=self.spec, input_partition_rows=self.input_partition_rows, - consumes=self.consumes + consumes=self.consumes, ) dataframe = data_loader.load_dataframe() @@ -604,7 +641,7 @@ def _execute_component( manifest=manifest, component_spec=self.spec, input_partition_rows=self.input_partition_rows, - consumes=self.consumes + consumes=self.consumes, ) dataframe = data_loader.load_dataframe() component.write(dataframe) diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index 4dd945568..cb1bce770 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -181,6 +181,30 @@ def produces(self) -> t.Mapping[str, Field]: }, ) + def overwrite_field_mapping( + self, + mapping_dict: t.Dict[str, str], + section_to_overwrite: str, + ): + if section_to_overwrite not in ["consumes", "produces"]: + msg = ( + f"Can not overwrite {section_to_overwrite} because it is not part of the component " + f"specification." 
+ ) + raise ValueError(msg) + + for name, item in mapping_dict.items(): + if name in self._specification[section_to_overwrite]: + self._specification[section_to_overwrite][item] = self._specification[ + section_to_overwrite + ].pop(name) + else: + msg = ( + f"Can not map {name} to {item}, because {name} is not part of the " + f"component specification." + ) + raise ValueError(msg) + @property def args(self) -> t.Mapping[str, Argument]: args = self.default_arguments diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index c25203487..8b3deaafe 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -151,6 +151,9 @@ def __init__( self.cache = self._configure_caching_from_image_tag(cache) self.cluster_type = cluster_type self.client_kwargs = client_kwargs + self.schema = schema + self.consumes = consumes + self.produces = produces self.arguments = arguments or {} self._add_component_argument("input_partition_rows", input_partition_rows) @@ -240,6 +243,8 @@ def from_registry( cache: Set to False to disable caching, True by default. cluster_type: The type of cluster to use for distributed execution (default is "local"). client_kwargs: Keyword arguments used to initialise the dask client. + consumes: Dataframe columns that will be consumed by the component. + produces: Dataframe columns that will be produced by the component. """ components_dir: Path = t.cast(Path, files("fondant") / f"components/{name}") @@ -453,7 +458,7 @@ def write( client_kwargs: t.Optional[dict] = None, resources: t.Optional[Resources] = None, consumes: t.Optional[t.Dict[str, str]] = None, - schema: t.Optional[t.Dict[str, str]] = None, + schema: t.Optional[t.Dict[str, str]], ): """ Add a reading component to the pipeline. 
@@ -503,7 +508,7 @@ def _get_registry_path(name): def _get_previous_component(self) -> ComponentOp: """Return previous component that was added to the task graph.""" - previous_component = list(self._graph.items())[-1][0] + previous_component = list(self._graph.items())[-1][-1]["fondant_component_op"] if previous_component is None: msg = "No previous component found." raise ValueError(msg) diff --git a/tests/component/test_component.py b/tests/component/test_component.py index 830ce2963..79402875e 100644 --- a/tests/component/test_component.py +++ b/tests/component/test_component.py @@ -457,3 +457,49 @@ def write(self, dataframe): with mock.patch.object(MyWriteComponent, "write", write): executor.execute(MyWriteComponent) write.mock.assert_called_once() + + +def test_component_overwriting_consumes_produces(metadata): + # Mock CLI arguments + sys.argv = [ + "", + "--input_manifest_path", + str(components_path / "arguments/input_manifest.json"), + "--metadata", + metadata.to_json(), + "--output_manifest_path", + str(components_path / "arguments/output_manifest.json"), + "--component_spec", + yaml_file_to_json_string(components_path / "component.yaml"), + "--cache", + "True", + "--flag", + "success", + "--value", + "1", + "--input_partition_rows", + "100", + "--override_default_arg", + "bar", + "--override_default_none_arg", + "3.14", + "--override_default_arg_with_none", + "None", + "--consumes", + '{"images_data": "images"}', + "--produces", + '{"images_data": "images"}', + ] + + class MyExecutor(Executor): + """Base component with dummy methods so it can be instantiated.""" + + def _load_or_create_manifest(self) -> Manifest: + pass + + def _process_dataset(self, manifest: Manifest) -> t.Union[None, dd.DataFrame]: + pass + + executor = MyExecutor.from_args() + assert "images" in executor.spec.consumes + assert "images" in executor.spec.produces diff --git a/tests/core/test_component_specs.py b/tests/core/test_component_specs.py index dcbf4c2ed..07d507149 100644 --- 
a/tests/core/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -135,3 +135,38 @@ def test_kubeflow_component_spec_repr(valid_kubeflow_schema): kubeflow_component_spec = KubeflowComponentSpec(valid_kubeflow_schema) expected_repr = f"KubeflowComponentSpec({valid_kubeflow_schema!r})" assert repr(kubeflow_component_spec) == expected_repr + + +def test_overwriting_field_mapping(valid_fondant_schema): + """Test overwriting the field mapping.""" + fondant_component = ComponentSpec(valid_fondant_schema) + fondant_component.overwrite_field_mapping( + {"images": "pictures"}, + section_to_overwrite="consumes", + ) + fondant_component.overwrite_field_mapping( + {"captions": "text"}, + section_to_overwrite="produces", + ) + + assert "pictures" in fondant_component.consumes + assert "text" in fondant_component.produces + + with pytest.raises( + ValueError, + match="Can not map captions to text, because captions is not part of the component " + "specification.", + ): + fondant_component.overwrite_field_mapping( + {"captions": "text"}, + section_to_overwrite="produces", + ) + + with pytest.raises( + ValueError, + match="Can not overwrite subsets because it is not part of the component specification.", + ): + fondant_component.overwrite_field_mapping( + {"captions": "text"}, + section_to_overwrite="subsets", + ) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 30c4cdb1b..60e2fe82a 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -29,13 +29,13 @@ def default_pipeline_args(): "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) def test_component_op( - valid_pipeline_example, + valid_pipeline_example, ): component_args = {"storage_args": "a dummy string arg"} example_dir, component_names = valid_pipeline_example @@ -75,50 +75,60 @@ def 
test_new_pipeline_interface(): dataset = pipeline.read( name="load_from_hf_hub", schema={ - "image": "binary" # or pa.binary() - } + "image": "binary", # or pa.binary() + }, ) dataset = dataset.apply( name="caption_images", consumes={ - "images_data": "image" + "images_data": "image", }, produces={ - "captions": "text" - } + "captions": "text", + }, ) dataset = dataset.apply( name="embed_text", consumes={ - "text_data": "text" - } + "text_data": "text", + }, ) dataset.write( - name="write_data", + name="write_to_hf_hub", schema={ - "image": "image", - "caption": "text" - } + "image": "binary", + "caption": "string", + }, ) - assert True + # Get component_ops from pipeline + component_ops = [ + element[1]["fondant_component_op"] for element in list(pipeline._graph.items()) + ] + + assert component_ops[0].schema == {"image": "binary"} + assert component_ops[1].consumes == {"images_data": "image"} + assert component_ops[1].produces == {"captions": "text"} + assert component_ops[2].consumes == {"text_data": "text"} + assert component_ops[2].produces is None + assert component_ops[3].schema == {"image": "binary", "caption": "string"} @pytest.mark.parametrize( "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) def test_component_op_hash( - valid_pipeline_example, - monkeypatch, + valid_pipeline_example, + monkeypatch, ): example_dir, component_names = valid_pipeline_example components_path = Path(valid_pipeline_path / example_dir) @@ -141,16 +151,16 @@ def test_component_op_hash( comp_0_op_spec_0_copy = copy.deepcopy(comp_0_op_spec_0) assert ( - comp_0_op_spec_0.get_component_cache_key() - != comp_0_op_spec_1.get_component_cache_key() + comp_0_op_spec_0.get_component_cache_key() + != comp_0_op_spec_1.get_component_cache_key() ) assert ( - comp_0_op_spec_0.get_component_cache_key() - == comp_0_op_spec_0_copy.get_component_cache_key() 
+ comp_0_op_spec_0.get_component_cache_key() + == comp_0_op_spec_0_copy.get_component_cache_key() ) assert ( - comp_0_op_spec_0.get_component_cache_key() - != comp_1_op_spec_0.get_component_cache_key() + comp_0_op_spec_0.get_component_cache_key() + != comp_1_op_spec_0.get_component_cache_key() ) @@ -177,16 +187,16 @@ def test_component_op_caching_strategy(monkeypatch): "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) def test_valid_pipeline( - default_pipeline_args, - valid_pipeline_example, - tmp_path, - monkeypatch, + default_pipeline_args, + valid_pipeline_example, + tmp_path, + monkeypatch, ): """Test that a valid pipeline definition can be compiled without errors.""" example_dir, component_names = valid_pipeline_example @@ -232,8 +242,8 @@ def test_valid_pipeline( "valid_pipeline_example", [ ( - "example_1", - ["first_component", "second_component", "third_component"], + "example_1", + ["first_component", "second_component", "third_component"], ), ], ) @@ -276,8 +286,8 @@ def test_invalid_pipeline_dependencies(default_pipeline_args, valid_pipeline_exa ], ) def test_invalid_pipeline_declaration( - default_pipeline_args, - invalid_pipeline_example, + default_pipeline_args, + invalid_pipeline_example, ): """Test that an InvalidPipelineDefinition exception is raised when attempting to register invalid components combinations. 
@@ -346,8 +356,8 @@ def test_reusable_component_op(): component_name = "this_component_does_not_exist" with pytest.raises( - ValueError, - match=f"No reusable component with name {component_name} " "found.", + ValueError, + match=f"No reusable component with name {component_name} " "found.", ): ComponentOp.from_registry( name=component_name, @@ -374,8 +384,8 @@ def test_defining_reusable_component_op_with_custom_spec(): ) assert ( - load_from_hub_custom_op.component_spec - == load_from_hub_default_op.component_spec + load_from_hub_custom_op.component_spec + == load_from_hub_default_op.component_spec ) From e7a960f8ab63ff37b2d8b37385ba6ec7557cae3d Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Fri, 24 Nov 2023 14:53:51 +0100 Subject: [PATCH 34/34] Consumes and produces renaming --- src/fondant/component/data_io.py | 49 +++++++++++++++++++----------- src/fondant/component/executor.py | 5 --- src/fondant/core/component_spec.py | 24 --------------- tests/component/test_component.py | 48 +---------------------------- tests/component/test_data_io.py | 37 ++++++++++++++++++++++ tests/core/test_component_specs.py | 35 --------------------- 6 files changed, 69 insertions(+), 129 deletions(-) diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 79a181f8d..82fe51241 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -20,17 +20,18 @@ def __init__(self, *, manifest: Manifest, component_spec: ComponentSpec) -> None self.manifest = manifest self.component_spec = component_spec - class DaskDataLoader(DataIO): def __init__( - self, - *, - manifest: Manifest, - component_spec: ComponentSpec, - input_partition_rows: t.Optional[int] = None, + self, + *, + manifest: Manifest, + component_spec: ComponentSpec, + input_partition_rows: t.Optional[int] = None, + consumes: t.Optional[t.Dict[str, str]] = None ): super().__init__(manifest=manifest, component_spec=component_spec) self.input_partition_rows = 
input_partition_rows + self.consumes = consumes def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: """ @@ -101,6 +102,7 @@ def load_dataframe(self) -> dd.DataFrame: DEFAULT_INDEX_NAME, ) + for field_name in self.component_spec.consumes: location = self.manifest.get_field_location(field_name) field_mapping[location].append(field_name) @@ -131,22 +133,28 @@ def load_dataframe(self) -> dd.DataFrame: logging.info(f"Columns of dataframe: {list(dataframe.columns)}") + # Renaming dataframe columns + if self.consumes: + dataframe = dataframe.rename(columns=self.consumes) + return dataframe class DaskDataWriter(DataIO): def __init__( - self, - *, - manifest: Manifest, - component_spec: ComponentSpec, + self, + *, + manifest: Manifest, + component_spec: ComponentSpec, + produces: t.Optional[t.Dict[str, str]] = None ): super().__init__(manifest=manifest, component_spec=component_spec) + self.produces = produces def write_dataframe( - self, - dataframe: dd.DataFrame, - dask_client: t.Optional[Client] = None, + self, + dataframe: dd.DataFrame, + dask_client: t.Optional[Client] = None, ) -> None: columns_to_produce = [ column_name for column_name, field in self.component_spec.produces.items() @@ -158,6 +166,11 @@ def write_dataframe( self.validate_dataframe_columns(dataframe, columns_to_produce) dataframe = dataframe[columns_to_produce] + + # Renaming dataframe produces + if self.produces: + dataframe = dataframe.rename(columns=self.produces) + write_task = self._write_dataframe(dataframe) with ProgressBar(): @@ -184,7 +197,7 @@ def validate_dataframe_columns(dataframe: dd.DataFrame, columns: t.List[str]): def _write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar: """Create dataframe writing task.""" location = ( - self.manifest.base_path + "/" + self.component_spec.component_folder_name + self.manifest.base_path + "/" + self.component_spec.component_folder_name ) schema = { field.name: field.type.value @@ -194,10 +207,10 @@ def 
_write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar: @staticmethod def _create_write_task( - dataframe: dd.DataFrame, - *, - location: str, - schema: t.Dict[str, str], + dataframe: dd.DataFrame, + *, + location: str, + schema: t.Dict[str, str], ) -> dd.core.Scalar: """ Creates a delayed Dask task to upload the given DataFrame to the remote storage location diff --git a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 8518b8448..c925f5ef6 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -80,11 +80,6 @@ def __init__( self.input_partition_rows = input_partition_rows self.schema = schema - # TODO: either overwrite the component spec - self.spec.overwrite_field_mapping(consumes, section_to_overwrite="consumes") - self.spec.overwrite_field_mapping(produces, section_to_overwrite="produces") - - # TODO: or accessing attributes in data_io.py self.consumes = consumes self.produces = produces diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index cb1bce770..4dd945568 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -181,30 +181,6 @@ def produces(self) -> t.Mapping[str, Field]: }, ) - def overwrite_field_mapping( - self, - mapping_dict: t.Dict[str, str], - section_to_overwrite: str, - ): - if section_to_overwrite not in ["consumes", "produces"]: - msg = ( - f"Can not overwrite {section_to_overwrite} because it is not part of the component " - f"specification." - ) - raise ValueError(msg) - - for name, item in mapping_dict.items(): - if name in self._specification[section_to_overwrite]: - self._specification[section_to_overwrite][item] = self._specification[ - section_to_overwrite - ].pop(name) - else: - msg = ( - f"Can not map {name} to {item}, because {name} is not part of the " - f"component specification." 
- ) - raise ValueError(msg) - @property def args(self) -> t.Mapping[str, Argument]: args = self.default_arguments diff --git a/tests/component/test_component.py b/tests/component/test_component.py index 79402875e..54cec78b6 100644 --- a/tests/component/test_component.py +++ b/tests/component/test_component.py @@ -446,7 +446,7 @@ def __init__(self, *args, flag, value): self.flag = flag self.value = value - def write(self, dataframe): + def write(self, dataframe, produces): assert self.flag == "success" assert self.value == 1 assert isinstance(dataframe, dd.DataFrame) @@ -457,49 +457,3 @@ def write(self, dataframe): with mock.patch.object(MyWriteComponent, "write", write): executor.execute(MyWriteComponent) write.mock.assert_called_once() - - -def test_component_overwriting_consumes_produces(metadata): - # Mock CLI arguments - sys.argv = [ - "", - "--input_manifest_path", - str(components_path / "arguments/input_manifest.json"), - "--metadata", - metadata.to_json(), - "--output_manifest_path", - str(components_path / "arguments/output_manifest.json"), - "--component_spec", - yaml_file_to_json_string(components_path / "component.yaml"), - "--cache", - "True", - "--flag", - "success", - "--value", - "1", - "--input_partition_rows", - "100", - "--override_default_arg", - "bar", - "--override_default_none_arg", - "3.14", - "--override_default_arg_with_none", - "None", - "--consumes", - '{"images_data": "images"}', - "--produces", - '{"images_data": "images"}', - ] - - class MyExecutor(Executor): - """Base component with dummy methods so it can be instantiated.""" - - def _load_or_create_manifest(self) -> Manifest: - pass - - def _process_dataset(self, manifest: Manifest) -> t.Union[None, dd.DataFrame]: - pass - - executor = MyExecutor.from_args() - assert "images" in executor.spec.consumes - assert "images" in executor.spec.produces diff --git a/tests/component/test_data_io.py b/tests/component/test_data_io.py index d9dad121f..7a6be4a94 100644 --- 
a/tests/component/test_data_io.py +++ b/tests/component/test_data_io.py @@ -51,6 +51,20 @@ def test_load_dataframe(manifest, component_spec): assert dataframe.index.name == "id" +def test_load_dataframe_custom_consumes(manifest, component_spec): + """Test merging of fields in a dataframe based on a component_spec.""" + dl = DaskDataLoader(manifest=manifest, component_spec=component_spec, consumes={"Name": "custom_name"}) + dataframe = dl.load_dataframe() + assert len(dataframe) == NUMBER_OF_TEST_ROWS + assert list(dataframe.columns) == [ + "custom_name", + "HP", + "Type 1", + "Type 2", + ] + assert dataframe.index.name == "id" + + def test_load_dataframe_default(manifest, component_spec): """Test merging of subsets in a dataframe based on a component_spec.""" dl = DaskDataLoader(manifest=manifest, component_spec=component_spec) @@ -95,6 +109,29 @@ def test_write_dataset( assert dataframe.index.name == "id" +def test_write_dataset_custom_produces( + tmp_path_factory, + dataframe, + manifest, + component_spec, + dask_client, +): + """Test writing out subsets.""" + # Dictionary specifying the expected subsets to write and their column names + columns = ["custom_name", "HP", "Type 1", "Type 2"] + with tmp_path_factory.mktemp("temp") as fn: + # override the base path of the manifest with the temp dir + manifest.update_metadata("base_path", str(fn)) + data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec, produces={"Name": "custom_name"}) + # write dataframe to temp dir + data_writer.write_dataframe(dataframe, dask_client) + # read written data and assert + dataframe = dd.read_parquet(fn) + assert len(dataframe) == NUMBER_OF_TEST_ROWS + assert list(dataframe.columns) == columns + assert dataframe.index.name == "id" + + # TODO: check if this is still needed? 
def test_write_reset_index( tmp_path_factory, diff --git a/tests/core/test_component_specs.py b/tests/core/test_component_specs.py index 07d507149..dcbf4c2ed 100644 --- a/tests/core/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -135,38 +135,3 @@ def test_kubeflow_component_spec_repr(valid_kubeflow_schema): kubeflow_component_spec = KubeflowComponentSpec(valid_kubeflow_schema) expected_repr = f"KubeflowComponentSpec({valid_kubeflow_schema!r})" assert repr(kubeflow_component_spec) == expected_repr - - -def test_overwriting_field_mapping(valid_fondant_schema): - """Test overwriting the field mapping.""" - fondant_component = ComponentSpec(valid_fondant_schema) - fondant_component.overwrite_field_mapping( - {"images": "pictures"}, - section_to_overwrite="consumes", - ) - fondant_component.overwrite_field_mapping( - {"captions": "text"}, - section_to_overwrite="produces", - ) - - assert "pictures" in fondant_component.consumes - assert "text" in fondant_component.produces - - with pytest.raises( - ValueError, - match="Can not map captions to text, because captions is not part of the component " - "specification.", - ): - fondant_component.overwrite_field_mapping( - {"captions": "text"}, - section_to_overwrite="produces", - ) - - with pytest.raises( - ValueError, - match="Can not overwrite subsets because it is not part of the component specification.", - ): - fondant_component.overwrite_field_mapping( - {"captions": "text"}, - section_to_overwrite="subsets", - )