ml6team · PhilippeMoussalli · Jan 30, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 17, 2024
diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py
@@ -94,8 +94,8 @@ def __init__(
         image: str,
         *,
         description: t.Optional[str] = None,
-        consumes: t.Optional[t.Dict[str, t.Union[str, pa.DataType, bool]]] = None,
-        produces: t.Optional[t.Dict[str, t.Union[str, pa.DataType, bool]]] = None,
+        consumes: t.Optional[t.Mapping[str, t.Union[str, pa.DataType, bool]]] = None,
+        produces: t.Optional[t.Mapping[str, t.Union[str, pa.DataType, bool]]] = None,
         previous_index: t.Optional[str] = None,
         args: t.Optional[t.Dict[str, t.Any]] = None,
         tags: t.Optional[t.List[str]] = None,

diff --git a/src/fondant/pipeline/lightweight_component.py b/src/fondant/pipeline/lightweight_component.py
@@ -5,7 +5,10 @@
 from dataclasses import asdict, dataclass
 from functools import wraps
 
+import pyarrow as pa
+
 from fondant.component import BaseComponent, Component
+from fondant.core.schema import Field, Type
 
 
 @dataclass
@@ -28,11 +31,24 @@ class PythonComponent(BaseComponent):
     def image(cls) -> Image:
         raise NotImplementedError
 
+    @classmethod
+    def consumes(cls) -> t.Optional[t.Union[list, str]]:
+        pass
+
+    @classmethod
+    def get_consumes_spec(
+        cls,
+        dataset_fields: t.Mapping[str, Field],
+        apply_consumes: t.Optional[t.Dict[str, t.Union[str, pa.DataType]]],
+    ):
+        pass
+
 
 def lightweight_component(
     *args,
     extra_requires: t.Optional[t.List[str]] = None,
     base_image: t.Optional[str] = None,
+    consumes: t.Optional[t.Union[list, str]] = None,
 ):
     """Decorator to enable a python component."""
 
@@ -121,6 +137,55 @@ class PythonComponentOp(cls, PythonComponent):
             def image(cls) -> Image:
                 return image
 
+            @classmethod
+            def consumes(cls) -> t.Optional[t.Union[list, str]]:
+                return consumes
+
+            @classmethod
+            def get_consumes_spec(
+                cls,
+                dataset_fields: t.Mapping[str, Field],
+                apply_consumes: t.Optional[t.Dict[str, t.Union[str, pa.DataType]]],
+            ):
+                consumes = cls.consumes()
+
+                if consumes == "generic":
+                    return {"additionalProperties": True}
+
+                # Get consumes spec from the dataset
+                consumes_spec = {k: v.type.to_dict() for k, v in dataset_fields.items()}
+
+                # Modify naming based on the consumes argument in the 'apply' method
+                if apply_consumes:
+                    for k, v in apply_consumes.items():
+                        if isinstance(v, str):
+                            consumes_spec[k] = consumes_spec.pop(v)
+                        elif isinstance(v, pa.DataType):
+                            consumes_spec[k] = Type(v).to_dict()
+                        else:
+                            msg = (
+                                f"Invalid data type for field `{k}` in the `apply_consumes` "
+                                f"argument. Only string and pa.DataType are allowed."
+                            )
+                            raise ValueError(
+                                msg,
+                            )
+
+                # Filter for values that are not in the user defined consumes list
+                if consumes:
+                    for field_to_consume in consumes:
+                        if field_to_consume not in consumes_spec.keys():
+                            msg = f"Field `{field_to_consume}` is not available in the dataset."
+                            raise ValueError(
+                                msg,
+                            )
+
+                        consumes_spec = {
+                            k: v for k, v in consumes_spec.items() if k in consumes
+                        }
+
+                return consumes_spec
+
         return PythonComponentOp
 
     # Call wrapper with function (`args[0]`) when no additional arguments were passed

diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py
@@ -206,7 +206,12 @@ def from_component_yaml(cls, path, **kwargs) -> "ComponentOp":
         )
 
     @classmethod
-    def from_ref(cls, ref: t.Any, **kwargs) -> "ComponentOp":
+    def from_ref(
+        cls,
+        ref: t.Any,
+        fields: t.Optional[t.Mapping[str, Field]] = None,
+        **kwargs,
+    ) -> "ComponentOp":
         """Create a ComponentOp from a reference. The reference can
         be a reusable component name, a path to a custom component,
         or a python component class.
@@ -217,11 +222,16 @@ def from_ref(cls, ref: t.Any, **kwargs) -> "ComponentOp":
                 image = ref.image()
                 description = ref.__doc__ or "python component"
 
+                if fields:
+                    consumes_spec = ref.get_consumes_spec(fields, kwargs["consumes"])
+                else:
+                    consumes_spec = {"additionalProperties": True}
+
                 component_spec = ComponentSpec(
                     name,
                     image.base_image,
                     description=description,
-                    consumes={"additionalProperties": True},
+                    consumes=consumes_spec,
                     produces={"additionalProperties": True},
                     args={
                         name: arg.to_spec()
@@ -724,6 +734,7 @@ def apply(
         """
         operation = ComponentOp.from_ref(
             ref,
+            fields=self.fields,
             produces=produces,
             consumes=consumes,
             arguments=arguments,
@@ -771,6 +782,7 @@ def write(
         """
         operation = ComponentOp.from_ref(
             ref,
+            fields=self.fields,
             consumes=consumes,
             arguments=arguments,
             input_partition_rows=input_partition_rows,

diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
@@ -71,7 +71,7 @@ def test_component_op(
 
 
 def test_component_op_python_component(default_pipeline_args):
-    @lightweight_component()
+    @lightweight_component(consumes="generic")
     class Foo(DaskLoadComponent):
         def load(self) -> dd.DataFrame:
             df = pd.DataFrame(

diff --git a/tests/pipeline/test_python_component.py b/tests/pipeline/test_python_component.py
@@ -7,25 +7,50 @@
 import pyarrow as pa
 import pytest
 from fondant.component import DaskLoadComponent, PandasTransformComponent
+from fondant.core.component_spec import OperationSpec
 from fondant.core.exceptions import InvalidPythonComponent
 from fondant.pipeline import Pipeline, lightweight_component
 from fondant.pipeline.compiler import DockerCompiler
+from fondant.testing import DockerComposeConfigs
 
 
-def test_build_python_script():
-    @lightweight_component()
+@pytest.fixture()
+def load_pipeline():
+    pipeline = Pipeline(
+        name="dummy-pipeline",
+        base_path="./data",
+    )
+
+    @lightweight_component(
+        base_image="python:3.8-slim-buster",
+        extra_requires=["pandas", "dask"],
+        consumes="generic",
+    )
     class CreateData(DaskLoadComponent):
         def load(self) -> dd.DataFrame:
             df = pd.DataFrame(
                 {
                     "x": [1, 2, 3],
                     "y": [4, 5, 6],
+                    "z": [7, 8, 9],
                 },
                 index=pd.Index(["a", "b", "c"], name="id"),
             )
             return dd.from_pandas(df, npartitions=1)
 
-    assert CreateData.image().script == textwrap.dedent(
+    load_script = CreateData.image().script
+
+    dataset = pipeline.read(
+        ref=CreateData,
+        produces={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()},
+    )
+
+    return pipeline, dataset, load_script
+
+
+def test_build_python_script(load_pipeline):
+    _, _, load_script = load_pipeline
+    assert load_script == textwrap.dedent(
         """\
         from typing import *
         import typing as t
@@ -43,6 +68,7 @@ def load(self) -> dd.DataFrame:
                     {
                         "x": [1, 2, 3],
                         "y": [4, 5, 6],
+                        "z": [7, 8, 9],
                     },
                     index=pd.Index(["a", "b", "c"], name="id"),
                 )
@@ -51,31 +77,8 @@ def load(self) -> dd.DataFrame:
     )
 
 
-def test_lightweight_component_sdk():
-    pipeline = Pipeline(
-        name="dummy-pipeline",
-        base_path="./data",
-    )
-
-    @lightweight_component(
-        base_image="python:3.8-slim-buster",
-        extra_requires=["pandas", "dask"],
-    )
-    class CreateData(DaskLoadComponent):
-        def load(self) -> dd.DataFrame:
-            df = pd.DataFrame(
-                {
-                    "x": [1, 2, 3],
-                    "y": [4, 5, 6],
-                },
-                index=pd.Index(["a", "b", "c"], name="id"),
-            )
-            return dd.from_pandas(df, npartitions=1)
-
-    dataset = pipeline.read(
-        ref=CreateData,
-        produces={"x": pa.int32(), "y": pa.int32()},
-    )
+def test_lightweight_component_sdk(load_pipeline):
+    pipeline, dataset, load_script = load_pipeline
 
     assert len(pipeline._graph.keys()) == 1
     operation_spec_dict = pipeline._graph["CreateData"][
@@ -90,10 +93,14 @@ def load(self) -> dd.DataFrame:
             "produces": {"additionalProperties": True},
         },
         "consumes": {},
-        "produces": {"x": {"type": "int32"}, "y": {"type": "int32"}},
+        "produces": {
+            "x": {"type": "int32"},
+            "y": {"type": "int32"},
+            "z": {"type": "int32"},
+        },
     }
 
-    @lightweight_component()
+    @lightweight_component(consumes="generic")
     class AddN(PandasTransformComponent):
         def __init__(self, n: int, **kwargs):
             self.n = n
@@ -104,8 +111,8 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
 
     _ = dataset.apply(
         ref=AddN,
-        produces={"x": pa.int32(), "y": pa.int32()},
-        consumes={"x": pa.int32(), "y": pa.int32()},
+        produces={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()},
+        consumes={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()},
         arguments={"n": 1},
     )
     assert len(pipeline._graph.keys()) == 1 + 1
@@ -120,14 +127,88 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
             "produces": {"additionalProperties": True},
             "args": {"n": {"type": "int"}},
         },
-        "consumes": {"x": {"type": "int32"}, "y": {"type": "int32"}},
-        "produces": {"x": {"type": "int32"}, "y": {"type": "int32"}},
+        "consumes": {
+            "x": {"type": "int32"},
+            "y": {"type": "int32"},
+            "z": {"type": "int32"},
+        },
+        "produces": {
+            "x": {"type": "int32"},
+            "y": {"type": "int32"},
+            "z": {"type": "int32"},
+        },
     }
     pipeline._validate_pipeline_definition(run_id="dummy-run-id")
 
     DockerCompiler().compile(pipeline)
 
 
+def test_valid_consumes_mapping(tmp_path_factory, load_pipeline):
+    @lightweight_component(
+        base_image="python:3.8",
+        extra_requires=[
+            "fondant[component]@git+https://github.com/ml6team/fondant@main",
+        ],
+        consumes=["a", "y"],
+    )
+    class AddN(PandasTransformComponent):
+        def __init__(self, n: int, **kwargs):
+            self.n = n
+
+        def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+            dataframe["a"] = dataframe["a"].map(lambda x: x + self.n)
+            return dataframe
+
+    pipeline, dataset, _ = load_pipeline
+
+    _ = dataset.apply(
+        ref=AddN,
+        consumes={"a": "x"},
+        produces={"a": pa.int32()},
+        arguments={"n": 1},
+    )
+
+    with tmp_path_factory.mktemp("temp") as fn:
+        output_path = str(fn / "kubeflow_pipeline.yml")
+        DockerCompiler().compile(pipeline=pipeline, output_path=output_path)
+        pipeline_configs = DockerComposeConfigs.from_spec(output_path)
+        operation_spec = OperationSpec.from_json(
+            pipeline_configs.component_configs["AddN"].arguments["operation_spec"],
+        )
+        assert all(k in ["a", "y"] for k in operation_spec.inner_consumes)
+        assert "z" not in operation_spec.inner_consumes
+
+
+def test_invalid_consumes_mapping(tmp_path_factory, load_pipeline):
+    @lightweight_component(
+        base_image="python:3.8",
+        extra_requires=[
+            "fondant[component]@git+https://github.com/ml6team/fondant@main",
+        ],
+        consumes=["nonExistingField"],
+    )
+    class AddN(PandasTransformComponent):
+        def __init__(self, n: int, **kwargs):
+            self.n = n
+
+        def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+            dataframe["a"] = dataframe["a"].map(lambda x: x + self.n)
+            return dataframe
+
+    _, dataset, _ = load_pipeline
+
+    with pytest.raises(
+        ValueError,
+        match="Field `nonExistingField` is not available in the dataset.",
+    ):
+        _ = dataset.apply(
+            ref=AddN,
+            consumes={"a": "x"},
+            produces={"a": pa.int32()},
+            arguments={"n": 1},
+        )
+
+
 def test_lightweight_component_missing_decorator():
     pipeline = Pipeline(
         name="dummy-pipeline",