-
Notifications
You must be signed in to change notification settings - Fork 26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Start from dataset schema for lightweight python component consumes
#789
Changes from 11 commits
7a6b78a
c007244
e52a5c4
161f214
66a9103
2e10af1
e8d763f
6619b3a
d898e4a
2d80a77
cef482a
8c9d154
4c97282
3ab1bae
b59fb8c
de5a3c1
3943c4b
d8e5563
85f0994
5b69298
12c6f37
60dc6f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,10 @@ | |
from dataclasses import asdict, dataclass | ||
from functools import wraps | ||
|
||
import pyarrow as pa | ||
|
||
from fondant.component import BaseComponent, Component | ||
from fondant.core.schema import Field, Type | ||
|
||
|
||
@dataclass | ||
|
@@ -28,11 +31,24 @@ class PythonComponent(BaseComponent): | |
def image(cls) -> Image: | ||
raise NotImplementedError | ||
|
||
@classmethod | ||
def consumes(cls) -> t.Optional[t.Union[list, str]]: | ||
pass | ||
|
||
@classmethod | ||
def get_consumes_spec( | ||
cls, | ||
dataset_fields: t.Mapping[str, Field], | ||
apply_consumes: t.Optional[t.Dict[str, t.Union[str, pa.DataType]]], | ||
): | ||
pass | ||
|
||
|
||
def lightweight_component( | ||
*args, | ||
extra_requires: t.Optional[t.List[str]] = None, | ||
base_image: t.Optional[str] = None, | ||
consumes: t.Optional[t.Union[list, str]] = None, | ||
): | ||
"""Decorator to enable a python component.""" | ||
|
||
|
@@ -121,6 +137,55 @@ class PythonComponentOp(cls, PythonComponent): | |
def image(cls) -> Image: | ||
return image | ||
|
||
@classmethod | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can make this a class property by combining the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It should be reversed, but seems like it only works for Python 3.9 and 3.10 (docs). Just making it a class attribute could work as well: class BaseClass:
consumes: ConsumesType
class Class(BaseClass):
consumes=consumes_ # cannot be the same name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm I still don't quite follow what should be done here, aren't the class methods needed for the decorators? what's the need for attributes in this example? |
||
def consumes(cls) -> t.Optional[t.Union[list, str]]: | ||
return consumes | ||
|
||
@classmethod | ||
def get_consumes_spec( | ||
PhilippeMoussalli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cls, | ||
dataset_fields: t.Mapping[str, Field], | ||
apply_consumes: t.Optional[t.Dict[str, t.Union[str, pa.DataType]]], | ||
): | ||
consumes = cls.consumes() | ||
|
||
if consumes == "generic": | ||
return {"additionalProperties": True} | ||
|
||
# Get consumes spec from the dataset | ||
consumes_spec = {k: v.type.to_dict() for k, v in dataset_fields.items()} | ||
|
||
# Modify naming based on the consumes argument in the 'apply' method | ||
RobbeSneyders marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if apply_consumes: | ||
for k, v in apply_consumes.items(): | ||
if isinstance(v, str): | ||
consumes_spec[k] = consumes_spec.pop(v) | ||
elif isinstance(v, pa.DataType): | ||
consumes_spec[k] = Type(v).to_dict() | ||
else: | ||
msg = ( | ||
f"Invalid data type for field `{k}` in the `apply_consumes` " | ||
f"argument. Only string and pa.DataType are allowed." | ||
) | ||
raise ValueError( | ||
msg, | ||
) | ||
|
||
# Filter for values that are not in the user defined consumes list | ||
if consumes: | ||
mrchtr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for field_to_consume in consumes: | ||
if field_to_consume not in consumes_spec.keys(): | ||
msg = f"Field `{field_to_consume}` is not available in the dataset." | ||
raise ValueError( | ||
msg, | ||
) | ||
|
||
consumes_spec = { | ||
k: v for k, v in consumes_spec.items() if k in consumes | ||
} | ||
|
||
return consumes_spec | ||
|
||
return PythonComponentOp | ||
|
||
# Call wrapper with function (`args[0]`) when no additional arguments were passed | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -206,7 +206,12 @@ def from_component_yaml(cls, path, **kwargs) -> "ComponentOp": | |
) | ||
|
||
@classmethod | ||
def from_ref(cls, ref: t.Any, **kwargs) -> "ComponentOp": | ||
def from_ref( | ||
cls, | ||
ref: t.Any, | ||
fields: t.Optional[t.Mapping[str, Field]] = None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer to keep this fields argument out of here since this is specific to the lightweight Python components. Can we move this to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see a straightforwards way of doing this, unless we somehow pass the fields to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My original implementation did this in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, this might make less sense after the refactoring on main since my first commits. If we can address my comment above, it's fine for me to keep it like this for now. Would be good to add the argument to the docstring though. |
||
**kwargs, | ||
) -> "ComponentOp": | ||
"""Create a ComponentOp from a reference. The reference can | ||
be a reusable component name, a path to a custom component, | ||
or a python component class. | ||
|
@@ -217,11 +222,16 @@ def from_ref(cls, ref: t.Any, **kwargs) -> "ComponentOp": | |
image = ref.image() | ||
description = ref.__doc__ or "python component" | ||
|
||
if fields: | ||
consumes_spec = ref.get_consumes_spec(fields, kwargs["consumes"]) | ||
else: | ||
consumes_spec = {"additionalProperties": True} | ||
|
||
component_spec = ComponentSpec( | ||
name, | ||
image.base_image, | ||
description=description, | ||
consumes={"additionalProperties": True}, | ||
consumes=consumes_spec, | ||
produces={"additionalProperties": True}, | ||
args={ | ||
name: arg.to_spec() | ||
|
@@ -724,6 +734,7 @@ def apply( | |
""" | ||
operation = ComponentOp.from_ref( | ||
ref, | ||
fields=self.fields, | ||
produces=produces, | ||
consumes=consumes, | ||
arguments=arguments, | ||
|
@@ -771,6 +782,7 @@ def write( | |
""" | ||
operation = ComponentOp.from_ref( | ||
ref, | ||
fields=self.fields, | ||
consumes=consumes, | ||
arguments=arguments, | ||
input_partition_rows=input_partition_rows, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,25 +7,50 @@ | |
import pyarrow as pa | ||
import pytest | ||
from fondant.component import DaskLoadComponent, PandasTransformComponent | ||
from fondant.core.component_spec import OperationSpec | ||
from fondant.core.exceptions import InvalidPythonComponent | ||
from fondant.pipeline import Pipeline, lightweight_component | ||
from fondant.pipeline.compiler import DockerCompiler | ||
from fondant.testing import DockerComposeConfigs | ||
|
||
|
||
def test_build_python_script(): | ||
@lightweight_component() | ||
@pytest.fixture() | ||
def load_pipeline(): | ||
pipeline = Pipeline( | ||
name="dummy-pipeline", | ||
base_path="./data", | ||
) | ||
|
||
@lightweight_component( | ||
base_image="python:3.8-slim-buster", | ||
extra_requires=["pandas", "dask"], | ||
consumes="generic", | ||
) | ||
class CreateData(DaskLoadComponent): | ||
def load(self) -> dd.DataFrame: | ||
df = pd.DataFrame( | ||
{ | ||
"x": [1, 2, 3], | ||
"y": [4, 5, 6], | ||
"z": [7, 8, 9], | ||
}, | ||
index=pd.Index(["a", "b", "c"], name="id"), | ||
) | ||
return dd.from_pandas(df, npartitions=1) | ||
|
||
assert CreateData.image().script == textwrap.dedent( | ||
load_script = CreateData.image().script | ||
|
||
dataset = pipeline.read( | ||
ref=CreateData, | ||
produces={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()}, | ||
) | ||
|
||
return pipeline, dataset, load_script | ||
|
||
|
||
def test_build_python_script(load_pipeline): | ||
_, _, load_script = load_pipeline | ||
assert load_script == textwrap.dedent( | ||
"""\ | ||
from typing import * | ||
import typing as t | ||
|
@@ -43,6 +68,7 @@ def load(self) -> dd.DataFrame: | |
{ | ||
"x": [1, 2, 3], | ||
"y": [4, 5, 6], | ||
"z": [7, 8, 9], | ||
}, | ||
index=pd.Index(["a", "b", "c"], name="id"), | ||
) | ||
|
@@ -51,31 +77,8 @@ def load(self) -> dd.DataFrame: | |
) | ||
|
||
|
||
def test_lightweight_component_sdk(): | ||
pipeline = Pipeline( | ||
name="dummy-pipeline", | ||
base_path="./data", | ||
) | ||
|
||
@lightweight_component( | ||
base_image="python:3.8-slim-buster", | ||
extra_requires=["pandas", "dask"], | ||
) | ||
class CreateData(DaskLoadComponent): | ||
def load(self) -> dd.DataFrame: | ||
df = pd.DataFrame( | ||
{ | ||
"x": [1, 2, 3], | ||
"y": [4, 5, 6], | ||
}, | ||
index=pd.Index(["a", "b", "c"], name="id"), | ||
) | ||
return dd.from_pandas(df, npartitions=1) | ||
|
||
dataset = pipeline.read( | ||
ref=CreateData, | ||
produces={"x": pa.int32(), "y": pa.int32()}, | ||
) | ||
def test_lightweight_component_sdk(load_pipeline): | ||
pipeline, dataset, load_script = load_pipeline | ||
|
||
assert len(pipeline._graph.keys()) == 1 | ||
operation_spec_dict = pipeline._graph["CreateData"][ | ||
|
@@ -90,10 +93,14 @@ def load(self) -> dd.DataFrame: | |
"produces": {"additionalProperties": True}, | ||
}, | ||
"consumes": {}, | ||
"produces": {"x": {"type": "int32"}, "y": {"type": "int32"}}, | ||
"produces": { | ||
"x": {"type": "int32"}, | ||
"y": {"type": "int32"}, | ||
"z": {"type": "int32"}, | ||
}, | ||
} | ||
|
||
@lightweight_component() | ||
@lightweight_component(consumes="generic") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer to have a empty consumes instead of passing "generic" here. For me it would be fine to consume the whole dataset. It would make the usability less complex and reduce the efficiency of the component execution. We should keep the base interface as simple as possible. Pipeline improvements will probably following later during the development cycle. I think we don't want to use the term "generic component". What is the issue when we pass There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the issue is there are still 3 general options that should be supported without the possibility to mix some of them together.
The only solution would be to somehow mix the 1st and 3rd option but this would require us to change the component spec to support both dynamic and specified fields which is not something we currently support There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mrchtr updated based on the feedback, consumes is This is more evident here @lightweight_component(
base_image="python:3.8-slim-buster",
extra_requires=["pandas", "dask"],
)
class CreateData(DaskLoadComponent):
def load(self) -> dd.DataFrame:
....
dataset = pipeline.read(
ref=CreateData,
produces={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()},
)
# dataset schema has x,y,z
@lightweight_component
class AddN(PandasTransformComponent):
def __init__(self, n: int, **kwargs):
self.n = n
def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
dataframe["x"] = dataframe["x"].map(lambda x: x + self.n)
return dataframe
_ = dataset.apply(
ref=AddN,
produces={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()},
consumes=None, # This now has to be defined as None since we can't define dynamic fields but we can already infer the schema based on the dataset
arguments={"n": 1},
) I think both options are valuable with small tradeoffs.
Happy to hear other takes on this @RobbeSneyders @GeorgesLorre There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree with not supporting Lightweight Python components can still be implemented in a generic way without it. It just means that the implementation of the component depends on the |
||
class AddN(PandasTransformComponent): | ||
def __init__(self, n: int, **kwargs): | ||
self.n = n | ||
|
@@ -104,8 +111,8 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: | |
|
||
_ = dataset.apply( | ||
ref=AddN, | ||
produces={"x": pa.int32(), "y": pa.int32()}, | ||
consumes={"x": pa.int32(), "y": pa.int32()}, | ||
produces={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()}, | ||
consumes={"x": pa.int32(), "y": pa.int32(), "z": pa.int32()}, | ||
arguments={"n": 1}, | ||
) | ||
assert len(pipeline._graph.keys()) == 1 + 1 | ||
|
@@ -120,14 +127,88 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: | |
"produces": {"additionalProperties": True}, | ||
"args": {"n": {"type": "int"}}, | ||
}, | ||
"consumes": {"x": {"type": "int32"}, "y": {"type": "int32"}}, | ||
"produces": {"x": {"type": "int32"}, "y": {"type": "int32"}}, | ||
"consumes": { | ||
"x": {"type": "int32"}, | ||
"y": {"type": "int32"}, | ||
"z": {"type": "int32"}, | ||
}, | ||
"produces": { | ||
"x": {"type": "int32"}, | ||
"y": {"type": "int32"}, | ||
"z": {"type": "int32"}, | ||
}, | ||
} | ||
pipeline._validate_pipeline_definition(run_id="dummy-run-id") | ||
|
||
DockerCompiler().compile(pipeline) | ||
|
||
|
||
def test_valid_consumes_mapping(tmp_path_factory, load_pipeline): | ||
@lightweight_component( | ||
base_image="python:3.8", | ||
extra_requires=[ | ||
"fondant[component]@git+https://github.com/ml6team/fondant@main", | ||
], | ||
consumes=["a", "y"], | ||
) | ||
class AddN(PandasTransformComponent): | ||
def __init__(self, n: int, **kwargs): | ||
self.n = n | ||
|
||
def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: | ||
dataframe["a"] = dataframe["a"].map(lambda x: x + self.n) | ||
return dataframe | ||
|
||
pipeline, dataset, _ = load_pipeline | ||
|
||
_ = dataset.apply( | ||
ref=AddN, | ||
consumes={"a": "x"}, | ||
produces={"a": pa.int32()}, | ||
arguments={"n": 1}, | ||
) | ||
|
||
with tmp_path_factory.mktemp("temp") as fn: | ||
output_path = str(fn / "kubeflow_pipeline.yml") | ||
DockerCompiler().compile(pipeline=pipeline, output_path=output_path) | ||
pipeline_configs = DockerComposeConfigs.from_spec(output_path) | ||
operation_spec = OperationSpec.from_json( | ||
pipeline_configs.component_configs["AddN"].arguments["operation_spec"], | ||
) | ||
assert all(k in ["a", "y"] for k in operation_spec.inner_consumes) | ||
assert "z" not in operation_spec.inner_consumes | ||
|
||
|
||
def test_invalid_consumes_mapping(tmp_path_factory, load_pipeline): | ||
@lightweight_component( | ||
base_image="python:3.8", | ||
extra_requires=[ | ||
"fondant[component]@git+https://github.com/ml6team/fondant@main", | ||
], | ||
consumes=["nonExistingField"], | ||
) | ||
class AddN(PandasTransformComponent): | ||
def __init__(self, n: int, **kwargs): | ||
self.n = n | ||
|
||
def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: | ||
dataframe["a"] = dataframe["a"].map(lambda x: x + self.n) | ||
return dataframe | ||
|
||
_, dataset, _ = load_pipeline | ||
|
||
with pytest.raises( | ||
ValueError, | ||
match="Field `nonExistingField` is not available in the dataset.", | ||
): | ||
_ = dataset.apply( | ||
ref=AddN, | ||
consumes={"a": "x"}, | ||
produces={"a": pa.int32()}, | ||
arguments={"n": 1}, | ||
) | ||
|
||
|
||
def test_lightweight_component_missing_decorator(): | ||
pipeline = Pipeline( | ||
name="dummy-pipeline", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think a docstring would be good.