From 8b9713a945c9be0804d92dfe10732d8c7100c5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georges=20Lorr=C3=A9?= <35808396+GeorgesLorre@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:37:52 +0100 Subject: [PATCH] Fix Readme script generation (#821) This fixes the failing builds but is it what we want ? --- components/caption_images/README.md | 2 +- scripts/component_readme/generate_readme.py | 24 ++++++++++++--------- scripts/component_readme/readme_template.md | 16 +++++++------- src/fondant/component/data_io.py | 2 +- src/fondant/core/component_spec.py | 12 +++++++---- src/fondant/core/manifest.py | 2 +- src/fondant/pipeline/compiler.py | 2 +- src/fondant/pipeline/pipeline.py | 2 +- tests/component/test_data_io.py | 7 ++++-- tests/core/test_component_specs.py | 2 +- tests/core/test_manifest_evolution.py | 2 +- tests/pipeline/test_pipeline.py | 2 +- tests/pipeline/test_python_component.py | 8 +++---- 13 files changed, 47 insertions(+), 36 deletions(-) diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 2158b173b..1ba005318 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -49,7 +49,7 @@ pipeline = Pipeline(...) dataset = pipeline.read(...) dataset = dataset.apply( - "caption_images", + "", arguments={ # Add arguments # "model_id": "Salesforce/blip-image-captioning-base", diff --git a/scripts/component_readme/generate_readme.py b/scripts/component_readme/generate_readme.py index 587eb821d..b26953ee2 100644 --- a/scripts/component_readme/generate_readme.py +++ b/scripts/component_readme/generate_readme.py @@ -2,6 +2,7 @@ from pathlib import Path import jinja2 + from fondant.core.component_spec import ComponentSpec @@ -11,24 +12,25 @@ def read_component_spec(component_spec_path: Path) -> ComponentSpec: def generate_readme(component_spec: ComponentSpec, *, component_dir: Path) -> str: env = jinja2.Environment( - loader=jinja2.loaders.FileSystemLoader(Path(__file__).parent), - trim_blocks=True + loader=jinja2.loaders.FileSystemLoader(Path(__file__).parent), trim_blocks=True ) env.filters["eval"] = eval template = env.get_template("readme_template.md") return template.render( - id=component_dir.name, + component_id=component_spec.safe_name, name=component_spec.name, - component_folder_name=component_spec.component_folder_name, description=component_spec.description, consumes=component_spec.consumes, produces=component_spec.produces, is_consumes_generic=component_spec.is_generic("consumes"), is_produces_generic=component_spec.is_generic("produces"), - arguments=[arg for arg in component_spec.args.values() - if arg.name not in component_spec.default_arguments], + arguments=[ + arg + for arg in component_spec.args.values() + if arg.name not in component_spec.default_arguments + ], tests=(component_dir / "tests").exists(), tags=component_spec.tags, ) @@ -48,10 +50,12 @@ def main(component_spec_path: Path): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("component_specs", - nargs="+", - type=Path, - help="Path to the component spec to generate a readme from") + parser.add_argument( + "component_specs", + nargs="+", + type=Path, + help="Path to the component spec to generate a readme from", + ) args = parser.parse_args() for spec in args.component_specs: diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md index 5907bc8a6..42244bab0 100644 --- a/scripts/component_readme/readme_template.md +++ b/scripts/component_readme/readme_template.md @@ -1,13 +1,13 @@ # {{ name }} - + ## Description {{ description }} - + ## Inputs / outputs - + ### Consumes {% if consumes %} **This component consumes:** @@ -33,7 +33,7 @@ See the usage example below on how to define a field name for additional fields. {% endif %} - + ### Produces {% if produces %} **This component produces:** @@ -55,7 +55,7 @@ the type of the field that should be used to write the output dataset. **This component does not produce data.** {% endif %} - + ## Arguments {% if arguments %} @@ -70,7 +70,7 @@ The component takes the following arguments to alter its behavior: This component takes no arguments. {% endif %} - + ## Usage You can add this component to your pipeline using the following code: @@ -94,7 +94,7 @@ dataset = dataset.apply(...) dataset.write( {% endif %} {% endif %} - "{{ id }}", + "{{ component_id }}", arguments={ # Add arguments {% for argument in arguments %} @@ -121,7 +121,7 @@ dataset.write( ``` {% if tests %} - + ## Testing You can run the tests using docker with BuildKit. From this directory, run: diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 5e853c615..c3b7bfd06 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -199,7 +199,7 @@ def _write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar: """Create dataframe writing task.""" location = ( f"{self.manifest.base_path}/{self.manifest.pipeline_name}/" - f"{self.manifest.run_id}/{self.operation_spec.component_folder_name}" + f"{self.manifest.run_id}/{self.operation_spec.component_name}" ) schema = { diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index 886039e5a..27fffce7c 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -100,7 +100,7 @@ def __init__( tags: t.Optional[t.List[str]] = None, ): spec_dict: t.Dict[str, t.Any] = { - "name": self.sanitized_component_name(name), + "name": name, "image": image, } @@ -179,6 +179,10 @@ def from_dict(cls, component_spec_dict: t.Dict[str, t.Any]) -> "ComponentSpec": def name(self): return self._specification["name"] + @property + def safe_name(self): + return self.sanitized_component_name(self._specification["name"]) + def sanitized_component_name(self, name) -> str: """Cleans and converts a component name.""" return name.lower().replace(" ", "_") @@ -516,9 +520,9 @@ def outer_produces(self) -> t.Mapping[str, Field]: return self._outer_produces @property - def component_folder_name(self) -> str: - """Get the component folder name.""" - return self._component_spec.name + def component_name(self) -> str: + """Get the component name.""" + return self._component_spec.safe_name @property def previous_index(self) -> t.Optional[str]: diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 15ecf87d7..7640f97a4 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -256,7 +256,7 @@ def evolve( # : PLR0912 (too many branches) evolved_manifest = self.copy() # Update `run_id` and `component_id` in the metadata - component_id = operation_spec.component_folder_name + component_id = operation_spec.component_name evolved_manifest.update_metadata(key="component_id", value=component_id) evolved_manifest.update_metadata(key="run_id", value=run_id) diff --git a/src/fondant/pipeline/compiler.py b/src/fondant/pipeline/compiler.py index 5f06fa5d9..155c371c9 100644 --- a/src/fondant/pipeline/compiler.py +++ b/src/fondant/pipeline/compiler.py @@ -358,7 +358,7 @@ def from_fondant_component_spec( re.sub( "-+", "-", - re.sub("[^-0-9a-z]+", "-", fondant_component.name.lower()), + re.sub("[^-0-9a-z]+", "-", fondant_component.safe_name.lower()), ) .lstrip("-") .rstrip("-") diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index 38a35d024..8720cd83b 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -305,7 +305,7 @@ def _get_registry_path(name: str) -> Path: @property def component_name(self) -> str: - return self.component_spec.name + return self.component_spec.safe_name def get_component_cache_key( self, diff --git a/tests/component/test_data_io.py b/tests/component/test_data_io.py index 85f9d9a55..ec733d3cc 100644 --- a/tests/component/test_data_io.py +++ b/tests/component/test_data_io.py @@ -137,7 +137,10 @@ def test_write_dataset( data_writer.write_dataframe(dataframe, dask_client) # read written data and assert dataframe = dd.read_parquet( - temp_dir / manifest.pipeline_name / manifest.run_id / component_spec.name, + temp_dir + / manifest.pipeline_name + / manifest.run_id + / component_spec.safe_name, ) assert len(dataframe) == NUMBER_OF_TEST_ROWS assert list(dataframe.columns) == columns @@ -178,7 +181,7 @@ def test_write_dataset_custom_produces( temp_dir / manifest.pipeline_name / manifest.run_id - / component_spec_produces.name, + / component_spec_produces.safe_name, ) assert len(dataframe) == NUMBER_OF_TEST_ROWS assert list(dataframe.columns) == expected_columns diff --git a/tests/core/test_component_specs.py b/tests/core/test_component_specs.py index 179a03b9d..0f81ae265 100644 --- a/tests/core/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -91,7 +91,7 @@ def test_component_spec_no_args(valid_fondant_schema_no_args): """Test that a component spec without args is supported.""" fondant_component = ComponentSpec.from_dict(valid_fondant_schema_no_args) - assert fondant_component.name == "example_component" + assert fondant_component.name == "Example component" assert fondant_component.description == "This is an example component" assert fondant_component.args == fondant_component.default_arguments diff --git a/tests/core/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py index 33d5bdbca..b8e03ef0e 100644 --- a/tests/core/test_manifest_evolution.py +++ b/tests/core/test_manifest_evolution.py @@ -135,5 +135,5 @@ def test_component_spec_location_update(): ) assert evolved_manifest.index.location.endswith( - component_spec.name, + component_spec.safe_name, ) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 48abf51ee..3360c7a42 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -93,7 +93,7 @@ def load(self) -> dd.DataFrame: component = ComponentOp.from_ref(Foo, produces={"bar": pa.string()}) assert component.component_spec._specification == { - "name": "foo", + "name": "Foo", "image": fondant_image_name, "description": "lightweight component", "consumes": {"additionalProperties": True}, diff --git a/tests/pipeline/test_python_component.py b/tests/pipeline/test_python_component.py index 22f5f208f..da018a6ae 100644 --- a/tests/pipeline/test_python_component.py +++ b/tests/pipeline/test_python_component.py @@ -94,7 +94,7 @@ def load(self) -> dd.DataFrame: ].operation_spec.to_dict() assert operation_spec_dict == { "specification": { - "name": "createdata", + "name": "CreateData", "image": "python:3.8-slim-buster", "description": "lightweight component", "consumes": {"additionalProperties": True}, @@ -138,7 +138,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: operation_spec_dict = pipeline._graph["addn"]["operation"].operation_spec.to_dict() assert operation_spec_dict == { "specification": { - "name": "addn", + "name": "AddN", "image": default_fondant_image, "description": "lightweight component", "consumes": {"additionalProperties": True}, @@ -200,7 +200,7 @@ def load(self) -> dd.DataFrame: assert operation_spec_without_image == { "specification": { - "name": "createdata", + "name": "CreateData", "image": "python:3.8-slim-buster", "description": "lightweight component", "consumes": {"additionalProperties": True}, @@ -289,7 +289,7 @@ def load(self) -> dd.DataFrame: assert operation_spec_without_image == { "specification": { - "name": "createdata", + "name": "CreateData", "image": default_fondant_image, "description": "lightweight component", "consumes": {"additionalProperties": True},