Fix Readme script generation (#821)

This fixes the failing builds, but is it what we want?
ml6team · Jan 30, 2024 · 8b9713a · 8b9713a
1 parent d87efb9
commit 8b9713a
Show file tree

Hide file tree

Showing 13 changed files with 47 additions and 36 deletions.
diff --git a/components/caption_images/README.md b/components/caption_images/README.md
@@ -49,7 +49,7 @@ pipeline = Pipeline(...)
 dataset = pipeline.read(...)
 
 dataset = dataset.apply(
-    "caption_images",
+    "",
     arguments={
         # Add arguments
         # "model_id": "Salesforce/blip-image-captioning-base",

diff --git a/scripts/component_readme/generate_readme.py b/scripts/component_readme/generate_readme.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import jinja2
+
 from fondant.core.component_spec import ComponentSpec
 
 
@@ -11,24 +12,25 @@ def read_component_spec(component_spec_path: Path) -> ComponentSpec:
 
 def generate_readme(component_spec: ComponentSpec, *, component_dir: Path) -> str:
     env = jinja2.Environment(
-        loader=jinja2.loaders.FileSystemLoader(Path(__file__).parent),
-        trim_blocks=True
+        loader=jinja2.loaders.FileSystemLoader(Path(__file__).parent), trim_blocks=True
     )
     env.filters["eval"] = eval
 
     template = env.get_template("readme_template.md")
 
     return template.render(
-        id=component_dir.name,
+        component_id=component_spec.safe_name,
         name=component_spec.name,
-        component_folder_name=component_spec.component_folder_name,
         description=component_spec.description,
         consumes=component_spec.consumes,
         produces=component_spec.produces,
         is_consumes_generic=component_spec.is_generic("consumes"),
         is_produces_generic=component_spec.is_generic("produces"),
-        arguments=[arg for arg in component_spec.args.values()
-                   if arg.name not in component_spec.default_arguments],
+        arguments=[
+            arg
+            for arg in component_spec.args.values()
+            if arg.name not in component_spec.default_arguments
+        ],
         tests=(component_dir / "tests").exists(),
         tags=component_spec.tags,
     )
@@ -48,10 +50,12 @@ def main(component_spec_path: Path):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("component_specs",
-                        nargs="+",
-                        type=Path,
-                        help="Path to the component spec to generate a readme from")
+    parser.add_argument(
+        "component_specs",
+        nargs="+",
+        type=Path,
+        help="Path to the component spec to generate a readme from",
+    )
     args = parser.parse_args()
 
     for spec in args.component_specs:

diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md
@@ -1,13 +1,13 @@
 # {{ name }}
 
-<a id="{{ component_folder_name }}#description"></a>
+<a id="{{ component_id }}#description"></a>
 ## Description
 {{ description }}
 
-<a id="{{ component_folder_name }}#inputs_outputs"></a>
+<a id="{{ component_id }}#inputs_outputs"></a>
 ## Inputs / outputs 
 
-<a id="{{ component_folder_name }}#consumes"></a>
+<a id="{{ component_id }}#consumes"></a>
 ### Consumes 
 {% if consumes %}
 **This component consumes:**
@@ -33,7 +33,7 @@ See the usage example below on how to define a field name for additional fields.
 {% endif %}
 
 
-<a id="{{ component_folder_name }}#produces"></a>  
+<a id="{{ component_id }}#produces"></a>  
 ### Produces 
 {% if produces %}
 **This component produces:**
@@ -55,7 +55,7 @@ the type of the field that should be used to write the output dataset.
 **This component does not produce data.**
 {% endif %}
 
-<a id="{{ component_folder_name }}#arguments"></a>
+<a id="{{ component_id }}#arguments"></a>
 ## Arguments
 
 {% if arguments %}
@@ -70,7 +70,7 @@ The component takes the following arguments to alter its behavior:
 This component takes no arguments.
 {% endif %}
 
-<a id="{{ component_folder_name }}#usage"></a>
+<a id="{{ component_id }}#usage"></a>
 ## Usage 
 
 You can add this component to your pipeline using the following code:
@@ -94,7 +94,7 @@ dataset = dataset.apply(...)
 dataset.write(
 {% endif %}
 {% endif %}
-    "{{ id }}",
+    "{{ component_id }}",
     arguments={
         # Add arguments
 {% for argument in arguments %}
@@ -121,7 +121,7 @@ dataset.write(
 ```
 
 {% if tests %}
-<a id="{{ component_folder_name }}#testing"></a>
+<a id="{{ component_id }}#testing"></a>
 ## Testing
 
 You can run the tests using docker with BuildKit. From this directory, run:

diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py
@@ -199,7 +199,7 @@ def _write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar:
         """Create dataframe writing task."""
         location = (
             f"{self.manifest.base_path}/{self.manifest.pipeline_name}/"
-            f"{self.manifest.run_id}/{self.operation_spec.component_folder_name}"
+            f"{self.manifest.run_id}/{self.operation_spec.component_name}"
         )
 
         schema = {

diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py
@@ -100,7 +100,7 @@ def __init__(
         tags: t.Optional[t.List[str]] = None,
     ):
         spec_dict: t.Dict[str, t.Any] = {
-            "name": self.sanitized_component_name(name),
+            "name": name,
             "image": image,
         }
 
@@ -179,6 +179,10 @@ def from_dict(cls, component_spec_dict: t.Dict[str, t.Any]) -> "ComponentSpec":
     def name(self):
         return self._specification["name"]
 
+    @property
+    def safe_name(self):
+        return self.sanitized_component_name(self._specification["name"])
+
     def sanitized_component_name(self, name) -> str:
         """Cleans and converts a component name."""
         return name.lower().replace(" ", "_")
@@ -516,9 +520,9 @@ def outer_produces(self) -> t.Mapping[str, Field]:
         return self._outer_produces
 
     @property
-    def component_folder_name(self) -> str:
-        """Get the component folder name."""
-        return self._component_spec.name
+    def component_name(self) -> str:
+        """Get the component name."""
+        return self._component_spec.safe_name
 
     @property
     def previous_index(self) -> t.Optional[str]:

diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py
@@ -256,7 +256,7 @@ def evolve(  # : PLR0912 (too many branches)
         evolved_manifest = self.copy()
 
         # Update `run_id` and `component_id` in the metadata
-        component_id = operation_spec.component_folder_name
+        component_id = operation_spec.component_name
         evolved_manifest.update_metadata(key="component_id", value=component_id)
         evolved_manifest.update_metadata(key="run_id", value=run_id)
 

diff --git a/src/fondant/pipeline/compiler.py b/src/fondant/pipeline/compiler.py
@@ -358,7 +358,7 @@ def from_fondant_component_spec(
             re.sub(
                 "-+",
                 "-",
-                re.sub("[^-0-9a-z]+", "-", fondant_component.name.lower()),
+                re.sub("[^-0-9a-z]+", "-", fondant_component.safe_name.lower()),
             )
             .lstrip("-")
             .rstrip("-")

diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py
@@ -305,7 +305,7 @@ def _get_registry_path(name: str) -> Path:
 
     @property
     def component_name(self) -> str:
-        return self.component_spec.name
+        return self.component_spec.safe_name
 
     def get_component_cache_key(
         self,

diff --git a/tests/component/test_data_io.py b/tests/component/test_data_io.py
@@ -137,7 +137,10 @@ def test_write_dataset(
         data_writer.write_dataframe(dataframe, dask_client)
         # read written data and assert
         dataframe = dd.read_parquet(
-            temp_dir / manifest.pipeline_name / manifest.run_id / component_spec.name,
+            temp_dir
+            / manifest.pipeline_name
+            / manifest.run_id
+            / component_spec.safe_name,
         )
         assert len(dataframe) == NUMBER_OF_TEST_ROWS
         assert list(dataframe.columns) == columns
@@ -178,7 +181,7 @@ def test_write_dataset_custom_produces(
             temp_dir
             / manifest.pipeline_name
             / manifest.run_id
-            / component_spec_produces.name,
+            / component_spec_produces.safe_name,
         )
         assert len(dataframe) == NUMBER_OF_TEST_ROWS
         assert list(dataframe.columns) == expected_columns

diff --git a/tests/core/test_component_specs.py b/tests/core/test_component_specs.py
@@ -91,7 +91,7 @@ def test_component_spec_no_args(valid_fondant_schema_no_args):
     """Test that a component spec without args is supported."""
     fondant_component = ComponentSpec.from_dict(valid_fondant_schema_no_args)
 
-    assert fondant_component.name == "example_component"
+    assert fondant_component.name == "Example component"
     assert fondant_component.description == "This is an example component"
     assert fondant_component.args == fondant_component.default_arguments
 

diff --git a/tests/core/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py
@@ -135,5 +135,5 @@ def test_component_spec_location_update():
     )
 
     assert evolved_manifest.index.location.endswith(
-        component_spec.name,
+        component_spec.safe_name,
     )
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
@@ -93,7 +93,7 @@ def load(self) -> dd.DataFrame:
 
     component = ComponentOp.from_ref(Foo, produces={"bar": pa.string()})
     assert component.component_spec._specification == {
-        "name": "foo",
+        "name": "Foo",
         "image": fondant_image_name,
         "description": "lightweight component",
         "consumes": {"additionalProperties": True},

diff --git a/tests/pipeline/test_python_component.py b/tests/pipeline/test_python_component.py
@@ -94,7 +94,7 @@ def load(self) -> dd.DataFrame:
     ].operation_spec.to_dict()
     assert operation_spec_dict == {
         "specification": {
-            "name": "createdata",
+            "name": "CreateData",
             "image": "python:3.8-slim-buster",
             "description": "lightweight component",
             "consumes": {"additionalProperties": True},
@@ -138,7 +138,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
     operation_spec_dict = pipeline._graph["addn"]["operation"].operation_spec.to_dict()
     assert operation_spec_dict == {
         "specification": {
-            "name": "addn",
+            "name": "AddN",
             "image": default_fondant_image,
             "description": "lightweight component",
             "consumes": {"additionalProperties": True},
@@ -200,7 +200,7 @@ def load(self) -> dd.DataFrame:
 
     assert operation_spec_without_image == {
         "specification": {
-            "name": "createdata",
+            "name": "CreateData",
             "image": "python:3.8-slim-buster",
             "description": "lightweight component",
             "consumes": {"additionalProperties": True},
@@ -289,7 +289,7 @@ def load(self) -> dd.DataFrame:
 
     assert operation_spec_without_image == {
         "specification": {
-            "name": "createdata",
+            "name": "CreateData",
             "image": default_fondant_image,
             "description": "lightweight component",
             "consumes": {"additionalProperties": True},