Update component readmes #538

Merged
merged 2 commits on Oct 19, 2023
8 changes: 4 additions & 4 deletions components/download_images/README.md
@@ -35,9 +35,9 @@ The component takes the following arguments to alter its behavior:
| n_connections | int | Number of concurrent connections opened per process. Decrease this number if you are running into timeout errors. A lower number of connections can increase the success rate but lower the throughput. | 100 |
| image_size | int | Size of the images after resizing. | 256 |
| resize_mode | str | Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". | border |
-| resize_only_if_bigger | bool | If True, resize only if image is bigger than image_size. | False |
+| resize_only_if_bigger | bool | If True, resize only if image is bigger than image_size. | / |
| min_image_size | int | Minimum size of the images. | / |
-| max_aspect_ratio | float | Maximum aspect ratio of the images. | inf |
+| max_aspect_ratio | float | Maximum aspect ratio of the images. | 99.9 |
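Taken together, `resize_only_if_bigger`, `min_image_size`, and `max_aspect_ratio` define a per-image keep/resize decision. A minimal sketch of that logic (hypothetical helpers for illustration only; the component itself delegates to an img2dataset-style downloader and resizer):

```python
def should_keep(width: int, height: int,
                min_image_size: int = 0,
                max_aspect_ratio: float = float("inf")) -> bool:
    """Drop images that are too small or too elongated."""
    aspect = max(width, height) / min(width, height)
    return min(width, height) >= min_image_size and aspect <= max_aspect_ratio


def target_size(width: int, height: int,
                image_size: int = 256,
                resize_only_if_bigger: bool = False) -> int:
    """Edge length to resize to; keep the original size when the image
    is already smaller and resize_only_if_bigger is set."""
    if resize_only_if_bigger and max(width, height) <= image_size:
        return max(width, height)
    return image_size
```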

### Usage

@@ -56,9 +56,9 @@ download_images_op = ComponentOp.from_registry(
# "n_connections": 100,
# "image_size": 256,
# "resize_mode": "border",
-# "resize_only_if_bigger": "False",
+# "resize_only_if_bigger": False,
# "min_image_size": 0,
-# "max_aspect_ratio": "inf",
+# "max_aspect_ratio": 99.9,
}
)
pipeline.add_op(download_images_op, dependencies=[...]) #Add previous component as dependency
16 changes: 0 additions & 16 deletions components/embed_text/README.md
@@ -22,14 +22,6 @@ The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
-| input_manifest_path | str | Path to the input manifest | / |
-| component_spec | dict | The component specification as a dictionary | / |
-| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / |
-| cache | bool | Set to False to disable caching, True by default. | True |
-| cluster_type | str | The cluster type to use for the execution | default |
-| client_kwargs | dict | Keyword arguments to pass to the Dask client | / |
-| metadata | str | Metadata arguments containing the run id and base path | / |
-| output_manifest_path | str | Path to the output manifest | / |
| model_provider | str | The provider of the model - corresponding to langchain embedding classes. Currently the following providers are supported: aleph_alpha, cohere, huggingface, openai, vertexai. | huggingface |
| model | str | The model to generate embeddings from. Choose an available model name to pass to the model provider's langchain embedding class. | / |
| api_keys | dict | The API keys to use for the model provider that are written to environment variables. Pass only the keys required by the model provider, or conveniently pass all keys you will ever need. Pay attention to how you name the dictionary keys so that they can be used by the model provider. | / |
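The `api_keys` argument is described as being written to environment variables, keyed by name. A minimal sketch of that behavior (assumed shape, not the component's verbatim code):

```python
import os


def export_api_keys(api_keys: dict) -> None:
    """Write each API key to an environment variable of the same name,
    where the provider's langchain embedding class can find it."""
    for name, value in api_keys.items():
        os.environ[name] = value


# The dictionary key must be the variable name the provider expects,
# e.g. "OPENAI_API_KEY" for openai; the value here is a placeholder.
export_api_keys({"OPENAI_API_KEY": "sk-placeholder"})
```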
@@ -47,14 +39,6 @@ embed_text_op = ComponentOp.from_registry(
name="embed_text",
arguments={
# Add arguments
-# "input_manifest_path": ,
-# "component_spec": {},
-# "input_partition_rows": 0,
-# "cache": True,
-# "cluster_type": "default",
-# "client_kwargs": {},
-# "metadata": ,
-# "output_manifest_path": ,
# "model_provider": "huggingface",
# "model": ,
# "api_keys": {},
16 changes: 0 additions & 16 deletions components/index_weaviate/README.md
@@ -19,14 +19,6 @@ The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
-| input_manifest_path | str | Path to the input manifest | / |
-| component_spec | dict | The component specification as a dictionary | / |
-| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / |
-| cache | bool | Set to False to disable caching, True by default. | True |
-| cluster_type | str | The cluster type to use for the execution | default |
-| client_kwargs | dict | Keyword arguments to pass to the Dask client | / |
-| metadata | str | Metadata arguments containing the run id and base path | / |
-| output_manifest_path | str | Path to the output manifest | / |
| weaviate_url | str | The URL of the weaviate instance. | http://localhost:8080 |
| batch_size | int | The batch size to be used. Parameter of weaviate.batch.Batch().configure(). | 100 |
| dynamic | bool | Whether to use dynamic batching or not. Parameter of weaviate.batch.Batch().configure(). | True |
@@ -47,14 +39,6 @@ index_weaviate_op = ComponentOp.from_registry(
name="index_weaviate",
arguments={
# Add arguments
-# "input_manifest_path": ,
-# "component_spec": {},
-# "input_partition_rows": 0,
-# "cache": True,
-# "cluster_type": "default",
-# "client_kwargs": {},
-# "metadata": ,
-# "output_manifest_path": ,
# "weaviate_url": "http://localhost:8080",
# "batch_size": 100,
# "dynamic": True,
12 changes: 6 additions & 6 deletions components/load_from_hf_hub/README.md
@@ -20,9 +20,9 @@ The component takes the following arguments to alter its behavior:
| -------- | ---- | ----------- | ------- |
| dataset_name | str | Name of dataset on the hub | / |
| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | / |
-| image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | None |
-| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None |
-| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None |
+| image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | / |
+| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / |
+| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / |
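`column_name_mapping` maps hub column names to fondant column names; conceptually it is a plain key rename. A sketch of the idea (illustrative only; the component applies the mapping to a Dask dataframe, and the column names below are made up):

```python
def apply_column_mapping(row: dict, column_name_mapping: dict) -> dict:
    """Rename the keys of one record; unmapped keys pass through."""
    return {column_name_mapping.get(key, key): value
            for key, value in row.items()}


renamed = apply_column_mapping(
    {"image": b"...", "text": "a caption"},
    {"image": "images_data", "text": "text_data"},
)
# renamed == {"images_data": b"...", "text_data": "a caption"}
```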

### Usage

@@ -38,9 +38,9 @@ load_from_hf_hub_op = ComponentOp.from_registry(
# Add arguments
# "dataset_name": ,
# "column_name_mapping": {},
-# "image_column_names": "None",
-# "n_rows_to_load": "None",
-# "index_column": "None",
+# "image_column_names": [],
+# "n_rows_to_load": 0,
+# "index_column": ,
}
)
pipeline.add_op(load_from_hf_hub_op, dependencies=[...]) #Add previous component as dependency
12 changes: 6 additions & 6 deletions components/load_from_parquet/README.md
@@ -19,9 +19,9 @@ The component takes the following arguments to alter its behavior:
| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | / |
-| column_name_mapping | dict | Mapping of the consumed dataset | None |
-| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None |
-| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None |
+| column_name_mapping | dict | Mapping of the consumed dataset | / |
+| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / |
+| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / |

### Usage

@@ -36,9 +36,9 @@ load_from_parquet_op = ComponentOp.from_registry(
arguments={
# Add arguments
# "dataset_uri": ,
-# "column_name_mapping": "None",
-# "n_rows_to_load": "None",
-# "index_column": "None",
+# "column_name_mapping": {},
+# "n_rows_to_load": 0,
+# "index_column": ,
}
)
pipeline.add_op(load_from_parquet_op, dependencies=[...]) #Add previous component as dependency
4 changes: 2 additions & 2 deletions components/segment_images/README.md
@@ -22,7 +22,7 @@ The component takes the following arguments to alter its behavior:
| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
| model_id | str | id of the model on the Hugging Face hub | openmmlab/upernet-convnext-small |
-| batch_size | int | batch size to use | / |
+| batch_size | int | batch size to use | 8 |
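`batch_size` controls how many images are sent through the segmentation model per forward pass. The batching itself is just chunking; a sketch of the idea, assuming a flat list of inputs (not the component's actual code):

```python
from typing import Iterator, List


def batched(items: List, batch_size: int = 8) -> Iterator[List]:
    """Yield successive chunks of at most batch_size items."""
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]


batch_sizes = [len(b) for b in batched(list(range(20)), batch_size=8)]
# batch_sizes == [8, 8, 4]
```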

### Usage

@@ -37,7 +37,7 @@ segment_images_op = ComponentOp.from_registry(
arguments={
# Add arguments
# "model_id": "openmmlab/upernet-convnext-small",
-# "batch_size": 0,
+# "batch_size": 8,
}
)
pipeline.add_op(segment_images_op, dependencies=[...]) #Add previous component as dependency
8 changes: 4 additions & 4 deletions components/write_to_hf_hub/README.md
@@ -21,8 +21,8 @@ The component takes the following arguments to alter its behavior:
| hf_token | str | The hugging face token used to write to the hub | / |
| username | str | The username under which to upload the dataset | / |
| dataset_name | str | The name of the dataset to upload | / |
-| image_column_names | list | A list containing the image column names. Used to format to image to HF hub format | None |
-| column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | None |
+| image_column_names | list | A list containing the image column names. Used to format to image to HF hub format | / |
+| column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | / |
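`image_column_names` tells the component which byte-string columns to re-encode into the Hugging Face hub image format. A rough sketch of the per-row conversion (the `{"bytes": ..., "path": ...}` shape is the encoding used by the datasets library's Image feature; the helper and column names are illustrative, not the component's actual code):

```python
def encode_image_columns(row: dict, image_column_names: list) -> dict:
    """Wrap raw image bytes into the dict shape used by the
    datasets library's Image feature; other columns pass through."""
    return {
        key: ({"bytes": value, "path": None} if key in image_column_names
              else value)
        for key, value in row.items()
    }


encoded = encode_image_columns(
    {"images_data": b"\x89PNG", "text_data": "a caption"},
    image_column_names=["images_data"],
)
# encoded["images_data"] == {"bytes": b"\x89PNG", "path": None}
```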

### Usage

@@ -39,8 +39,8 @@ write_to_hf_hub_op = ComponentOp.from_registry(
# "hf_token": ,
# "username": ,
# "dataset_name": ,
-# "image_column_names": "None",
-# "column_name_mapping": "None",
+# "image_column_names": [],
+# "column_name_mapping": {},
}
)
pipeline.add_op(write_to_hf_hub_op, dependencies=[...]) #Add previous component as dependency
3 changes: 2 additions & 1 deletion scripts/component_readme/generate_readme.py
@@ -24,7 +24,8 @@ def generate_readme(component_spec: ComponentSpec, *, component_dir: Path) -> st
description=component_spec.description,
consumes=component_spec.consumes,
produces=component_spec.produces,
-arguments=component_spec.args.values(),
+arguments=[arg for arg in component_spec.args.values()
+           if arg.name not in component_spec.default_arguments],
tests=(component_dir / "tests").exists()
)
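This script change is the substantive fix behind the README diffs above: shared executor arguments (input_manifest_path, cache, metadata, and so on) are filtered out of each generated argument table because they appear in `component_spec.default_arguments`. The filtering pattern, sketched with stand-in data:

```python
# Stand-in structures; the real ones come from fondant's ComponentSpec.
class Arg:
    def __init__(self, name: str):
        self.name = name


args = {name: Arg(name) for name in
        ["input_manifest_path", "cache", "model_provider", "model"]}
default_arguments = {"input_manifest_path", "cache"}

# Same comprehension as in the diff: keep only component-specific args.
arguments = [arg for arg in args.values()
             if arg.name not in default_arguments]
# [arg.name for arg in arguments] == ["model_provider", "model"]
```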
