From e8f65224796c0a649848e7bd98766b55bf8ff764 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Thu, 19 Oct 2023 01:05:55 +0200 Subject: [PATCH 1/2] Update component readmes --- components/caption_images/README.md | 16 +++++++++++ components/chunk_text/README.md | 16 +++++++++++ components/download_images/README.md | 24 +++++++++++++--- components/embed_images/README.md | 16 +++++++++++ .../embedding_based_laion_retrieval/README.md | 16 +++++++++++ components/filter_comments/README.md | 16 +++++++++++ components/filter_image_resolution/README.md | 16 +++++++++++ components/filter_line_length/README.md | 16 +++++++++++ components/image_cropping/README.md | 16 +++++++++++ .../image_resolution_extraction/README.md | 21 +++++++++++++- components/language_filter/README.md | 16 +++++++++++ components/load_from_files/README.md | 16 +++++++++++ components/load_from_hf_hub/README.md | 28 +++++++++++++++---- components/load_from_parquet/README.md | 28 +++++++++++++++---- components/minhash_generator/README.md | 16 +++++++++++ components/pii_redaction/README.md | 21 +++++++++++++- .../prompt_based_laion_retrieval/README.md | 16 +++++++++++ components/segment_images/README.md | 20 +++++++++++-- components/text_length_filter/README.md | 16 +++++++++++ components/text_normalization/README.md | 16 +++++++++++ components/write_to_hf_hub/README.md | 24 +++++++++++++--- 21 files changed, 366 insertions(+), 24 deletions(-) diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 8bb38e996..952688497 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -21,6 +21,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of 
rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | model_id | str | Id of the BLIP model on the Hugging Face hub | Salesforce/blip-image-captioning-base | | batch_size | int | Batch size to use for inference | 8 | | max_new_tokens | int | Maximum token length of each caption | 50 | @@ -37,6 +45,14 @@ caption_images_op = ComponentOp.from_registry( name="caption_images", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "model_id": "Salesforce/blip-image-captioning-base", # "batch_size": 8, # "max_new_tokens": 50, diff --git a/components/chunk_text/README.md b/components/chunk_text/README.md index 97b3309e0..7f4a9d221 100644 --- a/components/chunk_text/README.md +++ b/components/chunk_text/README.md @@ -26,6 +26,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | chunk_size | int | Maximum size of chunks to return | / | | chunk_overlap | int | Overlap in characters between chunks | / | @@ -41,6 +49,14 @@ chunk_text_op = ComponentOp.from_registry( name="chunk_text", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "chunk_size": 0, # "chunk_overlap": 0, } diff --git a/components/download_images/README.md b/components/download_images/README.md index 5a392aaeb..e03f0edb8 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -30,14 +30,22 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | timeout | int | Maximum time (in seconds) to wait when trying to download an image. | 10 | | retries | int | Number of times to retry downloading an image if it fails. 
| / | | n_connections | int | Number of concurrent connections opened per process. Decrease this number if you are running into timeout errors. A lower number of connections can increase the success rate but lower the throughput. | 100 | | image_size | int | Size of the images after resizing. | 256 | | resize_mode | str | Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". | border | -| resize_only_if_bigger | bool | If True, resize only if image is bigger than image_size. | False | +| resize_only_if_bigger | bool | If True, resize only if image is bigger than image_size. | / | | min_image_size | int | Minimum size of the images. | / | -| max_aspect_ratio | float | Maximum aspect ratio of the images. | inf | +| max_aspect_ratio | float | Maximum aspect ratio of the images. | 99.9 | ### Usage @@ -51,14 +59,22 @@ download_images_op = ComponentOp.from_registry( name="download_images", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "timeout": 10, # "retries": 0, # "n_connections": 100, # "image_size": 256, # "resize_mode": "border", - # "resize_only_if_bigger": "False", + # "resize_only_if_bigger": False, # "min_image_size": 0, - # "max_aspect_ratio": "inf", + # "max_aspect_ratio": 99.9, } ) pipeline.add_op(download_images_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/embed_images/README.md b/components/embed_images/README.md index eec02f577..1e3fe1820 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -21,6 +21,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component 
specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | model_id | str | Model id of a CLIP model on the Hugging Face hub | openai/clip-vit-large-patch14 | | batch_size | int | Batch size to use when embedding | 8 | @@ -36,6 +44,14 @@ embed_images_op = ComponentOp.from_registry( name="embed_images", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "model_id": "openai/clip-vit-large-patch14", # "batch_size": 8, } diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index 454253416..9e0caa3a6 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -23,6 +23,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | num_images | int | Number of images to retrieve for each prompt | / | | aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). | 9 | | aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | @@ -39,6 +47,14 @@ embedding_based_laion_retrieval_op = ComponentOp.from_registry( name="embedding_based_laion_retrieval", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "num_images": 0, # "aesthetic_score": 9, # "aesthetic_weight": 0.5, diff --git a/components/filter_comments/README.md b/components/filter_comments/README.md index e0d55e57d..6807c5cc7 100644 --- a/components/filter_comments/README.md +++ b/components/filter_comments/README.md @@ -18,6 +18,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | min_comments_ratio | float | The minimum code to comment ratio | 0.1 | | max_comments_ratio | float | The maximum code to comment ratio | 0.9 | @@ -33,6 +41,14 @@ filter_comments_op = ComponentOp.from_registry( name="filter_comments", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "min_comments_ratio": 0.1, # "max_comments_ratio": 0.9, } diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 1bc0c27f5..8f9f15463 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -19,6 +19,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | min_image_dim | int | Minimum image dimension | / | | max_aspect_ratio | float | Maximum aspect ratio | / | @@ -34,6 +42,14 @@ filter_image_resolution_op = ComponentOp.from_registry( name="filter_image_resolution", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "min_image_dim": 0, # "max_aspect_ratio": 0.0, } diff --git a/components/filter_line_length/README.md b/components/filter_line_length/README.md index 46f5699e4..6ad52fde6 100644 --- a/components/filter_line_length/README.md +++ b/components/filter_line_length/README.md @@ -20,6 +20,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | avg_line_length_threshold | int | Threshold for average line length to filter on | / | | max_line_length_threshold | int | Threshold for maximum line length to filter on | / | | alphanum_fraction_threshold | float | Alphanum fraction to filter on | / | @@ -36,6 +44,14 @@ filter_line_length_op = ComponentOp.from_registry( name="filter_line_length", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "avg_line_length_threshold": 0, # "max_line_length_threshold": 0, # "alphanum_fraction_threshold": 0.0, diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index 5d679c457..4c36156f6 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -38,6 +38,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | cropping_threshold | int | Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 | -30 | | padding | int | Padding for the image cropping. The padding is added to all borders of the image. | 10 | @@ -53,6 +61,14 @@ image_cropping_op = ComponentOp.from_registry( name="image_cropping", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "cropping_threshold": -30, # "padding": 10, } diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index a69a4df4e..1edb9d776 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -19,7 +19,18 @@ Component that extracts image resolution data from the images ### Arguments -This component takes no arguments. +The component takes the following arguments to alter its behavior: + +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | ### Usage @@ -33,6 +44,14 @@ image_resolution_extraction_op = ComponentOp.from_registry( name="image_resolution_extraction", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , } ) pipeline.add_op(image_resolution_extraction_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/language_filter/README.md b/components/language_filter/README.md index c3afd6435..e7473b8c4 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -18,6 +18,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | language | str | A valid language code or identifier (e.g., "en", "fr", "de"). 
| en | ### Usage @@ -32,6 +40,14 @@ language_filter_op = ComponentOp.from_registry( name="language_filter", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "language": "en", } ) diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index 834f568e5..e14110cbe 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -21,6 +21,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | directory_uri | str | Local or remote path to the directory containing the files | / | ### Usage @@ -35,6 +43,14 @@ load_from_files_op = ComponentOp.from_registry( name="load_from_files", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "directory_uri": , } ) diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index 6d5bdded6..741d62d06 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -18,11 +18,19 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | dataset_name | str | Name of dataset on the hub | / | | column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | / | -| image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | None | -| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None | -| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | +| image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | / | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale | / | +| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / | ### Usage @@ -36,11 +44,19 @@ load_from_hf_hub_op = ComponentOp.from_registry( name="load_from_hf_hub", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "dataset_name": , # "column_name_mapping": {}, - # "image_column_names": "None", - # "n_rows_to_load": "None", - # "index_column": "None", + # "image_column_names": [], + # "n_rows_to_load": 0, + # "index_column": , } ) pipeline.add_op(load_from_hf_hub_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index 9155c8163..23a457748 100644 --- a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -18,10 +18,18 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | dataset_uri | str | The remote path to the parquet file/folder containing the dataset | / | -| column_name_mapping | dict | Mapping of the consumed dataset | None | -| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None | -| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | +| column_name_mapping | dict | Mapping of the consumed dataset | / | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / | +| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / | ### Usage @@ -35,10 +43,18 @@ load_from_parquet_op = ComponentOp.from_registry( name="load_from_parquet", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "dataset_uri": , - # "column_name_mapping": "None", - # "n_rows_to_load": "None", - # "index_column": "None", + # "column_name_mapping": {}, + # "n_rows_to_load": 0, + # "index_column": , } ) pipeline.add_op(load_from_parquet_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 422fdc7af..d317dfd06 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -21,6 +21,14 @@ The component takes 
the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | shingle_ngram_size | int | Define size of ngram used for the shingle generation | 3 | ### Usage @@ -35,6 +43,14 @@ minhash_generator_op = ComponentOp.from_registry( name="minhash_generator", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "shingle_ngram_size": 3, } ) diff --git a/components/pii_redaction/README.md b/components/pii_redaction/README.md index e9143486b..8d4b2e17b 100644 --- a/components/pii_redaction/README.md +++ b/components/pii_redaction/README.md @@ -37,7 +37,18 @@ code. ### Arguments -This component takes no arguments. +The component takes the following arguments to alter its behavior: + +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. 
Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | ### Usage @@ -51,6 +62,14 @@ pii_redaction_op = ComponentOp.from_registry( name="pii_redaction", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , } ) pipeline.add_op(pii_redaction_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index 45fb1c034..1992ad03b 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -26,6 +26,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | num_images | int | Number of images to retrieve for each prompt | / | | aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). | 9 | | aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | @@ -43,6 +51,14 @@ prompt_based_laion_retrieval_op = ComponentOp.from_registry( name="prompt_based_laion_retrieval", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "num_images": 0, # "aesthetic_score": 9, # "aesthetic_weight": 0.5, diff --git a/components/segment_images/README.md b/components/segment_images/README.md index 2f569d42e..b06c5125d 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -21,8 +21,16 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | model_id | str | id of the model on the Hugging Face hub | openmmlab/upernet-convnext-small | -| batch_size | int | batch size to use | / | +| batch_size | int | batch size to use | 8 | ### Usage @@ -36,8 +44,16 @@ segment_images_op = ComponentOp.from_registry( name="segment_images", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "model_id": "openmmlab/upernet-convnext-small", - # "batch_size": 0, + # "batch_size": 8, } ) pipeline.add_op(segment_images_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/text_length_filter/README.md b/components/text_length_filter/README.md index 01ee0ba1a..5db996d19 100644 --- a/components/text_length_filter/README.md +++ b/components/text_length_filter/README.md @@ -18,6 +18,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. 
| True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | min_characters_length | int | Minimum number of characters | / | | min_words_length | int | Mininum number of words | / | @@ -33,6 +41,14 @@ text_length_filter_op = ComponentOp.from_registry( name="text_length_filter", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "min_characters_length": 0, # "min_words_length": 0, } diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md index 6ae6fb97f..2d62a28cb 100644 --- a/components/text_normalization/README.md +++ b/components/text_normalization/README.md @@ -30,6 +30,14 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. 
| / | | apply_nfc | bool | If true apply nfc normalization | / | | normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter | / | @@ -48,6 +56,14 @@ text_normalization_op = ComponentOp.from_registry( name="text_normalization", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "remove_additional_whitespaces": False, # "apply_nfc": False, # "normalize_lines": False, diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 1f68f45f5..0ee0f34db 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -18,11 +18,19 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | +| input_manifest_path | str | Path to the input manifest | / | +| component_spec | dict | The component specification as a dictionary | / | +| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | +| cache | bool | Set to False to disable caching, True by default. | True | +| cluster_type | str | The cluster type to use for the execution | default | +| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | +| metadata | str | Metadata arguments containing the run id and base path | / | +| output_manifest_path | str | Path to the output manifest | / | | hf_token | str | The hugging face token used to write to the hub | / | | username | str | The username under which to upload the dataset | / | | dataset_name | str | The name of the dataset to upload | / | -| image_column_names | list | A list containing the image column names. 
Used to format to image to HF hub format | None | -| column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | None | +| image_column_names | list | A list containing the image column names. Used to format to image to HF hub format | / | +| column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | / | ### Usage @@ -36,11 +44,19 @@ write_to_hf_hub_op = ComponentOp.from_registry( name="write_to_hf_hub", arguments={ # Add arguments + # "input_manifest_path": , + # "component_spec": {}, + # "input_partition_rows": 0, + # "cache": True, + # "cluster_type": "default", + # "client_kwargs": {}, + # "metadata": , + # "output_manifest_path": , # "hf_token": , # "username": , # "dataset_name": , - # "image_column_names": "None", - # "column_name_mapping": "None", + # "image_column_names": [], + # "column_name_mapping": {}, } ) pipeline.add_op(write_to_hf_hub_op, dependencies=[...]) #Add previous component as dependency From 2a6ab9125811f992c5729fef2470d1b154e936a6 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Thu, 19 Oct 2023 09:24:33 +0200 Subject: [PATCH 2/2] Remove internal arguments from component READMEs --- components/caption_images/README.md | 16 -------------- components/chunk_text/README.md | 16 -------------- components/download_images/README.md | 16 -------------- components/embed_images/README.md | 16 -------------- components/embed_text/README.md | 16 -------------- .../embedding_based_laion_retrieval/README.md | 16 -------------- components/filter_comments/README.md | 16 -------------- components/filter_image_resolution/README.md | 16 -------------- components/filter_line_length/README.md | 16 -------------- components/image_cropping/README.md | 16 -------------- .../image_resolution_extraction/README.md | 21 +------------------ components/index_weaviate/README.md | 16 -------------- components/language_filter/README.md | 16 -------------- 
components/load_from_files/README.md | 16 -------------- components/load_from_hf_hub/README.md | 16 -------------- components/load_from_parquet/README.md | 16 -------------- components/minhash_generator/README.md | 16 -------------- components/pii_redaction/README.md | 21 +------------------ .../prompt_based_laion_retrieval/README.md | 16 -------------- components/segment_images/README.md | 16 -------------- components/text_length_filter/README.md | 16 -------------- components/text_normalization/README.md | 16 -------------- components/write_to_hf_hub/README.md | 16 -------------- scripts/component_readme/generate_readme.py | 3 ++- 24 files changed, 4 insertions(+), 377 deletions(-) diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 952688497..8bb38e996 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -21,14 +21,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | model_id | str | Id of the BLIP model on the Hugging Face hub | Salesforce/blip-image-captioning-base | | batch_size | int | Batch size to use for inference | 8 | | max_new_tokens | int | Maximum token length of each caption | 50 | @@ -45,14 +37,6 @@ caption_images_op = ComponentOp.from_registry( name="caption_images", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "model_id": "Salesforce/blip-image-captioning-base", # "batch_size": 8, # "max_new_tokens": 50, diff --git a/components/chunk_text/README.md b/components/chunk_text/README.md index 7f4a9d221..97b3309e0 100644 --- a/components/chunk_text/README.md +++ b/components/chunk_text/README.md @@ -26,14 +26,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | chunk_size | int | Maximum size of chunks to return | / | | chunk_overlap | int | Overlap in characters between chunks | / | @@ -49,14 +41,6 @@ chunk_text_op = ComponentOp.from_registry( name="chunk_text", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "chunk_size": 0, # "chunk_overlap": 0, } diff --git a/components/download_images/README.md b/components/download_images/README.md index e03f0edb8..b491007b5 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -30,14 +30,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. | True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | timeout | int | Maximum time (in seconds) to wait when trying to download an image, | 10 | | retries | int | Number of times to retry downloading an image if it fails. 
| / | | n_connections | int | Number of concurrent connections opened per process. Decrease this number if you are running into timeout errors. A lower number of connections can increase the success rate but lower the throughput. | 100 | @@ -59,14 +51,6 @@ download_images_op = ComponentOp.from_registry( name="download_images", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "timeout": 10, # "retries": 0, # "n_connections": 100, diff --git a/components/embed_images/README.md b/components/embed_images/README.md index 1e3fe1820..eec02f577 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -21,14 +21,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | model_id | str | Model id of a CLIP model on the Hugging Face hub | openai/clip-vit-large-patch14 | | batch_size | int | Batch size to use when embedding | 8 | @@ -44,14 +36,6 @@ embed_images_op = ComponentOp.from_registry( name="embed_images", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "model_id": "openai/clip-vit-large-patch14", # "batch_size": 8, } diff --git a/components/embed_text/README.md b/components/embed_text/README.md index f743d483c..a30a9ec4f 100644 --- a/components/embed_text/README.md +++ b/components/embed_text/README.md @@ -22,14 +22,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. | True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | model_provider | str | The provider of the model - corresponding to langchain embedding classes. 
Currently the following providers are supported: aleph_alpha, cohere, huggingface, openai, vertexai. | huggingface | | model | str | The model to generate embeddings from. Choose an available model name to pass to the model provider's langchain embedding class. | / | | api_keys | dict | The API keys to use for the model provider that are written to environment variables.Pass only the keys required by the model provider or conveniently pass all keys you will ever need. Pay attention how to name the dictionary keys so that they can be used by the model provider. | / | @@ -47,14 +39,6 @@ embed_text_op = ComponentOp.from_registry( name="embed_text", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "model_provider": "huggingface", # "model": , # "api_keys": {}, diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index 9e0caa3a6..454253416 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -23,14 +23,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | num_images | int | Number of images to retrieve for each prompt | / | | aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). | 9 | | aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | @@ -47,14 +39,6 @@ embedding_based_laion_retrieval_op = ComponentOp.from_registry( name="embedding_based_laion_retrieval", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "num_images": 0, # "aesthetic_score": 9, # "aesthetic_weight": 0.5, diff --git a/components/filter_comments/README.md b/components/filter_comments/README.md index 6807c5cc7..e0d55e57d 100644 --- a/components/filter_comments/README.md +++ b/components/filter_comments/README.md @@ -18,14 +18,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | min_comments_ratio | float | The minimum code to comment ratio | 0.1 | | max_comments_ratio | float | The maximum code to comment ratio | 0.9 | @@ -41,14 +33,6 @@ filter_comments_op = ComponentOp.from_registry( name="filter_comments", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "min_comments_ratio": 0.1, # "max_comments_ratio": 0.9, } diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 8f9f15463..1bc0c27f5 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -19,14 +19,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | min_image_dim | int | Minimum image dimension | / | | max_aspect_ratio | float | Maximum aspect ratio | / | @@ -42,14 +34,6 @@ filter_image_resolution_op = ComponentOp.from_registry( name="filter_image_resolution", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "min_image_dim": 0, # "max_aspect_ratio": 0.0, } diff --git a/components/filter_line_length/README.md b/components/filter_line_length/README.md index 6ad52fde6..46f5699e4 100644 --- a/components/filter_line_length/README.md +++ b/components/filter_line_length/README.md @@ -20,14 +20,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | avg_line_length_threshold | int | Threshold for average line length to filter on | / | | max_line_length_threshold | int | Threshold for maximum line length to filter on | / | | alphanum_fraction_threshold | float | Alphanum fraction to filter on | / | @@ -44,14 +36,6 @@ filter_line_length_op = ComponentOp.from_registry( name="filter_line_length", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "avg_line_length_threshold": 0, # "max_line_length_threshold": 0, # "alphanum_fraction_threshold": 0.0, diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index 4c36156f6..5d679c457 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -38,14 +38,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | cropping_threshold | int | Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 | -30 | | padding | int | Padding for the image cropping. The padding is added to all borders of the image. | 10 | @@ -61,14 +53,6 @@ image_cropping_op = ComponentOp.from_registry( name="image_cropping", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "cropping_threshold": -30, # "padding": 10, } diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index 1edb9d776..a69a4df4e 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -19,18 +19,7 @@ Component that extracts image resolution data from the images ### Arguments -The component takes the following arguments to alter its behavior: - -| argument | type | description | default | -| -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | +This component takes no arguments. ### Usage @@ -44,14 +33,6 @@ image_resolution_extraction_op = ComponentOp.from_registry( name="image_resolution_extraction", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , } ) pipeline.add_op(image_resolution_extraction_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/index_weaviate/README.md b/components/index_weaviate/README.md index 1e5ea0093..ce4729c52 100644 --- a/components/index_weaviate/README.md +++ b/components/index_weaviate/README.md @@ -19,14 +19,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. | True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | weaviate_url | str | The URL of the weaviate instance. 
| http://localhost:8080 | | batch_size | int | The batch size to be used.Parameter of weaviate.batch.Batch().configure(). | 100 | | dynamic | bool | Whether to use dynamic batching or not.Parameter of weaviate.batch.Batch().configure(). | True | @@ -47,14 +39,6 @@ index_weaviate_op = ComponentOp.from_registry( name="index_weaviate", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "weaviate_url": "http://localhost:8080", # "batch_size": 100, # "dynamic": True, diff --git a/components/language_filter/README.md b/components/language_filter/README.md index e7473b8c4..c3afd6435 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -18,14 +18,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. | True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | language | str | A valid language code or identifier (e.g., "en", "fr", "de"). 
| en | ### Usage @@ -40,14 +32,6 @@ language_filter_op = ComponentOp.from_registry( name="language_filter", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "language": "en", } ) diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index e14110cbe..834f568e5 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -21,14 +21,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | directory_uri | str | Local or remote path to the directory containing the files | / | ### Usage @@ -43,14 +35,6 @@ load_from_files_op = ComponentOp.from_registry( name="load_from_files", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "directory_uri": , } ) diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index 741d62d06..1faa0175a 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -18,14 +18,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | dataset_name | str | Name of dataset on the hub | / | | column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | / | | image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | / | @@ -44,14 +36,6 @@ load_from_hf_hub_op = ComponentOp.from_registry( name="load_from_hf_hub", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "dataset_name": , # "column_name_mapping": {}, # "image_column_names": [], diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index 23a457748..c83f7e9e8 100644 --- a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -18,14 +18,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | dataset_uri | str | The remote path to the parquet file/folder containing the dataset | / | | column_name_mapping | dict | Mapping of the consumed dataset | / | | n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / | @@ -43,14 +35,6 @@ load_from_parquet_op = ComponentOp.from_registry( name="load_from_parquet", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "dataset_uri": , # "column_name_mapping": {}, # "n_rows_to_load": 0, diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index d317dfd06..422fdc7af 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -21,14 +21,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | shingle_ngram_size | int | Define size of ngram used for the shingle generation | 3 | ### Usage @@ -43,14 +35,6 @@ minhash_generator_op = ComponentOp.from_registry( name="minhash_generator", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "shingle_ngram_size": 3, } ) diff --git a/components/pii_redaction/README.md b/components/pii_redaction/README.md index 8d4b2e17b..e9143486b 100644 --- a/components/pii_redaction/README.md +++ b/components/pii_redaction/README.md @@ -37,18 +37,7 @@ code. ### Arguments -The component takes the following arguments to alter its behavior: - -| argument | type | description | default | -| -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. | True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | +This component takes no arguments. 
### Usage @@ -62,14 +51,6 @@ pii_redaction_op = ComponentOp.from_registry( name="pii_redaction", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , } ) pipeline.add_op(pii_redaction_op, dependencies=[...]) #Add previous component as dependency diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index 1992ad03b..45fb1c034 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -26,14 +26,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. | True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | num_images | int | Number of images to retrieve for each prompt | / | | aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). 
| 9 | | aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | @@ -51,14 +43,6 @@ prompt_based_laion_retrieval_op = ComponentOp.from_registry( name="prompt_based_laion_retrieval", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "num_images": 0, # "aesthetic_score": 9, # "aesthetic_weight": 0.5, diff --git a/components/segment_images/README.md b/components/segment_images/README.md index b06c5125d..9f475d516 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -21,14 +21,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | model_id | str | id of the model on the Hugging Face hub | openmmlab/upernet-convnext-small | | batch_size | int | batch size to use | 8 | @@ -44,14 +36,6 @@ segment_images_op = ComponentOp.from_registry( name="segment_images", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "model_id": "openmmlab/upernet-convnext-small", # "batch_size": 8, } diff --git a/components/text_length_filter/README.md b/components/text_length_filter/README.md index 5db996d19..01ee0ba1a 100644 --- a/components/text_length_filter/README.md +++ b/components/text_length_filter/README.md @@ -18,14 +18,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | min_characters_length | int | Minimum number of characters | / | | min_words_length | int | Minimum number of words | / | ### Usage @@ -41,14 +33,6 @@ text_length_filter_op = ComponentOp.from_registry( name="text_length_filter", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "min_characters_length": 0, # "min_words_length": 0, } ) diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md index 2d62a28cb..6ae6fb97f 100644 --- a/components/text_normalization/README.md +++ b/components/text_normalization/README.md @@ -30,14 +30,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. | True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. 
| / | | apply_nfc | bool | If true apply nfc normalization | / | | normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to remove common patterns in webpages, e.g. counter | / | @@ -56,14 +48,6 @@ text_normalization_op = ComponentOp.from_registry( name="text_normalization", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "remove_additional_whitespaces": False, # "apply_nfc": False, # "normalize_lines": False, diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 0ee0f34db..54978470a 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -18,14 +18,6 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| input_manifest_path | str | Path to the input manifest | / | -| component_spec | dict | The component specification as a dictionary | / | -| input_partition_rows | int | The number of rows to load per partition. Set to override the automatic partitioning | / | -| cache | bool | Set to False to disable caching, True by default. 
| True | -| cluster_type | str | The cluster type to use for the execution | default | -| client_kwargs | dict | Keyword arguments to pass to the Dask client | / | -| metadata | str | Metadata arguments containing the run id and base path | / | -| output_manifest_path | str | Path to the output manifest | / | | hf_token | str | The hugging face token used to write to the hub | / | | username | str | The username under which to upload the dataset | / | | dataset_name | str | The name of the dataset to upload | / | @@ -44,14 +36,6 @@ write_to_hf_hub_op = ComponentOp.from_registry( name="write_to_hf_hub", arguments={ # Add arguments - # "input_manifest_path": , - # "component_spec": {}, - # "input_partition_rows": 0, - # "cache": True, - # "cluster_type": "default", - # "client_kwargs": {}, - # "metadata": , - # "output_manifest_path": , # "hf_token": , # "username": , # "dataset_name": , diff --git a/scripts/component_readme/generate_readme.py b/scripts/component_readme/generate_readme.py index 7edd5902c..d6dbb2700 100644 --- a/scripts/component_readme/generate_readme.py +++ b/scripts/component_readme/generate_readme.py @@ -24,7 +24,8 @@ def generate_readme(component_spec: ComponentSpec, *, component_dir: Path) -> st description=component_spec.description, consumes=component_spec.consumes, produces=component_spec.produces, - arguments=component_spec.args.values(), + arguments=[arg for arg in component_spec.args.values() + if arg.name not in component_spec.default_arguments], tests=(component_dir / "tests").exists() )