From 9d4877f9282029dc2663e6e9b1686e3eef68f573 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Wed, 4 Oct 2023 13:24:58 +0200 Subject: [PATCH 1/5] Add script to generate component readmes --- scripts/component_readme/generate_readme.py | 45 ++++++++++++++ scripts/component_readme/readme_template.md | 65 +++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 scripts/component_readme/generate_readme.py create mode 100644 scripts/component_readme/readme_template.md diff --git a/scripts/component_readme/generate_readme.py b/scripts/component_readme/generate_readme.py new file mode 100644 index 000000000..831c66bab --- /dev/null +++ b/scripts/component_readme/generate_readme.py @@ -0,0 +1,45 @@ +import argparse +from pathlib import Path + +import jinja2 +from fondant.component_spec import ComponentSpec + + +def read_component_spec(component_dir: Path) -> ComponentSpec: + return ComponentSpec.from_file(component_dir / "fondant_component.yaml") + + +def generate_readme(component_spec: ComponentSpec, *, id_: str) -> str: + template_path = Path(__file__).with_name("readme_template.md") + with open(template_path, "r") as f: + template = jinja2.Template(f.read(), trim_blocks=True) + + return template.render( + id=id_, + name=component_spec.name, + description=component_spec.description, + consumes=component_spec.consumes, + produces=component_spec.produces, + arguments=component_spec.args.values(), + ) + + +def write_readme(readme: str, component_dir: Path) -> None: + with open(component_dir / "README.md", "w") as f: + f.write(readme) + + +def main(component_dir: Path): + component_spec = read_component_spec(component_dir) + readme = generate_readme(component_spec, id_=component_dir.name) + write_readme(readme, component_dir=component_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--component_dir", + type=Path, + help="Path to the component to generate a readme for") + args = parser.parse_args() + 
+ main(args.component_dir) diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md new file mode 100644 index 000000000..a0d6e7ae7 --- /dev/null +++ b/scripts/component_readme/readme_template.md @@ -0,0 +1,65 @@ +# {{ name }} + +### Description +{{ description }} + +### Inputs/Outputs + +**The component consumes:** +{% for subset_name, subset in consumes.items() %} +- {{ subset_name }} +{% for field in subset.fields.values() %} + - {{ field.name }}: {{ field.type.value }} +{% endfor %} +{% endfor %} + +**The component produces:** +{% for subset_name, subset in produces.items() %} +- {{ subset_name }} +{% for field in subset.fields.values() %} + - {{ field.name }}: {{ field.type.value }} +{% endfor %} +{% endfor %} + +### Arguments + +{% if arguments %} +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +{% for argument in arguments %} +| {{ argument.name }} | {{ argument.type }} | {{ argument.description }} | +{% endfor %} +{% else %} +This component takes no arguments. +{% endif %} + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +{{ id }}_op = ComponentOp.from_registry( + name="{{ id }}", + arguments={ + # Add arguments +{% for argument in arguments %} +{% if argument.default %} + "{{ argument.name }}": {{ argument.default }}, +{% endif %} +{% endfor %} + } +) +pipeline.add_op({{ id }}_op, dependencies=[...]) # Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` From fb6d32a412def27512908d0c846fef11b5c3edac Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Wed, 4 Oct 2023 13:25:37 +0200 Subject: [PATCH 2/5] Add readmes for all components --- components/caption_images/README.md | 50 ++++++++++++-- .../caption_images/fondant_component.yaml | 8 +-- components/download_images/README.md | 64 ++++++++++++++++-- .../download_images/fondant_component.yaml | 17 ++++- components/embed_images/README.md | 48 ++++++++++++-- .../embed_images/fondant_component.yaml | 2 +- .../embedding_based_laion_retrieval/README.md | 52 +++++++++++++++ .../fondant_component.yaml | 6 +- components/filter_comments/README.md | 47 +++++++++++++ components/filter_image_resolution/README.md | 46 +++++++++++++ components/filter_line_length/README.md | 48 ++++++++++++++ components/image_cropping/README.md | 66 ++++++++++++++++--- .../image_cropping/fondant_component.yaml | 17 ++++- .../image_resolution_extraction/README.md | 44 +++++++++++++ components/language_filter/README.md | 48 ++++++++++++-- .../language_filter/fondant_component.yaml | 2 +- components/load_from_files/README.md | 58 +++++++++------- .../load_from_files/fondant_component.yaml | 4 +- components/load_from_hf_hub/README.md | 51 ++++++++++++++ components/load_from_parquet/README.md | 50 ++++++++++++++ components/minhash_generator/README.md | 47 +++++++++++++ components/pii_redaction/README.md | 63 ++++++++++++++++-- .../pii_redaction/fondant_component.yaml | 22 ++++++- .../prompt_based_laion_retrieval/README.md | 56 ++++++++++++++-- .../fondant_component.yaml | 7 +- components/segment_images/README.md | 48 ++++++++++++++ components/text_length_filter/README.md | 45 +++++++++++++ components/text_normalization/README.md | 58 ++++++++++++++-- .../text_normalization/fondant_component.yaml | 16 ++++- components/write_to_hf_hub/README.md | 50 ++++++++++++++ 30 files changed, 1057 insertions(+), 83 deletions(-) create mode 100644 
components/embedding_based_laion_retrieval/README.md create mode 100644 components/filter_comments/README.md create mode 100644 components/filter_image_resolution/README.md create mode 100644 components/filter_line_length/README.md create mode 100644 components/image_resolution_extraction/README.md create mode 100644 components/load_from_hf_hub/README.md create mode 100644 components/load_from_parquet/README.md create mode 100644 components/minhash_generator/README.md create mode 100644 components/segment_images/README.md create mode 100644 components/text_length_filter/README.md create mode 100644 components/write_to_hf_hub/README.md diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 83a78e706..02b2c6b9d 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -1,9 +1,51 @@ -# caption_images +# Caption images ### Description -This component captions inputted images using [BLIP](https://huggingface.co/docs/transformers/model_doc/blip). +This component captions images using a BLIP model from the Hugging Face hub -### **Inputs/Outputs** +### Inputs/Outputs -See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description on all the input/output parameters. 
+**The component comsumes:** +- images + - data: binary +**The component produces:** +- captions + - text: string + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| model_id | str | Id of the BLIP model on the Hugging Face hub | +| batch_size | int | Batch size to use for inference | +| max_new_tokens | int | Maximum token length of each caption | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +caption_images_op = ComponentOp.from_registry( + name="caption_images", + arguments={ + # Add arguments + "model_id": Salesforce/blip-image-captioning-base, + "batch_size": 8, + "max_new_tokens": 50, + } +) +pipeline.add_op(Caption images_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file diff --git a/components/caption_images/fondant_component.yaml b/components/caption_images/fondant_component.yaml index 24a9f6815..9f12f6ef1 100644 --- a/components/caption_images/fondant_component.yaml +++ b/components/caption_images/fondant_component.yaml @@ -1,5 +1,5 @@ name: Caption images -description: Component that captions images using a model from the Hugging Face hub +description: This component captions images using a BLIP model from the Hugging Face hub image: ghcr.io/ml6team/caption_images:dev consumes: @@ -16,14 +16,14 @@ produces: args: model_id: - description: id of the model on the Hugging Face hub + description: Id of the BLIP model on the Hugging Face hub type: str default: "Salesforce/blip-image-captioning-base" batch_size: - description: batch size to use + description: Batch size to use for inference type: int default: 8 max_new_tokens: - description: maximum token length of each caption + description: Maximum token length of each caption type: int default: 50 \ No newline at end of file diff --git a/components/download_images/README.md b/components/download_images/README.md index b759e8b56..2fe88a3d1 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -1,15 +1,67 @@ -# download_images +# Download images ### Description -This component takes in image URLs as input and downloads the images, along with some metadata (like their height and width). -The images are stored in a new colum as bytes objects. This component also resizes the images using the [resizer](https://github.com/rom1504/img2dataset/blob/main/img2dataset/resizer.py) function from the img2dataset library. +Component that downloads images from a list of URLs. -If the component is unable to retrieve the image at a URL (for any reason), it will return `None` for that particular URL. 
+This component takes in image URLs as input and downloads the images, along with some metadata +(like their height and width). The images are stored in a new colum as bytes objects. This +component also resizes the images using the +[resizer](https://github.com/rom1504/img2dataset/blob/main/img2dataset/resizer.py) function +from the img2dataset library. -### **Inputs/Outputs** -See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description on all the input/output parameters. +### Inputs/Outputs +**The component comsumes:** +- images + - url: string + +**The component produces:** +- images + - data: binary + - width: int32 + - height: int32 + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| timeout | int | Maximum time (in seconds) to wait when trying to download an image, | +| retries | int | Number of times to retry downloading an image if it fails. | +| n_connections | int | Number of concurrent connections opened per process. Decrease this number if you are running +into timeout errors. A lower number of connections can increase the success rate but lower +the throughput. + | +| image_size | int | Size of the images after resizing. | +| resize_mode | str | Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". | +| resize_only_if_bigger | bool | If True, resize only if image is bigger than image_size. | +| min_image_size | int | Minimum size of the images. | +| max_aspect_ratio | float | Maximum aspect ratio of the images. 
| + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +download_images_op = ComponentOp.from_registry( + name="download_images", + arguments={ + # Add arguments + "timeout": 10, + "n_connections": 100, + "image_size": 256, + "resize_mode": border, + "resize_only_if_bigger": False, + "max_aspect_ratio": inf, + } +) +pipeline.add_op(Download images_op, dependencies=[...]) #Add previous component as dependency +``` ### Testing diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index bc3d0a647..4e329e79e 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -1,5 +1,13 @@ name: Download images -description: Component that downloads images based on URLs +description: | + Component that downloads images from a list of URLs. + + This component takes in image URLs as input and downloads the images, along with some metadata + (like their height and width). The images are stored in a new colum as bytes objects. This + component also resizes the images using the + [resizer](https://github.com/rom1504/img2dataset/blob/main/img2dataset/resizer.py) function + from the img2dataset library. + image: ghcr.io/ml6team/download_images:dev consumes: @@ -21,7 +29,7 @@ produces: args: timeout: - description: Maximum time (in seconds) to wait when trying to download an image + description: Maximum time (in seconds) to wait when trying to download an image, type: int default: 10 retries: @@ -29,7 +37,10 @@ args: type: int default: 0 n_connections: - description: Number of concurrent connections opened per process. Decrease this number if you are running into timeout errors. A lower number of connections can increase the success rate but lower the throughput. + description: | + Number of concurrent connections opened per process. 
Decrease this number if you are running + into timeout errors. A lower number of connections can increase the success rate but lower + the throughput. type: int default: 100 image_size: diff --git a/components/embed_images/README.md b/components/embed_images/README.md index 126e6844b..0c59286e6 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -1,9 +1,49 @@ # Embed images ### Description -This component takes images as input and embeds them using a CLIP model from Hugging Face. -The embeddings are stored in a new colum as arrays of floats. +Component that generates CLIP embeddings from images -### **Inputs/Outputs** +### Inputs/Outputs -See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description on all the input/output parameters. \ No newline at end of file +**The component comsumes:** +- images + - data: binary + +**The component produces:** +- embeddings + - data: list + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| model_id | str | Model id of a CLIP model on the Hugging Face hub | +| batch_size | int | Batch size to use when embedding | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +embed_images_op = ComponentOp.from_registry( + name="embed_images", + arguments={ + # Add arguments + "model_id": openai/clip-vit-large-patch14, + "batch_size": 8, + } +) +pipeline.add_op(Embed images_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file diff --git a/components/embed_images/fondant_component.yaml b/components/embed_images/fondant_component.yaml index d56868031..3c84e2e16 100644 --- a/components/embed_images/fondant_component.yaml +++ b/components/embed_images/fondant_component.yaml @@ -1,5 +1,5 @@ name: Embed images -description: Component that embeds images using CLIP +description: Component that generates CLIP embeddings from images image: ghcr.io/ml6team/embed_images:dev consumes: diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md new file mode 100644 index 000000000..24981ec26 --- /dev/null +++ b/components/embedding_based_laion_retrieval/README.md @@ -0,0 +1,52 @@ +# Embedding based LAION retrieval + +### Description +This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings. It can be +used to find images similar to the embedded images / captions. + + +### Inputs/Outputs + +**The component comsumes:** +- embeddings + - data: list + +**The component produces:** +- images + - url: string + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| num_images | int | Number of images to retrieve for each prompt | +| aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). 
| +| aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +embedding_based_laion_retrieval_op = ComponentOp.from_registry( + name="embedding_based_laion_retrieval", + arguments={ + # Add arguments + "aesthetic_score": 9, + "aesthetic_weight": 0.5, + } +) +pipeline.add_op(Embedding based LAION retrieval_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/embedding_based_laion_retrieval/fondant_component.yaml b/components/embedding_based_laion_retrieval/fondant_component.yaml index 0380ba526..23c5d60da 100644 --- a/components/embedding_based_laion_retrieval/fondant_component.yaml +++ b/components/embedding_based_laion_retrieval/fondant_component.yaml @@ -1,5 +1,7 @@ -name: LAION retrieval -description: A component that retrieves image URLs from LAION-5B based on a set of CLIP embeddings +name: Embedding based LAION retrieval +description: | + This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings. It can be + used to find images similar to the embedded images / captions. 
image: ghcr.io/ml6team/embedding_based_laion_retrieval:dev consumes: diff --git a/components/filter_comments/README.md b/components/filter_comments/README.md new file mode 100644 index 000000000..9310c6595 --- /dev/null +++ b/components/filter_comments/README.md @@ -0,0 +1,47 @@ +# Filter comments + +### Description +Component that filters code based on the code to comment ratio + +### Inputs/Outputs + +**The component comsumes:** +- code + - content: string + +**The component produces:** + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| min_comments_ratio | float | The minimum code to comment ratio | +| max_comments_ratio | float | The maximum code to comment ratio | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +filter_comments_op = ComponentOp.from_registry( + name="filter_comments", + arguments={ + # Add arguments + "min_comments_ratio": 0.1, + "max_comments_ratio": 0.9, + } +) +pipeline.add_op(Filter comments_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md new file mode 100644 index 000000000..445556ba3 --- /dev/null +++ b/components/filter_image_resolution/README.md @@ -0,0 +1,46 @@ +# Filter image resolution + +### Description +Component that filters images based on minimum size and max aspect ratio + +### Inputs/Outputs + +**The component comsumes:** +- images + - width: int32 + - height: int32 + +**The component produces:** + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| min_image_dim | int | Minimum image dimension | +| max_aspect_ratio | float | Maximum aspect ratio | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +filter_image_resolution_op = ComponentOp.from_registry( + name="filter_image_resolution", + arguments={ + # Add arguments + } +) +pipeline.add_op(Filter image resolution_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file diff --git a/components/filter_line_length/README.md b/components/filter_line_length/README.md new file mode 100644 index 000000000..3e82c1d68 --- /dev/null +++ b/components/filter_line_length/README.md @@ -0,0 +1,48 @@ +# Filter line length + +### Description +Component that filters code based on line length + +### Inputs/Outputs + +**The component comsumes:** +- code + - avg_line_length: double + - max_line_length: int32 + - alphanum_fraction: double + +**The component produces:** + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| avg_line_length_threshold | int | Threshold for average line length to filter on | +| max_line_length_threshold | int | Threshold for maximum line length to filter on | +| alphanum_fraction_threshold | float | Alphanum fraction to filter on | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +filter_line_length_op = ComponentOp.from_registry( + name="filter_line_length", + arguments={ + # Add arguments + } +) +pipeline.add_op(Filter line length_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index 1aa393ed2..ccfcfae27 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -1,16 +1,66 @@ -# Image cropping component +# Image cropping -## Description -This component is based on the `TransformComponent` and is used to crop out image borders. This is typically useful when working with graphical images that have single-color borders (e.g. logos, icons, etc.). 
+### Description +This component crops out image borders. This is typically useful when working with graphical +images that have single-color borders (e.g. logos, icons, etc.). -## Usage -The component takes an image and calculates which color is most present in the border. It then crops the image in order to minimize this single-color border. The `padding` argument will add extra border to the image before cropping it, in order to avoid cutting off parts of the image. -The resulting crop will always be square. If a crop is not possible, the component will return the original image. +The component takes an image and calculates which color is most present in the border. It then +crops the image in order to minimize this single-color border. The `padding` argument will add +extra border to the image before cropping it, in order to avoid cutting off parts of the image. +The resulting crop will always be square. If a crop is not possible, the component will return +the original image. -## Examples -Examples of image cropping by removing the single-color border. Left side is original image, right side is border-cropped image. +#### Examples +Examples of image cropping by removing the single-color border. Left side is original image, +right side is border-cropped image. ![Example of image cropping by removing the single-color border. Left side is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png) ![Example of image cropping by removing the single-color border. 
Left side is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png) +### Inputs/Outputs + +**The component comsumes:** +- images + - data: binary + +**The component produces:** +- images + - data: binary + - width: int32 + - height: int32 + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| cropping_threshold | int | Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 | +| padding | int | Padding for the image cropping. The padding is added to all borders of the image. | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +image_cropping_op = ComponentOp.from_registry( + name="image_cropping", + arguments={ + # Add arguments + "cropping_threshold": -30, + "padding": 10, + } +) +pipeline.add_op(Image cropping_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/image_cropping/fondant_component.yaml b/components/image_cropping/fondant_component.yaml index a4fedb9a4..65072471d 100644 --- a/components/image_cropping/fondant_component.yaml +++ b/components/image_cropping/fondant_component.yaml @@ -1,6 +1,21 @@ name: Image cropping -description: Component that removes single-colored borders around images and crops them appropriately image: ghcr.io/ml6team/image_cropping:dev +description: | + This component crops out image borders. This is typically useful when working with graphical + images that have single-color borders (e.g. logos, icons, etc.). 
+ + The component takes an image and calculates which color is most present in the border. It then + crops the image in order to minimize this single-color border. The `padding` argument will add + extra border to the image before cropping it, in order to avoid cutting off parts of the image. + The resulting crop will always be square. If a crop is not possible, the component will return + the original image. + + #### Examples + Examples of image cropping by removing the single-color border. Left side is original image, + right side is border-cropped image. + + ![Example of image cropping by removing the single-color border. Left side is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png) + ![Example of image cropping by removing the single-color border. Left side is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png) consumes: images: diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md new file mode 100644 index 000000000..6b4afeff6 --- /dev/null +++ b/components/image_resolution_extraction/README.md @@ -0,0 +1,44 @@ +# Image resolution extraction + +### Description +Component that extracts image resolution data from the images + +### Inputs/Outputs + +**The component comsumes:** +- images + - data: binary + +**The component produces:** +- images + - data: binary + - width: int32 + - height: int32 + +### Arguments + +This component takes no arguments. 
+ +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +image_resolution_extraction_op = ComponentOp.from_registry( + name="image_resolution_extraction", + arguments={ + # Add arguments + } +) +pipeline.add_op(Image resolution extraction_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/language_filter/README.md b/components/language_filter/README.md index ba9bd4636..b508b9486 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -1,7 +1,45 @@ -# Language filter +# Filter languages -## Description -This component is based on the `TransformComponent` and is used to filter a dataframe based on language. -It allows you to remove rows that do not match the provided language, thus providing a way to focus -on specific languages within your data. +### Description +A component that filters text based on the provided language. +### Inputs/Outputs + +**The component comsumes:** +- text + - data: string + +**The component produces:** + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| language | str | A valid language code or identifier (e.g., "en", "fr", "de"). | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +language_filter_op = ComponentOp.from_registry( + name="language_filter", + arguments={ + # Add arguments + "language": en, + } +) +pipeline.add_op(Filter languages_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. 
From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/language_filter/fondant_component.yaml b/components/language_filter/fondant_component.yaml index d639a9a6e..b40f43a1f 100644 --- a/components/language_filter/fondant_component.yaml +++ b/components/language_filter/fondant_component.yaml @@ -1,5 +1,5 @@ name: Filter languages -description: A component that filters text based on the language. +description: A component that filters text based on the provided language. image: ghcr.io/ml6team/filter_language:latest consumes: diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index 962baab36..aa1126f1f 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -1,35 +1,47 @@ # Load from files -## Description -This component is based on the `DaskLoadComponent` and is used to load dataset from files within a directory. -It allows you to load datasets which -- Have files within a local data directory -- Have compressed files like .zip, gzip, tar or tar.gz within the data directory -- Are hosted on remote locations like AWS S3 bucket, Azure's Blob storage or GCP's cloud storage +### Description +This component loads data from files in a local or remote (AWS S3, Azure Blob storage, GCS) +location. It supports the following formats: .zip, gzip, tar and tar.gz. 
-And returns a dataframe with two columns -- file_filename(containing the file name in string format) -- file_content (containing the respective file content in bytes format) -Here is an illustration of how to use this component in your pipeline -on a local directory with zip files +### Inputs/Outputs + +**The component comsumes:** + +**The component produces:** +- file + - filename: string + - content: binary + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| directory_uri | str | Local or remote path to the directory containing the files | + +### Usage + +You can add this component to your pipeline using the following code: ```python -from fondant.pipeline import Pipeline, ComponentOp +from fondant.pipeline import ComponentOp -my_pipeline = Pipeline( - pipeline_name="my_pipeline", - base_path="./", # TODO: update this - pipeline_description="This is my pipeline", -) -load_from_files = ComponentOp( - component_dir="components/load_from_files", +load_from_files_op = ComponentOp.from_registry( + name="load_from_files", arguments={ - "directory_uri": "./data.zip", # change this to your - # directory_uri, remote or local - }, + # Add arguments + } ) +pipeline.add_op(Load from files_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing -my_pipeline.add_op(load_from_files, dependencies=[]) +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test ``` \ No newline at end of file diff --git a/components/load_from_files/fondant_component.yaml b/components/load_from_files/fondant_component.yaml index 2673e13db..6481c8e14 100644 --- a/components/load_from_files/fondant_component.yaml +++ b/components/load_from_files/fondant_component.yaml @@ -1,5 +1,7 @@ name: Load from files -description: Component that loads a dataset from files +description: | + This component loads data from files in a local or remote (AWS S3, Azure Blob storage, GCS) + location. It supports the following formats: .zip, gzip, tar and tar.gz. image: ghcr.io/ml6team/load_from_files:dev produces: diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md new file mode 100644 index 000000000..a884c8711 --- /dev/null +++ b/components/load_from_hf_hub/README.md @@ -0,0 +1,51 @@ +# Load from hub + +### Description +Component that loads a dataset from the hub + +### Inputs/Outputs + +**The component comsumes:** + +**The component produces:** +- dummy_variable + - data: binary + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| dataset_name | str | Name of dataset on the hub | +| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | +| image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale | +| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +load_from_hf_hub_op = ComponentOp.from_registry( + name="load_from_hf_hub", + arguments={ + # Add arguments + "image_column_names": None, + "n_rows_to_load": None, + "index_column": None, + } +) +pipeline.add_op(Load from hub_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md new file mode 100644 index 000000000..7f5d01db9 --- /dev/null +++ b/components/load_from_parquet/README.md @@ -0,0 +1,50 @@ +# Load from parquet + +### Description +Component that loads a dataset from a parquet uri + +### Inputs/Outputs + +**The component comsumes:** + +**The component produces:** +- dummy_variable + - data: binary + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | +| column_name_mapping | dict | Mapping of the consumed dataset | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale | +| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +load_from_parquet_op = ComponentOp.from_registry( + name="load_from_parquet", + arguments={ + # Add arguments + "column_name_mapping": None, + "n_rows_to_load": None, + "index_column": None, + } +) +pipeline.add_op(Load from parquet_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md new file mode 100644 index 000000000..1240ec7c4 --- /dev/null +++ b/components/minhash_generator/README.md @@ -0,0 +1,47 @@ +# MinHash generator + +### Description +A component that generates minhashes of text. + +### Inputs/Outputs + +**The component comsumes:** +- text + - data: string + +**The component produces:** +- text + - minhash: list + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| shingle_ngram_size | int | Define size of ngram used for the shingle generation | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +minhash_generator_op = ComponentOp.from_registry( + name="minhash_generator", + arguments={ + # Add arguments + "shingle_ngram_size": 3, + } +) +pipeline.add_op(MinHash generator_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file diff --git a/components/pii_redaction/README.md b/components/pii_redaction/README.md index 39183017d..f3a13d26e 100644 --- a/components/pii_redaction/README.md +++ b/components/pii_redaction/README.md @@ -1,13 +1,62 @@ -## PII detection and redaction +# PII redaction -This component detects and redacts Personal Identifiable Information (PII) from code. Redaction means that sensitive data is replaced by random data. +### Description +This component detects and redacts Personal Identifiable Information (PII) from code. +Redaction means that sensitive data is replaced by random data. -The code is based on the PII removal code used as part of the [BigCode project](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii). +The code is based on the PII removal code used as part of the +[BigCode project](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii). -### PII detection +#### PII detection -The component detects emails, IP addresses and API/SSH keys in text datasets (in particular datasets of source code). Regexes are used for emails and IP addresses (they are adapted from [BigScience PII pipeline](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/02_pii)). The [`detect-secrets`](https://github.com/Yelp/detect-secrets) package is used for finding secrets keys. Additionally filters are implemented on top to reduce the number of false positives, using the [gibberish-detector](https://github.com/domanchi/gibberish-detector) package. +The component detects emails, IP addresses and API/SSH keys in text datasets (in particular +datasets of source code). Regexes are used for emails and IP addresses (they are adapted from +[BigScience PII pipeline](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/02_pii)). +The [`detect-secrets`](https://github.com/Yelp/detect-secrets) package is used for finding +secrets keys. 
Additionally filters are implemented on top to reduce the number of false +positives, using the [gibberish-detector](https://github.com/domanchi/gibberish-detector) package. -### PII redaction +#### PII redaction -PII is replaced by random data which is stored in the `replacements.json` file. \ No newline at end of file +PII is replaced by random data which is stored in the `replacements.json` file. +A component that detects and redacts Personal Identifiable Information (PII) from +code. + + +### Inputs/Outputs + +**The component comsumes:** +- code + - content: string + +**The component produces:** +- code + - content: string + +### Arguments + +This component takes no arguments. + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +pii_redaction_op = ComponentOp.from_registry( + name="pii_redaction", + arguments={ + # Add arguments + } +) +pipeline.add_op(PII redaction_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/pii_redaction/fondant_component.yaml b/components/pii_redaction/fondant_component.yaml index 11d1166b7..b64f11fa3 100644 --- a/components/pii_redaction/fondant_component.yaml +++ b/components/pii_redaction/fondant_component.yaml @@ -1,5 +1,25 @@ name: PII redaction -description: A component that detects and redacts Personal Identifiable Information (PII) from code. +description: | + This component detects and redacts Personal Identifiable Information (PII) from code. + Redaction means that sensitive data is replaced by random data. + + The code is based on the PII removal code used as part of the + [BigCode project](https://github.com/bigcode-project/bigcode-dataset/tree/main/pii). 
+ + #### PII detection + + The component detects emails, IP addresses and API/SSH keys in text datasets (in particular + datasets of source code). Regexes are used for emails and IP addresses (they are adapted from + [BigScience PII pipeline](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/02_pii)). + The [`detect-secrets`](https://github.com/Yelp/detect-secrets) package is used for finding + secrets keys. Additionally filters are implemented on top to reduce the number of false + positives, using the [gibberish-detector](https://github.com/domanchi/gibberish-detector) package. + + #### PII redaction + + PII is replaced by random data which is stored in the `replacements.json` file. + A component that detects and redacts Personal Identifiable Information (PII) from + code. image: ghcr.io/ml6team/pii_redaction:dev consumes: diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index ef6c68464..63f4813d7 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -1,9 +1,57 @@ -# prompt_based_laion_retrieval +# LAION retrieval ### Description -This component retrieves image URLs from the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) based on text prompts. The retrieval itself is done based on CLIP embeddings similarity between the prompt sentences and the captions in the LAION dataset. This component doesn’t return the actual images, only URLs. +This component retrieves image URLs from the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) +based on text prompts. The retrieval itself is done based on CLIP embeddings similarity between +the prompt sentences and the captions in the LAION dataset. -### **Inputs/Outputs** +This component doesn’t return the actual images, only URLs. 
-See the [`component specification`](fondant_component.yaml) for a more detailed description of all the input/output parameters. +### Inputs/Outputs + +**The component comsumes:** +- prompts + - text: string + +**The component produces:** +- images + - url: string + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| num_images | int | Number of images to retrieve for each prompt | +| aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). | +| aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | +| url | str | The url of the backend clip retrieval service, defaults to the public service | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +prompt_based_laion_retrieval_op = ComponentOp.from_registry( + name="prompt_based_laion_retrieval", + arguments={ + # Add arguments + "aesthetic_score": 9, + "aesthetic_weight": 0.5, + "url": https://knn.laion.ai/knn-service, + } +) +pipeline.add_op(LAION retrieval_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml index 544f7afc8..88f8d20dd 100644 --- a/components/prompt_based_laion_retrieval/fondant_component.yaml +++ b/components/prompt_based_laion_retrieval/fondant_component.yaml @@ -1,5 +1,10 @@ name: LAION retrieval -description: A component that retrieves image URLs from LAION-5B based on a set of seed prompts +description: | + This component retrieves image URLs from the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) + based on text prompts. The retrieval itself is done based on CLIP embeddings similarity between + the prompt sentences and the captions in the LAION dataset. + + This component doesn’t return the actual images, only URLs. image: ghcr.io/ml6team/prompt_based_laion_retrieval:dev consumes: diff --git a/components/segment_images/README.md b/components/segment_images/README.md new file mode 100644 index 000000000..4524e4672 --- /dev/null +++ b/components/segment_images/README.md @@ -0,0 +1,48 @@ +# Segment images + +### Description +Component that creates segmentation masks for images using a model from the Hugging Face hub + +### Inputs/Outputs + +**The component comsumes:** +- images + - data: binary + +**The component produces:** +- segmentations + - data: binary + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| model_id | str | id of the model on the Hugging Face hub | +| batch_size | int | batch size to use | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +segment_images_op = ComponentOp.from_registry( + name="segment_images", + arguments={ + # Add arguments + "model_id": openmmlab/upernet-convnext-small, + } +) +pipeline.add_op(Segment images_op, 
dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/text_length_filter/README.md b/components/text_length_filter/README.md new file mode 100644 index 000000000..c1d89619b --- /dev/null +++ b/components/text_length_filter/README.md @@ -0,0 +1,45 @@ +# Filter text length + +### Description +A component that filters out text based on their length + +### Inputs/Outputs + +**The component comsumes:** +- text + - data: string + +**The component produces:** + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| min_characters_length | int | Minimum number of characters | +| min_words_length | int | Mininum number of words | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +text_length_filter_op = ComponentOp.from_registry( + name="text_length_filter", + arguments={ + # Add arguments + } +) +pipeline.add_op(Filter text length_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md index 7b01ecbd7..8c70e585e 100644 --- a/components/text_normalization/README.md +++ b/components/text_normalization/README.md @@ -1,12 +1,60 @@ -# Text normalization component +# Normalize text -This component implements several text normalization techniques to clean and preprocess textual data: +### Description +This component implements several text normalization techniques to clean and preprocess textual +data: - Apply lowercasing: Converts all text to lowercase - Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs - Apply NFC normalization: Converts characters to their canonical representation -- Remove common seen patterns in webpages following the implementation of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) +- Remove common seen patterns in webpages following the implementation of + [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) - Remove punctuation: Strips punctuation marks from the text -These text normalization techniques are valuable for preparing text data before using it for -the training of large language models. \ No newline at end of file +These text normalization techniques are valuable for preparing text data before using it for +the training of large language models. + + +### Inputs/Outputs + +**The component comsumes:** +- text + - data: string + +**The component produces:** + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. | +| apply_nfc | bool | If true apply nfc normalization | +| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. 
counter | +| do_lowercase | bool | If true apply lowercasing | +| remove_punctuation | str | If true punctuation will be removed | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +text_normalization_op = ComponentOp.from_registry( + name="text_normalization", + arguments={ + # Add arguments + } +) +pipeline.add_op(Normalize text_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` \ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index f9d2bfabb..b99625bcc 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -1,6 +1,18 @@ -name: Normalize text. -description: A component that normalizes text. +name: Normalize text image: ghcr.io/ml6team/text_normalization:latest +description: | + This component implements several text normalization techniques to clean and preprocess textual + data: + + - Apply lowercasing: Converts all text to lowercase + - Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs + - Apply NFC normalization: Converts characters to their canonical representation + - Remove common seen patterns in webpages following the implementation of + [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) + - Remove punctuation: Strips punctuation marks from the text + + These text normalization techniques are valuable for preparing text data before using it for + the training of large language models. 
consumes: text: diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md new file mode 100644 index 000000000..b53f575bb --- /dev/null +++ b/components/write_to_hf_hub/README.md @@ -0,0 +1,50 @@ +# Write to hub + +### Description +Component that writes a dataset to the hub + +### Inputs/Outputs + +**The component comsumes:** +- dummy_variable + - data: binary + +**The component produces:** + +### Arguments + +The component takes the following arguments to alter its behavior: + +| argument | type | description | +| -------- | ---- | ----------- | +| hf_token | str | The hugging face token used to write to the hub | +| username | str | The username under which to upload the dataset | +| dataset_name | str | The name of the dataset to upload | +| image_column_names | list | A list containing the image column names. Used to format to image to HF hub format | +| column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | + +### Usage + +You can add this component to your pipeline using the following code: + +```python +from fondant.pipeline import ComponentOp + + +write_to_hf_hub_op = ComponentOp.from_registry( + name="write_to_hf_hub", + arguments={ + # Add arguments + "image_column_names": None, + "column_name_mapping": None, + } +) +pipeline.add_op(Write to hub_op, dependencies=[...]) #Add previous component as dependency +``` + +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . 
--target test +``` \ No newline at end of file From 52ee68d7754bb5bd5087fbc9eb33e945ae80aa7d Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Wed, 4 Oct 2023 19:18:07 +0200 Subject: [PATCH 3/5] Address PR comments --- components/caption_images/README.md | 24 +++++----- components/download_images/README.md | 47 +++++++++---------- components/embed_images/README.md | 20 ++++---- .../embedding_based_laion_retrieval/README.md | 29 +++++------- components/filter_comments/README.md | 20 ++++---- components/filter_image_resolution/README.md | 18 +++---- components/filter_line_length/README.md | 21 +++++---- components/image_cropping/README.md | 20 ++++---- .../image_resolution_extraction/README.md | 8 ++-- components/language_filter/README.md | 16 +++---- components/load_from_files/README.md | 15 +++--- components/load_from_hf_hub/README.md | 30 ++++++------ components/load_from_parquet/README.md | 27 ++++++----- components/minhash_generator/README.md | 16 +++---- components/pii_redaction/README.md | 8 ++-- .../prompt_based_laion_retrieval/README.md | 27 ++++++----- components/segment_images/README.md | 19 ++++---- components/text_length_filter/README.md | 18 +++---- components/text_normalization/README.md | 27 ++++++----- components/write_to_hf_hub/README.md | 29 +++++++----- scripts/component_readme/generate_readme.py | 18 ++++--- scripts/component_readme/readme_template.md | 28 +++++++---- 22 files changed, 259 insertions(+), 226 deletions(-) diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 02b2c6b9d..fef8bb921 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -3,13 +3,13 @@ ### Description This component captions images using a BLIP model from the Hugging Face hub -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - images - data: binary -**The component produces:** +**This component produces:** - captions - text: string @@ 
-17,11 +17,11 @@ This component captions images using a BLIP model from the Hugging Face hub The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| model_id | str | Id of the BLIP model on the Hugging Face hub | -| batch_size | int | Batch size to use for inference | -| max_new_tokens | int | Maximum token length of each caption | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| model_id | str | Id of the BLIP model on the Hugging Face hub | Salesforce/blip-image-captioning-base | +| batch_size | int | Batch size to use for inference | 8 | +| max_new_tokens | int | Maximum token length of each caption | 50 | ### Usage @@ -35,12 +35,12 @@ caption_images_op = ComponentOp.from_registry( name="caption_images", arguments={ # Add arguments - "model_id": Salesforce/blip-image-captioning-base, - "batch_size": 8, - "max_new_tokens": 50, + # "model_id": "Salesforce/blip-image-captioning-base", + # "batch_size": 8, + # "max_new_tokens": 50, } ) -pipeline.add_op(Caption images_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(caption_images_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/download_images/README.md b/components/download_images/README.md index 2fe88a3d1..e23779857 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -10,13 +10,13 @@ component also resizes the images using the from the img2dataset library. -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - images - url: string -**The component produces:** +**This component produces:** - images - data: binary - width: int32 @@ -26,19 +26,16 @@ from the img2dataset library. 
The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| timeout | int | Maximum time (in seconds) to wait when trying to download an image, | -| retries | int | Number of times to retry downloading an image if it fails. | -| n_connections | int | Number of concurrent connections opened per process. Decrease this number if you are running -into timeout errors. A lower number of connections can increase the success rate but lower -the throughput. - | -| image_size | int | Size of the images after resizing. | -| resize_mode | str | Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". | -| resize_only_if_bigger | bool | If True, resize only if image is bigger than image_size. | -| min_image_size | int | Minimum size of the images. | -| max_aspect_ratio | float | Maximum aspect ratio of the images. | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| timeout | int | Maximum time (in seconds) to wait when trying to download an image, | 10 | +| retries | int | Number of times to retry downloading an image if it fails. | / | +| n_connections | int | Number of concurrent connections opened per process. Decrease this number if you are running into timeout errors. A lower number of connections can increase the success rate but lower the throughput. | 100 | +| image_size | int | Size of the images after resizing. | 256 | +| resize_mode | str | Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". | border | +| resize_only_if_bigger | bool | If True, resize only if image is bigger than image_size. | False | +| min_image_size | int | Minimum size of the images. | / | +| max_aspect_ratio | float | Maximum aspect ratio of the images. 
| inf | ### Usage @@ -52,15 +49,17 @@ download_images_op = ComponentOp.from_registry( name="download_images", arguments={ # Add arguments - "timeout": 10, - "n_connections": 100, - "image_size": 256, - "resize_mode": border, - "resize_only_if_bigger": False, - "max_aspect_ratio": inf, + # "timeout": 10, + # "retries": 0, + # "n_connections": 100, + # "image_size": 256, + # "resize_mode": "border", + # "resize_only_if_bigger": "False", + # "min_image_size": 0, + # "max_aspect_ratio": "inf", } ) -pipeline.add_op(Download images_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(download_images_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing @@ -68,4 +67,4 @@ pipeline.add_op(Download images_op, dependencies=[...]) #Add previous component You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/embed_images/README.md b/components/embed_images/README.md index 0c59286e6..a4d39eec6 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -3,13 +3,13 @@ ### Description Component that generates CLIP embeddings from images -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - images - data: binary -**The component produces:** +**This component produces:** - embeddings - data: list @@ -17,10 +17,10 @@ Component that generates CLIP embeddings from images The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| model_id | str | Model id of a CLIP model on the Hugging Face hub | -| batch_size | int | Batch size to use when embedding | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| model_id | str | Model id of a CLIP model on the Hugging Face hub | openai/clip-vit-large-patch14 | +| batch_size | int | 
Batch size to use when embedding | 8 | ### Usage @@ -34,11 +34,11 @@ embed_images_op = ComponentOp.from_registry( name="embed_images", arguments={ # Add arguments - "model_id": openai/clip-vit-large-patch14, - "batch_size": 8, + # "model_id": "openai/clip-vit-large-patch14", + # "batch_size": 8, } ) -pipeline.add_op(Embed images_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(embed_images_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index 24981ec26..3d18043dd 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -5,13 +5,13 @@ This component retrieves image URLs from LAION-5B based on a set of CLIP embeddi used to find images similar to the embedded images / captions. -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - embeddings - data: list -**The component produces:** +**This component produces:** - images - url: string @@ -19,11 +19,11 @@ used to find images similar to the embedded images / captions. The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| num_images | int | Number of images to retrieve for each prompt | -| aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). | -| aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| num_images | int | Number of images to retrieve for each prompt | / | +| aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). 
| 9 | +| aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | ### Usage @@ -37,16 +37,11 @@ embedding_based_laion_retrieval_op = ComponentOp.from_registry( name="embedding_based_laion_retrieval", arguments={ # Add arguments - "aesthetic_score": 9, - "aesthetic_weight": 0.5, + # "num_images": 0, + # "aesthetic_score": 9, + # "aesthetic_weight": 0.5, } ) -pipeline.add_op(Embedding based LAION retrieval_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(embedding_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/filter_comments/README.md b/components/filter_comments/README.md index 9310c6595..8870b6ac5 100644 --- a/components/filter_comments/README.md +++ b/components/filter_comments/README.md @@ -3,22 +3,22 @@ ### Description Component that filters code based on the code to comment ratio -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - code - content: string -**The component produces:** +**This component produces no data.** ### Arguments The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| min_comments_ratio | float | The minimum code to comment ratio | -| max_comments_ratio | float | The maximum code to comment ratio | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| min_comments_ratio | float | The minimum code to comment ratio | 0.1 | +| max_comments_ratio | float | The maximum code to comment ratio | 0.9 | ### Usage @@ -32,11 +32,11 @@ filter_comments_op = ComponentOp.from_registry( name="filter_comments", arguments={ # Add arguments - "min_comments_ratio": 0.1, - 
"max_comments_ratio": 0.9, + # "min_comments_ratio": 0.1, + # "max_comments_ratio": 0.9, } ) -pipeline.add_op(Filter comments_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(filter_comments_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 445556ba3..4f7df62a9 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -3,23 +3,23 @@ ### Description Component that filters images based on minimum size and max aspect ratio -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - images - width: int32 - height: int32 -**The component produces:** +**This component produces no data.** ### Arguments The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| min_image_dim | int | Minimum image dimension | -| max_aspect_ratio | float | Maximum aspect ratio | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| min_image_dim | int | Minimum image dimension | None | +| max_aspect_ratio | float | Maximum aspect ratio | None | ### Usage @@ -33,9 +33,11 @@ filter_image_resolution_op = ComponentOp.from_registry( name="filter_image_resolution", arguments={ # Add arguments + # "min_image_dim": 0, + # "max_aspect_ratio": 0.0, } ) -pipeline.add_op(Filter image resolution_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(filter_image_resolution_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/filter_line_length/README.md b/components/filter_line_length/README.md index 3e82c1d68..96393f89d 100644 --- a/components/filter_line_length/README.md +++ b/components/filter_line_length/README.md @@ -3,25 +3,25 @@ ### Description 
Component that filters code based on line length -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - code - avg_line_length: double - max_line_length: int32 - alphanum_fraction: double -**The component produces:** +**This component produces no data.** ### Arguments The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| avg_line_length_threshold | int | Threshold for average line length to filter on | -| max_line_length_threshold | int | Threshold for maximum line length to filter on | -| alphanum_fraction_threshold | float | Alphanum fraction to filter on | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| avg_line_length_threshold | int | Threshold for average line length to filter on | None | +| max_line_length_threshold | int | Threshold for maximum line length to filter on | None | +| alphanum_fraction_threshold | float | Alphanum fraction to filter on | None | ### Usage @@ -35,9 +35,12 @@ filter_line_length_op = ComponentOp.from_registry( name="filter_line_length", arguments={ # Add arguments + # "avg_line_length_threshold": 0, + # "max_line_length_threshold": 0, + # "alphanum_fraction_threshold": 0.0, } ) -pipeline.add_op(Filter line length_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(filter_line_length_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index ccfcfae27..f9a7993e9 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -18,13 +18,13 @@ right side is border-cropped image. ![Example of image cropping by removing the single-color border. 
Left side is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png) -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - images - data: binary -**The component produces:** +**This component produces:** - images - data: binary - width: int32 @@ -34,10 +34,10 @@ right side is border-cropped image. The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| cropping_threshold | int | Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 | -| padding | int | Padding for the image cropping. The padding is added to all borders of the image. | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| cropping_threshold | int | Threshold parameter used for detecting borders. A lower (negative) parameter results in a more performant border detection, but can cause overcropping. Default is -30 | -30 | +| padding | int | Padding for the image cropping. The padding is added to all borders of the image. 
| 10 | ### Usage @@ -51,11 +51,11 @@ image_cropping_op = ComponentOp.from_registry( name="image_cropping", arguments={ # Add arguments - "cropping_threshold": -30, - "padding": 10, + # "cropping_threshold": -30, + # "padding": 10, } ) -pipeline.add_op(Image cropping_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(image_cropping_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index 6b4afeff6..cd833b966 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -3,13 +3,13 @@ ### Description Component that extracts image resolution data from the images -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - images - data: binary -**The component produces:** +**This component produces:** - images - data: binary - width: int32 @@ -33,7 +33,7 @@ image_resolution_extraction_op = ComponentOp.from_registry( # Add arguments } ) -pipeline.add_op(Image resolution extraction_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(image_resolution_extraction_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/language_filter/README.md b/components/language_filter/README.md index b508b9486..c90eefc87 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -3,21 +3,21 @@ ### Description A component that filters text based on the provided language. 
-### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - text - data: string -**The component produces:** +**This component produces no data.** ### Arguments The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| language | str | A valid language code or identifier (e.g., "en", "fr", "de"). | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| language | str | A valid language code or identifier (e.g., "en", "fr", "de"). | en | ### Usage @@ -31,10 +31,10 @@ language_filter_op = ComponentOp.from_registry( name="language_filter", arguments={ # Add arguments - "language": en, + # "language": "en", } ) -pipeline.add_op(Filter languages_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(language_filter_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index aa1126f1f..e8fd3321d 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -5,11 +5,11 @@ This component loads data from files in a local or remote (AWS S3, Azure Blob st location. It supports the following formats: .zip, gzip, tar and tar.gz. -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes no data.** -**The component produces:** +**This component produces:** - file - filename: string - content: binary @@ -18,9 +18,9 @@ location. It supports the following formats: .zip, gzip, tar and tar.gz. 
The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| directory_uri | str | Local or remote path to the directory containing the files | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| directory_uri | str | Local or remote path to the directory containing the files | None | ### Usage @@ -34,9 +34,10 @@ load_from_files_op = ComponentOp.from_registry( name="load_from_files", arguments={ # Add arguments + # "directory_uri": , } ) -pipeline.add_op(Load from files_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(load_from_files_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index a884c8711..c91b8c0ca 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -3,11 +3,11 @@ ### Description Component that loads a dataset from the hub -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes no data.** -**The component produces:** +**This component produces:** - dummy_variable - data: binary @@ -15,13 +15,13 @@ Component that loads a dataset from the hub The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| dataset_name | str | Name of dataset on the hub | -| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | -| image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | -| n_rows_to_load | int | Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale | -| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| dataset_name | str | Name of dataset on the hub | None | +| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | None | +| image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | None | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None | +| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | ### Usage @@ -35,12 +35,14 @@ load_from_hf_hub_op = ComponentOp.from_registry( name="load_from_hf_hub", arguments={ # Add arguments - "image_column_names": None, - "n_rows_to_load": None, - "index_column": None, + # "dataset_name": , + # "column_name_mapping": {}, + # "image_column_names": "None", + # "n_rows_to_load": "None", + # "index_column": "None", } ) -pipeline.add_op(Load from hub_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(load_from_hf_hub_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index 7f5d01db9..72618ba77 100644 --- a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -3,11 +3,11 @@ ### Description Component that loads a dataset from a parquet uri -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes no data.** -**The component produces:** +**This component produces:** - dummy_variable - data: 
binary @@ -15,12 +15,12 @@ Component that loads a dataset from a parquet uri The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | -| column_name_mapping | dict | Mapping of the consumed dataset | -| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | -| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | None | +| column_name_mapping | dict | Mapping of the consumed dataset | None | +| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None | +| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | ### Usage @@ -34,12 +34,13 @@ load_from_parquet_op = ComponentOp.from_registry( name="load_from_parquet", arguments={ # Add arguments - "column_name_mapping": None, - "n_rows_to_load": None, - "index_column": None, + # "dataset_uri": , + # "column_name_mapping": "None", + # "n_rows_to_load": "None", + # "index_column": "None", } ) -pipeline.add_op(Load from parquet_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(load_from_parquet_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 1240ec7c4..1de50836b 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -3,13 +3,13 @@ ### Description A component that 
generates minhashes of text. -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - text - data: string -**The component produces:** +**This component produces:** - text - minhash: list @@ -17,9 +17,9 @@ A component that generates minhashes of text. The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| shingle_ngram_size | int | Define size of ngram used for the shingle generation | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| shingle_ngram_size | int | Define size of ngram used for the shingle generation | 3 | ### Usage @@ -33,10 +33,10 @@ minhash_generator_op = ComponentOp.from_registry( name="minhash_generator", arguments={ # Add arguments - "shingle_ngram_size": 3, + # "shingle_ngram_size": 3, } ) -pipeline.add_op(MinHash generator_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(minhash_generator_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/pii_redaction/README.md b/components/pii_redaction/README.md index f3a13d26e..3b4cbf2d4 100644 --- a/components/pii_redaction/README.md +++ b/components/pii_redaction/README.md @@ -23,13 +23,13 @@ A component that detects and redacts Personal Identifiable Information (PII) fro code. 
-### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - code - content: string -**The component produces:** +**This component produces:** - code - content: string @@ -51,7 +51,7 @@ pii_redaction_op = ComponentOp.from_registry( # Add arguments } ) -pipeline.add_op(PII redaction_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(pii_redaction_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index 63f4813d7..814d1557a 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -8,13 +8,13 @@ the prompt sentences and the captions in the LAION dataset. This component doesn’t return the actual images, only URLs. -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - prompts - text: string -**The component produces:** +**This component produces:** - images - url: string @@ -22,12 +22,12 @@ This component doesn’t return the actual images, only URLs. The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| num_images | int | Number of images to retrieve for each prompt | -| aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). | -| aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | -| url | str | The url of the backend clip retrieval service, defaults to the public service | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| num_images | int | Number of images to retrieve for each prompt | None | +| aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). 
| 9 | +| aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | +| url | str | The url of the backend clip retrieval service, defaults to the public service | https://knn.laion.ai/knn-service | ### Usage @@ -41,12 +41,13 @@ prompt_based_laion_retrieval_op = ComponentOp.from_registry( name="prompt_based_laion_retrieval", arguments={ # Add arguments - "aesthetic_score": 9, - "aesthetic_weight": 0.5, - "url": https://knn.laion.ai/knn-service, + # "num_images": 0, + # "aesthetic_score": 9, + # "aesthetic_weight": 0.5, + # "url": "https://knn.laion.ai/knn-service", } ) -pipeline.add_op(LAION retrieval_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(prompt_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/segment_images/README.md b/components/segment_images/README.md index 4524e4672..66ea287d8 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -3,13 +3,13 @@ ### Description Component that creates segmentation masks for images using a model from the Hugging Face hub -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - images - data: binary -**The component produces:** +**This component produces:** - segmentations - data: binary @@ -17,10 +17,10 @@ Component that creates segmentation masks for images using a model from the Hugg The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| model_id | str | id of the model on the Hugging Face hub | -| batch_size | int | batch size to use | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| model_id | str | id of the model on the Hugging Face hub | openmmlab/upernet-convnext-small | +| batch_size | int | batch size to use | None | ### Usage @@ -34,10 +34,11 @@ 
segment_images_op = ComponentOp.from_registry( name="segment_images", arguments={ # Add arguments - "model_id": openmmlab/upernet-convnext-small, + # "model_id": "openmmlab/upernet-convnext-small", + # "batch_size": 0, } ) -pipeline.add_op(Segment images_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(segment_images_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/text_length_filter/README.md b/components/text_length_filter/README.md index c1d89619b..86d3111ee 100644 --- a/components/text_length_filter/README.md +++ b/components/text_length_filter/README.md @@ -3,22 +3,22 @@ ### Description A component that filters out text based on their length -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - text - data: string -**The component produces:** +**This component produces no data.** ### Arguments The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| min_characters_length | int | Minimum number of characters | -| min_words_length | int | Mininum number of words | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| min_characters_length | int | Minimum number of characters | None | +| min_words_length | int | Minimum number of words | None | ### Usage @@ -32,9 +32,11 @@ text_length_filter_op = ComponentOp.from_registry( name="text_length_filter", arguments={ # Add arguments + # "min_characters_length": 0, + # "min_words_length": 0, } ) -pipeline.add_op(Filter text length_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(text_length_filter_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md index 8c70e585e..8d2723d4e 100644 ---
a/components/text_normalization/README.md +++ b/components/text_normalization/README.md @@ -15,25 +15,25 @@ These text normalization techniques are valuable for preparing text data before the training of large language models. -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - text - data: string -**The component produces:** +**This component produces no data.** ### Arguments The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. | -| apply_nfc | bool | If true apply nfc normalization | -| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter | -| do_lowercase | bool | If true apply lowercasing | -| remove_punctuation | str | If true punctuation will be removed | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. | None | +| apply_nfc | bool | If true apply nfc normalization | None | +| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to remove common patterns in webpages, e.g.
counter | None | +| do_lowercase | bool | If true apply lowercasing | None | +| remove_punctuation | str | If true punctuation will be removed | None | ### Usage @@ -47,9 +47,14 @@ text_normalization_op = ComponentOp.from_registry( name="text_normalization", arguments={ # Add arguments + # "remove_additional_whitespaces": False, + # "apply_nfc": False, + # "normalize_lines": False, + # "do_lowercase": False, + # "remove_punctuation": , } ) -pipeline.add_op(Normalize text_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(text_normalization_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index b53f575bb..3ab86b2b9 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -3,25 +3,25 @@ ### Description Component that writes a dataset to the hub -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +**This component consumes:** - dummy_variable - data: binary -**The component produces:** +**This component produces no data.** ### Arguments The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- | -| hf_token | str | The hugging face token used to write to the hub | -| username | str | The username under which to upload the dataset | -| dataset_name | str | The name of the dataset to upload | -| image_column_names | list | A list containing the image column names. 
Used to format to image to HF hub format | -| column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | +| hf_token | str | The hugging face token used to write to the hub | None | +| username | str | The username under which to upload the dataset | None | +| dataset_name | str | The name of the dataset to upload | None | +| image_column_names | list | A list containing the image column names. Used to format the image to HF hub format | None | +| column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | None | ### Usage @@ -35,11 +35,14 @@ write_to_hf_hub_op = ComponentOp.from_registry( name="write_to_hf_hub", arguments={ # Add arguments - "image_column_names": None, - "column_name_mapping": None, + # "hf_token": , + # "username": , + # "dataset_name": , + # "image_column_names": "None", + # "column_name_mapping": "None", } ) -pipeline.add_op(Write to hub_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op(write_to_hf_hub_op, dependencies=[...]) #Add previous component as dependency ``` ### Testing diff --git a/scripts/component_readme/generate_readme.py b/scripts/component_readme/generate_readme.py index 831c66bab..72e4aaba4 100644 --- a/scripts/component_readme/generate_readme.py +++ b/scripts/component_readme/generate_readme.py @@ -1,4 +1,5 @@ import argparse +import ast from pathlib import Path import jinja2 @@ -9,18 +10,23 @@ def read_component_spec(component_dir: Path) -> ComponentSpec: return ComponentSpec.from_file(component_dir / "fondant_component.yaml") -def generate_readme(component_spec: ComponentSpec, *, id_: str) -> str: - template_path = Path(__file__).with_name("readme_template.md") - with open(template_path, "r") as f: - template = jinja2.Template(f.read(), trim_blocks=True) +def generate_readme(component_spec: ComponentSpec, *,
component_dir: Path) -> str: + env = jinja2.Environment( + loader=jinja2.loaders.FileSystemLoader(Path(__file__).parent), + trim_blocks=True + ) + env.filters["eval"] = eval + + template = env.get_template("readme_template.md") return template.render( - id=id_, + id=component_dir.name, name=component_spec.name, description=component_spec.description, consumes=component_spec.consumes, produces=component_spec.produces, arguments=component_spec.args.values(), + tests=(component_dir / "tests").exists() ) @@ -31,7 +37,7 @@ def write_readme(readme: str, component_dir: Path) -> None: def main(component_dir: Path): component_spec = read_component_spec(component_dir) - readme = generate_readme(component_spec, id_=component_dir.name) + readme = generate_readme(component_spec, component_dir=component_dir) write_readme(readme, component_dir=component_dir) diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md index a0d6e7ae7..672ef6faa 100644 --- a/scripts/component_readme/readme_template.md +++ b/scripts/component_readme/readme_template.md @@ -3,33 +3,41 @@ ### Description {{ description }} -### Inputs/Outputs +### Inputs / outputs -**The component comsumes:** +{% if consumes %} +**This component consumes:** {% for subset_name, subset in consumes.items() %} - {{ subset_name }} {% for field in subset.fields.values() %} - {{ field.name }}: {{ field.type.value }} {% endfor %} {% endfor %} +{% else %} +**This component consumes no data.** +{% endif %} -**The component produces:** +{% if produces %} +**This component produces:** {% for subset_name, subset in produces.items() %} - {{ subset_name }} {% for field in subset.fields.values() %} - {{ field.name }}: {{ field.type.value }} {% endfor %} {% endfor %} +{% else %} +**This component produces no data.** +{% endif %} ### Arguments {% if arguments %} The component takes the following arguments to alter its behavior: -| argument | type | description | -| -------- | ---- | ----------- 
| +| argument | type | description | default | +| -------- | ---- | ----------- | ------- | {% for argument in arguments %} -| {{ argument.name }} | {{ argument.type }} | {{ argument.description }} | +| {{ argument.name }} | {{ argument.type }} | {{ argument.description.replace("\n", "") }} | {{ argument.default or "/" }} | {% endfor %} {% else %} This component takes no arguments. @@ -49,17 +57,21 @@ from fondant.pipeline import ComponentOp # Add arguments {% for argument in arguments %} {% if argument.default %} - "{{ argument.name }}": {{ argument.default }}, + # "{{ argument.name }}": {{ '\"' + argument.default + '\"' if argument.default is string else argument.default }}, +{% else %} + # "{{ argument.name }}": {{ (argument.type|eval)() }}, {% endif %} {% endfor %} } ) -pipeline.add_op({{ name }}_op, dependencies=[...]) #Add previous component as dependency +pipeline.add_op({{ id }}_op, dependencies=[...]) #Add previous component as dependency ``` +{% if tests %} ### Testing You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . 
--target test ``` +{% endif %} From 8c86a54866f1be85752f717b62a594cf5fbab966 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Wed, 4 Oct 2023 19:26:37 +0200 Subject: [PATCH 4/5] Update test files --- .../example_2/docker-compose.yml | 16 +- .../example_2/kubeflow_pipeline.yml | 225 ++++++++++-------- 2 files changed, 139 insertions(+), 102 deletions(-) diff --git a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml index a662b8311..1452bde94 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml @@ -41,8 +41,20 @@ services: - --cluster_type - default - --component_spec - - '{"name": "Image cropping", "description": "Component that removes single-colored - borders around images and crops them appropriately", "image": "ghcr.io/ml6team/image_cropping:dev", + - '{"name": "Image cropping", "image": "ghcr.io/ml6team/image_cropping:dev", "description": + "This component crops out image borders. This is typically useful when working + with graphical \nimages that have single-color borders (e.g. logos, icons, etc.).\n\nThe + component takes an image and calculates which color is most present in the border. + It then \ncrops the image in order to minimize this single-color border. The + `padding` argument will add \nextra border to the image before cropping it, + in order to avoid cutting off parts of the image.\nThe resulting crop will always + be square. If a crop is not possible, the component will return \nthe original + image.\n\n#### Examples\nExamples of image cropping by removing the single-color + border. Left side is original image, \nright side is border-cropped image.\n\n![Example + of image cropping by removing the single-color border. 
Left side is original, + right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example + of image cropping by removing the single-color border. Left side is original, + right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n", "consumes": {"images": {"fields": {"data": {"type": "binary"}}}}, "produces": {"images": {"fields": {"data": {"type": "binary"}, "width": {"type": "int32"}, "height": {"type": "int32"}}}}, "args": {"cropping_threshold": {"description": diff --git a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml index c583ce539..02cade54a 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/kubeflow_pipeline.yml @@ -1,21 +1,16 @@ apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: - annotations: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00' - pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test - pipeline", "name": "test_pipeline"}' generateName: test-pipeline- - labels: - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 + annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.22, pipelines.kubeflow.org/pipeline_compilation_time: '2023-01-01T00:00:00', + pipelines.kubeflow.org/pipeline_spec: '{"description": "description of the test + pipeline", "name": "test_pipeline"}'} + labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.22} spec: - arguments: - parameters: [] entrypoint: test-pipeline - serviceAccountName: pipeline-runner templates: - - container: + - name: first-component + container: args: [] command: - fondant @@ -34,7 +29,7 @@ spec: - --input_partition_rows - None - --cache - - 'False' + - "False" - --storage_args - a dummy string arg - --output_manifest_path 
@@ -49,27 +44,21 @@ spec: artifacts: - name: input_manifest_path path: /tmp/inputs/input_manifest_path/data - raw: - data: '' + raw: {data: ''} + outputs: + artifacts: + - {name: first-component-output_manifest_path, path: /tmp/outputs/output_manifest_path/data} metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "False", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"storage_args\": - {\"description\": \"Storage arguments\", \"type\": \"str\"}}, \"description\": - \"This is an example component\", \"image\": \"example_component:latest\", - \"name\": \"First component\", \"produces\": {\"captions\": {\"fields\": - {\"data\": {\"type\": \"string\"}}}, \"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}}}}}", "input_partition_rows": "None", "metadata": - "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", \"run_id\": - \"test_pipeline-20230101000000\", \"component_id\": \"first_component\", - \"cache_key\": \"1\"}", "storage_args": "a dummy string arg"}' - pipelines.kubeflow.org/component_ref: '{"digest": "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}' - pipelines.kubeflow.org/component_spec: '{"description": "This is an example - component", "implementation": {"container": {"command": ["fondant", "execute", - "main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", - {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, - "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", - {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, + labels: + pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 + pipelines.kubeflow.org/pipeline-sdk-type: kfp + pipelines.kubeflow.org/enable_caching: "true" + annotations: {pipelines.kubeflow.org/component_spec: '{"description": "This + is an example component", "implementation": {"container": {"command": ["fondant", + "execute", 
"main", "--input_manifest_path", {"inputPath": "input_manifest_path"}, + "--metadata", {"inputValue": "metadata"}, "--component_spec", {"inputValue": + "component_spec"}, "--input_partition_rows", {"inputValue": "input_partition_rows"}, + "--cache", {"inputValue": "cache"}, "--storage_args", {"inputValue": "storage_args"}, "--output_manifest_path", {"outputPath": "output_manifest_path"}, "--cluster_type", {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": "client_kwargs"}], "image": "example_component:latest"}}, "inputs": [{"description": "Path @@ -87,17 +76,19 @@ spec: dask client", "name": "client_kwargs", "type": "JsonObject"}, {"description": "Storage arguments", "name": "storage_args", "type": "String"}], "name": "First component", "outputs": [{"description": "Path to the output manifest", - "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: first-component - outputs: - artifacts: - - name: first-component-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - container: + "name": "output_manifest_path", "type": "String"}]}', pipelines.kubeflow.org/component_ref: '{"digest": + "ba182d1dd6a5f8fdffb3c9e487c84d1d1b9ebbfe4b5a137a4af02be832c0c820"}', pipelines.kubeflow.org/arguments.parameters: '{"cache": + "False", "client_kwargs": "{}", "cluster_type": "default", "component_spec": + "{\"args\": {\"storage_args\": {\"description\": \"Storage arguments\", + \"type\": \"str\"}}, \"description\": \"This is an example component\", + \"image\": \"example_component:latest\", \"name\": \"First component\", + \"produces\": {\"captions\": {\"fields\": {\"data\": {\"type\": \"string\"}}}, + \"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}}}}}", "input_partition_rows": + "None", "metadata": "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", + \"run_id\": 
\"test_pipeline-20230101000000\", \"component_id\": \"first_component\", + \"cache_key\": \"1\"}", "storage_args": "a dummy string arg"}'} + - name: image-cropping + container: args: [] command: - fondant @@ -115,14 +106,27 @@ spec: is -30", "type": "int"}, "padding": {"default": 10, "description": "Padding for the image cropping. The padding is added to all borders of the image.", "type": "int"}}, "consumes": {"images": {"fields": {"data": {"type": "binary"}}}}, - "description": "Component that removes single-colored borders around images - and crops them appropriately", "image": "ghcr.io/ml6team/image_cropping:dev", - "name": "Image cropping", "produces": {"images": {"fields": {"data": {"type": - "binary"}, "height": {"type": "int32"}, "width": {"type": "int32"}}}}}' + "description": "This component crops out image borders. This is typically + useful when working with graphical \nimages that have single-color borders + (e.g. logos, icons, etc.).\n\nThe component takes an image and calculates + which color is most present in the border. It then \ncrops the image in order + to minimize this single-color border. The `padding` argument will add \nextra + border to the image before cropping it, in order to avoid cutting off parts + of the image.\nThe resulting crop will always be square. If a crop is not + possible, the component will return \nthe original image.\n\n#### Examples\nExamples + of image cropping by removing the single-color border. Left side is original + image, \nright side is border-cropped image.\n\n![Example of image cropping + by removing the single-color border. Left side is original, right side is + cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example + of image cropping by removing the single-color border. 
Left side is original, + right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n", + "image": "ghcr.io/ml6team/image_cropping:dev", "name": "Image cropping", "produces": + {"images": {"fields": {"data": {"type": "binary"}, "height": {"type": "int32"}, + "width": {"type": "int32"}}}}}' - --input_partition_rows - None - --cache - - 'True' + - "True" - --cropping_threshold - '0' - --padding @@ -137,38 +141,38 @@ spec: imagePullPolicy: Always inputs: artifacts: - - name: first-component-output_manifest_path - path: /tmp/inputs/input_manifest_path/data + - {name: first-component-output_manifest_path, path: /tmp/inputs/input_manifest_path/data} + outputs: + artifacts: + - {name: image-cropping-output_manifest_path, path: /tmp/outputs/output_manifest_path/data} metadata: - annotations: - pipelines.kubeflow.org/arguments.parameters: '{"cache": "True", "client_kwargs": - "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"cropping_threshold\": - {\"default\": -30, \"description\": \"Threshold parameter used for detecting - borders. A lower (negative) parameter results in a more performant border - detection, but can cause overcropping. Default is -30\", \"type\": \"int\"}, - \"padding\": {\"default\": 10, \"description\": \"Padding for the image - cropping. 
The padding is added to all borders of the image.\", \"type\": - \"int\"}}, \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": - \"binary\"}}}}, \"description\": \"Component that removes single-colored - borders around images and crops them appropriately\", \"image\": \"ghcr.io/ml6team/image_cropping:dev\", - \"name\": \"Image cropping\", \"produces\": {\"images\": {\"fields\": {\"data\": - {\"type\": \"binary\"}, \"height\": {\"type\": \"int32\"}, \"width\": {\"type\": - \"int32\"}}}}}", "cropping_threshold": "0", "input_partition_rows": "None", - "metadata": "{\"base_path\": \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", - \"run_id\": \"test_pipeline-20230101000000\", \"component_id\": \"image_cropping\", - \"cache_key\": \"2\"}", "padding": "0"}' - pipelines.kubeflow.org/component_ref: '{"digest": "d31e5d546956a42a470f033e2be84f229d3e926dfa7a7a1703c94ff47a1cb992"}' - pipelines.kubeflow.org/component_spec: '{"description": "Component that removes - single-colored borders around images and crops them appropriately", "implementation": - {"container": {"command": ["fondant", "execute", "main", "--input_manifest_path", - {"inputPath": "input_manifest_path"}, "--metadata", {"inputValue": "metadata"}, - "--component_spec", {"inputValue": "component_spec"}, "--input_partition_rows", - {"inputValue": "input_partition_rows"}, "--cache", {"inputValue": "cache"}, - "--cropping_threshold", {"inputValue": "cropping_threshold"}, "--padding", - {"inputValue": "padding"}, "--output_manifest_path", {"outputPath": "output_manifest_path"}, - "--cluster_type", {"inputValue": "cluster_type"}, "--client_kwargs", {"inputValue": - "client_kwargs"}], "image": "ghcr.io/ml6team/image_cropping:dev"}}, "inputs": - [{"description": "Path to the input manifest", "name": "input_manifest_path", + labels: + pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 + pipelines.kubeflow.org/pipeline-sdk-type: kfp + pipelines.kubeflow.org/enable_caching: "true" + annotations: 
{pipelines.kubeflow.org/component_spec: '{"description": "This + component crops out image borders. This is typically useful when working + with graphical \nimages that have single-color borders (e.g. logos, icons, + etc.).\n\nThe component takes an image and calculates which color is most + present in the border. It then \ncrops the image in order to minimize this + single-color border. The `padding` argument will add \nextra border to the + image before cropping it, in order to avoid cutting off parts of the image.\nThe + resulting crop will always be square. If a crop is not possible, the component + will return \nthe original image.\n\n#### Examples\nExamples of image cropping + by removing the single-color border. Left side is original image, \nright + side is border-cropped image.\n\n![Example of image cropping by removing + the single-color border. Left side is original, right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\n![Example + of image cropping by removing the single-color border. 
Left side is original, + right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\n", + "implementation": {"container": {"command": ["fondant", "execute", "main", + "--input_manifest_path", {"inputPath": "input_manifest_path"}, "--metadata", + {"inputValue": "metadata"}, "--component_spec", {"inputValue": "component_spec"}, + "--input_partition_rows", {"inputValue": "input_partition_rows"}, "--cache", + {"inputValue": "cache"}, "--cropping_threshold", {"inputValue": "cropping_threshold"}, + "--padding", {"inputValue": "padding"}, "--output_manifest_path", {"outputPath": + "output_manifest_path"}, "--cluster_type", {"inputValue": "cluster_type"}, + "--client_kwargs", {"inputValue": "client_kwargs"}], "image": "ghcr.io/ml6team/image_cropping:dev"}}, + "inputs": [{"description": "Path to the input manifest", "name": "input_manifest_path", "type": "String"}, {"description": "Metadata arguments containing the run id and base path", "name": "metadata", "type": "String"}, {"default": "None", "description": "The component specification as a dictionary", "name": "component_spec", @@ -186,26 +190,47 @@ spec: "Integer"}, {"default": 10, "description": "Padding for the image cropping. 
The padding is added to all borders of the image.", "name": "padding", "type": "Integer"}], "name": "Image cropping", "outputs": [{"description": "Path - to the output manifest", "name": "output_manifest_path", "type": "String"}]}' - labels: - pipelines.kubeflow.org/enable_caching: 'true' - pipelines.kubeflow.org/kfp_sdk_version: 1.8.22 - pipelines.kubeflow.org/pipeline-sdk-type: kfp - name: image-cropping - outputs: - artifacts: - - name: image-cropping-output_manifest_path - path: /tmp/outputs/output_manifest_path/data - - dag: + to the output manifest", "name": "output_manifest_path", "type": "String"}]}', + pipelines.kubeflow.org/component_ref: '{"digest": "bd073ded3bbd5c9bc5fd3abd3b8e8d19d65c17d6914f117596c78a5eddbd99d0"}', + pipelines.kubeflow.org/arguments.parameters: '{"cache": "True", "client_kwargs": + "{}", "cluster_type": "default", "component_spec": "{\"args\": {\"cropping_threshold\": + {\"default\": -30, \"description\": \"Threshold parameter used for detecting + borders. A lower (negative) parameter results in a more performant border + detection, but can cause overcropping. Default is -30\", \"type\": \"int\"}, + \"padding\": {\"default\": 10, \"description\": \"Padding for the image + cropping. The padding is added to all borders of the image.\", \"type\": + \"int\"}}, \"consumes\": {\"images\": {\"fields\": {\"data\": {\"type\": + \"binary\"}}}}, \"description\": \"This component crops out image borders. + This is typically useful when working with graphical \\nimages that have + single-color borders (e.g. logos, icons, etc.).\\n\\nThe component takes + an image and calculates which color is most present in the border. It then + \\ncrops the image in order to minimize this single-color border. The `padding` + argument will add \\nextra border to the image before cropping it, in order + to avoid cutting off parts of the image.\\nThe resulting crop will always + be square. 
If a crop is not possible, the component will return \\nthe original + image.\\n\\n#### Examples\\nExamples of image cropping by removing the single-color + border. Left side is original image, \\nright side is border-cropped image.\\n\\n![Example + of image cropping by removing the single-color border. Left side is original, + right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_1.png)\\n![Example + of image cropping by removing the single-color border. Left side is original, + right side is cropped image](../../docs/art/components/image_cropping/component_border_crop_0.png)\\n\", + \"image\": \"ghcr.io/ml6team/image_cropping:dev\", \"name\": \"Image cropping\", + \"produces\": {\"images\": {\"fields\": {\"data\": {\"type\": \"binary\"}, + \"height\": {\"type\": \"int32\"}, \"width\": {\"type\": \"int32\"}}}}}", + "cropping_threshold": "0", "input_partition_rows": "None", "metadata": "{\"base_path\": + \"/foo/bar\", \"pipeline_name\": \"test_pipeline\", \"run_id\": \"test_pipeline-20230101000000\", + \"component_id\": \"image_cropping\", \"cache_key\": \"2\"}", "padding": + "0"}'} + - name: test-pipeline + dag: tasks: - - name: first-component - template: first-component - - arguments: - artifacts: - - from: '{{tasks.first-component.outputs.artifacts.first-component-output_manifest_path}}' - name: first-component-output_manifest_path - dependencies: - - first-component - name: image-cropping + - {name: first-component, template: first-component} + - name: image-cropping template: image-cropping - name: test-pipeline + dependencies: [first-component] + arguments: + artifacts: + - {name: first-component-output_manifest_path, from: '{{tasks.first-component.outputs.artifacts.first-component-output_manifest_path}}'} + arguments: + parameters: [] + serviceAccountName: pipeline-runner From c566b3b3d1a2ef0a153b7c2a8485e965ad0a3b3b Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Wed, 4 Oct 2023 19:36:22 +0200 Subject: [PATCH 5/5] 
Add readme generation as pre-commit hook --- .pre-commit-config.yaml | 12 ++++++++++-- components/caption_images/README.md | 2 +- components/embed_images/README.md | 6 ------ components/filter_comments/README.md | 6 ------ components/filter_image_resolution/README.md | 10 ++-------- components/filter_line_length/README.md | 12 +++--------- components/image_cropping/README.md | 6 ------ .../image_resolution_extraction/README.md | 6 ------ components/language_filter/README.md | 2 +- components/load_from_files/README.md | 4 ++-- components/load_from_hf_hub/README.md | 10 ++-------- components/load_from_parquet/README.md | 8 +------- components/minhash_generator/README.md | 2 +- components/pii_redaction/README.md | 6 ------ .../prompt_based_laion_retrieval/README.md | 8 +------- components/segment_images/README.md | 8 +------- components/text_length_filter/README.md | 6 +++--- components/text_normalization/README.md | 12 ++++++------ components/write_to_hf_hub/README.md | 12 +++--------- scripts/component_readme/generate_readme.py | 18 ++++++++++-------- 20 files changed, 47 insertions(+), 109 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5876f3b6..adfaf43b3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,6 @@ repos: "--exit-non-zero-on-fix", ] - - repo: https://github.com/PyCQA/bandit rev: 1.7.4 hooks: @@ -55,4 +54,13 @@ repos: - types-jsonschema - types-PyYAML - types-requests - pass_filenames: false \ No newline at end of file + pass_filenames: false + + - repo: local + hooks: + - id: generate_component_readmes + name: Generate component READMEs + language: python + entry: python scripts/component_readme/generate_readme.py + files: ^components/.*/fondant_component.yaml + additional_dependencies: ["fondant"] \ No newline at end of file diff --git a/components/caption_images/README.md b/components/caption_images/README.md index fef8bb921..b65ec6176 100644 --- a/components/caption_images/README.md +++ 
b/components/caption_images/README.md @@ -48,4 +48,4 @@ pipeline.add_op(caption_images_op, dependencies=[...]) #Add previous component You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/embed_images/README.md b/components/embed_images/README.md index a4d39eec6..c805533ca 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -41,9 +41,3 @@ embed_images_op = ComponentOp.from_registry( pipeline.add_op(embed_images_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/filter_comments/README.md b/components/filter_comments/README.md index 8870b6ac5..dff4c6730 100644 --- a/components/filter_comments/README.md +++ b/components/filter_comments/README.md @@ -39,9 +39,3 @@ filter_comments_op = ComponentOp.from_registry( pipeline.add_op(filter_comments_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . 
--target test -``` \ No newline at end of file diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 4f7df62a9..3074d74af 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -18,8 +18,8 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| min_image_dim | int | Minimum image dimension | None | -| max_aspect_ratio | float | Maximum aspect ratio | None | +| min_image_dim | int | Minimum image dimension | / | +| max_aspect_ratio | float | Maximum aspect ratio | / | ### Usage @@ -40,9 +40,3 @@ filter_image_resolution_op = ComponentOp.from_registry( pipeline.add_op(filter_image_resolution_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/filter_line_length/README.md b/components/filter_line_length/README.md index 96393f89d..340cf486f 100644 --- a/components/filter_line_length/README.md +++ b/components/filter_line_length/README.md @@ -19,9 +19,9 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| avg_line_length_threshold | int | Threshold for average line length to filter on | None | -| max_line_length_threshold | int | Threshold for maximum line length to filter on | None | -| alphanum_fraction_threshold | float | Alphanum fraction to filter on | None | +| avg_line_length_threshold | int | Threshold for average line length to filter on | / | +| max_line_length_threshold | int | Threshold for maximum line length to filter on | / | +| alphanum_fraction_threshold | float | Alphanum fraction to filter on | / | ### Usage @@ -43,9 +43,3 @@ 
filter_line_length_op = ComponentOp.from_registry( pipeline.add_op(filter_line_length_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index f9a7993e9..31b8f8c00 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -58,9 +58,3 @@ image_cropping_op = ComponentOp.from_registry( pipeline.add_op(image_cropping_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index cd833b966..2ec9e4b14 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -36,9 +36,3 @@ image_resolution_extraction_op = ComponentOp.from_registry( pipeline.add_op(image_resolution_extraction_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/language_filter/README.md b/components/language_filter/README.md index c90eefc87..1409e8c08 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -42,4 +42,4 @@ pipeline.add_op(language_filter_op, dependencies=[...]) #Add previous component You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . 
--target test -``` \ No newline at end of file +``` diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index e8fd3321d..89a857fd7 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -20,7 +20,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| directory_uri | str | Local or remote path to the directory containing the files | None | +| directory_uri | str | Local or remote path to the directory containing the files | / | ### Usage @@ -45,4 +45,4 @@ pipeline.add_op(load_from_files_op, dependencies=[...]) #Add previous component You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index c91b8c0ca..a59f95540 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -17,8 +17,8 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| dataset_name | str | Name of dataset on the hub | None | -| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | None | +| dataset_name | str | Name of dataset on the hub | / | +| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | / | | image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | None | | n_rows_to_load | int | Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale | None | | index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | @@ -45,9 +45,3 @@ load_from_hf_hub_op = ComponentOp.from_registry( pipeline.add_op(load_from_hf_hub_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index 72618ba77..57d826707 100644 --- a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -17,7 +17,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | None | +| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | / | | column_name_mapping | dict | Mapping of the consumed dataset | None | | n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None | | index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | @@ -43,9 +43,3 @@ load_from_parquet_op = ComponentOp.from_registry( pipeline.add_op(load_from_parquet_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . 
--target test -``` \ No newline at end of file diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 1de50836b..95766e907 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -44,4 +44,4 @@ pipeline.add_op(minhash_generator_op, dependencies=[...]) #Add previous compone You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/pii_redaction/README.md b/components/pii_redaction/README.md index 3b4cbf2d4..347a27aa4 100644 --- a/components/pii_redaction/README.md +++ b/components/pii_redaction/README.md @@ -54,9 +54,3 @@ pii_redaction_op = ComponentOp.from_registry( pipeline.add_op(pii_redaction_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index 814d1557a..a7467ead4 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -24,7 +24,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| num_images | int | Number of images to retrieve for each prompt | None | +| num_images | int | Number of images to retrieve for each prompt | / | | aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). 
| 9 | | aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | | url | str | The url of the backend clip retrieval service, defaults to the public service | https://knn.laion.ai/knn-service | @@ -50,9 +50,3 @@ prompt_based_laion_retrieval_op = ComponentOp.from_registry( pipeline.add_op(prompt_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/segment_images/README.md b/components/segment_images/README.md index 66ea287d8..fdb057414 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -20,7 +20,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | | model_id | str | id of the model on the Hugging Face hub | openmmlab/upernet-convnext-small | -| batch_size | int | batch size to use | None | +| batch_size | int | batch size to use | / | ### Usage @@ -41,9 +41,3 @@ segment_images_op = ComponentOp.from_registry( pipeline.add_op(segment_images_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . 
--target test -``` \ No newline at end of file diff --git a/components/text_length_filter/README.md b/components/text_length_filter/README.md index 86d3111ee..5eec6a9a7 100644 --- a/components/text_length_filter/README.md +++ b/components/text_length_filter/README.md @@ -17,8 +17,8 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| min_characters_length | int | Minimum number of characters | None | -| min_words_length | int | Mininum number of words | None | +| min_characters_length | int | Minimum number of characters | / | +| min_words_length | int | Mininum number of words | / | ### Usage @@ -44,4 +44,4 @@ pipeline.add_op(text_length_filter_op, dependencies=[...]) #Add previous compon You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md index 8d2723d4e..79fad72a2 100644 --- a/components/text_normalization/README.md +++ b/components/text_normalization/README.md @@ -29,11 +29,11 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. | None | -| apply_nfc | bool | If true apply nfc normalization | None | -| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter | None | -| do_lowercase | bool | If true apply lowercasing | None | -| remove_punctuation | str | If true punctuation will be removed | None | +| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. 
| / | +| apply_nfc | bool | If true apply nfc normalization | / | +| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter | / | +| do_lowercase | bool | If true apply lowercasing | / | +| remove_punctuation | str | If true punctuation will be removed | / | ### Usage @@ -62,4 +62,4 @@ pipeline.add_op(text_normalization_op, dependencies=[...]) #Add previous compon You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 3ab86b2b9..318b8ee5b 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -17,9 +17,9 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| hf_token | str | The hugging face token used to write to the hub | None | -| username | str | The username under which to upload the dataset | None | -| dataset_name | str | The name of the dataset to upload | None | +| hf_token | str | The hugging face token used to write to the hub | / | +| username | str | The username under which to upload the dataset | / | +| dataset_name | str | The name of the dataset to upload | / | | image_column_names | list | A list containing the image column names. Used to format to image to HF hub format | None | | column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | None | @@ -45,9 +45,3 @@ write_to_hf_hub_op = ComponentOp.from_registry( pipeline.add_op(write_to_hf_hub_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . 
--target test -``` \ No newline at end of file diff --git a/scripts/component_readme/generate_readme.py b/scripts/component_readme/generate_readme.py index 72e4aaba4..7edd5902c 100644 --- a/scripts/component_readme/generate_readme.py +++ b/scripts/component_readme/generate_readme.py @@ -1,13 +1,12 @@ import argparse -import ast from pathlib import Path import jinja2 from fondant.component_spec import ComponentSpec -def read_component_spec(component_dir: Path) -> ComponentSpec: - return ComponentSpec.from_file(component_dir / "fondant_component.yaml") +def read_component_spec(component_spec_path: Path) -> ComponentSpec: + return ComponentSpec.from_file(component_spec_path) def generate_readme(component_spec: ComponentSpec, *, component_dir: Path) -> str: @@ -35,17 +34,20 @@ def write_readme(readme: str, component_dir: Path) -> None: f.write(readme) -def main(component_dir: Path): - component_spec = read_component_spec(component_dir) +def main(component_spec_path: Path): + component_spec = read_component_spec(component_spec_path) + component_dir = component_spec_path.parent readme = generate_readme(component_spec, component_dir=component_dir) write_readme(readme, component_dir=component_dir) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-d", "--component_dir", + parser.add_argument("component_specs", + nargs="+", type=Path, - help="Path to the component to generate a readme for") + help="Path to the component spec to generate a readme from") args = parser.parse_args() - main(args.component_dir) + for spec in args.component_specs: + main(spec)