From 52cdbb2e9c55b83a486187c136ef532fd44e57e4 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Thu, 5 Oct 2023 11:03:23 +0200 Subject: [PATCH] Add component hub doc page (#487) This PR adds a script and template to automatically add a component hub page to our docs. --- components/caption_images/README.md | 6 +- components/download_images/README.md | 10 ++- components/embed_images/README.md | 6 +- .../embedding_based_laion_retrieval/README.md | 6 +- components/filter_comments/README.md | 3 +- components/filter_image_resolution/README.md | 5 +- components/filter_line_length/README.md | 7 +- components/image_cropping/README.md | 10 ++- .../image_resolution_extraction/README.md | 10 ++- components/language_filter/README.md | 3 +- components/load_from_files/README.md | 5 +- components/load_from_hf_hub/README.md | 3 +- components/load_from_parquet/README.md | 3 +- components/minhash_generator/README.md | 6 +- components/pii_redaction/README.md | 6 +- .../prompt_based_laion_retrieval/README.md | 6 +- components/segment_images/README.md | 6 +- components/text_length_filter/README.md | 3 +- components/text_normalization/README.md | 3 +- components/write_to_hf_hub/README.md | 3 +- docs/.readthedocs.yaml | 5 +- docs/components/components.md | 2 +- docs/components/hub.md | 88 +++++++++++++++++++ docs/overrides/partials/toc.html | 56 ++++++++++++ mkdocs.yml | 1 + scripts/component_readme/generate_hub.py | 36 ++++++++ scripts/component_readme/hub_template.md | 14 +++ scripts/component_readme/readme_template.md | 6 +- 28 files changed, 274 insertions(+), 44 deletions(-) create mode 100644 docs/components/hub.md create mode 100644 docs/overrides/partials/toc.html create mode 100644 scripts/component_readme/generate_hub.py create mode 100644 scripts/component_readme/hub_template.md diff --git a/components/caption_images/README.md b/components/caption_images/README.md index b65ec6176..8bb38e996 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -6,12 +6,14 @@ This component captions images using a BLIP model from the Hugging Face hub ### Inputs / outputs **This component consumes:** + - images - - data: binary + - data: binary **This component produces:** + - captions - - text: string + - text: string ### Arguments diff --git a/components/download_images/README.md b/components/download_images/README.md index e23779857..5a392aaeb 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -13,14 +13,16 @@ from the img2dataset library. ### Inputs / outputs **This component consumes:** + - images - - url: string + - url: string **This component produces:** + - images - - data: binary - - width: int32 - - height: int32 + - data: binary + - width: int32 + - height: int32 ### Arguments diff --git a/components/embed_images/README.md b/components/embed_images/README.md index c805533ca..eec02f577 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -6,12 +6,14 @@ Component that generates CLIP embeddings from images ### Inputs / outputs **This component consumes:** + - images - - data: binary + - data: binary **This component produces:** + - embeddings - - data: list + - data: list ### Arguments diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index 3d18043dd..454253416 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -8,12 +8,14 @@ used to find images similar to the embedded images / captions. ### Inputs / outputs **This component consumes:** + - embeddings - - data: list + - data: list **This component produces:** + - images - - url: string + - url: string ### Arguments diff --git a/components/filter_comments/README.md b/components/filter_comments/README.md index dff4c6730..e0d55e57d 100644 --- a/components/filter_comments/README.md +++ b/components/filter_comments/README.md @@ -6,8 +6,9 @@ Component that filters code based on the code to comment ratio ### Inputs / outputs **This component consumes:** + - code - - content: string + - content: string **This component produces no data.** diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 3074d74af..1bc0c27f5 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -6,9 +6,10 @@ Component that filters images based on minimum size and max aspect ratio ### Inputs / outputs **This component consumes:** + - images - - width: int32 - - height: int32 + - width: int32 + - height: int32 **This component produces no data.** diff --git a/components/filter_line_length/README.md b/components/filter_line_length/README.md index 340cf486f..46f5699e4 100644 --- a/components/filter_line_length/README.md +++ b/components/filter_line_length/README.md @@ -6,10 +6,11 @@ Component that filters code based on line length ### Inputs / outputs **This component consumes:** + - code - - avg_line_length: double - - max_line_length: int32 - - alphanum_fraction: double + - avg_line_length: double + - max_line_length: int32 + - alphanum_fraction: double **This component produces no data.** diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index 31b8f8c00..5d679c457 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -21,14 +21,16 @@ right side is border-cropped image. ### Inputs / outputs **This component consumes:** + - images - - data: binary + - data: binary **This component produces:** + - images - - data: binary - - width: int32 - - height: int32 + - data: binary + - width: int32 + - height: int32 ### Arguments diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index 2ec9e4b14..a69a4df4e 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -6,14 +6,16 @@ Component that extracts image resolution data from the images ### Inputs / outputs **This component consumes:** + - images - - data: binary + - data: binary **This component produces:** + - images - - data: binary - - width: int32 - - height: int32 + - data: binary + - width: int32 + - height: int32 ### Arguments diff --git a/components/language_filter/README.md b/components/language_filter/README.md index 1409e8c08..c3afd6435 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -6,8 +6,9 @@ A component that filters text based on the provided language. ### Inputs / outputs **This component consumes:** + - text - - data: string + - data: string **This component produces no data.** diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index 89a857fd7..834f568e5 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -10,9 +10,10 @@ location. It supports the following formats: .zip, gzip, tar and tar.gz. **This component consumes no data.** **This component produces:** + - file - - filename: string - - content: binary + - filename: string + - content: binary ### Arguments diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index a59f95540..6d5bdded6 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -8,8 +8,9 @@ Component that loads a dataset from the hub **This component consumes no data.** **This component produces:** + - dummy_variable - - data: binary + - data: binary ### Arguments diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index 57d826707..9155c8163 100644 --- a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -8,8 +8,9 @@ Component that loads a dataset from a parquet uri **This component consumes no data.** **This component produces:** + - dummy_variable - - data: binary + - data: binary ### Arguments diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 95766e907..422fdc7af 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -6,12 +6,14 @@ A component that generates minhashes of text. ### Inputs / outputs **This component consumes:** + - text - - data: string + - data: string **This component produces:** + - text - - minhash: list + - minhash: list ### Arguments diff --git a/components/pii_redaction/README.md b/components/pii_redaction/README.md index 347a27aa4..e9143486b 100644 --- a/components/pii_redaction/README.md +++ b/components/pii_redaction/README.md @@ -26,12 +26,14 @@ code. ### Inputs / outputs **This component consumes:** + - code - - content: string + - content: string **This component produces:** + - code - - content: string + - content: string ### Arguments diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index a7467ead4..45fb1c034 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -11,12 +11,14 @@ This component doesn’t return the actual images, only URLs. ### Inputs / outputs **This component consumes:** + - prompts - - text: string + - text: string **This component produces:** + - images - - url: string + - url: string ### Arguments diff --git a/components/segment_images/README.md b/components/segment_images/README.md index fdb057414..2f569d42e 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -6,12 +6,14 @@ Component that creates segmentation masks for images using a model from the Hugg ### Inputs / outputs **This component consumes:** + - images - - data: binary + - data: binary **This component produces:** + - segmentations - - data: binary + - data: binary ### Arguments diff --git a/components/text_length_filter/README.md b/components/text_length_filter/README.md index 5eec6a9a7..01ee0ba1a 100644 --- a/components/text_length_filter/README.md +++ b/components/text_length_filter/README.md @@ -6,8 +6,9 @@ A component that filters out text based on their length ### Inputs / outputs **This component consumes:** + - text - - data: string + - data: string **This component produces no data.** diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md index 79fad72a2..6ae6fb97f 100644 --- a/components/text_normalization/README.md +++ b/components/text_normalization/README.md @@ -18,8 +18,9 @@ the training of large language models. ### Inputs / outputs **This component consumes:** + - text - - data: string + - data: string **This component produces no data.** diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 318b8ee5b..1f68f45f5 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -6,8 +6,9 @@ Component that writes a dataset to the hub ### Inputs / outputs **This component consumes:** + - dummy_variable - - data: binary + - data: binary **This component produces no data.** diff --git a/docs/.readthedocs.yaml b/docs/.readthedocs.yaml index 80f719960..0a85a254b 100644 --- a/docs/.readthedocs.yaml +++ b/docs/.readthedocs.yaml @@ -21,4 +21,7 @@ build: - poetry config virtualenvs.create false post_install: # Install dependencies with 'docs' dependency group - - poetry install --with docs \ No newline at end of file + - poetry install --with docs + pre_build: + # Generate hub documentation + - python scripts/component_readme/generate_hub.py \ No newline at end of file diff --git a/docs/components/components.md b/docs/components/components.md index e6d348ca4..d79901648 100644 --- a/docs/components/components.md +++ b/docs/components/components.md @@ -2,7 +2,7 @@ Fondant makes it easy to build data preparation pipelines leveraging reusable components. Fondant provides a lot of components out of the box -([overview](https://github.com/ml6team/fondant/tree/main/components)), but you can also define your +([overview](hub.md)), but you can also define your own custom components. ## The anatomy of a component diff --git a/docs/components/hub.md b/docs/components/hub.md new file mode 100644 index 000000000..53dce2ecc --- /dev/null +++ b/docs/components/hub.md @@ -0,0 +1,88 @@ +--- +disable_toc: True +--- + +# Component Hub + +Below you can find the reusable components offered by Fondant. + +??? "caption_images" + + --8<-- "components/caption_images/README.md:1" + +??? "download_images" + + --8<-- "components/download_images/README.md:1" + +??? "embed_images" + + --8<-- "components/embed_images/README.md:1" + +??? "embedding_based_laion_retrieval" + + --8<-- "components/embedding_based_laion_retrieval/README.md:1" + +??? "filter_comments" + + --8<-- "components/filter_comments/README.md:1" + +??? "filter_image_resolution" + + --8<-- "components/filter_image_resolution/README.md:1" + +??? "filter_line_length" + + --8<-- "components/filter_line_length/README.md:1" + +??? "image_cropping" + + --8<-- "components/image_cropping/README.md:1" + +??? "image_resolution_extraction" + + --8<-- "components/image_resolution_extraction/README.md:1" + +??? "language_filter" + + --8<-- "components/language_filter/README.md:1" + +??? "load_from_files" + + --8<-- "components/load_from_files/README.md:1" + +??? "load_from_hf_hub" + + --8<-- "components/load_from_hf_hub/README.md:1" + +??? "load_from_parquet" + + --8<-- "components/load_from_parquet/README.md:1" + +??? "minhash_generator" + + --8<-- "components/minhash_generator/README.md:1" + +??? "pii_redaction" + + --8<-- "components/pii_redaction/README.md:1" + +??? "prompt_based_laion_retrieval" + + --8<-- "components/prompt_based_laion_retrieval/README.md:1" + +??? "segment_images" + + --8<-- "components/segment_images/README.md:1" + +??? "text_length_filter" + + --8<-- "components/text_length_filter/README.md:1" + +??? "text_normalization" + + --8<-- "components/text_normalization/README.md:1" + +??? "write_to_hf_hub" + + --8<-- "components/write_to_hf_hub/README.md:1" + diff --git a/docs/overrides/partials/toc.html b/docs/overrides/partials/toc.html new file mode 100644 index 000000000..df8c43203 --- /dev/null +++ b/docs/overrides/partials/toc.html @@ -0,0 +1,56 @@ + + + +{% set title = lang.t("toc") %} +{% if config.mdx_configs.toc and config.mdx_configs.toc.title %} + {% set title = config.mdx_configs.toc.title %} +{% endif %} + + + \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 050b06cdf..6c660daef 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,6 +42,7 @@ nav: - Creating custom components: components/custom_component.md - Read / write components: components/generic_component.md - Component spec: components/component_spec.md + - Hub: components/hub.md - Data explorer: data_explorer.md - Infrastructure: infrastructure.md - Manifest: manifest.md diff --git a/scripts/component_readme/generate_hub.py b/scripts/component_readme/generate_hub.py new file mode 100644 index 000000000..faffcd914 --- /dev/null +++ b/scripts/component_readme/generate_hub.py @@ -0,0 +1,36 @@ +import typing as t +from pathlib import Path +from glob import glob + +import jinja2 + + +def find_components() -> t.List[str]: + return [Path(d).name for d in sorted(glob("components/*", recursive=True))] + + +def generate_hub(components) -> str: + env = jinja2.Environment( + loader=jinja2.loaders.FileSystemLoader(Path(__file__).parent), + trim_blocks=True + ) + template = env.get_template("hub_template.md") + + return template.render( + components=components + ) + + +def write_hub(hub: str) -> None: + with open("docs/components/hub.md", "w") as f: + f.write(hub) + + +def main(): + components = find_components() + hub = generate_hub(components) + write_hub(hub) + + +if __name__ == "__main__": + main() diff --git a/scripts/component_readme/hub_template.md b/scripts/component_readme/hub_template.md new file mode 100644 index 000000000..76f664991 --- /dev/null +++ b/scripts/component_readme/hub_template.md @@ -0,0 +1,14 @@ +--- +disable_toc: True +--- + +# Component Hub + +Below you can find the reusable components offered by Fondant. + +{% for component in components %} +??? "{{ component }}" + + --8<-- "components/{{ component }}/README.md:1" + +{% endfor %} diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md index 672ef6faa..1266b56d3 100644 --- a/scripts/component_readme/readme_template.md +++ b/scripts/component_readme/readme_template.md @@ -7,10 +7,11 @@ {% if consumes %} **This component consumes:** + {% for subset_name, subset in consumes.items() %} - {{ subset_name }} {% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} + - {{ field.name }}: {{ field.type.value }} {% endfor %} {% endfor %} {% else %} @@ -19,10 +20,11 @@ {% if produces %} **This component produces:** + {% for subset_name, subset in produces.items() %} - {{ subset_name }} {% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} + - {{ field.name }}: {{ field.type.value }} {% endfor %} {% endfor %} {% else %}