diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5876f3b6..adfaf43b3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,6 @@ repos: "--exit-non-zero-on-fix", ] - - repo: https://github.com/PyCQA/bandit rev: 1.7.4 hooks: @@ -55,4 +54,13 @@ repos: - types-jsonschema - types-PyYAML - types-requests - pass_filenames: false \ No newline at end of file + pass_filenames: false + + - repo: local + hooks: + - id: generate_component_readmes + name: Generate component READMEs + language: python + entry: python scripts/component_readme/generate_readme.py + files: ^components/.*/fondant_component.yaml + additional_dependencies: ["fondant"] \ No newline at end of file diff --git a/components/caption_images/README.md b/components/caption_images/README.md index fef8bb921..b65ec6176 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -48,4 +48,4 @@ pipeline.add_op(caption_images_op, dependencies=[...]) #Add previous component You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/embed_images/README.md b/components/embed_images/README.md index a4d39eec6..c805533ca 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -41,9 +41,3 @@ embed_images_op = ComponentOp.from_registry( pipeline.add_op(embed_images_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/filter_comments/README.md b/components/filter_comments/README.md index 8870b6ac5..dff4c6730 100644 --- a/components/filter_comments/README.md +++ b/components/filter_comments/README.md @@ -39,9 +39,3 @@ filter_comments_op = ComponentOp.from_registry( pipeline.add_op(filter_comments_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 4f7df62a9..3074d74af 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -18,8 +18,8 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| min_image_dim | int | Minimum image dimension | None | -| max_aspect_ratio | float | Maximum aspect ratio | None | +| min_image_dim | int | Minimum image dimension | / | +| max_aspect_ratio | float | Maximum aspect ratio | / | ### Usage @@ -40,9 +40,3 @@ filter_image_resolution_op = ComponentOp.from_registry( pipeline.add_op(filter_image_resolution_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/filter_line_length/README.md b/components/filter_line_length/README.md index 96393f89d..340cf486f 100644 --- a/components/filter_line_length/README.md +++ b/components/filter_line_length/README.md @@ -19,9 +19,9 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| avg_line_length_threshold | int | Threshold for average line length to filter on | None | -| max_line_length_threshold | int | Threshold for maximum line length to filter on | None | -| alphanum_fraction_threshold | float | Alphanum fraction to filter on | None | +| avg_line_length_threshold | int | Threshold for average line length to filter on | / | +| max_line_length_threshold | int | Threshold for maximum line length to filter on | / | +| alphanum_fraction_threshold | float | Alphanum fraction to filter on | / | ### Usage @@ -43,9 +43,3 @@ filter_line_length_op = ComponentOp.from_registry( pipeline.add_op(filter_line_length_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index f9a7993e9..31b8f8c00 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -58,9 +58,3 @@ image_cropping_op = ComponentOp.from_registry( pipeline.add_op(image_cropping_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index cd833b966..2ec9e4b14 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -36,9 +36,3 @@ image_resolution_extraction_op = ComponentOp.from_registry( pipeline.add_op(image_resolution_extraction_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/language_filter/README.md b/components/language_filter/README.md index c90eefc87..1409e8c08 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -42,4 +42,4 @@ pipeline.add_op(language_filter_op, dependencies=[...]) #Add previous component You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index e8fd3321d..89a857fd7 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -20,7 +20,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| directory_uri | str | Local or remote path to the directory containing the files | None | +| directory_uri | str | Local or remote path to the directory containing the files | / | ### Usage @@ -45,4 +45,4 @@ pipeline.add_op(load_from_files_op, dependencies=[...]) #Add previous component You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index c91b8c0ca..a59f95540 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -17,8 +17,8 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| dataset_name | str | Name of dataset on the hub | None | -| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | None | +| dataset_name | str | Name of dataset on the hub | / | +| column_name_mapping | dict | Mapping of the consumed hub dataset to fondant column names | / | | image_column_names | list | Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. | None | | n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None | | index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | @@ -45,9 +45,3 @@ load_from_hf_hub_op = ComponentOp.from_registry( pipeline.add_op(load_from_hf_hub_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index 72618ba77..57d826707 100644 --- a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -17,7 +17,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | None | +| dataset_uri | str | The remote path to the parquet file/folder containing the dataset | / | | column_name_mapping | dict | Mapping of the consumed dataset | None | | n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | None | | index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | None | @@ -43,9 +43,3 @@ load_from_parquet_op = ComponentOp.from_registry( pipeline.add_op(load_from_parquet_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 1de50836b..95766e907 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -44,4 +44,4 @@ pipeline.add_op(minhash_generator_op, dependencies=[...]) #Add previous compone You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/pii_redaction/README.md b/components/pii_redaction/README.md index 3b4cbf2d4..347a27aa4 100644 --- a/components/pii_redaction/README.md +++ b/components/pii_redaction/README.md @@ -54,9 +54,3 @@ pii_redaction_op = ComponentOp.from_registry( pipeline.add_op(pii_redaction_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index 814d1557a..a7467ead4 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -24,7 +24,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| num_images | int | Number of images to retrieve for each prompt | None | +| num_images | int | Number of images to retrieve for each prompt | / | | aesthetic_score | int | Aesthetic embedding to add to the query embedding, between 0 and 9 (higher is prettier). | 9 | | aesthetic_weight | float | Weight of the aesthetic embedding when added to the query, between 0 and 1 | 0.5 | | url | str | The url of the backend clip retrieval service, defaults to the public service | https://knn.laion.ai/knn-service | @@ -50,9 +50,3 @@ prompt_based_laion_retrieval_op = ComponentOp.from_registry( pipeline.add_op(prompt_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/segment_images/README.md b/components/segment_images/README.md index 66ea287d8..fdb057414 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -20,7 +20,7 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | | model_id | str | id of the model on the Hugging Face hub | openmmlab/upernet-convnext-small | -| batch_size | int | batch size to use | None | +| batch_size | int | batch size to use | / | ### Usage @@ -41,9 +41,3 @@ segment_images_op = ComponentOp.from_registry( pipeline.add_op(segment_images_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/components/text_length_filter/README.md b/components/text_length_filter/README.md index 86d3111ee..5eec6a9a7 100644 --- a/components/text_length_filter/README.md +++ b/components/text_length_filter/README.md @@ -17,8 +17,8 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| min_characters_length | int | Minimum number of characters | None | -| min_words_length | int | Mininum number of words | None | +| min_characters_length | int | Minimum number of characters | / | +| min_words_length | int | Mininum number of words | / | ### Usage @@ -44,4 +44,4 @@ pipeline.add_op(text_length_filter_op, dependencies=[...]) #Add previous compon You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md index 8d2723d4e..79fad72a2 100644 --- a/components/text_normalization/README.md +++ b/components/text_normalization/README.md @@ -29,11 +29,11 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. | None | -| apply_nfc | bool | If true apply nfc normalization | None | -| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter | None | -| do_lowercase | bool | If true apply lowercasing | None | -| remove_punctuation | str | If true punctuation will be removed | None | +| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. | / | +| apply_nfc | bool | If true apply nfc normalization | / | +| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter | / | +| do_lowercase | bool | If true apply lowercasing | / | +| remove_punctuation | str | If true punctuation will be removed | / | ### Usage @@ -62,4 +62,4 @@ pipeline.add_op(text_normalization_op, dependencies=[...]) #Add previous compon You can run the tests using docker with BuildKit. From this directory, run: ``` docker build . --target test -``` \ No newline at end of file +``` diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 3ab86b2b9..318b8ee5b 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -17,9 +17,9 @@ The component takes the following arguments to alter its behavior: | argument | type | description | default | | -------- | ---- | ----------- | ------- | -| hf_token | str | The hugging face token used to write to the hub | None | -| username | str | The username under which to upload the dataset | None | -| dataset_name | str | The name of the dataset to upload | None | +| hf_token | str | The hugging face token used to write to the hub | / | +| username | str | The username under which to upload the dataset | / | +| dataset_name | str | The name of the dataset to upload | / | | image_column_names | list | A list containing the image column names. Used to format to image to HF hub format | None | | column_name_mapping | dict | Mapping of the consumed fondant column names to the written hub column names | None | @@ -45,9 +45,3 @@ write_to_hf_hub_op = ComponentOp.from_registry( pipeline.add_op(write_to_hf_hub_op, dependencies=[...]) #Add previous component as dependency ``` -### Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` \ No newline at end of file diff --git a/scripts/component_readme/generate_readme.py b/scripts/component_readme/generate_readme.py index 72e4aaba4..7edd5902c 100644 --- a/scripts/component_readme/generate_readme.py +++ b/scripts/component_readme/generate_readme.py @@ -1,13 +1,12 @@ import argparse -import ast from pathlib import Path import jinja2 from fondant.component_spec import ComponentSpec -def read_component_spec(component_dir: Path) -> ComponentSpec: - return ComponentSpec.from_file(component_dir / "fondant_component.yaml") +def read_component_spec(component_spec_path: Path) -> ComponentSpec: + return ComponentSpec.from_file(component_spec_path) def generate_readme(component_spec: ComponentSpec, *, component_dir: Path) -> str: @@ -35,17 +34,20 @@ def write_readme(readme: str, component_dir: Path) -> None: f.write(readme) -def main(component_dir: Path): - component_spec = read_component_spec(component_dir) +def main(component_spec_path: Path): + component_spec = read_component_spec(component_spec_path) + component_dir = component_spec_path.parent readme = generate_readme(component_spec, component_dir=component_dir) write_readme(readme, component_dir=component_dir) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-d", "--component_dir", + parser.add_argument("component_specs", + nargs="+", type=Path, - help="Path to the component to generate a readme for") + help="Path to the component spec to generate a readme from") args = parser.parse_args() - main(args.component_dir) + for spec in args.component_specs: + main(spec)