From 293aa4148a90ee72b1210c0a4f5612be944256b5 Mon Sep 17 00:00:00 2001
From: Alexander Remmerie <48564828+alexanderremmerie@users.noreply.github.com>
Date: Thu, 24 Aug 2023 09:27:38 +0200
Subject: [PATCH] [Datacomp] Add clean_captions and filter_clip_score components (#381)

Co-authored-by: Niels Rogge
Co-authored-by: Robbe Sneyders
---
 .../components/clean_captions/Dockerfile      | 23 +++++++
 .../clean_captions/fondant_component.yaml     | 15 +++++
 .../clean_captions/requirements.txt           |  0
 .../components/clean_captions/src/main.py     | 65 +++++++++++++++++++
 .../components/filter_clip_score/Dockerfile   | 23 +++++++
 .../filter_clip_score/fondant_component.yaml  | 14 ++++
 .../filter_clip_score/requirements.txt        |  0
 .../components/filter_clip_score/src/main.py  | 32 +++++++++
 .../load_from_hf_hub/fondant_component.yaml   |  8 +--
 examples/pipelines/datacomp/pipeline.py       |  1 +
 .../pipelines/datacomp/simple_pipeline.py     | 31 ++++++---
 11 files changed, 197 insertions(+), 15 deletions(-)
 create mode 100644 examples/pipelines/datacomp/components/clean_captions/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/clean_captions/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/clean_captions/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/clean_captions/src/main.py
 create mode 100644 examples/pipelines/datacomp/components/filter_clip_score/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/filter_clip_score/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/filter_clip_score/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/filter_clip_score/src/main.py

diff --git a/examples/pipelines/datacomp/components/clean_captions/Dockerfile b/examples/pipelines/datacomp/components/clean_captions/Dockerfile
new file mode 100644
index 000000000..563dd87a1
--- /dev/null
+++ b/examples/pipelines/datacomp/components/clean_captions/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+## System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=main
+RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+ENTRYPOINT ["fondant", "execute", "main"]
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/clean_captions/fondant_component.yaml b/examples/pipelines/datacomp/components/clean_captions/fondant_component.yaml
new file mode 100644
index 000000000..019bdafba
--- /dev/null
+++ b/examples/pipelines/datacomp/components/clean_captions/fondant_component.yaml
@@ -0,0 +1,15 @@
+name: Clean captions
+description: Component that filters out bad captions (empty captions, captions with too many non-ASCII characters, and captions that are just dates)
+image: ghcr.io/ml6team/clean_captions:50f3a97878ac81670ebe624039ff0fcec0542e4f
+
+consumes:
+  text:
+    fields:
+      data:
+        type: string
+
+produces:
+  text:
+    fields:
+      data:
+        type: string
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/clean_captions/requirements.txt b/examples/pipelines/datacomp/components/clean_captions/requirements.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/pipelines/datacomp/components/clean_captions/src/main.py b/examples/pipelines/datacomp/components/clean_captions/src/main.py
new file mode 100644
index 000000000..0f7e2ed00
--- /dev/null
+++ b/examples/pipelines/datacomp/components/clean_captions/src/main.py
@@ -0,0 +1,65 @@
+import logging
+
+import pandas as pd
+
+from fondant.component import PandasTransformComponent
+from dateutil.parser import parse
+
+logger = logging.getLogger(__name__)
+
+
+def isNonEnglish(s):  # flags any character that is not plain ASCII
+    try:
+        s.encode(encoding="utf-8").decode("ascii")
+    except UnicodeDecodeError:
+        return True
+    else:
+        return False
+
+
+def get_num_nonenglish_characters(text):
+    return sum([isNonEnglish(char) for char in text])
+
+
+def has_too_much_weird_characters(text, max_ratio=0.5):
+    return (get_num_nonenglish_characters(text) / len(text)) > max_ratio
+
+
+def is_valid_date(date_string):
+    try:
+        parse(date_string)
+        return True
+    except (ValueError, OverflowError):
+        return False
+
+
+def is_empty(text):
+    return text.strip() == ""
+
+
+class FilterTextComplexity(PandasTransformComponent):
+    """Component that filters out bad captions in image-text pairs:
+    - Empty captions
+    - Captions with weird characters
+    - Captions that are dates
+    """
+
+    def __init__(self, *args) -> None:
+        pass
+
+    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        texts = dataframe["text"]["data"]
+
+        logger.info("Filtering on empty captions...")
+        mask = texts.apply(lambda text: not is_empty(text))
+        dataframe = dataframe[mask]
+
+        logger.info("Filtering on weird character captions...")
+        mask = dataframe["text"]["data"].apply(lambda text: not has_too_much_weird_characters(text))
+        dataframe = dataframe[mask]
+
+        logger.info("Filtering on captions that look like dates...")
+        mask = dataframe["text"]["data"].apply(lambda text: not is_valid_date(text))
+        dataframe = dataframe[mask]
+
+        return dataframe
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/Dockerfile b/examples/pipelines/datacomp/components/filter_clip_score/Dockerfile
new file mode 100644
index 000000000..563dd87a1
--- /dev/null
+++ b/examples/pipelines/datacomp/components/filter_clip_score/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+## System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=main
+RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+ENTRYPOINT ["fondant", "execute", "main"]
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/fondant_component.yaml b/examples/pipelines/datacomp/components/filter_clip_score/fondant_component.yaml
new file mode 100644
index 000000000..27d0f625a
--- /dev/null
+++ b/examples/pipelines/datacomp/components/filter_clip_score/fondant_component.yaml
@@ -0,0 +1,14 @@
+name: Filter CLIP score
+description: Component that filters out image-text pairs with a low CLIP L/14 similarity score, keeping only the top pct_threshold fraction of rows
+image: ghcr.io/ml6team/filter_clip_score:50f3a97878ac81670ebe624039ff0fcec0542e4f
+
+consumes:
+  imagetext:
+    fields:
+      clipl14score:
+        type: float32
+
+args:
+  pct_threshold:
+    type: float
+    description: "Fraction of rows with the highest CLIP scores to keep (e.g. 0.3 keeps roughly the top 30%)"
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/requirements.txt b/examples/pipelines/datacomp/components/filter_clip_score/requirements.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/pipelines/datacomp/components/filter_clip_score/src/main.py b/examples/pipelines/datacomp/components/filter_clip_score/src/main.py
new file mode 100644
index 000000000..f480f5349
--- /dev/null
+++ b/examples/pipelines/datacomp/components/filter_clip_score/src/main.py
@@ -0,0 +1,32 @@
+import logging
+import pandas as pd
+from fondant.component import PandasTransformComponent
+
+logger = logging.getLogger(__name__)
+
+
+class FilterClipScoreComponent(PandasTransformComponent):
+    """
+    Component that filters rows based on CLIP scores
+    """
+
+    def __init__(self, *args, pct_threshold: float, **kwargs):
+        self.pct_threshold = pct_threshold
+
+    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        logger.info("Filtering on CLIP scores...")
+        logger.info(f"Initial length: {len(dataframe)}")
+
+        clip_scores = dataframe["imagetext"]["clipl14score"]
+        sorted_clip_scores = clip_scores.sort_values(ascending=False)
+        threshold_idx = int(len(sorted_clip_scores) * self.pct_threshold)
+        threshold = sorted_clip_scores.iloc[threshold_idx]
+        logger.info(f"CLIP score threshold: {threshold}")
+
+        mask = clip_scores > threshold
+        filtered_dataframe = dataframe[mask]
+        logger.info(
+            f"Final length: {len(filtered_dataframe)} ({len(filtered_dataframe) / len(dataframe):.2f})"
+        )
+
+        return filtered_dataframe
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 100320a9c..c4b55d226 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -25,13 +25,11 @@ produces:
       data:
         type: string
 
-  image_text:
+  imagetext:
     fields:
-      uid:
-        type: string
-      clip_b32_similarity_score:
+      clipb32score:
         type: float32
-      clip_l14_similarity_score:
+      clipl14score:
         type: float32
 
 args:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 659714d76..860c97027 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -68,6 +68,7 @@
 pipeline.add_op(load_from_hub_op)
 pipeline.add_op(filter_complexity_op, dependencies=download_images_op)
 pipeline.add_op(download_images_op, dependencies=load_from_hub_op)
+
 # TODO add more ops
diff --git a/examples/pipelines/datacomp/simple_pipeline.py b/examples/pipelines/datacomp/simple_pipeline.py
index 9b307e213..8da4ef8bf 100644
--- a/examples/pipelines/datacomp/simple_pipeline.py
+++ b/examples/pipelines/datacomp/simple_pipeline.py
@@ -7,7 +7,7 @@
 
 from pipeline_configs import PipelineConfigs
 
-from fondant.pipeline import ComponentOp, Pipeline, Client
+from fondant.pipeline import ComponentOp, Pipeline
 
 logger = logging.getLogger(__name__)
 
@@ -17,7 +17,6 @@
     pipeline_description="A pipeline for filtering the Datacomp dataset",
     base_path=PipelineConfigs.BASE_PATH,
 )
-client = Client(host=PipelineConfigs.HOST)
 
 # define ops
 load_component_column_mapping = {
@@ -27,16 +26,17 @@
     "face_bboxes": "images_face_bboxes",
     "sha256": "images_sha256",
     "text": "text_data",
-    "uid": "image_text_uid",
-    "clip_b32_similarity_score": "image_text_clip_b32_similarity_score",
-    "clip_l14_similarity_score": "image_text_clip_l14_similarity_score",
+    "clip_b32_similarity_score": "imagetext_clipb32score",
+    "clip_l14_similarity_score": "imagetext_clipl14score",
 }
 
 load_from_hub_op = ComponentOp(
     component_dir="components/load_from_hf_hub",
     arguments={
-        "dataset_name": "nielsr/datacomp-small-with-embeddings",
+        "dataset_name": "mlfoundations/datacomp_small",
         "column_name_mapping": load_component_column_mapping,
+        "n_rows_to_load": 100,
+        "index_column": "uid",
     },
     node_pool_label="node_pool",
    node_pool_name="n2-standard-128-pool",
@@ -57,13 +57,24 @@
     node_pool_label="node_pool",
     node_pool_name="n2-standard-128-pool",
 )
 
+clean_captions_op = ComponentOp(
+    component_dir="components/clean_captions",
+    node_pool_label="node_pool",
+    node_pool_name="n2-standard-128-pool",
+)
+filter_clip_score_op = ComponentOp(
+    component_dir="components/filter_clip_score",
+    arguments={
+        "pct_threshold": 0.3,
+    },
+    node_pool_label="node_pool",
+    node_pool_name="n2-standard-128-pool",
+)
 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
 pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
+pipeline.add_op(clean_captions_op, dependencies=filter_complexity_op)
+pipeline.add_op(filter_clip_score_op, dependencies=clean_captions_op)
 # TODO add more ops
-
-
-if __name__ == "__main__":
-    client.compile_and_run(pipeline=pipeline)
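
A quick way to sanity-check the two new transform components before wiring them into the pipeline is to call their transform methods directly on a small pandas DataFrame. The sketch below is not part of this patch: it assumes fondant, pandas and python-dateutil are installed locally, that the file paths point at this repo checkout, and the load_component helper plus the sample captions and CLIP scores are purely illustrative. It relies on the fact that a PandasTransformComponent receives its data with two-level (subset, field) columns, matching the fondant_component.yaml files above.

import importlib.util

import pandas as pd


def load_component(path, class_name):
    # Load a component class from a component's src/main.py by file path,
    # so the two identically named main.py modules don't clash.
    spec = importlib.util.spec_from_file_location(class_name, path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, class_name)


CleanCaptions = load_component(
    "examples/pipelines/datacomp/components/clean_captions/src/main.py",
    "FilterTextComplexity",
)
FilterClipScore = load_component(
    "examples/pipelines/datacomp/components/filter_clip_score/src/main.py",
    "FilterClipScoreComponent",
)

# Two-level columns: (subset, field), as declared in the component specs.
dataframe = pd.DataFrame(
    {
        ("text", "data"): ["a dog on a couch", "", "01/02/2023", "日本語だけのキャプション"],
        ("imagetext", "clipl14score"): [0.35, 0.10, 0.22, 0.05],
    }
)

cleaned = CleanCaptions().transform(dataframe)
print(cleaned)  # only "a dog on a couch" survives the three caption filters

filtered = FilterClipScore(pct_threshold=0.5).transform(dataframe)
print(filtered)  # roughly the top half of rows by CLIP L/14 score

Running both components on the same input, as here, only checks them in isolation; in the pipeline filter_clip_score runs on the output of clean_captions, so the percentile threshold is computed over the already-cleaned rows.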