-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Datacomp] Add clean_captions and filter_clip_score components (#381)
Co-authored-by: Niels Rogge <[email protected]> Co-authored-by: Robbe Sneyders <[email protected]>
- Loading branch information
1 parent
b9f3dea
commit 293aa41
Showing
11 changed files
with
197 additions
and
15 deletions.
There are no files selected for viewing
23 changes: 23 additions & 0 deletions
23
examples/pipelines/datacomp/components/clean_captions/Dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is needed to pip-install Fondant straight from its GitHub repository below.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
# The absolute path makes the install independent of the current working directory.
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r /requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
# The requirement is quoted so the shell cannot glob-expand the [extras] brackets.
ARG FONDANT_VERSION=main
RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["fondant", "execute", "main"]
15 changes: 15 additions & 0 deletions
15
examples/pipelines/datacomp/components/clean_captions/fondant_component.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
name: Clean captions | ||
description: Component that filters out bad captions (Empty captions, Captions with weird characters, Captions that are dates) | ||
image: ghcr.io/ml6team/clean_captions:50f3a97878ac81670ebe624039ff0fcec0542e4f | ||
|
||
consumes: | ||
text: | ||
fields: | ||
data: | ||
type: string | ||
|
||
produces: | ||
text: | ||
fields: | ||
data: | ||
type: string |
Empty file.
65 changes: 65 additions & 0 deletions
65
examples/pipelines/datacomp/components/clean_captions/src/main.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import logging | ||
|
||
import pandas as pd | ||
|
||
from fondant.component import PandasTransformComponent | ||
from dateutil.parser import parse | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def isNonEnglish(s):
    """Return True if ``s`` contains at least one non-ASCII character.

    Args:
        s: The string to inspect (callers in this module pass single characters).

    Returns:
        bool: False when every character of ``s`` is ASCII (the empty string
        included), True otherwise.
    """
    # str.isascii() answers the question directly. Unlike an encode/decode
    # round trip it cannot raise UnicodeEncodeError on lone surrogate code
    # points, which can show up in scraped captions.
    return not s.isascii()
|
||
|
||
def get_num_nonenglish_characters(text):
    """Count how many characters of ``text`` are flagged by ``isNonEnglish``."""
    # Each character yields a bool; summing the bools gives the count.
    flags = map(isNonEnglish, text)
    return sum(flags)
|
||
|
||
def has_too_much_weird_characters(text, max_ratio=0.5):
    """Tell whether the share of non-English characters in ``text`` exceeds ``max_ratio``.

    Args:
        text: The caption to inspect.
        max_ratio: Largest tolerated fraction of non-English characters
            (as counted by ``get_num_nonenglish_characters``).

    Returns:
        bool: True when the fraction is strictly greater than ``max_ratio``.
        An empty ``text`` has no characters at all and is reported as False.
    """
    # Guard against division by zero: an empty caption contains no weird characters.
    if not text:
        return False

    return (get_num_nonenglish_characters(text) / len(text)) > max_ratio
|
||
|
||
def is_valid_date(date_string):
    """Report whether ``dateutil.parser.parse`` accepts ``date_string``.

    Returns True when parsing succeeds, and False when the parser raises
    ``ValueError`` or ``OverflowError``.
    """
    try:
        parse(date_string)
    except (ValueError, OverflowError):
        return False
    else:
        return True
|
||
|
||
def is_empty(text):
    """Return True when ``text`` is empty or consists only of whitespace."""
    stripped = text.strip()
    return stripped == ""
|
||
|
||
class FilterTextComplexity(PandasTransformComponent):
    """Component that filters out bad captions in image-text pairs:
    - Empty captions
    - Captions with weird characters
    - Captions that are dates
    """

    def __init__(self, *args, **kwargs) -> None:
        """Accept and ignore any constructor arguments; this component has no settings.

        Keyword arguments are tolerated as well, matching the signature style
        of the sibling filter component.
        """

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Drop the rows whose caption is empty, mostly non-English characters, or a date.

        Args:
            dataframe: Frame holding the captions in the ``("text", "data")`` column.

        Returns:
            The rows of ``dataframe`` that passed all three filters.
        """
        # Each stage is (log message, predicate returning True for a bad caption).
        stages = (
            ("Filtering on empty captions...", is_empty),
            (
                "Filtering on weird character captions...",
                has_too_much_weird_characters,
            ),
            ("Filtering on captions that look like dates...", is_valid_date),
        )

        for message, is_bad_caption in stages:
            logger.info(message)
            # Re-read the captions from the already-filtered frame so that the
            # mask lines up with the frame's index and later predicates never
            # see captions an earlier stage removed (an empty caption would
            # otherwise reach the ratio check and divide by zero).
            texts = dataframe["text"]["data"]
            # astype(bool) keeps the mask boolean even when no rows are left.
            keep = ~texts.apply(is_bad_caption).astype(bool)
            dataframe = dataframe[keep]

        return dataframe
23 changes: 23 additions & 0 deletions
23
examples/pipelines/datacomp/components/filter_clip_score/Dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
FROM --platform=linux/amd64 python:3.8-slim

## System dependencies
# git is needed to pip-install Fondant straight from its GitHub repository below.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y && \
    rm -rf /var/lib/apt/lists/*

# install requirements
# The absolute path makes the install independent of the current working directory.
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r /requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
# The requirement is quoted so the shell cannot glob-expand the [extras] brackets.
ARG FONDANT_VERSION=main
RUN pip3 install --no-cache-dir "fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}"

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .

ENTRYPOINT ["fondant", "execute", "main"]
14 changes: 14 additions & 0 deletions
14
examples/pipelines/datacomp/components/filter_clip_score/fondant_component.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
name: Filter CLIP score | ||
description: Component that filters out image-text pairs whose CLIP score falls below a percentage-based threshold | ||
image: ghcr.io/ml6team/filter_clip_score:50f3a97878ac81670ebe624039ff0fcec0542e4f | ||
|
||
consumes: | ||
imagetext: | ||
fields: | ||
clipl14score: | ||
type: float32 | ||
|
||
args: | ||
pct_threshold: | ||
type: float | ||
description: "Percentage threshold to filter out captions" |
Empty file.
32 changes: 32 additions & 0 deletions
32
examples/pipelines/datacomp/components/filter_clip_score/src/main.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import logging | ||
import pandas as pd | ||
from fondant.component import PandasTransformComponent | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class FilterTextComplexity(PandasTransformComponent):
    """
    Component that filters rows based on clip scores.

    A cut-off score is read at position ``int(len(dataframe) * pct_threshold)``
    of the scores sorted in descending order; only rows scoring strictly above
    that cut-off are kept.
    """

    # NOTE(review): the class name does not mention clip scores; it looks
    # copied from the caption-cleaning component. Confirm before renaming.

    def __init__(self, *args, pct_threshold: float, **kwargs):
        """
        Args:
            pct_threshold: Fraction of the rows, counted from the highest
                score down, used to locate the cut-off score.
        """
        self.pct_threshold = pct_threshold

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Keep the rows whose ``("imagetext", "clipl14score")`` beats the cut-off.

        Args:
            dataframe: Frame holding the scores in ``("imagetext", "clipl14score")``.

        Returns:
            The filtered frame. An empty input is returned unchanged, and all
            rows are kept when the cut-off position lies past the last row
            (e.g. ``pct_threshold`` of 1.0).
        """
        logger.info("Filtering on clip scores...")
        num_rows = len(dataframe)
        logger.info(f"Initial length: {num_rows}")

        # Nothing to rank; this also avoids indexing into an empty series and
        # dividing by zero in the summary log line below.
        if num_rows == 0:
            return dataframe

        clip_scores = dataframe["imagetext"]["clipl14score"]
        threshold_idx = int(num_rows * self.pct_threshold)

        if threshold_idx >= num_rows:
            # The cut-off position is past the lowest score: every row qualifies.
            filtered_dataframe = dataframe
        else:
            sorted_clip_scores = clip_scores.sort_values(ascending=False)
            threshold = sorted_clip_scores.iloc[threshold_idx]
            logger.info(f"Clip score Threshold: {threshold}")
            filtered_dataframe = dataframe[clip_scores > threshold]

        logger.info(
            f"Final length: {len(filtered_dataframe)} ({len(filtered_dataframe) / len(dataframe):.2f})"
        )

        return filtered_dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters