diff --git a/src/fondant/components/filter_text_length/Dockerfile b/src/fondant/components/filter_text_length/Dockerfile deleted file mode 100644 index d7b80e7ab..000000000 --- a/src/fondant/components/filter_text_length/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM --platform=linux/amd64 python:3.10-slim as base - -# System dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install git -y - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Install Fondant -# This is split from other requirements to leverage caching -ARG FONDANT_VERSION=main -RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} - -# Set the working directory to the component folder -WORKDIR /component -COPY src/ src/ - -FROM base as test -COPY tests/ tests/ -RUN pip3 install --no-cache-dir -r tests/requirements.txt -RUN python -m pytest tests - -FROM base -WORKDIR /component/src -ENTRYPOINT ["fondant", "execute", "main"] diff --git a/src/fondant/components/filter_text_length/README.md b/src/fondant/components/filter_text_length/README.md deleted file mode 100644 index fa2088de6..000000000 --- a/src/fondant/components/filter_text_length/README.md +++ /dev/null @@ -1,64 +0,0 @@ -# Filter text length - - -## Description -A component that filters out text based on their length - - -## Inputs / outputs - - -### Consumes -**This component consumes:** - -- text: string - - - - - -### Produces - - -**This component does not produce data.** - - -## Arguments - -The component takes the following arguments to alter its behavior: - -| argument | type | description | default | -| -------- | ---- | ----------- | ------- | -| min_characters_length | int | Minimum number of characters | / | -| min_words_length | int | Mininum number of words | / | - - -## Usage - -You can add this component to your pipeline using the following code: - -```python -from fondant.pipeline import Pipeline - - -pipeline = Pipeline(...) - -dataset = pipeline.read(...) - -dataset = dataset.apply( - "filter_text_length", - arguments={ - # Add arguments - # "min_characters_length": 0, - # "min_words_length": 0, - }, -) -``` - - -## Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` diff --git a/src/fondant/components/filter_text_length/fondant_component.yaml b/src/fondant/components/filter_text_length/fondant_component.yaml deleted file mode 100644 index ba99e95f4..000000000 --- a/src/fondant/components/filter_text_length/fondant_component.yaml +++ /dev/null @@ -1,17 +0,0 @@ -name: Filter text length -description: A component that filters out text based on their length -image: fndnt/filter_text_length:latest -tags: - - Text processing - -consumes: - text: - type: string - -args: - min_characters_length: - description: Minimum number of characters - type: int - min_words_length: - description: Mininum number of words - type: int \ No newline at end of file diff --git a/src/fondant/components/filter_text_length/requirements.txt b/src/fondant/components/filter_text_length/requirements.txt deleted file mode 100644 index c9348a998..000000000 --- a/src/fondant/components/filter_text_length/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyarrow>=7.0 -fasttext-wheel==0.9.2 \ No newline at end of file diff --git a/src/fondant/components/filter_text_length/src/main.py b/src/fondant/components/filter_text_length/src/main.py deleted file mode 100644 index 4d88ee103..000000000 --- a/src/fondant/components/filter_text_length/src/main.py +++ /dev/null @@ -1,36 +0,0 @@ -"""A component that filters out text based on their length.""" -import logging - -import fasttext -import pandas as pd -from fondant.component import PandasTransformComponent - -logger = logging.getLogger(__name__) - - -class FilterTextLengthComponent(PandasTransformComponent): - """A component that filters out text based on their length.""" - - def __init__(self, *, min_characters_length: int, min_words_length: int): - """Setup component. - - Args: - min_characters_length: minimum number of characters - min_words_length: minimum number of words. - """ - super().__init__() - self.min_characters_length = min_characters_length - self.min_words_length = min_words_length - - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - """Filter out text based on their length.""" - caption_num_words = dataframe["text"].apply( - lambda x: len(fasttext.tokenize(x)), - ) - caption_num_chars = dataframe["text"].apply(len) - - mask = (caption_num_words >= self.min_words_length) & ( - caption_num_chars >= self.min_characters_length - ) - dataframe = dataframe[mask] - return dataframe diff --git a/src/fondant/components/filter_text_length/tests/pytest.ini b/src/fondant/components/filter_text_length/tests/pytest.ini deleted file mode 100644 index bf6a8a517..000000000 --- a/src/fondant/components/filter_text_length/tests/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = ../src \ No newline at end of file diff --git a/src/fondant/components/filter_text_length/tests/requirements.txt b/src/fondant/components/filter_text_length/tests/requirements.txt deleted file mode 100644 index 2a929edcc..000000000 --- a/src/fondant/components/filter_text_length/tests/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==7.4.2 diff --git a/src/fondant/components/filter_text_length/tests/text_length_filter_test.py b/src/fondant/components/filter_text_length/tests/text_length_filter_test.py deleted file mode 100644 index e73824df2..000000000 --- a/src/fondant/components/filter_text_length/tests/text_length_filter_test.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Unit test for text length filter component.""" -import pandas as pd - -from src.main import FilterTextLengthComponent - - -def test_run_component_test(): - """Test text length filter component.""" - # Given: Dataframe with text with different lengths - data = [ - {"text": "To less words"}, - {"text": "Still to less chars"}, - {"text": "This a valid sentence which should be still there"}, - ] - - dataframe = pd.DataFrame(data) - - component = FilterTextLengthComponent( - min_characters_length=20, - min_words_length=4, - ) - dataframe = component.transform(dataframe=dataframe) - - # Then: dataframe only contains one row - assert len(dataframe) == 1 - assert ( - dataframe.loc[2]["text"] == "This a valid sentence which should be still there" - )