diff --git a/components/minhash_generator/Dockerfile b/components/minhash_generator/Dockerfile
new file mode 100644
index 000000000..abfa9a414
--- /dev/null
+++ b/components/minhash_generator/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=main
+RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+
+ENTRYPOINT ["python", "main.py"]
\ No newline at end of file
diff --git a/components/minhash_generator/fondant_component.yaml b/components/minhash_generator/fondant_component.yaml
new file mode 100644
index 000000000..f1a83ae38
--- /dev/null
+++ b/components/minhash_generator/fondant_component.yaml
@@ -0,0 +1,22 @@
+name: MinHash generator
+description: A component that generates minhashes of text.
+image: ghcr.io/ml6team/minhash_generator:latest
+
+consumes:
+  text:
+    fields:
+      data:
+        type: string
+
+produces:
+  text:
+    fields:
+      minhash:
+        type: array
+        items:
+          type: uint64
+args:
+  shingle_ngram_size:
+    description: Define size of ngram used for the shingle generation
+    type: int
+    default: 3
\ No newline at end of file
diff --git a/components/minhash_generator/requirements.txt b/components/minhash_generator/requirements.txt
new file mode 100644
index 000000000..396953e56
--- /dev/null
+++ b/components/minhash_generator/requirements.txt
@@ -0,0 +1,2 @@
+datasketch==1.5.9
+nltk==3.8.1
\ No newline at end of file
diff --git a/components/minhash_generator/src/main.py b/components/minhash_generator/src/main.py
new file mode 100644
index 000000000..2135ec3a2
--- /dev/null
+++ b/components/minhash_generator/src/main.py
@@ -0,0 +1,90 @@
+"""A component that generates minhashes of text."""
+import logging
+
+import numpy as np
+import pandas as pd
+from datasketch import MinHash
+from fondant.component import PandasTransformComponent
+from nltk.util import ngrams
+
+logger = logging.getLogger(__name__)
+
+# Shingle size used when none is configured; mirrors the default in fondant_component.yaml.
+DEFAULT_SHINGLE_NGRAM_SIZE = 3
+
+
+def create_shingles(text: str, ngram_size: int = DEFAULT_SHINGLE_NGRAM_SIZE) -> list:
+    """Create the word-level shingles that are used for the hash generation.
+
+    Args:
+        text: Text to split into whitespace-separated words.
+        ngram_size: Number of consecutive words per shingle.
+
+    Returns:
+        List of word tuples, one per shingle.
+    """
+    # Split text into words
+    words = text.split()
+
+    # Generate shingles of the requested size using nltk's ngrams function
+    return list(ngrams(words, ngram_size))
+
+
+def compute_minhash(shingles: list) -> np.ndarray:
+    """Calculate minhash based on the shingles.
+
+    Args:
+        shingles: Word tuples as returned by `create_shingles`.
+
+    Returns:
+        The hash values of a MinHash that was updated with every shingle. Without
+        shingles the MinHash is returned in its initial state.
+    """
+    minhash = MinHash()
+
+    # Update the MinHash object with the shingles
+    for shingle in shingles:
+        minhash.update(" ".join(shingle).encode("utf-8"))
+
+    return minhash.hashvalues
+
+
+class MinHashGeneratorComponent(PandasTransformComponent):
+    """Component generates minhashes of text."""
+
+    # Fallback so that `transform` also works when `setup` was never called.
+    shingle_ngram_size: int = DEFAULT_SHINGLE_NGRAM_SIZE
+
+    def setup(self, *, shingle_ngram_size: int):
+        """Setup component.
+
+        Args:
+            shingle_ngram_size: Defines size of ngram used for the shingle generation.
+        """
+        self.shingle_ngram_size = shingle_ngram_size
+
+    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        """
+        Generates minhash values of text.
+
+        Args:
+            dataframe: Pandas dataframe with a ("text", "data") column.
+
+        Returns:
+            The same dataframe, extended with a ("text", "shingles") and a
+            ("text", "minhash") column.
+        """
+        dataframe[("text", "shingles")] = dataframe[("text", "data")].apply(
+            create_shingles,
+            ngram_size=self.shingle_ngram_size,
+        )
+        dataframe[("text", "minhash")] = dataframe[("text", "shingles")].apply(
+            compute_minhash,
+        )
+
+        return dataframe
+
+
+if __name__ == "__main__":
+    component = MinHashGeneratorComponent.from_args()
+    component.run()
diff --git a/components/minhash_generator/tests/component_test.py b/components/minhash_generator/tests/component_test.py
new file mode 100644
index 000000000..b446a704c
--- /dev/null
+++ b/components/minhash_generator/tests/component_test.py
@@ -0,0 +1,39 @@
+"""Unit test for minhash generation component."""
+import pandas as pd
+from fondant.component_spec import ComponentSpec
+
+from components.minhash_generator.src.main import MinHashGeneratorComponent
+
+
+def test_run_component_test():
+    """Test MinHash generation."""
+    # Given: Dataframe with text, one duplicate in
+    data = [
+        {"data": "This is my first sentence"},
+        {"data": "This is my first sentence"},
+        {"data": "This is a different sentence"},
+    ]
+
+    dataframe = pd.concat({"text": pd.DataFrame(data)}, axis=1, names=["text", "data"])
+
+    # When: The minhash generator component processes the dataframe
+    spec = ComponentSpec.from_file("../fondant_component.yaml")
+
+    component = MinHashGeneratorComponent(
+        spec,
+        input_manifest_path="./dummy_input_manifest.json",
+        output_manifest_path="./dummy_input_manifest.json",
+        metadata={},
+        user_arguments={},
+    )
+    component.setup(shingle_ngram_size=3)
+
+    dataframe = component.transform(dataframe=dataframe)
+
+    # Then: duplicates agree on every minhash value, different texts on none
+    assert all(
+        dataframe.loc[0]["text"]["minhash"] == dataframe.loc[1]["text"]["minhash"],
+    )
+    assert not any(
+        dataframe.loc[0]["text"]["minhash"] == dataframe.loc[2]["text"]["minhash"],
+    )