diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile index 605adc7e9..eb96b604a 100644 --- a/components/text_normalization/Dockerfile +++ b/components/text_normalization/Dockerfile @@ -1,18 +1,27 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 python:3.8-slim as base ## System dependencies RUN apt-get update && \ apt-get upgrade -y && \ apt-get install git -y +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/mrchtr/fondant + # install requirements COPY requirements.txt / RUN pip3 install --no-cache-dir -r requirements.txt -# Set the working directory to the component folder -WORKDIR /component/src - # Copy over src-files -COPY src/ . +COPY src/ src/ +# Run component tests +FROM base as test +RUN pip3 install pytest # TODO add pytest to package setup +ENV PYTHONPATH "${PYTHONPATH}:./src" +COPY tests/ tests/ +RUN ["python", "-m", "pytest", "tests/"] + +FROM base +# Set the working directory to the component folder +WORKDIR /src ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md new file mode 100644 index 000000000..7b01ecbd7 --- /dev/null +++ b/components/text_normalization/README.md @@ -0,0 +1,12 @@ +# Text normalization component + +This component implements several text normalization techniques to clean and preprocess textual data: + +- Apply lowercasing: Converts all text to lowercase +- Remove unnecessary whitespaces: Eliminates extra whitespace between words, e.g. tabs +- Apply NFC normalization: Converts characters to their canonical representation +- Remove commonly seen patterns in webpages following the implementation of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) +- Remove punctuation: Strips punctuation marks from the text + +These text normalization techniques are valuable for preparing text data before using it for +the training of large language models. 
\ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 6119e914d..f9d2bfabb 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -9,12 +9,18 @@ consumes: type: string args: + remove_additional_whitespaces: + description: If true remove all additional whitespace, tabs. + type: bool apply_nfc: description: If true apply nfc normalization type: bool + remove_bad_patterns: + description: If true analyze documents line-by-line and apply various rules to discard lines. Used to remove common patterns in webpages, e.g. counters + type: bool do_lowercase: description: If true apply lowercasing type: bool - characters_to_remove: - description: List of characters which will be removed, e.g. [?,.!,@#%] - type: list \ No newline at end of file + remove_punctuation: + description: If true punctuation will be removed + type: bool \ No newline at end of file diff --git a/components/text_normalization/requirements.txt b/components/text_normalization/requirements.txt index e69de29bb..a4299def8 100644 --- a/components/text_normalization/requirements.txt +++ b/components/text_normalization/requirements.txt @@ -0,0 +1 @@ +ftfy==6.1.1 \ No newline at end of file diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index a3c415717..662c8023c 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -1,28 +1,58 @@ """A component that normalizes text.""" import logging import re -import unicodedata +import string from typing import List +import ftfy import pandas as pd from fondant.component import PandasTransformComponent from fondant.executor import PandasTransformExecutor +from utils import is_counter, is_one_word, mainly_uppercase, only_numerical logger = logging.getLogger(__name__) +def _remove_punctuation(text): + 
"""Remove all characters in string.punctuation from the given text.""" + return text.translate(str.maketrans("", "", string.punctuation)) + + +def _remove_additional_whitespaces(text): + """ + Text cleaning method from slimpajama approach. + https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py + Collapse consecutive spaces, newlines, tabs in the middle into a single space + and strip them at the beginning / end. + """ + return re.sub(r"\s+", " ", text.strip()) + + +def normalize_lines(text): + def any_condition_met(line, discard_condition_functions): + return any(condition(line) for condition in discard_condition_functions) + + discard_conditions = [mainly_uppercase, only_numerical, is_counter, is_one_word] + return " ".join( + [line for line in text.split("\n") if not any_condition_met(line, discard_conditions)]) + + class TextNormalizationComponent(PandasTransformComponent): """Component that normalizes text.""" - def __init__(self, *args, apply_nfc: bool, do_lowercase: bool, characters_to_remove: List[str]): + def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, + remove_bad_patterns: bool, + do_lowercase: bool, remove_punctuation: bool): + self.remove_additional_whitespaces = remove_additional_whitespaces self.apply_nfc = apply_nfc + self.normalize_lines = remove_bad_patterns self.do_lowercase = do_lowercase - self.characters_to_remove = characters_to_remove + self.remove_punctuation = remove_punctuation @staticmethod def _do_nfc_normalization(text: str): """Apply nfc normalization to the text of the dataframe.""" - return unicodedata.normalize("NFC", text) + return ftfy.fix_text(text, normalization="NFC") @staticmethod def _remove_patterns(regex_patterns: List[str], text: str): @@ -36,7 +66,11 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Apply normalization transformations. The component is capable of: - NFC normalization - Lowercasing - - Removing of regex patterns. 
+ - Removing unnecessary whitespace (e.g. tabs) and punctuation + - Apply line-wise transformations that exclude lines matching specified patterns. + Patterns include lines that are mainly composed of uppercase characters, lines that consist + only of numerical characters, lines that are counters (e.g., "3 likes"), and lines + that contain only one word. Args: dataframe: Pandas dataframe. @@ -44,18 +78,26 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - if self.apply_nfc: - dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x)) + if self.normalize_lines: + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + normalize_lines) if self.do_lowercase: - dataframe["text"]["data"].apply(lambda x: x.lower()) - - if len(self.characters_to_remove) > 0: - dataframe["text"]["data"].apply( - lambda x: self._remove_patterns( - self.characters_to_remove, x, - ), - ) + dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) + + if self.apply_nfc: + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + self._do_nfc_normalization) + + if self.remove_punctuation: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_punctuation) + + if self.remove_additional_whitespaces: + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + _remove_additional_whitespaces) + + # remove all rows whose text is empty after normalization + dataframe = dataframe[dataframe[("text", "data")].astype(bool)] return dataframe diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py new file mode 100644 index 000000000..b487bc61e --- /dev/null +++ b/components/text_normalization/src/utils.py @@ -0,0 +1,65 @@ +import re + + +def mainly_uppercase(line: str, threshold: float = 0.7) -> bool: + """ + Checks if a line is mainly composed of uppercase characters. + + Args: + line: The input line to check. 
+ threshold (float): Minimum ratio (between 0 and 1) of uppercase characters to all characters + in the line (whitespace included) to be considered "mainly uppercase." + + Returns: + bool: True if the line is mainly uppercase, False otherwise (also for an empty line). + """ + uppercase_count = sum(1 for char in line if char.isupper()) + total_chars = len(line) + if total_chars == 0: + return False + + uppercase_ratio = uppercase_count / total_chars + return uppercase_ratio >= threshold + + +def only_numerical(line: str) -> bool: + """ + Checks if a line is composed only of numerical characters. + + Args: + line: The input line to check. + + Returns: + bool: True if the line is non-empty and consists only of digits (str.isdigit), False otherwise. + """ + return line.isdigit() + + +def is_counter(line: str) -> bool: + """ + Checks if a line represents a counter (e.g., "3 likes"). + + Args: + line: The input line to check. + + Returns: + bool: True if the line represents a counter, False otherwise. + """ + # Use regular expression to check for the pattern: digits, whitespace, one token + line = line.strip() + pattern = r"^\d+\s+\S+$" + return re.match(pattern, line) is not None + + +def is_one_word(line: str) -> bool: + """ + Checks if a line contains only one word. + + Args: + line: The input line to check. + + Returns: + bool: True if the line contains exactly one whitespace-separated word, False otherwise. 
+ """ + words = line.split() + return len(words) == 1 diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py new file mode 100644 index 000000000..34ce528aa --- /dev/null +++ b/components/text_normalization/tests/component_test.py @@ -0,0 +1,36 @@ + +import pandas as pd + +from src.main import TextNormalizationComponent + + +def test_transform_custom_componen_test(): + """Test the component's transform method with all normalization steps enabled.""" + user_arguments = { + "remove_additional_whitespaces": True, + "apply_nfc": True, + "remove_bad_patterns": True, + "do_lowercase": True, + "remove_punctuation": True, + } + component = TextNormalizationComponent(**user_arguments) + + input_dataframe = pd.DataFrame([ + "\u0043\u0327 something", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.", + ], columns=[("text", "data")]) + + expected_output = pd.DataFrame([ + "\u00e7 something", + "lorem ipsum dolor sit amet consectetur adipiscing elit", + "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus", + ], columns=[("text", "data")]) + + output_dataframe = component.transform(input_dataframe) + + pd.testing.assert_frame_equal( + left=expected_output, + right=output_dataframe, + check_dtype=False, + ) diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py new file mode 100644 index 000000000..44f360676 --- /dev/null +++ b/components/text_normalization/tests/utils_test.py @@ -0,0 +1,42 @@ + +from src.utils import ( + is_counter, + is_one_word, + mainly_uppercase, + only_numerical, +) + + +def test_mainly_uppercase(): + line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" + assert mainly_uppercase(line, threshold=0.5) + +def test_mainly_uppercase_under_threshold(): + line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" + assert not mainly_uppercase(line, threshold=0.9) + +def 
test_only_numerical(): + line = "42" + assert only_numerical(line) + +def test_only_numerical_on_words(): + line = "42 lorem ipsum" + assert not only_numerical(line) + +def test_is_counter(): + line = "13 Likes" + assert is_counter(line) + +def test_is_not_counter(): + line = "Hello world! 42 people are part of .." + assert not is_counter(line) + +def test_is_one_word(): + line = "word" + assert is_one_word(line) + +def test_is_not_one_word(): + line = "two words" + assert not is_one_word(line) + +