[LLM pipeline] Update text normalization component (#335)

Add minor improvements to the text normalization component, mainly based on the work of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf). Quality can be improved by removing specific patterns in single lines: > We analyse documents line-by-line, and discard or edit the lines based on the following rules: • If it is mainly composed of uppercase characters (discard); • If it is only composed of numerical characters (discard); • If it is a counter (e.g. 3 likes) (discard); • If it only contains one word (discard).
ml6team · Aug 16, 2023 · e3e078d · e3e078d
1 parent 363769c
commit e3e078d
Show file tree

Hide file tree

Showing 8 changed files with 236 additions and 23 deletions.
diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile
@@ -1,18 +1,27 @@
-FROM --platform=linux/amd64 python:3.8-slim
+FROM --platform=linux/amd64 python:3.8-slim as base
 
 ## System dependencies
 RUN apt-get update && \
     apt-get upgrade -y && \
     apt-get install git -y
 
+RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/mrchtr/fondant
+
 # install requirements
 COPY requirements.txt /
 RUN pip3 install --no-cache-dir -r requirements.txt
 
-# Set the working directory to the component folder
-WORKDIR /component/src
-
 # Copy over src-files
-COPY src/ .
+COPY src/ src/
 
+# Run component tests
+FROM base as test
+RUN pip3 install pytest # TODO add pytest to package setup
+ENV PYTHONPATH "${PYTHONPATH}:./src"
+COPY tests/ tests/
+RUN ["python", "-m", "pytest", "tests/"]
+
+FROM base
+# Set the working directory to the component folder
+WORKDIR /src
 ENTRYPOINT ["python", "main.py"]
diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md
@@ -0,0 +1,12 @@
+# Text normalization component
+
+This component implements several text normalization techniques to clean and preprocess textual data:
+
+- Apply lowercasing: Converts all text to lowercase
+- Remove unnecessary whitespace: Collapses consecutive spaces, tabs, and newlines into a single space and strips leading/trailing whitespace
+- Apply NFC normalization: Converts characters to their canonical representation
+- Remove patterns commonly seen in webpages, following the approach of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf)
+- Remove punctuation: Strips punctuation marks from the text
+
+These text normalization techniques are valuable for preparing text data before using it for 
+the training of large language models.
diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml
@@ -9,12 +9,18 @@ consumes:
         type: string
 
 args:
+  remove_additional_whitespaces:
+    description: If true, collapse consecutive whitespace (spaces, tabs, newlines) into a single space and strip leading/trailing whitespace.
+    type: bool
   apply_nfc:
     description: If true apply nfc normalization
     type: bool
+  normalize_lines:
+    description: If true, analyze documents line-by-line and apply various rules to discard or edit lines. Used to remove common patterns in webpages, e.g. counters
+    type: bool
   do_lowercase:
     description: If true apply lowercasing
     type: bool
-  characters_to_remove:
-    description: List of characters which will be removed, e.g. [?,.!,@#%]
-    type: list
+  remove_punctuation:
+    description: If true, punctuation will be removed
+    type: str
diff --git a/components/text_normalization/requirements.txt b/components/text_normalization/requirements.txt
@@ -0,0 +1 @@
+ftfy==6.1.1
diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py
@@ -1,28 +1,58 @@
 """A component that normalizes text."""
 import logging
 import re
-import unicodedata
+import string
 from typing import List
 
+import ftfy
 import pandas as pd
 from fondant.component import PandasTransformComponent
 from fondant.executor import PandasTransformExecutor
+from utils import is_counter, is_one_word, mainly_uppercase, only_numerical
 
 logger = logging.getLogger(__name__)
 
 
+def _remove_punctuation(text):
+    """Remove punctuation in given text."""
+    return text.translate(str.maketrans("", "", string.punctuation))
+
+
+def _remove_additional_whitespaces(text):
+    """
+    Text cleaning method from slimpajama approach.
+    https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py
+    Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle
+    and in the beginning / end.
+    """
+    return re.sub(r"\s+", " ", text.strip())
+
+
+def normalize_lines(text):
+    def any_condition_met(line, discard_condition_functions):
+        return any(condition(line) for condition in discard_condition_functions)
+
+    discard_conditions = [mainly_uppercase, only_numerical, is_counter, is_one_word]
+    return " ".join(
+        [line for line in text.split("\n") if not any_condition_met(line, discard_conditions)])
+
+
 class TextNormalizationComponent(PandasTransformComponent):
     """Component that normalizes text."""
 
-    def __init__(self, *args, apply_nfc: bool, do_lowercase: bool, characters_to_remove: List[str]):
+    def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool,
+                 remove_bad_patterns: bool,
+                 do_lowercase: bool, remove_punctuation: bool):
+        self.remove_additional_whitespaces = remove_additional_whitespaces
         self.apply_nfc = apply_nfc
+        self.normalize_lines = remove_bad_patterns
         self.do_lowercase = do_lowercase
-        self.characters_to_remove = characters_to_remove
+        self.remove_punctuation = remove_punctuation
 
     @staticmethod
     def _do_nfc_normalization(text: str):
         """Apply nfc normalization to the text of the dataframe."""
-        return unicodedata.normalize("NFC", text)
+        return ftfy.fix_text(text, normalization="NFC")
 
     @staticmethod
     def _remove_patterns(regex_patterns: List[str], text: str):
@@ -36,26 +66,38 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         Apply normalization transformations. The component is capable of:
         - NFC normalization
         - Lowercasing
-        - Removing of regex patterns.
+        - Removing of unnecessary whitespaces (e.g. tabs), punctuation
+        - Apply line-wise transformations that exclude lines matching specified patterns.
+        Patterns include lines that are mainly composed of uppercase characters, lines that consist
+        only of numerical characters, lines that are counters (e.g., "3 likes"), and lines
+        that contain only one word.
 
         Args:
             dataframe: Pandas dataframe.
 
         Returns:
             Pandas dataframe
         """
-        if self.apply_nfc:
-            dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x))
+        if self.normalize_lines:
+            dataframe[("text", "data")] = dataframe[("text", "data")].apply(
+                normalize_lines)
 
         if self.do_lowercase:
-            dataframe["text"]["data"].apply(lambda x: x.lower())
-
-        if len(self.characters_to_remove) > 0:
-            dataframe["text"]["data"].apply(
-                lambda x: self._remove_patterns(
-                    self.characters_to_remove, x,
-                ),
-            )
+            dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower())
+
+        if self.apply_nfc:
+            dataframe[("text", "data")] = dataframe[("text", "data")].apply(
+                self._do_nfc_normalization)
+
+        if self.remove_punctuation:
+            dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_punctuation)
+
+        if self.remove_additional_whitespaces:
+            dataframe[("text", "data")] = dataframe[("text", "data")].apply(
+                _remove_additional_whitespaces)
+
+        # remove all empty rows
+        dataframe = dataframe[dataframe[("text", "data")].astype(bool)]
 
         return dataframe
 

diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py
@@ -0,0 +1,65 @@
+import re
+
+
+def mainly_uppercase(line: str, threshold: float = 0.7) -> bool:
+    """
+    Checks if a line is mainly composed of uppercase characters.
+
+    Args:
+        line: The input line to check.
+        threshold (float): The threshold (between 0 and 1) to determine what is considered
+        "mainly uppercase."
+
+    Returns:
+        bool: True if the line is mainly uppercase, False otherwise.
+    """
+    uppercase_count = sum(1 for char in line if char.isupper())
+    total_chars = len(line)
+    if total_chars == 0:
+        return False
+
+    uppercase_ratio = uppercase_count / total_chars
+    return uppercase_ratio >= threshold
+
+
+def only_numerical(line: str) -> bool:
+    """
+    Checks if a line is composed only of numerical characters.
+
+    Args:
+        line: The input line to check.
+
+    Returns:
+        bool: True if the line is only composed of numerical characters, False otherwise.
+    """
+    return line.isdigit()
+
+
+def is_counter(line: str) -> bool:
+    """
+    Checks if a line represents a counter (e.g., "3 likes").
+
+    Args:
+        line: The input line to check.
+
+    Returns:
+        bool: True if the line represents a counter, False otherwise.
+    """
+    # Use regular expression to check for the pattern: <number> <text>
+    line = line.strip()
+    pattern = r"^\d+\s+\S+$"
+    return re.match(pattern, line) is not None
+
+
+def is_one_word(line: str) -> bool:
+    """
+    Checks if a line contains only one word.
+
+    Args:
+        line: The input line to check.
+
+    Returns:
+        bool: True if the line contains only one word, False otherwise.
+    """
+    words = line.split()
+    return len(words) == 1
diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py
@@ -0,0 +1,36 @@
+
+import pandas as pd
+
+from src.main import TextNormalizationComponent
+
+
+def test_transform_custom_componen_test():
+    """Test components transform method."""
+    user_arguments = {
+        "remove_additional_whitespaces": True,
+        "apply_nfc": True,
+        "remove_bad_patterns": True,
+        "do_lowercase": True,
+        "remove_punctuation": True,
+    }
+    component = TextNormalizationComponent(**user_arguments)
+
+    input_dataframe = pd.DataFrame([
+        "\u0043\u0327 something",
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+    ], columns=[("text", "data")])
+
+    expected_output = pd.DataFrame([
+        "\u00e7 something",
+        "lorem ipsum dolor sit amet consectetur adipiscing elit",
+        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
+    ], columns=[("text", "data")])
+
+    output_dataframe = component.transform(input_dataframe)
+
+    pd.testing.assert_frame_equal(
+        left=expected_output,
+        right=output_dataframe,
+        check_dtype=False,
+    )
diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py
@@ -0,0 +1,42 @@
+
+from src.utils import (
+    is_counter,
+    is_one_word,
+    mainly_uppercase,
+    only_numerical,
+)
+
+
+def test_mainly_uppercase():
+    line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE"
+    assert mainly_uppercase(line, threshold=0.5)
+
+def test_mainly_uppercase_under_threshold():
+    line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE"
+    assert not mainly_uppercase(line, threshold=0.9)
+
+def test_only_numerical():
+    line = "42"
+    assert only_numerical(line)
+
+def test_only_numerical_on_words():
+    line = "42 lorem ipsum"
+    assert not only_numerical(line)
+
+def test_is_counter():
+    line = "13 Likes"
+    assert is_counter(line)
+
+def test_is_not_counter():
+    line = "Hello world! 42 people are part of .."
+    assert not is_counter(line)
+
+def test_is_one_word():
+    line = "word"
+    assert is_one_word(line)
+
+def test_is_not_one_word():
+    line = "two words"
+    assert not is_one_word(line)
+
+