Skip to content

Commit

Permalink
[LLM pipeline] Update text normalization component (#335)
Browse files Browse the repository at this point in the history
Add minor improvements to the text normalization component, mainly based
on the work of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf).

Quality can be improved by removing specific patterns in single lines:
> We analyse documents line-by-line, and
discard or edit the lines based on the following rules:
• If it is mainly composed of uppercase characters (discard);
• If it is only composed of numerical characters (discard);
• If it is a counter (e.g. 3 likes) (discard);
• If it only contains one word (discard);
  • Loading branch information
mrchtr authored Aug 16, 2023
1 parent 363769c commit e3e078d
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 23 deletions.
19 changes: 14 additions & 5 deletions components/text_normalization/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
FROM --platform=linux/amd64 python:3.8-slim
FROM --platform=linux/amd64 python:3.8-slim as base

## System dependencies
RUN apt-get update && \
apt-get upgrade -y && \
apt-get install git -y

RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/mrchtr/fondant

# install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Set the working directory to the component folder
WORKDIR /component/src

# Copy over src-files
COPY src/ .
COPY src/ src/

# Run component tests
FROM base as test
RUN pip3 install pytest # TODO add pytest to package setup
ENV PYTHONPATH "${PYTHONPATH}:./src"
COPY tests/ tests/
RUN ["python", "-m", "pytest", "tests/"]

FROM base
# Set the working directory to the component folder
WORKDIR /src
ENTRYPOINT ["python", "main.py"]
12 changes: 12 additions & 0 deletions components/text_normalization/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Text normalization component

This component implements several text normalization techniques to clean and preprocess textual data:

- Apply lowercasing: Converts all text to lowercase
- Remove unnecessary whitespace: Collapses runs of whitespace (spaces, tabs, newlines) into a single space and trims the ends
- Apply NFC normalization: Converts characters to their canonical representation
- Remove commonly seen patterns in web pages, following the implementation of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf)
- Remove punctuation: Strips punctuation marks from the text

These text normalization techniques are valuable for preparing text data before using it for
the training of large language models.
12 changes: 9 additions & 3 deletions components/text_normalization/fondant_component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@ consumes:
type: string

args:
remove_additional_whitespaces:
description: If true remove all additional whitespace, tabs.
type: bool
apply_nfc:
description: If true apply nfc normalization
type: bool
# NOTE(review): the constructor in src/main.py declares this flag as `remove_bad_patterns`
# (and stores it as `self.normalize_lines`) - confirm the argument names are meant to match.
normalize_lines:
description: If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter
type: bool
do_lowercase:
description: If true apply lowercasing
type: bool
characters_to_remove:
description: List of characters which will be removed, e.g. [?,.!,@#%]
type: list
# Typed as bool to match the "If true ..." description and the `remove_punctuation: bool`
# annotation on the component constructor in src/main.py.
remove_punctuation:
  description: If true punctuation will be removed
  type: bool
1 change: 1 addition & 0 deletions components/text_normalization/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ftfy==6.1.1
72 changes: 57 additions & 15 deletions components/text_normalization/src/main.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,58 @@
"""A component that normalizes text."""
import logging
import re
import unicodedata
import string
from typing import List

import ftfy
import pandas as pd
from fondant.component import PandasTransformComponent
from fondant.executor import PandasTransformExecutor
from utils import is_counter, is_one_word, mainly_uppercase, only_numerical

logger = logging.getLogger(__name__)


def _remove_punctuation(text):
"""Remove punctuation in given text."""
return text.translate(str.maketrans("", "", string.punctuation))


def _remove_additional_whitespaces(text):
"""
Text cleaning method from slimpajama approach.
https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py
Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle
and in the beginning / end.
"""
return re.sub(r"\s+", " ", text.strip())


def normalize_lines(text):
    """Drop unwanted lines from a document and join the remaining ones.

    The text is split on newline characters. A line is discarded as soon as one
    of the rules (mainly uppercase, only numerical, counter, single word)
    matches it; the rules are evaluated in that order. The surviving lines are
    joined with a single space.
    """
    discard_rules = (mainly_uppercase, only_numerical, is_counter, is_one_word)

    kept_lines = []
    for candidate in text.split("\n"):
        if any(rule(candidate) for rule in discard_rules):
            continue
        kept_lines.append(candidate)

    return " ".join(kept_lines)


class TextNormalizationComponent(PandasTransformComponent):
"""Component that normalizes text."""

def __init__(self, *args, apply_nfc: bool, do_lowercase: bool, characters_to_remove: List[str]):
def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool,
remove_bad_patterns: bool,
do_lowercase: bool, remove_punctuation: bool):
self.remove_additional_whitespaces = remove_additional_whitespaces
self.apply_nfc = apply_nfc
self.normalize_lines = remove_bad_patterns
self.do_lowercase = do_lowercase
self.characters_to_remove = characters_to_remove
self.remove_punctuation = remove_punctuation

@staticmethod
def _do_nfc_normalization(text: str):
"""Apply nfc normalization to the text of the dataframe."""
return unicodedata.normalize("NFC", text)
return ftfy.fix_text(text, normalization="NFC")

@staticmethod
def _remove_patterns(regex_patterns: List[str], text: str):
Expand All @@ -36,26 +66,38 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
Apply normalization transformations. The component is capable of:
- NFC normalization
- Lowercasing
- Removing of regex patterns.
- Removing of unnecessary whitespaces (e.g. tabs), punctuation
- Apply line-wise transformations that exclude lines matching specified patterns.
Patterns include lines that are mainly composed of uppercase characters, lines that consist
only of numerical characters, lines that are counters (e.g., "3 likes"), and lines
that contain only one word.
Args:
dataframe: Pandas dataframe.
Returns:
Pandas dataframe
"""
if self.apply_nfc:
dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x))
if self.normalize_lines:
dataframe[("text", "data")] = dataframe[("text", "data")].apply(
normalize_lines)

if self.do_lowercase:
dataframe["text"]["data"].apply(lambda x: x.lower())

if len(self.characters_to_remove) > 0:
dataframe["text"]["data"].apply(
lambda x: self._remove_patterns(
self.characters_to_remove, x,
),
)
dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower())

if self.apply_nfc:
dataframe[("text", "data")] = dataframe[("text", "data")].apply(
self._do_nfc_normalization)

if self.remove_punctuation:
dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_punctuation)

if self.remove_additional_whitespaces:
dataframe[("text", "data")] = dataframe[("text", "data")].apply(
_remove_additional_whitespaces)

# remove all empty rows
dataframe = dataframe[dataframe[("text", "data")].astype(bool)]

return dataframe

Expand Down
65 changes: 65 additions & 0 deletions components/text_normalization/src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import re


def mainly_uppercase(line: str, threshold: float = 0.7) -> bool:
    """
    Tell whether uppercase characters make up most of a line.

    The ratio is computed over every character of the line, so whitespace,
    digits and punctuation count towards the total.

    Args:
        line: The input line to check.
        threshold (float): Minimum share of uppercase characters (between 0 and 1)
            for the line to be considered "mainly uppercase."
    Returns:
        bool: True if the uppercase ratio reaches the threshold, False otherwise.
        An empty line is never considered mainly uppercase.
    """
    if not line:
        return False

    uppercase_total = len([char for char in line if char.isupper()])
    return uppercase_total / len(line) >= threshold


def only_numerical(line: str) -> bool:
    """
    Checks if a line is composed only of numerical characters.

    Surrounding whitespace is ignored, so a line such as ``" 42 "`` or one that
    still carries a trailing ``"\\r"`` after splitting on ``"\\n"`` is treated
    like ``"42"``. This mirrors ``is_counter``, which also strips the line
    before matching.

    Args:
        line: The input line to check.
    Returns:
        bool: True if the stripped line is non-empty and consists only of digit
        characters (as defined by ``str.isdigit``), False otherwise.
    """
    return line.strip().isdigit()


def is_counter(line: str) -> bool:
    """
    Tell whether a line looks like a counter (e.g., "3 likes").

    Args:
        line: The input line to check.
    Returns:
        bool: True if the line, ignoring surrounding whitespace, is a number
        followed by whitespace and exactly one more token; False otherwise.
    """
    # Expected shape after trimming: <number> <text>
    counter_pattern = r"^\d+\s+\S+$"
    trimmed = line.strip()
    return bool(re.match(counter_pattern, trimmed))


def is_one_word(line: str) -> bool:
    """
    Tell whether a line consists of exactly one whitespace-separated word.

    Args:
        line: The input line to check.
    Returns:
        bool: True if splitting the line on whitespace yields exactly one token,
        False otherwise (including for empty or whitespace-only lines).
    """
    return len(line.split()) == 1
36 changes: 36 additions & 0 deletions components/text_normalization/tests/component_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

import pandas as pd

from src.main import TextNormalizationComponent


def test_transform_custom_componen_test():
    """Run ``transform`` with every normalization option enabled and compare the result."""
    text_column = ("text", "data")

    raw_rows = [
        "\u0043\u0327 something",
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
    ]
    normalized_rows = [
        "\u00e7 something",
        "lorem ipsum dolor sit amet consectetur adipiscing elit",
        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
    ]

    component = TextNormalizationComponent(
        remove_additional_whitespaces=True,
        apply_nfc=True,
        remove_bad_patterns=True,
        do_lowercase=True,
        remove_punctuation=True,
    )

    output_dataframe = component.transform(
        pd.DataFrame(raw_rows, columns=[text_column]),
    )
    expected_output = pd.DataFrame(normalized_rows, columns=[text_column])

    pd.testing.assert_frame_equal(
        left=expected_output,
        right=output_dataframe,
        check_dtype=False,
    )
42 changes: 42 additions & 0 deletions components/text_normalization/tests/utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

from src.utils import (
is_counter,
is_one_word,
mainly_uppercase,
only_numerical,
)


def test_mainly_uppercase():
    """The sample line is flagged when the threshold is 0.5."""
    assert mainly_uppercase(
        "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE",
        threshold=0.5,
    )


def test_mainly_uppercase_under_threshold():
    """The same sample line is not flagged when the threshold is 0.9."""
    assert not mainly_uppercase(
        "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE",
        threshold=0.9,
    )


def test_only_numerical():
    """A line made of digits only is numerical."""
    assert only_numerical("42")


def test_only_numerical_on_words():
    """A line that mixes digits with words is not numerical."""
    assert not only_numerical("42 lorem ipsum")


def test_is_counter():
    """A number followed by a single token is a counter."""
    assert is_counter("13 Likes")


def test_is_not_counter():
    """A sentence that merely contains a number is not a counter."""
    assert not is_counter("Hello world! 42 people are part of ..")


def test_is_one_word():
    """A single token counts as one word."""
    assert is_one_word("word")


def test_is_not_one_word():
    """Two tokens do not count as one word."""
    assert not is_one_word("two words")


0 comments on commit e3e078d

Please sign in to comment.