-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LLM pipeline] Update text normalization component (#335)
Add minor improvements to the text normalization component, mainly based on the work of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf). Quality can be improved by removing specific patterns in single lines: > We analyse documents line-by-line, and discard or edit the lines based on the following rules: • If it is mainly composed of uppercase characters (discard); • If it is only composed of numerical characters (discard); • If it is a counter (e.g. 3 likes) (discard); • If it only contains one word (discard);
- Loading branch information
Showing
8 changed files
with
236 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,27 @@ | ||
FROM --platform=linux/amd64 python:3.8-slim | ||
FROM --platform=linux/amd64 python:3.8-slim as base | ||
|
||
## System dependencies | ||
RUN apt-get update && \ | ||
apt-get upgrade -y && \ | ||
apt-get install git -y | ||
|
||
RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/mrchtr/fondant | ||
|
||
# install requirements | ||
COPY requirements.txt / | ||
RUN pip3 install --no-cache-dir -r requirements.txt | ||
|
||
# Set the working directory to the component folder | ||
WORKDIR /component/src | ||
|
||
# Copy over src-files | ||
COPY src/ . | ||
COPY src/ src/ | ||
|
||
# Run component tests | ||
FROM base as test | ||
RUN pip3 install pytest # TODO add pytest to package setup | ||
ENV PYTHONPATH "${PYTHONPATH}:./src" | ||
COPY tests/ tests/ | ||
RUN ["python", "-m", "pytest", "tests/"] | ||
|
||
FROM base | ||
# Set the working directory to the component folder | ||
WORKDIR /src | ||
ENTRYPOINT ["python", "main.py"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Text normalization component | ||
|
||
This component implements several text normalization techniques to clean and preprocess textual data: | ||
|
||
- Apply lowercasing: Converts all text to lowercase | ||
- Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs | ||
- Apply NFC normalization: Converts characters to their canonical representation | ||
- Remove patterns commonly seen in webpages, following the implementation of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) | ||
- Remove punctuation: Strips punctuation marks from the text | ||
|
||
These text normalization techniques are valuable for preparing text data before using it for | ||
the training of large language models. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
ftfy==6.1.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import re | ||
|
||
|
||
def mainly_uppercase(line: str, threshold: float = 0.7) -> bool:
    """
    Tell whether uppercase characters make up at least ``threshold`` of a line.

    The share is computed over *all* characters of the line (whitespace, digits
    and punctuation included), not only over its letters.

    Args:
        line: The input line to check.
        threshold (float): Minimum share of uppercase characters (between 0 and 1)
            for the line to count as "mainly uppercase".

    Returns:
        bool: True if the share of uppercase characters is greater than or equal
        to ``threshold``; False otherwise, and always False for an empty line.
    """
    # Guard first: an empty line has no ratio to compute.
    if len(line) == 0:
        return False

    n_upper = len([char for char in line if char.isupper()])
    return n_upper / len(line) >= threshold
|
||
|
||
def only_numerical(line: str) -> bool:
    """
    Checks if a line is composed only of numerical characters.

    Leading and trailing whitespace (including a trailing newline) is ignored,
    so that lines such as ``" 42"`` or ``"42\\n"`` are recognised as numerical.
    This mirrors ``is_counter`` and ``is_one_word``, which are also insensitive
    to surrounding whitespace.

    Args:
        line: The input line to check.

    Returns:
        bool: True if, after stripping surrounding whitespace, the line is
        non-empty and consists only of digit characters; False otherwise.
    """
    # str.isdigit() is False for the empty string, so blank lines are rejected.
    return line.strip().isdigit()
|
||
|
||
def is_counter(line: str) -> bool:
    """
    Tell whether a line looks like a counter such as "3 likes".

    Args:
        line: The input line to check. Surrounding whitespace is ignored.

    Returns:
        bool: True if the stripped line is a run of digits, followed by
        whitespace, followed by exactly one whitespace-free token; False otherwise.
    """
    # Shape being matched: <number> <text>
    return re.match(r"^\d+\s+\S+$", line.strip()) is not None
|
||
|
||
def is_one_word(line: str) -> bool:
    """
    Tell whether a line consists of exactly one whitespace-delimited word.

    Args:
        line: The input line to check.

    Returns:
        bool: True if splitting the line on whitespace yields exactly one token;
        False otherwise (including for empty or whitespace-only lines).
    """
    return len(line.split()) == 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
|
||
import pandas as pd | ||
|
||
from src.main import TextNormalizationComponent | ||
|
||
|
||
def test_transform_custom_componen_test():
    """Check ``transform`` output with every normalization option enabled."""
    columns = [("text", "data")]

    raw_texts = [
        "\u0043\u0327 something",
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
    ]
    normalized_texts = [
        "\u00e7 something",
        "lorem ipsum dolor sit amet consectetur adipiscing elit",
        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
    ]

    component = TextNormalizationComponent(
        remove_additional_whitespaces=True,
        apply_nfc=True,
        remove_bad_patterns=True,
        do_lowercase=True,
        remove_punctuation=True,
    )

    output_dataframe = component.transform(pd.DataFrame(raw_texts, columns=columns))

    # dtype differences are deliberately ignored; only labels and values are compared.
    pd.testing.assert_frame_equal(
        left=pd.DataFrame(normalized_texts, columns=columns),
        right=output_dataframe,
        check_dtype=False,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
|
||
from src.utils import ( | ||
is_counter, | ||
is_one_word, | ||
mainly_uppercase, | ||
only_numerical, | ||
) | ||
|
||
|
||
def test_mainly_uppercase():
    """A line whose uppercase share exceeds the 0.5 threshold is flagged."""
    assert mainly_uppercase(
        "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE",
        threshold=0.5,
    )
|
||
def test_mainly_uppercase_under_threshold():
    """The same line is not flagged when the threshold is raised to 0.9."""
    assert not mainly_uppercase(
        "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE",
        threshold=0.9,
    )
|
||
def test_only_numerical():
    """A line made up solely of digits is numerical."""
    assert only_numerical("42")
|
||
def test_only_numerical_on_words():
    """A line mixing a number with words is not numerical."""
    assert not only_numerical("42 lorem ipsum")
|
||
def test_is_counter():
    """A "<number> <word>" line is recognised as a counter."""
    assert is_counter("13 Likes")
|
||
def test_is_not_counter():
    """A regular sentence that merely contains a number is not a counter."""
    assert not is_counter("Hello world! 42 people are part of ..")
|
||
def test_is_one_word():
    """A single token counts as one word."""
    assert is_one_word("word")
|
||
def test_is_not_one_word():
    """Two whitespace-separated tokens do not count as one word."""
    assert not is_one_word("two words")
|
||
|