From a2c62394393f56525396da7acc661b6f563e9a84 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Tue, 1 Aug 2023 13:16:12 +0200 Subject: [PATCH 1/9] Add readme and component cleaning --- .../text_normalization/requirements.txt | 1 + components/text_normalization/src/main.py | 50 +++++++++- .../src/resources/de_bad_patterns.txt | 6 ++ components/text_normalization/src/utils.py | 95 +++++++++++++++++++ .../tests/component_test.py | 54 +++++++++++ .../tests/fixtures/en_text_normalization.json | 17 ++++ 6 files changed, 218 insertions(+), 5 deletions(-) create mode 100644 components/text_normalization/src/resources/de_bad_patterns.txt create mode 100644 components/text_normalization/src/utils.py create mode 100644 components/text_normalization/tests/component_test.py create mode 100644 components/text_normalization/tests/fixtures/en_text_normalization.json diff --git a/components/text_normalization/requirements.txt b/components/text_normalization/requirements.txt index e69de29bb..9e5daac86 100644 --- a/components/text_normalization/requirements.txt +++ b/components/text_normalization/requirements.txt @@ -0,0 +1 @@ +ftfy=6.1.1 \ No newline at end of file diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index a3c415717..0e0ff52a4 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -1,9 +1,10 @@ """A component that normalizes text.""" import logging import re -import unicodedata +import string from typing import List +import ftfy import pandas as pd from fondant.component import PandasTransformComponent from fondant.executor import PandasTransformExecutor @@ -11,6 +12,39 @@ logger = logging.getLogger(__name__) +def clean(text, remove_punctuation=True): + """ + Text cleaning method from slimpajama approach. + https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py + Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle + and in the beginning / end. + + Args: + - text: text to be cleaned + """ + # remove punctuation + if remove_punctuation: + text = text.translate(str.maketrans("", "", string.punctuation)) + + # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end + text = re.sub(r"\s+", " ", text.strip()) + return text + +def remove_noisy_lines(text, language): + """ + !!! and note that they require adaptation across languages !!! + • If it is short (≤ 10 words) and matches a pattern (edit): + - At the beginning of the line (e.g. sign-in); + - At the end of the line (e.g. Read more...); + - Anywhere in the line (e.g. items in cart). 
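+
+    These line-wise heuristics follow the webpage-cleaning rules described by
+    Penedo et al. (https://arxiv.org/pdf/2306.01116.pdf); the match patterns are
+    read from a language-specific resource file (e.g. resources/de_bad_patterns.txt),
+    which is why they need adaptation per language.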
+ """ + language + "bad_patterns.txt" + + def any_condition_met(line, discard_condition_functions): + return any(condition(line) for condition in discard_condition_functions) + + return " ".join([line for line in text.split("\n") if not any_condition_met]) + class TextNormalizationComponent(PandasTransformComponent): """Component that normalizes text.""" @@ -18,11 +52,12 @@ def __init__(self, *args, apply_nfc: bool, do_lowercase: bool, characters_to_rem self.apply_nfc = apply_nfc self.do_lowercase = do_lowercase self.characters_to_remove = characters_to_remove + self.default_cleaning = True @staticmethod def _do_nfc_normalization(text: str): """Apply nfc normalization to the text of the dataframe.""" - return unicodedata.normalize("NFC", text) + return ftfy.fix_text(text, normalization="NFC") @staticmethod def _remove_patterns(regex_patterns: List[str], text: str): @@ -44,14 +79,19 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ + dataframe[("text", "data")] = dataframe["text"]["data"].apply(remove_noisy_lines) + if self.apply_nfc: - dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x)) + dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x)) if self.do_lowercase: - dataframe["text"]["data"].apply(lambda x: x.lower()) + dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: x.lower()) + + if self.default_cleaning: + dataframe[("text", "data")] = dataframe["text"]["data"].apply(clean) if len(self.characters_to_remove) > 0: - dataframe["text"]["data"].apply( + dataframe[("text", "data")] = dataframe["text"]["data"].apply( lambda x: self._remove_patterns( self.characters_to_remove, x, ), diff --git a/components/text_normalization/src/resources/de_bad_patterns.txt b/components/text_normalization/src/resources/de_bad_patterns.txt new file mode 100644 index 000000000..2504b1974 --- /dev/null +++ b/components/text_normalization/src/resources/de_bad_patterns.txt @@ -0,0 +1,6 @@ +Weiterlesen +Startseite +Einkaufswagen +Konto +Zum Einkaufswagen hinzufügen +Zum Warenkorb hinzufügen \ No newline at end of file diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py new file mode 100644 index 000000000..f6dff6d21 --- /dev/null +++ b/components/text_normalization/src/utils.py @@ -0,0 +1,95 @@ +import re + + +def mainly_uppercase(line, threshold=0.7): + """ + Checks if a line is mainly composed of uppercase characters. + + Args: + line (str): The input line to check. + threshold (float): The threshold (between 0 and 1) to determine what is considered "mainly uppercase." + + Returns: + bool: True if the line is mainly uppercase, False otherwise. + """ + uppercase_count = sum(1 for char in line if char.isupper()) + total_chars = len(line) + if total_chars == 0: + return False + + uppercase_ratio = uppercase_count / total_chars + return uppercase_ratio >= threshold + +def only_numerical(line): + """ + Checks if a line is composed only of numerical characters. + + Args: + line (str): The input line to check. + + Returns: + bool: True if the line is only composed of numerical characters, False otherwise. + """ + return line.isdigit() + +def is_counter(line): + """ + Checks if a line represents a counter (e.g., "3 likes"). + + Args: + line (str): The input line to check. + + Returns: + bool: True if the line represents a counter, False otherwise. 
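+
+    Example (illustrative):
+        >>> is_counter("3 likes")
+        True
+        >>> is_counter("I counted 3 likes")
+        False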
+ """ + # Use regular expression to check for the pattern: + pattern = r"^\d+\s+\S+$" + return re.match(pattern, line) is not None + +def is_one_word(line): + """ + Checks if a line contains only one word. + + Args: + line (str): The input line to check. + + Returns: + bool: True if the line contains only one word, False otherwise. + """ + words = line.split() + return len(words) == 1 + + +def read_patterns_from_file(file_path): + """ + Read patterns from a text file. + + Args: + file_path (str): The path to the text file containing patterns. + + Returns: + list: A list of patterns read from the file. + """ + with open(file_path) as file: + return [pattern.strip() for pattern in file] +def is_short_and_matches_pattern(line, pattern_file_path, max_words=10): + """ + Checks if a line is short (< max_words) and matches the given pattern. + + Args: + line (str): The input line to check. + max_words (int): The maximum number of words allowed in the line (default is 10). + + Returns: + bool: True if the line is short and matches the pattern, False otherwise. + """ + patterns = read_patterns_from_file(pattern_file_path) + words = line.split() + if len(words) > max_words: + return False + + for pattern in patterns: + if re.search(rf'\b{re.escape(pattern)}\b', line) is not None: + return True + return None + diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py new file mode 100644 index 000000000..43d2ada6a --- /dev/null +++ b/components/text_normalization/tests/component_test.py @@ -0,0 +1,54 @@ +import json +from glob import glob + +import pandas +from fondant.component import Component +from fondant.executor import Executor + + +def load_fixtures(path): + test_configurations = [] + fixture_list = glob(path) + for fixture in fixture_list: + with open(fixture) as file: + fixture_dict = json.load(file) + + user_argmuments = fixture_dict["user_arguments"] + input_data = { + tuple(key.split("_")): value for key, value in fixture_dict["input"].items() + } + expected_out = { + tuple(key.split("_")): value + for key, value in fixture_dict["output"].items() + } + + test_configurations.append((user_argmuments, input_data, expected_out)) + + return test_configurations + +class TestComponentExecuter(Executor[Component]): + def __init__(self, user_arguments: t.Dict[str, t.Any], input_data: t.Dict): + self.user_arguments = user_arguments + self.input_data = input_data + + def execute(self, component_cls: t.Type[Component]) -> pandas.DataFrame: + """Execute a component. + + Args: + component_cls: The class of the component to execute. + """ + component = component_cls(None, **self.user_arguments) + + input_dataframe = dd.from_dict(self.input_data, npartitions=2) + + if isinstance(component, PandasTransformComponent): + output_df = component.transform(input_dataframe.compute()) + + elif isinstance(component, DaskTransformComponent): + output_df = component.transform(input_dataframe()).compute() + + else: + msg = "Non support component type." 
+ raise NotImplementedError(msg) + + return output_df diff --git a/components/text_normalization/tests/fixtures/en_text_normalization.json b/components/text_normalization/tests/fixtures/en_text_normalization.json new file mode 100644 index 000000000..4a8165a28 --- /dev/null +++ b/components/text_normalization/tests/fixtures/en_text_normalization.json @@ -0,0 +1,17 @@ +{ + "user_arguments": { + "language": "de" + }, + "input": { + "data_text": [ + "Das hier ist ein Satz in deutscher Sprache", + "This is a sentence in English", + "Dit is een zin in het Nederlands" + ] + }, + "output": { + "data_text": [ + "Das hier ist ein Satz in deutscher Sprache" + ] + } +} \ No newline at end of file From 21e040bab33c2be675625173611b49e9424a5245 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Tue, 1 Aug 2023 15:55:36 +0200 Subject: [PATCH 2/9] Refactor text normalization component --- .../text_normalization/fondant_component.yaml | 15 ++++- components/text_normalization/src/main.py | 53 ++++++++---------- ...e_bad_patterns.txt => en_bad_patterns.txt} | 0 .../tests/component_test.py | 56 +++++++++---------- .../text_normalization/tests/conftest.py | 8 +++ .../tests/fixtures/apply_all.json | 22 ++++++++ .../apply_nfc_text_normalization.json | 24 ++++++++ .../tests/fixtures/en_text_normalization.json | 17 ------ .../lowercasing_text_normalization.json | 24 ++++++++ ..._additional_whitespaces_normalization.json | 24 ++++++++ .../tests/fixtures/remove_bad_patterns.json | 22 ++++++++ ...emove_bad_patterns_text_normalization.json | 25 +++++++++ .../remove_puncuation_text_normalization.json | 24 ++++++++ 13 files changed, 233 insertions(+), 81 deletions(-) rename components/text_normalization/src/resources/{de_bad_patterns.txt => en_bad_patterns.txt} (100%) create mode 100644 components/text_normalization/tests/conftest.py create mode 100644 components/text_normalization/tests/fixtures/apply_all.json create mode 100644 components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/en_text_normalization.json create mode 100644 components/text_normalization/tests/fixtures/lowercasing_text_normalization.json create mode 100644 components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json create mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns.json create mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json create mode 100644 components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 6119e914d..2f5070466 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -9,12 +9,21 @@ consumes: type: string args: + remove_additional_whitespaces: + description: If true remove all additional whitespace, tabs. + type: bool apply_nfc: description: If true apply nfc normalization type: bool + remove_bad_patterns: + description: If true remove bad patterns + type: bool do_lowercase: description: If true apply lowercasing type: bool - characters_to_remove: - description: List of characters which will be removed, e.g. 
[?,.!,@#%] - type: list \ No newline at end of file + language: + description: Language is needed for language specific normalizations + type: str + remove_punctuation: + description: If true punctuation will be removed + type: str \ No newline at end of file diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 0e0ff52a4..0ae5f4559 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -12,7 +12,11 @@ logger = logging.getLogger(__name__) -def clean(text, remove_punctuation=True): +def _remove_punctuation(text): + """Remove punctuation in given text.""" + return text.translate(str.maketrans("", "", string.punctuation)) + +def _remove_additional_whitespaces(text): """ Text cleaning method from slimpajama approach. https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py @@ -22,22 +26,10 @@ def clean(text, remove_punctuation=True): Args: - text: text to be cleaned """ - # remove punctuation - if remove_punctuation: - text = text.translate(str.maketrans("", "", string.punctuation)) - - # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end - text = re.sub(r"\s+", " ", text.strip()) - return text + return re.sub(r"\s+", " ", text.strip()) def remove_noisy_lines(text, language): - """ - !!! and note that they require adaptation across languages !!! - • If it is short (≤ 10 words) and matches a pattern (edit): - - At the beginning of the line (e.g. sign-in); - - At the end of the line (e.g. Read more...); - - Anywhere in the line (e.g. items in cart). - """ + """""" language + "bad_patterns.txt" def any_condition_met(line, discard_condition_functions): @@ -47,12 +39,13 @@ def any_condition_met(line, discard_condition_functions): class TextNormalizationComponent(PandasTransformComponent): """Component that normalizes text.""" - - def __init__(self, *args, apply_nfc: bool, do_lowercase: bool, characters_to_remove: List[str]): + def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, remove_bad_patterns: bool, do_lowercase: bool, language: str, remove_punctuation: bool): + self.remove_additional_whitespaces = remove_additional_whitespaces self.apply_nfc = apply_nfc + self.remove_bad_patterns = remove_bad_patterns self.do_lowercase = do_lowercase - self.characters_to_remove = characters_to_remove - self.default_cleaning = True + self.language = language + self.remove_punctuation = remove_punctuation @staticmethod def _do_nfc_normalization(text: str): @@ -79,23 +72,23 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - dataframe[("text", "data")] = dataframe["text"]["data"].apply(remove_noisy_lines) + if self.remove_additional_whitespaces: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_additional_whitespaces) + + if self.remove_bad_patterns: + dataframe[("text", "data")] = dataframe[("text","data")].apply(lambda x: remove_noisy_lines(x, self.language)) if self.apply_nfc: - dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x)) + dataframe[("text", "data")] = dataframe[("text", "data")].apply(self._do_nfc_normalization) if self.do_lowercase: - dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: x.lower()) + dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) - if self.default_cleaning: - dataframe[("text", "data")] = 
dataframe["text"]["data"].apply(clean) + if self.remove_punctuation: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_punctuation) - if len(self.characters_to_remove) > 0: - dataframe[("text", "data")] = dataframe["text"]["data"].apply( - lambda x: self._remove_patterns( - self.characters_to_remove, x, - ), - ) + # remove all empty rows + dataframe = dataframe.dropna(subset=[("text", "data")]) return dataframe diff --git a/components/text_normalization/src/resources/de_bad_patterns.txt b/components/text_normalization/src/resources/en_bad_patterns.txt similarity index 100% rename from components/text_normalization/src/resources/de_bad_patterns.txt rename to components/text_normalization/src/resources/en_bad_patterns.txt diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py index 43d2ada6a..9601c2b6b 100644 --- a/components/text_normalization/tests/component_test.py +++ b/components/text_normalization/tests/component_test.py @@ -1,18 +1,29 @@ import json +import os +import typing as t from glob import glob -import pandas -from fondant.component import Component -from fondant.executor import Executor +import pandas as pd +import pytest +from fondant.component_spec import ComponentSpec +from components.text_normalization.src.main import TextNormalizationComponent -def load_fixtures(path): + +class MockedComponentSpec(ComponentSpec): + """Just for mocking purpose. This component spec is not needed for unit testing.""" + def __init__(self, specification: t.Dict[str, t.Any]): + pass + + +def load_fixtures(path="./fixtures"): test_configurations = [] - fixture_list = glob(path) + fixture_list = glob(path + "/*.json") for fixture in fixture_list: with open(fixture) as file: fixture_dict = json.load(file) + fixture_name = os.path.splitext(fixture)[0] user_argmuments = fixture_dict["user_arguments"] input_data = { tuple(key.split("_")): value for key, value in fixture_dict["input"].items() @@ -22,33 +33,16 @@ def load_fixtures(path): for key, value in fixture_dict["output"].items() } - test_configurations.append((user_argmuments, input_data, expected_out)) + test_configurations.append((fixture_name, user_argmuments, input_data, expected_out)) return test_configurations -class TestComponentExecuter(Executor[Component]): - def __init__(self, user_arguments: t.Dict[str, t.Any], input_data: t.Dict): - self.user_arguments = user_arguments - self.input_data = input_data - - def execute(self, component_cls: t.Type[Component]) -> pandas.DataFrame: - """Execute a component. - - Args: - component_cls: The class of the component to execute. - """ - component = component_cls(None, **self.user_arguments) - - input_dataframe = dd.from_dict(self.input_data, npartitions=2) - - if isinstance(component, PandasTransformComponent): - output_df = component.transform(input_dataframe.compute()) - - elif isinstance(component, DaskTransformComponent): - output_df = component.transform(input_dataframe()).compute() - - else: - msg = "Non support component type." 
- raise NotImplementedError(msg) +@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), load_fixtures()) +def test_component(fixture_name, user_arguments, input_data, expected_output): + """Test transform method of text normalization component.""" + print(fixture_name) + component = TextNormalizationComponent(MockedComponentSpec({}), **user_arguments) - return output_df + input_df = pd.DataFrame(input_data) + transformed_output = component.transform(input_df) + pd.testing.assert_frame_equal(pd.DataFrame(expected_output), transformed_output) diff --git a/components/text_normalization/tests/conftest.py b/components/text_normalization/tests/conftest.py new file mode 100644 index 000000000..ef13b7321 --- /dev/null +++ b/components/text_normalization/tests/conftest.py @@ -0,0 +1,8 @@ +import os +import sys + +# Get the absolute path to the "src" directory +src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) + +# Append the "src" directory to the Python path +sys.path.append(src_path) diff --git a/components/text_normalization/tests/fixtures/apply_all.json b/components/text_normalization/tests/fixtures/apply_all.json new file mode 100644 index 000000000..062191020 --- /dev/null +++ b/components/text_normalization/tests/fixtures/apply_all.json @@ -0,0 +1,22 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": true, + "language": "en", + "remove_punctuation": true, + "remove_additional_whitespaces": true, + "remove_bad_patterns": true + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." + ] + }, + "output": { + "text_data": [ + "lorem ipsum dolor sit amet consectetur adipiscing elit", + "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json new file mode 100644 index 000000000..b9d7977f0 --- /dev/null +++ b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": false, + "language": "en", + "remove_punctuation": false, + "remove_additional_whitespaces": false, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + "\u0043\u0327 something", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." + ] + }, + "output": { + "text_data": [ + "\u00C7 something", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." 
+ ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/en_text_normalization.json b/components/text_normalization/tests/fixtures/en_text_normalization.json deleted file mode 100644 index 4a8165a28..000000000 --- a/components/text_normalization/tests/fixtures/en_text_normalization.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "user_arguments": { - "language": "de" - }, - "input": { - "data_text": [ - "Das hier ist ein Satz in deutscher Sprache", - "This is a sentence in English", - "Dit is een zin in het Nederlands" - ] - }, - "output": { - "data_text": [ - "Das hier ist ein Satz in deutscher Sprache" - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json new file mode 100644 index 000000000..bd64f90cb --- /dev/null +++ b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": true, + "language": "en", + "remove_punctuation": false, + "remove_additional_whitespaces": false, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." + ] + }, + "output": { + "text_data": [ + "lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "nulla facilisi. sed eu nulla sit amet enim scelerisque dapibus.", + "suspendisse potenti. fusce sit amet erat vel nunc placerat bibendum." + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json b/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json new file mode 100644 index 000000000..36f519967 --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": false, + "do_lowercase": false, + "language": "en", + "remove_punctuation": false, + "remove_additional_whitespaces": true, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + " Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + " Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum. " + ] + }, + "output": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns.json b/components/text_normalization/tests/fixtures/remove_bad_patterns.json new file mode 100644 index 000000000..062191020 --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_bad_patterns.json @@ -0,0 +1,22 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": true, + "language": "en", + "remove_punctuation": true, + "remove_additional_whitespaces": true, + "remove_bad_patterns": true + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." 
+ ] + }, + "output": { + "text_data": [ + "lorem ipsum dolor sit amet consectetur adipiscing elit", + "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json new file mode 100644 index 000000000..a5103f2dc --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json @@ -0,0 +1,25 @@ +{ + "user_arguments": { + "apply_nfc": false, + "do_lowercase": false, + "language": "de", + "remove_punctuation": false, + "remove_additional_whitespaces": false, + "remove_bad_patterns": true + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit \n HELLO WORLD some \n amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit \n 10 Likes \n amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum.", + "45345345" + ] + }, + "output": { + "text_data": [ + "Lorem ipsum dolor sit amet consectetur adipiscing elit", + "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus", + "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum" + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json b/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json new file mode 100644 index 000000000..b03fd81f1 --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": false, + "language": "en", + "remove_punctuation": true, + "remove_additional_whitespaces": false, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." 
+ ] + }, + "output": { + "text_data": [ + "Lorem ipsum dolor sit amet consectetur adipiscing elit", + "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus", + "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum" + ] + } +} \ No newline at end of file From 8f0897be288414172a8027a7ae11db05f30079b7 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Wed, 2 Aug 2023 10:58:04 +0200 Subject: [PATCH 3/9] Refactor text normalization component --- components/text_normalization/src/main.py | 37 +++++++++------- .../src/resources/en_bad_patterns.txt | 6 --- components/text_normalization/src/utils.py | 42 +++---------------- .../tests/component_test.py | 12 +++--- .../tests/fixtures/remove_bad_patterns.json | 22 ---------- ...emove_bad_patterns_text_normalization.json | 10 ++--- .../text_normalization/tests/utils_test.py | 42 +++++++++++++++++++ 7 files changed, 81 insertions(+), 90 deletions(-) delete mode 100644 components/text_normalization/src/resources/en_bad_patterns.txt delete mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns.json create mode 100644 components/text_normalization/tests/utils_test.py diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 0ae5f4559..50e2ca2af 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -8,6 +8,7 @@ import pandas as pd from fondant.component import PandasTransformComponent from fondant.executor import PandasTransformExecutor +from utils import is_counter, is_one_word, mainly_uppercase, only_numerical logger = logging.getLogger(__name__) @@ -16,30 +17,32 @@ def _remove_punctuation(text): """Remove punctuation in given text.""" return text.translate(str.maketrans("", "", string.punctuation)) + def _remove_additional_whitespaces(text): """ Text cleaning method from slimpajama approach. https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle and in the beginning / end. 
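+
+    Example (illustrative):
+        >>> _remove_additional_whitespaces("  hello \t world\n")
+        'hello world'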
- - Args: - - text: text to be cleaned """ return re.sub(r"\s+", " ", text.strip()) -def remove_noisy_lines(text, language): - """""" - language + "bad_patterns.txt" +def remove_noisy_lines(text): def any_condition_met(line, discard_condition_functions): return any(condition(line) for condition in discard_condition_functions) - return " ".join([line for line in text.split("\n") if not any_condition_met]) + discard_conditions = [mainly_uppercase, only_numerical, is_counter, is_one_word] + return " ".join( + [line for line in text.split("\n") if not any_condition_met(line, discard_conditions)]) + class TextNormalizationComponent(PandasTransformComponent): """Component that normalizes text.""" - def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, remove_bad_patterns: bool, do_lowercase: bool, language: str, remove_punctuation: bool): + + def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, + remove_bad_patterns: bool, + do_lowercase: bool, language: str, remove_punctuation: bool): self.remove_additional_whitespaces = remove_additional_whitespaces self.apply_nfc = apply_nfc self.remove_bad_patterns = remove_bad_patterns @@ -72,23 +75,25 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - if self.remove_additional_whitespaces: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_additional_whitespaces) + if self.do_lowercase: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) if self.remove_bad_patterns: - dataframe[("text", "data")] = dataframe[("text","data")].apply(lambda x: remove_noisy_lines(x, self.language)) + dataframe[("text", "data")] = dataframe[("text", "data")].apply(remove_noisy_lines) if self.apply_nfc: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(self._do_nfc_normalization) - - if self.do_lowercase: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + self._do_nfc_normalization) if self.remove_punctuation: dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_punctuation) + if self.remove_additional_whitespaces: + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + _remove_additional_whitespaces) + # remove all empty rows - dataframe = dataframe.dropna(subset=[("text", "data")]) + dataframe = dataframe[dataframe[("text", "data")].astype(bool)] return dataframe diff --git a/components/text_normalization/src/resources/en_bad_patterns.txt b/components/text_normalization/src/resources/en_bad_patterns.txt deleted file mode 100644 index 2504b1974..000000000 --- a/components/text_normalization/src/resources/en_bad_patterns.txt +++ /dev/null @@ -1,6 +0,0 @@ -Weiterlesen -Startseite -Einkaufswagen -Konto -Zum Einkaufswagen hinzufügen -Zum Warenkorb hinzufügen \ No newline at end of file diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py index f6dff6d21..5edc1cd9c 100644 --- a/components/text_normalization/src/utils.py +++ b/components/text_normalization/src/utils.py @@ -7,7 +7,8 @@ def mainly_uppercase(line, threshold=0.7): Args: line (str): The input line to check. - threshold (float): The threshold (between 0 and 1) to determine what is considered "mainly uppercase." + threshold (float): The threshold (between 0 and 1) to determine what is considered + "mainly uppercase." 
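+
+    Example (illustrative):
+        >>> mainly_uppercase("HELLO WORLD")
+        True
+        >>> mainly_uppercase("Hello world")
+        False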
Returns: bool: True if the line is mainly uppercase, False otherwise. @@ -20,6 +21,7 @@ def mainly_uppercase(line, threshold=0.7): uppercase_ratio = uppercase_count / total_chars return uppercase_ratio >= threshold + def only_numerical(line): """ Checks if a line is composed only of numerical characters. @@ -32,6 +34,7 @@ def only_numerical(line): """ return line.isdigit() + def is_counter(line): """ Checks if a line represents a counter (e.g., "3 likes"). @@ -43,9 +46,11 @@ def is_counter(line): bool: True if the line represents a counter, False otherwise. """ # Use regular expression to check for the pattern: + line = line.strip() pattern = r"^\d+\s+\S+$" return re.match(pattern, line) is not None + def is_one_word(line): """ Checks if a line contains only one word. @@ -58,38 +63,3 @@ def is_one_word(line): """ words = line.split() return len(words) == 1 - - -def read_patterns_from_file(file_path): - """ - Read patterns from a text file. - - Args: - file_path (str): The path to the text file containing patterns. - - Returns: - list: A list of patterns read from the file. - """ - with open(file_path) as file: - return [pattern.strip() for pattern in file] -def is_short_and_matches_pattern(line, pattern_file_path, max_words=10): - """ - Checks if a line is short (< max_words) and matches the given pattern. - - Args: - line (str): The input line to check. - max_words (int): The maximum number of words allowed in the line (default is 10). - - Returns: - bool: True if the line is short and matches the pattern, False otherwise. - """ - patterns = read_patterns_from_file(pattern_file_path) - words = line.split() - if len(words) > max_words: - return False - - for pattern in patterns: - if re.search(rf'\b{re.escape(pattern)}\b', line) is not None: - return True - return None - diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py index 9601c2b6b..bff208ed6 100644 --- a/components/text_normalization/tests/component_test.py +++ b/components/text_normalization/tests/component_test.py @@ -12,6 +12,7 @@ class MockedComponentSpec(ComponentSpec): """Just for mocking purpose. 
This component spec is not needed for unit testing.""" + def __init__(self, specification: t.Dict[str, t.Any]): pass @@ -24,7 +25,7 @@ def load_fixtures(path="./fixtures"): fixture_dict = json.load(file) fixture_name = os.path.splitext(fixture)[0] - user_argmuments = fixture_dict["user_arguments"] + user_arguments = fixture_dict["user_arguments"] input_data = { tuple(key.split("_")): value for key, value in fixture_dict["input"].items() } @@ -33,16 +34,17 @@ def load_fixtures(path="./fixtures"): for key, value in fixture_dict["output"].items() } - test_configurations.append((fixture_name, user_argmuments, input_data, expected_out)) + test_configurations.append((fixture_name, user_arguments, input_data, expected_out)) return test_configurations -@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), load_fixtures()) + +@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), + load_fixtures()) def test_component(fixture_name, user_arguments, input_data, expected_output): """Test transform method of text normalization component.""" - print(fixture_name) + print("Running test case based on: ", fixture_name) component = TextNormalizationComponent(MockedComponentSpec({}), **user_arguments) - input_df = pd.DataFrame(input_data) transformed_output = component.transform(input_df) pd.testing.assert_frame_equal(pd.DataFrame(expected_output), transformed_output) diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns.json b/components/text_normalization/tests/fixtures/remove_bad_patterns.json deleted file mode 100644 index 062191020..000000000 --- a/components/text_normalization/tests/fixtures/remove_bad_patterns.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": true, - "language": "en", - "remove_punctuation": true, - "remove_additional_whitespaces": true, - "remove_bad_patterns": true - }, - "input": { - "text_data": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - }, - "output": { - "text_data": [ - "lorem ipsum dolor sit amet consectetur adipiscing elit", - "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json index a5103f2dc..c4a8eab71 100644 --- a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json +++ b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json @@ -4,12 +4,12 @@ "do_lowercase": false, "language": "de", "remove_punctuation": false, - "remove_additional_whitespaces": false, + "remove_additional_whitespaces": true, "remove_bad_patterns": true }, "input": { "text_data": [ - "Lorem ipsum dolor sit \n HELLO WORLD some \n amet, consectetur adipiscing elit.", + "Lorem ipsum dolor sit \n HELLO WORLD \n amet, consectetur adipiscing elit.", "Nulla facilisi. Sed eu nulla sit \n 10 Likes \n amet enim scelerisque dapibus!", "Suspendisse potenti. 
Fusce sit amet erat vel nunc placerat bibendum.", "45345345" @@ -17,9 +17,9 @@ }, "output": { "text_data": [ - "Lorem ipsum dolor sit amet consectetur adipiscing elit", - "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus", - "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." ] } } \ No newline at end of file diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py new file mode 100644 index 000000000..af2c7d6b7 --- /dev/null +++ b/components/text_normalization/tests/utils_test.py @@ -0,0 +1,42 @@ + +from components.text_normalization.src.utils import ( + is_counter, + is_one_word, + mainly_uppercase, + only_numerical, +) + + +def test_mainly_uppercase(): + line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" + assert mainly_uppercase(line, threshold=0.5) + +def test_mainly_uppercase_under_threshold(): + line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" + assert ~mainly_uppercase(line, threshold=0.9) + +def test_only_numerical(): + line = "42" + assert only_numerical(line) + +def test_only_numerical_on_words(): + line = "42 lorem ipsum" + assert ~only_numerical(line) + +def test_is_counter(): + line = "13 Likes" + assert is_counter(line) + +def test_is_not_counter(): + line = "Hello world! 42 people are part of .." + assert ~is_counter(line) + +def test_is_one_word(): + line = "word" + assert is_one_word(line) + +def test_is_not_one_word(): + line = "two words" + assert ~is_one_word(line) + + From efe5c49702c030d074b699aaf613323443411992 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Thu, 3 Aug 2023 08:21:51 +0200 Subject: [PATCH 4/9] Add component readme.md --- components/text_normalization/README.md | 12 ++++++++++++ components/text_normalization/fondant_component.yaml | 3 --- 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 components/text_normalization/README.md diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md new file mode 100644 index 000000000..7b01ecbd7 --- /dev/null +++ b/components/text_normalization/README.md @@ -0,0 +1,12 @@ +# Text normalization component + +This component implements several text normalization techniques to clean and preprocess textual data: + +- Apply lowercasing: Converts all text to lowercase +- Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs +- Apply NFC normalization: Converts characters to their canonical representation +- Remove common seen patterns in webpages following the implementation of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) +- Remove punctuation: Strips punctuation marks from the text + +These text normalization techniques are valuable for preparing text data before using it for +the training of large language models. 
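+
+As an illustrative sketch (not the component's actual interface), the steps above
+roughly correspond to the following plain-Python operations, using the same
+libraries the component itself relies on:
+
+```python
+import re
+import string
+
+import ftfy
+
+text = "  Ça VA bien !\t"
+text = ftfy.fix_text(text, normalization="NFC")  # NFC normalization
+text = text.lower()  # lowercasing
+text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
+text = re.sub(r"\s+", " ", text.strip())  # collapse extra whitespace
+```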
\ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 2f5070466..3df9f2ae4 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -21,9 +21,6 @@ args: do_lowercase: description: If true apply lowercasing type: bool - language: - description: Language is needed for language specific normalizations - type: str remove_punctuation: description: If true punctuation will be removed type: str \ No newline at end of file From 1d35b0d22b625d17240ff352381bfc95e3ab734d Mon Sep 17 00:00:00 2001 From: mrchtr Date: Mon, 7 Aug 2023 20:39:56 +0200 Subject: [PATCH 5/9] Addressing comments --- components/text_normalization/Dockerfile | 19 +++++-- .../text_normalization/fondant_component.yaml | 2 +- .../text_normalization/requirements.txt | 2 +- components/text_normalization/src/main.py | 6 +-- components/text_normalization/src/utils.py | 8 +-- .../tests/component_test.py | 50 ------------------- .../text_normalization/tests/conftest.py | 8 --- .../text_normalization/tests/utils_test.py | 4 +- src/fondant/abstract_component_test.py | 3 ++ 9 files changed, 28 insertions(+), 74 deletions(-) delete mode 100644 components/text_normalization/tests/component_test.py delete mode 100644 components/text_normalization/tests/conftest.py diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile index 605adc7e9..ac4d4aedf 100644 --- a/components/text_normalization/Dockerfile +++ b/components/text_normalization/Dockerfile @@ -1,18 +1,27 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 python:3.8-slim as base ## System dependencies RUN apt-get update && \ apt-get upgrade -y && \ apt-get install git -y +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/mrchtr/fondant + # install requirements COPY requirements.txt / RUN pip3 install --no-cache-dir -r requirements.txt -# Set the working directory to the component folder -WORKDIR /component/src - # Copy over src-files -COPY src/ . +COPY src/ src/ +# Run component tests +FROM base as test +RUN pip3 install pytest pandas # TODO add pytest to package setup +COPY tests/ tests/ +RUN ["python", "-m", "pytest", "tests/"] + +FROM base +# Set the working directory to the component folder +WORKDIR /src +RUN echo $(ls) ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 3df9f2ae4..6057fbfa4 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -16,7 +16,7 @@ args: description: If true apply nfc normalization type: bool remove_bad_patterns: - description: If true remove bad patterns + description: If true remove common patterns in web texts (e.g. 
lines contains only number, lines consists of uppercase letters, or counters) type: bool do_lowercase: description: If true apply lowercasing diff --git a/components/text_normalization/requirements.txt b/components/text_normalization/requirements.txt index 9e5daac86..a4299def8 100644 --- a/components/text_normalization/requirements.txt +++ b/components/text_normalization/requirements.txt @@ -1 +1 @@ -ftfy=6.1.1 \ No newline at end of file +ftfy==6.1.1 \ No newline at end of file diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 50e2ca2af..741b45041 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -75,12 +75,12 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - if self.do_lowercase: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) - if self.remove_bad_patterns: dataframe[("text", "data")] = dataframe[("text", "data")].apply(remove_noisy_lines) + if self.do_lowercase: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) + if self.apply_nfc: dataframe[("text", "data")] = dataframe[("text", "data")].apply( self._do_nfc_normalization) diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py index 5edc1cd9c..24e1db54a 100644 --- a/components/text_normalization/src/utils.py +++ b/components/text_normalization/src/utils.py @@ -1,7 +1,7 @@ import re -def mainly_uppercase(line, threshold=0.7): +def mainly_uppercase(line: str, threshold: float = 0.7) -> bool: """ Checks if a line is mainly composed of uppercase characters. @@ -22,7 +22,7 @@ def mainly_uppercase(line, threshold=0.7): return uppercase_ratio >= threshold -def only_numerical(line): +def only_numerical(line: str) -> bool: """ Checks if a line is composed only of numerical characters. @@ -35,7 +35,7 @@ def only_numerical(line): return line.isdigit() -def is_counter(line): +def is_counter(line: str) -> bool: """ Checks if a line represents a counter (e.g., "3 likes"). @@ -51,7 +51,7 @@ def is_counter(line): return re.match(pattern, line) is not None -def is_one_word(line): +def is_one_word(line: str) -> bool: """ Checks if a line contains only one word. diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py deleted file mode 100644 index bff208ed6..000000000 --- a/components/text_normalization/tests/component_test.py +++ /dev/null @@ -1,50 +0,0 @@ -import json -import os -import typing as t -from glob import glob - -import pandas as pd -import pytest -from fondant.component_spec import ComponentSpec - -from components.text_normalization.src.main import TextNormalizationComponent - - -class MockedComponentSpec(ComponentSpec): - """Just for mocking purpose. 
This component spec is not needed for unit testing.""" - - def __init__(self, specification: t.Dict[str, t.Any]): - pass - - -def load_fixtures(path="./fixtures"): - test_configurations = [] - fixture_list = glob(path + "/*.json") - for fixture in fixture_list: - with open(fixture) as file: - fixture_dict = json.load(file) - - fixture_name = os.path.splitext(fixture)[0] - user_arguments = fixture_dict["user_arguments"] - input_data = { - tuple(key.split("_")): value for key, value in fixture_dict["input"].items() - } - expected_out = { - tuple(key.split("_")): value - for key, value in fixture_dict["output"].items() - } - - test_configurations.append((fixture_name, user_arguments, input_data, expected_out)) - - return test_configurations - - -@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), - load_fixtures()) -def test_component(fixture_name, user_arguments, input_data, expected_output): - """Test transform method of text normalization component.""" - print("Running test case based on: ", fixture_name) - component = TextNormalizationComponent(MockedComponentSpec({}), **user_arguments) - input_df = pd.DataFrame(input_data) - transformed_output = component.transform(input_df) - pd.testing.assert_frame_equal(pd.DataFrame(expected_output), transformed_output) diff --git a/components/text_normalization/tests/conftest.py b/components/text_normalization/tests/conftest.py deleted file mode 100644 index ef13b7321..000000000 --- a/components/text_normalization/tests/conftest.py +++ /dev/null @@ -1,8 +0,0 @@ -import os -import sys - -# Get the absolute path to the "src" directory -src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) - -# Append the "src" directory to the Python path -sys.path.append(src_path) diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py index af2c7d6b7..3f76dcfa6 100644 --- a/components/text_normalization/tests/utils_test.py +++ b/components/text_normalization/tests/utils_test.py @@ -1,5 +1,5 @@ -from components.text_normalization.src.utils import ( +from src.utils import ( is_counter, is_one_word, mainly_uppercase, @@ -37,6 +37,6 @@ def test_is_one_word(): def test_is_not_one_word(): line = "two words" - assert ~is_one_word(line) + assert not is_one_word(line) diff --git a/src/fondant/abstract_component_test.py b/src/fondant/abstract_component_test.py index 92a1dd69f..079f3dccd 100644 --- a/src/fondant/abstract_component_test.py +++ b/src/fondant/abstract_component_test.py @@ -3,6 +3,9 @@ import pandas as pd import pytest +from fondant.component_spec import ComponentSpec + + class AbstractComponentTest(ABC): @abstractmethod From e0c0c8c00ea9856f14ace0f3c14117a6c2a3b6ed Mon Sep 17 00:00:00 2001 From: mrchtr Date: Tue, 8 Aug 2023 08:38:52 +0200 Subject: [PATCH 6/9] Update docsstrings, adapt component test to use the AbstractComponentTest --- components/text_normalization/Dockerfile | 2 +- .../text_normalization/fondant_component.yaml | 4 +-- components/text_normalization/src/main.py | 15 +++++++---- .../tests/fixtures/apply_all.json | 22 ---------------- .../apply_nfc_text_normalization.json | 24 ------------------ .../lowercasing_text_normalization.json | 24 ------------------ ..._additional_whitespaces_normalization.json | 24 ------------------ ...emove_bad_patterns_text_normalization.json | 25 ------------------- .../remove_puncuation_text_normalization.json | 24 ------------------ src/fondant/abstract_component_test.py | 3 --- 10 files 
changed, 13 insertions(+), 154 deletions(-) delete mode 100644 components/text_normalization/tests/fixtures/apply_all.json delete mode 100644 components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/lowercasing_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile index ac4d4aedf..12137c722 100644 --- a/components/text_normalization/Dockerfile +++ b/components/text_normalization/Dockerfile @@ -17,11 +17,11 @@ COPY src/ src/ # Run component tests FROM base as test RUN pip3 install pytest pandas # TODO add pytest to package setup +ENV PYTHONPATH "${PYTHONPATH}:./src" COPY tests/ tests/ RUN ["python", "-m", "pytest", "tests/"] FROM base # Set the working directory to the component folder WORKDIR /src -RUN echo $(ls) ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 6057fbfa4..f9d2bfabb 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -15,8 +15,8 @@ args: apply_nfc: description: If true apply nfc normalization type: bool - remove_bad_patterns: - description: If true remove common patterns in web texts (e.g. lines contains only number, lines consists of uppercase letters, or counters) + normalize_lines: + description: If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter type: bool do_lowercase: description: If true apply lowercasing diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 741b45041..4c51f346e 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -28,7 +28,7 @@ def _remove_additional_whitespaces(text): return re.sub(r"\s+", " ", text.strip()) -def remove_noisy_lines(text): +def normalize_lines(text): def any_condition_met(line, discard_condition_functions): return any(condition(line) for condition in discard_condition_functions) @@ -45,7 +45,7 @@ def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, do_lowercase: bool, language: str, remove_punctuation: bool): self.remove_additional_whitespaces = remove_additional_whitespaces self.apply_nfc = apply_nfc - self.remove_bad_patterns = remove_bad_patterns + self.normalize_lines = remove_bad_patterns self.do_lowercase = do_lowercase self.language = language self.remove_punctuation = remove_punctuation @@ -67,7 +67,11 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Apply normalization transformations. The component is capable of: - NFC normalization - Lowercasing - - Removing of regex patterns. + - Removing of unnecessary whitespaces (e.g. tabs), punctuation + - Apply line-wise transformations that exclude lines matching specified patterns. 
+ Patterns include lines that are mainly composed of uppercase characters, lines that consist + only of numerical characters, lines that are counters (e.g., "3 likes"), and lines + that contain only one word. Args: dataframe: Pandas dataframe. @@ -75,8 +79,9 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - if self.remove_bad_patterns: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(remove_noisy_lines) + if self.normalize_lines: + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + normalize_lines) if self.do_lowercase: dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) diff --git a/components/text_normalization/tests/fixtures/apply_all.json b/components/text_normalization/tests/fixtures/apply_all.json deleted file mode 100644 index 062191020..000000000 --- a/components/text_normalization/tests/fixtures/apply_all.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": true, - "language": "en", - "remove_punctuation": true, - "remove_additional_whitespaces": true, - "remove_bad_patterns": true - }, - "input": { - "text_data": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - }, - "output": { - "text_data": [ - "lorem ipsum dolor sit amet consectetur adipiscing elit", - "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json deleted file mode 100644 index b9d7977f0..000000000 --- a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": false, - "language": "en", - "remove_punctuation": false, - "remove_additional_whitespaces": false, - "remove_bad_patterns": false - }, - "input": { - "text_data": [ - "\u0043\u0327 something", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - }, - "output": { - "text_data": [ - "\u00C7 something", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json deleted file mode 100644 index bd64f90cb..000000000 --- a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": true, - "language": "en", - "remove_punctuation": false, - "remove_additional_whitespaces": false, - "remove_bad_patterns": false - }, - "input": { - "text_data": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.", - "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." - ] - }, - "output": { - "text_data": [ - "lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "nulla facilisi. sed eu nulla sit amet enim scelerisque dapibus.", - "suspendisse potenti. fusce sit amet erat vel nunc placerat bibendum." 
diff --git a/components/text_normalization/tests/fixtures/apply_all.json b/components/text_normalization/tests/fixtures/apply_all.json
deleted file mode 100644
index 062191020..000000000
--- a/components/text_normalization/tests/fixtures/apply_all.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": true,
-        "language": "en",
-        "remove_punctuation": true,
-        "remove_additional_whitespaces": true,
-        "remove_bad_patterns": true
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "lorem ipsum dolor sit amet consectetur adipiscing elit",
-            "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus"
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json
deleted file mode 100644
index b9d7977f0..000000000
--- a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": false,
-        "language": "en",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": false,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "\u0043\u0327 something",
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "\u00C7 something",
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json
deleted file mode 100644
index bd64f90cb..000000000
--- a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": true,
-        "language": "en",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": false,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "nulla facilisi. sed eu nulla sit amet enim scelerisque dapibus.",
-            "suspendisse potenti. fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json b/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json
deleted file mode 100644
index 36f519967..000000000
--- a/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": false,
-        "do_lowercase": false,
-        "language": "en",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": true,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "    Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            " Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum.   "
-        ]
-    },
-    "output": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json
deleted file mode 100644
index c4a8eab71..000000000
--- a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": false,
-        "do_lowercase": false,
-        "language": "de",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": true,
-        "remove_bad_patterns": true
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit \n HELLO WORLD \n amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit \n 10 Likes \n amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum.",
-            "45345345"
-        ]
-    },
-    "output": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json b/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json
deleted file mode 100644
index b03fd81f1..000000000
--- a/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": false,
-        "language": "en",
-        "remove_punctuation": true,
-        "remove_additional_whitespaces": false,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit",
-            "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus",
-            "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum"
-        ]
-    }
-}
\ No newline at end of file
diff --git a/src/fondant/abstract_component_test.py b/src/fondant/abstract_component_test.py
index 079f3dccd..92a1dd69f 100644
--- a/src/fondant/abstract_component_test.py
+++ b/src/fondant/abstract_component_test.py
@@ -3,9 +3,6 @@
 
 import pandas as pd
 import pytest
-from fondant.component_spec import ComponentSpec
-
-
 
 class AbstractComponentTest(ABC):
     @abstractmethod

From d5a508f177938f0758321f3fa112355c92670865 Mon Sep 17 00:00:00 2001
From: mrchtr
Date: Tue, 8 Aug 2023 08:46:10 +0200
Subject: [PATCH 7/9] Update docker file

---
 components/text_normalization/Dockerfile          | 2 +-
 components/text_normalization/src/utils.py        | 8 ++++----
 components/text_normalization/tests/utils_test.py | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile
index 12137c722..eb96b604a 100644
--- a/components/text_normalization/Dockerfile
+++ b/components/text_normalization/Dockerfile
@@ -16,7 +16,7 @@ COPY src/ src/
 
 # Run component tests
 FROM base as test
-RUN pip3 install pytest pandas # TODO add pytest to package setup
+RUN pip3 install pytest # TODO add pytest to package setup
 ENV PYTHONPATH "${PYTHONPATH}:./src"
 COPY tests/ tests/
 RUN ["python", "-m", "pytest", "tests/"]
diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py
index 24e1db54a..b487bc61e 100644
--- a/components/text_normalization/src/utils.py
+++ b/components/text_normalization/src/utils.py
@@ -6,7 +6,7 @@ def mainly_uppercase(line: str, threshold: float = 0.7) -> bool:
     Checks if a line is mainly composed of uppercase characters.
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
         threshold (float): The threshold (between 0 and 1) to determine what is considered
         "mainly uppercase."
 
@@ -27,7 +27,7 @@ def only_numerical(line: str) -> bool:
     Checks if a line is composed only of numerical characters.
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
 
     Returns:
         bool: True if the line is only composed of numerical characters, False otherwise.
@@ -40,7 +40,7 @@ def is_counter(line: str) -> bool:
     Checks if a line represents a counter (e.g., "3 likes").
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
 
     Returns:
         bool: True if the line represents a counter, False otherwise.
@@ -56,7 +56,7 @@ def is_one_word(line: str) -> bool:
     Checks if a line contains only one word.
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
 
     Returns:
         bool: True if the line contains only one word, False otherwise.
diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py
index 3f76dcfa6..44f360676 100644
--- a/components/text_normalization/tests/utils_test.py
+++ b/components/text_normalization/tests/utils_test.py
@@ -13,7 +13,7 @@ def test_mainly_uppercase():
 
 def test_mainly_uppercase_under_threshold():
     line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE"
-    assert ~mainly_uppercase(line, threshold=0.9)
+    assert not mainly_uppercase(line, threshold=0.9)
 
 def test_only_numerical():
     line = "42"
@@ -21,7 +21,7 @@ def test_only_numerical():
 
 def test_only_numerical_on_words():
     line = "42 lorem ipsum"
-    assert ~only_numerical(line)
+    assert not only_numerical(line)
 
 def test_is_counter():
     line = "13 Likes"
@@ -29,7 +29,7 @@ def test_is_counter():
 
 def test_is_not_counter():
     line = "Hello world! 42 people are part of .."
-    assert ~is_counter(line)
+    assert not is_counter(line)
 
 def test_is_one_word():
     line = "word"
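The switch from ~ to not in these assertions is a behavioral fix, not a style change: on Python booleans, ~ is bitwise inversion of the underlying int, so ~True == -2 and ~False == -1, and both results are truthy, which means an assertion like assert ~mainly_uppercase(...) can never fail. A quick standalone illustration, independent of the component code:

    # Bitwise NOT inverts the underlying int of a bool, so the result is never falsy:
    assert ~True == -2
    assert ~False == -1
    assert bool(~False)  # -1 is truthy, so `assert ~some_bool` always passes

    # Logical negation is what these tests actually need:
    assert not False

Before this patch, the three tests above would have passed even if the predicates returned the wrong answer.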

From 2a7a73319f91874215a79eb4533955b845f3444a Mon Sep 17 00:00:00 2001
From: mrchtr
Date: Tue, 8 Aug 2023 14:45:18 +0200
Subject: [PATCH 8/9] Testing strategy drafts

---
 components/text_normalization/src/main.py |   3 +-
 .../tests/component_test.py               | 122 ++++++++++++++++++
 src/fondant/testing_utils.py              |  39 ++++++
 3 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 components/text_normalization/tests/component_test.py
 create mode 100644 src/fondant/testing_utils.py

diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py
index 4c51f346e..662c8023c 100644
--- a/components/text_normalization/src/main.py
+++ b/components/text_normalization/src/main.py
@@ -42,12 +42,11 @@ class TextNormalizationComponent(PandasTransformComponent):
 
     def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool,
                  remove_bad_patterns: bool,
-                 do_lowercase: bool, language: str, remove_punctuation: bool):
+                 do_lowercase: bool, remove_punctuation: bool):
         self.remove_additional_whitespaces = remove_additional_whitespaces
         self.apply_nfc = apply_nfc
         self.normalize_lines = remove_bad_patterns
         self.do_lowercase = do_lowercase
-        self.language = language
         self.remove_punctuation = remove_punctuation
 
     @staticmethod
diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py
new file mode 100644
index 000000000..d39d9273b
--- /dev/null
+++ b/components/text_normalization/tests/component_test.py
@@ -0,0 +1,122 @@
+from typing import Any, Dict
+
+import pandas as pd
+import pytest
+from fondant.testing_utils import execute_pandas_transform_component
+
+from src.main import TextNormalizationComponent
+
+
+def test_transform_custom_component_test():
+    """Test the component's transform method.
+    Option 1: handling the test case is up to the users.
+    """
+    user_arguments = {
+        "remove_additional_whitespaces": True,
+        "apply_nfc": True,
+        "remove_bad_patterns": True,
+        "do_lowercase": True,
+        "remove_punctuation": True,
+    }
+    component = TextNormalizationComponent(**user_arguments)
+
+    input_dataframe = pd.DataFrame([
+        "\u0043\u0327 something",
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+    ], columns=[("text", "data")])
+
+    expected_output = pd.DataFrame([
+        "\u00e7 something",
+        "lorem ipsum dolor sit amet consectetur adipiscing elit",
+        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
+    ], columns=[("text", "data")])
+
+    output_dataframe = component.transform(input_dataframe)
+
+    pd.testing.assert_frame_equal(
+        left=expected_output,
+        right=output_dataframe,
+        check_dtype=False,
+    )
+
+
+def test_transform_helper_methods():
+    """Test the component's transform method.
+    Option 2: using the helper method provided by fondant.
+    """
+    user_arguments = {
+        "remove_additional_whitespaces": True,
+        "apply_nfc": True,
+        "remove_bad_patterns": True,
+        "do_lowercase": True,
+        "remove_punctuation": True,
+    }
+    component = TextNormalizationComponent(**user_arguments)
+
+    input_dataframe = pd.DataFrame([
+        "\u0043\u0327 something",
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+    ], columns=[("text", "data")])
+
+    expected_output = pd.DataFrame([
+        "\u00e7 something",
+        "lorem ipsum dolor sit amet consectetur adipiscing elit",
+        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
+    ], columns=[("text", "data")])
+
+    execute_pandas_transform_component(component, input_dataframe, expected_output)
+
+
+data = [
+    # first scenario
+    {
+        "user_arguments": {
+            "remove_additional_whitespaces": True,
+            "apply_nfc": True,
+            "remove_bad_patterns": True,
+            "do_lowercase": True,
+            "remove_punctuation": True,
+        },
+        "input_dataframe": pd.DataFrame([
+            "\u0043\u0327 something",
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+        ], columns=[("text", "data")]),
+        "output_dataframe": pd.DataFrame([
+            "\u00e7 something",
+            "lorem ipsum dolor sit amet consectetur adipiscing elit",
+            "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
+        ], columns=[("text", "data")]),
+    },
+
+    # second scenario
+    {
+        "user_arguments": {
+            "remove_additional_whitespaces": True,
+            "apply_nfc": True,
+            "remove_bad_patterns": True,
+            "do_lowercase": False,
+            "remove_punctuation": True,
+        },
+        "input_dataframe": pd.DataFrame([
+            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+        ], columns=[("text", "data")]),
+        "output_dataframe": pd.DataFrame([
+            "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus",
+        ], columns=[("text", "data")]),
+    },
+]
+
+
+@pytest.mark.parametrize(
+    "scenario",
+    data,
+)
+def test_transform_helper_methods_parametrized(scenario: Dict[str, Any]):
+    """Option 3: Only defining parametrized scenarios. Usage of the helper provided by fondant."""
+    component = TextNormalizationComponent(**scenario["user_arguments"])
+    execute_pandas_transform_component(component,
+                                       scenario["input_dataframe"],
+                                       scenario["output_dataframe"])
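Of the three drafted options, Option 3 keeps each scenario declarative and the test body fixed. One possible refinement, offered here as a suggestion rather than part of this patch, is to wrap each scenario in pytest.param with an explicit id so that a failing case is immediately attributable; this reuses the data list and imports from the file above, and the test name and ids are illustrative:

    @pytest.mark.parametrize(
        "scenario",
        [
            pytest.param(data[0], id="apply_all_transformations"),
            pytest.param(data[1], id="keep_casing"),
        ],
    )
    def test_transform_named_scenarios(scenario: Dict[str, Any]):
        component = TextNormalizationComponent(**scenario["user_arguments"])
        execute_pandas_transform_component(component,
                                           scenario["input_dataframe"],
                                           scenario["output_dataframe"])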
diff --git a/src/fondant/testing_utils.py b/src/fondant/testing_utils.py
new file mode 100644
index 000000000..97eed0c86
--- /dev/null
+++ b/src/fondant/testing_utils.py
@@ -0,0 +1,39 @@
+import dask.dataframe as dd
+import pandas as pd
+
+from fondant.component import DaskTransformComponent, PandasTransformComponent
+
+
+def execute_pandas_transform_component(
+    component: PandasTransformComponent,
+    input_dataframe: pd.DataFrame,
+    expected_output: pd.DataFrame,
+):
+    """Helper method for executing a pandas transform component."""
+    _compare_pandas_dataframe(component.transform(input_dataframe), expected_output)
+
+
+def _compare_pandas_dataframe(
+    output_dataframe: pd.DataFrame,
+    expected_output: pd.DataFrame,
+):
+    """Compare two pandas dataframes."""
+    pd.testing.assert_frame_equal(
+        left=expected_output,
+        right=output_dataframe,
+        check_dtype=False,
+    )
+
+
+def execute_dask_transform_component(
+    component: DaskTransformComponent,
+    input_dataframe: dd.DataFrame,
+    expected_output: dd.DataFrame,
+):
+    """Helper method for executing a dask transform component."""
+    _compare_dask_dataframe(component.transform(input_dataframe), expected_output)
+
+
+def _compare_dask_dataframe(expected_output: dd.DataFrame, output_dataframe: dd.DataFrame):
+    msg = "Not implemented."
+    raise NotImplementedError(msg)
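The dask comparison above is intentionally left unimplemented in this draft. For the small frames typically used in component tests, one workable sketch, an assumption on my part rather than part of the patch, is to materialize both lazy dataframes and fall back on the pandas comparison:

    import dask.dataframe as dd
    import pandas as pd

    def _compare_dask_dataframe(expected_output: dd.DataFrame, output_dataframe: dd.DataFrame):
        # Materializing with .compute() is fine for small test inputs;
        # avoid this on production-sized data.
        pd.testing.assert_frame_equal(
            left=expected_output.compute().reset_index(drop=True),
            right=output_dataframe.compute().reset_index(drop=True),
            check_dtype=False,
        )

Resetting the index sidesteps spurious failures from partition-dependent index ordering.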

From e914376fe907ffef05874e4d9cd0e07a8d3d83b2 Mon Sep 17 00:00:00 2001
From: mrchtr
Date: Tue, 15 Aug 2023 07:40:03 +0200
Subject: [PATCH 9/9] Refactor unit tests

---
 .../text_normalization/tests/component_test.py | 88 +------------------
 src/fondant/testing_utils.py                   | 39 --------
 2 files changed, 1 insertion(+), 126 deletions(-)
 delete mode 100644 src/fondant/testing_utils.py

diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py
index d39d9273b..34ce528aa 100644
--- a/components/text_normalization/tests/component_test.py
+++ b/components/text_normalization/tests/component_test.py
@@ -1,16 +1,11 @@
-from typing import Any, Dict
 
 import pandas as pd
-import pytest
-from fondant.testing_utils import execute_pandas_transform_component
 
 from src.main import TextNormalizationComponent
 
 
 def test_transform_custom_component_test():
-    """Test the component's transform method.
-    Option 1: handling the test case is up to the users.
-    """
+    """Test the component's transform method."""
     user_arguments = {
         "remove_additional_whitespaces": True,
         "apply_nfc": True,
         "remove_bad_patterns": True,
         "do_lowercase": True,
         "remove_punctuation": True,
     }
     component = TextNormalizationComponent(**user_arguments)
 
     input_dataframe = pd.DataFrame([
         "\u0043\u0327 something",
         "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
         "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
     ], columns=[("text", "data")])
 
     expected_output = pd.DataFrame([
         "\u00e7 something",
         "lorem ipsum dolor sit amet consectetur adipiscing elit",
         "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
     ], columns=[("text", "data")])
 
     output_dataframe = component.transform(input_dataframe)
 
     pd.testing.assert_frame_equal(
         left=expected_output,
         right=output_dataframe,
         check_dtype=False,
     )
-
-
-def test_transform_helper_methods():
-    """Test the component's transform method.
-    Option 2: using the helper method provided by fondant.
-    """
-    user_arguments = {
-        "remove_additional_whitespaces": True,
-        "apply_nfc": True,
-        "remove_bad_patterns": True,
-        "do_lowercase": True,
-        "remove_punctuation": True,
-    }
-    component = TextNormalizationComponent(**user_arguments)
-
-    input_dataframe = pd.DataFrame([
-        "\u0043\u0327 something",
-        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-    ], columns=[("text", "data")])
-
-    expected_output = pd.DataFrame([
-        "\u00e7 something",
-        "lorem ipsum dolor sit amet consectetur adipiscing elit",
-        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
-    ], columns=[("text", "data")])
-
-    execute_pandas_transform_component(component, input_dataframe, expected_output)
-
-
-data = [
-    # first scenario
-    {
-        "user_arguments": {
-            "remove_additional_whitespaces": True,
-            "apply_nfc": True,
-            "remove_bad_patterns": True,
-            "do_lowercase": True,
-            "remove_punctuation": True,
-        },
-        "input_dataframe": pd.DataFrame([
-            "\u0043\u0327 something",
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-        ], columns=[("text", "data")]),
-        "output_dataframe": pd.DataFrame([
-            "\u00e7 something",
-            "lorem ipsum dolor sit amet consectetur adipiscing elit",
-            "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
-        ], columns=[("text", "data")]),
-    },
-
-    # second scenario
-    {
-        "user_arguments": {
-            "remove_additional_whitespaces": True,
-            "apply_nfc": True,
-            "remove_bad_patterns": True,
-            "do_lowercase": False,
-            "remove_punctuation": True,
-        },
-        "input_dataframe": pd.DataFrame([
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-        ], columns=[("text", "data")]),
-        "output_dataframe": pd.DataFrame([
-            "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus",
-        ], columns=[("text", "data")]),
-    },
-]
-
-
-@pytest.mark.parametrize(
-    "scenario",
-    data,
-)
-def test_transform_helper_methods_parametrized(scenario: Dict[str, Any]):
-    """Option 3: Only defining parametrized scenarios. Usage of the helper provided by fondant."""
-    component = TextNormalizationComponent(**scenario["user_arguments"])
-    execute_pandas_transform_component(component,
-                                       scenario["input_dataframe"],
-                                       scenario["output_dataframe"])
diff --git a/src/fondant/testing_utils.py b/src/fondant/testing_utils.py
deleted file mode 100644
index 97eed0c86..000000000
--- a/src/fondant/testing_utils.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import dask.dataframe as dd
-import pandas as pd
-
-from fondant.component import DaskTransformComponent, PandasTransformComponent
-
-
-def execute_pandas_transform_component(
-    component: PandasTransformComponent,
-    input_dataframe: pd.DataFrame,
-    expected_output: pd.DataFrame,
-):
-    """Helper method for executing a pandas transform component."""
-    _compare_pandas_dataframe(component.transform(input_dataframe), expected_output)
-
-
-def _compare_pandas_dataframe(
-    output_dataframe: pd.DataFrame,
-    expected_output: pd.DataFrame,
-):
-    """Compare two pandas dataframes."""
-    pd.testing.assert_frame_equal(
-        left=expected_output,
-        right=output_dataframe,
-        check_dtype=False,
-    )
-
-
-def execute_dask_transform_component(
-    component: DaskTransformComponent,
-    input_dataframe: dd.DataFrame,
-    expected_output: dd.DataFrame,
-):
-    """Helper method for executing a dask transform component."""
-    _compare_dask_dataframe(component.transform(input_dataframe), expected_output)
-
-
-def _compare_dask_dataframe(expected_output: dd.DataFrame, output_dataframe: dd.DataFrame):
-    msg = "Not implemented."
-    raise NotImplementedError(msg)