From a2c62394393f56525396da7acc661b6f563e9a84 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Tue, 1 Aug 2023 13:16:12 +0200 Subject: [PATCH 1/9] Add readme and component cleaning --- .../text_normalization/requirements.txt | 1 + components/text_normalization/src/main.py | 50 +++++++++- .../src/resources/de_bad_patterns.txt | 6 ++ components/text_normalization/src/utils.py | 95 +++++++++++++++++++ .../tests/component_test.py | 54 +++++++++++ .../tests/fixtures/en_text_normalization.json | 17 ++++ 6 files changed, 218 insertions(+), 5 deletions(-) create mode 100644 components/text_normalization/src/resources/de_bad_patterns.txt create mode 100644 components/text_normalization/src/utils.py create mode 100644 components/text_normalization/tests/component_test.py create mode 100644 components/text_normalization/tests/fixtures/en_text_normalization.json diff --git a/components/text_normalization/requirements.txt b/components/text_normalization/requirements.txt index e69de29bb..9e5daac86 100644 --- a/components/text_normalization/requirements.txt +++ b/components/text_normalization/requirements.txt @@ -0,0 +1 @@ +ftfy=6.1.1 \ No newline at end of file diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index a3c415717..0e0ff52a4 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -1,9 +1,10 @@ """A component that normalizes text.""" import logging import re -import unicodedata +import string from typing import List +import ftfy import pandas as pd from fondant.component import PandasTransformComponent from fondant.executor import PandasTransformExecutor @@ -11,6 +12,39 @@ logger = logging.getLogger(__name__) +def clean(text, remove_punctuation=True): + """ + Text cleaning method from slimpajama approach. + https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py + Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle + and in the beginning / end. + + Args: + - text: text to be cleaned + """ + # remove punctuation + if remove_punctuation: + text = text.translate(str.maketrans("", "", string.punctuation)) + + # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end + text = re.sub(r"\s+", " ", text.strip()) + return text + +def remove_noisy_lines(text, language): + """ + !!! and note that they require adaptation across languages !!! + • If it is short (≤ 10 words) and matches a pattern (edit): + - At the beginning of the line (e.g. sign-in); + - At the end of the line (e.g. Read more...); + - Anywhere in the line (e.g. items in cart). 
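+
+    These line-wise heuristics follow the webpage-cleaning rules described by
+    Penedo et al. (https://arxiv.org/pdf/2306.01116.pdf); the match patterns are
+    read from a language-specific resource file (e.g. resources/de_bad_patterns.txt),
+    which is why they need adaptation per language.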
+ """ + language + "bad_patterns.txt" + + def any_condition_met(line, discard_condition_functions): + return any(condition(line) for condition in discard_condition_functions) + + return " ".join([line for line in text.split("\n") if not any_condition_met]) + class TextNormalizationComponent(PandasTransformComponent): """Component that normalizes text.""" @@ -18,11 +52,12 @@ def __init__(self, *args, apply_nfc: bool, do_lowercase: bool, characters_to_rem self.apply_nfc = apply_nfc self.do_lowercase = do_lowercase self.characters_to_remove = characters_to_remove + self.default_cleaning = True @staticmethod def _do_nfc_normalization(text: str): """Apply nfc normalization to the text of the dataframe.""" - return unicodedata.normalize("NFC", text) + return ftfy.fix_text(text, normalization="NFC") @staticmethod def _remove_patterns(regex_patterns: List[str], text: str): @@ -44,14 +79,19 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ + dataframe[("text", "data")] = dataframe["text"]["data"].apply(remove_noisy_lines) + if self.apply_nfc: - dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x)) + dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x)) if self.do_lowercase: - dataframe["text"]["data"].apply(lambda x: x.lower()) + dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: x.lower()) + + if self.default_cleaning: + dataframe[("text", "data")] = dataframe["text"]["data"].apply(clean) if len(self.characters_to_remove) > 0: - dataframe["text"]["data"].apply( + dataframe[("text", "data")] = dataframe["text"]["data"].apply( lambda x: self._remove_patterns( self.characters_to_remove, x, ), diff --git a/components/text_normalization/src/resources/de_bad_patterns.txt b/components/text_normalization/src/resources/de_bad_patterns.txt new file mode 100644 index 000000000..2504b1974 --- /dev/null +++ b/components/text_normalization/src/resources/de_bad_patterns.txt @@ -0,0 +1,6 @@ +Weiterlesen +Startseite +Einkaufswagen +Konto +Zum Einkaufswagen hinzufügen +Zum Warenkorb hinzufügen \ No newline at end of file diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py new file mode 100644 index 000000000..f6dff6d21 --- /dev/null +++ b/components/text_normalization/src/utils.py @@ -0,0 +1,95 @@ +import re + + +def mainly_uppercase(line, threshold=0.7): + """ + Checks if a line is mainly composed of uppercase characters. + + Args: + line (str): The input line to check. + threshold (float): The threshold (between 0 and 1) to determine what is considered "mainly uppercase." + + Returns: + bool: True if the line is mainly uppercase, False otherwise. + """ + uppercase_count = sum(1 for char in line if char.isupper()) + total_chars = len(line) + if total_chars == 0: + return False + + uppercase_ratio = uppercase_count / total_chars + return uppercase_ratio >= threshold + +def only_numerical(line): + """ + Checks if a line is composed only of numerical characters. + + Args: + line (str): The input line to check. + + Returns: + bool: True if the line is only composed of numerical characters, False otherwise. + """ + return line.isdigit() + +def is_counter(line): + """ + Checks if a line represents a counter (e.g., "3 likes"). + + Args: + line (str): The input line to check. + + Returns: + bool: True if the line represents a counter, False otherwise. 
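+
+    Example (illustrative):
+        >>> is_counter("3 likes")
+        True
+        >>> is_counter("I counted 3 likes")
+        False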
+ """ + # Use regular expression to check for the pattern: + pattern = r"^\d+\s+\S+$" + return re.match(pattern, line) is not None + +def is_one_word(line): + """ + Checks if a line contains only one word. + + Args: + line (str): The input line to check. + + Returns: + bool: True if the line contains only one word, False otherwise. + """ + words = line.split() + return len(words) == 1 + + +def read_patterns_from_file(file_path): + """ + Read patterns from a text file. + + Args: + file_path (str): The path to the text file containing patterns. + + Returns: + list: A list of patterns read from the file. + """ + with open(file_path) as file: + return [pattern.strip() for pattern in file] +def is_short_and_matches_pattern(line, pattern_file_path, max_words=10): + """ + Checks if a line is short (< max_words) and matches the given pattern. + + Args: + line (str): The input line to check. + max_words (int): The maximum number of words allowed in the line (default is 10). + + Returns: + bool: True if the line is short and matches the pattern, False otherwise. + """ + patterns = read_patterns_from_file(pattern_file_path) + words = line.split() + if len(words) > max_words: + return False + + for pattern in patterns: + if re.search(rf'\b{re.escape(pattern)}\b', line) is not None: + return True + return None + diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py new file mode 100644 index 000000000..43d2ada6a --- /dev/null +++ b/components/text_normalization/tests/component_test.py @@ -0,0 +1,54 @@ +import json +from glob import glob + +import pandas +from fondant.component import Component +from fondant.executor import Executor + + +def load_fixtures(path): + test_configurations = [] + fixture_list = glob(path) + for fixture in fixture_list: + with open(fixture) as file: + fixture_dict = json.load(file) + + user_argmuments = fixture_dict["user_arguments"] + input_data = { + tuple(key.split("_")): value for key, value in fixture_dict["input"].items() + } + expected_out = { + tuple(key.split("_")): value + for key, value in fixture_dict["output"].items() + } + + test_configurations.append((user_argmuments, input_data, expected_out)) + + return test_configurations + +class TestComponentExecuter(Executor[Component]): + def __init__(self, user_arguments: t.Dict[str, t.Any], input_data: t.Dict): + self.user_arguments = user_arguments + self.input_data = input_data + + def execute(self, component_cls: t.Type[Component]) -> pandas.DataFrame: + """Execute a component. + + Args: + component_cls: The class of the component to execute. + """ + component = component_cls(None, **self.user_arguments) + + input_dataframe = dd.from_dict(self.input_data, npartitions=2) + + if isinstance(component, PandasTransformComponent): + output_df = component.transform(input_dataframe.compute()) + + elif isinstance(component, DaskTransformComponent): + output_df = component.transform(input_dataframe()).compute() + + else: + msg = "Non support component type." 
+ raise NotImplementedError(msg) + + return output_df diff --git a/components/text_normalization/tests/fixtures/en_text_normalization.json b/components/text_normalization/tests/fixtures/en_text_normalization.json new file mode 100644 index 000000000..4a8165a28 --- /dev/null +++ b/components/text_normalization/tests/fixtures/en_text_normalization.json @@ -0,0 +1,17 @@ +{ + "user_arguments": { + "language": "de" + }, + "input": { + "data_text": [ + "Das hier ist ein Satz in deutscher Sprache", + "This is a sentence in English", + "Dit is een zin in het Nederlands" + ] + }, + "output": { + "data_text": [ + "Das hier ist ein Satz in deutscher Sprache" + ] + } +} \ No newline at end of file From 21e040bab33c2be675625173611b49e9424a5245 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Tue, 1 Aug 2023 15:55:36 +0200 Subject: [PATCH 2/9] Refactor text normalization component --- .../text_normalization/fondant_component.yaml | 15 ++++- components/text_normalization/src/main.py | 53 ++++++++---------- ...e_bad_patterns.txt => en_bad_patterns.txt} | 0 .../tests/component_test.py | 56 +++++++++---------- .../text_normalization/tests/conftest.py | 8 +++ .../tests/fixtures/apply_all.json | 22 ++++++++ .../apply_nfc_text_normalization.json | 24 ++++++++ .../tests/fixtures/en_text_normalization.json | 17 ------ .../lowercasing_text_normalization.json | 24 ++++++++ ..._additional_whitespaces_normalization.json | 24 ++++++++ .../tests/fixtures/remove_bad_patterns.json | 22 ++++++++ ...emove_bad_patterns_text_normalization.json | 25 +++++++++ .../remove_puncuation_text_normalization.json | 24 ++++++++ 13 files changed, 233 insertions(+), 81 deletions(-) rename components/text_normalization/src/resources/{de_bad_patterns.txt => en_bad_patterns.txt} (100%) create mode 100644 components/text_normalization/tests/conftest.py create mode 100644 components/text_normalization/tests/fixtures/apply_all.json create mode 100644 components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/en_text_normalization.json create mode 100644 components/text_normalization/tests/fixtures/lowercasing_text_normalization.json create mode 100644 components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json create mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns.json create mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json create mode 100644 components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 6119e914d..2f5070466 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -9,12 +9,21 @@ consumes: type: string args: + remove_additional_whitespaces: + description: If true remove all additional whitespace, tabs. + type: bool apply_nfc: description: If true apply nfc normalization type: bool + remove_bad_patterns: + description: If true remove bad patterns + type: bool do_lowercase: description: If true apply lowercasing type: bool - characters_to_remove: - description: List of characters which will be removed, e.g. 
[?,.!,@#%] - type: list \ No newline at end of file + language: + description: Language is needed for language specific normalizations + type: str + remove_punctuation: + description: If true punctuation will be removed + type: str \ No newline at end of file diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 0e0ff52a4..0ae5f4559 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -12,7 +12,11 @@ logger = logging.getLogger(__name__) -def clean(text, remove_punctuation=True): +def _remove_punctuation(text): + """Remove punctuation in given text.""" + return text.translate(str.maketrans("", "", string.punctuation)) + +def _remove_additional_whitespaces(text): """ Text cleaning method from slimpajama approach. https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py @@ -22,22 +26,10 @@ def clean(text, remove_punctuation=True): Args: - text: text to be cleaned """ - # remove punctuation - if remove_punctuation: - text = text.translate(str.maketrans("", "", string.punctuation)) - - # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end - text = re.sub(r"\s+", " ", text.strip()) - return text + return re.sub(r"\s+", " ", text.strip()) def remove_noisy_lines(text, language): - """ - !!! and note that they require adaptation across languages !!! - • If it is short (≤ 10 words) and matches a pattern (edit): - - At the beginning of the line (e.g. sign-in); - - At the end of the line (e.g. Read more...); - - Anywhere in the line (e.g. items in cart). - """ + """""" language + "bad_patterns.txt" def any_condition_met(line, discard_condition_functions): @@ -47,12 +39,13 @@ def any_condition_met(line, discard_condition_functions): class TextNormalizationComponent(PandasTransformComponent): """Component that normalizes text.""" - - def __init__(self, *args, apply_nfc: bool, do_lowercase: bool, characters_to_remove: List[str]): + def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, remove_bad_patterns: bool, do_lowercase: bool, language: str, remove_punctuation: bool): + self.remove_additional_whitespaces = remove_additional_whitespaces self.apply_nfc = apply_nfc + self.remove_bad_patterns = remove_bad_patterns self.do_lowercase = do_lowercase - self.characters_to_remove = characters_to_remove - self.default_cleaning = True + self.language = language + self.remove_punctuation = remove_punctuation @staticmethod def _do_nfc_normalization(text: str): @@ -79,23 +72,23 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - dataframe[("text", "data")] = dataframe["text"]["data"].apply(remove_noisy_lines) + if self.remove_additional_whitespaces: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_additional_whitespaces) + + if self.remove_bad_patterns: + dataframe[("text", "data")] = dataframe[("text","data")].apply(lambda x: remove_noisy_lines(x, self.language)) if self.apply_nfc: - dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: self._do_nfc_normalization(x)) + dataframe[("text", "data")] = dataframe[("text", "data")].apply(self._do_nfc_normalization) if self.do_lowercase: - dataframe[("text", "data")] = dataframe["text"]["data"].apply(lambda x: x.lower()) + dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) - if self.default_cleaning: - dataframe[("text", "data")] = 
dataframe["text"]["data"].apply(clean) + if self.remove_punctuation: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_punctuation) - if len(self.characters_to_remove) > 0: - dataframe[("text", "data")] = dataframe["text"]["data"].apply( - lambda x: self._remove_patterns( - self.characters_to_remove, x, - ), - ) + # remove all empty rows + dataframe = dataframe.dropna(subset=[("text", "data")]) return dataframe diff --git a/components/text_normalization/src/resources/de_bad_patterns.txt b/components/text_normalization/src/resources/en_bad_patterns.txt similarity index 100% rename from components/text_normalization/src/resources/de_bad_patterns.txt rename to components/text_normalization/src/resources/en_bad_patterns.txt diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py index 43d2ada6a..9601c2b6b 100644 --- a/components/text_normalization/tests/component_test.py +++ b/components/text_normalization/tests/component_test.py @@ -1,18 +1,29 @@ import json +import os +import typing as t from glob import glob -import pandas -from fondant.component import Component -from fondant.executor import Executor +import pandas as pd +import pytest +from fondant.component_spec import ComponentSpec +from components.text_normalization.src.main import TextNormalizationComponent -def load_fixtures(path): + +class MockedComponentSpec(ComponentSpec): + """Just for mocking purpose. This component spec is not needed for unit testing.""" + def __init__(self, specification: t.Dict[str, t.Any]): + pass + + +def load_fixtures(path="./fixtures"): test_configurations = [] - fixture_list = glob(path) + fixture_list = glob(path + "/*.json") for fixture in fixture_list: with open(fixture) as file: fixture_dict = json.load(file) + fixture_name = os.path.splitext(fixture)[0] user_argmuments = fixture_dict["user_arguments"] input_data = { tuple(key.split("_")): value for key, value in fixture_dict["input"].items() @@ -22,33 +33,16 @@ def load_fixtures(path): for key, value in fixture_dict["output"].items() } - test_configurations.append((user_argmuments, input_data, expected_out)) + test_configurations.append((fixture_name, user_argmuments, input_data, expected_out)) return test_configurations -class TestComponentExecuter(Executor[Component]): - def __init__(self, user_arguments: t.Dict[str, t.Any], input_data: t.Dict): - self.user_arguments = user_arguments - self.input_data = input_data - - def execute(self, component_cls: t.Type[Component]) -> pandas.DataFrame: - """Execute a component. - - Args: - component_cls: The class of the component to execute. - """ - component = component_cls(None, **self.user_arguments) - - input_dataframe = dd.from_dict(self.input_data, npartitions=2) - - if isinstance(component, PandasTransformComponent): - output_df = component.transform(input_dataframe.compute()) - - elif isinstance(component, DaskTransformComponent): - output_df = component.transform(input_dataframe()).compute() - - else: - msg = "Non support component type." 
- raise NotImplementedError(msg) +@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), load_fixtures()) +def test_component(fixture_name, user_arguments, input_data, expected_output): + """Test transform method of text normalization component.""" + print(fixture_name) + component = TextNormalizationComponent(MockedComponentSpec({}), **user_arguments) - return output_df + input_df = pd.DataFrame(input_data) + transformed_output = component.transform(input_df) + pd.testing.assert_frame_equal(pd.DataFrame(expected_output), transformed_output) diff --git a/components/text_normalization/tests/conftest.py b/components/text_normalization/tests/conftest.py new file mode 100644 index 000000000..ef13b7321 --- /dev/null +++ b/components/text_normalization/tests/conftest.py @@ -0,0 +1,8 @@ +import os +import sys + +# Get the absolute path to the "src" directory +src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) + +# Append the "src" directory to the Python path +sys.path.append(src_path) diff --git a/components/text_normalization/tests/fixtures/apply_all.json b/components/text_normalization/tests/fixtures/apply_all.json new file mode 100644 index 000000000..062191020 --- /dev/null +++ b/components/text_normalization/tests/fixtures/apply_all.json @@ -0,0 +1,22 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": true, + "language": "en", + "remove_punctuation": true, + "remove_additional_whitespaces": true, + "remove_bad_patterns": true + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." + ] + }, + "output": { + "text_data": [ + "lorem ipsum dolor sit amet consectetur adipiscing elit", + "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json new file mode 100644 index 000000000..b9d7977f0 --- /dev/null +++ b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": false, + "language": "en", + "remove_punctuation": false, + "remove_additional_whitespaces": false, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + "\u0043\u0327 something", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." + ] + }, + "output": { + "text_data": [ + "\u00C7 something", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." 
+ ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/en_text_normalization.json b/components/text_normalization/tests/fixtures/en_text_normalization.json deleted file mode 100644 index 4a8165a28..000000000 --- a/components/text_normalization/tests/fixtures/en_text_normalization.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "user_arguments": { - "language": "de" - }, - "input": { - "data_text": [ - "Das hier ist ein Satz in deutscher Sprache", - "This is a sentence in English", - "Dit is een zin in het Nederlands" - ] - }, - "output": { - "data_text": [ - "Das hier ist ein Satz in deutscher Sprache" - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json new file mode 100644 index 000000000..bd64f90cb --- /dev/null +++ b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": true, + "language": "en", + "remove_punctuation": false, + "remove_additional_whitespaces": false, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." + ] + }, + "output": { + "text_data": [ + "lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "nulla facilisi. sed eu nulla sit amet enim scelerisque dapibus.", + "suspendisse potenti. fusce sit amet erat vel nunc placerat bibendum." + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json b/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json new file mode 100644 index 000000000..36f519967 --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": false, + "do_lowercase": false, + "language": "en", + "remove_punctuation": false, + "remove_additional_whitespaces": true, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + " Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + " Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum. " + ] + }, + "output": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns.json b/components/text_normalization/tests/fixtures/remove_bad_patterns.json new file mode 100644 index 000000000..062191020 --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_bad_patterns.json @@ -0,0 +1,22 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": true, + "language": "en", + "remove_punctuation": true, + "remove_additional_whitespaces": true, + "remove_bad_patterns": true + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." 
+ ] + }, + "output": { + "text_data": [ + "lorem ipsum dolor sit amet consectetur adipiscing elit", + "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json new file mode 100644 index 000000000..a5103f2dc --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json @@ -0,0 +1,25 @@ +{ + "user_arguments": { + "apply_nfc": false, + "do_lowercase": false, + "language": "de", + "remove_punctuation": false, + "remove_additional_whitespaces": false, + "remove_bad_patterns": true + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit \n HELLO WORLD some \n amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit \n 10 Likes \n amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum.", + "45345345" + ] + }, + "output": { + "text_data": [ + "Lorem ipsum dolor sit amet consectetur adipiscing elit", + "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus", + "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum" + ] + } +} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json b/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json new file mode 100644 index 000000000..b03fd81f1 --- /dev/null +++ b/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json @@ -0,0 +1,24 @@ +{ + "user_arguments": { + "apply_nfc": true, + "do_lowercase": false, + "language": "en", + "remove_punctuation": true, + "remove_additional_whitespaces": false, + "remove_bad_patterns": false + }, + "input": { + "text_data": [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." 
+ ] + }, + "output": { + "text_data": [ + "Lorem ipsum dolor sit amet consectetur adipiscing elit", + "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus", + "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum" + ] + } +} \ No newline at end of file From 8f0897be288414172a8027a7ae11db05f30079b7 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Wed, 2 Aug 2023 10:58:04 +0200 Subject: [PATCH 3/9] Refactor text normalization component --- components/text_normalization/src/main.py | 37 +++++++++------- .../src/resources/en_bad_patterns.txt | 6 --- components/text_normalization/src/utils.py | 42 +++---------------- .../tests/component_test.py | 12 +++--- .../tests/fixtures/remove_bad_patterns.json | 22 ---------- ...emove_bad_patterns_text_normalization.json | 10 ++--- .../text_normalization/tests/utils_test.py | 42 +++++++++++++++++++ 7 files changed, 81 insertions(+), 90 deletions(-) delete mode 100644 components/text_normalization/src/resources/en_bad_patterns.txt delete mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns.json create mode 100644 components/text_normalization/tests/utils_test.py diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 0ae5f4559..50e2ca2af 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -8,6 +8,7 @@ import pandas as pd from fondant.component import PandasTransformComponent from fondant.executor import PandasTransformExecutor +from utils import is_counter, is_one_word, mainly_uppercase, only_numerical logger = logging.getLogger(__name__) @@ -16,30 +17,32 @@ def _remove_punctuation(text): """Remove punctuation in given text.""" return text.translate(str.maketrans("", "", string.punctuation)) + def _remove_additional_whitespaces(text): """ Text cleaning method from slimpajama approach. https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle and in the beginning / end. 
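+
+    Example (illustrative):
+        >>> _remove_additional_whitespaces("  hello \t world\n")
+        'hello world'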
- - Args: - - text: text to be cleaned """ return re.sub(r"\s+", " ", text.strip()) -def remove_noisy_lines(text, language): - """""" - language + "bad_patterns.txt" +def remove_noisy_lines(text): def any_condition_met(line, discard_condition_functions): return any(condition(line) for condition in discard_condition_functions) - return " ".join([line for line in text.split("\n") if not any_condition_met]) + discard_conditions = [mainly_uppercase, only_numerical, is_counter, is_one_word] + return " ".join( + [line for line in text.split("\n") if not any_condition_met(line, discard_conditions)]) + class TextNormalizationComponent(PandasTransformComponent): """Component that normalizes text.""" - def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, remove_bad_patterns: bool, do_lowercase: bool, language: str, remove_punctuation: bool): + + def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, + remove_bad_patterns: bool, + do_lowercase: bool, language: str, remove_punctuation: bool): self.remove_additional_whitespaces = remove_additional_whitespaces self.apply_nfc = apply_nfc self.remove_bad_patterns = remove_bad_patterns @@ -72,23 +75,25 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - if self.remove_additional_whitespaces: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_additional_whitespaces) + if self.do_lowercase: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) if self.remove_bad_patterns: - dataframe[("text", "data")] = dataframe[("text","data")].apply(lambda x: remove_noisy_lines(x, self.language)) + dataframe[("text", "data")] = dataframe[("text", "data")].apply(remove_noisy_lines) if self.apply_nfc: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(self._do_nfc_normalization) - - if self.do_lowercase: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + self._do_nfc_normalization) if self.remove_punctuation: dataframe[("text", "data")] = dataframe[("text", "data")].apply(_remove_punctuation) + if self.remove_additional_whitespaces: + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + _remove_additional_whitespaces) + # remove all empty rows - dataframe = dataframe.dropna(subset=[("text", "data")]) + dataframe = dataframe[dataframe[("text", "data")].astype(bool)] return dataframe diff --git a/components/text_normalization/src/resources/en_bad_patterns.txt b/components/text_normalization/src/resources/en_bad_patterns.txt deleted file mode 100644 index 2504b1974..000000000 --- a/components/text_normalization/src/resources/en_bad_patterns.txt +++ /dev/null @@ -1,6 +0,0 @@ -Weiterlesen -Startseite -Einkaufswagen -Konto -Zum Einkaufswagen hinzufügen -Zum Warenkorb hinzufügen \ No newline at end of file diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py index f6dff6d21..5edc1cd9c 100644 --- a/components/text_normalization/src/utils.py +++ b/components/text_normalization/src/utils.py @@ -7,7 +7,8 @@ def mainly_uppercase(line, threshold=0.7): Args: line (str): The input line to check. - threshold (float): The threshold (between 0 and 1) to determine what is considered "mainly uppercase." + threshold (float): The threshold (between 0 and 1) to determine what is considered + "mainly uppercase." 
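+
+    Example (illustrative):
+        >>> mainly_uppercase("HELLO WORLD")
+        True
+        >>> mainly_uppercase("Hello world")
+        False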
Returns: bool: True if the line is mainly uppercase, False otherwise. @@ -20,6 +21,7 @@ def mainly_uppercase(line, threshold=0.7): uppercase_ratio = uppercase_count / total_chars return uppercase_ratio >= threshold + def only_numerical(line): """ Checks if a line is composed only of numerical characters. @@ -32,6 +34,7 @@ def only_numerical(line): """ return line.isdigit() + def is_counter(line): """ Checks if a line represents a counter (e.g., "3 likes"). @@ -43,9 +46,11 @@ def is_counter(line): bool: True if the line represents a counter, False otherwise. """ # Use regular expression to check for the pattern: + line = line.strip() pattern = r"^\d+\s+\S+$" return re.match(pattern, line) is not None + def is_one_word(line): """ Checks if a line contains only one word. @@ -58,38 +63,3 @@ def is_one_word(line): """ words = line.split() return len(words) == 1 - - -def read_patterns_from_file(file_path): - """ - Read patterns from a text file. - - Args: - file_path (str): The path to the text file containing patterns. - - Returns: - list: A list of patterns read from the file. - """ - with open(file_path) as file: - return [pattern.strip() for pattern in file] -def is_short_and_matches_pattern(line, pattern_file_path, max_words=10): - """ - Checks if a line is short (< max_words) and matches the given pattern. - - Args: - line (str): The input line to check. - max_words (int): The maximum number of words allowed in the line (default is 10). - - Returns: - bool: True if the line is short and matches the pattern, False otherwise. - """ - patterns = read_patterns_from_file(pattern_file_path) - words = line.split() - if len(words) > max_words: - return False - - for pattern in patterns: - if re.search(rf'\b{re.escape(pattern)}\b', line) is not None: - return True - return None - diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py index 9601c2b6b..bff208ed6 100644 --- a/components/text_normalization/tests/component_test.py +++ b/components/text_normalization/tests/component_test.py @@ -12,6 +12,7 @@ class MockedComponentSpec(ComponentSpec): """Just for mocking purpose. 
This component spec is not needed for unit testing.""" + def __init__(self, specification: t.Dict[str, t.Any]): pass @@ -24,7 +25,7 @@ def load_fixtures(path="./fixtures"): fixture_dict = json.load(file) fixture_name = os.path.splitext(fixture)[0] - user_argmuments = fixture_dict["user_arguments"] + user_arguments = fixture_dict["user_arguments"] input_data = { tuple(key.split("_")): value for key, value in fixture_dict["input"].items() } @@ -33,16 +34,17 @@ def load_fixtures(path="./fixtures"): for key, value in fixture_dict["output"].items() } - test_configurations.append((fixture_name, user_argmuments, input_data, expected_out)) + test_configurations.append((fixture_name, user_arguments, input_data, expected_out)) return test_configurations -@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), load_fixtures()) + +@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), + load_fixtures()) def test_component(fixture_name, user_arguments, input_data, expected_output): """Test transform method of text normalization component.""" - print(fixture_name) + print("Running test case based on: ", fixture_name) component = TextNormalizationComponent(MockedComponentSpec({}), **user_arguments) - input_df = pd.DataFrame(input_data) transformed_output = component.transform(input_df) pd.testing.assert_frame_equal(pd.DataFrame(expected_output), transformed_output) diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns.json b/components/text_normalization/tests/fixtures/remove_bad_patterns.json deleted file mode 100644 index 062191020..000000000 --- a/components/text_normalization/tests/fixtures/remove_bad_patterns.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": true, - "language": "en", - "remove_punctuation": true, - "remove_additional_whitespaces": true, - "remove_bad_patterns": true - }, - "input": { - "text_data": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - }, - "output": { - "text_data": [ - "lorem ipsum dolor sit amet consectetur adipiscing elit", - "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json index a5103f2dc..c4a8eab71 100644 --- a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json +++ b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json @@ -4,12 +4,12 @@ "do_lowercase": false, "language": "de", "remove_punctuation": false, - "remove_additional_whitespaces": false, + "remove_additional_whitespaces": true, "remove_bad_patterns": true }, "input": { "text_data": [ - "Lorem ipsum dolor sit \n HELLO WORLD some \n amet, consectetur adipiscing elit.", + "Lorem ipsum dolor sit \n HELLO WORLD \n amet, consectetur adipiscing elit.", "Nulla facilisi. Sed eu nulla sit \n 10 Likes \n amet enim scelerisque dapibus!", "Suspendisse potenti. 
Fusce sit amet erat vel nunc placerat bibendum.", "45345345" @@ -17,9 +17,9 @@ }, "output": { "text_data": [ - "Lorem ipsum dolor sit amet consectetur adipiscing elit", - "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus", - "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!", + "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." ] } } \ No newline at end of file diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py new file mode 100644 index 000000000..af2c7d6b7 --- /dev/null +++ b/components/text_normalization/tests/utils_test.py @@ -0,0 +1,42 @@ + +from components.text_normalization.src.utils import ( + is_counter, + is_one_word, + mainly_uppercase, + only_numerical, +) + + +def test_mainly_uppercase(): + line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" + assert mainly_uppercase(line, threshold=0.5) + +def test_mainly_uppercase_under_threshold(): + line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" + assert ~mainly_uppercase(line, threshold=0.9) + +def test_only_numerical(): + line = "42" + assert only_numerical(line) + +def test_only_numerical_on_words(): + line = "42 lorem ipsum" + assert ~only_numerical(line) + +def test_is_counter(): + line = "13 Likes" + assert is_counter(line) + +def test_is_not_counter(): + line = "Hello world! 42 people are part of .." + assert ~is_counter(line) + +def test_is_one_word(): + line = "word" + assert is_one_word(line) + +def test_is_not_one_word(): + line = "two words" + assert ~is_one_word(line) + + From efe5c49702c030d074b699aaf613323443411992 Mon Sep 17 00:00:00 2001 From: mrchtr Date: Thu, 3 Aug 2023 08:21:51 +0200 Subject: [PATCH 4/9] Add component readme.md --- components/text_normalization/README.md | 12 ++++++++++++ components/text_normalization/fondant_component.yaml | 3 --- 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 components/text_normalization/README.md diff --git a/components/text_normalization/README.md b/components/text_normalization/README.md new file mode 100644 index 000000000..7b01ecbd7 --- /dev/null +++ b/components/text_normalization/README.md @@ -0,0 +1,12 @@ +# Text normalization component + +This component implements several text normalization techniques to clean and preprocess textual data: + +- Apply lowercasing: Converts all text to lowercase +- Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs +- Apply NFC normalization: Converts characters to their canonical representation +- Remove common seen patterns in webpages following the implementation of [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) +- Remove punctuation: Strips punctuation marks from the text + +These text normalization techniques are valuable for preparing text data before using it for +the training of large language models. 
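+
+As an illustrative sketch (not the component's actual interface), the steps above
+roughly correspond to the following plain-Python operations, using the same
+libraries the component itself relies on:
+
+```python
+import re
+import string
+
+import ftfy
+
+text = "  Ça VA bien !\t"
+text = ftfy.fix_text(text, normalization="NFC")  # NFC normalization
+text = text.lower()  # lowercasing
+text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
+text = re.sub(r"\s+", " ", text.strip())  # collapse extra whitespace
+```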
\ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 2f5070466..3df9f2ae4 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -21,9 +21,6 @@ args: do_lowercase: description: If true apply lowercasing type: bool - language: - description: Language is needed for language specific normalizations - type: str remove_punctuation: description: If true punctuation will be removed type: str \ No newline at end of file From 1d35b0d22b625d17240ff352381bfc95e3ab734d Mon Sep 17 00:00:00 2001 From: mrchtr Date: Mon, 7 Aug 2023 20:39:56 +0200 Subject: [PATCH 5/9] Addressing comments --- components/text_normalization/Dockerfile | 19 +++++-- .../text_normalization/fondant_component.yaml | 2 +- .../text_normalization/requirements.txt | 2 +- components/text_normalization/src/main.py | 6 +-- components/text_normalization/src/utils.py | 8 +-- .../tests/component_test.py | 50 ------------------- .../text_normalization/tests/conftest.py | 8 --- .../text_normalization/tests/utils_test.py | 4 +- src/fondant/abstract_component_test.py | 3 ++ 9 files changed, 28 insertions(+), 74 deletions(-) delete mode 100644 components/text_normalization/tests/component_test.py delete mode 100644 components/text_normalization/tests/conftest.py diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile index 605adc7e9..ac4d4aedf 100644 --- a/components/text_normalization/Dockerfile +++ b/components/text_normalization/Dockerfile @@ -1,18 +1,27 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 python:3.8-slim as base ## System dependencies RUN apt-get update && \ apt-get upgrade -y && \ apt-get install git -y +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/mrchtr/fondant + # install requirements COPY requirements.txt / RUN pip3 install --no-cache-dir -r requirements.txt -# Set the working directory to the component folder -WORKDIR /component/src - # Copy over src-files -COPY src/ . +COPY src/ src/ +# Run component tests +FROM base as test +RUN pip3 install pytest pandas # TODO add pytest to package setup +COPY tests/ tests/ +RUN ["python", "-m", "pytest", "tests/"] + +FROM base +# Set the working directory to the component folder +WORKDIR /src +RUN echo $(ls) ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 3df9f2ae4..6057fbfa4 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -16,7 +16,7 @@ args: description: If true apply nfc normalization type: bool remove_bad_patterns: - description: If true remove bad patterns + description: If true remove common patterns in web texts (e.g. 
lines contains only number, lines consists of uppercase letters, or counters) type: bool do_lowercase: description: If true apply lowercasing diff --git a/components/text_normalization/requirements.txt b/components/text_normalization/requirements.txt index 9e5daac86..a4299def8 100644 --- a/components/text_normalization/requirements.txt +++ b/components/text_normalization/requirements.txt @@ -1 +1 @@ -ftfy=6.1.1 \ No newline at end of file +ftfy==6.1.1 \ No newline at end of file diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 50e2ca2af..741b45041 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -75,12 +75,12 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - if self.do_lowercase: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) - if self.remove_bad_patterns: dataframe[("text", "data")] = dataframe[("text", "data")].apply(remove_noisy_lines) + if self.do_lowercase: + dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) + if self.apply_nfc: dataframe[("text", "data")] = dataframe[("text", "data")].apply( self._do_nfc_normalization) diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py index 5edc1cd9c..24e1db54a 100644 --- a/components/text_normalization/src/utils.py +++ b/components/text_normalization/src/utils.py @@ -1,7 +1,7 @@ import re -def mainly_uppercase(line, threshold=0.7): +def mainly_uppercase(line: str, threshold: float = 0.7) -> bool: """ Checks if a line is mainly composed of uppercase characters. @@ -22,7 +22,7 @@ def mainly_uppercase(line, threshold=0.7): return uppercase_ratio >= threshold -def only_numerical(line): +def only_numerical(line: str) -> bool: """ Checks if a line is composed only of numerical characters. @@ -35,7 +35,7 @@ def only_numerical(line): return line.isdigit() -def is_counter(line): +def is_counter(line: str) -> bool: """ Checks if a line represents a counter (e.g., "3 likes"). @@ -51,7 +51,7 @@ def is_counter(line): return re.match(pattern, line) is not None -def is_one_word(line): +def is_one_word(line: str) -> bool: """ Checks if a line contains only one word. diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py deleted file mode 100644 index bff208ed6..000000000 --- a/components/text_normalization/tests/component_test.py +++ /dev/null @@ -1,50 +0,0 @@ -import json -import os -import typing as t -from glob import glob - -import pandas as pd -import pytest -from fondant.component_spec import ComponentSpec - -from components.text_normalization.src.main import TextNormalizationComponent - - -class MockedComponentSpec(ComponentSpec): - """Just for mocking purpose. 
This component spec is not needed for unit testing.""" - - def __init__(self, specification: t.Dict[str, t.Any]): - pass - - -def load_fixtures(path="./fixtures"): - test_configurations = [] - fixture_list = glob(path + "/*.json") - for fixture in fixture_list: - with open(fixture) as file: - fixture_dict = json.load(file) - - fixture_name = os.path.splitext(fixture)[0] - user_arguments = fixture_dict["user_arguments"] - input_data = { - tuple(key.split("_")): value for key, value in fixture_dict["input"].items() - } - expected_out = { - tuple(key.split("_")): value - for key, value in fixture_dict["output"].items() - } - - test_configurations.append((fixture_name, user_arguments, input_data, expected_out)) - - return test_configurations - - -@pytest.mark.parametrize(("fixture_name", "user_arguments", "input_data", "expected_output"), - load_fixtures()) -def test_component(fixture_name, user_arguments, input_data, expected_output): - """Test transform method of text normalization component.""" - print("Running test case based on: ", fixture_name) - component = TextNormalizationComponent(MockedComponentSpec({}), **user_arguments) - input_df = pd.DataFrame(input_data) - transformed_output = component.transform(input_df) - pd.testing.assert_frame_equal(pd.DataFrame(expected_output), transformed_output) diff --git a/components/text_normalization/tests/conftest.py b/components/text_normalization/tests/conftest.py deleted file mode 100644 index ef13b7321..000000000 --- a/components/text_normalization/tests/conftest.py +++ /dev/null @@ -1,8 +0,0 @@ -import os -import sys - -# Get the absolute path to the "src" directory -src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")) - -# Append the "src" directory to the Python path -sys.path.append(src_path) diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py index af2c7d6b7..3f76dcfa6 100644 --- a/components/text_normalization/tests/utils_test.py +++ b/components/text_normalization/tests/utils_test.py @@ -1,5 +1,5 @@ -from components.text_normalization.src.utils import ( +from src.utils import ( is_counter, is_one_word, mainly_uppercase, @@ -37,6 +37,6 @@ def test_is_one_word(): def test_is_not_one_word(): line = "two words" - assert ~is_one_word(line) + assert not is_one_word(line) diff --git a/src/fondant/abstract_component_test.py b/src/fondant/abstract_component_test.py index 92a1dd69f..079f3dccd 100644 --- a/src/fondant/abstract_component_test.py +++ b/src/fondant/abstract_component_test.py @@ -3,6 +3,9 @@ import pandas as pd import pytest +from fondant.component_spec import ComponentSpec + + class AbstractComponentTest(ABC): @abstractmethod From e0c0c8c00ea9856f14ace0f3c14117a6c2a3b6ed Mon Sep 17 00:00:00 2001 From: mrchtr Date: Tue, 8 Aug 2023 08:38:52 +0200 Subject: [PATCH 6/9] Update docsstrings, adapt component test to use the AbstractComponentTest --- components/text_normalization/Dockerfile | 2 +- .../text_normalization/fondant_component.yaml | 4 +-- components/text_normalization/src/main.py | 15 +++++++---- .../tests/fixtures/apply_all.json | 22 ---------------- .../apply_nfc_text_normalization.json | 24 ------------------ .../lowercasing_text_normalization.json | 24 ------------------ ..._additional_whitespaces_normalization.json | 24 ------------------ ...emove_bad_patterns_text_normalization.json | 25 ------------------- .../remove_puncuation_text_normalization.json | 24 ------------------ src/fondant/abstract_component_test.py | 3 --- 10 files 
changed, 13 insertions(+), 154 deletions(-) delete mode 100644 components/text_normalization/tests/fixtures/apply_all.json delete mode 100644 components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/lowercasing_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json delete mode 100644 components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile index ac4d4aedf..12137c722 100644 --- a/components/text_normalization/Dockerfile +++ b/components/text_normalization/Dockerfile @@ -17,11 +17,11 @@ COPY src/ src/ # Run component tests FROM base as test RUN pip3 install pytest pandas # TODO add pytest to package setup +ENV PYTHONPATH "${PYTHONPATH}:./src" COPY tests/ tests/ RUN ["python", "-m", "pytest", "tests/"] FROM base # Set the working directory to the component folder WORKDIR /src -RUN echo $(ls) ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/components/text_normalization/fondant_component.yaml b/components/text_normalization/fondant_component.yaml index 6057fbfa4..f9d2bfabb 100644 --- a/components/text_normalization/fondant_component.yaml +++ b/components/text_normalization/fondant_component.yaml @@ -15,8 +15,8 @@ args: apply_nfc: description: If true apply nfc normalization type: bool - remove_bad_patterns: - description: If true remove common patterns in web texts (e.g. lines contains only number, lines consists of uppercase letters, or counters) + normalize_lines: + description: If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter type: bool do_lowercase: description: If true apply lowercasing diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py index 741b45041..4c51f346e 100644 --- a/components/text_normalization/src/main.py +++ b/components/text_normalization/src/main.py @@ -28,7 +28,7 @@ def _remove_additional_whitespaces(text): return re.sub(r"\s+", " ", text.strip()) -def remove_noisy_lines(text): +def normalize_lines(text): def any_condition_met(line, discard_condition_functions): return any(condition(line) for condition in discard_condition_functions) @@ -45,7 +45,7 @@ def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool, do_lowercase: bool, language: str, remove_punctuation: bool): self.remove_additional_whitespaces = remove_additional_whitespaces self.apply_nfc = apply_nfc - self.remove_bad_patterns = remove_bad_patterns + self.normalize_lines = remove_bad_patterns self.do_lowercase = do_lowercase self.language = language self.remove_punctuation = remove_punctuation @@ -67,7 +67,11 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Apply normalization transformations. The component is capable of: - NFC normalization - Lowercasing - - Removing of regex patterns. + - Removing of unnecessary whitespaces (e.g. tabs), punctuation + - Apply line-wise transformations that exclude lines matching specified patterns. 
+ Patterns include lines that are mainly composed of uppercase characters, lines that consist + only of numerical characters, lines that are counters (e.g., "3 likes"), and lines + that contain only one word. Args: dataframe: Pandas dataframe. @@ -75,8 +79,9 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - if self.remove_bad_patterns: - dataframe[("text", "data")] = dataframe[("text", "data")].apply(remove_noisy_lines) + if self.normalize_lines: + dataframe[("text", "data")] = dataframe[("text", "data")].apply( + normalize_lines) if self.do_lowercase: dataframe[("text", "data")] = dataframe[("text", "data")].apply(lambda x: x.lower()) diff --git a/components/text_normalization/tests/fixtures/apply_all.json b/components/text_normalization/tests/fixtures/apply_all.json deleted file mode 100644 index 062191020..000000000 --- a/components/text_normalization/tests/fixtures/apply_all.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": true, - "language": "en", - "remove_punctuation": true, - "remove_additional_whitespaces": true, - "remove_bad_patterns": true - }, - "input": { - "text_data": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - }, - "output": { - "text_data": [ - "lorem ipsum dolor sit amet consectetur adipiscing elit", - "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus" - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json deleted file mode 100644 index b9d7977f0..000000000 --- a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": false, - "language": "en", - "remove_punctuation": false, - "remove_additional_whitespaces": false, - "remove_bad_patterns": false - }, - "input": { - "text_data": [ - "\u0043\u0327 something", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - }, - "output": { - "text_data": [ - "\u00C7 something", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus." - ] - } -} \ No newline at end of file diff --git a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json deleted file mode 100644 index bd64f90cb..000000000 --- a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "user_arguments": { - "apply_nfc": true, - "do_lowercase": true, - "language": "en", - "remove_punctuation": false, - "remove_additional_whitespaces": false, - "remove_bad_patterns": false - }, - "input": { - "text_data": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.", - "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum." - ] - }, - "output": { - "text_data": [ - "lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "nulla facilisi. sed eu nulla sit amet enim scelerisque dapibus.", - "suspendisse potenti. fusce sit amet erat vel nunc placerat bibendum." 
diff --git a/components/text_normalization/tests/fixtures/apply_all.json b/components/text_normalization/tests/fixtures/apply_all.json
deleted file mode 100644
index 062191020..000000000
--- a/components/text_normalization/tests/fixtures/apply_all.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": true,
-        "language": "en",
-        "remove_punctuation": true,
-        "remove_additional_whitespaces": true,
-        "remove_bad_patterns": true
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "lorem ipsum dolor sit amet consectetur adipiscing elit",
-            "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus"
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json b/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json
deleted file mode 100644
index b9d7977f0..000000000
--- a/components/text_normalization/tests/fixtures/apply_nfc_text_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": false,
-        "language": "en",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": false,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "\u0043\u0327 something",
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "\u00C7 something",
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json b/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json
deleted file mode 100644
index bd64f90cb..000000000
--- a/components/text_normalization/tests/fixtures/lowercasing_text_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": true,
-        "language": "en",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": false,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "nulla facilisi. sed eu nulla sit amet enim scelerisque dapibus.",
-            "suspendisse potenti. fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json b/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json
deleted file mode 100644
index 36f519967..000000000
--- a/components/text_normalization/tests/fixtures/remove_additional_whitespaces_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": false,
-        "do_lowercase": false,
-        "language": "en",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": true,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "    Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            " Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum.   "
-        ]
-    },
-    "output": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json b/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json
deleted file mode 100644
index c4a8eab71..000000000
--- a/components/text_normalization/tests/fixtures/remove_bad_patterns_text_normalization.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": false,
-        "do_lowercase": false,
-        "language": "de",
-        "remove_punctuation": false,
-        "remove_additional_whitespaces": true,
-        "remove_bad_patterns": true
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit \n HELLO WORLD \n amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit \n 10 Likes \n amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum.",
-            "45345345"
-        ]
-    },
-    "output": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    }
-}
\ No newline at end of file
diff --git a/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json b/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json
deleted file mode 100644
index b03fd81f1..000000000
--- a/components/text_normalization/tests/fixtures/remove_puncuation_text_normalization.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-    "user_arguments": {
-        "apply_nfc": true,
-        "do_lowercase": false,
-        "language": "en",
-        "remove_punctuation": true,
-        "remove_additional_whitespaces": false,
-        "remove_bad_patterns": false
-    },
-    "input": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus!",
-            "Suspendisse potenti. Fusce sit amet erat vel nunc placerat bibendum."
-        ]
-    },
-    "output": {
-        "text_data": [
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit",
-            "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus",
-            "Suspendisse potenti Fusce sit amet erat vel nunc placerat bibendum"
-        ]
-    }
-}
\ No newline at end of file
diff --git a/src/fondant/abstract_component_test.py b/src/fondant/abstract_component_test.py
index 079f3dccd..92a1dd69f 100644
--- a/src/fondant/abstract_component_test.py
+++ b/src/fondant/abstract_component_test.py
@@ -3,9 +3,6 @@
 
 import pandas as pd
 import pytest
-from fondant.component_spec import ComponentSpec
-
-
 
 class AbstractComponentTest(ABC):
     @abstractmethod

From d5a508f177938f0758321f3fa112355c92670865 Mon Sep 17 00:00:00 2001
From: mrchtr
Date: Tue, 8 Aug 2023 08:46:10 +0200
Subject: [PATCH 7/9] Update docker file

---
 components/text_normalization/Dockerfile          | 2 +-
 components/text_normalization/src/utils.py        | 8 ++++----
 components/text_normalization/tests/utils_test.py | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/components/text_normalization/Dockerfile b/components/text_normalization/Dockerfile
index 12137c722..eb96b604a 100644
--- a/components/text_normalization/Dockerfile
+++ b/components/text_normalization/Dockerfile
@@ -16,7 +16,7 @@ COPY src/ src/
 
 # Run component tests
 FROM base as test
-RUN pip3 install pytest pandas # TODO add pytest to package setup
+RUN pip3 install pytest # TODO add pytest to package setup
 ENV PYTHONPATH "${PYTHONPATH}:./src"
 COPY tests/ tests/
 RUN ["python", "-m", "pytest", "tests/"]
diff --git a/components/text_normalization/src/utils.py b/components/text_normalization/src/utils.py
index 24e1db54a..b487bc61e 100644
--- a/components/text_normalization/src/utils.py
+++ b/components/text_normalization/src/utils.py
@@ -6,7 +6,7 @@ def mainly_uppercase(line: str, threshold: float = 0.7) -> bool:
     Checks if a line is mainly composed of uppercase characters.
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
         threshold (float): The threshold (between 0 and 1) to determine what is considered
         "mainly uppercase."
 
@@ -27,7 +27,7 @@ def only_numerical(line: str) -> bool:
     Checks if a line is composed only of numerical characters.
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
 
     Returns:
         bool: True if the line is only composed of numerical characters, False otherwise.
@@ -40,7 +40,7 @@ def is_counter(line: str) -> bool:
     Checks if a line represents a counter (e.g., "3 likes").
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
 
     Returns:
         bool: True if the line represents a counter, False otherwise.
@@ -56,7 +56,7 @@ def is_one_word(line: str) -> bool:
     Checks if a line contains only one word.
 
     Args:
-        line (str): The input line to check.
+        line: The input line to check.
 
     Returns:
         bool: True if the line contains only one word, False otherwise.
diff --git a/components/text_normalization/tests/utils_test.py b/components/text_normalization/tests/utils_test.py
index 3f76dcfa6..44f360676 100644
--- a/components/text_normalization/tests/utils_test.py
+++ b/components/text_normalization/tests/utils_test.py
@@ -13,7 +13,7 @@ def test_mainly_uppercase():
 
 def test_mainly_uppercase_under_threshold():
     line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE"
-    assert ~mainly_uppercase(line, threshold=0.9)
+    assert not mainly_uppercase(line, threshold=0.9)
 
 def test_only_numerical():
     line = "42"
@@ -21,7 +21,7 @@ def test_only_numerical():
 
 def test_only_numerical_on_words():
     line = "42 lorem ipsum"
-    assert ~only_numerical(line)
+    assert not only_numerical(line)
 
 def test_is_counter():
     line = "13 Likes"
@@ -29,7 +29,7 @@ def test_is_counter():
 
 def test_is_not_counter():
     line = "Hello world! 42 people are part of .."
-    assert ~is_counter(line)
+    assert not is_counter(line)
 
 def test_is_one_word():
     line = "word"
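The switch from ~ to not in these assertions is a behavioral fix, not a style change: on Python booleans, ~ is bitwise inversion of the underlying int, so ~True == -2 and ~False == -1, and both results are truthy, which means an assertion like assert ~mainly_uppercase(...) can never fail. A quick standalone illustration, independent of the component code:

    # Bitwise NOT inverts the underlying int of a bool, so the result is never falsy:
    assert ~True == -2
    assert ~False == -1
    assert bool(~False)  # -1 is truthy, so `assert ~some_bool` always passes

    # Logical negation is what these tests actually need:
    assert not False

Before this patch, the three tests above would have passed even if the predicates returned the wrong answer.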

From 2a7a73319f91874215a79eb4533955b845f3444a Mon Sep 17 00:00:00 2001
From: mrchtr
Date: Tue, 8 Aug 2023 14:45:18 +0200
Subject: [PATCH 8/9] Testing strategy drafts

---
 components/text_normalization/src/main.py |   3 +-
 .../tests/component_test.py               | 122 ++++++++++++++++++
 src/fondant/testing_utils.py              |  39 ++++++
 3 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 components/text_normalization/tests/component_test.py
 create mode 100644 src/fondant/testing_utils.py

diff --git a/components/text_normalization/src/main.py b/components/text_normalization/src/main.py
index 4c51f346e..662c8023c 100644
--- a/components/text_normalization/src/main.py
+++ b/components/text_normalization/src/main.py
@@ -42,12 +42,11 @@ class TextNormalizationComponent(PandasTransformComponent):
 
     def __init__(self, *args, remove_additional_whitespaces: bool, apply_nfc: bool,
                  remove_bad_patterns: bool,
-                 do_lowercase: bool, language: str, remove_punctuation: bool):
+                 do_lowercase: bool, remove_punctuation: bool):
         self.remove_additional_whitespaces = remove_additional_whitespaces
         self.apply_nfc = apply_nfc
         self.normalize_lines = remove_bad_patterns
         self.do_lowercase = do_lowercase
-        self.language = language
         self.remove_punctuation = remove_punctuation
 
     @staticmethod
diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py
new file mode 100644
index 000000000..d39d9273b
--- /dev/null
+++ b/components/text_normalization/tests/component_test.py
@@ -0,0 +1,122 @@
+from typing import Any, Dict
+
+import pandas as pd
+import pytest
+from fondant.testing_utils import execute_pandas_transform_component
+
+from src.main import TextNormalizationComponent
+
+
+def test_transform_custom_component_test():
+    """Test the component's transform method.
+    Option 1: handling the test case is up to the users.
+    """
+    user_arguments = {
+        "remove_additional_whitespaces": True,
+        "apply_nfc": True,
+        "remove_bad_patterns": True,
+        "do_lowercase": True,
+        "remove_punctuation": True,
+    }
+    component = TextNormalizationComponent(**user_arguments)
+
+    input_dataframe = pd.DataFrame([
+        "\u0043\u0327 something",
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+    ], columns=[("text", "data")])
+
+    expected_output = pd.DataFrame([
+        "\u00e7 something",
+        "lorem ipsum dolor sit amet consectetur adipiscing elit",
+        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
+    ], columns=[("text", "data")])
+
+    output_dataframe = component.transform(input_dataframe)
+
+    pd.testing.assert_frame_equal(
+        left=expected_output,
+        right=output_dataframe,
+        check_dtype=False,
+    )
+
+
+def test_transform_helper_methods():
+    """Test the component's transform method.
+    Option 2: using the helper method provided by fondant.
+    """
+    user_arguments = {
+        "remove_additional_whitespaces": True,
+        "apply_nfc": True,
+        "remove_bad_patterns": True,
+        "do_lowercase": True,
+        "remove_punctuation": True,
+    }
+    component = TextNormalizationComponent(**user_arguments)
+
+    input_dataframe = pd.DataFrame([
+        "\u0043\u0327 something",
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+    ], columns=[("text", "data")])
+
+    expected_output = pd.DataFrame([
+        "\u00e7 something",
+        "lorem ipsum dolor sit amet consectetur adipiscing elit",
+        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
+    ], columns=[("text", "data")])
+
+    execute_pandas_transform_component(component, input_dataframe, expected_output)
+
+
+data = [
+    # first scenario
+    {
+        "user_arguments": {
+            "remove_additional_whitespaces": True,
+            "apply_nfc": True,
+            "remove_bad_patterns": True,
+            "do_lowercase": True,
+            "remove_punctuation": True,
+        },
+        "input_dataframe": pd.DataFrame([
+            "\u0043\u0327 something",
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+        ], columns=[("text", "data")]),
+        "output_dataframe": pd.DataFrame([
+            "\u00e7 something",
+            "lorem ipsum dolor sit amet consectetur adipiscing elit",
+            "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
+        ], columns=[("text", "data")]),
+    },
+
+    # second scenario
+    {
+        "user_arguments": {
+            "remove_additional_whitespaces": True,
+            "apply_nfc": True,
+            "remove_bad_patterns": True,
+            "do_lowercase": False,
+            "remove_punctuation": True,
+        },
+        "input_dataframe": pd.DataFrame([
+            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
+        ], columns=[("text", "data")]),
+        "output_dataframe": pd.DataFrame([
+            "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus",
+        ], columns=[("text", "data")]),
+    },
+]
+
+
+@pytest.mark.parametrize(
+    "scenario",
+    data,
+)
+def test_transform_helper_methods_parametrized(scenario: Dict[str, Any]):
+    """Option 3: Only defining parametrized scenarios. Usage of the helper provided by fondant."""
+    component = TextNormalizationComponent(**scenario["user_arguments"])
+    execute_pandas_transform_component(component,
+                                       scenario["input_dataframe"],
+                                       scenario["output_dataframe"])
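Of the three drafted options, Option 3 keeps each scenario declarative and the test body fixed. One possible refinement, offered here as a suggestion rather than part of this patch, is to wrap each scenario in pytest.param with an explicit id so that a failing case is immediately attributable; this reuses the data list and imports from the file above, and the test name and ids are illustrative:

    @pytest.mark.parametrize(
        "scenario",
        [
            pytest.param(data[0], id="apply_all_transformations"),
            pytest.param(data[1], id="keep_casing"),
        ],
    )
    def test_transform_named_scenarios(scenario: Dict[str, Any]):
        component = TextNormalizationComponent(**scenario["user_arguments"])
        execute_pandas_transform_component(component,
                                           scenario["input_dataframe"],
                                           scenario["output_dataframe"])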
diff --git a/src/fondant/testing_utils.py b/src/fondant/testing_utils.py
new file mode 100644
index 000000000..97eed0c86
--- /dev/null
+++ b/src/fondant/testing_utils.py
@@ -0,0 +1,39 @@
+import dask.dataframe as dd
+import pandas as pd
+
+from fondant.component import DaskTransformComponent, PandasTransformComponent
+
+
+def execute_pandas_transform_component(
+    component: PandasTransformComponent,
+    input_dataframe: pd.DataFrame,
+    expected_output: pd.DataFrame,
+):
+    """Helper method for executing a pandas transform component."""
+    _compare_pandas_dataframe(component.transform(input_dataframe), expected_output)
+
+
+def _compare_pandas_dataframe(
+    output_dataframe: pd.DataFrame,
+    expected_output: pd.DataFrame,
+):
+    """Compare two pandas dataframes."""
+    pd.testing.assert_frame_equal(
+        left=expected_output,
+        right=output_dataframe,
+        check_dtype=False,
+    )
+
+
+def execute_dask_transform_component(
+    component: DaskTransformComponent,
+    input_dataframe: dd.DataFrame,
+    expected_output: dd.DataFrame,
+):
+    """Helper method for executing a dask transform component."""
+    _compare_dask_dataframe(component.transform(input_dataframe), expected_output)
+
+
+def _compare_dask_dataframe(expected_output: dd.DataFrame, output_dataframe: dd.DataFrame):
+    msg = "Not implemented."
+    raise NotImplementedError(msg)
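The dask comparison above is intentionally left unimplemented in this draft. For the small frames typically used in component tests, one workable sketch, an assumption on my part rather than part of the patch, is to materialize both lazy dataframes and fall back on the pandas comparison:

    import dask.dataframe as dd
    import pandas as pd

    def _compare_dask_dataframe(expected_output: dd.DataFrame, output_dataframe: dd.DataFrame):
        # Materializing with .compute() is fine for small test inputs;
        # avoid this on production-sized data.
        pd.testing.assert_frame_equal(
            left=expected_output.compute().reset_index(drop=True),
            right=output_dataframe.compute().reset_index(drop=True),
            check_dtype=False,
        )

Resetting the index sidesteps spurious failures from partition-dependent index ordering.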

From e914376fe907ffef05874e4d9cd0e07a8d3d83b2 Mon Sep 17 00:00:00 2001
From: mrchtr
Date: Tue, 15 Aug 2023 07:40:03 +0200
Subject: [PATCH 9/9] Refactor unit tests

---
 .../text_normalization/tests/component_test.py | 88 +------------------
 src/fondant/testing_utils.py                   | 39 --------
 2 files changed, 1 insertion(+), 126 deletions(-)
 delete mode 100644 src/fondant/testing_utils.py

diff --git a/components/text_normalization/tests/component_test.py b/components/text_normalization/tests/component_test.py
index d39d9273b..34ce528aa 100644
--- a/components/text_normalization/tests/component_test.py
+++ b/components/text_normalization/tests/component_test.py
@@ -1,16 +1,11 @@
-from typing import Any, Dict
 
 import pandas as pd
-import pytest
-from fondant.testing_utils import execute_pandas_transform_component
 
 from src.main import TextNormalizationComponent
 
 
 def test_transform_custom_component_test():
-    """Test the component's transform method.
-    Option 1: handling the test case is up to the users.
-    """
+    """Test the component's transform method."""
     user_arguments = {
         "remove_additional_whitespaces": True,
         "apply_nfc": True,
         "remove_bad_patterns": True,
         "do_lowercase": True,
         "remove_punctuation": True,
     }
     component = TextNormalizationComponent(**user_arguments)
 
     input_dataframe = pd.DataFrame([
         "\u0043\u0327 something",
         "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
         "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
     ], columns=[("text", "data")])
 
     expected_output = pd.DataFrame([
         "\u00e7 something",
         "lorem ipsum dolor sit amet consectetur adipiscing elit",
         "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
     ], columns=[("text", "data")])
 
     output_dataframe = component.transform(input_dataframe)
 
     pd.testing.assert_frame_equal(
         left=expected_output,
         right=output_dataframe,
         check_dtype=False,
     )
-
-
-def test_transform_helper_methods():
-    """Test the component's transform method.
-    Option 2: using the helper method provided by fondant.
-    """
-    user_arguments = {
-        "remove_additional_whitespaces": True,
-        "apply_nfc": True,
-        "remove_bad_patterns": True,
-        "do_lowercase": True,
-        "remove_punctuation": True,
-    }
-    component = TextNormalizationComponent(**user_arguments)
-
-    input_dataframe = pd.DataFrame([
-        "\u0043\u0327 something",
-        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-        "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-    ], columns=[("text", "data")])
-
-    expected_output = pd.DataFrame([
-        "\u00e7 something",
-        "lorem ipsum dolor sit amet consectetur adipiscing elit",
-        "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
-    ], columns=[("text", "data")])
-
-    execute_pandas_transform_component(component, input_dataframe, expected_output)
-
-
-data = [
-    # first scenario
-    {
-        "user_arguments": {
-            "remove_additional_whitespaces": True,
-            "apply_nfc": True,
-            "remove_bad_patterns": True,
-            "do_lowercase": True,
-            "remove_punctuation": True,
-        },
-        "input_dataframe": pd.DataFrame([
-            "\u0043\u0327 something",
-            "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-        ], columns=[("text", "data")]),
-        "output_dataframe": pd.DataFrame([
-            "\u00e7 something",
-            "lorem ipsum dolor sit amet consectetur adipiscing elit",
-            "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
-        ], columns=[("text", "data")]),
-    },
-
-    # second scenario
-    {
-        "user_arguments": {
-            "remove_additional_whitespaces": True,
-            "apply_nfc": True,
-            "remove_bad_patterns": True,
-            "do_lowercase": False,
-            "remove_punctuation": True,
-        },
-        "input_dataframe": pd.DataFrame([
-            "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
-        ], columns=[("text", "data")]),
-        "output_dataframe": pd.DataFrame([
-            "Nulla facilisi Sed eu nulla sit amet enim scelerisque dapibus",
-        ], columns=[("text", "data")]),
-    },
-]
-
-
-@pytest.mark.parametrize(
-    "scenario",
-    data,
-)
-def test_transform_helper_methods_parametrized(scenario: Dict[str, Any]):
-    """Option 3: Only defining parametrized scenarios. Usage of the helper provided by fondant."""
-    component = TextNormalizationComponent(**scenario["user_arguments"])
-    execute_pandas_transform_component(component,
-                                       scenario["input_dataframe"],
-                                       scenario["output_dataframe"])
diff --git a/src/fondant/testing_utils.py b/src/fondant/testing_utils.py
deleted file mode 100644
index 97eed0c86..000000000
--- a/src/fondant/testing_utils.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import dask.dataframe as dd
-import pandas as pd
-
-from fondant.component import DaskTransformComponent, PandasTransformComponent
-
-
-def execute_pandas_transform_component(
-    component: PandasTransformComponent,
-    input_dataframe: pd.DataFrame,
-    expected_output: pd.DataFrame,
-):
-    """Helper method for executing a pandas transform component."""
-    _compare_pandas_dataframe(component.transform(input_dataframe), expected_output)
-
-
-def _compare_pandas_dataframe(
-    output_dataframe: pd.DataFrame,
-    expected_output: pd.DataFrame,
-):
-    """Compare two pandas dataframes."""
-    pd.testing.assert_frame_equal(
-        left=expected_output,
-        right=output_dataframe,
-        check_dtype=False,
-    )
-
-
-def execute_dask_transform_component(
-    component: DaskTransformComponent,
-    input_dataframe: dd.DataFrame,
-    expected_output: dd.DataFrame,
-):
-    """Helper method for executing a dask transform component."""
-    _compare_dask_dataframe(component.transform(input_dataframe), expected_output)
-
-
-def _compare_dask_dataframe(expected_output: dd.DataFrame, output_dataframe: dd.DataFrame):
-    msg = "Not implemented."
-    raise NotImplementedError(msg)