bugfix/clean-pictograms-from-transcripts-before-indexing (#165)

* Add emoji and pictogram cleaner function
* Add tests
* Use str.strip()
CouncilDataProject · Feb 17, 2022 · 203fec3 · 203fec3
1 parent ede007f
commit 203fec3
Show file tree

Hide file tree

Showing 3 changed files with 114 additions and 18 deletions.
diff --git a/cdp_backend/pipeline/event_index_pipeline.py b/cdp_backend/pipeline/event_index_pipeline.py
@@ -207,6 +207,7 @@ def read_transcripts_and_generate_grams(
                     cleaned_text=string_utils.clean_text(
                         sentence.text,
                         clean_stop_words=True,
+                        clean_emojis=True,
                     ),
                     n_grams=[],
                 )

diff --git a/cdp_backend/tests/utils/test_string_utils.py b/cdp_backend/tests/utils/test_string_utils.py
@@ -9,24 +9,84 @@
 
 
 @pytest.mark.parametrize(
-    "text, expected, clean_stop_words",
+    "text, expected, clean_stop_words, clean_emojis",
     [
-        ("hello and goodbye", "hello goodbye", True),
-        ("   \t\n   hello and to of a         goodbye         ", "hello goodbye", True),
-        ("hell'o    and   good-bye", "hello goodbye", True),
-        ("and", "", True),
-        ("hello and goodbye", "hello and goodbye", False),
+        (
+            "hello and goodbye",
+            "hello goodbye",
+            True,
+            True,
+        ),
+        (
+            "   \t\n   hello and to of a         goodbye         ",
+            "hello goodbye",
+            True,
+            True,
+        ),
+        (
+            "hell'o    and   good-bye",
+            "hello goodbye",
+            True,
+            True,
+        ),
+        (
+            "and",
+            "",
+            True,
+            True,
+        ),
+        (
+            "hello and goodbye",
+            "hello and goodbye",
+            False,
+            True,
+        ),
         (
             "   \t\n   hello and to of a         goodbye         ",
             "hello and to of a goodbye",
             False,
+            True,
+        ),
+        (
+            "hell'o    and   good-bye",
+            "hello and goodbye",
+            False,
+            True,
+        ),
+        (
+            "and",
+            "and",
+            False,
+            True,
+        ),
+        (
+            "♪ Seattle channel music ♪",
+            "Seattle channel music",
+            False,
+            True,
+        ),
+        (
+            "\t\n    \t♪ Seattle channel music ♪",
+            "Seattle channel music",
+            False,
+            True,
         ),
-        ("hell'o    and   good-bye", "hello and goodbye", False),
-        ("and", "and", False),
     ],
 )
-def test_clean_text(text: str, expected: str, clean_stop_words: bool) -> None:
-    assert string_utils.clean_text(text, clean_stop_words=clean_stop_words) == expected
+def test_clean_text(
+    text: str,
+    expected: str,
+    clean_stop_words: bool,
+    clean_emojis: bool,  # NOTE(review): every case parametrized above passes True
+) -> None:
+    assert (  # cleaned output must equal the expected string exactly
+        string_utils.clean_text(
+            text,
+            clean_stop_words=clean_stop_words,
+            clean_emojis=clean_emojis,
+        )
+        == expected
+    )
 
 
 @pytest.mark.parametrize(

diff --git a/cdp_backend/utils/string_utils.py b/cdp_backend/utils/string_utils.py
@@ -12,7 +12,38 @@
 ###############################################################################
 
 
-def clean_text(text: str, clean_stop_words: bool = False) -> str:
+def remove_emojis(text: str) -> str:  # returns text with matched symbols deleted
+    """
+    Minor changes made from this answer on stackoverflow:
+    https://stackoverflow.com/a/58356570
+    """
+    emoj_patterns = re.compile(  # one character class; "+" matches runs of them
+        "["
+        "\U0001F600-\U0001F64F"  # emoticons
+        "\U0001F300-\U0001F5FF"  # symbols & pictographs
+        "\U0001F680-\U0001F6FF"  # transport & map symbols
+        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        "\U00002702-\U000027B0"  # dingbats
+        "\U000024C2-\U0001F251"  # NOTE(review): very wide; also matches CJK/Hangul
+        "\U0001f926-\U0001f937"  # face palm .. shrug
+        "\u2600-\u2B55"  # already inside the U+24C2..U+1F251 range above
+        "\u200d"  # zero width joiner
+        "\u23cf"  # eject symbol
+        "\u23e9"  # fast-forward
+        "\u231a"  # watch
+        "\ufe0f"  # variation selector-16 (not a dingbat)
+        "\u3030"  # wavy dash
+        "]+",
+        re.UNICODE,  # already the default for str patterns in Python 3
+    )
+    return re.sub(emoj_patterns, "", text)  # delete every matched run
+
+
+def clean_text(
+    text: str,
+    clean_stop_words: bool = False,
+    clean_emojis: bool = False,
+) -> str:
     """
     Clean text of common characters and extra formatting.
 
@@ -23,6 +54,9 @@ def clean_text(text: str, clean_stop_words: bool = False) -> str:
     clean_stop_words: bool
         Should English stop words be removed from the raw text or not.
         Default: False (do not remove stop words)
+    clean_emojis: bool
+        Should emojis, emoticons, pictograms, and other characters be removed.
+        Default: False (do not remove pictograms)
 
     Returns
     -------
@@ -57,22 +91,23 @@ def clean_text(text: str, clean_stop_words: bool = False) -> str:
             STOPWORDS = stopwords.words("english")
 
         joined_stopwords = "|".join(STOPWORDS)
-        cleaned_stopwords = re.sub(
+        cleaned_text = re.sub(
             r"\b(" + joined_stopwords + r")\b",
             "",
             cleaned_punctuation,
         )
     else:
         # Update for mypy typing
-        cleaned_stopwords = cleaned_punctuation
+        cleaned_text = cleaned_punctuation
+
+    # Remove pictograms
+    if clean_emojis:
+        cleaned_text = remove_emojis(cleaned_text)
 
     # Remove gaps in string
     try:
-        cleaned_doc = re.sub(r" {2,}", " ", cleaned_stopwords)
-        if cleaned_doc[0] == " ":
-            cleaned_doc = cleaned_doc[1:]
-        if cleaned_doc[-1] == " ":
-            cleaned_doc = cleaned_doc[:-1]
+        cleaned_doc = re.sub(r" {2,}", " ", cleaned_text)
+        cleaned_doc = cleaned_doc.strip()
 
     # IndexError occurs when the string was cleaned and it contained entirely stop
     # words or punctuation for some reason