Skip to content

Commit

Permalink
bugfix/clean-pictograms-from-transcripts-before-indexing (#165)
Browse files Browse the repository at this point in the history
* Add emoji and pictogram cleaner function

* Add tests

* Use str.strip()
  • Loading branch information
Jackson Maxfield Brown authored Feb 17, 2022
1 parent ede007f commit 203fec3
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 18 deletions.
1 change: 1 addition & 0 deletions cdp_backend/pipeline/event_index_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ def read_transcripts_and_generate_grams(
cleaned_text=string_utils.clean_text(
sentence.text,
clean_stop_words=True,
clean_emojis=True,
),
n_grams=[],
)
Expand Down
80 changes: 70 additions & 10 deletions cdp_backend/tests/utils/test_string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,84 @@


@pytest.mark.parametrize(
"text, expected, clean_stop_words",
"text, expected, clean_stop_words, clean_emojis",
[
("hello and goodbye", "hello goodbye", True),
(" \t\n hello and to of a goodbye ", "hello goodbye", True),
("hell'o and good-bye", "hello goodbye", True),
("and", "", True),
("hello and goodbye", "hello and goodbye", False),
(
"hello and goodbye",
"hello goodbye",
True,
True,
),
(
" \t\n hello and to of a goodbye ",
"hello goodbye",
True,
True,
),
(
"hell'o and good-bye",
"hello goodbye",
True,
True,
),
(
"and",
"",
True,
True,
),
(
"hello and goodbye",
"hello and goodbye",
False,
True,
),
(
" \t\n hello and to of a goodbye ",
"hello and to of a goodbye",
False,
True,
),
(
"hell'o and good-bye",
"hello and goodbye",
False,
True,
),
(
"and",
"and",
False,
True,
),
(
"♪ Seattle channel music ♪",
"Seattle channel music",
False,
True,
),
(
"\t\n \t♪ Seattle channel music ♪",
"Seattle channel music",
False,
True,
),
("hell'o and good-bye", "hello and goodbye", False),
("and", "and", False),
],
)
def test_clean_text(text: str, expected: str, clean_stop_words: bool) -> None:
assert string_utils.clean_text(text, clean_stop_words=clean_stop_words) == expected
def test_clean_text(
    text: str,
    expected: str,
    clean_stop_words: bool,
    clean_emojis: bool,
) -> None:
    """
    Check that ``string_utils.clean_text`` produces ``expected`` for ``text``
    when called with the parametrized ``clean_stop_words`` and ``clean_emojis``
    flags.
    """
    cleaned = string_utils.clean_text(
        text,
        clean_stop_words=clean_stop_words,
        clean_emojis=clean_emojis,
    )
    assert cleaned == expected


@pytest.mark.parametrize(
Expand Down
51 changes: 43 additions & 8 deletions cdp_backend/utils/string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,38 @@
###############################################################################


def clean_text(text: str, clean_stop_words: bool = False) -> str:
# Compiled once at import time; the character class is a set of explicit
# Unicode ranges that hold emoji / pictographic symbols.
#
# NOTE: the StackOverflow answer this is based on uses a single range
# U+24C2..U+1F251, which also spans kana, CJK ideographs, Hangul syllables and
# full-width forms, and therefore deletes ordinary non-Latin text. That range is
# split below into the emoji-bearing sub-ranges it was meant to cover.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed supplements
    "\U0001F926-\U0001F937"  # selected supplemental pictographs
    "\u24C2-\u25FF"  # circled letters, box drawing, geometric shapes
    "\u2600-\u2B55"  # misc symbols (incl. musical notes), dingbats, arrows
    "\u200d"  # zero width joiner (glues emoji sequences together)
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u231a"  # watch
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph "congratulation"
    "\u3299"  # circled ideograph "secret"
    "]+"
)


def remove_emojis(text: str) -> str:
    """
    Remove emojis, emoticons, and other pictographic symbols from text.

    Adapted from this answer on stackoverflow:
    https://stackoverflow.com/a/58356570

    Parameters
    ----------
    text: str
        The raw text to remove pictographic characters from.

    Returns
    -------
    cleaned_text: str
        The text with every matched character deleted. Surrounding whitespace
        is left untouched, so callers may still need to collapse or strip it.

    Notes
    -----
    Only symbol ranges are matched; letters of non-Latin scripts
    (e.g. Chinese, Japanese, Korean) are preserved.
    """
    return _EMOJI_PATTERN.sub("", text)


def clean_text(
text: str,
clean_stop_words: bool = False,
clean_emojis: bool = False,
) -> str:
"""
Clean text of common characters and extra formatting.
Expand All @@ -23,6 +54,9 @@ def clean_text(text: str, clean_stop_words: bool = False) -> str:
clean_stop_words: bool
Should English stop words be removed from the raw text or not.
Default: False (do not remove stop words)
clean_emojis: bool
Should emojis, emoticons, pictograms, and other characters be removed.
Default: False (do not remove pictograms)
Returns
-------
Expand Down Expand Up @@ -57,22 +91,23 @@ def clean_text(text: str, clean_stop_words: bool = False) -> str:
STOPWORDS = stopwords.words("english")

joined_stopwords = "|".join(STOPWORDS)
cleaned_stopwords = re.sub(
cleaned_text = re.sub(
r"\b(" + joined_stopwords + r")\b",
"",
cleaned_punctuation,
)
else:
# Update for mypy typing
cleaned_stopwords = cleaned_punctuation
cleaned_text = cleaned_punctuation

# Remove pictograms
if clean_emojis:
cleaned_text = remove_emojis(cleaned_text)

# Remove gaps in string
try:
cleaned_doc = re.sub(r" {2,}", " ", cleaned_stopwords)
if cleaned_doc[0] == " ":
cleaned_doc = cleaned_doc[1:]
if cleaned_doc[-1] == " ":
cleaned_doc = cleaned_doc[:-1]
cleaned_doc = re.sub(r" {2,}", " ", cleaned_text)
cleaned_doc = cleaned_doc.strip()

# IndexError occurs when the string was cleaned and it contained entirely stop
# words or punctuation for some reason
Expand Down

0 comments on commit 203fec3

Please sign in to comment.