From c083e90d04b5bd1aaacd480482bcede89fcdf2a4 Mon Sep 17 00:00:00 2001
From: mingfengwan
Date: Wed, 10 Jun 2020 12:02:57 -0400
Subject: [PATCH 1/5] if the hashtag or word is in all caps, lower it

Example: "IMUSTGO" to "imustgo"
---
 ekphrasis/classes/segmenter.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ekphrasis/classes/segmenter.py b/ekphrasis/classes/segmenter.py
index 821de70..1b4cc0c 100644
--- a/ekphrasis/classes/segmenter.py
+++ b/ekphrasis/classes/segmenter.py
@@ -131,6 +131,8 @@ def find_segment(self, text, prev=''):
     # if you don't have enough RAM lower the maxsize
     @lru_cache(maxsize=65536)
     def segment(self, word):
+        if word.isupper():
+            word = word.lower()
         if word.islower():
             return " ".join(self.find_segment(word)[1])
         else:

From 1b5dad5c9a3d25b9170c22737f289ffe0760cee5 Mon Sep 17 00:00:00 2001
From: mingfengwan
Date: Wed, 10 Jun 2020 12:41:10 -0400
Subject: [PATCH 2/5] change regex to allow non-English characters

---
 ekphrasis/tools/generate_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ekphrasis/tools/generate_stats.py b/ekphrasis/tools/generate_stats.py
index ba110f7..c521aa3 100644
--- a/ekphrasis/tools/generate_stats.py
+++ b/ekphrasis/tools/generate_stats.py
@@ -12,7 +12,7 @@
 import numpy
 from tqdm import tqdm
 
-REGEX_TOKEN = re.compile(r'(?
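
A quick sketch of what PATCH 1/5 changes in practice (not part of the series itself): segment() only ran find_segment() on input that passed word.islower(), so an all-caps token came back untouched. Lowering it first routes it through the normal statistical path. This assumes the "english" word statistics are available locally; the expected outputs are illustrative, not captured test output.

    from ekphrasis.classes.segmenter import Segmenter

    seg = Segmenter(corpus="english")

    # Lowercase input always took the statistical segmentation path:
    print(seg.segment("imustgo"))   # expected: "i must go"

    # All-caps input previously skipped the islower() branch and was
    # returned unsegmented; with PATCH 1/5 it is lowered first:
    print(seg.segment("IMUSTGO"))   # expected: "i must go"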

From: mingfengwan
Date: Wed, 5 Aug 2020 18:52:30 -0400
Subject: [PATCH 3/5] code rearrangement

---
 ekphrasis/classes/preprocessor.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/ekphrasis/classes/preprocessor.py b/ekphrasis/classes/preprocessor.py
index e79f171..0e4187a 100644
--- a/ekphrasis/classes/preprocessor.py
+++ b/ekphrasis/classes/preprocessor.py
@@ -7,7 +7,7 @@
 from ekphrasis.classes.segmenter import Segmenter
 from ekphrasis.classes.spellcorrect import SpellCorrector
 from ekphrasis.utils.nlp import unpack_contractions
-from ekphrasis.utils.helpers import remove_tags
+
 
 # noinspection PyPackageRequirements
 class TextPreProcessor:
@@ -72,8 +72,6 @@ def __init__(self, **kwargs):
 
             fix_text (bool): choose if you want to fix bad unicode terms and html entities.
 
-
-            remove_tags (bool): Choose to remove tags after processing
         """
         self.omit = kwargs.get("omit", {})
         self.backoff = kwargs.get("normalize", {})
@@ -89,7 +87,6 @@ def __init__(self, **kwargs):
         self.corrector_corpus = kwargs.get("corrector", "english")
         self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
         self.mode = kwargs.get("mode", "normal")
-        self.remove_tags = kwargs.get("remove_tags", False)
 
         if self.unpack_hashtags:
             self.segmenter = Segmenter(corpus=self.segmenter_corpus)
@@ -134,8 +131,8 @@ def handle_hashtag_match(self, m):
         text = m.group()[1:]
 
         # todo:simplify routine
+        expanded = self.segmenter.segment(text)
         if text.islower():
-            expanded = self.segmenter.segment(text)
             expanded = " ".join(expanded.split("-"))
             expanded = " ".join(expanded.split("_"))
             # print(m.group(), " - ", expanded)
@@ -145,7 +142,6 @@
         else:
             # split words following CamelCase convention
-            expanded = self.regexes["camel_split"].sub(r' \1', text)
             expanded = expanded.replace("-", "")
             expanded = expanded.replace("_", "")
             # print(m.group(), " - ", expanded)
 
@@ -157,8 +153,10 @@ def handle_elongated_match(self, m):
         text = m.group()
+
         # normalize to at most 2 repeating chars
         text = self.regexes["normalize_elong"].sub(r'\1\1', text)
+
         normalized = self.spell_corrector.normalize_elongated(text)
         if normalized:
             text = normalized
 
 
@@ -176,7 +174,6 @@
 
             text = self.add_special_tag(text, "elongated")
         return text
-
 
     @lru_cache(maxsize=65536)
     def handle_repeated_puncts(self, m):
@@ -310,7 +307,7 @@ def pre_process_doc(self, doc):
         if "censored" in self.include_tags:
             doc = self.regexes["censored"].sub(
                 lambda w: self.handle_generic_match(w, "censored"), doc)
-
+
         ###########################
         # unpack contractions: i'm -> i am, can't -> can not...
         ###########################
@@ -319,9 +316,6 @@
         if self.unpack_contractions:
             doc = unpack_contractions(doc)
 
-        if self.remove_tags:
-            doc = remove_tags(doc)
-
         # omit allcaps if inside hashtags
         doc = re.sub(r' +', ' ', doc)  # remove repeating spaces
         # doc = re.sub(r'', '', doc)  # remove repeating spaces
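
PATCH 3/5 hoists the segmenter call above the islower() check and drops the camel_split regex, so every hashtag body now goes through the statistical segmenter; combined with PATCH 1/5, an all-caps hashtag such as #IMUSTGO is lowered and segmented instead of being left intact. Below is a rough, hypothetical stand-in for the new handle_hashtag_match control flow (expand_hashtag is an illustrative name, not code from the patch, and the expected output is an assumption):

    from ekphrasis.classes.segmenter import Segmenter

    seg = Segmenter(corpus="english")

    def expand_hashtag(tag):
        # hypothetical stand-in for handle_hashtag_match after PATCH 3/5
        text = tag[1:]                 # drop the leading '#'
        expanded = seg.segment(text)   # now runs before the islower() check
        if text.islower():
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            # the CamelCase regex split is gone; the segmented text is
            # reused here, with dashes and underscores stripped instead
            expanded = expanded.replace("-", "").replace("_", "")
        return expanded

    print(expand_hashtag("#IMUSTGO"))  # expected: "i must go"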

From 22f8e8158826b852a2a1821b95d9449f643aa863 Mon Sep 17 00:00:00 2001
From: mingfengwan
Date: Wed, 5 Aug 2020 19:25:41 -0400
Subject: [PATCH 4/5] revert and change

---
 ekphrasis/classes/preprocessor.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/ekphrasis/classes/preprocessor.py b/ekphrasis/classes/preprocessor.py
index 0e4187a..fef6ab8 100644
--- a/ekphrasis/classes/preprocessor.py
+++ b/ekphrasis/classes/preprocessor.py
@@ -7,7 +7,7 @@
 from ekphrasis.classes.segmenter import Segmenter
 from ekphrasis.classes.spellcorrect import SpellCorrector
 from ekphrasis.utils.nlp import unpack_contractions
-
+from ekphrasis.utils.helpers import remove_tags
 
 # noinspection PyPackageRequirements
 class TextPreProcessor:
@@ -72,6 +72,8 @@ def __init__(self, **kwargs):
 
             fix_text (bool): choose if you want to fix bad unicode terms and html entities.
 
+
+            remove_tags (bool): Choose to remove tags after processing
         """
         self.omit = kwargs.get("omit", {})
         self.backoff = kwargs.get("normalize", {})
@@ -87,6 +89,7 @@ def __init__(self, **kwargs):
         self.corrector_corpus = kwargs.get("corrector", "english")
         self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
         self.mode = kwargs.get("mode", "normal")
+        self.remove_tags = kwargs.get("remove_tags", False)
 
         if self.unpack_hashtags:
             self.segmenter = Segmenter(corpus=self.segmenter_corpus)
@@ -153,10 +156,8 @@ def handle_elongated_match(self, m):
         text = m.group()
-
         # normalize to at most 2 repeating chars
         text = self.regexes["normalize_elong"].sub(r'\1\1', text)
-
         normalized = self.spell_corrector.normalize_elongated(text)
         if normalized:
             text = normalized
 
 
@@ -174,6 +175,7 @@
 
             text = self.add_special_tag(text, "elongated")
         return text
+
 
     @lru_cache(maxsize=65536)
     def handle_repeated_puncts(self, m):
@@ -307,7 +309,7 @@ def pre_process_doc(self, doc):
         if "censored" in self.include_tags:
             doc = self.regexes["censored"].sub(
                 lambda w: self.handle_generic_match(w, "censored"), doc)
-
+
         ###########################
         # unpack contractions: i'm -> i am, can't -> can not...
         ###########################
@@ -316,6 +318,9 @@
         if self.unpack_contractions:
             doc = unpack_contractions(doc)
 
+        if self.remove_tags:
+            doc = remove_tags(doc)
+
         # omit allcaps if inside hashtags
         doc = re.sub(r' +', ' ', doc)  # remove repeating spaces
         # doc = re.sub(r'', '', doc)  # remove repeating spaces

From f56de1f3e21e8ae1e423145e1904ff5062c75fee Mon Sep 17 00:00:00 2001
From: mingfengwan
Date: Wed, 5 Aug 2020 19:27:48 -0400
Subject: [PATCH 5/5] small issue fix

Imustgo -> imustgo
---
 ekphrasis/classes/segmenter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ekphrasis/classes/segmenter.py b/ekphrasis/classes/segmenter.py
index 1b4cc0c..4bf579a 100644
--- a/ekphrasis/classes/segmenter.py
+++ b/ekphrasis/classes/segmenter.py
@@ -131,7 +131,7 @@ def find_segment(self, text, prev=''):
     # if you don't have enough RAM lower the maxsize
     @lru_cache(maxsize=65536)
     def segment(self, word):
-        if word.isupper():
+        if word.isupper() or (word[0].isupper() and word[1:].islower()):
             word = word.lower()
         if word.islower():
             return " ".join(self.find_segment(word)[1])
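
Net effect of the series: PATCH 4/5 restores the remove_tags option that PATCH 3/5 had dropped, and PATCH 5/5 extends the lowering in segment() to capitalized words such as "Imustgo". A hedged end-to-end sketch, assuming the "english" corpus is installed and that remove_tags strips the annotation markup after processing, as its docstring says; the expected output is an assumption rather than recorded output:

    from ekphrasis.classes.preprocessor import TextPreProcessor

    processor = TextPreProcessor(
        annotate={"hashtag"},   # wrap expanded hashtags in annotation tags
        unpack_hashtags=True,   # exercises Segmenter.segment(), i.e. patches 1/5 and 5/5
        segmenter="english",
        remove_tags=True,       # the option re-added by PATCH 4/5
    )

    print(processor.pre_process_doc("I said #IMUSTGO and then #Imustgo"))
    # expected: both hashtags expand to "i must go", with no tag markup left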