diff --git a/ekphrasis/classes/preprocessor.py b/ekphrasis/classes/preprocessor.py index e79f171..fef6ab8 100644 --- a/ekphrasis/classes/preprocessor.py +++ b/ekphrasis/classes/preprocessor.py @@ -134,8 +134,8 @@ def handle_hashtag_match(self, m): text = m.group()[1:] # todo:simplify routine + expanded = self.segmenter.segment(text) if text.islower(): - expanded = self.segmenter.segment(text) expanded = " ".join(expanded.split("-")) expanded = " ".join(expanded.split("_")) # print(m.group(), " - ", expanded) @@ -145,7 +145,6 @@ def handle_hashtag_match(self, m): else: # split words following CamelCase convention - expanded = self.regexes["camel_split"].sub(r' \1', text) expanded = expanded.replace("-", "") expanded = expanded.replace("_", "") # print(m.group(), " - ", expanded) diff --git a/ekphrasis/classes/segmenter.py b/ekphrasis/classes/segmenter.py index 821de70..4bf579a 100644 --- a/ekphrasis/classes/segmenter.py +++ b/ekphrasis/classes/segmenter.py @@ -131,6 +131,8 @@ def find_segment(self, text, prev=''): # if you don't have enough RAM lower the maxsize @lru_cache(maxsize=65536) def segment(self, word): + if word.isupper() or (word[0].isupper() and word[1:].islower()): + word = word.lower() if word.islower(): return " ".join(self.find_segment(word)[1]) else: diff --git a/ekphrasis/tools/generate_stats.py b/ekphrasis/tools/generate_stats.py index ba110f7..c521aa3 100644 --- a/ekphrasis/tools/generate_stats.py +++ b/ekphrasis/tools/generate_stats.py @@ -12,7 +12,7 @@ import numpy from tqdm import tqdm -REGEX_TOKEN = re.compile(r'(?