-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix for issue #1009 #1018
Fix for issue #1009 #1018
Changes from 2 commits
4d61547
b621f10
0af3fd0
56deea3
8b17640
8d6774b
2257343
8fb22df
edc7ea8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -258,7 +258,7 @@ class WikiCorpus(TextCorpus): | |
>>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word | ||
|
||
""" | ||
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): | ||
def __init__(self, fname, processes=None, lemmatize=True, dictionary=None, filter_namespaces=('0',)): | ||
""" | ||
Initialize the corpus. Unless a dictionary is provided, this scans the | ||
corpus once, to determine its vocabulary. | ||
|
@@ -274,7 +274,16 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction | |
if processes is None: | ||
processes = max(1, multiprocessing.cpu_count() - 1) | ||
self.processes = processes | ||
self.lemmatize = lemmatize | ||
# if the user has set lemmatize flag as True, check if installed | ||
if lemmatize == True: | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reason for this code to be duplicated here and in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, that's correct. So the lemmatize in the constructor should be set to default value True then, and pattern related checks will be handled in lemmatize method, is that right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed the duplicate pattern library check code. |
||
from pattern.en import parse | ||
self.lemmatize = True | ||
except ImportError: | ||
self.lemmatize = False | ||
warnings.warn("Pattern library is not installed, falling back to regexp based lemmatization.") | ||
else: | ||
self.lemmatize = False | ||
if dictionary is None: | ||
self.dictionary = Dictionary(self.get_texts()) | ||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,7 +80,6 @@ def smart_open(fname, mode='rb'): | |
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) | ||
|
||
|
||
|
||
def synchronous(tlockname): | ||
""" | ||
A decorator to place an instance-based lock around a method. | ||
|
@@ -1003,19 +1002,6 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf={}): | |
daemon.requestLoop() | ||
|
||
|
||
def has_pattern(): | ||
""" | ||
Function to check if there is installed pattern library | ||
""" | ||
pattern = False | ||
try: | ||
from pattern.en import parse | ||
pattern = True | ||
except ImportError: | ||
warnings.warn("Pattern library is not installed, lemmatization won't be available.") | ||
return pattern | ||
|
||
|
||
def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, | ||
stopwords=frozenset(), min_length=2, max_length=15): | ||
""" | ||
|
@@ -1037,10 +1023,10 @@ def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, | |
['rank/NN', 'study/VB', 'hard/RB'] | ||
|
||
""" | ||
if not has_pattern(): | ||
raise ImportError("Pattern library is not installed. Pattern library is needed in order \ | ||
to use lemmatize function") | ||
from pattern.en import parse | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please only check import in one place There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. Please review. |
||
from pattern.en import parse | ||
except ImportError: | ||
warnings.warn("Pattern library is not installed, lemmatization won't be available.") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does it cause an unhandled exception later on if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, it will create an unhandled expection later in the method where parse is used, since only a warning is created. I'll change that. |
||
|
||
if light: | ||
import warnings | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is enough in python