diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index dd3f703899..ce2d2332a3 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -69,6 +69,7 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres each saved document, * the `docbyoffset(offset)` method, which returns a document positioned at `offset` bytes within the persistent storage (file). + * `metadata`: if set to True, `serialize` will write out the article titles to a pickle file. Example: diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 1a53b282e9..8ded391769 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -266,6 +266,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction If `pattern` package is installed, use fancier shallow parsing to get token lemmas. Otherwise, use simple regexp tokenization. You can override this automatic logic by forcing the `lemmatize` parameter explicitly. + If `self.metadata` is set to True, `serialize` will write out the article titles to a pickle file. """ self.fname = fname