From 53fcc8c6904ea149443ba2075daa83f4d624e8f7 Mon Sep 17 00:00:00 2001 From: Lukas Elmer Date: Thu, 11 Jun 2015 18:59:08 +0200 Subject: [PATCH 1/2] Update doc comment to match implementation Document pageid in extract_pages. It was introduced here: https://github.com/piskvorky/gensim/commit/6783b813408acc4e04ebe0603192c0d76508b048#diff-eece52d95c280dabe57c803c95d6bb96 --- gensim/corpora/wikicorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 0adc07265c..801bac2d22 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -184,7 +184,7 @@ def extract_pages(f, filter_namespaces=False): """ Extract pages from MediaWiki database dump. - Return an iterable over (str, str) which generates (title, content) pairs. + Return an iterable over (str, str, str) which generates (title, content, pageid) pairs. """ elems = (elem for _, elem in iterparse(f, events=("end",))) From cbe49bd4cf2e019103eb540a949605e57ad0546a Mon Sep 17 00:00:00 2001 From: Lukas Elmer Date: Fri, 12 Jun 2015 01:37:23 +0200 Subject: [PATCH 2/2] Not pairs, triplets --- gensim/corpora/wikicorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 801bac2d22..ab322b81f8 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -184,7 +184,7 @@ def extract_pages(f, filter_namespaces=False): """ Extract pages from MediaWiki database dump. - Return an iterable over (str, str, str) which generates (title, content, pageid) pairs. + Return an iterable over (str, str, str) which generates (title, content, pageid) triplets. """ elems = (elem for _, elem in iterparse(f, events=("end",)))