Fix flake8 warnings W605
horpto committed Nov 6, 2018
1 parent 7e4965e commit da0db8c
Showing 14 changed files with 55 additions and 55 deletions.
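For context: flake8's W605 flags invalid escape sequences such as \d or \s inside ordinary (non-raw) string literals, and recent Python versions also warn about them at compile time. The fix applied throughout this commit is to mark the affected regex patterns and docstrings as raw strings. A minimal sketch of the warning and the fix (the pattern below is illustrative, not taken from the commit):

    import re

    # Before: \d in a plain string literal is an invalid escape sequence,
    # so flake8 reports W605 (Python keeps the backslash, but warns).
    date_old = re.compile('\d{4}-\d{2}-\d{2}')

    # After: the r prefix makes the backslashes literal and silences W605
    # without changing the compiled pattern.
    date_new = re.compile(r'\d{4}-\d{2}-\d{2}')

    assert date_old.pattern == date_new.pattern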
6 changes: 3 additions & 3 deletions gensim/corpora/wikicorpus.py
@@ -81,7 +81,7 @@
"""Capture interlinks text and article linked"""
RE_P17 = re.compile(
r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
'(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
r'(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
re.UNICODE
)
"""Table markup"""
@@ -143,8 +143,8 @@ def filter_example(elem, text, *args, **kwargs):
# regex is in the function call so that we do not pollute the wikicorpus
# namespace do not do this in production as this function is called for
# every element in the wiki dump
_regex_de_excellent = re.compile('.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
_regex_de_featured = re.compile('.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)
_regex_de_excellent = re.compile(r'.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
_regex_de_featured = re.compile(r'.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)

if text is None:
return False
6 changes: 3 additions & 3 deletions gensim/models/atmodel.py
@@ -376,14 +376,14 @@ def extend_corpus(self, corpus):
self.corpus.extend(corpus)

def compute_phinorm(self, expElogthetad, expElogbetad):
"""Efficiently computes the normalizing factor in phi.
r"""Efficiently computes the normalizing factor in phi.
Parameters
----------
expElogthetad: numpy.ndarray
Value of variational distribution :math:`q(\theta|\gamma)`.
expElogbetad: numpy.ndarray
Value of variational distribution :math:`q(\\beta|\lambda)`.
Value of variational distribution :math:`q(\beta|\lambda)`.
Returns
-------
@@ -888,7 +888,7 @@ def rho():
del other

def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
"""Estimate the variational bound of documents from `corpus`.
r"""Estimate the variational bound of documents from `corpus`.
:math:`\mathbb{E_{q}}[\log p(corpus)] - \mathbb{E_{q}}[\log q(corpus)]`
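For the docstring hunks such as the two above, the r prefix matters beyond silencing the linter: sequences like \t and \b inside the :math: markup are valid escapes (tab and backspace) in a plain string, so they would silently corrupt the rendered docstring unless every backslash is doubled. Making the docstring raw lets the single-backslash LaTeX form be written directly. A short check of the equivalence (plain Python, no gensim needed):

    plain_broken = "q(\theta)"     # contains a literal tab character
    plain_escaped = "q(\\theta)"   # old style: backslash doubled by hand
    raw_style = r"q(\theta)"       # new style: raw string, single backslash

    assert plain_escaped == raw_style
    assert plain_broken != raw_style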
4 changes: 2 additions & 2 deletions gensim/models/base_any2vec.py
@@ -5,7 +5,7 @@
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains base classes required for implementing \*2vec algorithms.
r"""This module contains base classes required for implementing \*2vec algorithms.
The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings.
In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector
@@ -56,7 +56,7 @@


class BaseAny2VecModel(utils.SaveLoad):
"""Base class for training, using and evaluating \*2vec model.
r"""Base class for training, using and evaluating \*2vec model.
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
34 changes: 17 additions & 17 deletions gensim/models/hdpmodel.py
@@ -72,7 +72,7 @@


def expect_log_sticks(sticks):
"""For stick-breaking hdp, get the :math:`\mathbb{E}[log(sticks)]`.
r"""For stick-breaking hdp, get the :math:`\mathbb{E}[log(sticks)]`.
Parameters
----------
@@ -97,7 +97,7 @@ def expect_log_sticks(sticks):


def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
"""Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.
r"""Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.
Parameters
----------
@@ -115,7 +115,7 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
Returns
-------
(numpy.ndarray, numpy.ndarray)
Computed (:math:`likelihood`, :math:`\\gamma`).
Computed (:math:`likelihood`, :math:`\gamma`).
"""
gamma = np.ones(len(alpha))
@@ -172,7 +172,7 @@ def set_zero(self):


class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
"""`Hierarchical Dirichlet Process model <http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf>`_
r"""`Hierarchical Dirichlet Process model <http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf>`_
Topic models promise to help summarize and organize large archives of texts that cannot be easily analyzed by hand.
Hierarchical Dirichlet process (HDP) is a powerful mixed-membership model for the unsupervised analysis of grouped
@@ -194,7 +194,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
For this assume that there is a restaurant franchise (`corpus`) which has a large number of restaurants
(`documents`, `j`) under it. They have a global menu of dishes (`topics`, :math:`\Phi_{k}`) which they serve.
Also, a single dish (`topic`, :math:`\Phi_{k}`) is only served at a single table `t` for all the customers
(`words`, :math:`\\theta_{j,i}`) who sit at that table.
(`words`, :math:`\theta_{j,i}`) who sit at that table.
So, when a customer enters the restaurant he/she has the choice to make where he/she wants to sit.
He/she can choose to sit at a table where some customers are already sitting , or he/she can choose to sit
at a new table. Here the probability of choosing each option is not same.
@@ -213,31 +213,31 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
share the same set of atoms, :math:`\Phi_{k}`, and only the atom weights :math:`\pi _{jt}` differs.
There will be multiple document-level atoms :math:`\psi_{jt}` which map to the same corpus-level atom
:math:`\Phi_{k}`. Here, the :math:`\\beta` signify the weights given to each of the topics globally. Also, each
factor :math:`\\theta_{j,i}` is distributed according to :math:`G_{j}`, i.e., it takes on the value of
:math:`\Phi_{k}`. Here, the :math:`\beta` signify the weights given to each of the topics globally. Also, each
factor :math:`\theta_{j,i}` is distributed according to :math:`G_{j}`, i.e., it takes on the value of
:math:`\Phi_{k}` with probability :math:`\pi _{jt}`. :math:`C_{j,t}` is an indicator variable whose value `k`
signifies the index of :math:`\Phi`. This helps to map :math:`\psi_{jt}` to :math:`\Phi_{k}`.
The top level (`corpus` level) stick proportions correspond the values of :math:`\\beta`,
The top level (`corpus` level) stick proportions correspond the values of :math:`\beta`,
bottom level (`document` level) stick proportions correspond to the values of :math:`\pi`.
The truncation level for the corpus (`K`) and document (`T`) corresponds to the number of :math:`\\beta`
The truncation level for the corpus (`K`) and document (`T`) corresponds to the number of :math:`\beta`
and :math:`\pi` which are in existence.
Now, whenever coordinate ascent updates are to be performed, they happen at two level. The document level as well
as corpus level.
At document level, we update the following:
#. The parameters to the document level sticks, i.e, a and b parameters of :math:`\\beta` distribution of the
#. The parameters to the document level sticks, i.e, a and b parameters of :math:`\beta` distribution of the
variable :math:`\pi _{jt}`.
#. The parameters to per word topic indicators, :math:`Z_{j,n}`. Here :math:`Z_{j,n}` selects topic parameter
:math:`\psi_{jt}`.
#. The parameters to per document topic indices :math:`\Phi_{jtk}`.
At corpus level, we update the following:
#. The parameters to the top level sticks, i.e., the parameters of the :math:`\\beta` distribution for the
corpus level :math:`\\beta`, which signify the topic distribution at corpus level.
#. The parameters to the top level sticks, i.e., the parameters of the :math:`\beta` distribution for the
corpus level :math:`\beta`, which signify the topic distribution at corpus level.
#. The parameters to the topics :math:`\Phi_{k}`.
Now coming on to the steps involved, procedure for online variational inference for the Hdp model is as follows:
@@ -261,14 +261,14 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
Attributes
----------
lda_alpha : numpy.ndarray
Same as :math:`\\alpha` from :class:`gensim.models.ldamodel.LdaModel`.
Same as :math:`\alpha` from :class:`gensim.models.ldamodel.LdaModel`.
lda_beta : numpy.ndarray
Same as :math:`\\beta` from from :class:`gensim.models.ldamodel.LdaModel`.
Same as :math:`\beta` from from :class:`gensim.models.ldamodel.LdaModel`.
m_D : int
Number of documents in the corpus.
m_Elogbeta : numpy.ndarray:
Stores value of dirichlet expectation, i.e., compute :math:`E[log \\theta]` for a vector
:math:`\\theta \sim Dir(\\alpha)`.
Stores value of dirichlet expectation, i.e., compute :math:`E[log \theta]` for a vector
:math:`\theta \sim Dir(\alpha)`.
m_lambda : {numpy.ndarray, float}
Drawn samples from the parameterized gamma distribution.
m_lambda_sum : {numpy.ndarray, float}
Expand All @@ -280,7 +280,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
m_rhot : float
Assigns weight to the information obtained from the mini-chunk and its value it between 0 and 1.
m_status_up_to_date : bool
Flag to indicate whether `lambda `and :math:`E[log \\theta]` have been updated if True, otherwise - not.
Flag to indicate whether `lambda `and :math:`E[log \theta]` have been updated if True, otherwise - not.
m_timestamp : numpy.ndarray
Helps to keep track and perform lazy updates on lambda.
m_updatect : int
2 changes: 1 addition & 1 deletion gensim/models/ldaseqmodel.py
@@ -741,7 +741,7 @@ def update_zeta(self):
return self.zeta

def compute_post_variance(self, word, chain_variance):
"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1)
r"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1)
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
This function accepts the word to compute variance for, along with the associated sslm class object,
6 changes: 3 additions & 3 deletions gensim/models/logentropy_model.py
@@ -25,7 +25,7 @@


class LogEntropyModel(interfaces.TransformationABC):
"""Objects of this class realize the transformation between word-document co-occurrence matrix (int)
r"""Objects of this class realize the transformation between word-document co-occurrence matrix (int)
into a locally/globally weighted matrix (positive floats).
This is done by a log entropy normalization, optionally normalizing the resulting documents to unit length.
@@ -35,9 +35,9 @@ class LogEntropyModel(interfaces.TransformationABC):
local\_weight_{i,j} = log(frequency_{i,j} + 1)
P_{i,j} = \\frac{frequency_{i,j}}{\sum_j frequency_{i,j}}
P_{i,j} = \frac{frequency_{i,j}}{\sum_j frequency_{i,j}}
global\_weight_i = 1 + \\frac{\sum_j P_{i,j} * log(P_{i,j})}{log(number\_of\_documents + 1)}
global\_weight_i = 1 + \frac{\sum_j P_{i,j} * log(P_{i,j})}{log(number\_of\_documents + 1)}
final\_weight_{i,j} = local\_weight_{i,j} * global\_weight_i
6 changes: 3 additions & 3 deletions gensim/models/normmodel.py
@@ -15,15 +15,15 @@ class NormModel(interfaces.TransformationABC):
"""Objects of this class realize the explicit normalization of vectors (l1 and l2)."""

def __init__(self, corpus=None, norm='l2'):
"""Compute the l1 or l2 normalization by normalizing separately for each document in a corpus.
r"""Compute the l1 or l2 normalization by normalizing separately for each document in a corpus.
If :math:`v_{i,j}` is the 'i'th component of the vector representing document 'j', the l1 normalization is
.. math:: l1_{i, j} = \\frac{v_{i,j}}{\sum_k |v_{k,j}|}
.. math:: l1_{i, j} = \frac{v_{i,j}}{\sum_k |v_{k,j}|}
the l2 normalization is
.. math:: l2_{i, j} = \\frac{v_{i,j}}{\sqrt{\sum_k v_{k,j}^2}}
.. math:: l2_{i, j} = \frac{v_{i,j}}{\sqrt{\sum_k v_{k,j}^2}}
Parameters
10 changes: 5 additions & 5 deletions gensim/models/phrases.py
@@ -658,7 +658,7 @@ def __getitem__(self, sentence):


def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
"""Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations
r"""Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations
of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.
Parameters
@@ -678,14 +678,14 @@ def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
Notes
-----
Formula: :math:`\\frac{(bigram\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`.
Formula: :math:`\frac{(bigram\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`.
"""
return (bigram_count - min_count) / worda_count / wordb_count * len_vocab


def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
"""Calculation NPMI score based on `"Normalized (Pointwise) Mutual Information in Colocation Extraction"
r"""Calculation NPMI score based on `"Normalized (Pointwise) Mutual Information in Colocation Extraction"
by Gerlof Bouma <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.
Parameters
@@ -705,8 +705,8 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
Notes
-----
Formula: :math:`\\frac{ln(prop(word_a, word_b) / (prop(word_a)*prop(word_b)))}{ -ln(prop(word_a, word_b)}`,
where :math:`prob(word) = \\frac{word\_count}{corpus\_word\_count}`
Formula: :math:`\frac{ln(prop(word_a, word_b) / (prop(word_a)*prop(word_b)))}{ -ln(prop(word_a, word_b)}`,
where :math:`prob(word) = \frac{word\_count}{corpus\_word\_count}`
"""
if bigram_count >= min_count:
8 changes: 4 additions & 4 deletions gensim/models/tfidfmodel.py
@@ -83,8 +83,8 @@ def resolve_weights(smartirs):


def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
"""Compute inverse-document-frequency for a term with the given document frequency `docfreq`:
:math:`idf = add + log_{log\_base} \\frac{totaldocs}{docfreq}`
r"""Compute inverse-document-frequency for a term with the given document frequency `docfreq`:
:math:`idf = add + log_{log\_base} \frac{totaldocs}{docfreq}`
Parameters
----------
@@ -239,11 +239,11 @@ class TfidfModel(interfaces.TransformationABC):
"""
def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.65):
"""Compute TF-IDF by multiplying a local component (term frequency) with a global component
r"""Compute TF-IDF by multiplying a local component (term frequency) with a global component
(inverse document frequency), and normalizing the resulting documents to unit length.
Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents
.. math:: weight_{i,j} = frequency_{i,j} * log_2 \\frac{D}{document\_freq_{i}}
.. math:: weight_{i,j} = frequency_{i,j} * log_2 \frac{D}{document\_freq_{i}}
or, more generally
4 changes: 2 additions & 2 deletions gensim/models/wrappers/ldamallet.py
@@ -5,7 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""Python wrapper for `Latent Dirichlet Allocation (LDA) <https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_
r"""Python wrapper for `Latent Dirichlet Allocation (LDA) <https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_
from `MALLET, the Java topic modelling toolkit <http://mallet.cs.umass.edu/>`_
This module allows both LDA model estimation from a training corpus and inference of topic distribution on new,
@@ -250,7 +250,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True):
cmd = \
self.mallet_path + \
" import-file --preserve-case --keep-sequence " \
"--remove-stopwords --token-regex \"\S+\" --input %s --output %s"
"--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
if infer:
cmd += ' --use-pipe-from ' + self.fcorpusmallet()
cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
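The ldamallet hunk above uses the other common W605 fix, doubling the backslash ("\\S+") rather than adding a raw-string prefix, presumably because the literal already relies on \" escapes for the embedded quotes. Both spellings produce the same runtime string; a quick sketch (the variable names are illustrative only):

    escaped = " --token-regex \"\\S+\" "   # doubled backslash, escaped quotes
    raw = r' --token-regex "\S+" '          # raw single-quoted alternative

    assert escaped == raw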
2 changes: 1 addition & 1 deletion gensim/test/test_corpora.py
@@ -786,7 +786,7 @@ def test_two_level_directory(self):

def test_filename_filtering(self):
dirpath = self.write_one_level('test1.log', 'test1.txt', 'test2.log', 'other1.log')
corpus = textcorpus.TextDirectoryCorpus(dirpath, pattern="test.*\.log")
corpus = textcorpus.TextDirectoryCorpus(dirpath, pattern=r"test.*\.log")
filenames = list(corpus.iter_filepaths())
expected = [os.path.join(dirpath, name) for name in ('test1.log', 'test2.log')]
self.assertEqual(sorted(expected), sorted(filenames))
4 changes: 2 additions & 2 deletions gensim/test/test_matutils.py
@@ -60,7 +60,7 @@ def mean_absolute_difference(a, b):


def dirichlet_expectation(alpha):
"""For a vector :math:`\\theta \sim Dir(\\alpha)`, compute :math:`E[log \\theta]`.
r"""For a vector :math:`\theta \sim Dir(\alpha)`, compute :math:`E[log \theta]`.
Parameters
----------
@@ -70,7 +70,7 @@ def dirichlet_expectation(alpha):
Returns
-------
numpy.ndarray:
:math:`E[log \\theta]`
:math:`E[log \theta]`
"""
if len(alpha.shape) == 1:
10 changes: 5 additions & 5 deletions gensim/topic_coherence/direct_confirmation_measure.py
@@ -17,8 +17,8 @@


def log_conditional_probability(segmented_topics, accumulator, with_std=False, with_support=False):
"""Calculate the log-conditional-probability measure which is used by coherence measures such as `U_mass`.
This is defined as :math:`m_{lc}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W^{*})}`.
r"""Calculate the log-conditional-probability measure which is used by coherence measures such as `U_mass`.
This is defined as :math:`m_{lc}(S_i) = log \frac{P(W', W^{*}) + \epsilon}{P(W^{*})}`.
Parameters
----------
@@ -124,7 +124,7 @@ def aggregate_segment_sims(segment_sims, with_std, with_support):


def log_ratio_measure(segmented_topics, accumulator, normalize=False, with_std=False, with_support=False):
"""Compute log ratio measure for `segment_topics`.
r"""Compute log ratio measure for `segment_topics`.
Parameters
----------
@@ -146,12 +146,12 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False, with_std=False, with_support=False):
-----
If `normalize=False`:
Calculate the log-ratio-measure, popularly known as **PMI** which is used by coherence measures such as `c_v`.
This is defined as :math:`m_{lr}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W') * P(W^{*})}`
This is defined as :math:`m_{lr}(S_i) = log \frac{P(W', W^{*}) + \epsilon}{P(W') * P(W^{*})}`
If `normalize=True`:
Calculate the normalized-log-ratio-measure, popularly knowns as **NPMI**
which is used by coherence measures such as `c_v`.
This is defined as :math:`m_{nlr}(S_i) = \\frac{m_{lr}(S_i)}{-log(P(W', W^{*}) + \epsilon)}`
This is defined as :math:`m_{nlr}(S_i) = \frac{m_{lr}(S_i)}{-log(P(W', W^{*}) + \epsilon)}`
Returns
-------
8 changes: 4 additions & 4 deletions gensim/topic_coherence/segmentation.py
@@ -12,7 +12,7 @@


def s_one_pre(topics):
"""Performs segmentation on a list of topics.
r"""Performs segmentation on a list of topics.
Notes
-----
@@ -54,9 +54,9 @@ def s_one_pre(topics):


def s_one_one(topics):
"""Perform segmentation on a list of topics.
r"""Perform segmentation on a list of topics.
Segmentation is defined as
:math:`s_{one} = {(W', W^{*}) | W' = {w_i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i \\neq j}`.
:math:`s_{one} = {(W', W^{*}) | W' = {w_i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i \neq j}`.
Parameters
----------
@@ -96,7 +96,7 @@ def s_one_one(topics):


def s_one_set(topics):
"""Perform s_one_set segmentation on a list of topics.
r"""Perform s_one_set segmentation on a list of topics.
Segmentation is defined as
:math:`s_{set} = {(W', W^{*}) | W' = {w_i}; w_{i} \in W; W^{*} = W}`
