Skip to content

Commit

Permalink
Fix pagerank algorithm. Fix #805 (#1653)
Browse files Browse the repository at this point in the history
* added a regression test for summarization.keywords()
 * handled case with graph smaller than 3 nodes
 * removed TODO about complex eigenvectors
 * added more comments
  • Loading branch information
xelez authored and menshikh-iv committed Oct 26, 2017
1 parent b912203 commit 00192a8
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 4 deletions.
25 changes: 21 additions & 4 deletions gensim/summarization/pagerank_weighted.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
import numpy
from numpy import empty as empty_matrix
from scipy.linalg import eig
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigs
from six.moves import xrange
Expand All @@ -21,9 +23,10 @@ def pagerank_weighted(graph, damping=0.85):

pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix

vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors?
vec = principal_eigenvector(pagerank_matrix.T)

return process_results(graph, vecs.real)
# Because pagerank_matrix is positive, vec is always real (i.e. not complex)
return process_results(graph, vec.real)


def build_adjacency_matrix(graph):
Expand Down Expand Up @@ -56,9 +59,23 @@ def build_probability_matrix(graph):
return matrix


def process_results(graph, vecs):
def principal_eigenvector(a):
    """Return the eigenvector of `a` whose eigenvalue has the largest magnitude.

    Parameters
    ----------
    a : square 2-D array or matrix
        Matrix to decompose; ``len(a)`` is taken as its dimension.

    Returns
    -------
    numpy.ndarray
        The selected eigenvector, one entry per row of `a`. Its dtype may be
        complex and its overall sign/phase is arbitrary, so callers that
        need scores should take the magnitude (or real part) of the entries.

    """
    # `eigs` (ARPACK) is preferred even for dense input because only a single
    # eigenvector is needed; see #441, #438 for discussion.
    #
    # `eigs` cannot be used when the matrix dimension is below 3, so that
    # special case goes through the dense solver and the dominant
    # eigenvalue is picked by hand.
    if len(a) < 3:
        vals, vecs = eig(a)
        return vecs[:, numpy.abs(vals).argmax()]

    # With k=1 the only returned column is the dominant eigenvector.
    _, vecs = eigs(a, k=1)
    return vecs[:, 0]


def process_results(graph, vec):
    """Map every node of `graph` to the magnitude of its entry in `vec`.

    Parameters
    ----------
    graph : object exposing ``nodes()``
        Nodes are paired with entries of `vec` in the order ``nodes()``
        yields them.
    vec : indexable sequence of numbers
        One value per node; ``vec[i]`` belongs to the i-th node.

    Returns
    -------
    dict
        ``{node: abs(vec[i])}`` for each node.

    """
    # abs() discards the arbitrary sign of the eigenvector entries.
    return {node: abs(vec[i]) for i, node in enumerate(graph.nodes())}
6 changes: 6 additions & 0 deletions gensim/test/test_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ def test_keywords_ratio(self):

self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)

def test_text_keywords_with_small_graph(self):
    # Regression test: this text produces a 2x2 graph, and keyword
    # extraction must still return at least one keyword for it.
    sample = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious'
    extracted = keywords(sample, words=1, split=True)
    found = len(extracted)
    self.assertTrue(found)


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
Expand Down

0 comments on commit 00192a8

Please sign in to comment.