Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix docstrings forgensim.models.hdpmodel, gensim.models.lda_worker & gensim.models.lda_dispatcher(#1667) #1912

Merged
merged 17 commits into from
Apr 2, 2018
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 63 additions & 6 deletions gensim/models/lda_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
# Copyright (C) 2011 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Worker ("slave") process used in computing distributed LDA. Run this script \
on every node in your cluster. If you wish, you may even run it multiple times \
on a single machine, to make better use of multiple cores (just beware that \
memory footprint increases accordingly).
"""Worker ("slave") process used in computing distributed LDA.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First of all, please fix PEP8 problems (almost lead spaces), look at travis log https://travis-ci.org/RaRe-Technologies/gensim/jobs/342495787#L511

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. Also, should I add a section for module level attributes such as HUGE_TIMEOUT ,MAX_JOBS_QUEUE,etc in lda_dispatcher.py ?


Run this script on every node in your cluster. If you wish, you may even
run it multiple times on a single machine, to make better use of multiple
cores (just beware that memory footprint increases accordingly).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please look at #1892, this is really good way how to document distributed stuff (instruction of running, showing arguments of script in automatic way, etc)

Example: python -m gensim.models.lda_worker

"""


Expand Down Expand Up @@ -40,11 +40,35 @@


class Worker(object):
"""Used as a Pyro class with exposed methods.

Exposes every non-private method and property of the class automatically
to be available for remote access.

Attributes
----------
model : :obj: of :class:`~gensim.models.ldamodel.LdaModel`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to write :obj: (here and everywhere)


"""

def __init__(self):
"""Partly initializes the model."""
self.model = None

@Pyro4.expose
def initialize(self, myid, dispatcher, **model_params):
"""Fully initializes the worker.

Parameters
----------
myid : int
An ID number used to identify this worker in the dispatcher object.
dispatcher : :class:`~gensim.models.lda_dispatcher.Dispatcher`
The dispatcher responsible for scheduling this worker.
**model_params
Keyword parameters to initialize the inner LDA model, see :class:`~gensim.models.ldamodel.LdaModel`.

"""
self.lock_update = threading.Lock()
self.jobsdone = 0 # how many jobs has this worker completed?
# id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
Expand All @@ -59,6 +83,12 @@ def initialize(self, myid, dispatcher, **model_params):
def requestjob(self):
"""
Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called.

Raises
------
RuntimeError
Worker has to be initialised before receiving jobs.

"""
if self.model is None:
raise RuntimeError("worker must be initialized before receiving jobs")
Expand All @@ -79,6 +109,14 @@ def requestjob(self):

@utils.synchronous('lock_update')
def processjob(self, job):
"""Incrementally processes the job and potentially logs progress.

Parameters
----------
job : {iterable of list of (int, float), scipy.sparse.csc}
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).

"""
logger.debug("starting to process job #%i", self.jobsdone)
self.model.do_estep(job)
self.jobsdone += 1
Expand All @@ -89,11 +127,20 @@ def processjob(self, job):

@Pyro4.expose
def ping(self):
"""Test the connectivity with Worker."""
return True

@Pyro4.expose
@utils.synchronous('lock_update')
def getstate(self):
"""Log and get the LDA model's current state.

Returns
-------
result : :obj: of `~gensim.models.ldamodel.LdaState`
The current state.

"""
logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone)
result = self.model.state
assert isinstance(result, ldamodel.LdaState)
Expand All @@ -104,6 +151,14 @@ def getstate(self):
@Pyro4.expose
@utils.synchronous('lock_update')
def reset(self, state):
"""Reset the worker by setting sufficient stats to 0.

Parameters
----------
state : :obj: of :class:`~gensim.models.ldamodel.LdaState`
Encapsulates information for distributed computation of LdaModel objects.

"""
assert state is not None
logger.info("resetting worker #%i", self.myid)
self.model.state = state
Expand All @@ -113,11 +168,13 @@ def reset(self, state):

@Pyro4.oneway
def exit(self):
"""Terminate the worker."""
logger.info("terminating worker #%i", self.myid)
os._exit(0)


def main():
"""Set up argument parser,logger and launches pyro daemon."""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None)
parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int)
Expand Down Expand Up @@ -146,4 +203,4 @@ def main():


if __name__ == '__main__':
main()
main()