Skip to content

Commit

Permalink
Merge pull request #143 from bab2min/dev_12_2
Browse files Browse the repository at this point in the history
Dev 0.12.2
  • Loading branch information
bab2min authored Sep 2, 2021
2 parents 926f6ff + 5141849 commit 67259ba
Show file tree
Hide file tree
Showing 16 changed files with 102 additions and 14 deletions.
1 change: 1 addition & 0 deletions src/TopicModel/CTModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(this->K);
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
if (normalize)
Expand Down
1 change: 1 addition & 0 deletions src/TopicModel/DMRModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(this->K);
auto alphaDoc = getCachedAlpha(doc);
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
Expand Down
8 changes: 6 additions & 2 deletions src/TopicModel/HDPModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(this->K);
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
if (normalize)
Expand Down Expand Up @@ -538,8 +539,11 @@ namespace tomoto
auto d = lda->_makeFromRawDoc(doc);
lda->_addDoc(d);
}

lda->prepare(true, this->minWordCf, this->minWordDf, this->removeTopN);

lda->realV = this->realV;
lda->realN = this->realN;
lda->weightedN = this->weightedN;
lda->prepare(true, 0, 0, 0, false);

auto selectFirst = [&](const std::pair<size_t, size_t>& p) { return std::max(p.first / sum - topicThreshold, 0.f); };
std::discrete_distribution<size_t> randomTopic{
Expand Down
1 change: 1 addition & 0 deletions src/TopicModel/HPAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(1 + this->K + K2);
Float sum = doc.getSumWordWeight() + this->alphas.sum();
if (!normalize) sum = 1;
Expand Down
4 changes: 2 additions & 2 deletions src/TopicModel/LDACVB0Model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,9 @@ namespace tomoto
}
}

void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) override
void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
{
if (initDocs) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
if (initDocs) this->removeStopwords(minWordCnt, minWordDf, removeTopN, updateStopwords);
static_cast<DerivedClass*>(this)->updateWeakArray();
static_cast<DerivedClass*>(this)->initGlobalState(initDocs);

Expand Down
7 changes: 4 additions & 3 deletions src/TopicModel/LDAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1057,9 +1057,9 @@ namespace tomoto
}
}

void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) override
void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
{
if (initDocs) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
static_cast<DerivedClass*>(this)->updateWeakArray();
static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
static_cast<DerivedClass*>(this)->prepareWordPriors();
Expand Down Expand Up @@ -1116,7 +1116,7 @@ namespace tomoto
for (auto& doc : this->docs) doc.updateSumWordWeight(this->realV);
}
static_cast<DerivedClass*>(this)->prepareShared();
BaseClass::prepare(initDocs, minWordCnt, minWordDf, removeTopN);
BaseClass::prepare(initDocs, minWordCnt, minWordDf, removeTopN, updateStopwords);
}

std::vector<uint64_t> getCountByTopic() const override
Expand All @@ -1126,6 +1126,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(K);
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), K };
if (normalize)
Expand Down
1 change: 1 addition & 0 deletions src/TopicModel/LLDAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(this->K);
auto maskedAlphas = this->alphas.array() * doc.labelMask.template cast<Float>().array();
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
Expand Down
1 change: 1 addition & 0 deletions src/TopicModel/MGLDAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(this->K + KL);
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K + KL };
if (normalize)
Expand Down
1 change: 1 addition & 0 deletions src/TopicModel/PLDAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(this->K);
auto maskedAlphas = this->alphas.array() * doc.labelMask.template cast<Float>().array();
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
Expand Down
4 changes: 3 additions & 1 deletion src/TopicModel/PT.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ namespace tomoto

// Arguments for the Pseudo-document Topic model (PT).
// p == 0 means "unset"; the Python binding replaces 0 with k * 10
// before the model is created, so 0 acts as an auto-default sentinel.
struct PTArgs : public LDAArgs
{
	size_t p = 0;        // number of pseudo documents (0 = auto: k * 10)
	Float lambda = 0.01; // smoothing for the pseudo-doc/topic assignment
};

Expand All @@ -30,5 +30,7 @@ namespace tomoto
bool scalarRng = false);

virtual size_t getP() const = 0;
virtual std::vector<Float> getTopicsFromPseudoDoc(const DocumentBase* doc, bool normalize = true) const = 0;
virtual std::vector<std::pair<Tid, Float>> getTopicsFromPseudoDocSorted(const DocumentBase* doc, size_t topN) const = 0;
};
}
20 changes: 20 additions & 0 deletions src/TopicModel/PTModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ namespace tomoto

std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
{
if (doc.Zs.empty()) return {};
std::vector<Float> ret(this->K);
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
m = this->alphas.array();
Expand All @@ -280,6 +281,25 @@ namespace tomoto
return ret;
}

std::vector<Float> getTopicsFromPseudoDoc(const DocumentBase* _doc, bool normalize) const override
{
auto& doc = *static_cast<const _DocType*>(_doc);
if (!doc.numByTopic.size()) return {};
std::vector<Float> ret(this->K);
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
m = doc.numByTopic.array().template cast<Float>() + this->alphas.array();
if (normalize)
{
m /= m.sum();
}
return ret;
}

// Convenience wrapper: normalized pseudo-doc topic distribution,
// reduced to the `topN` highest-weight (topic id, weight) pairs.
std::vector<std::pair<Tid, Float>> getTopicsFromPseudoDocSorted(const DocumentBase* doc, size_t topN) const override
{
	auto dist = getTopicsFromPseudoDoc(doc, true);
	return extractTopN<Tid>(dist, topN);
}

void updateDocs()
{
for (auto& doc : this->docs)
Expand Down
4 changes: 2 additions & 2 deletions src/TopicModel/TopicModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ namespace tomoto

virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
virtual size_t getGlobalStep() const = 0;
virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) = 0;
virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) = 0;

virtual size_t getK() const = 0;
virtual std::vector<Float> getWidsByTopic(size_t tid, bool normalize = true) const = 0;
Expand Down Expand Up @@ -605,7 +605,7 @@ namespace tomoto
return empty;
}

void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) override
void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
{
auto p = countRealN();
realN = p.first;
Expand Down
17 changes: 17 additions & 0 deletions src/python/py_PT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ static int PT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs)
[=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; }
);

if (margs.p == 0) margs.p = margs.k * 10;

tomoto::ITopicModel* inst = tomoto::IPTModel::create((tomoto::TermWeight)tw, margs);
if (!inst) throw py::ValueError{ "unknown `tw` value" };
self->inst = inst;
Expand Down Expand Up @@ -99,3 +101,18 @@ TopicModelTypeObject PT_type = { {
PyType_GenericAlloc,
PyType_GenericNew,
}};


// Returns the top-N topics of this document computed from its assigned
// pseudo document. Only meaningful for PTModel, hence the checked downcast.
PyObject* Document_getTopicsFromPseudoDoc(DocumentObject* self, size_t topN)
{
	tomoto::IPTModel* mdl = dynamic_cast<tomoto::IPTModel*>(self->corpus->tm->inst);
	if (!mdl) throw py::ValueError{ "`from_pseudo_doc` is valid for only `tomotopy.PTModel`." };
	// Must dispatch through the PT interface: calling the generic
	// getTopicsByDocSorted would ignore the pseudo document entirely.
	return py::buildPyValue(mdl->getTopicsFromPseudoDocSorted(self->getBoundDoc(), topN));
}

// Returns the full topic distribution of this document computed from its
// assigned pseudo document. Only meaningful for PTModel.
PyObject* Document_getTopicDistFromPseudoDoc(DocumentObject* self, bool normalize)
{
	tomoto::IPTModel* mdl = dynamic_cast<tomoto::IPTModel*>(self->corpus->tm->inst);
	if (!mdl) throw py::ValueError{ "`from_pseudo_doc` is valid for only `tomotopy.PTModel`." };
	// Must dispatch through the PT interface: the generic getTopicsByDoc
	// would return the regular distribution, not the pseudo-doc one.
	return py::buildPyValue(mdl->getTopicsFromPseudoDoc(self->getBoundDoc(), normalize));
}
14 changes: 11 additions & 3 deletions src/python/py_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1094,27 +1094,35 @@ PyObject* DocumentObject::repr(DocumentObject* self)
// Python binding: Document.get_topics(top_n=10, from_pseudo_doc=False).
// When the model is a PTModel and from_pseudo_doc is true, topics are read
// from the document's pseudo document instead of the document itself.
static PyObject* Document_getTopics(DocumentObject* self, PyObject* args, PyObject* kwargs)
{
	size_t topN = 10;
	// `p` format unit stores an int; using size_t here would only be
	// partially written on 64-bit platforms.
	int fromPseudoDoc = 0;
	static const char* kwlist[] = { "top_n", "from_pseudo_doc", nullptr };
	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|np", (char**)kwlist, &topN, &fromPseudoDoc)) return nullptr;
	return py::handleExc([&]()
	{
		if (self->corpus->isIndependent()) throw py::RuntimeError{ "This method can only be called by documents bound to the topic model." };
		if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" };
		if (!self->corpus->tm->isPrepared) throw py::RuntimeError{ "train() should be called first for calculating the topic distribution" };
#ifdef TM_PT
		if (fromPseudoDoc) return Document_getTopicsFromPseudoDoc(self, topN);
#endif
		return py::buildPyValue(self->corpus->tm->inst->getTopicsByDocSorted(self->getBoundDoc(), topN));
	});
}

// Python binding: Document.get_topic_dist(normalize=True, from_pseudo_doc=False).
static PyObject* Document_getTopicDist(DocumentObject* self, PyObject* args, PyObject* kwargs)
{
	// `p` format unit stores an int, so both flags are declared as int.
	int normalize = 1;
	int fromPseudoDoc = 0;
	static const char* kwlist[] = { "normalize", "from_pseudo_doc", nullptr };
	// kwlist names two optional args, so the format string needs two `p`
	// units and both destinations; "|p" with only &normalize would leave
	// from_pseudo_doc unparsed and make the keyword unusable.
	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|pp", (char**)kwlist, &normalize, &fromPseudoDoc)) return nullptr;
	return py::handleExc([&]()
	{
		if (self->corpus->isIndependent()) throw py::RuntimeError{ "This method can only be called by documents bound to the topic model." };
		if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" };
		if (!self->corpus->tm->isPrepared) throw py::RuntimeError{ "train() should be called first for calculating the topic distribution" };
#ifdef TM_PT
		if (fromPseudoDoc) return Document_getTopicDistFromPseudoDoc(self, !!normalize);
#endif
		return py::buildPyValue(self->corpus->tm->inst->getTopicsByDoc(self->getBoundDoc(), !!normalize));
	});
}
Expand Down
4 changes: 4 additions & 0 deletions src/python/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,10 @@ PyObject* Document_getSubTopicDist(DocumentObject* self, PyObject* args, PyObjec

PyObject* Document_getCountVector(DocumentObject* self);

PyObject* Document_getTopicsFromPseudoDoc(DocumentObject* self, size_t topN);
PyObject* Document_getTopicDistFromPseudoDoc(DocumentObject* self, bool normalize);


template<typename _Target, typename _Order>
PyObject* buildPyValueReorder(const _Target& target, const _Order& order)
{
Expand Down
28 changes: 27 additions & 1 deletion test/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,32 @@ def train_multi_corpus(cls, inputFile, mdFields, f, kargs, ps):
print('Corpus2')
for d in tcorpus2[:10]: print(d.get_ll())

def uninit_doc(cls, inputFile, mdFields, f, kargs, ps):
    '''Regression test: documents created via `make_doc` but never passed to
    `train` must still answer `get_topics`/`get_topic_dist` without crashing.

    cls       -- tomotopy model class under test
    inputFile -- whitespace-separated corpus, one document per line
    mdFields  -- number of leading metadata fields per line
    f         -- converter applied to the metadata fields
    kargs     -- extra keyword arguments for the model constructor
    ps        -- parallel scheme passed to train()
    '''
    print('Test uninitialized doc')
    tw = 0
    print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw]))
    mdl = cls(tw=tw, min_df=2, rm_top=2, **kargs)
    print('Adding docs...')
    unseen_docs = []
    # `with` ensures the corpus file is closed (the original leaked the handle).
    with open(inputFile, encoding='utf-8') as fin:
        for n, line in enumerate(fin):
            ch = line.strip().split()
            if len(ch) < mdFields + 1: continue
            # Hold out the first 20 raw lines as unseen documents.
            if n < 20: unseen_docs.append(line)
            else:
                if mdFields:
                    mdl.add_doc(ch[mdFields:], f(ch[:mdFields]))
                else:
                    mdl.add_doc(ch)
    mdl.train(20, parallel=ps)
    for n, line in enumerate(unseen_docs):
        ch = line.strip().split()
        if mdFields:
            unseen_docs[n] = mdl.make_doc(ch[mdFields:], f(ch[:mdFields]))
        else:
            unseen_docs[n] = mdl.make_doc(ch)
        # Must not raise even though the doc was never seen by train().
        unseen_docs[n].get_topics()
        unseen_docs[n].get_topic_dist()

def test_empty_uid():
cps = tp.utils.Corpus()
cps.add_doc("test text".split())
Expand Down Expand Up @@ -489,7 +515,7 @@ def test_corpus_save_load():
for ps in pss:
for func in [null_doc, train1, train4, train0,
save_and_load, infer, infer_together,
copy_train,
copy_train, uninit_doc,
]:
locals()['test_{}_{}_{}'.format(model_case[0].__name__, func.__name__, ps.name)] = (lambda f, mc, ps: lambda: f(*(mc + (ps,))))(func, model_case[:-1], ps)

Expand Down

0 comments on commit 67259ba

Please sign in to comment.