From 1beb9dd6b838ffd1ef8df128d1399536abd64814 Mon Sep 17 00:00:00 2001
From: Minchul Lee
Date: Mon, 2 Mar 2020 00:23:23 +0900
Subject: [PATCH 1/2] bug fixing including #30

---
 .gitignore                      |   3 +-
 README.kr.rst                   |   5 +
 README.rst                      |   5 +
 setup.py                        |   2 +-
 src/TopicModel/CTModel.hpp      |  12 +-
 src/TopicModel/DMRModel.hpp     |   6 +-
 src/TopicModel/GDMRModel.hpp    |   6 +-
 src/TopicModel/HDPModel.hpp     |  17 ++-
 src/TopicModel/HLDAModel.hpp    |  16 ++-
 src/TopicModel/HPAModel.cpp     |   2 +-
 src/TopicModel/HPAModel.hpp     |  28 ++--
 src/TopicModel/LDA.h            |   5 +-
 src/TopicModel/LDACVB0Model.hpp |  20 +--
 src/TopicModel/LDAModel.hpp     | 239 ++++++++++++++++++++++++--------
 src/TopicModel/LLDAModel.hpp    |   1 +
 src/TopicModel/MGLDAModel.hpp   |  10 +-
 src/TopicModel/PAModel.hpp      |  78 ++++++++---
 src/TopicModel/PLDAModel.hpp    |   6 +-
 src/TopicModel/SLDAModel.hpp    |   6 +-
 src/TopicModel/TopicModel.hpp   |   7 +-
 src/Utils/exception.h           |   6 +
 src/Utils/math.h                |   2 +-
 src/Utils/sample.hpp            |  28 ++--
 src/Utils/serializer.hpp        |  86 ++++++++++++
 src/python/py_LLDA.cpp          |   2 +-
 test/unit_test.py               |  29 +++-
 tomotopy/__init__.py            |   2 +
 tomotopy/documentation.kr.rst   |   5 +
 tomotopy/documentation.rst      |   5 +
 29 files changed, 493 insertions(+), 146 deletions(-)

diff --git a/.gitignore b/.gitignore
index ddf337a..d82a285 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,5 @@ build_windows.bat
 *.bin
 enwiki-stemmed-1000.txt
-/venv/
\ No newline at end of file
+/venv/
+.vscode/
\ No newline at end of file

diff --git a/README.kr.rst b/README.kr.rst
index edda860..04e632d 100644
--- a/README.kr.rst
+++ b/README.kr.rst
@@ -210,6 +210,11 @@ Python3 example code for tomotopy is available at https://github.com/bab2min/tomotopy/blob/ma
 History
 -------
+* 0.5.2 (2020-03-01)
+    * Fixed a segmentation fault when running `tomotopy.LLDAModel.add_doc`.
+    * Fixed an issue where running `infer` on `tomotopy.HDPModel` would sometimes terminate the program.
+    * Fixed an error that occurred when running `tomotopy.LDAModel.infer` with ps=tomotopy.ParallelScheme.PARTITION and together=True.
+
 * 0.5.1 (2020-01-11)
     * Fixed an issue where `tomotopy.SLDAModel.make_doc` did not support missing values.
     * `tomotopy.SLDAModel` now supports missing values. Documents with missing values take part in topic modeling, but are excluded from the regression of response variables.

diff --git a/README.rst b/README.rst
index 69ceb3e..01c5262 100644
--- a/README.rst
+++ b/README.rst
@@ -215,6 +215,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 History
 -------
+* 0.5.2 (2020-03-01)
+    * Fixed a segmentation fault in `tomotopy.LLDAModel.add_doc`.
+    * Fixed a bug where `infer` on `tomotopy.HDPModel` would sometimes crash the program.
+    * Fixed a crash in `tomotopy.LDAModel.infer` when run with ps=tomotopy.ParallelScheme.PARTITION and together=True.
+
 * 0.5.1 (2020-01-11)
     * Fixed a bug where `tomotopy.SLDAModel.make_doc` did not support missing values for `y`.
     * `tomotopy.SLDAModel` now fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in topic modeling, but excluded from the regression of response variables.
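Beyond the fixes listed above, most of this patch threads two new compile-time flags through the Gibbs samplers: `_asymEta`, which lets `getZLikelihoods` read a per-topic, per-word prior from `etaByTopicWord` instead of the single symmetric `eta`, and `_infer`, which distinguishes inference from training (for example, HDP no longer proposes new tables while inferring). The following is a minimal standalone sketch of the eta side of that refactor only; it is not the library's actual `EtaHelper` template, and every name in it is illustrative.

#include <cstddef>
#include <cstdio>
#include <vector>

// Symmetric prior: every word shares one eta, so the normalizer is V * eta.
struct SymEta
{
    float eta;
    std::size_t V;
    float getEta(std::size_t) const { return eta; }
    float getEtaSum() const { return eta * V; }
};

// Asymmetric prior: one eta per word (for a fixed topic), sum precomputed.
struct AsymEta
{
    std::vector<float> etaByWord;
    float etaSum;
    float getEta(std::size_t vid) const { return etaByWord[vid]; }
    float getEtaSum() const { return etaSum; }
};

// p(z = k | w) is proportional to (n_dk + alpha_k) * (n_kw + eta_w) / (n_k + sum_w eta_w).
// The patch rewrites `(... + this->eta) / (... + V * this->eta)` into helper
// calls of exactly this shape so both kinds of prior share one sampling path.
template<class _EtaHelper>
float zLikelihood(float ndk, float alphaK, float nkw, float nk,
                  std::size_t vid, const _EtaHelper& h)
{
    return (ndk + alphaK) * (nkw + h.getEta(vid)) / (nk + h.getEtaSum());
}

int main()
{
    SymEta sym{ 0.1f, 3 };
    AsymEta asym{ { 0.01f, 0.5f, 0.01f }, 0.52f };
    std::printf("%f %f\n",
        zLikelihood(2.f, 0.1f, 5.f, 40.f, 1, sym),
        zLikelihood(2.f, 0.1f, 5.f, 40.f, 1, asym));
}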
diff --git a/setup.py b/setup.py index 1eb75f2..e919fde 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ setup( name='tomotopy', - version='0.5.1', + version='0.5.2', description='Tomoto, The Topic Modeling Tool for Python', long_description=long_description, diff --git a/src/TopicModel/CTModel.hpp b/src/TopicModel/CTModel.hpp index fa4e69b..7daa55d 100644 --- a/src/TopicModel/CTModel.hpp +++ b/src/TopicModel/CTModel.hpp @@ -38,14 +38,16 @@ namespace tomoto size_t numDocBetaSample = -1; math::MultiNormalDistribution topicPrior; + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); + auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; zLikelihood = doc.smBeta.array() - * (ld.numByTopicWord.col(vid).array().template cast() + this->eta) - / (ld.numByTopic.array().template cast() + V * this->eta); + * (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) + / (ld.numByTopic.array().template cast() + etaHelper.getEtaSum()); sample::prefixSum(zLikelihood.data(), this->K); return &zLikelihood[0]; } @@ -106,10 +108,10 @@ namespace tomoto doc.smBeta /= doc.smBeta.array().sum(); } - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { - BaseClass::template sampleDocument<_ps>(doc, docId, ld, rgs, iterationCnt, partitionId); + BaseClass::template sampleDocument<_ps, _infer>(doc, edd, docId, ld, rgs, iterationCnt, partitionId); /*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0) { updateBeta(doc, rgs); diff --git a/src/TopicModel/DMRModel.hpp b/src/TopicModel/DMRModel.hpp index 1b79ce4..6bfc38b 100644 --- a/src/TopicModel/DMRModel.hpp +++ b/src/TopicModel/DMRModel.hpp @@ -153,14 +153,16 @@ namespace tomoto return 0; } + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); + auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; zLikelihood = (doc.numByTopic.array().template cast() + this->expLambda.col(doc.metadata).array()) - * (ld.numByTopicWord.col(vid).array().template cast() + this->eta) - / (ld.numByTopic.array().template cast() + V * this->eta); + * (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) + / (ld.numByTopic.array().template cast() + etaHelper.getEtaSum()); sample::prefixSum(zLikelihood.data(), this->K); return &zLikelihood[0]; diff --git a/src/TopicModel/GDMRModel.hpp b/src/TopicModel/GDMRModel.hpp index 99f526b..4295308 100644 --- a/src/TopicModel/GDMRModel.hpp +++ b/src/TopicModel/GDMRModel.hpp @@ -177,15 +177,17 @@ namespace tomoto } } + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); + auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; getTermsFromMd(ld, &doc.metadataC[0], ld.terms); zLikelihood = (doc.numByTopic.array().template cast() + (this->lambda * ld.terms).array().exp() + this->alphaEps) - * (ld.numByTopicWord.col(vid).array().template cast() + this->eta) - / (ld.numByTopic.array().template 
cast() + V * this->eta); + * (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) + / (ld.numByTopic.array().template cast() + etaHelper.getEtaSum()); sample::prefixSum(zLikelihood.data(), this->K); return &zLikelihood[0]; diff --git a/src/TopicModel/HDPModel.hpp b/src/TopicModel/HDPModel.hpp index 9d07415..d6876cb 100644 --- a/src/TopicModel/HDPModel.hpp +++ b/src/TopicModel/HDPModel.hpp @@ -190,8 +190,8 @@ namespace tomoto } } - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { for (size_t w = 0; w < doc.words.size(); ++w) { @@ -200,7 +200,7 @@ namespace tomoto calcWordTopicProb(ld, doc.words[w]); auto topicDist = getTopicLikelihoods(ld); auto dist = getTableLikelihoods(ld, doc, doc.words[w]); - doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + 1, rgs); + doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + (_infer ? 0 : 1), rgs); if (doc.Zs[w] == doc.numTopicByTable.size()) // create new table { size_t K = ld.numByTopic.size(); @@ -281,8 +281,8 @@ namespace tomoto for (auto& r : res) r.get(); } - template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const + template + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const { std::vector> res; const size_t V = this->realV; @@ -457,6 +457,13 @@ namespace tomoto { return this->globalState.numTableByTopic[tid]; } + + std::vector getTopicsByDoc(const _DocType& doc) const + { + std::vector ret(this->K); + Eigen::Map> { ret.data(), this->K }.array() = doc.numByTopic.array().template cast() / doc.getSumWordWeight(); + return ret; + } }; template diff --git a/src/TopicModel/HLDAModel.hpp b/src/TopicModel/HLDAModel.hpp index 1536759..34212f7 100644 --- a/src/TopicModel/HLDAModel.hpp +++ b/src/TopicModel/HLDAModel.hpp @@ -422,6 +422,7 @@ namespace tomoto addWordToOnlyLocal(ld, doc, pid, vid, level); } + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; @@ -443,14 +444,23 @@ namespace tomoto { if (doc.words[w] >= this->realV) continue; addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w]); - auto dist = static_cast(this)->getZLikelihoods(ld, doc, docId, doc.words[w]); + FLOAT* dist; + if (this->etaByTopicWord.size()) + { + THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features"); + } + else + { + dist = static_cast(this)->template + getZLikelihoods(ld, doc, docId, doc.words[w]); + } doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + this->K, rgs); addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w]); } } - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { sampleTopics(doc, docId, ld, rgs); } diff --git a/src/TopicModel/HPAModel.cpp b/src/TopicModel/HPAModel.cpp index 45a1e6a..a83beba 100644 --- a/src/TopicModel/HPAModel.cpp +++ b/src/TopicModel/HPAModel.cpp @@ -18,4 +18,4 @@ namespace 
tomoto } return nullptr; } -} \ No newline at end of file +} diff --git a/src/TopicModel/HPAModel.hpp b/src/TopicModel/HPAModel.hpp index b57cff2..5325ac1 100644 --- a/src/TopicModel/HPAModel.hpp +++ b/src/TopicModel/HPAModel.hpp @@ -88,6 +88,7 @@ namespace tomoto return std::make_pair(ceil(k * (float)K2 / this->K), ceil((k + 1) * (float)K2 / this->K)); } + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; @@ -173,24 +174,32 @@ namespace tomoto } } - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) { - b = this->chunkOffsetByDoc(partitionId, docId); - e = this->chunkOffsetByDoc(partitionId + 1, docId); + b = edd.chunkOffsetByDoc(partitionId, docId); + e = edd.chunkOffsetByDoc(partitionId + 1, docId); } - size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0; + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? edd.vChunkOffset[partitionId - 1] : 0; const auto K = this->K; for (size_t w = b; w < e; ++w) { if (doc.words[w] >= this->realV) continue; addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]); - auto dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + FLOAT* dist; + if (this->etaByTopicWord.size()) + { + THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features"); + } + else + { + dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + } if (_Exclusive) { auto z = sample::sampleFromDiscreteAcc(dist, dist + K2 + K + 1, rgs); @@ -233,12 +242,13 @@ namespace tomoto } } - void distributePartition(ThreadPool& pool, _ModelState* localData) + template + void distributePartition(ThreadPool& pool, const _ModelState& globalState, _ModelState* localData, const _ExtraDocData& edd) const { } - template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const + template + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const { std::vector> res; diff --git a/src/TopicModel/LDA.h b/src/TopicModel/LDA.h index 3652103..b9bc349 100644 --- a/src/TopicModel/LDA.h +++ b/src/TopicModel/LDA.h @@ -114,7 +114,10 @@ namespace tomoto virtual std::vector getCountByTopic() const = 0; virtual size_t getK() const = 0; virtual FLOAT getAlpha() const = 0; - virtual FLOAT getAlpha(TID k1) const = 0; + virtual FLOAT getAlpha(TID k) const = 0; virtual FLOAT getEta() const = 0; + + virtual std::vector getWordPrior(const std::string& word) const = 0; + virtual void setWordPrior(const std::string& word, const std::vector& priors) = 0; }; } diff --git a/src/TopicModel/LDACVB0Model.hpp b/src/TopicModel/LDACVB0Model.hpp index a75f4a6..1374292 100644 --- a/src/TopicModel/LDACVB0Model.hpp +++ b/src/TopicModel/LDACVB0Model.hpp @@ -63,6 +63,9 @@ namespace tomoto virtual size_t getK() const = 0; virtual FLOAT getAlpha() const = 0; virtual FLOAT getEta() const = 0; + + virtual std::vector getWordPrior(const std::string& word) const { return {}; } + virtual void setWordPrior(const std::string& word, const std::vector& priors) {} }; 
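The `setWordPrior`/`getWordPrior` virtuals added above define the new word-prior API. Later in this patch, `LDAModel` implements them by validating the prior vector and parking it in an `etaByWord` map, which `prepareWordPriors` then bakes into the (K, V) `etaByTopicWord` matrix. Below is a condensed sketch of that contract with the model and dictionary stripped away; `WordPriorStore` is a stand-in type invented for illustration.

#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// A prior must supply exactly one non-negative weight per topic,
// mirroring the checks in LDAModel::setWordPrior from this patch.
struct WordPriorStore
{
    std::size_t K;
    std::unordered_map<std::string, std::vector<float>> etaByWord;

    void setWordPrior(const std::string& word, const std::vector<float>& priors)
    {
        if (priors.size() != K)
            throw std::invalid_argument{ "priors.size() must be equal to K." };
        for (float p : priors)
            if (p < 0) throw std::invalid_argument{ "priors must not be less than 0." };
        etaByWord.emplace(word, priors);
    }

    // Returns an empty vector when no prior was registered for `word`.
    std::vector<float> getWordPrior(const std::string& word) const
    {
        auto it = etaByWord.find(word);
        return it == etaByWord.end() ? std::vector<float>{} : it->second;
    }
};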
template static FLOAT calcDigammaSum(_List list, size_t len, FLOAT alpha) { - FLOAT ret = 0; + auto listExpr = Eigen::Matrix::NullaryExpr(len, list); auto dAlpha = math::digammaT(alpha); - for (size_t i = 0; i < len; ++i) - { - ret += math::digammaT(list(i) + alpha) - dAlpha; - } - return ret; + return (math::digammaApprox(listExpr.array() + alpha) - dAlpha).sum(); } void optimizeParameters(ThreadPool& pool, _ModelState* localData) @@ -138,8 +137,8 @@ namespace tomoto if (DEC) ld.numByTopicWord.col(vid) = ld.numByTopicWord.col(vid).cwiseMax(0); } - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { for (size_t w = 0; w < doc.words.size(); ++w) { @@ -150,7 +149,8 @@ namespace tomoto } } - void updatePartition(ThreadPool& pool, _ModelState* localData) + template + void updatePartition(ThreadPool& pool, _ModelState* localData, _DocIter first, _DocIter last, _ExtraDocData& edd) { } @@ -166,7 +166,7 @@ namespace tomoto forRandom((this->docs.size() - 1 - ch) / chStride + 1, rgs[threadId](), [&, this](size_t id) { static_cast(this)->template sampleDocument( - this->docs[id * chStride + ch], id * chStride + ch, + this->docs[id * chStride + ch], 0, id * chStride + ch, localData[threadId], rgs[threadId], this->iterated); }); })); diff --git a/src/TopicModel/LDAModel.hpp b/src/TopicModel/LDAModel.hpp index 9519aa4..b20f55d 100644 --- a/src/TopicModel/LDAModel.hpp +++ b/src/TopicModel/LDAModel.hpp @@ -56,6 +56,45 @@ namespace tomoto }; } + + template + class EtaHelper + { + const _Model* _this; + public: + EtaHelper(const _Model* p) : _this(p) {} + + FLOAT getEta(size_t vid) const + { + return _this->eta; + } + + FLOAT getEtaSum() const + { + return _this->eta * _this->realV; + } + }; + + template + class EtaHelper<_Model, true> + { + const _Model* _this; + public: + EtaHelper(const _Model* p) : _this(p) {} + + auto getEta(size_t vid) const + -> decltype(_this->etaByTopicWord.col(vid).array()) + { + return _this->etaByTopicWord.col(vid).array(); + } + + auto getEtaSum() const + -> decltype(_this->etaSumByTopic.array()) + { + return _this->etaSumByTopic.array(); + } + }; + template::value, LDAModel, _Derived>::type; using BaseClass = TopicModel<_Flags, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; + friend EtaHelper; + friend EtaHelper; static constexpr const char* TWID = _TW == TermWeight::one ? "one" : (_TW == TermWeight::idf ? 
"idf" : "pmi"); static constexpr const char* TMID = "LDA"; @@ -82,22 +123,27 @@ namespace tomoto TID K; FLOAT alpha, eta; Eigen::Matrix alphas; + std::unordered_map> etaByWord; + Eigen::Matrix etaByTopicWord; // (K, V) + Eigen::Matrix etaSumByTopic; // (K, ) size_t optimInterval = 10, burnIn = 0; Eigen::Matrix numByTopicDoc; + + struct ExtraDocData + { + std::vector vChunkOffset; + Eigen::Matrix chunkOffsetByDoc; + }; + + ExtraDocData eddTrain; - std::vector vChunkOffset; - Eigen::Matrix chunkOffsetByDoc; template static FLOAT calcDigammaSum(_List list, size_t len, FLOAT alpha) { - FLOAT ret = 0; + auto listExpr = Eigen::Matrix::NullaryExpr(len, list); auto dAlpha = math::digammaT(alpha); - for (size_t i = 0; i < len; ++i) - { - ret += math::digammaT(list(i) + alpha) - dAlpha; - } - return ret; + return (math::digammaApprox(listExpr.array() + alpha) - dAlpha).sum(); } /* @@ -117,15 +163,22 @@ namespace tomoto } } + template + EtaHelper getEtaHelper() const + { + return EtaHelper{ static_cast(this) }; + } + + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); + auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; zLikelihood = (doc.numByTopic.array().template cast() + alphas.array()) - * (ld.numByTopicWord.col(vid).array().template cast() + eta) - / (ld.numByTopic.array().template cast() + V * eta); - + * (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) + / (ld.numByTopic.array().template cast() + etaHelper.getEtaSum()); sample::prefixSum(zLikelihood.data(), K); return &zLikelihood[0]; } @@ -147,31 +200,41 @@ namespace tomoto /* main sampling procedure */ - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) { - b = chunkOffsetByDoc(partitionId, docId); - e = chunkOffsetByDoc(partitionId + 1, docId); + b = edd.chunkOffsetByDoc(partitionId, docId); + e = edd.chunkOffsetByDoc(partitionId + 1, docId); } - size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0; + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? 
edd.vChunkOffset[partitionId - 1] : 0; for (size_t w = b; w < e; ++w) { if (doc.words[w] >= this->realV) continue; addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w]); - auto dist = static_cast(this)->getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + FLOAT* dist; + if (etaByTopicWord.size()) + { + dist = static_cast(this)->template + getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + } + else + { + dist = static_cast(this)->template + getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + } doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + K, rgs); addWordTo<1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w]); } } - template + template void performSampling(ThreadPool& pool, _ModelState* localData, RandGen* rgs, std::vector>& res, - _DocIter docFirst, _DocIter docLast) const + _DocIter docFirst, _DocIter docLast, const _ExtraDocData& edd) const { // single-threaded sampling if (_ps == ParallelScheme::none) @@ -179,8 +242,8 @@ namespace tomoto size_t docId = 0; for (auto doc = docFirst; doc != docLast; ++doc) { - static_cast(this)->template sampleDocument<_ps>( - *doc, docId++, + static_cast(this)->template sampleDocument<_ps, _infer>( + *doc, edd, docId++, *localData, *rgs, this->iterated, 0); } } @@ -195,8 +258,8 @@ namespace tomoto size_t didx = (i + partitionId) % chStride; forRandom(((size_t)std::distance(docFirst, docLast) + (chStride - 1) - didx) / chStride, rgs[partitionId](), [&](size_t id) { - static_cast(this)->template sampleDocument<_ps>( - docFirst[id * chStride + didx], id * chStride + didx, + static_cast(this)->template sampleDocument<_ps, _infer>( + docFirst[id * chStride + didx], edd, id * chStride + didx, localData[partitionId], rgs[partitionId], this->iterated, partitionId); }); }); @@ -214,8 +277,8 @@ namespace tomoto { forRandom(((size_t)std::distance(docFirst, docLast) + (chStride - 1) - ch) / chStride, rgs[threadId](), [&](size_t id) { - static_cast(this)->template sampleDocument<_ps>( - docFirst[id * chStride + ch], id * chStride + ch, + static_cast(this)->template sampleDocument<_ps, _infer>( + docFirst[id * chStride + ch], edd, id * chStride + ch, localData[threadId], rgs[threadId], this->iterated, 0); }); })); @@ -225,52 +288,55 @@ namespace tomoto } } - void updatePartition(ThreadPool& pool, _ModelState* localData) + template + void updatePartition(ThreadPool& pool, const _ModelState& globalState, _ModelState* localData, _DocIter first, _DocIter last, _ExtraDocData& edd) const { size_t numPools = pool.getNumWorkers(); - if (vChunkOffset.size() != numPools) + if (edd.vChunkOffset.size() != numPools) { - vChunkOffset.clear(); + edd.vChunkOffset.clear(); size_t totCnt = std::accumulate(this->vocabFrequencies.begin(), this->vocabFrequencies.begin() + this->realV, 0); size_t cumCnt = 0; for (size_t i = 0; i < this->realV; ++i) { cumCnt += this->vocabFrequencies[i]; - if (cumCnt * numPools >= totCnt * (vChunkOffset.size() + 1)) vChunkOffset.emplace_back(i + 1); + if (cumCnt * numPools >= totCnt * (edd.vChunkOffset.size() + 1)) edd.vChunkOffset.emplace_back(i + 1); } - chunkOffsetByDoc.resize(numPools + 1, this->docs.size()); - for (size_t i = 0; i < this->docs.size(); ++i) + edd.chunkOffsetByDoc.resize(numPools + 1, std::distance(first, last)); + size_t i = 0; + for (; first != last; ++first, ++i) { - auto& doc = this->docs[i]; - chunkOffsetByDoc(0, i) = 0; + auto& doc = *first; + edd.chunkOffsetByDoc(0, i) = 0; size_t g = 0; for (size_t j = 0; j < doc.words.size(); ++j) { - for (; g < numPools && doc.words[j] >= vChunkOffset[g]; ++g) + 
for (; g < numPools && doc.words[j] >= edd.vChunkOffset[g]; ++g) { - chunkOffsetByDoc(g + 1, i) = j; + edd.chunkOffsetByDoc(g + 1, i) = j; } } for (; g < numPools; ++g) { - chunkOffsetByDoc(g + 1, i) = doc.words.size(); + edd.chunkOffsetByDoc(g + 1, i) = doc.words.size(); } } } - static_cast(this)->distributePartition(pool, localData); + static_cast(this)->distributePartition(pool, globalState, localData, edd); } - void distributePartition(ThreadPool& pool, _ModelState* localData) + template + void distributePartition(ThreadPool& pool, const _ModelState& globalState, _ModelState* localData, const _ExtraDocData& edd) const { std::vector> res = pool.enqueueToAll([&](size_t partitionId) { - size_t b = partitionId ? vChunkOffset[partitionId - 1] : 0, - e = vChunkOffset[partitionId]; + size_t b = partitionId ? edd.vChunkOffset[partitionId - 1] : 0, + e = edd.vChunkOffset[partitionId]; - localData[partitionId].numByTopicWord = this->globalState.numByTopicWord.block(0, b, this->globalState.numByTopicWord.rows(), e - b); - localData[partitionId].numByTopic = this->globalState.numByTopic; - if (!localData[partitionId].zLikelihood.size()) localData[partitionId].zLikelihood = this->globalState.zLikelihood; + localData[partitionId].numByTopicWord = globalState.numByTopicWord.block(0, b, globalState.numByTopicWord.rows(), e - b); + localData[partitionId].numByTopic = globalState.numByTopic; + if (!localData[partitionId].zLikelihood.size()) localData[partitionId].zLikelihood = globalState.zLikelihood; }); for (auto& r : res) r.get(); @@ -296,10 +362,10 @@ namespace tomoto std::vector> res; try { - performSampling<_ps>(pool, localData, rgs, res, - this->docs.begin(), this->docs.end()); + performSampling<_ps, false>(pool, localData, rgs, res, + this->docs.begin(), this->docs.end(), eddTrain); static_cast(this)->updateGlobalInfo(pool, localData); - static_cast(this)->template mergeState<_ps>(pool, this->globalState, this->tState, localData, rgs); + static_cast(this)->template mergeState<_ps>(pool, this->globalState, this->tState, localData, rgs, eddTrain); static_cast(this)->template sampleGlobalLevel<>(&pool, localData, rgs, this->docs.begin(), this->docs.end()); if (this->iterated >= this->burnIn && optimInterval && (this->iterated + 1) % optimInterval == 0) { @@ -324,8 +390,8 @@ namespace tomoto /* merges multithreaded document sampling result */ - template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const + template + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const { std::vector> res; @@ -357,8 +423,8 @@ namespace tomoto { res = pool.enqueueToAll([&](size_t partitionId) { - size_t b = partitionId ? vChunkOffset[partitionId - 1] : 0, - e = vChunkOffset[partitionId]; + size_t b = partitionId ? 
edd.vChunkOffset[partitionId - 1] : 0, + e = edd.vChunkOffset[partitionId]; globalState.numByTopicWord.block(0, b, globalState.numByTopicWord.rows(), e - b) = localData[partitionId].numByTopicWord; }); for (auto& r : res) r.get(); @@ -425,6 +491,7 @@ namespace tomoto { if (!ld.numByTopicWord(k, v)) continue; ll += math::lgammaT(ld.numByTopicWord(k, v) + eta) - lgammaEta; + assert(isfinite(ll)); } } return ll; @@ -459,6 +526,21 @@ namespace tomoto if(_TW != TermWeight::one) doc.wordWeights.resize(wordSize, 1); } + void prepareWordPriors() + { + if (etaByWord.empty()) return; + etaByTopicWord.resize(K, this->realV); + etaSumByTopic.resize(K); + etaByTopicWord.array() = eta; + for (auto& it : etaByWord) + { + auto id = this->dict.toWid(it.first); + if (id == (VID)-1 || id >= this->realV) continue; + etaByTopicWord.col(id) = Eigen::Map>{ it.second.data(), (Eigen::Index)it.second.size() }; + } + etaSumByTopic = etaByTopicWord.rowwise().sum(); + } + void initGlobalState(bool initDocs) { const size_t V = this->realV; @@ -486,7 +568,15 @@ namespace tomoto { auto& z = doc.Zs[i]; auto w = doc.words[i]; - z = g.theta(rgs); + if (etaByTopicWord.size()) + { + auto col = etaByTopicWord.col(w); + z = sample::sampleFromDiscrete(col.data(), col.data() + col.size(), rgs); + } + else + { + z = g.theta(rgs); + } addWordTo<1>(ld, doc, i, w, z); } @@ -580,13 +670,19 @@ namespace tomoto std::vector rgs; for (size_t i = 0; i < pool.getNumWorkers(); ++i) rgs.emplace_back(rgc()); + ExtraDocData edd; + if (_ps == ParallelScheme::partition) + { + updatePartition(pool, tmpState, localData.data(), docFirst, docLast, edd); + } + for (size_t i = 0; i < maxIter; ++i) { std::vector> res; - performSampling<_ps>(pool, + performSampling<_ps, true>(pool, (m_flags & flags::shared_state) ? &tmpState : localData.data(), rgs.data(), res, - docFirst, docLast); - static_cast(this)->template mergeState<_ps>(pool, tmpState, tState, localData.data(), rgs.data()); + docFirst, docLast, edd); + static_cast(this)->template mergeState<_ps>(pool, tmpState, tState, localData.data(), rgs.data(), edd); static_cast(this)->template sampleGlobalLevel<>( &pool, (m_flags & flags::shared_state) ? 
&tmpState : localData.data(), rgs.data(), docFirst, docLast); } @@ -597,6 +693,7 @@ namespace tomoto else if (m_flags & flags::shared_state) { ThreadPool pool{ numWorkers }; + ExtraDocData edd; std::vector ret; const double gllRest = static_cast(this)->getLLRest(this->globalState); for (auto d = docFirst; d != docLast; ++d) @@ -606,7 +703,7 @@ namespace tomoto initializeDocState(*d, nullptr, generator, tmpState, rgc); for (size_t i = 0; i < maxIter; ++i) { - static_cast(this)->template sampleDocument(*d, -1, tmpState, rgc, i); + static_cast(this)->template sampleDocument(*d, edd, -1, tmpState, rgc, i); static_cast(this)->template sampleGlobalLevel<>( &pool, &tmpState, &rgc, &*d, &*d + 1); } @@ -619,6 +716,7 @@ namespace tomoto else { ThreadPool pool{ numWorkers, numWorkers * 8 }; + ExtraDocData edd; std::vector> res; const double gllRest = static_cast(this)->getLLRest(this->globalState); for (auto d = docFirst; d != docLast; ++d) @@ -630,7 +728,7 @@ namespace tomoto initializeDocState(*d, nullptr, generator, tmpState, rgc); for (size_t i = 0; i < maxIter; ++i) { - static_cast(this)->template sampleDocument(*d, -1, tmpState, rgc, i); + static_cast(this)->template sampleDocument(*d, edd, -1, tmpState, rgc, i); static_cast(this)->template sampleGlobalLevel<>( nullptr, &tmpState, &rgc, &*d, &*d + 1); } @@ -690,6 +788,34 @@ namespace tomoto return make_unique<_DocType>(this->_makeDocWithinVocab(words)); } + void setWordPrior(const std::string& word, const std::vector& priors) override + { + if (priors.size() != K) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "priors.size() must be equal to K."); + for (auto p : priors) + { + if (p < 0) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "priors must not be less than 0."); + } + this->dict.add(word); + etaByWord.emplace(word, priors); + } + + std::vector getWordPrior(const std::string& word) const override + { + if (etaByTopicWord.size()) + { + auto id = this->dict.toWid(word); + if (id == (VID)-1) return {}; + auto col = etaByTopicWord.col(id); + return std::vector{ col.data(), col.data() + col.size() }; + } + else + { + auto it = etaByWord.find(word); + if (it == etaByWord.end()) return {}; + return it->second; + } + } + void updateDocs() { size_t docId = 0; @@ -704,6 +830,7 @@ namespace tomoto if (initDocs) this->removeStopwords(minWordCnt, removeTopN); static_cast(this)->updateWeakArray(); static_cast(this)->initGlobalState(initDocs); + static_cast(this)->prepareWordPriors(); const size_t V = this->realV; diff --git a/src/TopicModel/LLDAModel.hpp b/src/TopicModel/LLDAModel.hpp index e6d54e5..67bcda9 100644 --- a/src/TopicModel/LLDAModel.hpp +++ b/src/TopicModel/LLDAModel.hpp @@ -29,6 +29,7 @@ namespace tomoto Dictionary topicLabelDict; + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; diff --git a/src/TopicModel/MGLDAModel.hpp b/src/TopicModel/MGLDAModel.hpp index 7f6fd19..d21ea9f 100644 --- a/src/TopicModel/MGLDAModel.hpp +++ b/src/TopicModel/MGLDAModel.hpp @@ -97,17 +97,17 @@ namespace tomoto } } - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) { - b = this->chunkOffsetByDoc(partitionId, docId); - e = 
this->chunkOffsetByDoc(partitionId + 1, docId); + b = edd.chunkOffsetByDoc(partitionId, docId); + e = edd.chunkOffsetByDoc(partitionId + 1, docId); } - size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0; + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? edd.vChunkOffset[partitionId - 1] : 0; const auto K = this->K; for (size_t w = b; w < e; ++w) diff --git a/src/TopicModel/PAModel.hpp b/src/TopicModel/PAModel.hpp index 12a56db..d1337c0 100644 --- a/src/TopicModel/PAModel.hpp +++ b/src/TopicModel/PAModel.hpp @@ -67,14 +67,17 @@ namespace tomoto } // topic 1 & 2 assignment likelihoods for new word. ret K*K2 FLOATs + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; const auto eta = this->eta; assert(vid < V); + auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; - ld.subTmp = (ld.numByTopicWord.col(vid).array().template cast() + eta) / (ld.numByTopic2.array().template cast() + V * eta); + ld.subTmp = (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) + / (ld.numByTopic2.array().template cast() + etaHelper.getEtaSum()); for (size_t k = 0; k < this->K; ++k) { @@ -102,22 +105,30 @@ namespace tomoto updateCnt(ld.numByTopicWord(z2, vid), INC * weight); } - template - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + template + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) { - b = this->chunkOffsetByDoc(partitionId, docId); - e = this->chunkOffsetByDoc(partitionId + 1, docId); + b = edd.chunkOffsetByDoc(partitionId, docId); + e = edd.chunkOffsetByDoc(partitionId + 1, docId); } - size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0; + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? edd.vChunkOffset[partitionId - 1] : 0; for (size_t w = b; w < e; ++w) { if (doc.words[w] >= this->realV) continue; addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]); - auto dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + FLOAT* dist; + if (this->etaByTopicWord.size()) + { + dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + } + else + { + dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); + } auto z = sample::sampleFromDiscreteAcc(dist, dist + this->K * K2, rgs); doc.Zs[w] = z / K2; doc.Z2s[w] = z % K2; @@ -125,25 +136,26 @@ namespace tomoto } } - void distributePartition(ThreadPool& pool, _ModelState* localData) + template + void distributePartition(ThreadPool& pool, const _ModelState& globalState, _ModelState* localData, const _ExtraDocData& edd) const { std::vector> res = pool.enqueueToAll([&](size_t partitionId) { - size_t b = partitionId ? 
this->vChunkOffset[partitionId - 1] : 0, - e = this->vChunkOffset[partitionId]; - - localData[partitionId].numByTopicWord = this->globalState.numByTopicWord.block(0, b, this->globalState.numByTopicWord.rows(), e - b); - localData[partitionId].numByTopic = this->globalState.numByTopic; - localData[partitionId].numByTopic1_2 = this->globalState.numByTopic1_2; - localData[partitionId].numByTopic2 = this->globalState.numByTopic2; - if (!localData[partitionId].zLikelihood.size()) localData[partitionId].zLikelihood = this->globalState.zLikelihood; + size_t b = partitionId ? edd.vChunkOffset[partitionId - 1] : 0, + e = edd.vChunkOffset[partitionId]; + + localData[partitionId].numByTopicWord = globalState.numByTopicWord.block(0, b, globalState.numByTopicWord.rows(), e - b); + localData[partitionId].numByTopic = globalState.numByTopic; + localData[partitionId].numByTopic1_2 = globalState.numByTopic1_2; + localData[partitionId].numByTopic2 = globalState.numByTopic2; + if (!localData[partitionId].zLikelihood.size()) localData[partitionId].zLikelihood = globalState.zLikelihood; }); for (auto& r : res) r.get(); } - template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const + template + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const { std::vector> res; @@ -180,8 +192,8 @@ namespace tomoto { res = pool.enqueueToAll([&](size_t partitionId) { - size_t b = partitionId ? this->vChunkOffset[partitionId - 1] : 0, - e = this->vChunkOffset[partitionId]; + size_t b = partitionId ? edd.vChunkOffset[partitionId - 1] : 0, + e = edd.vChunkOffset[partitionId]; globalState.numByTopicWord.block(0, b, globalState.numByTopicWord.rows(), e - b) = localData[partitionId].numByTopicWord; }); for (auto& r : res) r.get(); @@ -268,6 +280,21 @@ namespace tomoto doc.Z2s = tvector(wordSize); } + void prepareWordPriors() + { + if (this->etaByWord.empty()) return; + this->etaByTopicWord.resize(K2, this->realV); + this->etaSumByTopic.resize(K2); + this->etaByTopicWord.array() = this->eta; + for (auto& it : this->etaByWord) + { + auto id = this->dict.toWid(it.first); + if (id == (VID)-1 || id >= this->realV) continue; + this->etaByTopicWord.col(id) = Eigen::Map>{ it.second.data(), (Eigen::Index)it.second.size() }; + } + this->etaSumByTopic = this->etaByTopicWord.rowwise().sum(); + } + void initGlobalState(bool initDocs) { const size_t V = this->realV; @@ -370,6 +397,17 @@ namespace tomoto } return ret; } + + void setWordPrior(const std::string& word, const std::vector& priors) override + { + if (priors.size() != K2) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "priors.size() must be equal to K2."); + for (auto p : priors) + { + if (p < 0) THROW_ERROR_WITH_INFO(exception::InvalidArgument, "priors must not be less than 0."); + } + this->dict.add(word); + this->etaByWord.emplace(word, priors); + } }; template diff --git a/src/TopicModel/PLDAModel.hpp b/src/TopicModel/PLDAModel.hpp index 9373a88..a0d2a6b 100644 --- a/src/TopicModel/PLDAModel.hpp +++ b/src/TopicModel/PLDAModel.hpp @@ -31,14 +31,16 @@ namespace tomoto size_t numLatentTopics, numTopicsPerLabel; + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); + auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; zLikelihood = (doc.numByTopic.array().template cast() + 
this->alphas.array()) - * (ld.numByTopicWord.col(vid).array().template cast() + this->eta) - / (ld.numByTopic.array().template cast() + V * this->eta); + * (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) + / (ld.numByTopic.array().template cast() + etaHelper.getEtaSum()); zLikelihood.array() *= doc.labelMask.array().template cast(); sample::prefixSum(zLikelihood.data(), this->K); return &zLikelihood[0]; diff --git a/src/TopicModel/SLDAModel.hpp b/src/TopicModel/SLDAModel.hpp index c3582e5..a8c8dce 100644 --- a/src/TopicModel/SLDAModel.hpp +++ b/src/TopicModel/SLDAModel.hpp @@ -203,14 +203,16 @@ namespace tomoto Eigen::Matrix normZ; // topic proportions for all docs, Dim : (K, D) Eigen::Matrix Ys; // response variables, Dim : (D, F) + template FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); + auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; zLikelihood = (doc.numByTopic.array().template cast() + this->alphas.array()) - * (ld.numByTopicWord.col(vid).array().template cast() + this->eta) - / (ld.numByTopic.array().template cast() + V * this->eta); + * (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) + / (ld.numByTopic.array().template cast() + etaHelper.getEtaSum()); for (size_t f = 0; f < F; ++f) { diff --git a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index 4eba47a..15ec703 100644 --- a/src/TopicModel/TopicModel.hpp +++ b/src/TopicModel/TopicModel.hpp @@ -303,7 +303,7 @@ namespace tomoto break; case ParallelScheme::partition: if (!(_Flags & flags::partitioned_multisampling)) THROW_ERROR_WITH_INFO(exception::InvalidArgument, - std::string{ "This model doesn't provide ParallelScheme::" } +toString(ps)); + std::string{ "This model doesn't provide ParallelScheme::" } + toString(ps)); break; } return ps; @@ -331,7 +331,8 @@ namespace tomoto if (ps == ParallelScheme::partition) { localData.resize(numWorkers); - static_cast<_Derived*>(this)->updatePartition(*cachedPool, localData.data()); + static_cast<_Derived*>(this)->updatePartition(*cachedPool, globalState, localData.data(), docs.begin(), docs.end(), + static_cast<_Derived*>(this)->eddTrain); } auto state = ps == ParallelScheme::none ? &globalState : localData.data(); @@ -451,7 +452,7 @@ namespace tomoto return static_cast(this)->template _infer(b, e, maxIter, tolerance, numWorkers); } } - throw std::invalid_argument{ "invalid ParallelScheme" }; + THROW_ERROR_WITH_INFO(exception::InvalidArgument, "invalid ParallelScheme"); } std::vector getTopicsByDoc(const DocumentBase* doc) const override diff --git a/src/Utils/exception.h b/src/Utils/exception.h index 816e336..8d3bbef 100644 --- a/src/Utils/exception.h +++ b/src/Utils/exception.h @@ -11,6 +11,12 @@ namespace tomoto using std::runtime_error::runtime_error; }; + class Unimplemented : public std::runtime_error + { + public: + using std::runtime_error::runtime_error; + }; + class InvalidArgument : public std::invalid_argument { public: diff --git a/src/Utils/math.h b/src/Utils/math.h index d535a36..01f19b9 100644 --- a/src/Utils/math.h +++ b/src/Utils/math.h @@ -167,7 +167,7 @@ namespace tomoto } template - inline _T digammaApprox(_T z) + inline auto digammaApprox(_T z) -> decltype(log(z + 4) - 1. / 2. / (z + 4) - 1. / 12. / ((z + 4) * (z + 4)) - 1. / z - 1. / (z + 1) - 1. / (z + 2) - 1. 
/ (z + 3)) { // approximation : digamma(z) ~= ln(z+4) - 1/2/(z+4) - 1/12/(z+4)^2 - 1/z - 1/(z+1) - 1/(z+2) - 1/(z+3) return log(z + 4) - 1. / 2. / (z + 4) - 1. / 12. / ((z + 4) * (z + 4)) - 1. / z - 1. / (z + 1) - 1. / (z + 2) - 1. / (z + 3); diff --git a/src/Utils/sample.hpp b/src/Utils/sample.hpp index aeb186b..dbee1c1 100644 --- a/src/Utils/sample.hpp +++ b/src/Utils/sample.hpp @@ -113,20 +113,6 @@ namespace tomoto } } #endif - - template - inline size_t sampleFromDiscrete(RealIt begin, RealIt end, Random& rg) - { - auto r = std::generate_canonical(rg) * std::accumulate(begin, end, 0.f); - size_t K = std::distance(begin, end); - size_t z = 0; - for (; r >= *begin && z < K - 1; ++z, ++begin) - { - r -= *begin; - } - return z; - } - struct FastRealGenerator { template @@ -144,6 +130,20 @@ namespace tomoto } }; + template + inline size_t sampleFromDiscrete(RealIt begin, RealIt end, Random& rg) + { + FastRealGenerator dist; + auto r = dist(rg) * std::accumulate(begin, end, 0.f); + size_t K = std::distance(begin, end); + size_t z = 0; + for (; r > *begin && z < K - 1; ++z, ++begin) + { + r -= *begin; + } + return z; + } + template inline size_t sampleFromDiscreteAcc(RealIt begin, RealIt end, Random& rg) { diff --git a/src/Utils/serializer.hpp b/src/Utils/serializer.hpp index 1d83c5f..03877bf 100644 --- a/src/Utils/serializer.hpp +++ b/src/Utils/serializer.hpp @@ -12,6 +12,43 @@ namespace tomoto { namespace serializer { + namespace detail + { + template using Invoke = typename _T::type; + + template struct seq { using type = seq; }; + + template struct concat; + + template + struct concat, seq<_i2...>> + : seq<_i1..., (sizeof...(_i1) + _i2)...> {}; + + template + using Concat = Invoke>; + + template struct gen_seq; + template using GenSeq = Invoke>; + + template + struct gen_seq : Concat, GenSeq<_n - _n / 2>> {}; + + template<> struct gen_seq<0> : seq<> {}; + template<> struct gen_seq<1> : seq<0> {}; + + template + std::array to_array(const char(&a)[_n], seq<_is...>) + { + return { {a[_is]...} }; + } + + template + constexpr std::array to_array(const char(&a)[_n]) + { + return to_array(a, GenSeq<_n - 1>{}); + } + } + template inline void writeToStream(std::ostream& ostr, const _Ty& v); template inline void readFromStream(std::istream& istr, _Ty& v); template inline _Ty readFromStream(std::istream& istr); @@ -28,6 +65,29 @@ namespace tomoto {} }; + template + struct Key + { + std::array m; + Key(const std::array& _m) : m(_m) + { + } + + Key(std::array&& _m) : m(_m) + { + } + + Key(const char(&a)[_len + 1]) : Key{ detail::to_array(a) } + { + } + }; + + template + constexpr Key<_n - 1> to_key(const char(&a)[_n]) + { + return Key<_n - 1>{detail::to_array(a)}; + } + inline void writeMany(std::ostream& ostr) { // do nothing @@ -71,6 +131,32 @@ namespace tomoto readMany(istr, std::forward<_RestTy>(rest)...); } + inline void writeManyKV(std::ostream& ostr) + { + // do nothing + } + + template + inline void writeManyKV(std::ostream& ostr, const Key<_len>& key, const _ValTy& value, _RestTy&& ... rest) + { + + writeToStream(ostr, value); + writeManyKV(ostr, std::forward<_RestTy>(rest)...); + } + + inline void readManyKV(std::istream& istr) + { + // do nothing + } + + template + inline void readManyKV(std::istream& istr, const Key<_len>& key, _ValTy& value, _RestTy&& ... 
rest)
+	{
+		readFromStream(istr, value);
+		readManyKV(istr, std::forward<_RestTy>(rest)...);
+	}
+
 	namespace detail
 	{
 		template<class> struct sfinae_true : std::true_type {};

diff --git a/src/python/py_LLDA.cpp b/src/python/py_LLDA.cpp
index b299cae..00fd940 100644
--- a/src/python/py_LLDA.cpp
+++ b/src/python/py_LLDA.cpp
@@ -51,7 +51,7 @@ static PyObject* LLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k
 	{
 		py::UniqueObj iter2;
 		if (PyUnicode_Check(argLabels)) PRINT_WARN("[warn] 'labels' should be an iterable of str.");
-		if (!(iter = PyObject_GetIter(argLabels)))
+		if (!(iter2 = PyObject_GetIter(argLabels)))
 		{
 			throw runtime_error{ "'labels' must be an iterable of str." };
 		}

diff --git a/test/unit_test.py b/test/unit_test.py
index b6d2707..6512dcb 100644
--- a/test/unit_test.py
+++ b/test/unit_test.py
@@ -2,7 +2,7 @@ model_cases = [
     (tp.LDAModel, 'test/sample.txt', 0, None, {'k':10}, None),
-    (tp.LLDAModel, 'test/sample_with_md.txt', 0, None, {'k':5}, None),
+    (tp.LLDAModel, 'test/sample_with_md.txt', 1, lambda x:x, {'k':5}, None),
     (tp.PLDAModel, 'test/sample_with_md.txt', 0, None, {'latent_topics':2, 'topics_per_label':2}, None),
     (tp.PLDAModel, 'test/sample_with_md.txt', 1, lambda x:x, {'latent_topics':2, 'topics_per_label':2}, None),
     (tp.HLDAModel, 'test/sample.txt', 0, None, {'depth':3}, [tp.ParallelScheme.NONE]),
@@ -95,6 +95,32 @@ def infer(cls, inputFile, mdFields, f, kargs, ps):
     mdl.infer(unseen_docs, parallel=ps)
 
+def infer_together(cls, inputFile, mdFields, f, kargs, ps):
+    print('Test infer')
+    tw = 0
+    print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw]))
+    mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs)
+    print('Adding docs...')
+    unseen_docs = []
+    for n, line in enumerate(open(inputFile, encoding='utf-8')):
+        ch = line.strip().split()
+        if len(ch) < mdFields + 1: continue
+        if n < 20: unseen_docs.append(line)
+        else:
+            if mdFields:
+                mdl.add_doc(ch[mdFields:], f(ch[:mdFields]))
+            else:
+                mdl.add_doc(ch)
+    mdl.train(20, parallel=ps)
+    for n, line in enumerate(unseen_docs):
+        ch = line.strip().split()
+        if mdFields:
+            unseen_docs[n] = mdl.make_doc(ch[mdFields:], f(ch[:mdFields]))
+        else:
+            unseen_docs[n] = mdl.make_doc(ch)
+
+    mdl.infer(unseen_docs, parallel=ps, together=True)
+
 def test_estimate_SLDA_PARTITION(cls=tp.SLDAModel, inputFile='test/sample_with_md.txt', mdFields=1, f=lambda x:list(map(float, x)), kargs={'k':10, 'vars':'b'}, ps=tp.ParallelScheme.PARTITION):
     print('Test estimate')
     tw = 0
@@ -122,5 +148,5 @@ def test_estimate_SLDA_PARTITION(cls=tp.SLDAModel, inputFile='test/sample_with_m
     pss = model_case[5]
     if not pss: pss = [tp.ParallelScheme.COPY_MERGE, tp.ParallelScheme.PARTITION]
     for ps in pss:
-        for func in [train1, train4, train0, save_and_load, infer]:
+        for func in [train1, train4, train0, save_and_load, infer, infer_together]:
             locals()['test_{}_{}_{}'.format(model_case[0].__name__, func.__name__, ps.name)] = (lambda f, mc, ps: lambda: f(*(mc + (ps,))))(func, model_case[:-1], ps)

diff --git a/tomotopy/__init__.py b/tomotopy/__init__.py
index d364951..0be618c 100644
--- a/tomotopy/__init__.py
+++ b/tomotopy/__init__.py
@@ -90,6 +90,8 @@ def _load():
     except:
         if isa == isas[-1]: raise
 _load()
+
+from tomotopy.corpus import *
 import os
 if os.environ.get('TOMOTOPY_LANG') == 'kr':
     __doc__ = """`tomotopy` 패키지는 Python에서 사용가능한 다양한 토픽 모델링 타입과 함수를 제공합니다.
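The `Key`/`to_array` helpers added to serializer.hpp turn a string literal into a fixed-size `std::array<char, N>` at compile time by expanding a generated index sequence, so that serialized fields can later be tagged with compact keys (`writeManyKV`/`readManyKV` already accept the keys, though they only forward the values for now). Here is a minimal sketch of the same trick, assuming C++14 and substituting `std::index_sequence` for the patch's hand-rolled `seq`/`gen_seq` machinery:

#include <array>
#include <cstddef>
#include <cstdio>
#include <utility>

// Copy a string literal into a std::array, dropping the trailing '\0';
// the pack expansion a[_is]... plays the role of detail::to_array above.
template<std::size_t _n, std::size_t... _is>
constexpr std::array<char, _n - 1> to_array(const char(&a)[_n], std::index_sequence<_is...>)
{
    return { {a[_is]...} };
}

template<std::size_t _n>
constexpr std::array<char, _n - 1> to_array(const char(&a)[_n])
{
    return to_array(a, std::make_index_sequence<_n - 1>{});
}

int main()
{
    constexpr auto key = to_array("alpha");  // std::array<char, 5>
    std::printf("%zu\n", key.size());        // prints 5
}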
diff --git a/tomotopy/documentation.kr.rst b/tomotopy/documentation.kr.rst
index 0adf85e..c5a409a 100644
--- a/tomotopy/documentation.kr.rst
+++ b/tomotopy/documentation.kr.rst
@@ -251,6 +251,11 @@ Python3 example code for tomotopy is available at https://github.com/bab2min/tomotopy/blob/ma
 History
 -------
+* 0.5.2 (2020-03-01)
+    * Fixed a segmentation fault when running `tomotopy.LLDAModel.add_doc`.
+    * Fixed an issue where running `infer` on `tomotopy.HDPModel` would sometimes terminate the program.
+    * Fixed an error that occurred when running `tomotopy.LDAModel.infer` with ps=tomotopy.ParallelScheme.PARTITION and together=True.
+
 * 0.5.1 (2020-01-11)
     * Fixed an issue where `tomotopy.SLDAModel.make_doc` did not support missing values.
     * `tomotopy.SLDAModel` now supports missing values. Documents with missing values take part in topic modeling, but are excluded from the regression of response variables.

diff --git a/tomotopy/documentation.rst b/tomotopy/documentation.rst
index 4ca8e69..22ca8c7 100644
--- a/tomotopy/documentation.rst
+++ b/tomotopy/documentation.rst
@@ -254,6 +254,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 History
 -------
+* 0.5.2 (2020-03-01)
+    * Fixed a segmentation fault in `tomotopy.LLDAModel.add_doc`.
+    * Fixed a bug where `infer` on `tomotopy.HDPModel` would sometimes crash the program.
+    * Fixed a crash in `tomotopy.LDAModel.infer` when run with ps=tomotopy.ParallelScheme.PARTITION and together=True.
+
 * 0.5.1 (2020-01-11)
     * Fixed a bug where `tomotopy.SLDAModel.make_doc` did not support missing values for `y`.
     * `tomotopy.SLDAModel` now fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in topic modeling, but excluded from the regression of response variables.

From f30807f2461d2eeca475b5538402a8791af740f9 Mon Sep 17 00:00:00 2001
From: Minchul Lee
Date: Mon, 2 Mar 2020 00:54:44 +0900
Subject: [PATCH 2/2] fixed wrong code

---
 tomotopy/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tomotopy/__init__.py b/tomotopy/__init__.py
index 0be618c..7ccf29f 100644
--- a/tomotopy/__init__.py
+++ b/tomotopy/__init__.py
@@ -91,7 +91,7 @@ def _load():
     if isa == isas[-1]: raise
 _load()
 
-from tomotopy.corpus import *
+#from tomotopy.corpus import *
 import os
 if os.environ.get('TOMOTOPY_LANG') == 'kr':
     __doc__ = """`tomotopy` 패키지는 Python에서 사용가능한 다양한 토픽 모델링 타입과 함수를 제공합니다.
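A final detail worth calling out: in both LDACVB0Model.hpp and LDAModel.hpp, `calcDigammaSum` replaces a scalar loop over `math::digammaT` with a single Eigen nullary expression evaluated through `math::digammaApprox`, whose series appears in the math.h comment above. The sketch below shows what that function computes in plain C++, using the approximation for both terms for brevity (the library keeps the exact `digammaT` for the `alpha` term); it is an illustration, not the library's code.

#include <cmath>
#include <cstdio>
#include <vector>

// Approximation quoted in src/Utils/math.h:
// digamma(z) ~= ln(z+4) - 1/2/(z+4) - 1/12/(z+4)^2 - 1/z - 1/(z+1) - 1/(z+2) - 1/(z+3)
double digammaApprox(double z)
{
    return std::log(z + 4) - 1. / 2. / (z + 4) - 1. / 12. / ((z + 4) * (z + 4))
        - 1. / z - 1. / (z + 1) - 1. / (z + 2) - 1. / (z + 3);
}

// calcDigammaSum accumulates digamma(x_i + alpha) - digamma(alpha) over a
// list of counts; it feeds the fixed-point update of the Dirichlet parameters.
double calcDigammaSum(const std::vector<double>& xs, double alpha)
{
    double dAlpha = digammaApprox(alpha), ret = 0;
    for (double x : xs) ret += digammaApprox(x + alpha) - dAlpha;
    return ret;
}

int main()
{
    std::vector<double> counts{ 3, 0, 7, 1 };
    std::printf("%f\n", calcDigammaSum(counts, 0.1));
}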