From 3a11b1c24f3adf40a5eabb7fb25a348a72c67a0d Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 25 Apr 2021 22:35:23 +0900 Subject: [PATCH 1/5] preparing 0.12.0 improved DMR & GDMR (#107) improved GDMR's performance fixed wrong topic_id for excluded words copy() method for topic models typed Python exceptions & warnings refactored code based on c++14 --- examples/dmr_multi_label.py | 54 +++ examples/dmr_plot.py | 2 +- src/Coherence/CoherenceModel.hpp | 2 +- src/Labeling/FoRelevance.cpp | 72 +++- src/Labeling/FoRelevance.h | 4 +- src/TopicModel/CT.h | 4 +- src/TopicModel/CTModel.hpp | 16 +- src/TopicModel/DMR.h | 14 +- src/TopicModel/DMRModel.hpp | 258 +++++++++++--- src/TopicModel/DTModel.hpp | 29 +- src/TopicModel/GDMR.h | 17 +- src/TopicModel/GDMRModel.hpp | 211 +++++++----- src/TopicModel/HDPModel.hpp | 12 +- src/TopicModel/HLDAModel.hpp | 8 +- src/TopicModel/HPAModel.hpp | 18 +- src/TopicModel/LDACVB0Model.hpp | 14 +- src/TopicModel/LDAModel.hpp | 47 ++- src/TopicModel/LLDAModel.hpp | 4 +- src/TopicModel/MGLDAModel.hpp | 8 +- src/TopicModel/PAModel.hpp | 16 +- src/TopicModel/PLDAModel.hpp | 4 +- src/TopicModel/PTModel.hpp | 4 +- src/TopicModel/SLDAModel.hpp | 98 ++++-- src/TopicModel/TopicModel.hpp | 43 ++- src/Utils/AliasMethod.hpp | 10 +- src/Utils/Dictionary.h | 1 + src/Utils/Utils.hpp | 113 +++++- src/Utils/math.h | 10 +- src/Utils/serializer.hpp | 10 + src/Utils/text.hpp | 8 + src/Utils/tvector.hpp | 13 +- src/python/PyUtils.h | 354 ++++++++++++++++--- src/python/docs.h | 195 ++++++++++- src/python/label_docs.h | 2 +- src/python/py_CT.cpp | 97 ++---- src/python/py_DMR.cpp | 261 ++++++++------ src/python/py_DT.cpp | 224 +++--------- src/python/py_GDMR.cpp | 207 ++++++----- src/python/py_HDP.cpp | 76 +--- src/python/py_HLDA.cpp | 70 +--- src/python/py_HPA.cpp | 93 ++--- src/python/py_LDA.cpp | 474 +++++++------------------ src/python/py_LLDA.cpp | 149 +++----- src/python/py_MGLDA.cpp | 108 ++---- src/python/py_PA.cpp | 212 +++--------- 
src/python/py_PLDA.cpp | 85 ++--- src/python/py_PT.cpp | 14 +- src/python/py_SLDA.cpp | 119 ++----- src/python/py_coherence.cpp | 37 +- src/python/py_label.cpp | 153 ++------- src/python/py_utils.cpp | 571 +++++++++---------------------- src/python/utils.h | 161 ++++----- test/unit_test.py | 43 ++- tomotopy/_summary.py | 23 +- tomotopy/_version.py | 2 +- 55 files changed, 2373 insertions(+), 2481 deletions(-) create mode 100644 examples/dmr_multi_label.py diff --git a/examples/dmr_multi_label.py b/examples/dmr_multi_label.py new file mode 100644 index 0000000..592182f --- /dev/null +++ b/examples/dmr_multi_label.py @@ -0,0 +1,54 @@ +''' +This example shows how to train a DMR topic model with multi-metadata using tomotopy +''' +import itertools + +import tomotopy as tp +import numpy as np + +# You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data . +corpus = tp.utils.Corpus() +for line in open('text_mining_year_journal.txt', encoding='utf-8'): + fd = line.strip().split('\t', maxsplit=2) + corpus.add_doc(fd[2].split(), multi_metadata=['y_' + fd[0], 'j_' + fd[1]]) +# We add the prefix 'y_' to year labels and 'j_' to journal labels + +# Each multi_metadata label is treated as a categorical (present/absent) feature, +# so, unlike GDMR, no numeric metadata range has to be configured for the model. 
+mdl = tp.DMRModel(tw=tp.TermWeight.ONE, + k=20, + corpus=corpus +) +mdl.optim_interval = 20 +mdl.burn_in = 200 + +mdl.train(0) + +print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( + len(mdl.docs), len(mdl.used_vocabs), mdl.num_words +)) + +# Let's train the model +for i in range(0, 2000, 20): + print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word)) + mdl.train(20) +print('Iteration: {:04} LL per word: {:.4}'.format(2000, mdl.ll_per_word)) + +mdl.summary() + +year_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('y_')) +journal_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('j_')) + +# calculate topic distribution with each metadata using get_topic_prior() +print('Topic distributions by year') +for l in year_labels: + print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n') + +print('Topic distributions by journal') +for l in journal_labels: + print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n') + +# Also we can estimate topic distributions with multiple metadata +print('Topic distributions by year-journal') +for y, j in itertools.product(year_labels, journal_labels): + print(y, ',', j, '\n', mdl.get_topic_prior(multi_metadata=[y, j]), '\n') diff --git a/examples/dmr_plot.py b/examples/dmr_plot.py index de8d09d..51ec6ce 100644 --- a/examples/dmr_plot.py +++ b/examples/dmr_plot.py @@ -38,7 +38,7 @@ for i in range(0, 2000, 20): print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word)) mdl.train(20) -print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word)) +print('Iteration: {:04} LL per word: {:.4}'.format(2000, mdl.ll_per_word)) mdl.summary() diff --git a/src/Coherence/CoherenceModel.hpp b/src/Coherence/CoherenceModel.hpp index 16d0742..5387e4a 100644 --- a/src/Coherence/CoherenceModel.hpp +++ b/src/Coherence/CoherenceModel.hpp @@ -25,7 +25,7 @@ namespace tomoto void init(size_t windowSize) { pe_type = _pe; - pe = make_unique>(windowSize); + pe = 
std::make_unique>(windowSize); } template diff --git a/src/Labeling/FoRelevance.cpp b/src/Labeling/FoRelevance.cpp index 374c9d1..d9cd548 100644 --- a/src/Labeling/FoRelevance.cpp +++ b/src/Labeling/FoRelevance.cpp @@ -6,6 +6,55 @@ using namespace tomoto::label; +template +class DocWordIterator +{ + const tomoto::DocumentBase* doc = nullptr; + size_t n = 0; +public: + DocWordIterator(const tomoto::DocumentBase* _doc = nullptr, size_t _n = 0) + : doc{ _doc }, n{ _n } + { + } + + tomoto::Vid operator[](size_t i) const + { + return doc->words[doc->wOrder.empty() ? (n + i) : doc->wOrder[n + i]]; + } + + tomoto::Vid operator*() const + { + return doc->words[doc->wOrder.empty() ? n : doc->wOrder[n]]; + } + + bool operator==(const DocWordIterator& o) const + { + return doc == o.doc && n == o.n; + } + + bool operator!=(const DocWordIterator& o) const + { + return !operator==(o); + } + + DocWordIterator& operator++() + { + if (reverse) --n; + else ++n; + return *this; + } + + DocWordIterator operator+(ptrdiff_t o) const + { + return { doc, (size_t)((ptrdiff_t)n + o) }; + } + + DocWordIterator operator-(ptrdiff_t o) const + { + return { doc, (size_t)((ptrdiff_t)n - o) }; + } +}; + class DocWrapper { const tomoto::DocumentBase* doc; @@ -25,24 +74,24 @@ class DocWrapper return doc->words[doc->wOrder.empty() ? 
idx : doc->wOrder[idx]]; } - auto begin() const -> decltype(doc->words.begin()) + DocWordIterator<> begin() const { - return doc->words.begin(); + return { doc, 0 }; } - auto end() const -> decltype(doc->words.end()) + DocWordIterator<> end() const { - return doc->words.end(); + return { doc, doc->words.size() }; } - auto rbegin() const -> decltype(doc->words.rbegin()) + DocWordIterator rbegin() const { - return doc->words.rbegin(); + return { doc, doc->words.size() }; } - auto rend() const -> decltype(doc->words.rend()) + DocWordIterator rend() const { - return doc->words.rend(); + return { doc, 0 }; } }; @@ -99,7 +148,6 @@ std::vector PMIExtractor::extract(const tomoto::ITopicModel* tm) cons return candidates; } - std::vector tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const { auto& vocabFreqs = tm->getVocabCf(); @@ -217,11 +265,11 @@ void FoRelevance::estimateContexts() } } - Eigen::Matrix wordTopicDist{ tm->getV(), tm->getK() }; + Matrix wordTopicDist{ tm->getV(), tm->getK() }; for (size_t i = 0; i < tm->getK(); ++i) { auto dist = tm->getWidsByTopic(i); - wordTopicDist.col(i) = Eigen::Map>{ dist.data(), (Eigen::Index)dist.size() }; + wordTopicDist.col(i) = Eigen::Map{ dist.data(), (Eigen::Index)dist.size() }; } size_t totDocCnt = 0; @@ -256,7 +304,7 @@ void FoRelevance::estimateContexts() } size_t docCnt = 0; - Eigen::Matrix wcPMI = Eigen::Matrix::Zero(this->tm->getV()); + Vector wcPMI = Vector::Zero(this->tm->getV()); for (auto& docId : c.docIds) { thread_local Eigen::VectorXi bdf(this->tm->getV()); diff --git a/src/Labeling/FoRelevance.h b/src/Labeling/FoRelevance.h index 481de5f..ad34c6a 100644 --- a/src/Labeling/FoRelevance.h +++ b/src/Labeling/FoRelevance.h @@ -93,8 +93,8 @@ namespace tomoto if (!numWorkers) numWorkers = std::thread::hardware_concurrency(); if (numWorkers > 1) { - pool = make_unique(numWorkers); - mtx = make_unique(numWorkers); + pool = std::make_unique(numWorkers); + mtx = std::make_unique(numWorkers); } for (; candFirst 
!= candEnd; ++candFirst) diff --git a/src/TopicModel/CT.h b/src/TopicModel/CT.h index d9071de..140c578 100644 --- a/src/TopicModel/CT.h +++ b/src/TopicModel/CT.h @@ -8,8 +8,8 @@ namespace tomoto { using BaseDocument = DocumentLDA<_tw>; using DocumentLDA<_tw>::DocumentLDA; - Eigen::Matrix beta; // Dim: (K, betaSample) - Eigen::Matrix smBeta; // Dim: K + Matrix beta; // Dim: (K, betaSample) + Vector smBeta; // Dim: K DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, smBeta); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, smBeta); diff --git a/src/TopicModel/CTModel.hpp b/src/TopicModel/CTModel.hpp index 4e10bec..7f8257b 100644 --- a/src/TopicModel/CTModel.hpp +++ b/src/TopicModel/CTModel.hpp @@ -56,14 +56,14 @@ namespace tomoto void updateBeta(_DocType& doc, _RandGen& rg) const { - Eigen::Matrix pbeta, lowerBound, upperBound; + Vector pbeta, lowerBound, upperBound; constexpr Float epsilon = 1e-8; constexpr size_t burnIn = 3; - pbeta = lowerBound = upperBound = Eigen::Matrix::Zero(this->K); + pbeta = lowerBound = upperBound = Vector::Zero(this->K); for (size_t i = 0; i < numBetaSample + burnIn; ++i) { - if (i == 0) pbeta = Eigen::Matrix::Ones(this->K); + if (i == 0) pbeta = Vector::Ones(this->K); else pbeta = doc.beta.col(i % numBetaSample).array().exp(); Float betaESum = pbeta.sum() + 1; @@ -199,7 +199,7 @@ namespace tomoto for (; _first != _last; ++_first) { auto& doc = *_first; - Eigen::Matrix pbeta = doc.smBeta.array().log(); + Vector pbeta = doc.smBeta.array().log(); Float last = pbeta[K - 1]; for (Tid k = 0; k < K; ++k) { @@ -215,8 +215,8 @@ namespace tomoto void prepareDoc(_DocType& doc, size_t docId, size_t wordSize) const { BaseClass::prepareDoc(doc, docId, wordSize); - doc.beta = Eigen::Matrix::Zero(this->K, numBetaSample); - doc.smBeta = Eigen::Matrix::Constant(this->K, (Float)1 / this->K); + doc.beta = Matrix::Zero(this->K, numBetaSample); + doc.smBeta = Vector::Constant(this->K, (Float)1 / this->K); } void 
updateDocs() @@ -224,7 +224,7 @@ namespace tomoto BaseClass::updateDocs(); for (auto& doc : this->docs) { - doc.beta = Eigen::Matrix::Zero(this->K, numBetaSample); + doc.beta = Matrix::Zero(this->K, numBetaSample); } } @@ -274,7 +274,7 @@ namespace tomoto std::vector getCorrelationTopic(Tid k) const override { - Eigen::Matrix ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt(); + Vector ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt(); return { ret.data(), ret.data() + ret.size() }; } diff --git a/src/TopicModel/DMR.h b/src/TopicModel/DMR.h index e45ce3a..1a0a059 100644 --- a/src/TopicModel/DMR.h +++ b/src/TopicModel/DMR.h @@ -11,11 +11,15 @@ namespace tomoto using BaseDocument = DocumentLDA<_tw>; using DocumentLDA<_tw>::DocumentLDA; uint64_t metadata = 0; + std::vector multiMetadata; + Vector mdVec; + size_t mdHash = (size_t)-1; + mutable Matrix cachedAlpha; RawDoc::MiscType makeMisc(const ITopicModel* tm) const override; DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata); - DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata); + DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata, multiMetadata); }; struct DMRArgs : public LDAArgs @@ -36,10 +40,18 @@ namespace tomoto virtual void setOptimRepeat(size_t repeat) = 0; virtual size_t getOptimRepeat() const = 0; virtual size_t getF() const = 0; + virtual size_t getMdVecSize() const = 0; virtual Float getSigma() const = 0; virtual const Dictionary& getMetadataDict() const = 0; + virtual const Dictionary& getMultiMetadataDict() const = 0; virtual std::vector getLambdaByMetadata(size_t metadataId) const = 0; virtual std::vector getLambdaByTopic(Tid tid) const = 0; + + virtual std::vector getTopicPrior( + const std::string& metadata, + const std::vector& multiMetadata, + bool raw = false + ) const = 0; }; template diff --git 
a/src/TopicModel/DMRModel.hpp b/src/TopicModel/DMRModel.hpp index 8f70d47..2e39a82 100644 --- a/src/TopicModel/DMRModel.hpp +++ b/src/TopicModel/DMRModel.hpp @@ -13,7 +13,21 @@ namespace tomoto template struct ModelStateDMR : public ModelStateLDA<_tw> { - Eigen::Matrix tmpK; + Vector tmpK; + }; + + struct MdHash + { + size_t operator()(std::pair const& p) const + { + size_t seed = p.first; + for (size_t i = 0; i < p.second.size(); ++i) + { + auto elem = p.second[i]; + seed ^= std::hash()(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } }; template lambda; - Eigen::Matrix expLambda; + Matrix lambda; + mutable std::unordered_map, size_t, MdHash> mdHashMap; + mutable Matrix cachedAlphas; Float sigma; - uint32_t F = 0; + uint32_t F = 0, mdVecSize = 1; uint32_t optimRepeat = 5; Float alphaEps = 1e-10; - Float temperatureScale = 0; static constexpr Float maxLambda = 10; static constexpr size_t maxBFGSIteration = 10; Dictionary metadataDict; + Dictionary multiMetadataDict; LBFGSpp::LBFGSSolver solver; - Float getNegativeLambdaLL(Eigen::Ref> x, Eigen::Matrix& g) const + Float getNegativeLambdaLL(Eigen::Ref x, Vector& g) const { g = (x.array() - log(this->alpha)) / pow(sigma, 2); return (x.array() - log(this->alpha)).pow(2).sum() / 2 / pow(sigma, 2); } - Float evaluateLambdaObj(Eigen::Ref> x, Eigen::Matrix& g, ThreadPool& pool, _ModelState* localData) const + Float evaluateLambdaObj(Eigen::Ref x, Vector& g, ThreadPool& pool, _ModelState* localData) const { // if one of x is greater than maxLambda, return +inf for preventing searching more if ((x.array() > maxLambda).any()) return INFINITY; const auto K = this->K; - Float fx = - static_cast(this)->getNegativeLambdaLL(x, g); - auto alphas = (x.array().exp() + alphaEps).eval(); + Float fx = -static_cast(this)->getNegativeLambdaLL(x, g); + Eigen::Map xReshaped{ x.data(), K, F * mdVecSize }; - std::vector>> res; + std::vector>> res; const size_t chStride = pool.getNumWorkers() * 8; for (size_t ch = 0; ch < 
chStride; ++ch) { @@ -72,28 +87,28 @@ namespace tomoto { auto& tmpK = localData[threadId].tmpK; if (!tmpK.size()) tmpK.resize(this->K); - Eigen::Matrix val = Eigen::Matrix::Zero(K * F + 1); + Eigen::Array val = Eigen::Array::Zero(K * F * mdVecSize + 1); + Eigen::Map grad{ val.data(), K, F * mdVecSize }; + Float& fx = val[K * F * mdVecSize]; for (size_t docId = ch; docId < this->docs.size(); docId += chStride) { const auto& doc = this->docs[docId]; - auto alphaDoc = alphas.segment(doc.metadata * K, K); + auto alphaDoc = ((xReshaped.middleCols(doc.metadata * mdVecSize, mdVecSize) * doc.mdVec).array().exp() + alphaEps).matrix().eval(); Float alphaSum = alphaDoc.sum(); for (Tid k = 0; k < K; ++k) { - val[K * F] -= math::lgammaT(alphaDoc[k]) - math::lgammaT(doc.numByTopic[k] + alphaDoc[k]); + fx -= math::lgammaT(alphaDoc[k]) - math::lgammaT(doc.numByTopic[k] + alphaDoc[k]); if (!std::isfinite(alphaDoc[k]) && alphaDoc[k] > 0) tmpK[k] = 0; else tmpK[k] = -(math::digammaT(alphaDoc[k]) - math::digammaT(doc.numByTopic[k] + alphaDoc[k])); } - //val[K * F] = -(lgammaApprox(alphaDoc.array()) - lgammaApprox(doc.numByTopic.array().cast() + alphaDoc.array())).sum(); - //tmpK = -(digammaApprox(alphaDoc.array()) - digammaApprox(doc.numByTopic.array().cast() + alphaDoc.array())); - val[K * F] += math::lgammaT(alphaSum) - math::lgammaT(doc.getSumWordWeight() + alphaSum); + fx += math::lgammaT(alphaSum) - math::lgammaT(doc.getSumWordWeight() + alphaSum); Float t = math::digammaT(alphaSum) - math::digammaT(doc.getSumWordWeight() + alphaSum); if (!std::isfinite(alphaSum) && alphaSum > 0) { - val[K * F] = -INFINITY; + fx = -INFINITY; t = 0; } - val.segment(doc.metadata * K, K).array() -= alphaDoc.array() * (tmpK.array() + t); + grad.middleCols(doc.metadata * mdVecSize, mdVecSize) -= (alphaDoc.array() * (tmpK.array() + t)).matrix() * doc.mdVec.transpose(); } return val; })); @@ -101,8 +116,8 @@ namespace tomoto for (auto& r : res) { auto ret = r.get(); - fx += ret[K * F]; - g += ret.head(K 
* F); + fx += ret[K * F * mdVecSize]; + g += ret.head(K * F * mdVecSize).matrix(); } // positive fx is an error from limited precision of float. @@ -113,20 +128,23 @@ namespace tomoto void initParameters() { lambda = Eigen::Rand::normalLike(lambda, this->rg, 0, sigma); - lambda += log(this->alphas.array()).matrix().replicate(1, F); + for (size_t f = 0; f < F; ++f) + { + lambda.col(f * mdVecSize) += this->alphas.array().log().matrix(); + } } void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { - Eigen::Matrix bLambda; + Matrix bLambda; Float fx = 0, bestFx = INFINITY; for (size_t i = 0; i < optimRepeat; ++i) { static_cast(this)->initParameters(); - int ret = solver.minimize([this, &pool, localData](Eigen::Ref> x, Eigen::Matrix& g) + int ret = solver.minimize([this, &pool, localData](Eigen::Ref x, Vector& g) { return static_cast(this)->evaluateLambdaObj(x, g, pool, localData); - }, Eigen::Map>(lambda.data(), lambda.size()), fx); + }, Eigen::Map(lambda.data(), lambda.size()), fx); if (fx < bestFx) { @@ -140,41 +158,57 @@ namespace tomoto throw exc::TrainingError{ "optimizing parameters has been failed!" }; } lambda = bLambda; + updateCachedAlphas(); //std::cerr << fx << std::endl; - expLambda = lambda.array().exp() + alphaEps; } int restoreFromTrainingError(const exc::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { std::cerr << "Failed to optimize! Reset prior and retry!" 
<< std::endl; lambda.setZero(); - expLambda = lambda.array().exp() + alphaEps; + updateCachedAlphas(); return 0; } + auto getCachedAlpha(const _DocType& doc) const + { + if (doc.mdHash < cachedAlphas.cols()) + { + return cachedAlphas.col(doc.mdHash); + } + else + { + if (!doc.cachedAlpha.size()) + { + doc.cachedAlpha = (lambda.middleCols(doc.metadata * mdVecSize, mdVecSize) * doc.mdVec).array().exp() + alphaEps; + } + return doc.cachedAlpha.col(0); + } + } + template Float* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); auto etaHelper = this->template getEtaHelper<_asymEta>(); + auto alphas = getCachedAlpha(doc); auto& zLikelihood = ld.zLikelihood; - zLikelihood = (doc.numByTopic.array().template cast() + this->expLambda.col(doc.metadata).array()) + zLikelihood = (doc.numByTopic.array().template cast() + alphas.array()) * (ld.numByTopicWord.col(vid).array().template cast() + etaHelper.getEta(vid)) / (ld.numByTopic.array().template cast() + etaHelper.getEtaSum()); sample::prefixSum(zLikelihood.data(), this->K); return &zLikelihood[0]; } - double getLLDocTopic(const _DocType& doc) const { const size_t V = this->realV; const auto K = this->K; - auto alphaDoc = expLambda.col(doc.metadata); - + auto alphaDoc = getCachedAlpha(doc); + Float ll = 0; Float alphaSum = alphaDoc.sum(); for (Tid k = 0; k < K; ++k) @@ -196,7 +230,7 @@ namespace tomoto for (; _first != _last; ++_first) { auto& doc = *_first; - auto alphaDoc = expLambda.col(doc.metadata); + auto alphaDoc = getCachedAlpha(doc); Float alphaSum = alphaDoc.sum(); for (Tid k = 0; k < K; ++k) @@ -231,25 +265,102 @@ namespace tomoto return ll; } + void updateCachedAlphas() const + { + cachedAlphas.resize(this->K, mdHashMap.size()); + + for (auto& p : mdHashMap) + { + cachedAlphas.col(p.second) = (lambda.middleCols(p.first.first * mdVecSize, mdVecSize) * p.first.second).array().exp() + alphaEps; + } + } + + void prepareDoc(_DocType& 
doc, size_t docId, size_t wordSize) const + { + BaseClass::prepareDoc(doc, docId, wordSize); + + doc.mdVec = Vector::Zero(mdVecSize); + doc.mdVec[0] = 1; + for (auto x : doc.multiMetadata) + { + doc.mdVec[x + 1] = 1; + } + + auto p = std::make_pair(doc.metadata, doc.mdVec); + auto it = mdHashMap.find(p); + if (it == mdHashMap.end()) + { + it = mdHashMap.emplace(p, mdHashMap.size()).first; + } + doc.mdHash = it->second; + } + void initGlobalState(bool initDocs) { BaseClass::initGlobalState(initDocs); - this->globalState.tmpK = Eigen::Matrix::Zero(this->K); + this->globalState.tmpK = Vector::Zero(this->K); F = metadataDict.size(); + mdVecSize = multiMetadataDict.size() + 1; if (initDocs) { - lambda = log(this->alphas.array()).replicate(1, F); + lambda.resize(this->K, F * mdVecSize); + for (size_t f = 0; f < F; ++f) + { + lambda.col(f * mdVecSize) = this->alphas.array().log(); + lambda.middleCols(f * mdVecSize + 1, mdVecSize - 1).setZero(); + } } + else + { + for (auto& doc : this->docs) + { + if (doc.mdVec.size() == mdVecSize) continue; + doc.mdVec = Vector::Zero(mdVecSize); + doc.mdVec[0] = 1; + for (auto x : doc.multiMetadata) + { + doc.mdVec[x + 1] = 1; + } + + auto p = std::make_pair(doc.metadata, doc.mdVec); + auto it = this->mdHashMap.find(p); + if (it == this->mdHashMap.end()) + { + it = this->mdHashMap.emplace(p, mdHashMap.size()).first; + } + doc.mdHash = it->second; + } + } + if (_Flags & flags::continuous_doc_data) this->numByTopicDoc = Eigen::Matrix::Zero(this->K, this->docs.size()); - expLambda = lambda.array().exp(); LBFGSpp::LBFGSParam param; param.max_iterations = maxBFGSIteration; solver = decltype(solver){ param }; } + void prepareShared() + { + BaseClass::prepareShared(); + + for (auto doc : this->docs) + { + if (doc.mdHash != (size_t)-1) continue; + + auto p = std::make_pair(doc.metadata, doc.mdVec); + auto it = mdHashMap.find(p); + if (it == mdHashMap.end()) + { + it = mdHashMap.emplace(p, mdHashMap.size()).first; + } + doc.mdHash = it->second; + 
} + + updateCachedAlphas(); + } + public: DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma, alphaEps, metadataDict, lambda); - DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda); + DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda, multiMetadataDict); DMRModel(const DMRArgs& args) : BaseClass(args), sigma(args.sigma), alphaEps(args.alphaEps) @@ -258,17 +369,29 @@ namespace tomoto } template - _DocType& _updateDoc(_DocType& doc, const std::string& metadata) + _DocType& _updateDoc(_DocType& doc, const std::string& metadata, const std::vector& mdVec = {}) { Vid xid; if (_const) { xid = metadataDict.toWid(metadata); - if (xid == (Vid)-1) throw std::invalid_argument("unknown metadata"); + if (xid == (Vid)-1) throw exc::InvalidArgument("unknown metadata '" + metadata + "'"); + + for (auto& m : mdVec) + { + Vid x = multiMetadataDict.toWid(m); + if (x == (Vid)-1) throw exc::InvalidArgument("unknown multi_metadata '" + m + "'"); + doc.multiMetadata.emplace_back(x); + } } else { xid = metadataDict.add(metadata); + + for (auto& m : mdVec) + { + doc.multiMetadata.emplace_back(multiMetadataDict.add(m)); + } } doc.metadata = xid; return doc; @@ -277,28 +400,41 @@ namespace tomoto size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override { auto doc = this->template _makeFromRawDoc(rawDoc, tokenizer); - return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc("metadata"))); + return this->_addDoc(_updateDoc(doc, + rawDoc.template getMisc("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") + )); } std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMisc("metadata"))); + return 
std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, + rawDoc.template getMisc("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") + )); } size_t addDoc(const RawDoc& rawDoc) override { auto doc = this->_makeFromRawDoc(rawDoc); - return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc("metadata"))); + return this->_addDoc(_updateDoc(doc, + rawDoc.template getMisc("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") + )); } std::unique_ptr makeDoc(const RawDoc& rawDoc) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMisc("metadata"))); + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, + rawDoc.template getMisc("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") + )); } GETTER(F, size_t, F); + GETTER(MdVecSize, size_t, mdVecSize); GETTER(Sigma, Float, sigma); GETTER(AlphaEps, Float, alphaEps); GETTER(OptimRepeat, size_t, optimRepeat); @@ -316,7 +452,7 @@ namespace tomoto std::vector getTopicsByDoc(const _DocType& doc, bool normalize) const { std::vector ret(this->K); - auto alphaDoc = expLambda.col(doc.metadata); + auto alphaDoc = getCachedAlpha(doc); Eigen::Map> m{ ret.data(), this->K }; if (normalize) { @@ -333,20 +469,52 @@ namespace tomoto { assert(metadataId < metadataDict.size()); auto l = lambda.col(metadataId); - return { l.data(), l.data() + this->K }; + return { l.data(), l.data() + l.size() }; } std::vector getLambdaByTopic(Tid tid) const override { - std::vector ret(F); + std::vector ret(F * mdVecSize); if (this->lambda.size()) { - Eigen::Map>{ ret.data(), (Eigen::Index)ret.size() } = this->lambda.row(tid); + Eigen::Map{ ret.data(), (Eigen::Index)ret.size() } = this->lambda.row(tid); + } + return ret; + } + + std::vector getTopicPrior(const std::string& metadata, + const std::vector& mdVec, + bool raw = false + ) const override + { + Vid xid = 
metadataDict.toWid(metadata); + if (xid == (Vid)-1) throw exc::InvalidArgument("unknown metadata '" + metadata + "'"); + + Vector xs = Vector::Zero(mdVecSize); + xs[0] = 1; + for (auto& m : mdVec) + { + Vid x = multiMetadataDict.toWid(m); + if (x == (Vid)-1) throw exc::InvalidArgument("unknown multi_metadata '" + m + "'"); + xs[x + 1] = 1; + } + + std::vector ret(this->K); + Eigen::Map map{ ret.data(), (Eigen::Index)ret.size() }; + + if (raw) + { + map = lambda.middleCols(xid * mdVecSize, mdVecSize) * xs; + } + else + { + map = (lambda.middleCols(xid * mdVecSize, mdVecSize) * xs).array().exp() + alphaEps; } return ret; } const Dictionary& getMetadataDict() const override { return metadataDict; } + const Dictionary& getMultiMetadataDict() const override { return multiMetadataDict; } }; /* This is for preventing 'undefined symbol' problem in compiling by clang. */ diff --git a/src/TopicModel/DTModel.hpp b/src/TopicModel/DTModel.hpp index 1d6b57d..1841017 100644 --- a/src/TopicModel/DTModel.hpp +++ b/src/TopicModel/DTModel.hpp @@ -47,10 +47,10 @@ namespace tomoto Float shapeA = 0.03f, shapeB = 0.1f, shapeC = 0.55f; Float alphaVar = 1.f, etaVar = 1.f, phiVar = 1.f, etaRegL2 = 0.0f; - Eigen::Matrix alphas; // Dim: (Topic, Time) - Eigen::Matrix etaByDoc; // Dim: (Topic, Docs) : Topic distribution by docs(and time) + Matrix alphas; // Dim: (Topic, Time) + Matrix etaByDoc; // Dim: (Topic, Docs) : Topic distribution by docs(and time) std::vector numDocsByTime; // Dim: (Time) - Eigen::Matrix phi; // Dim: (Word, Topic * Time) + Matrix phi; // Dim: (Word, Topic * Time) std::vector> wordAliasTables; // Dim: (Word * Time) template @@ -84,8 +84,8 @@ namespace tomoto // sampling eta { - Eigen::Matrix estimatedCnt = (doc.eta.array() - doc.eta.maxCoeff()).exp(); - Eigen::Matrix etaTmp; + Vector estimatedCnt = (doc.eta.array() - doc.eta.maxCoeff()).exp(); + Vector etaTmp; estimatedCnt *= doc.getSumWordWeight() / estimatedCnt.sum(); auto prior = (alphas.col(doc.timepoint) - doc.eta) / 
std::max(etaVar, eps * 2); auto grad = doc.numByTopic.template cast() - estimatedCnt; @@ -181,20 +181,21 @@ namespace tomoto template void _sampleGlobalLevel(ThreadPool* pool, _ModelState*, _RandGen* rgs, _DocIter first, _DocIter last) { + if (!this->realV) return; const auto K = this->K; const Float eps = shapeA * (std::pow(shapeB + 1 + this->globalStep, -shapeC)); // sampling phi for (size_t k = 0; k < K; ++k) { - Eigen::Matrix phiGrad{ (Eigen::Index)this->realV, (Eigen::Index)T }; + Matrix phiGrad{ (Eigen::Index)this->realV, (Eigen::Index)T }; for (size_t t = 0; t < T; ++t) { auto phi_tk = phi.col(k + K * t); - Eigen::Matrix estimatedCnt = (phi_tk.array() - phi_tk.maxCoeff()).exp(); + Vector estimatedCnt = (phi_tk.array() - phi_tk.maxCoeff()).exp(); estimatedCnt *= this->globalState.numByTopic(k, t) / estimatedCnt.sum(); - Eigen::Matrix grad = this->globalState.numByTopicWord.row(k + K * t).template cast(); + Vector grad = this->globalState.numByTopicWord.row(k + K * t).template cast(); grad -= estimatedCnt; auto epsNoise = Eigen::Rand::normal>(this->realV, 1, *rgs) * eps; if (t == 0) @@ -228,7 +229,7 @@ namespace tomoto } } - Eigen::Matrix newAlphas = Eigen::Matrix::Zero(alphas.rows(), alphas.cols()); + Matrix newAlphas = Matrix::Zero(alphas.rows(), alphas.cols()); for (size_t t = 0; t < T; ++t) { // update alias tables for word proposal @@ -398,9 +399,9 @@ namespace tomoto this->globalState.numByTopic = Eigen::Matrix::Zero(this->K, T); this->globalState.numByTopicWord = Eigen::Matrix::Zero(this->K * T, V); - alphas = Eigen::Matrix::Zero(this->K, T); - etaByDoc = Eigen::Matrix::Zero(this->K, this->docs.size()); - phi = Eigen::Matrix::Zero(this->realV, this->K * T); + alphas = Matrix::Zero(this->K, T); + etaByDoc = Matrix::Zero(this->K, this->docs.size()); + phi = Matrix::Zero(this->realV, this->K * T); } numDocsByTime.resize(T); @@ -508,7 +509,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const 
override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer); - return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc("timepoint"))); + return std::make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc("timepoint"))); } size_t addDoc(const RawDoc& rawDoc) override @@ -520,7 +521,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc); - return make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc("timepoint"))); + return std::make_unique<_DocType>(_updateDoc(doc, rawDoc.template getMisc("timepoint"))); } Float getAlpha(size_t k, size_t t) const override diff --git a/src/TopicModel/GDMR.h b/src/TopicModel/GDMR.h index 3817623..d2dc026 100644 --- a/src/TopicModel/GDMR.h +++ b/src/TopicModel/GDMR.h @@ -41,8 +41,21 @@ namespace tomoto virtual const std::vector& getFs() const = 0; virtual std::vector getLambdaByTopic(Tid tid) const = 0; - virtual std::vector getTDF(const Float* metadata, size_t metadataCat, bool normalize) const = 0; - virtual std::vector getTDFBatch(const Float* metadata, size_t metadataCat, size_t stride, size_t cnt, bool normalize) const = 0; + virtual std::vector getTDF( + const Float* metadata, + const std::string& metadataCat, + const std::vector& multiMetadataCat, + bool normalize + ) const = 0; + + virtual std::vector getTDFBatch( + const Float* metadata, + const std::string& metadataCat, + const std::vector& multiMetadataCat, + size_t stride, + size_t cnt, + bool normalize + ) const = 0; virtual void setMdRange(const std::vector& vMin, const std::vector& vMax) = 0; virtual void getMdRange(std::vector& vMin, std::vector& vMax) const = 0; diff --git a/src/TopicModel/GDMRModel.hpp b/src/TopicModel/GDMRModel.hpp index ffd3ccf..15c0e16 100644 --- a/src/TopicModel/GDMRModel.hpp +++ b/src/TopicModel/GDMRModel.hpp @@ -8,8 +8,8 @@ namespace tomoto template struct ModelStateGDMR : public ModelStateDMR<_tw> { - 
/*Eigen::Matrix alphas; - Eigen::Matrix terms; + /*Vector alphas; + Vector terms; std::vector> slpCache; std::vector ndimCnt;*/ }; @@ -37,26 +37,28 @@ namespace tomoto std::vector mdCoefs, mdIntercepts, mdMax; std::vector degreeByF; Eigen::Array orderDecayCached; - size_t fCont = 1, fCat = 1; + size_t fCont = 1; - Float getIntegratedLambdaSq(const Eigen::Ref, 0, Eigen::InnerStride<>>& lambdas) const + Float getIntegratedLambdaSq(const Eigen::Ref>& lambdas) const { Float ret = 0; - for (size_t i = 0; i < fCat; ++i) + for (size_t i = 0; i < this->F; ++i) { - ret += pow(lambdas[fCont * i] - log(this->alpha), 2) / 2 / pow(this->sigma0, 2); - ret += (lambdas.segment(fCont * i + 1, fCont - 1).array().pow(2) / 2 * orderDecayCached.segment(1, fCont - 1) / pow(this->sigma, 2)).sum(); + ret += pow(lambdas[this->mdVecSize * i] - log(this->alpha), 2) / 2 / pow(this->sigma0, 2); + ret += (lambdas.segment(this->mdVecSize * i + 1, fCont - 1).array().pow(2) / 2 * orderDecayCached.segment(1, fCont - 1) / pow(this->sigma, 2)).sum(); + ret += lambdas.segment(this->mdVecSize * i + fCont, this->mdVecSize - fCont).array().pow(2).sum() / 2 / pow(this->sigma, 2); } return ret; } - void getIntegratedLambdaSqP(const Eigen::Ref, 0, Eigen::InnerStride<>>& lambdas, - Eigen::Ref, 0, Eigen::InnerStride<>> ret) const + void getIntegratedLambdaSqP(const Eigen::Ref>& lambdas, + Eigen::Ref> ret) const { - for (size_t i = 0; i < fCat; ++i) + for (size_t i = 0; i < this->F; ++i) { - ret[fCont * i] = (lambdas[fCont * i] - log(this->alpha)) / pow(this->sigma0, 2); - ret.segment(fCont * i + 1, fCont - 1) = lambdas.segment(fCont * i + 1, fCont - 1).array() * orderDecayCached.segment(1, fCont - 1) / pow(this->sigma, 2); + ret[this->mdVecSize * i] = (lambdas[this->mdVecSize * i] - log(this->alpha)) / pow(this->sigma0, 2); + ret.segment(this->mdVecSize * i + 1, fCont - 1) = lambdas.segment(this->mdVecSize * i + 1, fCont - 1).array() * orderDecayCached.segment(1, fCont - 1) / pow(this->sigma, 2); + 
ret.segment(this->mdVecSize * i + fCont, this->mdVecSize - fCont) = lambdas.segment(this->mdVecSize * i + fCont, this->mdVecSize - fCont).array() / pow(this->sigma, 2); } } @@ -64,22 +66,27 @@ namespace tomoto { this->lambda = Eigen::Rand::normalLike(this->lambda, this->rg); - for (size_t i = 0; i < fCat; ++i) + for (size_t i = 0; i < this->F; ++i) { - this->lambda.col(fCont * i).array() *= sigma0; - this->lambda.col(fCont * i).array() += log(this->alphas.array()); + this->lambda.col(this->mdVecSize * i).array() *= sigma0; + this->lambda.col(this->mdVecSize * i).array() += log(this->alphas.array()); for (size_t j = 1; j < fCont; ++j) { - this->lambda.col(fCont * i + j).array() *= this->sigma / std::sqrt(orderDecayCached[j]); + this->lambda.col(this->mdVecSize * i + j).array() *= this->sigma / std::sqrt(orderDecayCached[j]); + } + + for (size_t j = fCont; j < this->mdVecSize; ++j) + { + this->lambda.col(this->mdVecSize * i + j).array() *= this->sigma; } } } - Float getNegativeLambdaLL(Eigen::Ref> x, Eigen::Matrix& g) const + Float getNegativeLambdaLL(Eigen::Ref x, Vector& g) const { - auto mappedX = Eigen::Map>(x.data(), this->K, this->F); - auto mappedG = Eigen::Map>(g.data(), this->K, this->F); + auto mappedX = Eigen::Map(x.data(), this->K, this->F); + auto mappedG = Eigen::Map(g.data(), this->K, this->F); Float fx = 0; for (size_t k = 0; k < this->K; ++k) @@ -90,7 +97,7 @@ namespace tomoto return fx; } - Float evaluateLambdaObj(Eigen::Ref> x, Eigen::Matrix& g, ThreadPool& pool, _ModelState* localData) const + /*Float evaluateLambdaObj(Eigen::Ref x, Vector& g, ThreadPool& pool, _ModelState* localData) const { // if one of x is greater than maxLambda, return +inf for preventing search more if ((x.array() > this->maxLambda).any()) return INFINITY; @@ -98,18 +105,18 @@ namespace tomoto const auto K = this->K; const auto KF = this->K * this->F; - auto mappedX = Eigen::Map>(x.data(), K, this->F); + auto mappedX = Eigen::Map(x.data(), K, this->F); Float fx = 
-static_cast(this)->getNegativeLambdaLL(x, g); - std::vector>> res; + std::vector> res; const size_t chStride = pool.getNumWorkers() * 8; for (size_t ch = 0; ch < chStride; ++ch) { res.emplace_back(pool.enqueue([&, this](size_t threadId) { auto& ld = localData[threadId]; - thread_local Eigen::Matrix alphas{ K }, tmpK{ K }, terms{ fCont }; - Eigen::Matrix ret = Eigen::Matrix::Zero(KF + 1); + thread_local Vector alphas{ K }, tmpK{ K }, terms{ fCont }; + Vector ret = Vector::Zero(KF + 1); for (size_t docId = ch; docId < this->docs.size(); docId += chStride) { const auto& doc = this->docs[docId]; @@ -151,7 +158,7 @@ namespace tomoto // positive fx is an error from limited precision of float. if (fx > 0) return INFINITY; return -fx; - } + }*/ void getTermsFromMd(const Float* vx, Float* out, bool normalize = false) const { @@ -213,14 +220,14 @@ namespace tomoto return ret; } - template + /*template Float* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const { const size_t V = this->realV; assert(vid < V); auto etaHelper = this->template getEtaHelper<_asymEta>(); auto& zLikelihood = ld.zLikelihood; - thread_local Eigen::Matrix terms{ fCont }; + thread_local Vector terms{ fCont }; size_t xOffset = doc.metadata * fCont; getTermsFromMd(&doc.metadataNormalized[0], terms.data()); zLikelihood = (doc.numByTopic.array().template cast() + (this->lambda.middleCols(xOffset, fCont) * terms).array().exp() + this->alphaEps) @@ -229,19 +236,19 @@ namespace tomoto sample::prefixSum(zLikelihood.data(), this->K); return &zLikelihood[0]; - } + }*/ - template + /*template double getLLDocs(_DocIter _first, _DocIter _last) const { const auto K = this->K; double ll = 0; - Eigen::Matrix alphas(K); + Vector alphas(K); for (; _first != _last; ++_first) { auto& doc = *_first; - thread_local Eigen::Matrix terms{ fCont }; + thread_local Vector terms{ fCont }; getTermsFromMd(&doc.metadataNormalized[0], terms.data()); size_t xOffset = doc.metadata * fCont; for (Tid k = 
0; k < K; ++k) @@ -257,7 +264,7 @@ namespace tomoto ll -= math::lgammaT(doc.getSumWordWeight() + alphaSum) - math::lgammaT(alphaSum); } return ll; - } + }*/ double getLLRest(const _ModelState& ld) const { @@ -322,8 +329,23 @@ namespace tomoto void prepareDoc(_DocType& doc, size_t docId, size_t wordSize) const { - BaseClass::prepareDoc(doc, docId, wordSize); + BaseClass::BaseClass::prepareDoc(doc, docId, wordSize); doc.metadataNormalized = normalizeMetadata(doc.metadataOrg); + + doc.mdVec = Vector::Zero(this->mdVecSize); + getTermsFromMd(doc.metadataNormalized.data(), doc.mdVec.data()); + for (auto x : doc.multiMetadata) + { + doc.mdVec[fCont + x] = 1; + } + + auto p = std::make_pair(doc.metadata, doc.mdVec); + auto it = this->mdHashMap.find(p); + if (it == this->mdHashMap.end()) + { + it = this->mdHashMap.emplace(p, this->mdHashMap.size()).first; + } + doc.mdHash = it->second; } void initGlobalState(bool initDocs) @@ -334,9 +356,17 @@ namespace tomoto { this->metadataDict.add(""); } - fCat = this->metadataDict.size(); - this->F = fCont * fCat; - if (initDocs) collectMinMaxMetadata(); + this->F = this->metadataDict.size(); + this->mdVecSize = fCont + this->multiMetadataDict.size(); + if (initDocs) + { + collectMinMaxMetadata(); + this->lambda = Matrix::Zero(this->K, this->F * this->mdVecSize); + for (size_t i = 0; i < this->F; ++i) + { + this->lambda.col(this->mdVecSize * i) = log(this->alphas.array()); + } + } else { // Old binary file has metadataNormalized values into `metadataOrg` @@ -352,16 +382,27 @@ namespace tomoto } } } - } - - if (initDocs) - { - this->lambda = Eigen::Matrix::Zero(this->K, this->F); - for (size_t i = 0; i < fCat; ++i) + + for (auto& doc : this->docs) { - this->lambda.col(fCont * i) = log(this->alphas.array()); + if (doc.mdVec.size() == this->mdVecSize) continue; + doc.mdVec = Vector::Zero(this->mdVecSize); + getTermsFromMd(doc.metadataNormalized.data(), doc.mdVec.data()); + for (auto x : doc.multiMetadata) + { + doc.mdVec[fCont + x] = 1; + 
} + + auto p = std::make_pair(doc.metadata, doc.mdVec); + auto it = this->mdHashMap.find(p); + if (it == this->mdHashMap.end()) + { + it = this->mdHashMap.emplace(p, this->mdHashMap.size()).first; + } + doc.mdHash = it->second; } } + orderDecayCached = calcOrderDecay(); LBFGSpp::LBFGSParam param; param.max_iterations = this->maxBFGSIteration; @@ -388,21 +429,33 @@ namespace tomoto } template - _DocType& _updateDoc(_DocType& doc, const std::vector& metadata, const std::string& metadataCat = {}) + _DocType& _updateDoc(_DocType& doc, const std::vector& metadata, const std::string& metadataCat = {}, const std::vector& mdVec = {}) { if (metadata.size() != degreeByF.size()) - throw std::invalid_argument{ "a length of `metadata` should be equal to a length of `degrees`" }; + throw exc::InvalidArgument{ "a length of `metadata` should be equal to a length of `degrees`" }; doc.metadataOrg = metadata; Vid xid; if (_const) { xid = this->metadataDict.toWid(metadataCat); - if (xid == (Vid)-1) throw std::invalid_argument("unknown metadata"); + if (xid == non_vocab_id) throw exc::InvalidArgument("unknown metadata '" + metadataCat + "'"); + + for (auto& m : mdVec) + { + Vid x = this->multiMetadataDict.toWid(m); + if (x == non_vocab_id) throw exc::InvalidArgument("unknown multi_metadata '" + m + "'"); + doc.multiMetadata.emplace_back(x); + } } else { xid = this->metadataDict.add(metadataCat); + + for (auto& m : mdVec) + { + doc.multiMetadata.emplace_back(this->multiMetadataDict.add(m)); + } } doc.metadata = xid; return doc; @@ -413,16 +466,18 @@ namespace tomoto auto doc = this->template _makeFromRawDoc(rawDoc, tokenizer); return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc>("numeric_metadata"), - rawDoc.template getMiscDefault("metadata") + rawDoc.template getMiscDefault("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") )); } std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override { auto doc = 
as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMisc>("numeric_metadata"), - rawDoc.template getMiscDefault("metadata") + rawDoc.template getMiscDefault("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") )); } @@ -431,49 +486,37 @@ namespace tomoto auto doc = this->_makeFromRawDoc(rawDoc); return this->_addDoc(_updateDoc(doc, rawDoc.template getMisc>("numeric_metadata"), - rawDoc.template getMiscDefault("metadata") + rawDoc.template getMiscDefault("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") )); } std::unique_ptr makeDoc(const RawDoc& rawDoc) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMisc>("numeric_metadata"), - rawDoc.template getMiscDefault("metadata") + rawDoc.template getMiscDefault("metadata"), + rawDoc.template getMiscDefault>("multi_metadata") )); } - std::vector getTopicsByDoc(const _DocType& doc, bool normalize) const + std::vector getTDF(const Float* metadata, const std::string& metadataCat, const std::vector& multiMetadataCat, bool normalize) const override { - Eigen::Array alphas(this->K); - thread_local Eigen::Matrix terms{ fCont }; - getTermsFromMd(&doc.metadataNormalized[0], terms.data()); - for (Tid k = 0; k < this->K; ++k) - { - alphas[k] = exp(this->lambda.row(k) * terms) + this->alphaEps; - } - std::vector ret(this->K); - Eigen::Map> m{ ret.data(), this->K }; - Float sum = doc.getSumWordWeight() + alphas.sum(); - if (normalize) - { - m = (doc.numByTopic.array().template cast() + alphas) / (doc.getSumWordWeight() + alphas.sum()); - } - else + Vector terms = Vector::Zero(this->mdVecSize); + getTermsFromMd(metadata, 
terms.data(), true); + for (auto& s : multiMetadataCat) { - m = doc.numByTopic.array().template cast() + alphas; + Vid x = this->multiMetadataDict.toWid(s); + if (x == non_vocab_id) throw exc::InvalidArgument("unknown multi_metadata " + text::quote(s)); + terms[fCont + x] = 1; } - return ret; - } + Vid x = this->metadataDict.toWid(metadataCat); + if (x == non_vocab_id) throw exc::InvalidArgument("unknown metadata " + text::quote(metadataCat)); - std::vector getTDF(const Float* metadata, size_t metadataCat, bool normalize) const override - { - Eigen::Matrix terms{ fCont }; - getTermsFromMd(metadata, terms.data(), true); std::vector ret(this->K); Eigen::Map> retMap{ ret.data(), (Eigen::Index)ret.size() }; - retMap = (this->lambda.middleCols(metadataCat * fCont, fCont) * terms).array(); + retMap = (this->lambda.middleCols(x * this->mdVecSize, this->mdVecSize) * terms).array(); if (normalize) { retMap = (retMap - retMap.maxCoeff()).exp(); @@ -482,16 +525,25 @@ namespace tomoto return ret; } - std::vector getTDFBatch(const Float* metadata, size_t metadataCat, size_t stride, size_t cnt, bool normalize) const override + std::vector getTDFBatch(const Float* metadata, const std::string& metadataCat, const std::vector& multiMetadataCat, size_t stride, size_t cnt, bool normalize) const override { - Eigen::Matrix terms{ fCont, (Eigen::Index)cnt }; + Matrix terms = Matrix::Zero(this->mdVecSize, (Eigen::Index)cnt); for (size_t i = 0; i < cnt; ++i) { getTermsFromMd(metadata + stride * i, terms.col(i).data(), true); } + for (auto& s : multiMetadataCat) + { + Vid x = this->multiMetadataDict.toWid(s); + if (x == non_vocab_id) throw exc::InvalidArgument("unknown multi_metadata " + text::quote(s)); + terms.row(fCont + x).setOnes(); + } + Vid x = this->metadataDict.toWid(metadataCat); + if (x == non_vocab_id) throw exc::InvalidArgument("unknown metadata " + text::quote(metadataCat)); + std::vector ret(this->K * cnt); Eigen::Map> retMap{ ret.data(), (Eigen::Index)this->K, 
(Eigen::Index)cnt }; - retMap = (this->lambda.middleCols(metadataCat * fCont, fCont) * terms).array(); + retMap = (this->lambda.middleCols(x * this->mdVecSize, this->mdVecSize) * terms).array(); if (normalize) { retMap.rowwise() -= retMap.colwise().maxCoeff(); @@ -500,6 +552,7 @@ namespace tomoto } return ret; } + void setMdRange(const std::vector& vMin, const std::vector& vMax) override { mdIntercepts = vMin; diff --git a/src/TopicModel/HDPModel.hpp b/src/TopicModel/HDPModel.hpp index 0be42f7..0fcdce5 100644 --- a/src/TopicModel/HDPModel.hpp +++ b/src/TopicModel/HDPModel.hpp @@ -14,7 +14,7 @@ namespace tomoto template struct ModelStateHDP : public ModelStateLDA<_tw> { - Eigen::Matrix tableLikelihood, topicLikelihood; + Vector tableLikelihood, topicLikelihood; Eigen::Matrix numTableByTopic; size_t totalTable = 0; @@ -397,9 +397,10 @@ namespace tomoto void prepareDoc(_DocType& doc, size_t docId, size_t wordSize) const { + sortAndWriteOrder(doc.words, doc.wOrder); doc.numByTopic.init(nullptr, this->K, 1); doc.numTopicByTable.clear(); - doc.Zs = tvector(wordSize); + doc.Zs = tvector(wordSize, non_topic_id); if (_tw != TermWeight::one) doc.wordWeights.resize(wordSize); } @@ -529,7 +530,7 @@ namespace tomoto args.k = liveK; args.alpha[0] = 0.1f; args.eta = this->eta; - auto lda = make_unique>(args); + auto lda = std::make_unique>(args); lda->dict = this->dict; for (auto& doc : this->docs) @@ -552,6 +553,11 @@ namespace tomoto { for (size_t j = 0; j < this->docs[i].Zs.size(); ++j) { + if (this->docs[i].Zs[j] == non_topic_id) + { + lda->docs[i].Zs[j] = non_topic_id; + continue; + } size_t newTopic = newK[this->docs[i].numTopicByTable[this->docs[i].Zs[j]].topic]; while (newTopic == (Tid)-1) newTopic = newK[randomTopic(rng)]; lda->docs[i].Zs[j] = newTopic; diff --git a/src/TopicModel/HLDAModel.hpp b/src/TopicModel/HLDAModel.hpp index 4e3301f..37ccbe5 100644 --- a/src/TopicModel/HLDAModel.hpp +++ b/src/TopicModel/HLDAModel.hpp @@ -114,8 +114,8 @@ namespace tomoto static 
constexpr size_t blockSize = 8; std::vector nodes; std::vector levelBlocks; - Eigen::Matrix nodeLikelihoods; // - Eigen::Matrix nodeWLikelihoods; // + Vector nodeLikelihoods; // + Vector nodeWLikelihoods; // DEFINE_SERIALIZER(nodes, levelBlocks); @@ -351,6 +351,8 @@ namespace tomoto template void samplePathes(_DocType& doc, ThreadPool* pool, _ModelState& ld, _RandGen& rgs) const { + if (!doc.getSumWordWeight()) return; + if(_gs != GlobalSampler::inference) ld.nt->nodes[doc.path.back()].dropPathOne(); ld.nt->template calcNodeLikelihood<_gs == GlobalSampler::train>(gamma, this->K); @@ -516,7 +518,7 @@ namespace tomoto { sortAndWriteOrder(doc.words, doc.wOrder); doc.numByTopic.init(nullptr, this->K, 1); - doc.Zs = tvector(wordSize); + doc.Zs = tvector(wordSize, non_topic_id); doc.path.resize(this->K); for (size_t l = 0; l < this->K; ++l) doc.path[l] = l; diff --git a/src/TopicModel/HPAModel.hpp b/src/TopicModel/HPAModel.hpp index 164d942..9087ffe 100644 --- a/src/TopicModel/HPAModel.hpp +++ b/src/TopicModel/HPAModel.hpp @@ -16,7 +16,7 @@ namespace tomoto std::array, 3> numByTopicWord; std::array, 3> numByTopic; - std::array, 2> subTmp; + std::array subTmp; Eigen::Matrix numByTopic1_2; @@ -45,10 +45,10 @@ namespace tomoto Float epsilon = 0.00001; size_t iteration = 5; - //Eigen::Matrix alphas; // len = (K + 1) + //Vector alphas; // len = (K + 1) - Eigen::Matrix subAlphaSum; // len = K - Eigen::Matrix subAlphas; // len = K * (K2 + 1) + Vector subAlphaSum; // len = K + Matrix subAlphas; // len = K * (K2 + 1) void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { @@ -379,7 +379,7 @@ namespace tomoto void initGlobalState(bool initDocs) { const size_t V = this->realV; - this->globalState.zLikelihood = Eigen::Matrix::Zero(1 + this->K + this->K * K2); + this->globalState.zLikelihood = Vector::Zero(1 + this->K + this->K * K2); if (initDocs) { this->globalState.numByTopic1_2 = Eigen::Matrix::Zero(this->K, K2 + 1); @@ -447,11 +447,11 @@ namespace 
tomoto if (args.alpha.size() == 1) { - this->alphas = Eigen::Matrix::Constant(args.k + 1, args.alpha[0]); + this->alphas = Vector::Constant(args.k + 1, args.alpha[0]); } else if (args.alpha.size() == args.k + 1) { - this->alphas = Eigen::Map>(args.alpha.data(), (Eigen::Index)args.alpha.size()); + this->alphas = Eigen::Map(args.alpha.data(), (Eigen::Index)args.alpha.size()); } else { @@ -460,7 +460,7 @@ namespace tomoto if (args.subalpha.size() == 1) { - subAlphas = Eigen::Matrix::Constant(args.k, args.k2 + 1, args.subalpha[0]); + subAlphas = Matrix::Constant(args.k, args.k2 + 1, args.subalpha[0]); } else if (args.subalpha.size() == args.k2 + 1) { @@ -479,7 +479,7 @@ namespace tomoto void setDirichletEstIteration(size_t iter) override { - if (!iter) throw std::invalid_argument("iter must > 0"); + if (!iter) throw exc::InvalidArgument("iter must > 0"); iteration = iter; } diff --git a/src/TopicModel/LDACVB0Model.hpp b/src/TopicModel/LDACVB0Model.hpp index 3896db8..03f5ebc 100644 --- a/src/TopicModel/LDACVB0Model.hpp +++ b/src/TopicModel/LDACVB0Model.hpp @@ -85,7 +85,7 @@ namespace tomoto static constexpr static constexpr char TMID[] = "LDA\0"; Float alpha; - Eigen::Matrix alphas; + Vector alphas; Float eta; Tid K; size_t optimInterval = 50; @@ -93,7 +93,7 @@ namespace tomoto template static Float calcDigammaSum(_List list, size_t len, Float alpha) { - auto listExpr = Eigen::Matrix::NullaryExpr(len, list); + auto listExpr = Vector::NullaryExpr(len, list); auto dAlpha = math::digammaT(alpha); return (math::digammaApprox(listExpr.array() + alpha) - dAlpha).sum(); } @@ -265,11 +265,11 @@ namespace tomoto void initGlobalState(bool initDocs) { const size_t V = this->realV; - this->globalState.zLikelihood = Eigen::Matrix::Zero(K); + this->globalState.zLikelihood = Vector::Zero(K); if (initDocs) { - this->globalState.numByTopic = Eigen::Matrix::Zero(K); - this->globalState.numByTopicWord = Eigen::Matrix::Zero(K, V); + this->globalState.numByTopic = Vector::Zero(K); + 
this->globalState.numByTopicWord = Matrix::Zero(K, V); } } @@ -335,7 +335,7 @@ namespace tomoto LDACVB0Model(size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, size_t _rg = std::random_device{}()) : BaseClass(_rg), K(_K), alpha(_alpha), eta(_eta) { - alphas = Eigen::Matrix::Constant(K, alpha); + alphas = Vector::Constant(K, alpha); } GETTER(K, size_t, K); GETTER(Alpha, Float, alpha); @@ -355,7 +355,7 @@ namespace tomoto std::unique_ptr makeDoc(const std::vector& words) const override { - return make_unique<_DocType>(as_mutable(this)->template _makeDoc(words)); + return std::make_unique<_DocType>(as_mutable(this)->template _makeDoc(words)); } void updateDocs() diff --git a/src/TopicModel/LDAModel.hpp b/src/TopicModel/LDAModel.hpp index 1ebdb7d..8f13958 100644 --- a/src/TopicModel/LDAModel.hpp +++ b/src/TopicModel/LDAModel.hpp @@ -56,7 +56,7 @@ namespace tomoto { using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type; - Eigen::Matrix zLikelihood; + Vector zLikelihood; Eigen::Matrix numByTopic; // Dim: (Topic, 1) //Eigen::Matrix numByTopicWord; // Dim: (Topic, Vocabs) ShareableMatrix numByTopicWord; // Dim: (Topic, Vocabs) @@ -179,10 +179,10 @@ namespace tomoto std::vector sharedWordWeights; Tid K; Float alpha, eta; - Eigen::Matrix alphas; + Vector alphas; std::unordered_map> etaByWord; - Eigen::Matrix etaByTopicWord; // (K, V) - Eigen::Matrix etaSumByTopic; // (K, ) + Matrix etaByTopicWord; // (K, V) + Vector etaSumByTopic; // (K, ) uint32_t optimInterval = 10, burnIn = 0; Eigen::Matrix numByTopicDoc; @@ -197,7 +197,7 @@ namespace tomoto template static Float calcDigammaSum(ThreadPool* pool, _List list, size_t len, Float alpha) { - auto listExpr = Eigen::Matrix::NullaryExpr(len, list); + auto listExpr = Vector::NullaryExpr(len, list); auto dAlpha = math::digammaT(alpha); size_t suggested = (len + 127) / 128; @@ -663,6 +663,22 @@ namespace tomoto makeTransformIter(this->docs.end(), txWeights)); } } + + void updateForCopy() + { + 
BaseClass::updateForCopy(); + size_t offset = 0; + for (auto& doc : this->docs) + { + size_t size = doc.Zs.size(); + doc.Zs = tvector{ sharedZs.data() + offset, size }; + if (_tw != TermWeight::one) + { + doc.wordWeights = tvector{ sharedWordWeights.data() + offset, size }; + } + offset += size; + } + } WeightType* getTopicDocPtr(size_t docId) const { @@ -670,11 +686,14 @@ namespace tomoto return (WeightType*)numByTopicDoc.col(docId).data(); } + /* + * called only when initializing a new doc, not when loading from saved model + */ void prepareDoc(_DocType& doc, size_t docId, size_t wordSize) const { sortAndWriteOrder(doc.words, doc.wOrder); doc.numByTopic.init(getTopicDocPtr(docId), K, 1); - doc.Zs = tvector(wordSize); + doc.Zs = tvector(wordSize, non_topic_id); if(_tw != TermWeight::one) doc.wordWeights.resize(wordSize); } @@ -688,7 +707,7 @@ namespace tomoto { auto id = this->dict.toWid(it.first); if (id == (Vid)-1 || id >= this->realV) continue; - etaByTopicWord.col(id) = Eigen::Map>{ it.second.data(), (Eigen::Index)it.second.size() }; + etaByTopicWord.col(id) = Eigen::Map{ it.second.data(), (Eigen::Index)it.second.size() }; } etaSumByTopic = etaByTopicWord.rowwise().sum(); } @@ -696,7 +715,7 @@ namespace tomoto void initGlobalState(bool initDocs) { const size_t V = this->realV; - this->globalState.zLikelihood = Eigen::Matrix::Zero(K); + this->globalState.zLikelihood = Vector::Zero(K); if (initDocs) { this->globalState.numByTopic = Eigen::Matrix::Zero(K); @@ -797,7 +816,7 @@ namespace tomoto return ret; } - template + template std::vector _infer(_Iter docFirst, _Iter docLast, size_t maxIter, Float tolerance, size_t numWorkers) const { decltype(static_cast(this)->makeGeneratorForInit(nullptr)) generator; @@ -806,7 +825,7 @@ namespace tomoto generator = static_cast(this)->makeGeneratorForInit(nullptr); } - if (_Together) + if (together) { numWorkers = std::min(numWorkers, this->maxThreads[(size_t)_ps]); ThreadPool pool{ numWorkers }; @@ -923,11 +942,11 @@ 
namespace tomoto if (args.alpha.size() == 1) { - alphas = Eigen::Matrix::Constant(K, alpha); + alphas = Vector::Constant(K, alpha); } else if (args.alpha.size() == args.k) { - alphas = Eigen::Map>(args.alpha.data(), args.alpha.size()); + alphas = Eigen::Map(args.alpha.data(), args.alpha.size()); } else if (checkAlpha) { @@ -968,7 +987,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override { - return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer)); + return std::make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer)); } size_t addDoc(const RawDoc& rawDoc) override @@ -978,7 +997,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc) const override { - return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc)); + return std::make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc)); } void setWordPrior(const std::string& word, const std::vector& priors) override diff --git a/src/TopicModel/LLDAModel.hpp b/src/TopicModel/LLDAModel.hpp index 42e2f04..1ad98d2 100644 --- a/src/TopicModel/LLDAModel.hpp +++ b/src/TopicModel/LLDAModel.hpp @@ -156,7 +156,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("labels"))); + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("labels"))); } size_t addDoc(const RawDoc& rawDoc) override @@ -168,7 +168,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template 
getMiscDefault>("labels"))); + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("labels"))); } std::vector getTopicsByDoc(const _DocType& doc, bool normalize) const diff --git a/src/TopicModel/MGLDAModel.hpp b/src/TopicModel/MGLDAModel.hpp index d911704..a28f104 100644 --- a/src/TopicModel/MGLDAModel.hpp +++ b/src/TopicModel/MGLDAModel.hpp @@ -289,7 +289,7 @@ namespace tomoto const size_t S = doc.numBySent.size(); std::fill(doc.numBySent.begin(), doc.numBySent.end(), 0); - doc.Zs = tvector(wordSize); + doc.Zs = tvector(wordSize, non_topic_id); doc.Vs.resize(wordSize); if (_tw != TermWeight::one) doc.wordWeights.resize(wordSize); doc.numByTopic.init(nullptr, this->K + KL, 1); @@ -302,7 +302,7 @@ namespace tomoto void initGlobalState(bool initDocs) { const size_t V = this->realV; - this->globalState.zLikelihood = Eigen::Matrix::Zero(T * (this->K + KL)); + this->globalState.zLikelihood = Vector::Zero(T * (this->K + KL)); if (initDocs) { this->globalState.numByTopic = Eigen::Matrix::Zero(this->K + KL); @@ -442,7 +442,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const { - return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer, rawDoc.template getMisc("delimiter"))); + return std::make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer, rawDoc.template getMisc("delimiter"))); } template @@ -513,7 +513,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc) const { - return make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc)); + return std::make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc(rawDoc)); } void setWordPrior(const std::string& word, const std::vector& priors) override diff --git a/src/TopicModel/PAModel.hpp b/src/TopicModel/PAModel.hpp index d8e0704..a4c186e 100644 --- a/src/TopicModel/PAModel.hpp +++ b/src/TopicModel/PAModel.hpp 
@@ -16,7 +16,7 @@ namespace tomoto using WeightType = typename ModelStateLDA<_tw>::WeightType; Eigen::Matrix numByTopic1_2; Eigen::Matrix numByTopic2; - Eigen::Matrix subTmp; + Vector subTmp; DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2); }; @@ -41,8 +41,8 @@ namespace tomoto Float epsilon = 1e-5; size_t iteration = 5; - Eigen::Matrix subAlphaSum; // len = K - Eigen::Matrix subAlphas; // len = K * K2 + Vector subAlphaSum; // len = K + Matrix subAlphas; // len = K * K2 void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { const auto K = this->K; @@ -286,7 +286,7 @@ namespace tomoto BaseClass::prepareDoc(doc, docId, wordSize); doc.numByTopic1_2 = Eigen::Matrix::Zero(this->K, K2); - doc.Z2s = tvector(wordSize); + doc.Z2s = tvector(wordSize, non_topic_id); } void prepareWordPriors() @@ -299,7 +299,7 @@ namespace tomoto { auto id = this->dict.toWid(it.first); if (id == (Vid)-1 || id >= this->realV) continue; - this->etaByTopicWord.col(id) = Eigen::Map>{ it.second.data(), (Eigen::Index)it.second.size() }; + this->etaByTopicWord.col(id) = Eigen::Map{ it.second.data(), (Eigen::Index)it.second.size() }; } this->etaSumByTopic = this->etaByTopicWord.rowwise().sum(); } @@ -307,7 +307,7 @@ namespace tomoto void initGlobalState(bool initDocs) { const size_t V = this->realV; - this->globalState.zLikelihood = Eigen::Matrix::Zero(this->K * K2); + this->globalState.zLikelihood = Vector::Zero(this->K * K2); if (initDocs) { this->globalState.numByTopic = Eigen::Matrix::Zero(this->K); @@ -372,7 +372,7 @@ namespace tomoto if (args.subalpha.size() == 1) { - subAlphas = Eigen::Matrix::Constant(args.k, args.k2, args.subalpha[0]); + subAlphas = Matrix::Constant(args.k, args.k2, args.subalpha[0]); } else if(args.subalpha.size() == args.k2) { @@ -391,7 +391,7 @@ namespace tomoto void setDirichletEstIteration(size_t iter) override { - if (!iter) throw std::invalid_argument("iter must > 0"); + if (!iter) throw 
exc::InvalidArgument("iter must > 0"); iteration = iter; } diff --git a/src/TopicModel/PLDAModel.hpp b/src/TopicModel/PLDAModel.hpp index c48a1e4..bb37362 100644 --- a/src/TopicModel/PLDAModel.hpp +++ b/src/TopicModel/PLDAModel.hpp @@ -163,7 +163,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("labels"))); + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("labels"))); } size_t addDoc(const RawDoc& rawDoc) override @@ -175,7 +175,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("labels"))); + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("labels"))); } std::vector getTopicsByDoc(const _DocType& doc, bool normalize) const diff --git a/src/TopicModel/PTModel.hpp b/src/TopicModel/PTModel.hpp index 4d9c312..b7396f9 100644 --- a/src/TopicModel/PTModel.hpp +++ b/src/TopicModel/PTModel.hpp @@ -158,13 +158,13 @@ namespace tomoto { sortAndWriteOrder(doc.words, doc.wOrder); doc.numByTopic.init((WeightType*)this->globalState.numByTopicPDoc.col(0).data(), this->K, 1); - doc.Zs = tvector(wordSize); + doc.Zs = tvector(wordSize, non_topic_id); if (_tw != TermWeight::one) doc.wordWeights.resize(wordSize); } void initGlobalState(bool initDocs) { - this->globalState.pLikelihood = Eigen::Matrix::Zero(numPDocs); + this->globalState.pLikelihood = Vector::Zero(numPDocs); this->globalState.numDocsByPDoc = Eigen::ArrayXi::Zero(numPDocs); this->globalState.numByTopicPDoc = Eigen::Matrix::Zero(this->K, numPDocs); 
BaseClass::initGlobalState(initDocs); diff --git a/src/TopicModel/SLDAModel.hpp b/src/TopicModel/SLDAModel.hpp index 791f8c5..90dc6c4 100644 --- a/src/TopicModel/SLDAModel.hpp +++ b/src/TopicModel/SLDAModel.hpp @@ -16,22 +16,24 @@ namespace tomoto template struct GLMFunctor { - Eigen::Matrix regressionCoef; // Dim : (K) + Vector regressionCoef; // Dim : (K) - GLMFunctor(size_t K = 0, Float mu = 0) : regressionCoef(Eigen::Matrix::Constant(K, mu)) + GLMFunctor(size_t K = 0, Float mu = 0) : regressionCoef(Vector::Constant(K, mu)) { } virtual ISLDAModel::GLM getType() const = 0; + virtual std::unique_ptr copy() const = 0; + virtual void updateZLL( - Eigen::Matrix& zLikelihood, + Vector& zLikelihood, Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, size_t docId, Float docSize) const = 0; virtual void optimizeCoef( - const Eigen::Matrix& normZ, + const Matrix& normZ, Float mu, Float nuSq, - Eigen::Block, -1, 1, true> ys + Eigen::Block ys ) = 0; virtual double getLL(Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, @@ -69,8 +71,13 @@ namespace tomoto ISLDAModel::GLM getType() const override { return ISLDAModel::GLM::linear; } + std::unique_ptr> copy() const override + { + return std::make_unique(*this); + } + void updateZLL( - Eigen::Matrix& zLikelihood, + Vector& zLikelihood, Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, size_t docId, Float docSize) const override { Float yErr = y - @@ -81,14 +88,14 @@ namespace tomoto } void optimizeCoef( - const Eigen::Matrix& normZ, + const Matrix& normZ, Float mu, Float nuSq, - Eigen::Block, -1, 1, true> ys + Eigen::Block ys ) override { - Eigen::Matrix selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast(); - Eigen::Matrix normZZT = selectedNormZ * selectedNormZ.transpose(); - normZZT += Eigen::Matrix::Identity(normZZT.cols(), normZZT.cols()) / nuSq; + Matrix selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast(); + 
Matrix normZZT = selectedNormZ * selectedNormZ.transpose(); + normZZT += Matrix::Identity(normZZT.cols(), normZZT.cols()) / nuSq; this->regressionCoef = normZZT.colPivHouseholderQr().solve(selectedNormZ * ys.array().isNaN().select(0, ys).matrix()); } @@ -113,17 +120,22 @@ namespace tomoto struct BinaryLogisticFunctor : public GLMFunctor<_WeightType> { Float b = 1; - Eigen::Matrix omega; + Vector omega; BinaryLogisticFunctor(size_t K = 0, Float mu = 0, Float _b = 1, size_t numDocs = 0) - : GLMFunctor<_WeightType>(K, mu), b(_b), omega{ Eigen::Matrix::Ones(numDocs) } + : GLMFunctor<_WeightType>(K, mu), b(_b), omega{ Vector::Ones(numDocs) } { } ISLDAModel::GLM getType() const override { return ISLDAModel::GLM::binary_logistic; } + std::unique_ptr> copy() const override + { + return std::make_unique(*this); + } + void updateZLL( - Eigen::Matrix& zLikelihood, + Vector& zLikelihood, Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, size_t docId, Float docSize) const override { Float yErr = b * (y - 0.5f) - @@ -134,18 +146,18 @@ namespace tomoto } void optimizeCoef( - const Eigen::Matrix& normZ, + const Matrix& normZ, Float mu, Float nuSq, - Eigen::Block, -1, 1, true> ys + Eigen::Block ys ) override { - Eigen::Matrix selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast(); - Eigen::Matrix normZZT = selectedNormZ * Eigen::DiagonalMatrix{ omega } * selectedNormZ.transpose(); - normZZT += Eigen::Matrix::Identity(normZZT.cols(), normZZT.cols()) / nuSq; + Matrix selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast(); + Matrix normZZT = selectedNormZ * Eigen::DiagonalMatrix{ omega } * selectedNormZ.transpose(); + normZZT += Matrix::Identity(normZZT.cols(), normZZT.cols()) / nuSq; this->regressionCoef = normZZT .colPivHouseholderQr().solve(selectedNormZ * ys.array().isNaN().select(0, b * (ys.array() - 0.5f)).matrix() - + Eigen::Matrix::Constant(selectedNormZ.rows(), mu / nuSq)); + + 
Vector::Constant(selectedNormZ.rows(), mu / nuSq)); RandGen rng; for (size_t i = 0; i < (size_t)omega.size(); ++i) @@ -173,8 +185,20 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE(GLMFunctor<_WeightType>, b, omega); }; + + struct CopyGLMFunctor + { + template + std::vector>> operator()(const std::vector>>& o) + { + std::vector>> ret; + for (auto& p : o) ret.emplace_back(p->copy()); + return ret; + } + }; } + template varTypes; std::vector glmParam; - Eigen::Matrix mu; // Mean of regression coefficients, Dim : (F) - Eigen::Matrix nuSq; // Variance of regression coefficients, Dim : (F) + Vector mu; // Mean of regression coefficients, Dim : (F) + Vector nuSq; // Variance of regression coefficients, Dim : (F) - std::vector>> responseVars; - Eigen::Matrix normZ; // topic proportions for all docs, Dim : (K, D) - Eigen::Matrix Ys; // response variables, Dim : (D, F) + DelegateCopy>>, detail::CopyGLMFunctor> responseVars; + Matrix normZ; // topic proportions for all docs, Dim : (K, D) + Matrix Ys; // response variables, Dim : (D, F) template Float* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const @@ -299,11 +323,11 @@ namespace tomoto switch (varTypes[f]) { case ISLDAModel::GLM::linear: - v = make_unique>(this->K, mu[f], + v = std::make_unique>(this->K, mu[f], f < glmParam.size() ? glmParam[f] : 1.f); break; case ISLDAModel::GLM::binary_logistic: - v = make_unique>(this->K, mu[f], + v = std::make_unique>(this->K, mu[f], f < glmParam.size() ? 
glmParam[f] : 1.f, this->docs.size()); break; } @@ -333,15 +357,15 @@ namespace tomoto if (args.mu.size() == 0) { - mu = Eigen::Matrix::Zero(F); + mu = Vector::Zero(F); } else if (args.mu.size() == 1) { - mu = Eigen::Matrix::Constant(F, args.mu[0]); + mu = Vector::Constant(F, args.mu[0]); } else if (args.mu.size() == F) { - mu = Eigen::Map>(args.mu.data(), args.mu.size()); + mu = Eigen::Map(args.mu.data(), args.mu.size()); } else { @@ -350,15 +374,15 @@ namespace tomoto if (args.nuSq.size() == 0) { - nuSq = Eigen::Matrix::Ones(F); + nuSq = Vector::Ones(F); } else if (args.mu.size() == 1) { - nuSq = Eigen::Matrix::Constant(F, args.nuSq[0]); + nuSq = Vector::Constant(F, args.nuSq[0]); } else if (args.mu.size() == F) { - nuSq = Eigen::Map>(args.nuSq.data(), args.nuSq.size()); + nuSq = Eigen::Map(args.nuSq.data(), args.nuSq.size()); } else { @@ -411,7 +435,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc, tokenizer); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("y"))); + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("y"))); } size_t addDoc(const RawDoc& rawDoc) override @@ -423,7 +447,7 @@ namespace tomoto std::unique_ptr makeDoc(const RawDoc& rawDoc) const override { auto doc = as_mutable(this)->template _makeFromRawDoc(rawDoc); - return make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("y"))); + return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc(doc, rawDoc.template getMiscDefault>("y"))); } std::vector estimateVars(const DocumentBase* doc) const override @@ -450,10 +474,10 @@ namespace tomoto switch ((ISLDAModel::GLM)(t - 1)) { case ISLDAModel::GLM::linear: - p = make_unique>(); + p = std::make_unique>(); break; case 
ISLDAModel::GLM::binary_logistic: - p = make_unique>(); + p = std::make_unique>(); break; default: throw std::ios_base::failure(text::format("wrong GLMFunctor type id %d", (t - 1))); diff --git a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index e5ae0e6..0e01c81 100644 --- a/src/TopicModel/TopicModel.hpp +++ b/src/TopicModel/TopicModel.hpp @@ -16,6 +16,9 @@ namespace tomoto using RandGen = Eigen::Rand::P8_mt19937_64; using ScalarRandGen = Eigen::Rand::UniversalRandomEngine; + using Vector = Eigen::Matrix; + using Matrix = Eigen::Matrix; + struct RawDocKernel { Float weight = 1; @@ -59,8 +62,8 @@ namespace tomoto const _Ty& getMisc(const std::string& name) const { auto it = misc.find(name); - if (it == misc.end()) throw std::invalid_argument{ "There is no value named `" + name + "` in misc data" }; - if (!it->second.template is<_Ty>()) throw std::invalid_argument{ "Value named `" + name + "` is not in right type." }; + if (it == misc.end()) throw exc::InvalidArgument{ "There is no value named `" + name + "` in misc data" }; + if (!it->second.template is<_Ty>()) throw exc::InvalidArgument{ "Value named `" + name + "` is not in right type." }; return it->second.template get<_Ty>(); } @@ -69,7 +72,7 @@ namespace tomoto { auto it = misc.find(name); if (it == misc.end()) return {}; - if (!it->second.template is<_Ty>()) throw std::invalid_argument{ "Value named `" + name + "` is not in right type." }; + if (!it->second.template is<_Ty>()) throw exc::InvalidArgument{ "Value named `" + name + "` is not in right type." 
}; return it->second.template get<_Ty>(); } }; @@ -220,6 +223,9 @@ namespace tomoto const std::vector* extra_data = nullptr) const = 0; virtual void loadModel(std::istream& reader, std::vector* extra_data = nullptr) = 0; + + virtual std::unique_ptr copy() const = 0; + virtual const DocumentBase* getDoc(size_t docId) const = 0; virtual size_t getDocIdByUid(const std::string& docUid) const = 0; @@ -316,7 +322,7 @@ namespace tomoto size_t maxThreads[(size_t)ParallelScheme::size] = { 0, }; size_t minWordCf = 0, minWordDf = 0, removeTopN = 0; - std::unique_ptr cachedPool; + PreventCopy> cachedPool; void _saveModel(std::ostream& writer, bool fullModel, const std::vector* extra_data) const { @@ -423,7 +429,7 @@ namespace tomoto } else { - throw std::invalid_argument{ "Either `words` or `rawWords` must be filled." }; + throw exc::InvalidArgument{ "Either `words` or `rawWords` must be filled." }; } return doc; } @@ -469,7 +475,19 @@ namespace tomoto auto tx = [](_DocType& doc) { return &doc.words; }; tvector::trade(words, makeTransformIter(docs.begin(), tx), - makeTransformIter(docs.end(), tx)); + makeTransformIter(docs.end(), tx) + ); + } + + void updateForCopy() + { + size_t offset = 0; + for (auto& doc : docs) + { + size_t size = doc.words.size(); + doc.words = tvector{ words.data() + offset, size }; + offset += size; + } } size_t countRealN() const @@ -547,6 +565,15 @@ namespace tomoto { } + TopicModel(const TopicModel&) = default; + + std::unique_ptr copy() const override + { + auto ret = std::make_unique<_Derived>(*static_cast(this)); + ret->updateForCopy(); + return ret; + } + size_t getNumDocs() const override { return docs.size(); @@ -605,7 +632,7 @@ namespace tomoto if (numWorkers == 1 || (_Flags & flags::shared_state)) ps = ParallelScheme::none; if (!cachedPool || cachedPool->getNumWorkers() != numWorkers) { - cachedPool = make_unique(numWorkers); + cachedPool = std::make_unique(numWorkers); } std::vector<_ModelState> localData; @@ -724,7 +751,7 @@ namespace 
tomoto double getDocLL(const DocumentBase* doc) const override { auto* p = dynamic_cast(doc); - if (!p) throw std::invalid_argument{ "wrong `doc` type." }; + if (!p) throw exc::InvalidArgument{ "wrong `doc` type." }; return static_cast(this)->getLLDocs(p, p + 1); } diff --git a/src/Utils/AliasMethod.hpp b/src/Utils/AliasMethod.hpp index f389cda..b1bcef0 100644 --- a/src/Utils/AliasMethod.hpp +++ b/src/Utils/AliasMethod.hpp @@ -35,8 +35,8 @@ namespace tomoto bitsize = o.bitsize; if (msize) { - arr = make_unique<_Precision[]>(1 << bitsize); - alias = make_unique(1 << bitsize); + arr = std::make_unique<_Precision[]>(1 << bitsize); + alias = std::make_unique(1 << bitsize); std::copy(o.arr.get(), o.arr.get() + (1 << bitsize), arr.get()); std::copy(o.alias.get(), o.alias.get() + (1 << bitsize), alias.get()); @@ -78,15 +78,15 @@ namespace tomoto if (nbsize != bitsize) { - arr = make_unique<_Precision[]>(psize); + arr = std::make_unique<_Precision[]>(psize); std::fill(arr.get(), arr.get() + psize, 0); - alias = make_unique(psize); + alias = std::make_unique(psize); bitsize = nbsize; } sum /= psize; - auto f = make_unique(psize); + auto f = std::make_unique(psize); auto pf = f.get(); for (auto it = first; it != last; ++it, ++pf) { diff --git a/src/Utils/Dictionary.h b/src/Utils/Dictionary.h index 6838e2b..de23a04 100644 --- a/src/Utils/Dictionary.h +++ b/src/Utils/Dictionary.h @@ -13,6 +13,7 @@ namespace tomoto using Vid = uint32_t; static constexpr Vid non_vocab_id = (Vid)-1; using Tid = uint16_t; + static constexpr Vid non_topic_id = (Tid)-1; using Float = float; struct VidPair : public std::pair diff --git a/src/Utils/Utils.hpp b/src/Utils/Utils.hpp index d3da7f2..30c8266 100644 --- a/src/Utils/Utils.hpp +++ b/src/Utils/Utils.hpp @@ -10,25 +10,110 @@ namespace tomoto { - template::value, int>::type = 0> - std::unique_ptr make_unique(Args&&... 
args) - { - return std::unique_ptr(new T(std::forward(args)...)); - } - - template::value, int>::type = 0> - std::unique_ptr make_unique(size_t size) - { - return std::unique_ptr(new typename std::remove_extent::type [size]); - } - template constexpr T * as_mutable(const T * value) noexcept { return const_cast(value); } + template + class PreventCopy : public T + { + public: + template + PreventCopy(Args&&... args) : + T(std::forward(args)...) + { + } + + PreventCopy(const PreventCopy& from) + { + } + + PreventCopy(PreventCopy&& from) : + T(static_cast(from)) + { + } + + PreventCopy& operator=(const T& from) + { + T::operator=(from); + return *this; + } + + PreventCopy& operator=(T&& from) + { + T::operator=(std::move(from)); + return *this; + } + + PreventCopy& operator=(const PreventCopy& from) + { + return *this; + } + + PreventCopy& operator=(PreventCopy&& from) + { + T::operator=(static_cast(from)); + return *this; + } + }; + + template + class DelegateCopy : public T + { + public: + template + DelegateCopy(Args&&... args) : + T(std::forward(args)...) 
+ { + } + + DelegateCopy(const T& from) : + T(Delegator{}(from)) + { + } + + DelegateCopy(const DelegateCopy& from) : + T(Delegator{}(from)) + { + } + + DelegateCopy(T&& from) : + T(static_cast(from)) + { + } + + DelegateCopy(DelegateCopy&& from) : + T(static_cast(from)) + { + } + + DelegateCopy& operator=(const T& from) + { + T::operator=(from); + return *this; + } + + DelegateCopy& operator=(T&& from) + { + T::operator=(std::move(from)); + return *this; + } + + DelegateCopy& operator=(const DelegateCopy& from) + { + T::operator=(Delegator{}(from)); + return *this; + } + + DelegateCopy& operator=(DelegateCopy&& from) + { + T::operator=(static_cast(from)); + return *this; + } + }; + + template class OptionalLock : public std::lock_guard { diff --git a/src/Utils/math.h b/src/Utils/math.h index 3cee7fc..2ab47b9 100644 --- a/src/Utils/math.h +++ b/src/Utils/math.h @@ -155,30 +155,28 @@ namespace tomoto // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2)) template - inline auto lgammaApprox(_T z) -> decltype((z + 2.5)* log(z + 3) - (z + 3) + 0.91893853 + 1. / 12. / (z + 3) - log(z * (z + 1) * (z + 2))) + inline auto lgammaApprox(_T z) { return (z + 2.5) * log(z + 3) - (z + 3) + 0.91893853 + 1. / 12. / (z + 3) - log(z * (z + 1) * (z + 2)); } // calc lgamma(z + a) - lgamma(z) template - inline auto lgammaSubt(_T z, _U a) -> decltype((z + a + 1.5)* log(z + a + 2) - (z + 1.5) * log(z + 2) - a + (1. / (z + a + 2) - 1. / (z + 2)) / 12. - log((z + a) / z * (z + a + 1) / (z + 1))) + inline auto lgammaSubt(_T z, _U a) { return (z + a + 1.5) * log(z + a + 2) - (z + 1.5) * log(z + 2) - a + (1. / (z + a + 2) - 1. / (z + 2)) / 12. - log((z + a) / z * (z + a + 1) / (z + 1)); } // approximation : digamma(z) ~= ln(z+4) - 1/2/(z+4) - 1/12/(z+4)^2 - 1/z - 1/(z+1) - 1/(z+2) - 1/(z+3) template - inline auto digammaApprox(_T z) -> decltype(log(z + 4) - 1. / 2. / (z + 4) - 1. / 12. / ((z + 4) * (z + 4)) - 1. / z - 1. / (z + 1) - 1. 
/ (z + 2) - 1. / (z + 3)) + inline auto digammaApprox(_T z) { return log(z + 4) - 1. / 2. / (z + 4) - 1. / 12. / ((z + 4) * (z + 4)) - 1. / z - 1. / (z + 1) - 1. / (z + 2) - 1. / (z + 3); } // calc digamma(z + a) - digamma(z) template - inline auto digammaSubt(_T z, _U a) -> decltype(log((z + a + 2) / (z + 2)) - (1 / (z + a + 2) - 1 / (z + 2)) / 2 - (1 / (z + a + 2) / (z + a + 2) - 1 / (z + 2) / (z + 2)) / 12 - - 1. / (z + a) - 1. / (z + a + 1) - - 1. / z - 1. / (z + 1)) + inline auto digammaSubt(_T z, _U a) { return log((z + a + 2) / (z + 2)) - (1 / (z + a + 2) - 1 / (z + 2)) / 2 - (1 / (z + a + 2) / (z + a + 2) - 1 / (z + 2) / (z + 2)) / 12 - 1. / (z + a) - 1. / (z + a + 1) diff --git a/src/Utils/serializer.hpp b/src/Utils/serializer.hpp index c4b8e1b..aeafd36 100644 --- a/src/Utils/serializer.hpp +++ b/src/Utils/serializer.hpp @@ -434,6 +434,16 @@ namespace tomoto } }; + template + struct Serializer> : public Serializer<_Ty> + { + }; + + template + struct Serializer> : public Serializer<_Ty> + { + }; + template struct Serializer, typename std::enable_if::value>::type> { diff --git a/src/Utils/text.hpp b/src/Utils/text.hpp index aa657c4..3c83fbd 100644 --- a/src/Utils/text.hpp +++ b/src/Utils/text.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,13 @@ namespace tomoto return s; } + inline std::string quote(const std::string& s) + { + std::ostringstream stream; + stream << std::quoted(s); + return stream.str(); + } + inline std::vector split(const std::string& str, const std::string& delim) { std::vector tokens; diff --git a/src/Utils/tvector.hpp b/src/Utils/tvector.hpp index 80cced6..c3f8e78 100644 --- a/src/Utils/tvector.hpp +++ b/src/Utils/tvector.hpp @@ -26,14 +26,10 @@ namespace tomoto tvector() noexcept { - _first = _Alloc{}.allocate(4); - _last = _first; - _rsvEnd = _first + 4; } tvector(std::nullptr_t) noexcept { - } // non-owning, just pointing constructor @@ -294,7 +290,7 @@ namespace tomoto bool isOwner() const 
noexcept { - return _rsvEnd; + return _rsvEnd || (_rsvEnd == nullptr && _first == nullptr); } // 23.3.11.5, modifiers: @@ -523,8 +519,11 @@ namespace tomoto { size_type s = size(); T *tarr = _Alloc{}.allocate(newSize); - memcpy(tarr, _first, s * sizeof(T)); - _Alloc{}.deallocate(_first, capacity()); + if (_first) + { + memcpy(tarr, _first, s * sizeof(T)); + _Alloc{}.deallocate(_first, capacity()); + } _first = tarr; _last = _first + s; _rsvEnd = _first + newSize; diff --git a/src/python/PyUtils.h b/src/python/PyUtils.h index 130eb4c..196a210 100644 --- a/src/python/PyUtils.h +++ b/src/python/PyUtils.h @@ -76,15 +76,311 @@ namespace py } }; - class ConversionFail : public std::runtime_error + class ExcPropagation : public std::runtime_error + { + public: + ExcPropagation() : std::runtime_error{ "" } + { + } + }; + + class BaseException : public std::runtime_error { public: using std::runtime_error::runtime_error; + virtual PyObject* pytype() const + { + return PyExc_BaseException; + } + }; + + class Exception : public BaseException + { + public: + using BaseException::BaseException; + + virtual PyObject* pytype() const + { + return PyExc_Exception; + } + }; + + class StopIteration : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_StopIteration; + } + }; + + class StopAsyncIteration : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_StopAsyncIteration; + } + }; + + class ArithmeticError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_ArithmeticError; + } + }; + + class AssertionError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_AssertionError; + } + }; + + class AttributeError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return 
PyExc_AttributeError; + } + }; + + class BufferError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_BufferError; + } + }; + + class EOFError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_EOFError; + } + }; + + class ImportError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_ImportError; + } + }; + + class LookupError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_LookupError; + } + }; + + class IndexError : public LookupError + { + public: + using LookupError::LookupError; + + virtual PyObject* pytype() const + { + return PyExc_IndexError; + } + }; + + class KeyError : public LookupError + { + public: + using LookupError::LookupError; + + virtual PyObject* pytype() const + { + return PyExc_KeyError; + } + }; + + class MemoryError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_MemoryError; + } + }; + + class NameError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_NameError; + } + }; + + class OSError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_OSError; + } + }; + + class ReferenceError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_ReferenceError; + } + }; + + class RuntimeError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_RuntimeError; + } + }; + + class SyntaxError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_SyntaxError; + } + }; + + class 
SystemError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_SystemError; + } + }; + + class TypeError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_TypeError; + } + }; + + class ValueError : public Exception + { + public: + using Exception::Exception; + + virtual PyObject* pytype() const + { + return PyExc_ValueError; + } + }; + + template + auto handleExc(_Fn&& fn) + -> typename std::enable_if::value, decltype(fn())>::type + { + try + { + return fn(); + } + catch (const ExcPropagation&) + { + } + catch (const BaseException& e) + { + PyErr_SetString(e.pytype(), e.what()); + } + catch (const std::exception& e) + { + std::cerr << "Uncaughted c++ exception: " << e.what() << std::endl; + PyErr_SetString(PyExc_RuntimeError, e.what()); + } + return nullptr; + } + + template + auto handleExc(_Fn&& fn) + -> typename std::enable_if::value, decltype(fn())>::type + { + try + { + return fn(); + } + catch (const ExcPropagation&) + { + } + catch (const BaseException& e) + { + PyErr_SetString(e.pytype(), e.what()); + } + catch (const std::exception& e) + { + std::cerr << "Uncaughted c++ exception: " << e.what() << std::endl; + PyErr_SetString(PyExc_RuntimeError, e.what()); + } + return -1; + } + + class ConversionFail : public ValueError + { + public: + using ValueError::ValueError; + template, _Ty>::value>::type > - ConversionFail(_Ty&& callable) : runtime_error{ callable() } + ConversionFail(_Ty&& callable) : ValueError{ callable() } { } }; @@ -113,6 +409,14 @@ namespace py return toCpp(r, ""); } + template + inline std::string reprFromCpp(_Ty&& o) + { + UniqueObj p{ py::buildPyValue(std::forward<_Ty>(o)) }; + UniqueObj r{ PyObject_Repr(p) }; + return toCpp(r, ""); + } + template inline _Ty toCpp(PyObject* obj) { @@ -631,7 +935,7 @@ namespace py PyObject* >::type buildPyValueTransform(_Ty first, _Ty last, _Tx tx) { - using value_type = 
typename std::iterator_traits<_Ty>::value_type; + using value_type = decltype(tx(*first)); npy_intp size = std::distance(first, last); PyObject* ret = PyArray_EMPTY(1, &size, detail::NpyType::type, 0); size_t id = 0; @@ -730,48 +1034,4 @@ namespace py detail::setTupleItem<0>(tuple, std::forward<_Rest>(rest)...); return tuple; } - - class WarningLog - { - std::set> printed; - - WarningLog() - { - } - public: - static WarningLog& get() - { - thread_local WarningLog inst; - return inst; - } - - void print(std::ostream& ostr, const std::string& msg) - { - auto frame = PyEval_GetFrame(); - auto key = std::make_tuple( - std::string{ PyUnicode_AsUTF8(frame->f_code->co_filename) }, - PyFrame_GetLineNumber(frame), - msg); - - ostr << std::get<0>(key) << "(" << std::get<1>(key) << "): " << std::get<2>(key) << std::endl; - } - - void printOnce(std::ostream& ostr, const std::string& msg) - { - auto frame = PyEval_GetFrame(); - auto key = std::make_tuple( - std::string{ PyUnicode_AsUTF8(frame->f_code->co_filename) }, - PyFrame_GetLineNumber(frame), - msg); - - if (!printed.count(key)) - { - ostr << std::get<0>(key) << "(" << std::get<1>(key) << "): " << std::get<2>(key) << std::endl; - printed.insert(key); - } - } - }; } - -#define PRINT_WARN(msg) do{ py::WarningLog::get().print(std::cerr, msg); } while(0) -#define PRINT_WARN_ONCE(msg) do{ py::WarningLog::get().printOnce(std::cerr, msg); } while(0) \ No newline at end of file diff --git a/src/python/docs.h b/src/python/docs.h index 0e17f07..fdc3dfa 100644 --- a/src/python/docs.h +++ b/src/python/docs.h @@ -131,6 +131,14 @@ DOC_VARIABLE_EN_KO(Document_metadata__doc__, u8R""(categorical metadata of the document (for only `tomotopy.DMRModel` and `tomotopy.GDMRModel` model, read-only))"", u8R""(문헌의 범주형 메타데이터 (`tomotopy.DMRModel`과 `tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용))""); +DOC_VARIABLE_EN_KO(Document_multi_metadata__doc__, + u8R""(categorical multiple metadata of the document (for only `tomotopy.DMRModel` and `tomotopy.GDMRModel` 
model, read-only) + +.. versionadded:: 0.12.0)"", + u8R""(문헌의 범주형 메타데이터 (`tomotopy.DMRModel`과 `tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용) + +.. versionadded:: 0.12.0)""); + DOC_VARIABLE_EN_KO(Document_numeric_metadata__doc__, u8R""(continuous numeric metadata of the document (for only `tomotopy.GDMRModel` model, read-only) @@ -631,6 +639,16 @@ DOC_SIGNATURE_EN_KO(LDA_loads__doc__, u8R""(Return the model instance loaded from `data` in bytes-like object.)"", u8R""(bytes-like object인 `data`로로부터 모델 인스턴스를 읽어들여 반환합니다.)""); +DOC_SIGNATURE_EN_KO(LDA_copy__doc__, + "copy(self)", + u8R""(.. versionadded:: 0.12.0 + +Return a new deep-copied instance of the current instance)"", + u8R""(.. versionadded:: 0.12.0 + +깊게 복사된 새 인스턴스를 반환합니다.)""); + + DOC_SIGNATURE_EN_KO(LDA_summary__doc__, "summary(self, initial_hp=True, params=True, topic_word_top_n=5, file=None, flush=False)", u8R""(.. versionadded:: 0.9.0 @@ -880,45 +898,115 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(DMR_add_doc__doc__, - "add_doc(self, words, metadata='')", + "add_doc(self, words, metadata='', multi_metadata=[])", u8R""(Add a new document into the model instance with `metadata` and return an index of the inserted document. +.. versionchanged:: 0.12.0 + + A new argument `multi_metadata` for multiple values of metadata was added. + Parameters ---------- words : Iterable[str] an iterable of `str` metadata : str metadata of the document (e.g., author, title or year) +multi_metadata : Iterable[str] + metadata of the document (for multiple values) )"", u8R""(현재 모델에 `metadata`를 포함하는 새로운 문헌을 추가하고 추가된 문헌의 인덱스 번호를 반환합니다. +.. versionchanged:: 0.12.0 + + 여러 개의 메타데이터를 입력하는데 쓰이는 `multi_metadata`가 추가되었습니다. 
+ Parameters ---------- words : Iterable[str] 문헌의 각 단어를 나열하는 `str` 타입의 iterable metadata : str 문헌의 메타데이터 (예로 저자나 제목, 작성연도 등) +multi_metadata : Iterable[str] + 문헌의 메타데이터 (다중 값이 필요한 경우 사용하십시오) )""); DOC_SIGNATURE_EN_KO(DMR_make_doc__doc__, - "make_doc(self, words, metadata='')", + "make_doc(self, words, metadata='', multi_metadata=[])", u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `metadata` that can be used for `tomotopy.LDAModel.infer` method. +.. versionchanged:: 0.12.0 + + A new argument `multi_metadata` for multiple values of metadata was added. + Parameters ---------- words : Iterable[str] an iteratable of `str` metadata : str metadata of the document (e.g., author, title or year) +multi_metadata : Iterable[str] + metadata of the document (for multiple values) )"", u8R""(`words` 단어를 바탕으로 새로운 문헌인 `tomotopy.Document` 인스턴스를 반환합니다. 이 인스턴스는 `tomotopy.LDAModel.infer` 메소드에 사용될 수 있습니다. +.. versionchanged:: 0.12.0 + + 여러 개의 메타데이터를 입력하는데 쓰이는 `multi_metadata`가 추가되었습니다. + Parameters ---------- words : Iterable[str] 문헌의 각 단어를 나열하는 `str` 타입의 iterable metadata : str 문헌의 메타데이터 (예를 들어 저자나 제목, 작성연도 등) +multi_metadata : Iterable[str] + 문헌의 메타데이터 (다중 값이 필요한 경우 사용하십시오) +)""); + +DOC_SIGNATURE_EN_KO(DMR_get_topic_prior__doc__, + "get_topic_prior(self, metadata='', multi_metadata=[], raw=False)", + u8R""(.. versionadded:: 0.12.0 + +Calculate the topic prior of any document with the given `metadata` and `multi_metadata`. +If `raw` is true, the value without applying `exp()` is returned, otherwise, the value with applying `exp()` is returned. + +The topic prior is calculated as follows: + +`np.dot(lambda_[:, id(metadata)], np.concat([[1], multi_hot(multi_metadata)]))` + +where `idx(metadata)` and `multi_hot(multi_metadata)` indicates +an integer id of given `metadata` and multi-hot encoded binary vector for given `multi_metadata` respectively. 
+ + +Parameters +---------- +metadata : str + metadata of the document (e.g., author, title or year) +multi_metadata : Iterable[str] + metadata of the document (for multiple values) +raw : bool + If `raw` is true, the raw value of parameters without applying `exp()` is returned. +)"", +u8R""(.. versionadded:: 0.12.0 + +주어진 `metadata`와 `multi_metadata`에 대해 토픽의 사전 분포를 계산합니다. +`raw`가 참인 경우 `exp()`가 적용되기 전의 값이 반환되며, 그 외에는 `exp()`가 적용된 값이 반환됩니다. + +토픽의 사전분포는 다음과 같이 계산됩니다: + +`np.dot(lambda_[:, idx(metadata)], np.concat([[1], multi_hot(multi_metadata)]))` + +여기서 `idx(metadata)`와 `multi_hot(multi_metadata)`는 각각 +주어진 `metadata`의 정수 인덱스 번호와 `multi_metadata`를 multi-hot 인코딩한, 0 혹은 1로 구성된 벡터입니다. + +Parameters +---------- +metadata : str + 문헌의 메타데이터 (예를 들어 저자나 제목, 작성연도 등) +multi_metadata : Iterable[str] + 문헌의 메타데이터 (다중 값이 필요한 경우 사용하십시오) +raw : bool + 참일 경우 파라미터에 `exp()`가 적용되지 않은 값이 반환됩니다. )""); DOC_VARIABLE_EN_KO(DMR_f__doc__, @@ -937,6 +1025,20 @@ DOC_VARIABLE_EN_KO(DMR_metadata_dict__doc__, u8R""(a dictionary of metadata in type `tomotopy.Dictionary` (read-only))"", u8R""(`tomotopy.Dictionary` 타입의 메타데이터 사전 (읽기전용))""); +DOC_VARIABLE_EN_KO(DMR_multi_metadata_dict__doc__, + u8R""(a dictionary of metadata in type `tomotopy.Dictionary` (read-only) + +.. versionadded:: 0.12.0 + + This dictionary is distinct from `metadata_dict`. +)"", + u8R""(`tomotopy.Dictionary` 타입의 메타데이터 사전 (읽기전용) + +.. versionadded:: 0.12.0 + + 이 사전은 `metadata_dict`와는 별개입니다. +)""); + DOC_VARIABLE_EN_KO(DMR_lamdas__doc__, u8R""(parameter lambdas in the shape `[k, f]` (read-only) @@ -950,6 +1052,21 @@ DOC_VARIABLE_EN_KO(DMR_lamdas__doc__, 0.11.0 버전 전까지는 lambda getter에 있는 버그로 잘못된 값이 출력되었습니다.
0.11.0 이후 버전으로 업그레이드하시길 권장합니다.)""); + +DOC_VARIABLE_EN_KO(DMR_lamda___doc__, + u8R""(parameter lambdas in the shape `[k, len(metadata_dict), l]` where `k` is the number of topics and `l` is the size of vector for multi_metadata (read-only) + +See `tomotopy.DMRModel.get_topic_prior` for the relation between the lambda parameter and the topic prior. + +.. versionadded:: 0.12.0 +)"", +u8R""(현재 모형의 lambda 파라미터를 보여주는 `[k, len(metadata_dict), l]` 모양의 float array (읽기전용) + +lambda 파라미터와 토픽 사전 분포 간의 관계에 대해서는 `tomotopy.DMRModel.get_topic_prior`를 참고하십시오. + +.. versionadded:: 0.12.0)""); + + DOC_VARIABLE_EN_KO(DMR_alpha__doc__, u8R""(Dirichlet prior on the per-document topic distributions for each metadata in the shape `[k, f]`. Equivalent to `np.exp(DMRModel.lambdas)` (read-only) @@ -1090,15 +1207,19 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(GDMR_add_doc__doc__, - "add_doc(self, words, numeric_metadata=[], metadata='')", + "add_doc(self, words, numeric_metadata=[], metadata='', multi_metadata=[])", u8R""(Add a new document into the model instance with `metadata` and return an index of the inserted document. -..versionchanged:: 0.11.0 +.. versionchanged:: 0.11.0 Until version 0.10.2, `metadata` was used to represent numeric data and there was no argument for categorical data. Since version 0.11.0, the name of the previous `metadata` argument is changed to `numeric_metadata`, and `metadata` is added to represent categorical data for unification with the `tomotopy.DMRModel`. +.. versionchanged:: 0.12.0 + + A new argument `multi_metadata` for multiple values of metadata was added. + Parameters ---------- words : Iterable[str] @@ -1107,15 +1228,21 @@ numeric_metadata : Iterable[float] continuous numeric metadata variable of the document. Its length should be equal to the length of `degrees`.
metadata : str categorical metadata of the document (e.g., author, title, journal or country) +multi_metadata : Iterable[str] + metadata of the document (for multiple values) )"", u8R""(현재 모델에 `metadata`를 포함하는 새로운 문헌을 추가하고 추가된 문헌의 인덱스 번호를 반환합니다. -..versionchanged:: 0.11.0 +.. versionchanged:: 0.11.0 0.10.2버전까지는 `metadata`가 숫자형 연속 변수를 표현하는데 사용되었고, 별도로 범주형 변수에 사용되는 인자가 없었습니다. 0.11.0버전부터는 `tomotopy.DMRModel`과의 통일성을 위해 기존의 `metadata` 인수가 `numeric_metadata`라는 이름으로 변경되고, `metadata`라는 이름으로 범주형 변수를 사용할 수 있게 변경됩니다. +.. versionchanged:: 0.12.0 + + 여러 개의 메타데이터를 입력하는데 쓰이는 `multi_metadata`가 추가되었습니다. + Parameters ---------- words : Iterable[str] @@ -1124,18 +1251,24 @@ numeric_metadata : Iterable[float] 문헌의 연속형 숫자 메타데이터 변수. 길이는 `degrees`의 길이와 동일해야 합니다. metadata : str 문헌의 범주형 메타데이터 (예를 들어 저자나 제목, 저널, 국가 등) +multi_metadata : Iterable[str] + 문헌의 메타데이터 (다중 값이 필요한 경우 사용하십시오) )""); DOC_SIGNATURE_EN_KO(GDMR_make_doc__doc__, - "make_doc(self, words, numeric_metadata=[], metadata='')", + "make_doc(self, words, numeric_metadata=[], metadata='', multi_metadata=[])", u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `metadata` that can be used for `tomotopy.LDAModel.infer` method. -..versionchanged:: 0.11.0 +.. versionchanged:: 0.11.0 Until version 0.10.2, `metadata` was used to represent numeric data and there was no argument for categorical data. Since version 0.11.0, the name of the previous `metadata` argument is changed to `numeric_metadata`, and `metadata` is added to represent categorical data for unification with the `tomotopy.DMRModel`. +.. versionchanged:: 0.12.0 + + A new argument `multi_metadata` for multiple values of metadata was added. + Parameters ---------- words : Iterable[str] @@ -1144,15 +1277,21 @@ numeric_metadata : Iterable[float] continuous numeric metadata variable of the document. Its length should be equal to the length of `degrees`. 
metadata : str categorical metadata of the document (e.g., author, title, journal or country) +multi_metadata : Iterable[str] + metadata of the document (for multiple values) )"", u8R""(`words` 단어를 바탕으로 새로운 문헌인 `tomotopy.Document` 인스턴스를 반환합니다. 이 인스턴스는 `tomotopy.LDAModel.infer` 메소드에 사용될 수 있습니다. -..versionchanged:: 0.11.0 +.. versionchanged:: 0.11.0 0.10.2버전까지는 `metadata`가 숫자형 연속 변수를 표현하는데 사용되었고, 별도로 범주형 변수에 사용되는 인자가 없었습니다. 0.11.0버전부터는 `tomotopy.DMRModel`과의 통일성을 위해 기존의 `metadata` 인수가 `numeric_metadata`라는 이름으로 변경되고, `metadata`라는 이름으로 범주형 변수를 사용할 수 있게 변경됩니다. +.. versionchanged:: 0.12.0 + + 여러 개의 메타데이터를 입력하는데 쓰이는 `multi_metadata`가 추가되었습니다. + Parameters ---------- words : Iterable[str] @@ -1161,13 +1300,19 @@ numeric_metadata : Iterable[float] 문헌의 연속형 숫자 메타데이터 변수. 길이는 `degrees`의 길이와 동일해야 합니다. metadata : str 문헌의 범주형 메타데이터 (예를 들어 저자나 제목, 저널, 국가 등) +multi_metadata : Iterable[str] + 문헌의 메타데이터 (다중 값이 필요한 경우 사용하십시오) )""); DOC_SIGNATURE_EN_KO(GDMR_tdf__doc__, - "tdf(self, numeric_metadata, metadata='', normalize=True)", - u8R""(Calculate a topic distribution for given `metadata` value. It returns a list with length `k`. + "tdf(self, numeric_metadata, metadata='', multi_metadata=[], normalize=True)", + u8R""(Calculate a topic distribution for given `numeric_metadata` value. It returns a list with length `k`. + +.. versionchanged:: 0.11.0 -..versionchanged:: 0.11.0 +.. versionchanged:: 0.12.0 + + A new argument `multi_metadata` for multiple values of metadata was added. Parameters ---------- @@ -1175,27 +1320,41 @@ numeric_metadata : Iterable[float] continuous metadata variable whose length should be equal to the length of `degrees`. metadata : str categorical metadata variable +multi_metadata : Iterable[str] + categorical metadata variables (for multiple values) normalize : bool If true, the method returns probabilities for each topic in range [0, 1]. Otherwise, it returns raw values in logit. )"", u8R""(주어진 `metadata`에 대해 토픽 분포를 계산하여, `k` 길이의 list로 반환합니다. +.. 
versionchanged:: 0.11.0 + +.. versionchanged:: 0.12.0 + + 여러 개의 메타데이터를 입력하는데 쓰이는 `multi_metadata`가 추가되었습니다. + Parameters ---------- numeric_metadata : Iterable[float] 연속형 메타데이터 변수. 길이는 `degrees`의 길이와 동일해야 합니다. metadata : str 범주형 메타데이터 변수 +multi_metadata : Iterable[str] + 범주형 메타데이터 변수 (여러 개를 입력해야 하는 경우 사용하십시오) normalize : bool 참인 경우, 각 값이 [0, 1] 범위에 있는 확률 분포를 반환합니다. 거짓인 경우 logit값을 그대로 반환합니다. )""); DOC_SIGNATURE_EN_KO(GDMR_tdf_linspace__doc__, - "tdf_linspace(self, numeric_metadata_start, numeric_metadata_stop, num, metadata='', endpoint=True, normalize=True)", + "tdf_linspace(self, numeric_metadata_start, numeric_metadata_stop, num, metadata='', multi_metadata=[], endpoint=True, normalize=True)", u8R""(Calculate a topic distribution for given `metadata` value. It returns a list with length `k`. -..versionchanged:: 0.11.0 +.. versionchanged:: 0.11.0 + +.. versionchanged:: 0.12.0 + + A new argument `multi_metadata` for multiple values of metadata was added. Parameters ---------- @@ -1207,6 +1366,8 @@ num : Iterable[int] the number of samples to generate for each metadata variable. Must be non-negative. Its length should be equal to the length of `degrees`. metadata : str categorical metadata variable +multi_metadata : Iterable[str] + categorical metadata variables (for multiple values) endpoint : bool If True, `metadata_stop` is the last sample. Otherwise, it is not included. Default is True. normalize : bool @@ -1219,7 +1380,11 @@ samples : ndarray )"", u8R""(주어진 `metadata`에 대해 토픽 분포를 계산하여, `k` 길이의 list로 반환합니다. -..versionchanged:: 0.11.0 +.. versionchanged:: 0.11.0 + +.. versionchanged:: 0.12.0 + + 여러 개의 메타데이터를 입력하는데 쓰이는 `multi_metadata`가 추가되었습니다. Parameters ---------- @@ -1231,6 +1396,8 @@ num : Iterable[int] 각 메타데이터 변수별로 생성할 샘플의 개수(0보다 큰 정수). 길이는 `degrees`의 길이와 동일해야 합니다. metadata : str 범주형 메타데이터 변수 +multi_metadata : Iterable[str] + 범주형 메타데이터 변수 (여러 개를 입력해야 하는 경우 사용하십시오) endpoint : bool 참인 경우 `metadata_stop`이 마지막 샘플이 됩니다. 거짓인 경우 끝값이 샘플에 포함되지 않습니다. 기본값은 참입니다. 
normalize : bool diff --git a/src/python/label_docs.h b/src/python/label_docs.h index 6063c44..f6d4bbf 100644 --- a/src/python/label_docs.h +++ b/src/python/label_docs.h @@ -60,7 +60,7 @@ topic_model )""); DOC_SIGNATURE_EN_KO(PMIExtractor___init____doc__, - "PMIExtractor(min_cf=10, min_df=5, min_len=1, max_len=5, max_cand=5000)", + "PMIExtractor(min_cf=10, min_df=5, min_len=1, max_len=5, max_cand=5000, normalized=False)", u8R""(.. versionadded:: 0.6.0 `PMIExtractor` exploits multivariate pointwise mutual information to extract collocations. diff --git a/src/python/py_CT.cpp b/src/python/py_CT.cpp index 3d412bd..c2a2bde 100644 --- a/src/python/py_CT.cpp +++ b/src/python/py_CT.cpp @@ -16,14 +16,14 @@ static int CT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOfnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &objAlpha, &margs.eta, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`smoothing_alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } ); tomoto::ITopicModel* inst = tomoto::ICTModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -36,15 +36,7 @@ static int CT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* CT_getCorrelations(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -52,15 +44,10 @@ static PyObject* CT_getCorrelations(TopicModelObject* 
self, PyObject* args, PyOb PyObject* argTopicId = nullptr; static const char* kwlist[] = { "topic_id", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", (char**)kwlist, &argTopicId)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ if (!argTopicId || argTopicId == Py_None) { @@ -75,19 +62,10 @@ static PyObject* CT_getCorrelations(TopicModelObject* self, PyObject* args, PyOb } size_t topicId = PyLong_AsLong(argTopicId); - if (topicId == (size_t)-1 && PyErr_Occurred()) throw bad_exception{}; - if (topicId >= inst->getK()) throw runtime_error{ "`topic_id` must be in range [0, `k`)" }; + if (topicId == (size_t)-1 && PyErr_Occurred()) return nullptr; + if (topicId >= inst->getK()) throw py::ValueError{ "`topic_id` must be in range [0, `k`)" }; return py::buildPyValue(inst->getCorrelationTopic(topicId)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } DEFINE_GETTER(tomoto::ICTModel, CT, getNumBetaSample); @@ -96,9 +74,9 @@ DEFINE_GETTER(tomoto::ICTModel, CT, getPriorMean); PyObject* CT_getPriorCov(TopicModelObject *self, void *closure) { - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); py::UniqueObj obj{ py::buildPyValue(inst->getPriorCov()) }; PyArray_Dims dims; @@ -106,16 +84,7 @@ PyObject* CT_getPriorCov(TopicModelObject *self, void *closure) dims.ptr = d; dims.len = 2; return PyArray_Newshape((PyArrayObject*)obj.get(), &dims, NPY_CORDER); - } - catch (const 
bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } DEFINE_SETTER_NON_NEGATIVE_INT(tomoto::ICTModel, CT, setNumBetaSample); @@ -187,40 +156,18 @@ TopicModelTypeObject CT_type = { { PyObject* Document_beta(DocumentObject* self, void* closure) { - try + return py::handleExc([&]() -> PyObject* { - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `beta` field!" }; - if (!self->doc) throw runtime_error{ "doc is null!" }; - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValueTransform( - doc->smBeta.data(), doc->smBeta.data() + doc->smBeta.size(), - logf); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValueTransform( - doc->smBeta.data(), doc->smBeta.data() + doc->smBeta.size(), - logf); - } while (0); - do + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `beta` field!" }; + if (!self->doc) throw py::RuntimeError{ "doc is null!" }; + + if (auto ret = docVisit(self->getBoundDoc(), [](auto* doc) { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValueTransform( + return py::buildPyValueTransform( doc->smBeta.data(), doc->smBeta.data() + doc->smBeta.size(), - logf); - } while (0); - throw runtime_error{ "doc doesn't has `beta` field!" }; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_AttributeError, e.what()); - return nullptr; - } + logf + ); + })) return ret; + throw py::AttributeError{ "doc doesn't has `beta` field!" 
}; + }); } diff --git a/src/python/py_DMR.cpp b/src/python/py_DMR.cpp index 959e35c..b50d1ff 100644 --- a/src/python/py_DMR.cpp +++ b/src/python/py_DMR.cpp @@ -8,7 +8,8 @@ using namespace std; tomoto::RawDoc::MiscType DMR_misc_args(TopicModelObject* self, const tomoto::RawDoc::MiscType& o) { tomoto::RawDoc::MiscType ret; - ret["metadata"] = getValueFromMiscDefault("metadata", o, "`DMRModel` needs a `metadata` value in `str` type."); + ret["metadata"] = getValueFromMiscDefault("metadata", o, "`DMRModel` needs a `metadata` value in `str` type."); + ret["multi_metadata"] = getValueFromMiscDefault>("multi_metadata", o, "`DMRModel` needs a `multi_metadata` value in `List[str]` type."); return ret; } @@ -22,14 +23,14 @@ static int DMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOfffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &objAlpha, &margs.eta, &margs.sigma, &margs.alphaEps, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } ); tomoto::ITopicModel* inst = tomoto::IDMRModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -42,130 +43,178 @@ static int DMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* DMR_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords; - const char* 
metadata = ""; - static const char* kwlist[] = { "words", "metadata", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &metadata)) return nullptr; - try + PyObject* argWords; + PyObject* multiMetadata = nullptr; + const char* metadata = nullptr; + static const char* kwlist[] = { "words", "metadata", "multi_metadata", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|zO", (char**)kwlist, + &argWords, &metadata, &multiMetadata)) return nullptr; + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } + if (multiMetadata && PyUnicode_Check(multiMetadata)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`multi_metadata` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); + if (!metadata) metadata = ""; raw.misc["metadata"] = metadata; + if (multiMetadata) + { + raw.misc["multi_metadata"] = py::toCpp>(multiMetadata, + [=]() { return "`multi_metadata` must be an instance of `List[str]` (but given " + py::repr(multiMetadata) + ")"; } + ); + } auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static DocumentObject* DMR_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords; - const char* metadata = ""; - static const 
char* kwlist[] = { "words", "metadata", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &metadata)) return nullptr; - try + PyObject* argWords; + PyObject* multiMetadata = nullptr; + const char* metadata = nullptr; + static const char* kwlist[] = { "words", "metadata", "multi_metadata", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|zO", (char**)kwlist, + &argWords, &metadata, &multiMetadata)) return nullptr; + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } + if (multiMetadata && PyUnicode_Check(multiMetadata)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`multi_metadata` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); + if (!metadata) metadata = ""; raw.misc["metadata"] = metadata; + if (multiMetadata) + { + raw.misc["multi_metadata"] = py::toCpp>(multiMetadata, + [=]() { return "`multi_metadata` must be an instance of `List[str]` (but given " + py::repr(multiMetadata) + ")"; } + ); + } auto doc = inst->makeDoc(raw); py::UniqueObj corpus{ PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self, nullptr) }; auto* ret = (DocumentObject*)PyObject_CallFunctionObjArgs((PyObject*)&UtilsDocument_type, corpus.get(), nullptr); ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) + }); +} + +static PyObject* DMR_getTopicPrior(TopicModelObject* self, PyObject* args, PyObject* kwargs) +{ + PyObject* multiMetadata = nullptr; + const char* metadata = nullptr; + 
size_t raw = 0; + static const char* kwlist[] = { "metadata", "multi_metadata", "raw", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|zOp", (char**)kwlist, + &metadata, &multiMetadata, &raw)) return nullptr; + return py::handleExc([&]() -> PyObject* { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + auto* inst = static_cast(self->inst); + if (multiMetadata && PyUnicode_Check(multiMetadata)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`multi_metadata` should be an iterable of str.", 1)) return nullptr; + } + if (!metadata) metadata = ""; + + vector multiMd; + if (multiMetadata) + { + multiMd = py::toCpp>(multiMetadata, + [=]() { return "`multi_metadata` must be an instance of `List[str]` (but given " + py::repr(multiMetadata) + ")"; } + ); + } + return py::buildPyValue(inst->getTopicPrior(metadata, multiMd, !!raw)); + }); } static VocabObject* DMR_getMetadataDict(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* ret = (VocabObject*)PyObject_CallObject((PyObject*)&UtilsVocab_type, nullptr); ret->dep = (PyObject*)self; Py_INCREF(ret->dep); ret->vocabs = (tomoto::Dictionary*)&static_cast(self->inst)->getMetadataDict(); ret->size = -1; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) + }); +} + +static VocabObject* DMR_getMultiMetadataDict(TopicModelObject* self, void* closure) +{ + return py::handleExc([&]() { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + auto* ret = (VocabObject*)PyObject_CallObject((PyObject*)&UtilsVocab_type, nullptr); + ret->dep = (PyObject*)self; + Py_INCREF(ret->dep); + ret->vocabs = (tomoto::Dictionary*)&static_cast(self->inst)->getMultiMetadataDict(); + 
ret->size = -1; + return ret; + }); } static PyObject* DMR_getLambda(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - npy_intp shapes[2] = { (npy_intp)inst->getK(), (npy_intp)inst->getF() }; + npy_intp shapes[2] = { (npy_intp)inst->getK(), (npy_intp)inst->getF() * inst->getMdVecSize() }; PyObject* ret = PyArray_EMPTY(2, shapes, NPY_FLOAT, 0); for (size_t i = 0; i < inst->getK(); ++i) { auto l = inst->getLambdaByTopic(i); - memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), l.data(), sizeof(float) * l.size()); + memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), l.data(), sizeof(float) * shapes[1]); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) + }); +} + +static PyObject* DMR_getLambdaV2(TopicModelObject* self, void* closure) +{ + return py::handleExc([&]() { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + auto* inst = static_cast(self->inst); + npy_intp shapes[3] = { (npy_intp)inst->getK(), (npy_intp)inst->getF(), (npy_intp)inst->getMdVecSize() }; + PyObject* ret = PyArray_EMPTY(3, shapes, NPY_FLOAT, 0); + for (size_t i = 0; i < inst->getK(); ++i) + { + auto l = inst->getLambdaByTopic(i); + memcpy(PyArray_GETPTR3((PyArrayObject*)ret, i, 0, 0), l.data(), sizeof(float) * shapes[1] * shapes[2]); + } + return ret; + }); } static PyObject* DMR_getAlpha(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); npy_intp shapes[2] = { (npy_intp)inst->getK(), (npy_intp)inst->getF() }; PyObject* ret = PyArray_EMPTY(2, shapes, NPY_FLOAT, 0); @@ -173,20 +222,11 @@ static 
PyObject* DMR_getAlpha(TopicModelObject* self, void* closure) { auto l = inst->getLambdaByTopic(i); Eigen::Map ml{ l.data(), (Eigen::Index)l.size() }; - ml = ml.exp(); - memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), l.data(), sizeof(float) * l.size()); + ml = ml.exp() + inst->getAlphaEps(); + memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), l.data(), sizeof(float) * shapes[1]); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } DEFINE_GETTER(tomoto::IDMRModel, DMR, getAlphaEps); @@ -195,37 +235,35 @@ DEFINE_GETTER(tomoto::IDMRModel, DMR, getF); PyObject* Document_DMR_metadata(DocumentObject * self, void* closure) { - try + return py::handleExc([&]() -> PyObject* { if (self->corpus->isIndependent()) return nullptr; - if (!self->doc) throw runtime_error{ "doc is null!" }; + if (!self->doc) throw py::RuntimeError{ "doc is null!" }; auto inst = (tomoto::IDMRModel*)self->corpus->tm->inst; - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(inst->getMetadataDict().toWord(doc->metadata)); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(inst->getMetadataDict().toWord(doc->metadata)); - } while (0); - do + + return docVisit(self->getBoundDoc(), [&](auto* doc) { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(inst->getMetadataDict().toWord(doc->metadata)); - } while (0); - return nullptr; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) + return py::buildPyValue(inst->getMetadataDict().toWord(doc->metadata)); + }); + }); +} + +PyObject* Document_DMR_multiMetadata(DocumentObject* self, void* closure) +{ + return py::handleExc([&]() { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + if (!self->doc) throw py::RuntimeError{ "doc is null!" 
}; + auto inst = (tomoto::IDMRModel*)self->corpus->tm->inst; + + if(auto* ret = docVisit(self->getBoundDoc(), [&](auto* doc) + { + return py::buildPyValueTransform(doc->multiMetadata.begin(), doc->multiMetadata.end(), [&](uint64_t x) + { + return inst->getMultiMetadataDict().toWord(x); + }); + })) return ret; + throw py::AttributeError{ "doc doesn't has `multi_metadata` field!" }; + }); } DEFINE_LOADER(DMR, DMR_type); @@ -236,6 +274,7 @@ static PyMethodDef DMR_methods[] = { "make_doc", (PyCFunction)DMR_makeDoc, METH_VARARGS | METH_KEYWORDS, DMR_make_doc__doc__ }, { "load", (PyCFunction)DMR_load, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_load__doc__ }, { "loads", (PyCFunction)DMR_loads, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_loads__doc__ }, + { "get_topic_prior", (PyCFunction)DMR_getTopicPrior, METH_VARARGS | METH_KEYWORDS, DMR_get_topic_prior__doc__ }, { nullptr } }; @@ -244,7 +283,9 @@ static PyGetSetDef DMR_getseters[] = { { (char*)"sigma", (getter)DMR_getSigma, nullptr, DMR_sigma__doc__, nullptr }, { (char*)"alpha_epsilon", (getter)DMR_getAlphaEps, nullptr, DMR_alpha_epsilon__doc__, nullptr }, { (char*)"metadata_dict", (getter)DMR_getMetadataDict, nullptr, DMR_metadata_dict__doc__, nullptr }, + { (char*)"multi_metadata_dict", (getter)DMR_getMultiMetadataDict, nullptr, DMR_multi_metadata_dict__doc__, nullptr }, { (char*)"lambdas", (getter)DMR_getLambda, nullptr, DMR_lamdas__doc__, nullptr }, + { (char*)"lambda_", (getter)DMR_getLambdaV2, nullptr, DMR_lamda___doc__, nullptr }, { (char*)"alpha", (getter)DMR_getAlpha, nullptr, DMR_alpha__doc__, nullptr }, { nullptr }, }; diff --git a/src/python/py_DT.cpp b/src/python/py_DT.cpp index c1059a7..83618b4 100644 --- a/src/python/py_DT.cpp +++ b/src/python/py_DT.cpp @@ -24,31 +24,23 @@ static int DT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) &tw, &minCnt, &minDf, &rmTop, &margs.k, &margs.t, &margs.alpha[0], &margs.eta, &margs.phi, &margs.shapeA, &margs.shapeB, &margs.shapeC, &margs.seed, 
&objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { tomoto::ITopicModel* inst = tomoto::IDTModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::RuntimeError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; self->minWordDf = minDf; self->removeTopWord = rmTop; self->initParams = py::buildPyDict(kwlist, - tw, minCnt, minDf, rmTop, margs.k, margs.t, margs.alpha[0], margs.eta, margs.phi,margs.shapeA, margs.shapeB, margs.shapeC, margs.seed + tw, minCnt, minDf, rmTop, margs.k, margs.t, margs.alpha[0], margs.eta, margs.phi, margs.shapeA, margs.shapeB, margs.shapeC, margs.seed ); py::setPyDictItem(self->initParams, "version", getVersion()); insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* DT_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -57,26 +49,20 @@ static PyObject* DT_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwa size_t timepoint = 0; static const char* kwlist[] = { "words", "timepoint", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|n", (char**)kwlist, &argWords, &timepoint)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc 
raw = buildRawDoc(argWords); raw.misc["timepoint"] = (uint32_t)timepoint; auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static DocumentObject* DT_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -85,11 +71,14 @@ static DocumentObject* DT_makeDoc(TopicModelObject* self, PyObject* args, PyObje size_t timepoint = 0; static const char* kwlist[] = { "words", "timepoint", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|n", (char**)kwlist, &argWords, &timepoint)) return nullptr; - try + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); raw.misc["timepoint"] = (uint32_t)timepoint; auto doc = inst->makeDoc(raw); @@ -98,16 +87,7 @@ static DocumentObject* DT_makeDoc(TopicModelObject* self, PyObject* args, PyObje ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* DT_getAlpha(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -115,17 +95,12 @@ static PyObject* DT_getAlpha(TopicModelObject* self, PyObject* args, PyObject *k size_t timepoint; static const char* kwlist[] = { "timepoint", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", (char**)kwlist, &timepoint)) return nullptr; - try + return py::handleExc([&]() { - if 
(!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ - if (timepoint >= inst->getT()) throw runtime_error{ "`timepoint` must < `DTModel.num_timepoints`" }; + if (timepoint >= inst->getT()) throw py::ValueError{ "`timepoint` must < `DTModel.num_timepoints`" }; vector alphas; for (size_t i = 0; i < inst->getK(); ++i) @@ -133,16 +108,7 @@ static PyObject* DT_getAlpha(TopicModelObject* self, PyObject* args, PyObject *k alphas.emplace_back(inst->getAlpha(i, timepoint)); } return py::buildPyValue(alphas); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* DT_getPhi(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -150,27 +116,13 @@ static PyObject* DT_getPhi(TopicModelObject* self, PyObject* args, PyObject *kwa size_t timepoint, topicId; static const char* kwlist[] = { "timepoint", "topic_id", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nn", (char**)kwlist, &timepoint, &topicId)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ return py::buildPyValue(inst->getPhi(topicId, timepoint)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* DT_getTopicWords(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -178,28 +130,15 @@ static 
PyObject* DT_getTopicWords(TopicModelObject* self, PyObject* args, PyObje size_t topicId, timepoint, topN = 10; static const char* kwlist[] = { "topic_id", "timepoint", "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nn|n", (char**)kwlist, &topicId, &timepoint, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK()) throw runtime_error{ "must topic_id < k" }; - if (timepoint >= inst->getT()) throw runtime_error{ "must topic_id < t" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < k" }; + if (timepoint >= inst->getT()) throw py::ValueError{ "must topic_id < t" }; + return py::buildPyValue(inst->getWordsByTopicSorted(topicId + inst->getK() * timepoint, topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* DT_getTopicWordDist(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -207,41 +146,24 @@ static PyObject* DT_getTopicWordDist(TopicModelObject* self, PyObject* args, PyO size_t topicId, timepoint, normalize = 1; static const char* kwlist[] = { "topic_id", "timepoint", "normalize", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "nn|p", (char**)kwlist, &topicId, &timepoint, &normalize)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK()) throw runtime_error{ "must topic_id < k" }; - if (timepoint >= inst->getT()) throw runtime_error{ "must topic_id < 
t" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < k" }; + if (timepoint >= inst->getT()) throw py::ValueError{ "must topic_id < t" }; + return py::buildPyValue(inst->getWidsByTopic(topicId + inst->getK() * timepoint, !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* DT_getCountByTopics(TopicModelObject* self) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + auto l = inst->getCountByTopic(); npy_intp shapes[2] = { (npy_intp)inst->getT(), (npy_intp)inst->getK() }; @@ -251,16 +173,7 @@ static PyObject* DT_getCountByTopics(TopicModelObject* self) memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), &l[inst->getK() * i], sizeof(uint64_t) * inst->getK()); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } DEFINE_LOADER(DT, DT_type); @@ -291,9 +204,9 @@ DEFINE_SETTER_CHECKED_FLOAT(tomoto::IDTModel, DT, setShapeC, 0.5 < value && valu static PyObject* DT_alpha(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); npy_intp shapes[2] = { (npy_intp)inst->getT(), (npy_intp)inst->getK() }; PyObject* ret = PyArray_EMPTY(2, shapes, NPY_FLOAT, 0); @@ -305,16 +218,7 @@ static 
PyObject* DT_alpha(TopicModelObject* self, void* closure) } } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyGetSetDef DT_getseters[] = { @@ -371,36 +275,18 @@ TopicModelTypeObject DT_type = { { PyObject* Document_eta(DocumentObject* self, void* closure) { - try + return py::handleExc([&]() -> PyObject* { - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `eta` field!" }; - if (!self->doc) throw runtime_error{ "doc is null!" }; - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(doc->eta.array().data(), doc->eta.array().data() + doc->eta.array().size()); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(doc->eta.array().data(), doc->eta.array().data() + doc->eta.array().size()); - } while (0); - do + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `eta` field!" }; + if (!self->doc) throw py::RuntimeError{ "doc is null!" }; + + if (auto* ret = docVisit(self->getBoundDoc(), [](auto* doc) { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(doc->eta.array().data(), doc->eta.array().data() + doc->eta.array().size()); - } while (0); - throw runtime_error{ "doc doesn't has `eta` field!" }; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_AttributeError, e.what()); - return nullptr; - } + return py::buildPyValue(doc->eta.array().data(), doc->eta.array().data() + doc->eta.array().size()); + })) return ret; + + throw py::AttributeError{ "doc doesn't has `eta` field!" 
}; + }); } DEFINE_DOCUMENT_GETTER(tomoto::DocumentDTM, timepoint, timepoint); diff --git a/src/python/py_GDMR.cpp b/src/python/py_GDMR.cpp index 6e649df..11d6ffb 100644 --- a/src/python/py_GDMR.cpp +++ b/src/python/py_GDMR.cpp @@ -31,7 +31,7 @@ static int GDMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) &tw, &minCnt, &minDf, &rmTop, &margs.k, &objDegrees, &objAlpha, &margs.eta, &margs.sigma, &margs.sigma0, &margs.alphaEps, &margs.orderDecay, &objRange, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } @@ -43,14 +43,14 @@ static int GDMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) } tomoto::IGDMRModel* inst = tomoto::IGDMRModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; self->minWordDf = minDf; self->removeTopWord = rmTop; self->initParams = py::buildPyDict(kwlist, - tw, minCnt, minDf, rmTop, + tw, minCnt, minDf, rmTop, margs.k, margs.degrees, margs.alpha, margs.eta, margs.sigma, margs.sigma0, margs.alphaEps, margs.orderDecay ); @@ -60,61 +60,55 @@ static int GDMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { vector vMin, vMax; py::UniqueObj rangeIter{ PyObject_GetIter(objRange) }, item; - if(!rangeIter) throw runtime_error{ "`metadata_range` must be a list of pairs." }; + if (!rangeIter) throw py::ValueError{ "`metadata_range` must be a list of pairs." }; while (item = py::UniqueObj{ PyIter_Next(rangeIter) }) { auto r = py::toCpp>(item, "`metadata_range` must be a list of pairs."); - if (r.size() != 2) throw runtime_error{ "`metadata_range` must be a list of pairs." 
}; + if (r.size() != 2) throw py::ValueError{ "`metadata_range` must be a list of pairs." }; vMin.emplace_back(r[0]); vMax.emplace_back(r[1]); } - if(vMin.size() != margs.degrees.size()) throw runtime_error{ "`len(metadata_range)` must be equal to `len(degrees)`" }; + if (vMin.size() != margs.degrees.size()) throw py::ValueError{ "`len(metadata_range)` must be equal to `len(degrees)`" }; inst->setMdRange(vMin, vMax); } insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* GDMR_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { PyObject* argWords, *argNumMetadata = nullptr; - const char* metadata = ""; + const char* metadata = nullptr; static const char* kwlist[] = { "words", "numeric_metadata", "metadata", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Os", (char**)kwlist, &argWords, &argNumMetadata, &metadata)) return nullptr; - try + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Oz", (char**)kwlist, &argWords, &argNumMetadata, &metadata)) return nullptr; + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); - + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } + + if (!metadata) metadata = ""; + tomoto::RawDoc raw = buildRawDoc(argWords); raw.misc["metadata"] = metadata; - raw.misc["numeric_metadata"] = py::toCpp>(argNumMetadata, "`numeric_metadata` must be an iterable of 
float."); + + auto nmd = py::toCpp>(argNumMetadata, "`numeric_metadata` must be an iterable of float."); + for (auto x : nmd) + { + if (!isfinite(x)) throw py::ValueError{ "`numeric_metadata` has non-finite value (" + py::reprFromCpp(nmd) + ")." }; + } + raw.misc["numeric_metadata"] = move(nmd); auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static DocumentObject* GDMR_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -122,16 +116,27 @@ static DocumentObject* GDMR_makeDoc(TopicModelObject* self, PyObject* args, PyOb PyObject* argWords, * argNumMetadata = nullptr; const char* metadata = ""; static const char* kwlist[] = { "words", "numeric_metadata", "metadata", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Os", (char**)kwlist, &argWords, &argNumMetadata, &metadata)) return nullptr; - try + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Oz", (char**)kwlist, &argWords, &argNumMetadata, &metadata)) return nullptr; + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); - + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } + + if (!metadata) metadata = ""; + tomoto::RawDoc raw = buildRawDoc(argWords); raw.misc["metadata"] = metadata; - raw.misc["numeric_metadata"] = py::toCpp>(argNumMetadata, "`numeric_metadata` must be an iterable of float."); + + auto nmd = py::toCpp>(argNumMetadata, "`numeric_metadata` must be an iterable of float."); + for (auto x : nmd) + { + if (!isfinite(x)) throw py::ValueError{ 
"`numeric_metadata` has non-finite value (" + py::reprFromCpp(nmd) + ")." }; + } + raw.misc["numeric_metadata"] = move(nmd); auto doc = inst->makeDoc(raw); py::UniqueObj corpus{ PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self, nullptr) }; @@ -139,71 +144,57 @@ static DocumentObject* GDMR_makeDoc(TopicModelObject* self, PyObject* args, PyOb ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* GDMR_tdf(TopicModelObject* self, PyObject* args, PyObject *kwargs) { PyObject *argNumMetadata = nullptr; + PyObject* multiMetadata = nullptr; const char* metadata = ""; int normalize = 1; - static const char* kwlist[] = { "numeric_metadata", "metadata", "normalize", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|sp", (char**)kwlist, &argNumMetadata, &metadata, &normalize)) return nullptr; - try + static const char* kwlist[] = { "numeric_metadata", "metadata", "multi_metadata", "normalize", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|zOp", (char**)kwlist, &argNumMetadata, &metadata, &multiMetadata, &normalize)) return nullptr; + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - + auto v = py::toCpp>(argNumMetadata, "`numeric_metadata` must be an iterable of float."); - if (v.size() != inst->getFs().size()) throw runtime_error{ "`len(numeric_metadata)` must be equal to `len(degree).`" }; + if (v.size() != inst->getFs().size()) throw py::ValueError{ "`len(numeric_metadata)` must be equal to `len(degree).`" }; - size_t cat = inst->getMetadataDict().toWid(metadata); - if (cat == tomoto::non_vocab_id) throw runtime_error{ "unknown categorical metadata '" + string{metadata} + "'" }; - 
return py::buildPyValue(inst->getTDF(v.data(), cat, !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + try + { + return py::buildPyValue(inst->getTDF(v.data(), metadata, {}, !!normalize)); + } + catch (const tomoto::exc::InvalidArgument& e) + { + throw py::ValueError{ e.what() }; + } + }); } static PyObject* GDMR_tdfLinspace(TopicModelObject* self, PyObject* args, PyObject *kwargs) { PyObject *argMetadataStart = nullptr, *argMetadataStop = nullptr, *argNum = nullptr; + PyObject* multiMetadata = nullptr; const char* metadata = ""; size_t endpoint = 1, normalize = 1; - static const char* kwlist[] = { "numeric_metadata_start", "numeric_metadata_stop", "num", "metadata", "endpoint", "normalize", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOO|spp", (char**)kwlist, - &argMetadataStart, &argMetadataStop, &argNum, &metadata, &endpoint, &normalize)) return nullptr; - try + static const char* kwlist[] = { "numeric_metadata_start", "numeric_metadata_stop", "num", "metadata", "multi_metadata", "endpoint", "normalize", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOO|zOpp", (char**)kwlist, + &argMetadataStart, &argMetadataStop, &argNum, &metadata, &multiMetadata, &endpoint, &normalize)) return nullptr; + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); auto start = py::toCpp>(argMetadataStart, "`metadata_start` must be an iterable of float."); - if (start.size() != inst->getFs().size()) throw runtime_error{ "`len(metadata_start)` must be equal to `len(degree).`" }; + if (start.size() != inst->getFs().size()) throw py::ValueError{ "`len(metadata_start)` must be equal to `len(degree).`" }; auto stop = py::toCpp>(argMetadataStop, "`metadata_stop` must be an iterable of 
float."); - if (stop.size() != inst->getFs().size()) throw runtime_error{ "`len(metadata_stop)` must be equal to `len(degree).`" }; + if (stop.size() != inst->getFs().size()) throw py::ValueError{ "`len(metadata_stop)` must be equal to `len(degree).`" }; auto num = py::toCpp>(argNum, "`num` must be an iterable of float."); - if (num.size() != inst->getFs().size()) throw runtime_error{ "`len(num)` must be equal to `len(degree).`" }; - - size_t cat = inst->getMetadataDict().toWid(metadata); - if (cat == tomoto::non_vocab_id) throw runtime_error{ "unknown categorical metadata '" + string{metadata} + "'" }; + if (num.size() != inst->getFs().size()) throw py::ValueError{ "`len(num)` must be equal to `len(degree).`" }; ssize_t tot = 1; for (auto& v : num) @@ -227,35 +218,33 @@ static PyObject* GDMR_tdfLinspace(TopicModelObject* self, PyObject* args, PyObje if (idcs[j] >= num[j]) { idcs[j] = 0; - if(j) idcs[j - 1]++; + if (j) idcs[j - 1]++; } else break; } } - py::UniqueObj obj{ py::buildPyValue(inst->getTDFBatch(mds.data(), cat, num.size(), tot, !!normalize)) }; - PyArray_Dims dims; - num.emplace_back(inst->getK()); - dims.ptr = num.data(); - dims.len = num.size(); - return PyArray_Newshape((PyArrayObject*)obj.get(), &dims, NPY_CORDER); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + try + { + py::UniqueObj obj{ py::buildPyValue(inst->getTDFBatch(mds.data(), metadata, {}, num.size(), tot, !!normalize)) }; + PyArray_Dims dims; + num.emplace_back(inst->getK()); + dims.ptr = num.data(); + dims.len = num.size(); + return PyArray_Newshape((PyArrayObject*)obj.get(), &dims, NPY_CORDER); + } + catch (const tomoto::exc::InvalidArgument& e) + { + throw py::ValueError{ e.what() }; + } + }); } static PyObject* GDMR_getMetadataRange(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + 
if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); vector vMin, vMax; inst->getMdRange(vMin, vMax); @@ -265,23 +254,24 @@ static PyObject* GDMR_getMetadataRange(TopicModelObject* self, void* closure) ret.emplace_back(vMin[i], vMax[i]); } return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) + }); +} + + +static PyObject* GDMR_getTopicPrior(TopicModelObject* self, PyObject* args, PyObject* kwargs) +{ + return py::handleExc([&]() -> PyObject* { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + throw py::RuntimeError{ "GDMRModel doesn't support get_topic_prior(). Use tdf() instead." }; + }); } + DEFINE_GETTER(tomoto::IGDMRModel, GDMR, getSigma0); DEFINE_GETTER(tomoto::IGDMRModel, GDMR, getOrderDecay); DEFINE_GETTER(tomoto::IGDMRModel, GDMR, getFs); -DEFINE_DOCUMENT_GETTER_WITHOUT_EXC(tomoto::DocumentGDMR, numeric_metadata, metadataOrg); +DEFINE_DOCUMENT_GETTER(tomoto::DocumentGDMR, numericMetadata, metadataOrg); DEFINE_LOADER(GDMR, GDMR_type); @@ -291,6 +281,7 @@ static PyMethodDef GDMR_methods[] = { "make_doc", (PyCFunction)GDMR_makeDoc, METH_VARARGS | METH_KEYWORDS, GDMR_make_doc__doc__ }, { "load", (PyCFunction)GDMR_load, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_load__doc__ }, { "loads", (PyCFunction)GDMR_loads, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_loads__doc__ }, + { "get_topic_prior", (PyCFunction)GDMR_getTopicPrior, METH_VARARGS | METH_KEYWORDS, DMR_get_topic_prior__doc__ }, { "tdf", (PyCFunction)GDMR_tdf, METH_VARARGS | METH_KEYWORDS, GDMR_tdf__doc__ }, { "tdf_linspace", (PyCFunction)GDMR_tdfLinspace, METH_VARARGS | METH_KEYWORDS, GDMR_tdf_linspace__doc__ }, { nullptr } diff --git a/src/python/py_HDP.cpp b/src/python/py_HDP.cpp index efdafed..cf0be00 100644 --- a/src/python/py_HDP.cpp +++ b/src/python/py_HDP.cpp @@ -14,10 +14,10 @@ static int HDP_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) 
"seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnfffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &margs.alpha[0], &margs.eta, &margs.gamma, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { tomoto::ITopicModel* inst = tomoto::IHDPModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -30,15 +30,7 @@ static int HDP_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* HDP_isLiveTopic(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -46,27 +38,14 @@ static PyObject* HDP_isLiveTopic(TopicModelObject* self, PyObject* args, PyObjec size_t topicId; static const char* kwlist[] = { "topic_id", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", (char**)kwlist, &topicId)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK()) throw runtime_error{ "must topic_id < K" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < K" }; + return py::buildPyValue(inst->isLiveTopic(topicId)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* HDP_convertToLDA(TopicModelObject* self, 
PyObject* args, PyObject *kwargs) @@ -74,9 +53,9 @@ static PyObject* HDP_convertToLDA(TopicModelObject* self, PyObject* args, PyObje float topicThreshold = 0; static const char* kwlist[] = { "topic_threshold", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|f", (char**)kwlist, &topicThreshold)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto inst = static_cast(self->inst); std::vector newK; auto lda = inst->convertToLDA(topicThreshold, newK); @@ -89,36 +68,19 @@ static PyObject* HDP_convertToLDA(TopicModelObject* self, PyObject* args, PyObje ret->minWordDf = self->minWordDf; ret->removeTopWord = self->removeTopWord; return Py_BuildValue("(NN)", r.release(), py::buildPyValue(newK, py::cast_to_signed)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* Document_HDP_Z(DocumentObject* self, void* closure) { - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->numTopicByTable[x].topic; }); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->numTopicByTable[x].topic; }); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->numTopicByTable[x].topic; }); - } while (0); - return nullptr; + return docVisit(self->getBoundDoc(), [](auto* doc) + { + return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](tomoto::Tid x) -> int16_t + { + if (x == tomoto::non_topic_id) return -1; + return doc->numTopicByTable[x].topic; + }); + }); } diff --git a/src/python/py_HLDA.cpp b/src/python/py_HLDA.cpp index 
f2e03c3..f7b01b2 100644 --- a/src/python/py_HLDA.cpp +++ b/src/python/py_HLDA.cpp @@ -15,14 +15,14 @@ static int HLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &objAlpha, &margs.eta, &margs.gamma, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `depth` (given " + py::repr(objAlpha) + ")"; } ); tomoto::ITopicModel* inst = tomoto::IHLDAModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -35,15 +35,7 @@ static int HLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } #define DEFINE_HLDA_TOPIC_METH(NAME) \ @@ -52,52 +44,35 @@ static PyObject* HLDA_##NAME(TopicModelObject* self, PyObject* args, PyObject *k size_t topicId;\ static const char* kwlist[] = { "topic_id", nullptr };\ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", (char**)kwlist, &topicId)) return nullptr;\ - try\ + return py::handleExc([&]()\ {\ - if (!self->inst) throw runtime_error{ "inst is null" };\ + if (!self->inst) throw py::RuntimeError{ "inst is null" };\ auto* inst = static_cast(self->inst);\ - if (topicId >= inst->getK()) throw runtime_error{ "must topic_id < K" };\ - if (!self->isPrepared) throw runtime_error{ "train() should be called first" };\ + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < K" };\ + if 
(!self->isPrepared) throw py::RuntimeError{ "train() should be called first" };\ return py::buildPyValue(inst->NAME(topicId));\ - }\ - catch (const bad_exception&)\ - {\ - return nullptr;\ - }\ - catch (const exception& e)\ - {\ - PyErr_SetString(PyExc_Exception, e.what());\ - return nullptr;\ - }\ + });\ } PyObject* Document_HLDA_Z(DocumentObject* self, void* closure) { - do + return docVisit(self->getBoundDoc(), [](auto* doc) { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->path[x]; }); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->path[x]; }); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->path[x]; }); - } while (0); - return nullptr; + return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](tomoto::Tid x) -> int16_t + { + if (x == tomoto::non_topic_id) return -1; + return doc->path[x]; + }); + }); } PyObject* HLDA_getAlpha(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); vector ret; for (size_t i = 0; i < inst->getLevelDepth(); ++i) @@ -105,16 +80,7 @@ PyObject* HLDA_getAlpha(TopicModelObject* self, void* closure) ret.emplace_back(inst->getAlpha(i)); } return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } diff --git a/src/python/py_HPA.cpp b/src/python/py_HPA.cpp index ec60d6f..c396629 100644 --- a/src/python/py_HPA.cpp +++ b/src/python/py_HPA.cpp @@ -15,7 +15,7 @@ static int HPA_init(TopicModelObject *self, 
PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnOOfnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &margs.k2, &objAlpha, &objSubAlpha, &margs.eta, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k + 1, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k1 + 1` (given " + py::repr(objAlpha) + ")"; } @@ -24,9 +24,9 @@ static int HPA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) if (objSubAlpha) margs.subalpha = broadcastObj(objSubAlpha, margs.k2 + 1, [=]() { return "`subalpha` must be an instance of `float` or `List[float]` with length `k2 + 1` (given " + py::repr(objSubAlpha) + ")"; } ); - tomoto::ITopicModel* inst = tomoto::IHPAModel::create((tomoto::TermWeight)tw, + tomoto::ITopicModel* inst = tomoto::IHPAModel::create((tomoto::TermWeight)tw, false, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -39,15 +39,7 @@ static int HPA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* HPA_getTopicWords(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -55,27 +47,14 @@ static PyObject* HPA_getTopicWords(TopicModelObject* self, PyObject* args, PyObj size_t topicId, topN = 10; static const char* kwlist[] = { "topic_id", "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|n", (char**)kwlist, &topicId, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if 
(!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId > inst->getK() + inst->getK2()) throw runtime_error{ "must topic_id < 1 + K1 + K2" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId > inst->getK() + inst->getK2()) throw py::ValueError{ "must topic_id < 1 + K1 + K2" }; + return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* HPA_getTopicWordDist(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -83,37 +62,25 @@ static PyObject* HPA_getTopicWordDist(TopicModelObject* self, PyObject* args, Py size_t topicId, normalize = 1; static const char* kwlist[] = { "topic_id", "normalize", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|p", (char**)kwlist, &topicId, &normalize)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId > inst->getK() + inst->getK2()) throw runtime_error{ "must topic_id < 1 + K1 + K2" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId > inst->getK() + inst->getK2()) throw py::ValueError{ "must topic_id < 1 + K1 + K2" }; + return py::buildPyValue(inst->getWidsByTopic(topicId, !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } + DEFINE_LOADER(HPA, HPA_type); PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs); static PyObject* 
HPA_getAlpha(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); npy_intp shapes[1] = { (npy_intp)inst->getK() + 1 }; PyObject* ret = PyArray_EMPTY(1, shapes, NPY_FLOAT, 0); @@ -122,23 +89,14 @@ static PyObject* HPA_getAlpha(TopicModelObject* self, void* closure) *(float*)PyArray_GETPTR1((PyArrayObject*)ret, i) = inst->getAlpha(i); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* HPA_getSubalpha(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); npy_intp shapes[2] = { (npy_intp)inst->getK(), (npy_intp)inst->getK2() + 1 }; PyObject* ret = PyArray_EMPTY(2, shapes, NPY_FLOAT, 0); @@ -148,16 +106,7 @@ static PyObject* HPA_getSubalpha(TopicModelObject* self, void* closure) memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), l.data(), sizeof(float) * l.size()); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyMethodDef HPA_methods[] = diff --git a/src/python/py_LDA.cpp b/src/python/py_LDA.cpp index 3bbb545..b521d48 100644 --- a/src/python/py_LDA.cpp +++ b/src/python/py_LDA.cpp @@ -19,14 +19,14 @@ static int LDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOfnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &objAlpha, &margs.eta, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return 
py::handleExc([&]() { - if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, + if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } ); tomoto::ITopicModel* inst = tomoto::ILDAModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -39,15 +39,7 @@ static int LDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* LDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -55,25 +47,19 @@ static PyObject* LDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kw PyObject *argWords; static const char* kwlist[] = { "words", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &argWords)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const 
exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_addCorpus(TopicModelObject* self, PyObject* args, PyObject* kwargs) @@ -81,11 +67,11 @@ static PyObject* LDA_addCorpus(TopicModelObject* self, PyObject* args, PyObject* PyObject* corpus, *transform = nullptr; static const char* kwlist[] = { "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &corpus, &transform)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_corpus() after train()" }; - if (!PyObject_TypeCheck(corpus, &UtilsCorpus_type)) throw runtime_error{ "`corpus` must be an instance of `tomotopy.utils.Corpus`" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_corpus() after train()" }; + if (!PyObject_TypeCheck(corpus, &UtilsCorpus_type)) throw py::ValueError{ "`corpus` must be an instance of `tomotopy.utils.Corpus`" }; py::UniqueObj _corpusRet{ PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self, nullptr) }; CorpusObject* corpusRet = (CorpusObject*)_corpusRet.get(); corpusRet->docIdcs = insertCorpus(self, corpus, transform); @@ -94,15 +80,7 @@ static PyObject* LDA_addCorpus(TopicModelObject* self, PyObject* args, PyObject* corpusRet->invmap.emplace(self->inst->getDoc(corpusRet->docIdcs[i])->docUid, i); } return _corpusRet.release(); - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } static DocumentObject* LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -110,11 +88,14 @@ static DocumentObject* LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObj PyObject *argWords = nullptr; static const char* kwlist[] = { "words", nullptr }; if 
(!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &argWords)) return nullptr; - try + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); auto doc = inst->makeDoc(raw); py::UniqueObj corpus{ PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self, nullptr) }; @@ -122,16 +103,7 @@ static DocumentObject* LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObj ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* LDA_setWordPrior(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -140,24 +112,15 @@ PyObject* LDA_setWordPrior(TopicModelObject* self, PyObject* args, PyObject *kwa PyObject* prior; static const char* kwlist[] = { "word", "prior", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "sO", (char**)kwlist, &word, &prior)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot set_word_prior() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot set_word_prior() after train()" }; auto* inst = static_cast(self->inst); inst->setWordPrior(word, py::toCpp>(prior, "`prior` must be a list of floats with len = k")); Py_INCREF(Py_None); return Py_None; - } - catch (const bad_exception&) - { - return nullptr; - } - catch 
(const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* LDA_getWordPrior(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -165,21 +128,12 @@ PyObject* LDA_getWordPrior(TopicModelObject* self, PyObject* args, PyObject *kwa const char* word; static const char* kwlist[] = { "word", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s", (char**)kwlist, &word)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); return py::buildPyValue(inst->getWordPrior(word)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -187,9 +141,9 @@ static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwa size_t iteration = 10, workers = 0, ps = 0, fixed = 0; static const char* kwlist[] = { "iter", "workers", "parallel", "freeze_topics", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnp", (char**)kwlist, &iteration, &workers, &ps, &fixed)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); if (!self->isPrepared) { @@ -199,16 +153,7 @@ static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwa inst->train(iteration, workers, (tomoto::ParallelScheme)ps, !!fixed); Py_INCREF(Py_None); return Py_None; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* LDA_getTopicWords(TopicModelObject* self, PyObject* args, PyObject 
*kwargs) @@ -216,27 +161,14 @@ PyObject* LDA_getTopicWords(TopicModelObject* self, PyObject* args, PyObject *kw size_t topicId, topN = 10; static const char* kwlist[] = { "topic_id", "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|n", (char**)kwlist, &topicId, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK()) throw runtime_error{"must topic_id < K"}; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < K" }; + return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_getTopicWordDist(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -244,27 +176,14 @@ static PyObject* LDA_getTopicWordDist(TopicModelObject* self, PyObject* args, Py size_t topicId, normalize = 1; static const char* kwlist[] = { "topic_id", "normalize", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|p", (char**)kwlist, &topicId, &normalize)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK()) throw runtime_error{ "must topic_id < K" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < K" }; + return py::buildPyValue(inst->getWidsByTopic(topicId, 
!!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -275,10 +194,10 @@ PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) static const char* kwlist[] = { "doc", "iter", "tolerance", "workers", "parallel", "together", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nfnnpO", (char**)kwlist, &argDoc, &iteration, &tolerance, &workers, &ps, &together, &argTransform)) return nullptr; DEBUG_LOG("infer " << self->ob_base.ob_type << ", " << self->ob_base.ob_refcnt); - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (!self->isPrepared) throw runtime_error{ "cannot infer with untrained model" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (!self->isPrepared) throw py::RuntimeError{ "cannot infer with untrained model" }; py::UniqueObj iter; if (PyObject_TypeCheck(argDoc, &UtilsCorpus_type)) { @@ -291,7 +210,7 @@ PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) else if (PyObject_TypeCheck(argDoc, &UtilsDocument_type)) { auto* doc = (DocumentObject*)argDoc; - if (doc->corpus->tm != self) throw runtime_error{ "`doc` was from another model, not fit to this model" }; + if (doc->corpus->tm != self) throw py::ValueError{ "`doc` was from another model, not fit to this model" }; if (doc->owner) { std::vector docs; @@ -310,12 +229,12 @@ PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) py::UniqueObj item; while ((item = py::UniqueObj{ PyIter_Next(iter) })) { - if (!PyObject_TypeCheck(item, &UtilsDocument_type)) throw runtime_error{ "`doc` must be tomotopy.Document type or list of tomotopy.Document" }; + if (!PyObject_TypeCheck(item, &UtilsDocument_type)) throw py::ValueError{ "`doc` must be 
tomotopy.Document type or list of tomotopy.Document" }; auto* doc = (DocumentObject*)item.get(); - if (doc->corpus->tm != self) throw runtime_error{ "`doc` was from another model, not fit to this model" }; + if (doc->corpus->tm != self) throw py::ValueError{ "`doc` was from another model, not fit to this model" }; docs.emplace_back((tomoto::DocumentBase*)doc->doc); } - if (PyErr_Occurred()) throw bad_exception{}; + if (PyErr_Occurred()) throw py::ExcPropagation{}; auto ll = self->inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together); PyObject* ret = PyList_New(docs.size()); size_t i = 0; @@ -334,18 +253,9 @@ PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) } else { - throw runtime_error{ "`doc` must be tomotopy.Document type or list of tomotopy.Document" }; + throw py::ValueError{ "`doc` must be tomotopy.Document type or list of tomotopy.Document" }; } - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_save(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -354,11 +264,11 @@ static PyObject* LDA_save(TopicModelObject* self, PyObject* args, PyObject *kwar size_t full = 1; static const char* kwlist[] = { "filename", "full", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|p", (char**)kwlist, &filename, &full)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; ofstream str{ filename, ios_base::binary }; - if (!str) throw runtime_error{ std::string("cannot open file '") + filename + std::string("'") }; + if (!str) throw py::OSError{ std::string("cannot open file '") + filename + std::string("'") }; vector extra_data; { @@ -379,16 +289,7 @@ static PyObject* LDA_save(TopicModelObject* self, PyObject* args, PyObject *kwar 
self->inst->saveModel(str, !!full, &extra_data); Py_INCREF(Py_None); return Py_None; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_saves(TopicModelObject* self, PyObject* args, PyObject* kwargs) @@ -396,9 +297,9 @@ static PyObject* LDA_saves(TopicModelObject* self, PyObject* args, PyObject* kwa size_t full = 1; static const char* kwlist[] = { "full", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|p", (char**)kwlist, &full)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; ostringstream str; vector extra_data; @@ -419,15 +320,7 @@ static PyObject* LDA_saves(TopicModelObject* self, PyObject* args, PyObject* kwa self->inst->saveModel(str, !!full, &extra_data); return PyBytes_FromStringAndSize(str.str().data(), str.str().size()); - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } static PyObject* LDA_update_vocab(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -435,153 +328,87 @@ static PyObject* LDA_update_vocab(TopicModelObject* self, PyObject* args, PyObje PyObject* objWords; static const char* kwlist[] = { "words", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &objWords)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; self->inst->updateVocab(py::toCpp>(objWords, "`words` must be an iterable of str")); Py_INCREF(Py_None); return Py_None; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static 
CorpusObject* LDA_getDocs(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; py::UniqueObj args{ py::buildPyTuple((PyObject*)self) }; auto ret = (CorpusObject*)PyObject_CallObject((PyObject*)&UtilsCorpus_type, args); return ret; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } static VocabObject* LDA_getVocabs(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* ret = (VocabObject*)PyObject_CallObject((PyObject*)&UtilsVocab_type, nullptr); ret->dep = (PyObject*)self; Py_INCREF(ret->dep); ret->vocabs = (tomoto::Dictionary*)&self->inst->getVocabDict(); ret->size = self->inst->getVocabDict().size(); return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static VocabObject* LDA_getUsedVocabs(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* ret = (VocabObject*)PyObject_CallObject((PyObject*)&UtilsVocab_type, nullptr); ret->dep = (PyObject*)self; Py_INCREF(ret->dep); ret->vocabs = (tomoto::Dictionary*)&self->inst->getVocabDict(); ret->size = self->inst->getV(); return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_getUsedVocabCf(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is 
null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; return py::buildPyValue(self->inst->getVocabCf().begin(), self->inst->getVocabCf().begin() + self->inst->getV()); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_getUsedVocabDf(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; return py::buildPyValue(self->inst->getVocabDf().begin(), self->inst->getVocabDf().begin() + self->inst->getV()); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_getCountByTopics(TopicModelObject* self) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + return py::buildPyValue(inst->getCountByTopic()); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* LDA_getAlpha(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); vector ret; for (size_t i = 0; i < inst->getK(); ++i) @@ -589,29 +416,16 @@ PyObject* LDA_getAlpha(TopicModelObject* self, void* closure) ret.emplace_back(inst->getAlpha(i)); } return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - 
return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_getRemovedTopWords(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + vector ret; size_t last = inst->getVocabDict().size(); for (size_t rmV = last - self->removeTopWord; rmV < last; ++rmV) @@ -619,16 +433,7 @@ static PyObject* LDA_getRemovedTopWords(TopicModelObject* self, void* closure) ret.emplace_back(inst->getVocabDict().toWord(rmV)); } return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* LDA_summary(TopicModelObject* self, PyObject* args, PyObject* kwargs) @@ -641,32 +446,45 @@ static PyObject* LDA_summary(TopicModelObject* self, PyObject* args, PyObject* k static const char* kwlist[] = { "initial_hp", "params", "topic_word_top_n", "file", "flush", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOOOO", (char**)kwlist, &argInitialHP, &argParams, &argTopicWordTopN, &argFile, &argFlush)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; py::UniqueObj mod{ PyImport_ImportModule("tomotopy._summary") }; - if (!mod) throw bad_exception{}; + if (!mod) throw py::ExcPropagation{}; PyObject* mod_dict = PyModule_GetDict(mod); - if (!mod_dict) throw bad_exception{}; + if (!mod_dict) throw py::ExcPropagation{}; PyObject* summary_func = PyDict_GetItemString(mod_dict, "summary"); - if (!summary_func) throw 
bad_exception{}; + if (!summary_func) throw py::ExcPropagation{}; py::UniqueObj args{ Py_BuildValue("(O)", self) }; py::UniqueObj kwargs{ py::buildPyDictSkipNull(kwlist, argInitialHP, argParams, argTopicWordTopN, argFile, argFlush ) }; return PyObject_Call(summary_func, args, kwargs); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); +} + +static PyObject* LDA_copy(TopicModelObject* self) +{ + return py::handleExc([&]() + { + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + + py::UniqueObj type{ PyObject_Type((PyObject*)self) }; + py::UniqueObj ret{ PyObject_CallFunctionObjArgs(type, nullptr) }; + + auto* obj = (TopicModelObject*)ret.get(); + delete obj->inst; + obj->inst = self->inst->copy().release(); + obj->isPrepared = self->isPrepared; + obj->minWordCnt = self->minWordCnt; + obj->minWordDf = self->minWordDf; + obj->removeTopWord = self->removeTopWord; + obj->initParams = self->initParams; + Py_INCREF(obj->initParams); + return ret.release(); + }); } DEFINE_GETTER(tomoto::ILDAModel, LDA, getK); @@ -800,58 +618,27 @@ PyObject* LDA_loads(PyObject*, PyObject* args, PyObject *kwargs) PyObject* Document_LDA_Z(DocumentObject* self, void* closure) { - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder); - } while (0); - return nullptr; + return docVisit(self->getBoundDoc(), [](auto* doc) + { + return buildPyValueReorder(doc->Zs, doc->wOrder, [](tomoto::Tid x) -> int16_t { return x; }); + }); } PyObject* Document_getCountVector(DocumentObject* self) { - try + return py::handleExc([&]() { - if 
(self->corpus->isIndependent()) throw runtime_error{ "This method can only be called by documents bound to the topic model." }; + if (self->corpus->isIndependent()) throw py::AttributeError{ "This method can only be called by documents bound to the topic model." }; if (!self->corpus->tm->inst) throw runtime_error{ "inst is null" }; size_t v = self->corpus->tm->inst->getV(); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(doc->getCountVector(v)); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(doc->getCountVector(v)); - } while (0); - do + + if (auto* ret = docVisit(self->getBoundDoc(), [&](auto* doc) { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return py::buildPyValue(doc->getCountVector(v)); - } while (0); - - throw runtime_error{ "cannot get count vector" }; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + return py::buildPyValue(doc->getCountVector(v)); + })) return ret; + + throw py::AttributeError{ "cannot get count vector" }; + }); } PyObject* LDA_getInitParams(TopicModelObject* self) @@ -875,6 +662,7 @@ static PyMethodDef LDA_methods[] = { "saves", (PyCFunction)LDA_saves, METH_VARARGS | METH_KEYWORDS, LDA_saves__doc__}, { "load", (PyCFunction)LDA_load, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_load__doc__}, { "loads", (PyCFunction)LDA_loads, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_loads__doc__}, + { "copy", (PyCFunction)LDA_copy, METH_NOARGS, LDA_copy__doc__}, { "_update_vocab", (PyCFunction)LDA_update_vocab, METH_VARARGS | METH_KEYWORDS, ""}, { "summary", (PyCFunction)LDA_summary, METH_VARARGS | METH_KEYWORDS, LDA_summary__doc__}, { nullptr } diff --git a/src/python/py_LLDA.cpp b/src/python/py_LLDA.cpp index 71476a1..51f0974 100644 --- a/src/python/py_LLDA.cpp +++ b/src/python/py_LLDA.cpp @@ -22,14 +22,17 
@@ static int LLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOfnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &objAlpha, &margs.eta, &margs.seed, &objCorpus, &objTransform)) return -1; - try + + if (PyErr_WarnEx(PyExc_DeprecationWarning, "`tomotopy.LLDAModel` is deprecated. Please use `tomotopy.PLDAModel` instead.", 1)) return -1; + + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } ); tomoto::ITopicModel* inst = tomoto::ILLDAModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -42,15 +45,7 @@ static int LLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* LLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -58,30 +53,27 @@ static PyObject* LLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; 
auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); - if(argLabels) + if (argLabels) { - if (PyUnicode_Check(argLabels)) PRINT_WARN_ONCE("[warn] `labels` should be an iterable of str."); + if (PyUnicode_Check(argLabels)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`labels` should be an iterable of str.", 1)) return nullptr; + } raw.misc["labels"] = py::toCpp>(argLabels, "`labels` must be an iterable of str."); } auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static DocumentObject* LLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -89,16 +81,22 @@ static DocumentObject* LLDA_makeDoc(TopicModelObject* self, PyObject* args, PyOb PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; - try + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); if (argLabels) { - if (PyUnicode_Check(argLabels)) PRINT_WARN_ONCE("[warn] `labels` should be an iterable of str."); + if (PyUnicode_Check(argLabels)) + { + if 
(PyErr_WarnEx(PyExc_RuntimeWarning, "`labels` should be an iterable of str.", 1)) return nullptr; + } raw.misc["labels"] = py::toCpp>(argLabels, "`labels` must be an iterable of str."); } auto doc = inst->makeDoc(raw); @@ -107,90 +105,49 @@ static DocumentObject* LLDA_makeDoc(TopicModelObject* self, PyObject* args, PyOb ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static VocabObject* LLDA_getTopicLabelDict(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* ret = (VocabObject*)PyObject_CallObject((PyObject*)&UtilsVocab_type, nullptr); ret->dep = (PyObject*)self; Py_INCREF(ret->dep); ret->vocabs = (tomoto::Dictionary*)&static_cast(self->inst)->getTopicLabelDict(); ret->size = -1; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* Document_labels(DocumentObject* self, void* closure) { - auto makeReturn = [&](const tomoto::DocumentBase* doc, const Eigen::Matrix& labelMask) + return py::handleExc([&]() { - auto inst = dynamic_cast(self->corpus->tm->inst); - auto dict = inst->getTopicLabelDict(); - vector>> ret; - auto topicDist = inst->getTopicsByDoc(doc); - for (size_t i = 0; i < dict.size(); ++i) + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `labels` field!" }; + if (!self->doc) throw py::RuntimeError{ "doc is null!" 
}; + + if (auto* ret = docVisit(self->getBoundDoc(), [&](auto* doc) { - if (labelMask[i * inst->getNumTopicsPerLabel()]) + auto inst = dynamic_cast(self->corpus->tm->inst); + auto dict = inst->getTopicLabelDict(); + vector>> ret; + auto topicDist = inst->getTopicsByDoc(doc); + for (size_t i = 0; i < dict.size(); ++i) { - ret.emplace_back(inst->getTopicLabelDict().toWord(i), - vector{ &topicDist[i * inst->getNumTopicsPerLabel()], &topicDist[(i + 1) * inst->getNumTopicsPerLabel()] }); + if (doc->labelMask[i * inst->getNumTopicsPerLabel()]) + { + ret.emplace_back(inst->getTopicLabelDict().toWord(i), + vector{ &topicDist[i * inst->getNumTopicsPerLabel()], &topicDist[(i + 1) * inst->getNumTopicsPerLabel()] }); + } } - } - return py::buildPyValue(ret); - }; - - try - { - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `labels` field!" }; - if (!self->doc) throw runtime_error{ "doc is null!" }; - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return makeReturn(doc, doc->labelMask); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return makeReturn(doc, doc->labelMask); - } while (0); - do - { - auto* doc = dynamic_cast*>(self->getBoundDoc()); - if (doc) return makeReturn(doc, doc->labelMask); - } while (0); - throw runtime_error{ "doc doesn't has `labels` field!" }; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_AttributeError, e.what()); - return nullptr; - } + return py::buildPyValue(ret); + })) return ret; + + throw py::AttributeError{ "doc doesn't has `labels` field!" 
}; + }); } DEFINE_LOADER(LLDA, LLDA_type); diff --git a/src/python/py_MGLDA.cpp b/src/python/py_MGLDA.cpp index 26214b3..8d3f471 100644 --- a/src/python/py_MGLDA.cpp +++ b/src/python/py_MGLDA.cpp @@ -22,33 +22,25 @@ static int MGLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnnfffffffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &margs.kL, &margs.t, &margs.alpha[0], &margs.alphaL[0], &margs.alphaMG, &margs.alphaML, &margs.eta, &margs.etaL, &margs.gamma, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { tomoto::ITopicModel* inst = tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; self->minWordDf = minDf; self->removeTopWord = rmTop; self->initParams = py::buildPyDict(kwlist, - tw, minCnt, minDf, rmTop, - margs.k, margs.kL, margs.t, margs.alpha[0], margs.alphaL[0], + tw, minCnt, minDf, rmTop, + margs.k, margs.kL, margs.t, margs.alpha[0], margs.alphaL[0], margs.alphaMG, margs.alphaML, margs.eta, margs.etaL, margs.gamma, margs.seed ); py::setPyDictItem(self->initParams, "version", getVersion()); insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* MGLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -57,26 +49,20 @@ static PyObject* MGLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject * const char* delimiter = "."; static const char* kwlist[] = { "words", "delimiter", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &delimiter)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) 
throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); raw.misc["delimiter"] = delimiter; auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static DocumentObject* MGLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -85,11 +71,14 @@ static DocumentObject* MGLDA_makeDoc(TopicModelObject* self, PyObject* args, PyO const char* delimiter = "."; static const char* kwlist[] = { "words", "delimiter", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &delimiter)) return nullptr; - try + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); raw.misc["delimiter"] = delimiter; auto doc = inst->makeDoc(raw); @@ -98,16 +87,7 @@ static DocumentObject* MGLDA_makeDoc(TopicModelObject* self, PyObject* args, PyO ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const 
bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* MGLDA_getTopicWords(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -115,27 +95,14 @@ static PyObject* MGLDA_getTopicWords(TopicModelObject* self, PyObject* args, PyO size_t topicId, topN = 10; static const char* kwlist[] = { "topic_id", "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|n", (char**)kwlist, &topicId, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK() + inst->getKL()) throw runtime_error{ "must topic_id < KG + KL" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK() + inst->getKL()) throw py::ValueError{ "must topic_id < KG + KL" }; + return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* MGLDA_getTopicWordDist(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -143,27 +110,14 @@ static PyObject* MGLDA_getTopicWordDist(TopicModelObject* self, PyObject* args, size_t topicId, normalize = 1; static const char* kwlist[] = { "topic_id", "normalize", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|p", (char**)kwlist, &topicId, &normalize)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK() + inst->getKL()) throw runtime_error{ 
"must topic_id < KG + KL" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK() + inst->getKL()) throw py::ValueError{ "must topic_id < KG + KL" }; + return py::buildPyValue(inst->getWidsByTopic(topicId, !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } DEFINE_GETTER(tomoto::IMGLDAModel, MGLDA, getKL); @@ -243,4 +197,4 @@ TopicModelTypeObject MGLDA_type = { { (initproc)MGLDA_init, /* tp_init */ PyType_GenericAlloc, PyType_GenericNew, -}, MGLDA_misc_args }; \ No newline at end of file +}, MGLDA_misc_args }; diff --git a/src/python/py_PA.cpp b/src/python/py_PA.cpp index 10d2b83..b00fac8 100644 --- a/src/python/py_PA.cpp +++ b/src/python/py_PA.cpp @@ -16,7 +16,7 @@ static int PA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnOOfnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &margs.k2, &objAlpha, &objSubAlpha, &margs.eta, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k1` (given " + py::repr(objAlpha) + ")"; } @@ -27,7 +27,7 @@ static int PA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) ); tomoto::ITopicModel* inst = tomoto::IPAModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; @@ -40,15 +40,7 @@ static int PA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, 
objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* PA_getSubTopicDist(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -56,27 +48,14 @@ static PyObject* PA_getSubTopicDist(TopicModelObject* self, PyObject* args, PyOb size_t topicId, normalize = 1; static const char* kwlist[] = { "super_topic_id", "normalize", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|p", (char**)kwlist, &topicId, &normalize)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK()) throw runtime_error{ "must topic_id < k1" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < k1" }; + return py::buildPyValue(inst->getSubTopicBySuperTopic(topicId, !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* PA_getSubTopics(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -84,28 +63,14 @@ static PyObject* PA_getSubTopics(TopicModelObject* self, PyObject* args, PyObjec size_t topicId, topN = 10; static const char* kwlist[] = { "super_topic_id", "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|n", (char**)kwlist, &topicId, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK()) throw runtime_error{ "must topic_id < k1" }; - /*if 
(!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ - return py::buildPyValue(inst->getSubTopicBySuperTopicSorted(topicId, topN)); + if (topicId >= inst->getK()) throw py::ValueError{ "must topic_id < k1" }; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + return py::buildPyValue(inst->getSubTopicBySuperTopicSorted(topicId, topN)); + }); } static PyObject* PA_getTopicWords(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -113,27 +78,14 @@ static PyObject* PA_getTopicWords(TopicModelObject* self, PyObject* args, PyObje size_t topicId, topN = 10; static const char* kwlist[] = { "sub_topic_id", "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|n", (char**)kwlist, &topicId, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK2()) throw runtime_error{ "must topic_id < k2" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK2()) throw py::ValueError{ "must topic_id < k2" }; + return py::buildPyValue(inst->getWordsByTopicSorted(topicId, topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* PA_getTopicWordDist(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -141,27 +93,14 @@ static PyObject* PA_getTopicWordDist(TopicModelObject* self, PyObject* args, PyO size_t topicId, normalize = 1; static const char* kwlist[] = { "sub_topic_id", "normalize", nullptr }; if 
(!PyArg_ParseTupleAndKeywords(args, kwargs, "n|p", (char**)kwlist, &topicId, &normalize)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (topicId >= inst->getK2()) throw runtime_error{ "must topic_id < k2" }; - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + if (topicId >= inst->getK2()) throw py::ValueError{ "must topic_id < k2" }; + return py::buildPyValue(inst->getWidsByTopic(topicId, !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } @@ -170,23 +109,14 @@ PyObject* Document_getSubTopics(DocumentObject* self, PyObject* args, PyObject * size_t topN = 10; static const char* kwlist[] = { "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", (char**)kwlist, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "This method can only be called by documents bound to the topic model." }; - if (!self->corpus->tm->inst) throw runtime_error{ "inst is null" }; + if (self->corpus->isIndependent()) throw py::AttributeError{ "This method can only be called by documents bound to the topic model." 
}; + if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->corpus->tm->inst); - if (!self->corpus->tm->isPrepared) throw runtime_error{ "train() should be called first for calculating the topic distribution" }; + if (!self->corpus->tm->isPrepared) throw py::RuntimeError{ "train() should be called first for calculating the topic distribution" }; return py::buildPyValue(inst->getSubTopicsByDocSorted(self->getBoundDoc(), topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* Document_getSubTopicDist(DocumentObject* self, PyObject* args, PyObject* kwargs) @@ -194,23 +124,14 @@ PyObject* Document_getSubTopicDist(DocumentObject* self, PyObject* args, PyObjec size_t normalize = 1; static const char* kwlist[] = { "normalize", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|p", (char**)kwlist, &normalize)) return nullptr; - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "This method can only be called by documents bound to the topic model." }; - if (!self->corpus->tm->inst) throw runtime_error{ "inst is null" }; + if (self->corpus->isIndependent()) throw py::AttributeError{ "This method can only be called by documents bound to the topic model." 
}; + if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->corpus->tm->inst); - if (!self->corpus->tm->isPrepared) throw runtime_error{ "train() should be called first for calculating the topic distribution" }; + if (!self->corpus->tm->isPrepared) throw py::RuntimeError{ "train() should be called first for calculating the topic distribution" }; return py::buildPyValue(inst->getSubTopicsByDoc(self->getBoundDoc(), !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* PA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -221,10 +142,10 @@ static PyObject* PA_infer(TopicModelObject* self, PyObject* args, PyObject *kwar static const char* kwlist[] = { "doc", "iter", "tolerance", "workers", "parallel", "together", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nfnnpO", (char**)kwlist, &argDoc, &iteration, &tolerance, &workers, &ps, &together, &argTransform)) return nullptr; DEBUG_LOG("infer " << self->ob_base.ob_type << ", " << self->ob_base.ob_refcnt); - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (!self->isPrepared) throw runtime_error{ "cannot infer with untrained model" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (!self->isPrepared) throw py::RuntimeError{ "cannot infer with untrained model" }; auto inst = static_cast(self->inst); py::UniqueObj iter; if (PyObject_TypeCheck(argDoc, &UtilsCorpus_type)) @@ -238,7 +159,7 @@ static PyObject* PA_infer(TopicModelObject* self, PyObject* args, PyObject *kwar else if (PyObject_TypeCheck(argDoc, &UtilsDocument_type)) { auto* doc = (DocumentObject*)argDoc; - if (doc->corpus->tm != self) throw runtime_error{ "'doc' was from another model, not fit to this model" }; + if (doc->corpus->tm != self) throw py::ValueError{ 
"`doc` was from another model, not fit to this model" }; if (doc->owner) { std::vector docs; @@ -259,19 +180,19 @@ static PyObject* PA_infer(TopicModelObject* self, PyObject* args, PyObject *kwar py::UniqueObj item; while ((item = py::UniqueObj{ PyIter_Next(iter) })) { - if (!PyObject_TypeCheck(item, &UtilsDocument_type)) throw runtime_error{ "`doc` must be tomotopy.Document type or list of tomotopy.Document" }; + if (!PyObject_TypeCheck(item, &UtilsDocument_type)) throw py::ValueError{ "`doc` must be tomotopy.Document type or list of tomotopy.Document" }; auto* doc = (DocumentObject*)item.get(); - if (doc->corpus->tm != self) throw runtime_error{ "`doc` was from another model, not fit to this model" }; + if (doc->corpus->tm != self) throw py::ValueError{ "`doc` was from another model, not fit to this model" }; docs.emplace_back((tomoto::DocumentBase*)doc->getBoundDoc()); } - if (PyErr_Occurred()) throw bad_exception{}; - if (!self->isPrepared) throw runtime_error{ "cannot infer with untrained model" }; + if (PyErr_Occurred()) throw py::ExcPropagation{}; + if (!self->isPrepared) throw py::RuntimeError{ "cannot infer with untrained model" }; auto ll = inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together); PyObject* ret = PyList_New(docs.size()); size_t i = 0; for (auto d : docs) { - PyList_SetItem(ret, i++, Py_BuildValue("(NN)", + PyList_SetItem(ret, i++, Py_BuildValue("(NN)", py::buildPyValue(inst->getTopicsByDoc(d)), py::buildPyValue(inst->getSubTopicsByDoc(d)) )); @@ -287,42 +208,20 @@ static PyObject* PA_infer(TopicModelObject* self, PyObject* args, PyObject *kwar } else { - throw runtime_error{ "'doc' must be tomotopy.Document type or list of tomotopy.Document" }; + throw py::ValueError{ "`doc` must be tomotopy.Document type or list of tomotopy.Document" }; } - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } 
static PyObject* PA_getCountBySuperTopic(TopicModelObject* self) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - /*if (!self->isPrepared) - { - inst->prepare(true, self->minWordCnt, self->minWordDf, self->removeTopWord); - self->isPrepared = true; - }*/ + return py::buildPyValue(inst->getCountBySuperTopic()); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } DEFINE_GETTER(tomoto::IPAModel, PA, getK2); @@ -344,9 +243,9 @@ static PyMethodDef PA_methods[] = static PyObject* PA_getSubalpha(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); npy_intp shapes[2] = { (npy_intp)inst->getK(), (npy_intp)inst->getK2() }; PyObject* ret = PyArray_EMPTY(2, shapes, NPY_FLOAT, 0); @@ -356,16 +255,7 @@ static PyObject* PA_getSubalpha(TopicModelObject* self, void* closure) memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), l.data(), sizeof(float) * l.size()); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyGetSetDef PA_getseters[] = { diff --git a/src/python/py_PLDA.cpp b/src/python/py_PLDA.cpp index 09aee5a..cb0a75d 100644 --- a/src/python/py_PLDA.cpp +++ b/src/python/py_PLDA.cpp @@ -22,36 +22,28 @@ static int PLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnOfnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.numLatentTopics, &margs.numTopicsPerLabel, &objAlpha, &margs.eta, 
&margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } ); tomoto::ITopicModel* inst = tomoto::IPLDAModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; self->minWordDf = minDf; self->removeTopWord = rmTop; self->initParams = py::buildPyDict(kwlist, - tw, minCnt, minDf, rmTop, + tw, minCnt, minDf, rmTop, margs.numLatentTopics, margs.numTopicsPerLabel, margs.alpha, margs.eta, margs.seed ); py::setPyDictItem(self->initParams, "version", getVersion()); insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* PLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -59,31 +51,28 @@ static PyObject* PLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if 
(PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); - if(argLabels) + if (argLabels) { - if (PyUnicode_Check(argLabels)) PRINT_WARN_ONCE("[warn] `labels` should be an iterable of str."); + if (PyUnicode_Check(argLabels)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`labels` should be an iterable of str.", 1)) return nullptr; + } raw.misc["labels"] = py::toCpp>(argLabels, "`labels` must be an iterable of str."); } auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static DocumentObject* PLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -91,16 +80,22 @@ static DocumentObject* PLDA_makeDoc(TopicModelObject* self, PyObject* args, PyOb PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; - try + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); if (argLabels) { - if (PyUnicode_Check(argLabels)) PRINT_WARN_ONCE("[warn] `labels` should be an iterable of str."); + if (PyUnicode_Check(argLabels)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`labels` should be an iterable of str.", 1)) return nullptr; + } raw.misc["labels"] = py::toCpp>(argLabels, "`labels` must be an iterable of str."); } 
auto doc = inst->makeDoc(raw); @@ -109,39 +104,21 @@ static DocumentObject* PLDA_makeDoc(TopicModelObject* self, PyObject* args, PyOb ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static VocabObject* PLDA_getTopicLabelDict(TopicModelObject* self, void* closure) { - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* ret = (VocabObject*)PyObject_CallObject((PyObject*)&UtilsVocab_type, nullptr); ret->dep = (PyObject*)self; Py_INCREF(ret->dep); ret->vocabs = (tomoto::Dictionary*)&static_cast(self->inst)->getTopicLabelDict(); ret->size = -1; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } diff --git a/src/python/py_PT.cpp b/src/python/py_PT.cpp index fa79d50..c4c121c 100644 --- a/src/python/py_PT.cpp +++ b/src/python/py_PT.cpp @@ -16,14 +16,14 @@ static int PT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) "seed", "corpus", "transform", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnOfnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &margs.k, &margs.p, &objAlpha, &margs.eta, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } ); tomoto::ITopicModel* inst = tomoto::IPTModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; 
@@ -36,15 +36,7 @@ static int PT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } DEFINE_GETTER(tomoto::IPTModel, PT, getP); diff --git a/src/python/py_SLDA.cpp b/src/python/py_SLDA.cpp index 7309b8f..18e0ca8 100644 --- a/src/python/py_SLDA.cpp +++ b/src/python/py_SLDA.cpp @@ -26,7 +26,7 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) &tw, &minCnt, &minDf, &rmTop, &margs.k, &vars, &objAlpha, &margs.eta, &mu, &nuSq, &glmCoef, &margs.seed, &objCorpus, &objTransform)) return -1; - try + return py::handleExc([&]() { if (objAlpha) margs.alpha = broadcastObj(objAlpha, margs.k, [=]() { return "`alpha` must be an instance of `float` or `List[float]` with length `k` (given " + py::repr(objAlpha) + ")"; } @@ -41,11 +41,11 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) tomoto::ISLDAModel::GLM t; if (s == "l") t = tomoto::ISLDAModel::GLM::linear; else if (s == "b") t = tomoto::ISLDAModel::GLM::binary_logistic; - else throw runtime_error{ "Unknown var type '" + s + "'" }; + else throw py::ValueError{ "Unknown var type '" + s + "'" }; margs.vars.emplace_back(t); } } - + float fTemp; if (mu) { @@ -87,14 +87,14 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) } tomoto::ITopicModel* inst = tomoto::ISLDAModel::create((tomoto::TermWeight)tw, margs); - if (!inst) throw runtime_error{ "unknown tw value" }; + if (!inst) throw py::ValueError{ "unknown `tw` value" }; self->inst = inst; self->isPrepared = false; self->minWordCnt = minCnt; self->minWordDf = minDf; self->removeTopWord = rmTop; self->initParams = py::buildPyDict(kwlist, - tw, minCnt, minDf, rmTop, + tw, minCnt, minDf, rmTop, margs.k, varTypeStrs, margs.alpha, margs.eta, margs.mu, margs.nuSq, margs.glmParam ); @@ 
-102,15 +102,7 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) insertCorpus(self, objCorpus, objTransform); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } static PyObject* SLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -118,12 +110,15 @@ static PyObject* SLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k PyObject *argWords, *argY = nullptr; static const char* kwlist[] = { "words", "y", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argY)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; - if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; + if (self->isPrepared) throw py::RuntimeError{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); if (argY) @@ -132,16 +127,7 @@ static PyObject* SLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k } auto ret = inst->addDoc(raw); return py::buildPyValue(ret); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static DocumentObject* SLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -149,13 +135,16 @@ static DocumentObject* SLDA_makeDoc(TopicModelObject* self, PyObject* args, PyOb PyObject *argWords, *argY = nullptr; static const char* kwlist[] = { "words", "y", nullptr }; if 
(!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argY)) return nullptr; - try + return py::handleExc([&]() -> DocumentObject* { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (PyUnicode_Check(argWords)) PRINT_WARN_ONCE("[warn] `words` should be an iterable of str."); + if (PyUnicode_Check(argWords)) + { + if (PyErr_WarnEx(PyExc_RuntimeWarning, "`words` should be an iterable of str.", 1)) return nullptr; + } tomoto::RawDoc raw = buildRawDoc(argWords); - + if (argY) { raw.misc["y"] = py::toCpp>(argY, "`y` must be an iterable of float."); @@ -166,16 +155,7 @@ static DocumentObject* SLDA_makeDoc(TopicModelObject* self, PyObject* args, PyOb ret->doc = doc.release(); ret->owner = true; return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* SLDA_getRegressionCoef(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -183,9 +163,9 @@ static PyObject* SLDA_getRegressionCoef(TopicModelObject* self, PyObject* args, PyObject* argVarId = nullptr; static const char* kwlist[] = { "var_id", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", (char**)kwlist, &argVarId)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); if (!argVarId || argVarId == Py_None) { @@ -200,18 +180,9 @@ static PyObject* SLDA_getRegressionCoef(TopicModelObject* self, PyObject* args, } size_t varId = PyLong_AsLong(argVarId); - if (varId >= inst->getF()) throw runtime_error{ "`var_id` must be < `f`" }; + if (varId >= inst->getF()) throw py::ValueError{ "`var_id` must be < `f`" }; return py::buildPyValue(inst->getRegressionCoef(varId)); - } - catch 
(const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* SLDA_getTypeOfVar(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -219,22 +190,13 @@ static PyObject* SLDA_getTypeOfVar(TopicModelObject* self, PyObject* args, PyObj size_t varId; static const char* kwlist[] = { "var_id", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", (char**)kwlist, &varId)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - if (varId >= inst->getF()) throw runtime_error{ "`var_id` must be < `f`" }; + if (varId >= inst->getF()) throw py::ValueError{ "`var_id` must be < `f`" }; return py::buildPyValue(std::string{ "l\0b" + (size_t)inst->getTypeOfVar(varId) * 2 }); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* SLDA_estimateVars(TopicModelObject* self, PyObject* args, PyObject *kwargs) @@ -242,11 +204,11 @@ static PyObject* SLDA_estimateVars(TopicModelObject* self, PyObject* args, PyObj PyObject* argDoc; static const char* kwlist[] = { "doc", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &argDoc)) return nullptr; - try + return py::handleExc([&]() { - if (!self->inst) throw runtime_error{ "inst is null" }; + if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - try + try { if (!PyObject_TypeCheck(argDoc, &UtilsDocument_type)) throw py::ConversionFail{ "`doc` must be tomotopy.Document or list of tomotopy.Document" }; auto* doc = (DocumentObject*)argDoc; @@ -258,7 +220,7 @@ static PyObject* SLDA_estimateVars(TopicModelObject* self, PyObject* args, PyObj { PyErr_Clear(); } - + 
py::UniqueObj iter = py::UniqueObj{ PyObject_GetIter(argDoc) }; py::UniqueObj nextDoc; std::vector docs; @@ -269,21 +231,12 @@ static PyObject* SLDA_estimateVars(TopicModelObject* self, PyObject* args, PyObj if (doc->corpus->tm != self) throw py::ConversionFail{ "`doc` was from another model, not fit to this model" }; docs.emplace_back(doc->getBoundDoc()); } - if (PyErr_Occurred()) return nullptr; + if (PyErr_Occurred()) throw py::ExcPropagation{}; return py::buildPyValueTransform(docs.begin(), docs.end(), [&](const tomoto::DocumentBase* d) { return inst->estimateVars(d); }); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } diff --git a/src/python/py_coherence.cpp b/src/python/py_coherence.cpp index e54a7f5..4309fab 100644 --- a/src/python/py_coherence.cpp +++ b/src/python/py_coherence.cpp @@ -21,11 +21,11 @@ int CoherenceObject::init(CoherenceObject* self, PyObject* args, PyObject* kwarg static const char* kwlist[] = { "corpus", "pe", "seg", "cm", "im", "window_size", "eps", "gamma", "targets", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iiiinddO", (char**)kwlist, &corpus, &pe, &seg, &cm, &im, &windowSize, &eps, &gamma, &targets)) return -1; - try + return py::handleExc([&]() { - if(!PyObject_TypeCheck(corpus, &UtilsCorpus_type)) + if (!PyObject_TypeCheck(corpus, &UtilsCorpus_type)) { - throw runtime_error{ "`corpus` must be an instance of `tomotopy.utils.Corpus`." }; + throw py::ValueError{ "`corpus` must be an instance of `tomotopy.utils.Corpus`." 
}; } self->model.~CoherenceModel(); new (&self->model) tomoto::coherence::CoherenceModel{ pe, windowSize }; @@ -37,16 +37,16 @@ int CoherenceObject::init(CoherenceObject* self, PyObject* args, PyObject* kwarg py::foreach(targets, [&](const string& w) { auto wid = corpus->getVocabDict().toWid(w); - if(wid != tomoto::non_vocab_id) targetIds.emplace_back(wid); + if (wid != tomoto::non_vocab_id) targetIds.emplace_back(wid); }, "`targets` must be an iterable of `str`."); - + self->model.insertTargets(targetIds.begin(), targetIds.end()); - + for (size_t i = 0; i < CorpusObject::len(corpus); ++i) { auto* doc = corpus->getDoc(i); self->model.insertDoc( - wordBegin(doc, corpus->isIndependent()), + wordBegin(doc, corpus->isIndependent()), wordEnd(doc, corpus->isIndependent()) ); } @@ -54,12 +54,7 @@ int CoherenceObject::init(CoherenceObject* self, PyObject* args, PyObject* kwarg self->seg = seg; self->cm = tomoto::coherence::AnyConfirmMeasurer::getInstance(cm, im, targetIds.begin(), targetIds.end(), eps, gamma); return 0; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_ValueError, e.what()); - } - return -1; + }); } PyObject* CoherenceObject::repr(CoherenceObject* self) @@ -82,7 +77,7 @@ PyObject* CoherenceObject::getScore(CoherenceObject* self, PyObject* args, PyObj if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &words)) return nullptr; - try + return py::handleExc([&]() { vector wordIds; py::foreach(words, [&](const string& w) @@ -104,19 +99,9 @@ PyObject* CoherenceObject::getScore(CoherenceObject* self, PyObject* args, PyObj case Segmentation::one_set: return py::buildPyValue(self->model.template getScore(self->cm, wordIds.begin(), wordIds.end())); default: - throw invalid_argument{ "invalid Segmentation `seg`" }; + throw py::ValueError{ "invalid Segmentation `seg`" }; } - - } - catch (const bad_exception&) - { - - } - catch (const exception& e) - { - PyErr_SetString(PyExc_ValueError, e.what()); - } - return nullptr; + }); } static 
PyMethodDef Coherence_methods[] = diff --git a/src/python/py_label.cpp b/src/python/py_label.cpp index d8f2ce5..162a560 100644 --- a/src/python/py_label.cpp +++ b/src/python/py_label.cpp @@ -17,18 +17,13 @@ int CandidateObject::init(CandidateObject *self, PyObject *args, PyObject *kwarg static const char* kwlist[] = { "words", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", (char**)kwlist, &words)) return -1; - try + return py::handleExc([&]() { self->tm = nullptr; self->corpus = nullptr; new(&self->cand) tomoto::label::Candidate{}; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return -1; - } - return 0; + return 0; + }); } void CandidateObject::dealloc(CandidateObject* self) @@ -60,106 +55,52 @@ PyObject* CandidateObject::repr(CandidateObject* self) static PyObject* Candidate_getWords(CandidateObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->begin(), self->end()); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* Candidate_getName(CandidateObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->cand.name); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } int Candidate_setName(CandidateObject* self, PyObject* val, void* closure) { - try + return py::handleExc([&]() { if (!PyUnicode_Check(val)) throw runtime_error{ "`name` must be `str` type." 
}; self->cand.name = PyUnicode_AsUTF8(val); - } - catch (const bad_exception&) - { - return -1; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return -1; - } - return 0; + return 0; + }); } static PyObject* Candidate_getScore(CandidateObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->cand.score); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* Candidate_getCf(CandidateObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->cand.cf); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* Candidate_getDf(CandidateObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->cand.df); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyGetSetDef Candidate_getseters[] = { @@ -222,7 +163,7 @@ struct ExtractorObject TopicModelObject* tm; static const char* kwlist[] = { "topic_model", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &tm)) return nullptr; - try + return py::handleExc([&]() { auto cands = self->inst->extract(tm->inst); PyObject* ret = PyList_New(0); @@ -235,16 +176,7 @@ struct ExtractorObject PyList_Append(ret, item); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static void dealloc(ExtractorObject* self) @@ -271,19 +203,10 @@ struct LabelerObject static const char* kwlist[] = { "k", "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, 
kwargs, "n|n", (char**)kwlist, &k, &topN)) return nullptr; - try + return py::handleExc([&]() { return py::buildPyValue(self->inst->getLabels(k, topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static void dealloc(LabelerObject* self) @@ -305,16 +228,11 @@ static int PMIExtractor_init(ExtractorObject *self, PyObject *args, PyObject *kw static const char* kwlist[] = { "min_cf", "min_df", "min_len", "max_len", "max_cand", "normalized", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnp", (char**)kwlist, &minCf, &minDf, &minLen, &maxLen, &maxCand, &normalized)) return -1; - try + return py::handleExc([&]() { self->inst = new tomoto::label::PMIExtractor{ minCf, minDf, minLen, maxLen, maxCand, !!normalized }; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return -1; - } - return 0; + return 0; + }); } PyTypeObject PMIExtractor_type = { @@ -367,13 +285,13 @@ static int FoRelevance_init(LabelerObject *self, PyObject *args, PyObject *kwarg static const char* kwlist[] = { "topic_model", "cands", "min_df", "smoothing", "mu", "window_size", "workers", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nffnn", (char**)kwlist, &tm, &cands, &minDf, &smoothing, &mu, &windowSize, &numWorkers)) return -1; - try + return py::handleExc([&]() { self->tm = tm; self->inst = nullptr; Py_INCREF(tm); py::UniqueObj iter{ PyObject_GetIter(cands) }; - if (!iter) throw runtime_error{ "`cands` must be an iterable of `tomotopy.label.Candidate`" }; + if (!iter) throw py::ValueError{ "`cands` must be an iterable of `tomotopy.label.Candidate`" }; vector pcands; { py::UniqueObj item; @@ -381,25 +299,20 @@ static int FoRelevance_init(LabelerObject *self, PyObject *args, PyObject *kwarg { if (!PyObject_TypeCheck(item, &Candidate_type)) { - throw runtime_error{ "`cands` must be an iterable of 
`tomotopy.label.Candidate`" }; + throw py::ValueError{ "`cands` must be an iterable of `tomotopy.label.Candidate`" }; } pcands.emplace_back(&((CandidateObject*)item.get())->cand); } } auto deref = [](tomoto::label::Candidate* p)->tomoto::label::Candidate& { return *p; }; - self->inst = new tomoto::label::FoRelevance{ - tm->inst, - tomoto::makeTransformIter(pcands.begin(), deref), - tomoto::makeTransformIter(pcands.end(), deref), - minDf, smoothing, 0, mu, windowSize, numWorkers + self->inst = new tomoto::label::FoRelevance{ + tm->inst, + tomoto::makeTransformIter(pcands.begin(), deref), + tomoto::makeTransformIter(pcands.end(), deref), + minDf, smoothing, 0, mu, windowSize, numWorkers }; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return -1; - } - return 0; + return 0; + }); } PyTypeObject FoRelevance_type = { diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 411bd8d..f88c9ef 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -71,7 +71,7 @@ PyObject* VocabObject::getstate(VocabObject* self, PyObject*) PyObject* VocabObject::setstate(VocabObject* self, PyObject* args) { - try + return py::handleExc([&]() { PyObject* dict = PyTuple_GetItem(args, 0); PyObject* id2word = PyDict_GetItemString(dict, "id2word"); @@ -81,62 +81,32 @@ PyObject* VocabObject::setstate(VocabObject* self, PyObject* args) self->size = -1; py::foreach(id2word, [&](const char* str) { - if (!str) throw bad_exception{}; + if (!str) throw py::ExcPropagation{}; self->vocabs->add(str); }, ""); - if (PyErr_Occurred()) throw bad_exception{}; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } - Py_INCREF(Py_None); - return Py_None; + if (PyErr_Occurred()) throw py::ExcPropagation{}; + Py_INCREF(Py_None); + return Py_None; + }); } Py_ssize_t VocabObject::len(VocabObject* self) { - try + return py::handleExc([&]() { - 
if(self->size == -1) return self->vocabs->size(); + if (self->size == -1) return self->vocabs->size(); return self->size; - } - catch (const bad_exception&) - { - return -1; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return -1; - } + }); } PyObject* VocabObject::getitem(VocabObject* self, Py_ssize_t key) { - try + return py::handleExc([&]() { - if (key >= len(self)) - { - PyErr_SetString(PyExc_IndexError, ""); - throw bad_exception{}; - } + if (key >= len(self)) throw py::IndexError{ "" }; + return py::buildPyValue(self->vocabs->toWord(key)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* VocabObject::repr(VocabObject* self) @@ -311,27 +281,18 @@ void CorpusObject::dealloc(CorpusObject* self) PyObject* CorpusObject::getstate(CorpusObject* self, PyObject*) { - try + return py::handleExc([&]() { - if (!self->isIndependent()) - throw runtime_error{ "Cannot pickle the corpus bound to a topic model. Try to use a topic model's `save` method." }; + if (!self->isIndependent()) + throw py::RuntimeError{ "Cannot pickle the corpus bound to a topic model. Try to use a topic model's `save` method." 
}; static const char* keys[] = { "_docs", "_vocab" }; return py::buildPyDict(keys, py::UniqueObj{ py::buildPyValue(self->docs) }, (PyObject*)self->vocab); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* CorpusObject::setstate(CorpusObject* self, PyObject* args) { - try + return py::handleExc([&]() { PyObject* dict = PyTuple_GetItem(args, 0); PyObject* vocab = PyDict_GetItemString(dict, "_vocab"); @@ -339,7 +300,7 @@ PyObject* CorpusObject::setstate(CorpusObject* self, PyObject* args) Py_INCREF(self->vocab); PyObject* docs = PyDict_GetItemString(dict, "_docs"); py::UniqueObj iter{ PyObject_GetIter(docs) }, next; - if (!iter) throw bad_exception{}; + if (!iter) throw py::ExcPropagation{}; while ((next = py::UniqueObj{ PyIter_Next(iter) })) { auto size = PyTuple_Size(next); @@ -366,8 +327,8 @@ PyObject* CorpusObject::setstate(CorpusObject* self, PyObject* args) if (raw) doc.rawStr = tomoto::SharedString{ PyUnicode_AsUTF8(raw) }; if (pos) doc.origWordPos = py::toCpp>(pos, ""); if (len) doc.origWordLen = py::toCpp>(len, ""); - - PyObject *key, *value; + + PyObject* key, * value; Py_ssize_t p = 0; while (PyDict_Next(kwargs, &p, &key, &value)) { @@ -380,27 +341,19 @@ PyObject* CorpusObject::setstate(CorpusObject* self, PyObject* args) } self->docs.emplace_back(move(doc)); } - if (PyErr_Occurred()) throw bad_exception{}; + if (PyErr_Occurred()) throw py::ExcPropagation{}; Py_INCREF(Py_None); return Py_None; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwargs) { - try + return py::handleExc([&]() { if (!self->isIndependent()) - throw runtime_error{ "Cannot modify the corpus bound to a topic model." 
}; - if (PyTuple_Size(args) != 3) throw runtime_error{ "function takes 3 positional arguments." }; + throw py::RuntimeError{ "Cannot modify the corpus bound to a topic model." }; + if (PyTuple_Size(args) != 3) throw py::ValueError{ "function takes 3 positional arguments." }; PyObject* words = PyTuple_GetItem(args, 0); PyObject* raw = PyTuple_GetItem(args, 1); PyObject* user_data = PyTuple_GetItem(args, 2); @@ -412,11 +365,11 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa if (PyObject_HasAttrString((PyObject*)self, "_tokenizer") && PyObject_IsTrue(py::UniqueObj{ PyObject_GetAttrString((PyObject*)self, "_tokenizer") })) { - if (words && words != Py_None) throw runtime_error{ "only `raw` is required when `tokenizer` is provided." }; + if (words && words != Py_None) throw py::ValueError{ "only `raw` is required when `tokenizer` is provided." }; if (!PyObject_IsTrue(raw)) return py::buildPyValue(-1); py::UniqueObj tokenizer{ PyObject_GetAttrString((PyObject*)self, "_tokenizer") }; - + py::UniqueObj args{ PyTuple_New(1) }; Py_INCREF(raw); PyTuple_SET_ITEM(args.get(), 0, raw); @@ -424,7 +377,7 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa PyDict_SetItemString(kwargs, "user_data", user_data); py::UniqueObj ret{ PyObject_Call(tokenizer, args, kwargs) }; - if (!ret) throw bad_exception{}; + if (!ret) throw py::ExcPropagation{}; py::foreach(ret, [&](PyObject* t) { if (PyUnicode_Check(t)) @@ -436,29 +389,29 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa PyObject* word = PyTuple_GetItem(t, 0); PyObject* pos = PyTuple_GetItem(t, 1); PyObject* len = PyTuple_GetItem(t, 2); - if(!(PyUnicode_Check(word) && PyLong_Check(pos) && PyLong_Check(len))) throw runtime_error{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." 
}; + if (!(PyUnicode_Check(word) && PyLong_Check(pos) && PyLong_Check(len))) throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." }; py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(word) }) }; - if (!stopRet) throw bad_exception{}; + if (!stopRet) throw py::ExcPropagation{}; doc.words.emplace_back(PyObject_IsTrue(stopRet) ? -1 : self->vocab->vocabs->add(PyUnicode_AsUTF8(word))); doc.origWordPos.emplace_back(PyLong_AsLong(pos)); doc.origWordLen.emplace_back(PyLong_AsLong(len)); } else { - throw runtime_error{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." }; + throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." }; } }, "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)."); doc.rawStr = tomoto::SharedString{ PyUnicode_AsUTF8(raw) }; } else { - if (raw && raw != Py_None) throw runtime_error{ "only `words` is required when `tokenizer` is not provided." }; + if (raw && raw != Py_None) throw py::ValueError{ "only `words` is required when `tokenizer` is not provided." }; if (!PyObject_IsTrue(words)) return py::buildPyValue(-1); py::foreach(words, [&](const string& w) { py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(w) }) }; - if (!stopRet) throw bad_exception{}; + if (!stopRet) throw py::ExcPropagation{}; doc.words.emplace_back(PyObject_IsTrue(stopRet) ? -1 : self->vocab->vocabs->add(w)); }, ""); } @@ -471,12 +424,12 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa { if (value == Py_None) continue; const char* uid = PyUnicode_AsUTF8(value); - if (!uid) throw runtime_error{ "`uid` must be str type." }; + if (!uid) throw py::ValueError{ "`uid` must be str type." 
}; string suid = uid; - if (suid.empty()) throw runtime_error{ "wrong `uid` value : empty str not allowed" }; + if (suid.empty()) throw py::ValueError{ "wrong `uid` value : empty str not allowed" }; if (self->invmap.find(suid) != self->invmap.end()) { - throw runtime_error{ "there is a document with uid = " + py::repr(value) + " already." }; + throw py::ValueError{ "there is a document with uid = " + py::repr(value) + " already." }; } self->invmap.emplace(suid, self->docs.size()); doc.docUid = tomoto::SharedString{ uid }; @@ -491,16 +444,7 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa } self->docs.emplace_back(move(doc)); return py::buildPyValue(self->docs.size() - 1); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* CorpusObject::extractNgrams(CorpusObject* self, PyObject* args, PyObject* kwargs) @@ -511,12 +455,12 @@ PyObject* CorpusObject::extractNgrams(CorpusObject* self, PyObject* args, PyObje static const char* kwlist[] = { "min_cf", "min_df", "max_len", "max_cand", "min_score", "normalized", "workers", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnfpn", (char**)kwlist, &minCf, &minDf, &maxLen, &maxCand, &minScore, &normalized, &workers)) return nullptr; - try + return py::handleExc([&]() { if (!self->isIndependent()) - throw runtime_error{ "Cannot modify the corpus bound to a topic model." }; + throw py::RuntimeError{ "Cannot modify the corpus bound to a topic model." 
}; size_t vSize = self->vocab->vocabs->size(); - vector cf(vSize), + vector cf(vSize), df(vSize), odf(vSize); for (auto& d : self->docs) @@ -527,7 +471,7 @@ PyObject* CorpusObject::extractNgrams(CorpusObject* self, PyObject* args, PyObje odf[w] = 1; cf[w]++; } - + for (size_t i = 0; i < df.size(); ++i) df[i] += odf[i]; fill(odf.begin(), odf.end(), 0); } @@ -553,16 +497,7 @@ PyObject* CorpusObject::extractNgrams(CorpusObject* self, PyObject* args, PyObje PyList_Append(ret, item); } return ret; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } // TODO: It loses some ngram patterns. Fix me! @@ -574,13 +509,13 @@ PyObject* CorpusObject::concatNgrams(CorpusObject* self, PyObject* args, PyObjec static const char* kwlist[] = { "cands", "delimiter", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &cands, &delimiter)) return nullptr; - try + return py::handleExc([&]() -> PyObject* { if (!self->isIndependent()) - throw runtime_error{ "Cannot modify the corpus bound to a topic model." }; + throw py::RuntimeError{ "Cannot modify the corpus bound to a topic model." 
}; py::UniqueObj iter{ PyObject_GetIter(cands) }; - if (!iter) throw runtime_error{ "`cands` must be an iterable of `tomotopy.label.Candidate`" }; + if (!iter) throw py::ValueError{ "`cands` must be an iterable of `tomotopy.label.Candidate`" }; vector pcands; vector pcandVids; { @@ -589,7 +524,7 @@ PyObject* CorpusObject::concatNgrams(CorpusObject* self, PyObject* args, PyObjec { if (!PyObject_TypeCheck(item, &Candidate_type)) { - throw runtime_error{ "`cands` must be an iterable of `tomotopy.label.Candidate`" }; + throw py::ValueError{ "`cands` must be an iterable of `tomotopy.label.Candidate`" }; } CandidateObject* cand = (CandidateObject*)item.get(); if (cand->corpus == self) @@ -603,7 +538,9 @@ PyObject* CorpusObject::concatNgrams(CorpusObject* self, PyObject* args, PyObjec if (find(c.w.begin(), c.w.end(), tomoto::non_vocab_id) != c.w.end()) { auto repr = py::toCpp(py::UniqueObj{ PyObject_Repr(item.get()) }); - PRINT_WARN("Candidate is ignored because it is not found in the corpus.\n" + repr); + if (PyErr_WarnEx(PyExc_RuntimeWarning, + ("Candidate is ignored because it is not found in the corpus.\n" + repr).c_str(), 1 + )) return nullptr; continue; } pcands.emplace_back(move(c)); @@ -615,7 +552,9 @@ PyObject* CorpusObject::concatNgrams(CorpusObject* self, PyObject* args, PyObjec if (find(c.w.begin(), c.w.end(), tomoto::non_vocab_id) != c.w.end()) { auto repr = py::toCpp(py::UniqueObj{ PyObject_Repr(item.get()) }); - PRINT_WARN("Candidate is ignored because it is not found in the corpus.\n" + repr); + if (PyErr_WarnEx(PyExc_RuntimeWarning, + ("Candidate is ignored because it is not found in the corpus.\n" + repr).c_str(), 1 + )) return nullptr; continue; } pcands.emplace_back(move(c)); @@ -675,16 +614,7 @@ PyObject* CorpusObject::concatNgrams(CorpusObject* self, PyObject* args, PyObjec } } return py::buildPyValue(totUpdated); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, 
e.what()); - return nullptr; - } + }); } Py_ssize_t CorpusObject::len(CorpusObject* self) @@ -697,15 +627,16 @@ Py_ssize_t CorpusObject::len(CorpusObject* self) PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) { - try + return py::handleExc([&]() { // indexing by int Py_ssize_t v = PyLong_AsLongLong(idx); if (v != -1 || !(PyErr_Occurred() && (PyErr_Clear(), true))) { - if(v >= len(self) || -v > len(self)) throw out_of_range{ "IndexError: " + to_string(v) }; + if (v >= len(self) || -v > len(self)) throw py::IndexError{ to_string(v) }; + if (v < 0) v += len(self); auto doc = (DocumentObject*)PyObject_CallFunctionObjArgs((PyObject*)&UtilsDocument_type, (PyObject*)self, nullptr); - if (!doc) throw bad_exception{}; + if (!doc) throw py::ExcPropagation{}; doc->doc = self->getDoc(v); return (PyObject*)doc; } @@ -714,9 +645,9 @@ PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) { string v = PyUnicode_AsUTF8(idx); auto doc = (DocumentObject*)PyObject_CallFunctionObjArgs((PyObject*)&UtilsDocument_type, (PyObject*)self, nullptr); - if (!doc) throw bad_exception{}; + if (!doc) throw py::ExcPropagation{}; size_t iidx = self->findUid(v); - if (iidx == (size_t)-1) throw out_of_range{ "Cannot find a document with uid = " + py::repr(idx) }; + if (iidx == (size_t)-1) throw py::KeyError{ "Cannot find a document with uid = " + py::repr(idx) }; doc->doc = self->getDoc(iidx); return (PyObject*)doc; } @@ -726,13 +657,13 @@ PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) Py_ssize_t start, end, step, size; if (PySlice_GetIndicesEx(idx, len(self), &start, &end, &step, &size)) { - throw bad_exception{}; + throw py::ExcPropagation{}; } if (self->isIndependent()) { auto ret = (CorpusObject*)PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self->vocab, nullptr); - if (!ret) throw bad_exception{}; + if (!ret) throw py::ExcPropagation{}; for (Py_ssize_t i = start; i < end; i += step) { 
ret->docs.emplace_back(self->docs[i]); @@ -784,7 +715,7 @@ PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) { if (v >= len(self) || -v > len(self)) { - throw out_of_range{ "IndexError. len = " + to_string(len(self)) + ", idx = " + to_string(v) }; + throw py::IndexError{ "len = " + to_string(len(self)) + ", idx = " + to_string(v) }; } if (v < 0) v += len(self); idcs.emplace_back((size_t)v); @@ -793,21 +724,21 @@ PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) { string k = py::toCpp(o); size_t idx = self->findUid(k); - if (idx == (size_t)-1) throw out_of_range{ "Cannot find a document with uid = " + py::repr(o) }; + if (idx == (size_t)-1) throw py::KeyError{ "Cannot find a document with uid = " + py::repr(o) }; idcs.emplace_back(idx); } else { py::UniqueObj ty{ PyObject_Type(o) }; py::UniqueObj repr{ PyObject_Str(ty) }; - throw runtime_error{ string{"Unsupported indexing type "} + PyUnicode_AsUTF8(repr) }; + throw py::IndexError{ string{"Unsupported indexing type "} + PyUnicode_AsUTF8(repr) }; } }, ""); if (self->isIndependent()) { auto ret = (CorpusObject*)PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self->vocab, nullptr); - if (!ret) throw bad_exception{}; + if (!ret) throw py::ExcPropagation{}; for (auto i : idcs) { ret->docs.emplace_back(self->docs[i]); @@ -817,7 +748,7 @@ PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) else if (self->made) { auto ret = (CorpusObject*)PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self->tm, nullptr); - if (!ret) throw bad_exception{}; + if (!ret) throw py::ExcPropagation{}; for (auto i : ret->docIdcs) { ret->docsMade.emplace_back(self->docsMade[i]); @@ -828,7 +759,7 @@ PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) else { auto ret = (CorpusObject*)PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self->tm, nullptr); - if (!ret) throw bad_exception{}; + if (!ret) throw 
py::ExcPropagation{}; ret->docIdcs = move(idcs); for (auto i : ret->docIdcs) { @@ -841,21 +772,9 @@ PyObject* CorpusObject::getitem(CorpusObject* self, PyObject* idx) { py::UniqueObj ty{ PyObject_Type(idx) }; py::UniqueObj repr{ PyObject_Str(ty) }; - throw runtime_error{ string{"Unsupported indexing type "} + PyUnicode_AsUTF8(repr) }; + throw py::IndexError{ string{"Unsupported indexing type "} + PyUnicode_AsUTF8(repr) }; } - } - catch (const bad_exception&) - { - } - catch (const out_of_range& e) - { - PyErr_SetString(PyExc_KeyError, e.what()); - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* CorpusObject::iter(CorpusObject* self) @@ -1026,28 +945,19 @@ DocWordIterator wordEnd(const tomoto::RawDocKernel* doc, bool independent) int DocumentObject::init(DocumentObject* self, PyObject* args, PyObject* kwargs) { - try - { - PyObject* corpus = nullptr; - static const char* kwlist[] = { "corpus", nullptr }; + PyObject* corpus = nullptr; + static const char* kwlist[] = { "corpus", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", (char**)kwlist, - &corpus)) return -1; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", (char**)kwlist, + &corpus)) return -1; + return py::handleExc([&]() + { self->corpus = (CorpusObject*)corpus; Py_INCREF(corpus); self->doc = nullptr; return 0; - } - catch (const bad_exception&) - { - return -1; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return -1; - } + }); } void DocumentObject::dealloc(DocumentObject* self) @@ -1074,9 +984,9 @@ Py_ssize_t DocumentObject::len(DocumentObject* self) PyObject* DocumentObject::getitem(DocumentObject* self, Py_ssize_t idx) { - try + return py::handleExc([&]() { - if (idx >= len(self)) throw out_of_range{ "" }; + if (idx >= len(self)) throw py::IndexError{ "" }; if (self->corpus->isIndependent()) { if (self->getRawDoc()->words[idx] == tomoto::non_vocab_id) @@ -1091,57 +1001,29 
@@ PyObject* DocumentObject::getitem(DocumentObject* self, Py_ssize_t idx) idx = self->getBoundDoc()->wOrder.empty() ? idx : self->getBoundDoc()->wOrder[idx]; return py::buildPyValue(self->corpus->getVocabDict().toWord(self->getBoundDoc()->words[idx])); } - } - catch (const bad_exception&) - { - } - catch (const out_of_range& e) - { - PyErr_SetString(PyExc_IndexError, e.what()); - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* DocumentObject::getWords(DocumentObject* self, void* closure) { - try + return py::handleExc([&]() { if (self->corpus->isIndependent()) return py::buildPyValue(self->getRawDoc()->words); else return buildPyValueReorder(self->getBoundDoc()->words, self->getBoundDoc()->wOrder); - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* DocumentObject::getRaw(DocumentObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->doc->rawStr); - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* DocumentObject::getSpan(DocumentObject* self, void* closure) { - try + return py::handleExc([&]() { PyObject* ret = PyList_New(self->doc->origWordPos.size()); for (size_t i = 0; i < self->doc->origWordPos.size(); ++i) @@ -1150,71 +1032,39 @@ PyObject* DocumentObject::getSpan(DocumentObject* self, void* closure) PyList_SET_ITEM(ret, i, py::buildPyTuple(begin, end)); } return ret; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* DocumentObject::getWeight(DocumentObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->doc->weight); - } - catch (const bad_exception&) - { - } - catch (const 
exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* DocumentObject::getUid(DocumentObject* self, void* closure) { - try + return py::handleExc([&]() { return py::buildPyValue(self->doc->docUid); - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* DocumentObject::getattro(DocumentObject* self, PyObject* attr) { - try + return py::handleExc([&]() { - if(!self->corpus->isIndependent()) return PyObject_GenericGetAttr((PyObject*)self, attr); + if (!self->corpus->isIndependent()) return PyObject_GenericGetAttr((PyObject*)self, attr); const char* a = PyUnicode_AsUTF8(attr); - if (!a) throw runtime_error{ "invalid attribute name" }; + if (!a) throw py::AttributeError{ "invalid attribute name" }; string name = a; auto it = self->getRawDoc()->misc.find(name); if (it == self->getRawDoc()->misc.end()) return PyObject_GenericGetAttr((PyObject*)self, attr); auto ret = (PyObject*)it->second.template get>().get(); Py_INCREF(ret); return ret; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* DocumentObject::repr(DocumentObject* self) @@ -1246,22 +1096,13 @@ static PyObject* Document_getTopics(DocumentObject* self, PyObject* args, PyObje size_t topN = 10; static const char* kwlist[] = { "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", (char**)kwlist, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "This method can only be called by documents bound to the topic model." 
}; - if (!self->corpus->tm->inst) throw runtime_error{ "inst is null" }; - if (!self->corpus->tm->isPrepared) throw runtime_error{ "train() should be called first for calculating the topic distribution" }; + if (self->corpus->isIndependent()) throw py::RuntimeError{ "This method can only be called by documents bound to the topic model." }; + if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" }; + if (!self->corpus->tm->isPrepared) throw py::RuntimeError{ "train() should be called first for calculating the topic distribution" }; return py::buildPyValue(self->corpus->tm->inst->getTopicsByDocSorted(self->getBoundDoc(), topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* Document_getTopicDist(DocumentObject* self, PyObject* args, PyObject* kwargs) @@ -1269,22 +1110,13 @@ static PyObject* Document_getTopicDist(DocumentObject* self, PyObject* args, PyO size_t normalize = 1; static const char* kwlist[] = { "normalize", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|p", (char**)kwlist, &normalize)) return nullptr; - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "This method can only be called by documents bound to the topic model." }; - if (!self->corpus->tm->inst) throw runtime_error{ "inst is null" }; - if (!self->corpus->tm->isPrepared) throw runtime_error{ "train() should be called first for calculating the topic distribution" }; + if (self->corpus->isIndependent()) throw py::RuntimeError{ "This method can only be called by documents bound to the topic model." 
}; + if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" }; + if (!self->corpus->tm->isPrepared) throw py::RuntimeError{ "train() should be called first for calculating the topic distribution" }; return py::buildPyValue(self->corpus->tm->inst->getTopicsByDoc(self->getBoundDoc(), !!normalize)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* Document_getWords(DocumentObject* self, PyObject* args, PyObject* kwargs) @@ -1292,95 +1124,52 @@ static PyObject* Document_getWords(DocumentObject* self, PyObject* args, PyObjec size_t topN = 10; static const char* kwlist[] = { "top_n", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", (char**)kwlist, &topN)) return nullptr; - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "This method can only be called by documents bound to the topic model." }; - if (!self->corpus->tm->inst) throw runtime_error{ "inst is null" }; + if (self->corpus->isIndependent()) throw py::RuntimeError{ "This method can only be called by documents bound to the topic model." }; + if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" }; return py::buildPyValue(self->corpus->tm->inst->getWordsByDocSorted(self->getBoundDoc(), topN)); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyObject* Document_Z(DocumentObject* self, void* closure) { - PyObject* ret; - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `topics` field!" }; - if (!self->doc) throw runtime_error{ "doc is null!" }; + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `topics` field!" }; + if (!self->doc) throw py::RuntimeError{ "doc is null!" 
}; #ifdef TM_HLDA - ret = Document_HLDA_Z(self, closure); - if (ret) return ret; + if (auto* ret = Document_HLDA_Z(self, closure)) return ret; #endif #ifdef TM_HDP - ret = Document_HDP_Z(self, closure); - if (ret) return ret; + if (auto* ret = Document_HDP_Z(self, closure)) return ret; #endif - ret = Document_LDA_Z(self, closure); - if (ret) return ret; - throw runtime_error{ "doc doesn't has `topics` field!" }; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_AttributeError, e.what()); - return nullptr; - } + if (auto* ret = Document_LDA_Z(self, closure)) return ret; + throw py::AttributeError{ "doc doesn't has `topics` field!" }; + }); } static PyObject* Document_metadata(DocumentObject* self, void* closure) { - PyObject* ret; - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `metadata` field!" }; - if (!self->doc) throw runtime_error{ "doc is null!" }; + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `metadata` field!" }; + if (!self->doc) throw py::RuntimeError{ "doc is null!" }; #ifdef TM_DMR - ret = Document_DMR_metadata(self, closure); - if (ret) return ret; + if (auto* ret = Document_DMR_metadata(self, closure)) return ret; #endif - throw runtime_error{ "doc doesn't has `metadata` field!" }; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_AttributeError, e.what()); - return nullptr; - } + throw py::AttributeError{ "doc doesn't has `metadata` field!" }; + }); } - PyObject* Document_getLL(DocumentObject* self) { - try + return py::handleExc([&]() { - if (self->corpus->isIndependent()) throw runtime_error{ "This method can only be called by documents bound to the topic model." 
}; - if (!self->corpus->tm->inst) throw runtime_error{ "inst is null" }; + if (self->corpus->isIndependent()) throw py::RuntimeError{ "This method can only be called by documents bound to the topic model." }; + if (!self->corpus->tm->inst) throw py::RuntimeError{ "inst is null" }; return py::buildPyValue(self->corpus->tm->inst->getDocLL(self->getBoundDoc())); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } static PyMethodDef UtilsDocument_methods[] = @@ -1413,9 +1202,10 @@ static PyGetSetDef UtilsDocument_getseters[] = { { (char*)"span", (getter)DocumentObject::getSpan, nullptr, Document_span__doc__, nullptr }, #ifdef TM_DMR { (char*)"metadata", (getter)Document_metadata, nullptr, Document_metadata__doc__, nullptr }, + { (char*)"multi_metadata", (getter)Document_DMR_multiMetadata, nullptr, Document_multi_metadata__doc__, nullptr }, #endif #ifdef TM_GDMR - { (char*)"numeric_metadata", (getter)Document_numeric_metadata, nullptr, Document_numeric_metadata__doc__, nullptr }, + { (char*)"numeric_metadata", (getter)Document_numericMetadata, nullptr, Document_numeric_metadata__doc__, nullptr }, #endif #ifdef TM_PA { (char*)"subtopics", (getter)Document_Z2, nullptr, Document_subtopics__doc__, nullptr }, @@ -1498,7 +1288,7 @@ PhraserObject* PhraserObject::_new(PyTypeObject* subtype, PyObject* args, PyObje int PhraserObject::init(PhraserObject* self, PyObject* args, PyObject* kwargs) { - try + return py::handleExc([&]() { PyObject* candidates = nullptr; const char* delimiter = "_"; @@ -1509,7 +1299,7 @@ int PhraserObject::init(PhraserObject* self, PyObject* args, PyObject* kwargs) if (!candidates || candidates == Py_None) return 0; py::UniqueObj iter{ PyObject_GetIter(candidates) }, item; - if (!iter) throw runtime_error{ "`candidates` must be an iterable of Candidates." 
}; + if (!iter) throw py::ValueError{ "`candidates` must be an iterable of Candidates." }; CorpusObject* base_corpus = nullptr; auto alloc = [&]() { self->trie_nodes.emplace_back(); return &self->trie_nodes.back(); }; @@ -1569,22 +1359,14 @@ int PhraserObject::init(PhraserObject* self, PyObject* args, PyObject* kwargs) } else { - throw runtime_error{ "`candidates` must be an iterable of Candidates." }; + throw py::ValueError{ "`candidates` must be an iterable of Candidates." }; } } - if (PyErr_Occurred()) throw bad_exception{}; + if (PyErr_Occurred()) throw py::ExcPropagation{}; self->trie_nodes[0].fillFail(); self->trie_nodes.shrink_to_fit(); return 0; - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return -1; + }); } void PhraserObject::dealloc(PhraserObject* self) @@ -1597,7 +1379,7 @@ void PhraserObject::dealloc(PhraserObject* self) PyObject* PhraserObject::repr(PhraserObject* self) { - string ret = "cphraser.Phraser(... with "; + string ret = "Phraser(... 
with "; ret += to_string(self->cand_info.size()); ret += " items)"; return py::buildPyValue(ret); @@ -1605,14 +1387,14 @@ PyObject* PhraserObject::repr(PhraserObject* self) PyObject* PhraserObject::call(PhraserObject* self, PyObject* args, PyObject* kwargs) { - try - { - PyObject* words = nullptr; - static const char* kwlist[] = { "words", nullptr }; + PyObject* words = nullptr; + static const char* kwlist[] = { "words", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, - &words)) return nullptr; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, + &words)) return nullptr; + return py::handleExc([&]() + { py::UniqueObj ret{ PyList_New(0) }; deque buffer; size_t c_found = 0; @@ -1680,27 +1462,18 @@ PyObject* PhraserObject::call(PhraserObject* self, PyObject* args, PyObject* kwa } for (auto v : buffer) PyList_Append(ret, py::UniqueObj{ py::buildPyValue(self->vocabs.toWord(v)) }); return ret.release(); - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return nullptr; + }); } PyObject* PhraserObject::findall(PhraserObject* self, PyObject* args, PyObject* kwargs) { - try + PyObject* words = nullptr; + static const char* kwlist[] = { "words", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, + &words)) return nullptr; + + return py::handleExc([&]() { - PyObject* words = nullptr; - static const char* kwlist[] = { "words", nullptr }; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, - &words)) return nullptr; - py::UniqueObj ret{ PyList_New(0) }; size_t c_found = 0, stack_size = 0, cur_pos = 0; auto* node = self->trie_nodes.data(); @@ -1769,15 +1542,7 @@ PyObject* PhraserObject::findall(PhraserObject* self, PyObject* args, PyObject* } } return ret.release(); - } - catch (const bad_exception&) - { - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - } - return 
nullptr; + }); } PyObject* PhraserObject::save(PhraserObject* self, PyObject* args, PyObject* kwargs) @@ -1787,9 +1552,10 @@ PyObject* PhraserObject::save(PhraserObject* self, PyObject* args, PyObject* kwa if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s", (char**)kwlist, &path)) return nullptr; - try + return py::handleExc([&]() { ofstream ofs{ path, ios_base::binary }; + if (!ofs) throw py::OSError{ string{"cannot write to '"} + path + "'" }; tomoto::serializer::writeMany(ofs, tomoto::serializer::to_keyz("tph1"), self->vocabs, self->cand_info, @@ -1797,16 +1563,7 @@ PyObject* PhraserObject::save(PhraserObject* self, PyObject* args, PyObject* kwa ); Py_INCREF(Py_None); return Py_None; - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } PyObject* PhraserObject::load(PhraserObject*, PyObject* args, PyObject* kwargs) @@ -1817,30 +1574,22 @@ PyObject* PhraserObject::load(PhraserObject*, PyObject* args, PyObject* kwargs) if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|O", (char**)kwlist, &path, &baseCls)) return nullptr; - try + return py::handleExc([&]() { if (!baseCls) baseCls = (PyObject*)&Phraser_type; else if (!PyObject_IsSubclass(baseCls, (PyObject*)&Phraser_type)) throw runtime_error{ "`cls` must be a derived class of `Phraser`." 
}; ifstream ifs{ path }; + if (!ifs) throw py::OSError{ string{"cannot read from '"} + path + "'" }; py::UniqueObj ret{ PyObject_CallObject(baseCls, nullptr) }; - if (!ret) throw bad_exception{}; + if (!ret) throw py::ExcPropagation{}; tomoto::serializer::readMany(ifs, tomoto::serializer::to_keyz("tph1"), ((PhraserObject*)ret.get())->vocabs, ((PhraserObject*)ret.get())->cand_info, ((PhraserObject*)ret.get())->trie_nodes ); return ret.release(); - } - catch (const bad_exception&) - { - return nullptr; - } - catch (const exception& e) - { - PyErr_SetString(PyExc_Exception, e.what()); - return nullptr; - } + }); } @@ -1907,7 +1656,7 @@ vector insertCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* { vector ret; if (!_corpus || _corpus == Py_None) return ret; - if (!PyObject_TypeCheck(_corpus, &UtilsCorpus_type)) throw runtime_error{ "`corpus` must be an instance of `tomotopy.utils.Corpus`" }; + if (!PyObject_TypeCheck(_corpus, &UtilsCorpus_type)) throw py::ValueError{ "`corpus` must be an instance of `tomotopy.utils.Corpus`" }; auto corpus = (CorpusObject*)_corpus; bool insert_into_empty = self->inst->updateVocab(corpus->getVocabDict().getRaw()); if (corpus->isIndependent()) @@ -1935,7 +1684,7 @@ vector insertCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* if (doc.words.empty()) { - fprintf(stderr, "[warn] Adding empty document was ignored.\n"); + fprintf(stderr, "Adding empty document was ignored.\n"); continue; } @@ -1972,7 +1721,7 @@ vector insertCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* if (doc.words.empty()) { - fprintf(stderr, "[warn] Adding empty document was ignored.\n"); + fprintf(stderr, "Adding empty document was ignored.\n"); continue; } @@ -1991,7 +1740,7 @@ vector insertCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* CorpusObject* makeCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* transform) { if (!_corpus || _corpus == Py_None) return nullptr; - if (!PyObject_TypeCheck(_corpus, 
&UtilsCorpus_type)) throw runtime_error{ "`corpus` must be an instance of `tomotopy.utils.Corpus`" }; + if (!PyObject_TypeCheck(_corpus, &UtilsCorpus_type)) throw py::ValueError{ "`corpus` must be an instance of `tomotopy.utils.Corpus`" }; auto corpus = (CorpusObject*)_corpus; py::UniqueObj _corpusMade{ PyObject_CallFunctionObjArgs((PyObject*)&UtilsCorpus_type, (PyObject*)self, nullptr) }; CorpusObject* corpusMade = (CorpusObject*)_corpusMade.get(); @@ -2019,7 +1768,7 @@ CorpusObject* makeCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* tr if (doc.words.empty()) { - fprintf(stderr, "[warn] Adding empty document was ignored.\n"); + fprintf(stderr, "Adding empty document was ignored.\n"); continue; } diff --git a/src/python/utils.h b/src/python/utils.h index b986820..7648fbc 100644 --- a/src/python/utils.h +++ b/src/python/utils.h @@ -1,5 +1,6 @@ #pragma once +#include "../TopicModel/LDA.h" #include "../Utils/Dictionary.h" #include "module.h" #include "../Labeling/Phraser.hpp" @@ -197,112 +198,99 @@ struct PhraserObject void addUtilsTypes(PyObject* gModule); +template< + template class DocTy, + typename Fn +> +PyObject* docVisit(tomoto::DocumentBase* doc, Fn&& visitor) +{ + if (auto* d = dynamic_cast*>(doc)) + { + return visitor(d); + } + + if (auto* d = dynamic_cast*>(doc)) + { + return visitor(d); + } + + if (auto* d = dynamic_cast*>(doc)) + { + return visitor(d); + } + + return nullptr; +} + +template< + template class DocTy, + typename Fn +> +PyObject* docVisit(const tomoto::DocumentBase* doc, Fn&& visitor) +{ + if (auto* d = dynamic_cast*>(doc)) + { + return visitor(d); + } + + if (auto* d = dynamic_cast*>(doc)) + { + return visitor(d); + } + + if (auto* d = dynamic_cast*>(doc)) + { + return visitor(d); + } + + return nullptr; +} + #define DEFINE_DOCUMENT_GETTER_PROTOTYPE(NAME) \ PyObject* Document_##NAME(DocumentObject* self, void* closure); #define DEFINE_DOCUMENT_GETTER(DOCTYPE, NAME, FIELD) \ PyObject* Document_##NAME(DocumentObject* self, void* 
closure)\ {\ - try\ + return py::handleExc([&]()\ {\ - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `" #FIELD "` field!" };\ - if (!self->doc) throw runtime_error{ "doc is null!" };\ - do\ - {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return py::buildPyValue(doc->FIELD);\ - } while (0);\ - do\ + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `" #FIELD "` field!" };\ + if (!self->doc) throw py::RuntimeError{ "doc is null!" };\ + if (auto* ret = docVisit(self->getBoundDoc(), [](auto* doc)\ {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return py::buildPyValue(doc->FIELD);\ - } while (0);\ - do\ - {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return py::buildPyValue(doc->FIELD);\ - } while (0);\ - throw runtime_error{ "doc doesn't has `" #FIELD "` field!" };\ - }\ - catch (const bad_exception&)\ - {\ - return nullptr;\ - }\ - catch (const exception& e)\ - {\ - PyErr_SetString(PyExc_AttributeError, e.what());\ - return nullptr;\ - }\ + return py::buildPyValue(doc->FIELD);\ + })) return ret;\ + throw py::AttributeError{ "doc doesn't has `" #FIELD "` field!" };\ + });\ } #define DEFINE_DOCUMENT_GETTER_WITHOUT_EXC(DOCTYPE, NAME, FIELD) \ PyObject* Document_##NAME(DocumentObject* self, void* closure)\ {\ - try\ + return py::handleExc([&]()\ {\ - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `" #FIELD "` field!" };\ - if (!self->doc) throw runtime_error{ "doc is null!" };\ - do\ - {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return py::buildPyValue(doc->FIELD);\ - } while (0);\ - do\ - {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return py::buildPyValue(doc->FIELD);\ - } while (0);\ - do\ + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `" #FIELD "` field!" };\ + if (!self->doc) throw py::RuntimeError{ "doc is null!" 
};\ + return docVisit(self->getBoundDoc(), [](auto* doc)\ {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return py::buildPyValue(doc->FIELD);\ - } while (0);\ - return nullptr;\ - }\ - catch (const bad_exception&)\ - {\ - return nullptr;\ - }\ - catch (const exception& e)\ - {\ - PyErr_SetString(PyExc_AttributeError, e.what());\ - return nullptr;\ - }\ + return py::buildPyValue(doc->FIELD);\ + });\ + });\ } #define DEFINE_DOCUMENT_GETTER_REORDER(DOCTYPE, NAME, FIELD) \ PyObject* Document_##NAME(DocumentObject* self, void* closure)\ {\ - try\ + return py::handleExc([&]()\ {\ - if (self->corpus->isIndependent()) throw runtime_error{ "doc doesn't has `" #FIELD "` field!" };\ - if (!self->doc) throw runtime_error{ "doc is null!" };\ - do\ + if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `" #FIELD "` field!" };\ + if (!self->doc) throw py::RuntimeError{ "doc is null!" };\ + if (auto* ret = docVisit(self->getBoundDoc(), [](auto* doc)\ {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return buildPyValueReorder(doc->FIELD, doc->wOrder);\ - } while (0);\ - do\ - {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return buildPyValueReorder(doc->FIELD, doc->wOrder);\ - } while (0);\ - do\ - {\ - auto* doc = dynamic_cast*>(self->getBoundDoc());\ - if (doc) return buildPyValueReorder(doc->FIELD, doc->wOrder);\ - } while (0);\ - throw runtime_error{ "doc doesn't has `" #FIELD "` field!" };\ - }\ - catch (const bad_exception&)\ - {\ - return nullptr;\ - }\ - catch (const exception& e)\ - {\ - PyErr_SetString(PyExc_AttributeError, e.what());\ - return nullptr;\ - }\ + return buildPyValueReorder(doc->FIELD, doc->wOrder);\ + })) return ret;\ + throw py::AttributeError{ "doc doesn't has `" #FIELD "` field!" 
}; \ + });\ } namespace py @@ -327,8 +315,9 @@ PyObject* Document_HDP_Z(DocumentObject* self, void* closure); PyObject* Document_HLDA_Z(DocumentObject* self, void* closure); PyObject* Document_DMR_metadata(DocumentObject* self, void* closure); +PyObject* Document_DMR_multiMetadata(DocumentObject* self, void* closure); -PyObject* Document_numeric_metadata(DocumentObject* self, void* closure); +PyObject* Document_numericMetadata(DocumentObject* self, void* closure); DEFINE_DOCUMENT_GETTER_PROTOTYPE(windows); @@ -388,4 +377,4 @@ PyObject* buildPyValueReorder(const _Target& target, const _Order& order, _Tx&& } std::vector insertCorpus(TopicModelObject* self, PyObject* corpus, PyObject* transform); -CorpusObject* makeCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* transform); \ No newline at end of file +CorpusObject* makeCorpus(TopicModelObject* self, PyObject* _corpus, PyObject* transform); diff --git a/test/unit_test.py b/test/unit_test.py index f6b553d..8e7e78c 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -58,12 +58,28 @@ (tp.PAModel, curpath + '/sample.txt', 0, None, {'k1':5, 'k2':10}, [tp.ParallelScheme.COPY_MERGE]), (tp.HPAModel, curpath + '/sample.txt', 0, None, {'k1':5, 'k2':10}, [tp.ParallelScheme.COPY_MERGE]), (tp.DMRModel, curpath + '/sample_with_md.txt', 1, lambda x:{'metadata':'_'.join(x)}, {'k':10}, None), + (tp.DMRModel, curpath + '/sample_with_md.txt', 1, lambda x:{'multi_metadata':x}, {'k':10}, None), (tp.SLDAModel, curpath + '/sample_with_md.txt', 1, lambda x:{'y':list(map(float, x))}, {'k':10, 'vars':'b'}, None), (tp.DTModel, curpath + '/sample_tp.txt', 1, lambda x:{'timepoint':int(x[0])}, {'k':10, 't':13}, None), (tp.GDMRModel, curpath + '/sample_tp.txt', 1, lambda x:{'numeric_metadata':list(map(float, x))}, {'k':10, 'degrees':[3]}, None), (tp.PTModel, curpath + '/sample.txt', 0, None, {'k':10, 'p':100}, [tp.ParallelScheme.PARTITION]), ] +def null_doc(cls, inputFile, mdFields, f, kargs, ps): + tw = 0 + print('Initialize 
model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) + mdl = cls(tw=tw, min_df=200, rm_top=200, **kargs) + print('Adding docs...') + for n, line in enumerate(open(inputFile, encoding='utf-8')): + ch = line.strip().split() + if len(ch) < mdFields + 1: continue + if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) + else: mdl.add_doc(ch) + mdl.train(100, workers=1, parallel=ps) + + print(mdl.docs[0].words) + print(mdl.docs[0].topics) + def train1(cls, inputFile, mdFields, f, kargs, ps): print('Test train') tw = 0 @@ -139,6 +155,24 @@ def save_and_load(cls, inputFile, mdFields, f, kargs, ps): mdl = cls.loads(bytearr) mdl.train(20, parallel=ps) +def copy_train(cls, inputFile, mdFields, f, kargs, ps): + print('Test copy & train') + tw = 0 + print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) + mdl = cls(tw=tw, min_df=2, rm_top=2, **kargs) + print('Adding docs...') + for n, line in enumerate(open(inputFile, encoding='utf-8')): + ch = line.strip().split() + if len(ch) < mdFields + 1: continue + if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) + else: mdl.add_doc(ch) + mdl.train(200, parallel=ps) + mdl.summary(file=sys.stderr) + new_mdl = mdl.copy() + del mdl + new_mdl.summary(file=sys.stderr) + new_mdl.train(200, parallel=ps) + def infer(cls, inputFile, mdFields, f, kargs, ps): print('Test infer') tw = 0 @@ -453,9 +487,10 @@ def test_corpus_save_load(): pss = model_case[5] if not pss: pss = [tp.ParallelScheme.COPY_MERGE, tp.ParallelScheme.PARTITION] for ps in pss: - for func in [train1, train4, train0, - save_and_load, infer, infer_together - ]: + for func in [null_doc, train1, train4, train0, + save_and_load, infer, infer_together, + copy_train, + ]: locals()['test_{}_{}_{}'.format(model_case[0].__name__, func.__name__, ps.name)] = (lambda f, mc, ps: lambda: f(*(mc + (ps,))))(func, model_case[:-1], ps) for model_case in model_asym_cases: @@ -463,7 +498,7 @@ def test_corpus_save_load(): if not pss: pss = 
[tp.ParallelScheme.COPY_MERGE, tp.ParallelScheme.PARTITION] for ps in pss: for func in [train1, train4, train0_without_optim, - ][-1:]: + ]: locals()['test_{}_{}_{}'.format(model_case[0].__name__, func.__name__, ps.name)] = (lambda f, mc, ps: lambda: f(*(mc + (ps,))))(func, model_case[:-1], ps) for model_case in model_corpus_cases: diff --git a/tomotopy/_summary.py b/tomotopy/_summary.py index 6114dfc..4aad71c 100644 --- a/tomotopy/_summary.py +++ b/tomotopy/_summary.py @@ -59,9 +59,16 @@ def basic_info_DMRModel(mdl, file): from collections import Counter basic_info_LDAModel(mdl, file) md_cnt = Counter(doc.metadata for doc in mdl.docs) - print('| Metadata of docs and its distribution', file=file) - for md in mdl.metadata_dict: - print('| {}: {}'.format(md, md_cnt.get(md, 0)), file=file) + if len(md_cnt) > 1: + print('| Metadata of docs and its distribution', file=file) + for md in mdl.metadata_dict: + print('| {}: {}'.format(md, md_cnt.get(md, 0)), file=file) + md_cnt = Counter() + [md_cnt.update(doc.multi_metadata) for doc in mdl.docs] + if len(md_cnt) > 0: + print('| Multi-Metadata of docs and its distribution', file=file) + for md in mdl.multi_metadata_dict: + print('| {}: {}'.format(md, md_cnt.get(md, 0)), file=file) def basic_info_GDMRModel(mdl, file): from collections import Counter @@ -73,6 +80,12 @@ def basic_info_GDMRModel(mdl, file): print('| Categorical metadata of docs and its distribution', file=file) for md in mdl.metadata_dict: print('| {}: {}'.format(md, md_cnt.get(md, 0)), file=file) + md_cnt = Counter() + [md_cnt.update(doc.multi_metadata) for doc in mdl.docs] + if len(md_cnt) > 0: + print('| Categorical multi-metadata of docs and its distribution', file=file) + for md in mdl.multi_metadata_dict: + print('| {}: {}'.format(md, md_cnt.get(md, 0)), file=file) md_stack = np.stack([doc.numeric_metadata for doc in mdl.docs]) md_min = md_stack.min(axis=0) @@ -153,7 +166,7 @@ def params_info_SLDAModel(mdl, file): def params_info_DMRModel(mdl, file): 
print('| lambda (feature vector per metadata of documents)\n' - '| {}'.format(_format_numpy(mdl.lambdas, '| ')), file=file) + '| {}'.format(_format_numpy(mdl.lambda_, '| ')), file=file) print('| alpha (Dirichlet prior on the per-document topic distributions for each metadata)', file=file) for i, md in enumerate(mdl.metadata_dict): print('| {}: {}'.format(md, _format_numpy(mdl.alpha[:, i], '| ')), file=file) @@ -162,7 +175,7 @@ def params_info_DMRModel(mdl, file): def params_info_GDMRModel(mdl, file): print('| lambda (feature vector per metadata of documents)\n' - '| {}'.format(_format_numpy(mdl.lambdas, '| ')), file=file) + '| {}'.format(_format_numpy(mdl.lambda_, '| ')), file=file) print('| eta (Dirichlet prior on the per-topic word distribution)\n' '| {:.5}'.format(mdl.eta), file=file) diff --git a/tomotopy/_version.py b/tomotopy/_version.py index ff1e05d..a514f89 100644 --- a/tomotopy/_version.py +++ b/tomotopy/_version.py @@ -1 +1 @@ -__version__ = '0.11.1' \ No newline at end of file +__version__ = '0.12.0' \ No newline at end of file From e59b38b24327f25499e40d17ecca9b087e8cd6c7 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 25 Apr 2021 22:37:06 +0900 Subject: [PATCH 2/5] update compiler args --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1c7dbf7..112e421 100644 --- a/setup.py +++ b/setup.py @@ -26,13 +26,13 @@ cargs = ['/O2', '/MT', '/Gy'] arch_levels = {'':'', 'sse2':'/arch:SSE2', 'avx':'/arch:AVX', 'avx2':'/arch:AVX2'} elif platform.system() == 'Darwin': - cargs = ['-std=c++0x', '-O3', '-fpermissive', '-stdlib=libc++', '-Wno-unused-variable', '-Wno-switch'] + cargs = ['-std=c++1y', '-O3', '-fpermissive', '-stdlib=libc++', '-Wno-unused-variable', '-Wno-switch'] largs += ['-stdlib=libc++'] if 'many' not in os.environ.get('AUDITWHEEL_PLAT', ''): arch_levels = {'':'-march=native'} elif 'many' in os.environ.get('AUDITWHEEL_PLAT', ''): - cargs = ['-std=c++0x', '-O3', '-fpermissive', '-g0', 
'-Wno-unused-variable', '-Wno-switch'] + cargs = ['-std=c++1y', '-O3', '-fpermissive', '-g0', '-Wno-unused-variable', '-Wno-switch'] else: - cargs = ['-std=c++0x', '-O3', '-fpermissive', '-Wno-unused-variable', '-Wno-switch'] + cargs = ['-std=c++1y', '-O3', '-fpermissive', '-Wno-unused-variable', '-Wno-switch'] arch_levels = {'':'-march=native'} if struct.calcsize('P') < 8: arch_levels = {k:v for k, v in arch_levels.items() if k in ('', 'sse2')} From a3dd3b95c7af5512b05d7d47eca201c0843c5bfe Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 25 Apr 2021 22:45:07 +0900 Subject: [PATCH 3/5] fixed compilation errors --- src/TopicModel/DMRModel.hpp | 4 +- src/python/docs.h | 632 ++++++++++++++++++------------------ src/python/py_DMR.cpp | 2 +- 3 files changed, 319 insertions(+), 319 deletions(-) diff --git a/src/TopicModel/DMRModel.hpp b/src/TopicModel/DMRModel.hpp index 2e39a82..cf898a9 100644 --- a/src/TopicModel/DMRModel.hpp +++ b/src/TopicModel/DMRModel.hpp @@ -77,7 +77,7 @@ namespace tomoto const auto K = this->K; Float fx = -static_cast(this)->getNegativeLambdaLL(x, g); - Eigen::Map xReshaped{ x.data(), K, F * mdVecSize }; + Eigen::Map xReshaped{ x.data(), (Eigen::Index)K, (Eigen::Index)(F * mdVecSize) }; std::vector>> res; const size_t chStride = pool.getNumWorkers() * 8; @@ -88,7 +88,7 @@ namespace tomoto auto& tmpK = localData[threadId].tmpK; if (!tmpK.size()) tmpK.resize(this->K); Eigen::Array val = Eigen::Array::Zero(K * F * mdVecSize + 1); - Eigen::Map grad{ val.data(), K, F * mdVecSize }; + Eigen::Map grad{ val.data(), (Eigen::Index)K, (Eigen::Index)(F * mdVecSize) }; Float& fx = val[K * F * mdVecSize]; for (size_t docId = ch; docId < this->docs.size(); docId += chStride) { diff --git a/src/python/docs.h b/src/python/docs.h index fdc3dfa..c268fc4 100644 --- a/src/python/docs.h +++ b/src/python/docs.h @@ -12,23 +12,23 @@ #endif /* - class Document + class Document */ DOC_SIGNATURE_EN_KO(Document___init____doc__, - "Document()", - u8R""(This type provides 
abstract model to access documents to be used Topic Model. + "Document()", + u8R""(This type provides abstract model to access documents to be used Topic Model. An instance of this type can be acquired from `tomotopy.LDAModel.make_doc` method or `tomotopy.LDAModel.docs` member of each Topic Model instance.)"", - u8R""(이 타입은 토픽 모델에 사용되는 문헌들에 접근할 수 있는 추상 인터페이스을 제공합니다.)""); +u8R""(이 타입은 토픽 모델에 사용되는 문헌들에 접근할 수 있는 추상 인터페이스을 제공합니다.)""); DOC_SIGNATURE_EN_KO(Document_get_topics__doc__, - "get_topics(self, top_n=10)", - u8R""(Return the `top_n` topics with its probability of the document.)"", - u8R""(현재 문헌의 상위 `top_n`개의 토픽과 그 확률을 `tuple`의 `list` 형태로 반환합니다.)""); + "get_topics(self, top_n=10)", + u8R""(Return the `top_n` topics with its probability of the document.)"", + u8R""(현재 문헌의 상위 `top_n`개의 토픽과 그 확률을 `tuple`의 `list` 형태로 반환합니다.)""); DOC_SIGNATURE_EN_KO(Document_get_topic_dist__doc__, - "get_topic_dist(self, normalize=True)", - u8R""(Return a distribution of the topics in the document. + "get_topic_dist(self, normalize=True)", + u8R""(Return a distribution of the topics in the document. Parameters ---------- @@ -37,7 +37,7 @@ normalize : bool If True, it returns the probability distribution with the sum being 1. Otherwise it returns the distribution of raw values. )"", - u8R""(현재 문헌의 토픽 확률 분포를 `list` 형태로 반환합니다. +u8R""(현재 문헌의 토픽 확률 분포를 `list` 형태로 반환합니다. Parameters ---------- @@ -48,17 +48,17 @@ normalize : bool )""); DOC_SIGNATURE_EN_KO(Document_get_sub_topics__doc__, - "get_sub_topics(self, top_n=10)", - u8R""(.. versionadded:: 0.5.0 + "get_sub_topics(self, top_n=10)", + u8R""(.. versionadded:: 0.5.0 Return the `top_n` sub topics with its probability of the document. (for only `tomotopy.PAModel`))"", - u8R""(.. versionadded:: 0.5.0 +u8R""(.. versionadded:: 0.5.0 현재 문헌의 상위 `top_n`개의 하위 토픽과 그 확률을 `tuple`의 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용))""); DOC_SIGNATURE_EN_KO(Document_get_sub_topic_dist__doc__, - "get_sub_topic_dist(self, normalize=True)", - u8R""(.. 
versionadded:: 0.5.0 + "get_sub_topic_dist(self, normalize=True)", + u8R""(.. versionadded:: 0.5.0 Return a distribution of the sub topics in the document. (for only `tomotopy.PAModel`) @@ -68,7 +68,7 @@ normalize : bool .. versionadded:: 0.11.0 If True, it returns the probability distribution with the sum being 1. Otherwise it returns the distribution of raw values.)"", - u8R""(.. versionadded:: 0.5.0 + u8R""(.. versionadded:: 0.5.0 현재 문헌의 하위 토픽 확률 분포를 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용) @@ -81,17 +81,17 @@ normalize : bool )""); DOC_SIGNATURE_EN_KO(Document_get_words__doc__, - "get_words(self, top_n=10)", - u8R""(.. versionadded:: 0.4.2 + "get_words(self, top_n=10)", + u8R""(.. versionadded:: 0.4.2 Return the `top_n` words with its probability of the document.)"", - u8R""(.. versionadded:: 0.4.2 +u8R""(.. versionadded:: 0.4.2 현재 문헌의 상위 `top_n`개의 단어와 그 확률을 `tuple`의 `list` 형태로 반환합니다.)""); DOC_SIGNATURE_EN_KO(Document_get_count_vector__doc__, - "get_count_vector(self)", - u8R""(.. versionadded:: 0.7.0 + "get_count_vector(self)", + u8R""(.. versionadded:: 0.7.0 Return a count vector for the current document.)"", u8R""(.. versionadded:: 0.7.0 @@ -108,15 +108,15 @@ u8R""(.. 
versionadded:: 0.10.0 현재 문헌의 로그가능도 총합을 반환합니다.)""); DOC_VARIABLE_EN_KO(Document_words__doc__, - u8R""(a `list` of IDs for each word (read-only))"", - u8R""(문헌 내 단어들의 ID가 담긴 `list` (읽기전용))""); + u8R""(a `list` of IDs for each word (read-only))"", + u8R""(문헌 내 단어들의 ID가 담긴 `list` (읽기전용))""); DOC_VARIABLE_EN_KO(Document_weight__doc__, - u8R""(a weight of the document (read-only))"", - u8R""(문헌의 가중치 (읽기전용))""); + u8R""(a weight of the document (read-only))"", + u8R""(문헌의 가중치 (읽기전용))""); DOC_VARIABLE_EN_KO(Document_topics__doc__, - u8R""(a `list` of topics for each word (read-only) + u8R""(a `list` of topics for each word (read-only) This represents super topics in `tomotopy.PAModel` and `tomotopy.HPAModel` model.)"", u8R""(문헌의 단어들이 각각 할당된 토픽을 보여주는 `list` (읽기 전용) @@ -128,14 +128,14 @@ DOC_VARIABLE_EN_KO(Document_uid__doc__, u8R""(문헌의 고유 ID (읽기전용))""); DOC_VARIABLE_EN_KO(Document_metadata__doc__, - u8R""(categorical metadata of the document (for only `tomotopy.DMRModel` and `tomotopy.GDMRModel` model, read-only))"", - u8R""(문헌의 범주형 메타데이터 (`tomotopy.DMRModel`과 `tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용))""); + u8R""(categorical metadata of the document (for only `tomotopy.DMRModel` and `tomotopy.GDMRModel` model, read-only))"", + u8R""(문헌의 범주형 메타데이터 (`tomotopy.DMRModel`과 `tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용))""); DOC_VARIABLE_EN_KO(Document_multi_metadata__doc__, u8R""(categorical multiple metadata of the document (for only `tomotopy.DMRModel` and `tomotopy.GDMRModel` model, read-only) .. versionadded:: 0.12.0)"", - u8R""(문헌의 범주형 메타데이터 (`tomotopy.DMRModel`과 `tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용) +u8R""(문헌의 범주형 메타데이터 (`tomotopy.DMRModel`과 `tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용) .. versionadded:: 0.12.0)""); @@ -143,44 +143,44 @@ DOC_VARIABLE_EN_KO(Document_numeric_metadata__doc__, u8R""(continuous numeric metadata of the document (for only `tomotopy.GDMRModel` model, read-only) .. 
versionadded:: 0.11.0)"", - u8R""(문헌의 연속형 숫자 메타데이터 (`tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용) +u8R""(문헌의 연속형 숫자 메타데이터 (`tomotopy.GDMRModel` 모형에서만 사용됨, 읽기전용) .. versionadded:: 0.11.0)""); DOC_VARIABLE_EN_KO(Document_subtopics__doc__, - u8R""(a `list` of sub topics for each word (for only `tomotopy.PAModel` and `tomotopy.HPAModel` model, read-only))"", - u8R""(문헌의 단어들이 각각 할당된 하위 토픽을 보여주는 `list` (`tomotopy.PAModel`와 `tomotopy.HPAModel` 모형에서만 사용됨, 읽기전용))""); + u8R""(a `list` of sub topics for each word (for only `tomotopy.PAModel` and `tomotopy.HPAModel` model, read-only))"", + u8R""(문헌의 단어들이 각각 할당된 하위 토픽을 보여주는 `list` (`tomotopy.PAModel`와 `tomotopy.HPAModel` 모형에서만 사용됨, 읽기전용))""); DOC_VARIABLE_EN_KO(Document_windows__doc__, - u8R""(a `list` of window IDs for each word (for only `tomotopy.MGLDAModel` model, read-only))"", - u8R""(문헌의 단어들이 할당된 윈도우의 ID를 보여주는 `list` (`tomotopy.MGLDAModel` 모형에서만 사용됨, 읽기전용))""); + u8R""(a `list` of window IDs for each word (for only `tomotopy.MGLDAModel` model, read-only))"", + u8R""(문헌의 단어들이 할당된 윈도우의 ID를 보여주는 `list` (`tomotopy.MGLDAModel` 모형에서만 사용됨, 읽기전용))""); DOC_VARIABLE_EN_KO(Document_path__doc__, - u8R""(a `list` of topic ids by depth for a given document (for only `tomotopy.HLDAModel` model, read-only) + u8R""(a `list` of topic ids by depth for a given document (for only `tomotopy.HLDAModel` model, read-only) .. versionadded:: 0.7.1)"", u8R""(주어진 문헌에 대한 깊이별 토픽 번호의 `list` (`tomotopy.HLDAModel` 모형에서만 사용됨, 읽기전용) .. versionadded:: 0.7.1)""); -DOC_VARIABLE_EN_KO(Document_beta__doc__, - u8R""(a `list` of beta parameters for each topic (for only `tomotopy.CTModel` model, read-only) +DOC_VARIABLE_EN_KO(Document_beta__doc__, + u8R""(a `list` of beta parameters for each topic (for only `tomotopy.CTModel` model, read-only) .. versionadded:: 0.2.0)"", - u8R""(문헌의 각 토픽별 beta 파라미터를 보여주는 `list` (`tomotopy.CTModel` 모형에서만 사용됨, 읽기전용) +u8R""(문헌의 각 토픽별 beta 파라미터를 보여주는 `list` (`tomotopy.CTModel` 모형에서만 사용됨, 읽기전용) .. 
versionadded:: 0.2.0)""); DOC_VARIABLE_EN_KO(Document_vars__doc__, - u8R""(a `list` of response variables (for only `tomotopy.SLDAModel` model, read-only) + u8R""(a `list` of response variables (for only `tomotopy.SLDAModel` model, read-only) .. versionadded:: 0.2.0)"", - u8R""(문헌의 응답 변수를 보여주는 `list` (`tomotopy.SLDAModel` 모형에서만 사용됨 , 읽기전용) +u8R""(문헌의 응답 변수를 보여주는 `list` (`tomotopy.SLDAModel` 모형에서만 사용됨 , 읽기전용) .. versionadded:: 0.2.0)""); DOC_VARIABLE_EN_KO(Document_labels__doc__, - u8R""(a `list` of (label, list of probabilties of each topic belonging to the label) of the document (for only `tomotopy.LLDAModel` and `tomotopy.PLDAModel` models, read-only) + u8R""(a `list` of (label, list of probabilties of each topic belonging to the label) of the document (for only `tomotopy.LLDAModel` and `tomotopy.PLDAModel` models, read-only) .. versionadded:: 0.3.0)"", u8R""(문헌에 매겨진 (레이블, 레이블에 속하는 각 주제의 확률들)의 `list` (`tomotopy.LLDAModel`, `tomotopy.PLDAModel` 모형에서만 사용됨 , 읽기전용) @@ -188,7 +188,7 @@ u8R""(문헌에 매겨진 (레이블, 레이블에 속하는 각 주제의 확 .. versionadded:: 0.3.0)""); DOC_VARIABLE_EN_KO(Document_eta__doc__, - u8R""(a `list` of eta parameters(topic distribution) for the current document (for only `tomotopy.DTModel` model, read-only) + u8R""(a `list` of eta parameters(topic distribution) for the current document (for only `tomotopy.DTModel` model, read-only) .. versionadded:: 0.7.0)"", u8R""(문헌의 eta 파라미터(토픽 분포)를 나타내는 `list` (`tomotopy.DTModel` 모형에서만 사용됨, 읽기전용) @@ -196,7 +196,7 @@ u8R""(문헌의 eta 파라미터(토픽 분포)를 나타내는 `list` (`tomotop .. versionadded:: 0.7.0)""); DOC_VARIABLE_EN_KO(Document_timepoint__doc__, - u8R""(a timepoint of the document (for only `tomotopy.DTModel` model, read-only) + u8R""(a timepoint of the document (for only `tomotopy.DTModel` model, read-only) .. versionadded:: 0.7.0)"", u8R""(문헌의 시점 (`tomotopy.DTModel` 모형에서만 사용됨, 읽기전용) @@ -220,11 +220,11 @@ u8R""(문헌이 할당된 가상 문헌의 id (`tomotopy.PTModel` 모형에서 .. 
versionadded:: 0.11.0)""); /* - class LDA + class LDA */ DOC_SIGNATURE_EN_KO(LDA___init____doc__, - "LDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", - u8R""(This type provides Latent Dirichlet Allocation(LDA) topic model and its implementation is based on following papers: + "LDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", + u8R""(This type provides Latent Dirichlet Allocation(LDA) topic model and its implementation is based on following papers: > * Blei, D.M., Ng, A.Y., &Jordan, M.I. (2003).Latent dirichlet allocation.Journal of machine Learning research, 3(Jan), 993 - 1022. > * Newman, D., Asuncion, A., Smyth, P., &Welling, M. (2009).Distributed algorithms for topic models.Journal of Machine Learning Research, 10(Aug), 1801 - 1828. @@ -305,8 +305,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(LDA_add_doc__doc__, - "add_doc(self, words)", - u8R""(Add a new document into the model instance and return an index of the inserted document. This method should be called before calling the `tomotopy.LDAModel.train`. + "add_doc(self, words)", + u8R""(Add a new document into the model instance and return an index of the inserted document. This method should be called before calling the `tomotopy.LDAModel.train`. Parameters ---------- @@ -347,8 +347,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(LDA_make_doc__doc__, - "make_doc(self, words)", - u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` that can be used for `tomotopy.LDAModel.infer` method. + "make_doc(self, words)", + u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` that can be used for `tomotopy.LDAModel.infer` method. 
Parameters ---------- @@ -364,8 +364,8 @@ words : Iterable[str] )""); DOC_SIGNATURE_EN_KO(LDA_set_word_prior__doc__, - "set_word_prior(self, word, prior)", - u8R""(.. versionadded:: 0.6.0 + "set_word_prior(self, word, prior)", + u8R""(.. versionadded:: 0.6.0 Set word-topic prior. This method should be called before calling the `tomotopy.LDAModel.train`. @@ -389,8 +389,8 @@ prior : Iterable[float] )""); DOC_SIGNATURE_EN_KO(LDA_get_word_prior__doc__, - "get_word_prior(self, word)", - u8R""(.. versionadded:: 0.6.0 + "get_word_prior(self, word)", + u8R""(.. versionadded:: 0.6.0 Return word-topic prior for `word`. If there is no set prior for `word`, an empty list is returned. @@ -410,8 +410,8 @@ word : str )""); DOC_SIGNATURE_EN_KO(LDA_train__doc__, - "train(self, iter=10, workers=0, parallel=0, freeze_topics=False)", - u8R""(Train the model using Gibbs-sampling with `iter` iterations. Return `None`. + "train(self, iter=10, workers=0, parallel=0, freeze_topics=False)", + u8R""(Train the model using Gibbs-sampling with `iter` iterations. Return `None`. After calling this method, you cannot `tomotopy.LDAModel.add_doc` or `tomotopy.LDAModel.set_word_prior` more. Parameters @@ -451,8 +451,8 @@ freeze_topics : bool )""); DOC_SIGNATURE_EN_KO(LDA_get_topic_words__doc__, - "get_topic_words(self, topic_id, top_n=10)", - u8R""(Return the `top_n` words and its probability in the topic `topic_id`. + "get_topic_words(self, topic_id, top_n=10)", + u8R""(Return the `top_n` words and its probability in the topic `topic_id`. The return type is a `list` of (word:`str`, probability:`float`). Parameters @@ -470,8 +470,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(LDA_get_topic_word_dist__doc__, - "get_topic_word_dist(self, topic_id, normalize=True)", - u8R""(Return the word distribution of the topic `topic_id`. + "get_topic_word_dist(self, topic_id, normalize=True)", + u8R""(Return the word distribution of the topic `topic_id`. 
The returned value is a `list` that has `len(vocabs)` fraction numbers indicating probabilities for each word in the current topic. Parameters @@ -497,13 +497,13 @@ normalize : bool )""); DOC_SIGNATURE_EN_KO(LDA_get_count_by_topics__doc__, - "get_count_by_topics(self)", - u8R""(Return the number of words allocated to each topic.)"", - u8R""(각각의 토픽에 할당된 단어의 개수를 `list`형태로 반환합니다.)""); + "get_count_by_topics(self)", + u8R""(Return the number of words allocated to each topic.)"", + u8R""(각각의 토픽에 할당된 단어의 개수를 `list`형태로 반환합니다.)""); DOC_SIGNATURE_EN_KO(LDA_infer__doc__, - "infer(self, doc, iter=100, tolerance=-1, workers=0, parallel=0, together=False, transform=None)", - u8R""(Return the inferred topic distribution from unseen `doc`s. + "infer(self, doc, iter=100, tolerance=-1, workers=0, parallel=0, together=False, transform=None)", + u8R""(Return the inferred topic distribution from unseen `doc`s. Parameters ---------- @@ -597,8 +597,8 @@ log_ll : float )""); DOC_SIGNATURE_EN_KO(LDA_save__doc__, - "save(self, filename, full=True)", - u8R""(Save the model instance to file `filename`. Return `None`. + "save(self, filename, full=True)", + u8R""(Save the model instance to file `filename`. Return `None`. If `full` is `True`, the model with its all documents and state will be saved. If you want to train more after, use full model. If `False`, only topic parameters of the model will be saved. This model can be only used for inference of an unseen document. @@ -630,9 +630,9 @@ u8R""(.. versionadded:: 0.11.0 DOC_SIGNATURE_EN_KO(LDA_load__doc__, - "load(filename)", - u8R""(Return the model instance loaded from file `filename`.)"", - u8R""(`filename` 경로의 파일로부터 모델 인스턴스를 읽어들여 반환합니다.)""); + "load(filename)", + u8R""(Return the model instance loaded from file `filename`.)"", + u8R""(`filename` 경로의 파일로부터 모델 인스턴스를 읽어들여 반환합니다.)""); DOC_SIGNATURE_EN_KO(LDA_loads__doc__, "loads(data)", @@ -644,7 +644,7 @@ DOC_SIGNATURE_EN_KO(LDA_copy__doc__, u8R""(.. 
versionadded:: 0.12.0 Return a new deep-copied instance of the current instance)"", - u8R""(.. versionadded:: 0.12.0 +u8R""(.. versionadded:: 0.12.0 깊게 복사된 새 인스턴스를 반환합니다.)""); @@ -688,46 +688,46 @@ flush : bool DOC_VARIABLE_EN_KO(LDA_tw__doc__, - u8R""(the term weighting scheme (read-only))"", - u8R""(현재 모델의 용어 가중치 계획 (읽기전용))""); + u8R""(the term weighting scheme (read-only))"", + u8R""(현재 모델의 용어 가중치 계획 (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_perplexity__doc__, - u8R""(a perplexity of the model (read-only))"", - u8R""(현재 모델의 Perplexity (읽기전용))""); + u8R""(a perplexity of the model (read-only))"", + u8R""(현재 모델의 Perplexity (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_ll_per_word__doc__, - u8R""(a log likelihood per-word of the model (read-only))"", - u8R""(현재 모델의 단어당 로그 가능도 (읽기전용))""); + u8R""(a log likelihood per-word of the model (read-only))"", + u8R""(현재 모델의 단어당 로그 가능도 (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_k__doc__, - u8R""(K, the number of topics (read-only))"", - u8R""(토픽의 개수 (읽기전용))""); + u8R""(K, the number of topics (read-only))"", + u8R""(토픽의 개수 (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_alpha__doc__, - u8R""(Dirichlet prior on the per-document topic distributions (read-only))"", - u8R""(문헌의 토픽 분포에 대한 디리클레 분포 파라미터 (읽기전용))""); + u8R""(Dirichlet prior on the per-document topic distributions (read-only))"", + u8R""(문헌의 토픽 분포에 대한 디리클레 분포 파라미터 (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_eta__doc__, - u8R""(the hyperparameter eta (read-only))"", - u8R""(하이퍼 파라미터 eta (읽기전용))""); + u8R""(the hyperparameter eta (read-only))"", + u8R""(하이퍼 파라미터 eta (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_docs__doc__, - u8R""(a `list`-like interface of `tomotopy.Document` in the model instance (read-only))"", - u8R""(현재 모델에 포함된 `tomotopy.Document`에 접근할 수 있는 `list`형 인터페이스 (읽기전용))""); + u8R""(a `list`-like interface of `tomotopy.Document` in the model instance (read-only))"", + u8R""(현재 모델에 포함된 `tomotopy.Document`에 접근할 수 있는 `list`형 인터페이스 (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_vocabs__doc__, - u8R""(a dictionary, which contains 
both vocabularies filtered by frequency and vocabularies actually used in modeling, as the type `tomotopy.Dictionary` (read-only))"", - u8R""(빈도수로 필터링된 어휘와 모델에 포함된 어휘 전체를 포함하는 `tomotopy.Dictionary` 타입의 어휘 사전 (읽기전용))""); + u8R""(a dictionary, which contains both vocabularies filtered by frequency and vocabularies actually used in modeling, as the type `tomotopy.Dictionary` (read-only))"", + u8R""(빈도수로 필터링된 어휘와 모델에 포함된 어휘 전체를 포함하는 `tomotopy.Dictionary` 타입의 어휘 사전 (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_num_vocabs__doc__, - u8R""(the number of vocabuluaries after words with a smaller frequency were removed (read-only) + u8R""(the number of vocabuluaries after words with a smaller frequency were removed (read-only) This value is 0 before `train` called. .. deprecated:: 0.8.0 Due to the confusion of its name, this property will be removed. Please use `len(used_vocabs)` instead.)"", - u8R""(작은 빈도의 단어들을 제거한 뒤 남은 어휘의 개수 (읽기전용) + u8R""(작은 빈도의 단어들을 제거한 뒤 남은 어휘의 개수 (읽기전용) `train`이 호출되기 전에는 이 값은 0입니다. @@ -736,45 +736,45 @@ This value is 0 before `train` called. 이 프로퍼티의 이름은 혼동을 일으킬 여지가 있어 제거될 예정입니다. 대신 `len(used_vocabs)`을 사용하십시오.)""); DOC_VARIABLE_EN_KO(LDA_used_vocabs__doc__, - u8R""(a dictionary, which contains only the vocabularies actually used in modeling, as the type `tomotopy.Dictionary` (read-only) + u8R""(a dictionary, which contains only the vocabularies actually used in modeling, as the type `tomotopy.Dictionary` (read-only) .. versionadded:: 0.8.0)"", - u8R""(모델에 실제로 사용된 어휘만을 포함하는 `tomotopy.Dictionary` 타입의 어휘 사전 (읽기전용) +u8R""(모델에 실제로 사용된 어휘만을 포함하는 `tomotopy.Dictionary` 타입의 어휘 사전 (읽기전용) .. 
versionadded:: 0.8.0)""); DOC_VARIABLE_EN_KO(LDA_vocab_freq__doc__, - u8R""(a `list` of vocabulary frequencies which contains both vocabularies filtered by frequency and vocabularies actually used in modeling (read-only))"", - u8R""(빈도수로 필터링된 어휘와 현재 모델에 포함된 어휘 전체의 빈도를 보여주는 `list` (읽기전용))""); + u8R""(a `list` of vocabulary frequencies which contains both vocabularies filtered by frequency and vocabularies actually used in modeling (read-only))"", + u8R""(빈도수로 필터링된 어휘와 현재 모델에 포함된 어휘 전체의 빈도를 보여주는 `list` (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_used_vocab_freq__doc__, - u8R""(a `list` of vocabulary frequencies which contains only vocabularies actually used in modeling (read-only) + u8R""(a `list` of vocabulary frequencies which contains only vocabularies actually used in modeling (read-only) .. versionadded:: 0.8.0)"", - u8R""(모델에 실제로 사용된 어휘들의 빈도를 보여주는 `list` (읽기전용))""); +u8R""(모델에 실제로 사용된 어휘들의 빈도를 보여주는 `list` (읽기전용))""); DOC_VARIABLE_EN_KO(LDA_vocab_df__doc__, - u8R""(a `list` of vocabulary document-frequencies which contains both vocabularies filtered by frequency and vocabularies actually used in modeling (read-only) + u8R""(a `list` of vocabulary document-frequencies which contains both vocabularies filtered by frequency and vocabularies actually used in modeling (read-only) .. versionadded:: 0.8.0)"", - u8R""(빈도수로 필터링된 어휘와 현재 모델에 포함된 어휘 전체의 문헌빈도를 보여주는 `list` (읽기전용) +u8R""(빈도수로 필터링된 어휘와 현재 모델에 포함된 어휘 전체의 문헌빈도를 보여주는 `list` (읽기전용) .. versionadded:: 0.8.0)""); DOC_VARIABLE_EN_KO(LDA_used_vocab_df__doc__, - u8R""(a `list` of vocabulary document-frequencies which contains only vocabularies actually used in modeling (read-only) + u8R""(a `list` of vocabulary document-frequencies which contains only vocabularies actually used in modeling (read-only) .. versionadded:: 0.8.0)"", - u8R""(모델에 실제로 사용된 어휘들의 문헌빈도를 보여주는 `list` (읽기전용) +u8R""(모델에 실제로 사용된 어휘들의 문헌빈도를 보여주는 `list` (읽기전용) .. 
versionadded:: 0.8.0)""); DOC_VARIABLE_EN_KO(LDA_num_words__doc__, - u8R""(the number of total words (read-only) + u8R""(the number of total words (read-only) This value is 0 before `train` called.)"", - u8R""(현재 모델에 포함된 문헌들 전체의 단어 개수 (읽기전용) +u8R""(현재 모델에 포함된 문헌들 전체의 단어 개수 (읽기전용) `train`이 호출되기 전에는 이 값은 0입니다.)""); @@ -787,31 +787,31 @@ u8R""(현재까지 수행된 학습의 총 반복 횟수 (읽기전용) .. versionadded:: 0.9.0)""); DOC_VARIABLE_EN_KO(LDA_optim_interval__doc__, - u8R""(get or set the interval for optimizing parameters + u8R""(get or set the interval for optimizing parameters Its default value is 10. If it is set to 0, the parameter optimization is turned off.)"", - u8R""(파라미터 최적화의 주기를 얻거나 설정합니다. +u8R""(파라미터 최적화의 주기를 얻거나 설정합니다. 기본값은 10이며, 0으로 설정할 경우 학습 과정에서 파라미터 최적화를 수행하지 않습니다.)""); DOC_VARIABLE_EN_KO(LDA_burn_in__doc__, - u8R""(get or set the burn-in iterations for optimizing parameters + u8R""(get or set the burn-in iterations for optimizing parameters Its default value is 0.)"", - u8R""(파라미터 학습 초기의 Burn-in 단계의 반복 횟수를 얻거나 설정합니다. +u8R""(파라미터 학습 초기의 Burn-in 단계의 반복 횟수를 얻거나 설정합니다. 기본값은 0입니다.)""); DOC_VARIABLE_EN_KO(LDA_removed_top_words__doc__, - u8R""(a `list` of `str` which is a word removed from the model if you set `rm_top` greater than 0 at initializing the model (read-only))"", -u8R""(모델 생성시 `rm_top` 파라미터를 1 이상으로 설정한 경우, 빈도수가 높아서 모델에서 제외된 단어의 목록을 보여줍니다. (읽기전용))""); + u8R""(a `list` of `str` which is a word removed from the model if you set `rm_top` greater than 0 at initializing the model (read-only))"", + u8R""(모델 생성시 `rm_top` 파라미터를 1 이상으로 설정한 경우, 빈도수가 높아서 모델에서 제외된 단어의 목록을 보여줍니다. 
(읽기전용))""); /* - class DMR + class DMR */ DOC_SIGNATURE_EN_KO(DMR___init____doc__, - "DMRModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, alpha=0.1, eta=0.01, sigma=1.0, alpha_epsilon=0.0000000001, seed=None, corpus=None, transform=None)", - u8R""(This type provides Dirichlet Multinomial Regression(DMR) topic model and its implementation is based on following papers: + "DMRModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, alpha=0.1, eta=0.01, sigma=1.0, alpha_epsilon=0.0000000001, seed=None, corpus=None, transform=None)", + u8R""(This type provides Dirichlet Multinomial Regression(DMR) topic model and its implementation is based on following papers: > * Mimno, D., & McCallum, A. (2012). Topic models conditioned on arbitrary features with dirichlet-multinomial regression. arXiv preprint arXiv:1206.3278. @@ -898,8 +898,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(DMR_add_doc__doc__, - "add_doc(self, words, metadata='', multi_metadata=[])", - u8R""(Add a new document into the model instance with `metadata` and return an index of the inserted document. + "add_doc(self, words, metadata='', multi_metadata=[])", + u8R""(Add a new document into the model instance with `metadata` and return an index of the inserted document. .. versionchanged:: 0.12.0 @@ -931,8 +931,8 @@ multi_metadata : Iterable[str] )""); DOC_SIGNATURE_EN_KO(DMR_make_doc__doc__, - "make_doc(self, words, metadata='', multi_metadata=[])", - u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `metadata` that can be used for `tomotopy.LDAModel.infer` method. + "make_doc(self, words, metadata='', multi_metadata=[])", + u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `metadata` that can be used for `tomotopy.LDAModel.infer` method. .. 
versionchanged:: 0.12.0 @@ -1010,20 +1010,20 @@ raw : bool )""); DOC_VARIABLE_EN_KO(DMR_f__doc__, - u8R""(the number of metadata features (read-only))"", - u8R""(메타데이터 자질 종류의 개수 (읽기전용))""); + u8R""(the number of metadata features (read-only))"", + u8R""(메타데이터 자질 종류의 개수 (읽기전용))""); DOC_VARIABLE_EN_KO(DMR_sigma__doc__, - u8R""(the hyperparameter sigma (read-only))"", - u8R""(하이퍼 파라미터 sigma (읽기전용))""); + u8R""(the hyperparameter sigma (read-only))"", + u8R""(하이퍼 파라미터 sigma (읽기전용))""); DOC_VARIABLE_EN_KO(DMR_alpha_epsilon__doc__, - u8R""(the smooting value alpha-epsilon (read-only))"", - u8R""(평탄화 계수 alpha-epsilon (읽기전용))""); + u8R""(the smooting value alpha-epsilon (read-only))"", + u8R""(평탄화 계수 alpha-epsilon (읽기전용))""); DOC_VARIABLE_EN_KO(DMR_metadata_dict__doc__, - u8R""(a dictionary of metadata in type `tomotopy.Dictionary` (read-only))"", - u8R""(`tomotopy.Dictionary` 타입의 메타데이터 사전 (읽기전용))""); + u8R""(a dictionary of metadata in type `tomotopy.Dictionary` (read-only))"", + u8R""(`tomotopy.Dictionary` 타입의 메타데이터 사전 (읽기전용))""); DOC_VARIABLE_EN_KO(DMR_multi_metadata_dict__doc__, u8R""(a dictionary of metadata in type `tomotopy.Dictionary` (read-only) @@ -1032,7 +1032,7 @@ DOC_VARIABLE_EN_KO(DMR_multi_metadata_dict__doc__, This dictionary is distinct from `metadata_dict`. )"", - u8R""(`tomotopy.Dictionary` 타입의 메타데이터 사전 (읽기전용) +u8R""(`tomotopy.Dictionary` 타입의 메타데이터 사전 (읽기전용) .. versionadded:: 0.12.0 @@ -1040,13 +1040,13 @@ DOC_VARIABLE_EN_KO(DMR_multi_metadata_dict__doc__, )""); DOC_VARIABLE_EN_KO(DMR_lamdas__doc__, - u8R""(parameter lambdas in the shape `[k, f]` (read-only) + u8R""(parameter lambdas in the shape `[k, f]` (read-only) .. warning:: Prior to version 0.11.0, there was a bug in the lambda getter, so it yielded the wrong value. It is recommended to upgrade to version 0.11.0 or later. )"", - u8R""(현재 모형의 lambda 파라미터을 보여주는 `[k, f]` 모양의 float array (읽기전용) +u8R""(현재 모형의 lambda 파라미터을 보여주는 `[k, f]` 모양의 float array (읽기전용) .. 
warning:: @@ -1084,11 +1084,11 @@ DOC_VARIABLE_EN_KO(DMR_alpha__doc__, 0.11.0 버전 전까지는 lambda getter에 있는 버그로 잘못된 값이 출력되었습니다. 0.11.0 이후 버전으로 업그레이드하시길 권장합니다.)""); /* - class GDMR + class GDMR */ DOC_SIGNATURE_EN_KO(GDMR___init____doc__, - "GDMRModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, degrees=[], alpha=0.1, eta=0.01, sigma=1.0, sigma0=3.0, decay=0, alpha_epsilon=0.0000000001, metadata_range=None, seed=None, corpus=None, transform=None)", - u8R""(This type provides Generalized DMR(g-DMR) topic model and its implementation is based on following papers: + "GDMRModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, degrees=[], alpha=0.1, eta=0.01, sigma=1.0, sigma0=3.0, decay=0, alpha_epsilon=0.0000000001, metadata_range=None, seed=None, corpus=None, transform=None)", + u8R""(This type provides Generalized DMR(g-DMR) topic model and its implementation is based on following papers: > * Lee, M., & Song, M. Incorporating citation impact into analysis of research trends. Scientometrics, 1-34. @@ -1207,8 +1207,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(GDMR_add_doc__doc__, - "add_doc(self, words, numeric_metadata=[], metadata='', multi_metadata=[])", - u8R""(Add a new document into the model instance with `metadata` and return an index of the inserted document. + "add_doc(self, words, numeric_metadata=[], metadata='', multi_metadata=[])", + u8R""(Add a new document into the model instance with `metadata` and return an index of the inserted document. .. versionchanged:: 0.11.0 @@ -1256,8 +1256,8 @@ multi_metadata : Iterable[str] )""); DOC_SIGNATURE_EN_KO(GDMR_make_doc__doc__, - "make_doc(self, words, numeric_metadata=[], metadata='', multi_metadata=[])", - u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `metadata` that can be used for `tomotopy.LDAModel.infer` method. 
+ "make_doc(self, words, numeric_metadata=[], metadata='', multi_metadata=[])", + u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `metadata` that can be used for `tomotopy.LDAModel.infer` method. .. versionchanged:: 0.11.0 @@ -1305,8 +1305,8 @@ multi_metadata : Iterable[str] )""); DOC_SIGNATURE_EN_KO(GDMR_tdf__doc__, - "tdf(self, numeric_metadata, metadata='', multi_metadata=[], normalize=True)", - u8R""(Calculate a topic distribution for given `numeric_metadata` value. It returns a list with length `k`. + "tdf(self, numeric_metadata, metadata='', multi_metadata=[], normalize=True)", + u8R""(Calculate a topic distribution for given `numeric_metadata` value. It returns a list with length `k`. .. versionchanged:: 0.11.0 @@ -1347,8 +1347,8 @@ normalize : bool DOC_SIGNATURE_EN_KO(GDMR_tdf_linspace__doc__, - "tdf_linspace(self, numeric_metadata_start, numeric_metadata_stop, num, metadata='', multi_metadata=[], endpoint=True, normalize=True)", - u8R""(Calculate a topic distribution for given `metadata` value. It returns a list with length `k`. + "tdf_linspace(self, numeric_metadata_start, numeric_metadata_stop, num, metadata='', multi_metadata=[], endpoint=True, normalize=True)", + u8R""(Calculate a topic distribution for given `metadata` value. It returns a list with length `k`. .. 
versionchanged:: 0.11.0 @@ -1406,28 +1406,28 @@ normalize : bool DOC_VARIABLE_EN_KO(GDMR_degrees__doc__, - u8R""(the degrees of Legendre polynomials (read-only))"", - u8R""(르장드르 다항식의 차수 (읽기전용))""); + u8R""(the degrees of Legendre polynomials (read-only))"", + u8R""(르장드르 다항식의 차수 (읽기전용))""); DOC_VARIABLE_EN_KO(GDMR_sigma0__doc__, - u8R""(the hyperparameter sigma0 (read-only))"", - u8R""(하이퍼 파라미터 sigma0 (읽기전용))""); + u8R""(the hyperparameter sigma0 (read-only))"", + u8R""(하이퍼 파라미터 sigma0 (읽기전용))""); DOC_VARIABLE_EN_KO(GDMR_decay__doc__, u8R""(the hyperparameter decay (read-only))"", u8R""(하이퍼 파라미터 decay (읽기전용))""); DOC_VARIABLE_EN_KO(GDMR_metadata_range__doc__, - u8R""(the ranges of each metadata variable (read-only))"", - u8R""(각 메타데이터 변수의 범위를 나타내는 `list` (읽기전용))""); + u8R""(the ranges of each metadata variable (read-only))"", + u8R""(각 메타데이터 변수의 범위를 나타내는 `list` (읽기전용))""); /* - class HDP + class HDP */ DOC_SIGNATURE_EN_KO(HDP___init____doc__, - "HDPModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, initial_k=2, alpha=0.1, eta=0.01, gamma=0.1, seed=None, corpus=None, transform=None)", - u8R""(This type provides Hierarchical Dirichlet Process(HDP) topic model and its implementation is based on following papers: + "HDPModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, initial_k=2, alpha=0.1, eta=0.01, gamma=0.1, seed=None, corpus=None, transform=None)", + u8R""(This type provides Hierarchical Dirichlet Process(HDP) topic model and its implementation is based on following papers: > * Teh, Y. W., Jordan, M. I., Beal, M. J., & Blei, D. M. (2005). Sharing clusters among related groups: Hierarchical Dirichlet processes. In Advances in neural information processing systems (pp. 1385-1392). > * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed algorithms for topic models. Journal of Machine Learning Research, 10(Aug), 1801-1828. 
@@ -1525,8 +1525,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(HDP_is_live_topic__doc__, - "is_live_topic(self, topic_id)", - u8R""(Return `True` if the topic `topic_id` is valid, otherwise return `False`. + "is_live_topic(self, topic_id)", + u8R""(Return `True` if the topic `topic_id` is valid, otherwise return `False`. Parameters ---------- @@ -1542,8 +1542,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(HDP_convert_to_lda__doc__, - "convert_to_lda(self, topic_threshold=0.0)", - u8R""(.. versionadded:: 0.8.0 + "convert_to_lda(self, topic_threshold=0.0)", + u8R""(.. versionadded:: 0.8.0 Convert the current HDP model to equivalent LDA model and return `(new_lda_model, new_topic_id)`. Topics with proportion less than `topic_threshold` are removed in `new_lda_model`. @@ -1573,23 +1573,23 @@ topic_threshold : float )""); DOC_VARIABLE_EN_KO(HDP_gamma__doc__, - u8R""(the hyperparameter gamma (read-only))"", - u8R""(하이퍼 파라미터 gamma (읽기전용))""); + u8R""(the hyperparameter gamma (read-only))"", + u8R""(하이퍼 파라미터 gamma (읽기전용))""); DOC_VARIABLE_EN_KO(HDP_live_k__doc__, - u8R""(the number of alive topics (read-only))"", - u8R""(현재 모델 내의 유효한 토픽의 개수 (읽기전용))""); + u8R""(the number of alive topics (read-only))"", + u8R""(현재 모델 내의 유효한 토픽의 개수 (읽기전용))""); DOC_VARIABLE_EN_KO(HDP_num_tables__doc__, - u8R""(the number of total tables (read-only))"", - u8R""(현재 모델 내의 총 테이블 개수 (읽기전용))""); + u8R""(the number of total tables (read-only))"", + u8R""(현재 모델 내의 총 테이블 개수 (읽기전용))""); /* - class MGLDA + class MGLDA */ DOC_SIGNATURE_EN_KO(MGLDA___init____doc__, - "MGLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k_g=1, k_l=1, t=3, alpha_g=0.1, alpha_l=0.1, alpha_mg=0.1, alpha_ml=0.1, eta_g=0.01, eta_l=0.01, gamma=0.1, seed=None, corpus=None, transform=None)", - u8R""(This type provides Multi Grain Latent Dirichlet Allocation(MG-LDA) topic model and its implementation is based on following papers: + "MGLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k_g=1, k_l=1, 
t=3, alpha_g=0.1, alpha_l=0.1, alpha_mg=0.1, alpha_ml=0.1, eta_g=0.01, eta_l=0.01, gamma=0.1, seed=None, corpus=None, transform=None)", + u8R""(This type provides Multi Grain Latent Dirichlet Allocation(MG-LDA) topic model and its implementation is based on following papers: > * Titov, I., & McDonald, R. (2008, April). Modeling online reviews with multi-grain topic models. In Proceedings of the 17th international conference on World Wide Web (pp. 111-120). ACM. @@ -1696,8 +1696,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(MGLDA_add_doc__doc__, - "add_doc(self, words, delimiter='.')", - u8R""(Add a new document into the model instance and return an index of the inserted document. + "add_doc(self, words, delimiter='.')", + u8R""(Add a new document into the model instance and return an index of the inserted document. Parameters ---------- @@ -1717,8 +1717,8 @@ delimiter : str )""); DOC_SIGNATURE_EN_KO(MGLDA_make_doc__doc__, - "make_doc(self, words, delimiter='.')", - u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` that can be used for `tomotopy.LDAModel.infer` method. + "make_doc(self, words, delimiter='.')", + u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` that can be used for `tomotopy.LDAModel.infer` method. Parameters ---------- @@ -1738,8 +1738,8 @@ delimiter : str )""); DOC_SIGNATURE_EN_KO(MGLDA_get_topic_words__doc__, - "get_topic_words(self, topic_id, top_n=10)", - u8R""(Return the `top_n` words and its probability in the topic `topic_id`. + "get_topic_words(self, topic_id, top_n=10)", + u8R""(Return the `top_n` words and its probability in the topic `topic_id`. The return type is a `list` of (word:`str`, probability:`float`). Parameters @@ -1758,8 +1758,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(MGLDA_get_topic_word_dist__doc__, - "get_topic_word_dist(self, topic_id, normalize=True)", - u8R""(Return the word distribution of the topic `topic_id`. 
+ "get_topic_word_dist(self, topic_id, normalize=True)", + u8R""(Return the word distribution of the topic `topic_id`. The returned value is a `list` that has `len(vocabs)` fraction numbers indicating probabilities for each word in the current topic. Parameters @@ -1786,52 +1786,52 @@ normalize : bool )""); DOC_VARIABLE_EN_KO(MGLDA_k_g__doc__, - u8R""(the hyperparameter k_g (read-only))"", - u8R""(하이퍼 파라미터 k_g (읽기전용))""); + u8R""(the hyperparameter k_g (read-only))"", + u8R""(하이퍼 파라미터 k_g (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_k_l__doc__, - u8R""(the hyperparameter k_l (read-only))"", - u8R""(하이퍼 파라미터 k_l (읽기전용))""); + u8R""(the hyperparameter k_l (read-only))"", + u8R""(하이퍼 파라미터 k_l (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_gamma__doc__, - u8R""(the hyperparameter gamma (read-only))"", - u8R""(하이퍼 파라미터 gamma (읽기전용))""); + u8R""(the hyperparameter gamma (read-only))"", + u8R""(하이퍼 파라미터 gamma (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_t__doc__, - u8R""(the hyperparameter t (read-only))"", - u8R""(하이퍼 파라미터 t (읽기전용))""); + u8R""(the hyperparameter t (read-only))"", + u8R""(하이퍼 파라미터 t (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_alpha_g__doc__, - u8R""(the hyperparameter alpha_g (read-only))"", - u8R""(하이퍼 파라미터 alpha_g (읽기전용))""); + u8R""(the hyperparameter alpha_g (read-only))"", + u8R""(하이퍼 파라미터 alpha_g (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_alpha_l__doc__, - u8R""(the hyperparameter alpha_l (read-only))"", - u8R""(하이퍼 파라미터 alpha_l (읽기전용))""); + u8R""(the hyperparameter alpha_l (read-only))"", + u8R""(하이퍼 파라미터 alpha_l (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_alpha_mg__doc__, - u8R""(the hyperparameter alpha_mg (read-only))"", - u8R""(하이퍼 파라미터 alpha_mg (읽기전용))""); + u8R""(the hyperparameter alpha_mg (read-only))"", + u8R""(하이퍼 파라미터 alpha_mg (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_alpha_ml__doc__, - u8R""(the hyperparameter alpha_ml (read-only))"", - u8R""(하이퍼 파라미터 alpha_ml (읽기전용))""); + u8R""(the hyperparameter alpha_ml (read-only))"", + u8R""(하이퍼 파라미터 alpha_ml (읽기전용))""); 
DOC_VARIABLE_EN_KO(MGLDA_eta_g__doc__, - u8R""(the hyperparameter eta_g (read-only))"", - u8R""(하이퍼 파라미터 eta_g (읽기전용))""); + u8R""(the hyperparameter eta_g (read-only))"", + u8R""(하이퍼 파라미터 eta_g (읽기전용))""); DOC_VARIABLE_EN_KO(MGLDA_eta_l__doc__, - u8R""(the hyperparameter eta_l (read-only))"", - u8R""(하이퍼 파라미터 eta_l (읽기전용))""); + u8R""(the hyperparameter eta_l (read-only))"", + u8R""(하이퍼 파라미터 eta_l (읽기전용))""); /* - class PA + class PA */ DOC_SIGNATURE_EN_KO(PA___init____doc__, - "PAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k1=1, k2=1, alpha=0.1, subalpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", - u8R""(This type provides Pachinko Allocation(PA) topic model and its implementation is based on following papers: + "PAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k1=1, k2=1, alpha=0.1, subalpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", + u8R""(This type provides Pachinko Allocation(PA) topic model and its implementation is based on following papers: > * Li, W., & McCallum, A. (2006, June). Pachinko allocation: DAG-structured mixture models of topic correlations. In Proceedings of the 23rd international conference on Machine learning (pp. 577-584). ACM. @@ -1922,8 +1922,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(PA_get_topic_words__doc__, - "get_topic_words(self, sub_topic_id, top_n=10)", - u8R""(Return the `top_n` words and its probability in the sub topic `sub_topic_id`. + "get_topic_words(self, sub_topic_id, top_n=10)", + u8R""(Return the `top_n` words and its probability in the sub topic `sub_topic_id`. The return type is a `list` of (word:`str`, probability:`float`). Parameters @@ -1941,8 +1941,8 @@ sub_topic_id : int )""); DOC_SIGNATURE_EN_KO(PA_get_topic_word_dist__doc__, - "get_topic_word_dist(self, sub_topic_id, normalize=True)", - u8R""(Return the word distribution of the sub topic `sub_topic_id`. 
+ "get_topic_word_dist(self, sub_topic_id, normalize=True)", + u8R""(Return the word distribution of the sub topic `sub_topic_id`. The returned value is a `list` that has `len(vocabs)` fraction numbers indicating probabilities for each word in the current sub topic. Parameters @@ -1968,8 +1968,8 @@ normalize : bool )""); DOC_SIGNATURE_EN_KO(PA_get_sub_topics__doc__, - "get_sub_topics(self, super_topic_id, top_n=10)", - u8R""(.. versionadded:: 0.1.4 + "get_sub_topics(self, super_topic_id, top_n=10)", + u8R""(.. versionadded:: 0.1.4 Return the `top_n` sub topics and its probability in a super topic `super_topic_id`. The return type is a `list` of (subtopic:`int`, probability:`float`). @@ -1991,8 +1991,8 @@ super_topic_id : int )""); DOC_SIGNATURE_EN_KO(PA_get_sub_topic_dist__doc__, - "get_sub_topic_dist(self, super_topic_id, normalize=True)", - u8R""(Return a distribution of the sub topics in a super topic `super_topic_id`. + "get_sub_topic_dist(self, super_topic_id, normalize=True)", + u8R""(Return a distribution of the sub topics in a super topic `super_topic_id`. The returned value is a `list` that has `k2` fraction numbers indicating probabilities for each sub topic in the current super topic. Parameters @@ -2014,8 +2014,8 @@ normalize : bool )""); DOC_SIGNATURE_EN_KO(PA_infer__doc__, - "infer(self, doc, iter=100, tolerance=-1, workers=0, parallel=0, together=False)", - u8R""(.. versionadded:: 0.5.0 + "infer(self, doc, iter=100, tolerance=-1, workers=0, parallel=0, together=False)", + u8R""(.. versionadded:: 0.5.0 Return the inferred topic distribution and sub-topic distribution from unseen `doc`s. @@ -2117,23 +2117,23 @@ DOC_SIGNATURE_EN_KO(PA_get_count_by_super_topic__doc__, u8R""(Return the number of words allocated to each super-topic. .. versionadded:: 0.9.0)"", - u8R""(각각의 상위 토픽에 할당된 단어의 개수를 `list`형태로 반환합니다. +u8R""(각각의 상위 토픽에 할당된 단어의 개수를 `list`형태로 반환합니다. .. 
versionadded:: 0.9.0)""); DOC_VARIABLE_EN_KO(PA_k1__doc__, - u8R""(k1, the number of super topics (read-only))"", - u8R""(k1, 상위 토픽의 개수 (읽기전용))""); + u8R""(k1, the number of super topics (read-only))"", + u8R""(k1, 상위 토픽의 개수 (읽기전용))""); DOC_VARIABLE_EN_KO(PA_k2__doc__, - u8R""(k2, the number of sub topics (read-only))"", - u8R""(k2, 하위 토픽의 개수 (읽기전용))""); + u8R""(k2, the number of sub topics (read-only))"", + u8R""(k2, 하위 토픽의 개수 (읽기전용))""); DOC_VARIABLE_EN_KO(PA_alpha__doc__, u8R""(Dirichlet prior on the per-document super topic distributions in shape `[k1]` (read-only) .. versionadded:: 0.9.0)"", - u8R""(문헌의 상위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1]` 모양 (읽기전용) +u8R""(문헌의 상위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1]` 모양 (읽기전용) .. versionadded:: 0.9.0)""); @@ -2141,17 +2141,17 @@ DOC_VARIABLE_EN_KO(PA_subalpha__doc__, u8R""(Dirichlet prior on the sub topic distributions for each super topic in shape `[k1, k2]` (read-only) .. versionadded:: 0.9.0)"", - u8R""(상위 토픽의 하위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1, k2]` 모양 (읽기전용) +u8R""(상위 토픽의 하위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1, k2]` 모양 (읽기전용) .. versionadded:: 0.9.0)""); /* - class HPA + class HPA */ DOC_SIGNATURE_EN_KO(HPA___init____doc__, - "HPAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k1=1, k2=1, alpha=0.1, subalpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", - u8R""(This type provides Hierarchical Pachinko Allocation(HPA) topic model and its implementation is based on following papers: + "HPAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k1=1, k2=1, alpha=0.1, subalpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", + u8R""(This type provides Hierarchical Pachinko Allocation(HPA) topic model and its implementation is based on following papers: > * Mimno, D., Li, W., & McCallum, A. (2007, June). Mixtures of hierarchical topics with pachinko allocation. In Proceedings of the 24th international conference on Machine learning (pp. 633-640). ACM. 
@@ -2242,8 +2242,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(HPA_get_topic_words__doc__, - "get_topic_words(self, topic_id, top_n=10)", - u8R""(Return the `top_n` words and its probability in the topic `topic_id`. + "get_topic_words(self, topic_id, top_n=10)", + u8R""(Return the `top_n` words and its probability in the topic `topic_id`. The return type is a `list` of (word:`str`, probability:`float`). Parameters @@ -2265,8 +2265,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(HPA_get_topic_word_dist__doc__, - "get_topic_word_dist(self, topic_id, normalize=True)", - u8R""(Return the word distribution of the topic `topic_id`. + "get_topic_word_dist(self, topic_id, normalize=True)", + u8R""(Return the word distribution of the topic `topic_id`. The returned value is a `list` that has `len(vocabs)` fraction numbers indicating probabilities for each word in current topic. Parameters @@ -2300,7 +2300,7 @@ DOC_VARIABLE_EN_KO(HPA_alpha__doc__, Its element 0 indicates the prior to the top topic and elements 1 ~ k1 indicates ones to the super topics. (read-only) .. versionadded:: 0.9.0)"", - u8R""(문헌의 상위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1 + 1]` 모양. +u8R""(문헌의 상위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1 + 1]` 모양. 0번째 요소는 최상위 토픽을 가리키며, 1 ~ k1번째가 상위 토픽을 가리킨다. (읽기전용) .. versionadded:: 0.9.0)""); @@ -2311,19 +2311,19 @@ Its `[x, 0]` element indicates the prior to the super topic `x` and `[x, 1 ~ k2]` elements indicate ones to the sub topics in the super topic `x`. (read-only) .. versionadded:: 0.9.0)"", - u8R""(상위 토픽의 하위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1, k2 + 1]` 모양. +u8R""(상위 토픽의 하위 토픽 분포에 대한 디리클레 분포 파라미터, `[k1, k2 + 1]` 모양. `[x, 0]` 요소는 상위 토픽 `x`를 가리키며, `[x, 1 ~ k2]` 요소는 상위 토픽 `x` 내의 하위 토픽들을 가리킨다. (읽기전용) .. versionadded:: 0.9.0)""); /* - class CT + class CT */ DOC_SIGNATURE_EN_KO(CT___init____doc__, - "CTModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, smoothing_alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", - u8R""(.. 
versionadded:: 0.2.0 + "CTModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, smoothing_alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", + u8R""(.. versionadded:: 0.2.0 This type provides Correlated Topic Model (CTM) and its implementation is based on following papers: > * Blei, D., & Lafferty, J. (2006). Correlated topic models. Advances in neural information processing systems, 18, 147. @@ -2402,8 +2402,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(CT_get_correlations__doc__, - "get_correlations(self, topic_id=None)", - u8R""(Return correlations between the topic `topic_id` and other topics. + "get_correlations(self, topic_id=None)", + u8R""(Return correlations between the topic `topic_id` and other topics. The returned value is a `list` of `float`s of size `tomotopy.LDAModel.k`. Parameters @@ -2413,7 +2413,7 @@ topic_id : Union[int, None] If omitted, the whole correlation matrix is returned. )"", - u8R""(토픽 `topic_id`와 나머지 토픽들 간의 상관관계를 반환합니다. +u8R""(토픽 `topic_id`와 나머지 토픽들 간의 상관관계를 반환합니다. 반환값은 `tomotopy.LDAModel.k` 길이의 `float`의 `list`입니다. Parameters @@ -2424,14 +2424,14 @@ topic_id : Union[int, None] 생략 시 상관계수 행렬 전체가 반환됩니다. )""); -DOC_VARIABLE_EN_KO(CT_num_beta_sample__doc__, - u8R""(the number of times to sample beta parameters, default value is 10. +DOC_VARIABLE_EN_KO(CT_num_beta_sample__doc__, + u8R""(the number of times to sample beta parameters, default value is 10. CTModel samples `num_beta_sample` beta parameters for each document. The more beta it samples, the more accurate the distribution will be, but the longer time it takes to learn. If you have a small number of documents in your model, keeping this value larger will help you get better result. -)"", - u8R""(beta 파라미터를 표집하는 횟수, 기본값은 10. +)"", +u8R""(beta 파라미터를 표집하는 횟수, 기본값은 10. CTModel은 각 문헌마다 총 `num_beta_sample` 개수의 beta 파라미터를 표집합니다. beta 파라미터를 더 많이 표집할 수록, 전체 분포는 정교해지지만 학습 시간이 더 많이 걸립니다. 
@@ -2439,36 +2439,36 @@ beta 파라미터를 더 많이 표집할 수록, 전체 분포는 정교해지 )""); DOC_VARIABLE_EN_KO(CT_num_tmn_sample__doc__, - u8R""(the number of iterations for sampling Truncated Multivariate Normal distribution, default value is 5. + u8R""(the number of iterations for sampling Truncated Multivariate Normal distribution, default value is 5. If your model shows biased topic correlations, increasing this value may be helpful.)"", - u8R""(절단된 다변수 정규분포에서 표본을 추출하기 위한 반복 횟수, 기본값은 5. +u8R""(절단된 다변수 정규분포에서 표본을 추출하기 위한 반복 횟수, 기본값은 5. 만약 결과에서 토픽 간 상관관계가 편향되게 나올 경우 이 값을 키우면 편향을 해소하는 데에 도움이 될 수 있습니다. )""); DOC_VARIABLE_EN_KO(CT_prior_mean__doc__, - u8R""(the mean of prior logistic-normal distribution for the topic distribution (read-only))"", - u8R""(토픽의 사전 분포인 로지스틱 정규 분포의 평균 벡터 (읽기전용))""); + u8R""(the mean of prior logistic-normal distribution for the topic distribution (read-only))"", + u8R""(토픽의 사전 분포인 로지스틱 정규 분포의 평균 벡터 (읽기전용))""); DOC_VARIABLE_EN_KO(CT_prior_cov__doc__, - u8R""(the covariance matrix of prior logistic-normal distribution the for topic distribution (read-only))"", - u8R""(토픽의 사전 분포인 로지스틱 정규 분포의 공분산 행렬 (읽기전용))""); + u8R""(the covariance matrix of prior logistic-normal distribution the for topic distribution (read-only))"", + u8R""(토픽의 사전 분포인 로지스틱 정규 분포의 공분산 행렬 (읽기전용))""); DOC_VARIABLE_EN_KO(CT_alpha__doc__, u8R""(This property is not available in `CTModel`. Use `CTModel.prior_mean` and `CTModel.prior_cov` instead. .. versionadded:: 0.9.1)"", - u8R""(이 프로퍼티는 `CTModel`에서 사용불가합니다. 대신 `CTModel.prior_mean`와 `CTModel.prior_cov`를 사용하십시오. +u8R""(이 프로퍼티는 `CTModel`에서 사용불가합니다. 대신 `CTModel.prior_mean`와 `CTModel.prior_cov`를 사용하십시오. .. 
versionadded:: 0.9.1)""); /* - class SLDA + class SLDA */ DOC_SIGNATURE_EN_KO(SLDA___init____doc__, - "SLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, vars='', alpha=0.1, eta=0.01, mu=[], nu_sq=[], glm_param=[], seed=None, corpus=None, transform=None)", - u8R""(This type provides supervised Latent Dirichlet Allocation(sLDA) topic model and its implementation is based on following papers: + "SLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, vars='', alpha=0.1, eta=0.01, mu=[], nu_sq=[], glm_param=[], seed=None, corpus=None, transform=None)", + u8R""(This type provides supervised Latent Dirichlet Allocation(sLDA) topic model and its implementation is based on following papers: > * Mcauliffe, J. D., & Blei, D. M. (2008). Supervised topic models. In Advances in neural information processing systems (pp. 121-128). > * Python version implementation using Gibbs sampling : https://github.com/Savvysherpa/slda @@ -2575,8 +2575,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(SLDA_add_doc__doc__, - "add_doc(self, words, y=[])", - u8R""(Add a new document into the model instance with response variables `y` and return an index of the inserted document. + "add_doc(self, words, y=[])", + u8R""(Add a new document into the model instance with response variables `y` and return an index of the inserted document. Parameters ---------- @@ -2605,8 +2605,8 @@ y : Iterable[float] )""); DOC_SIGNATURE_EN_KO(SLDA_make_doc__doc__, - "make_doc(self, words, y=[])", - u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and response variables `y` that can be used for `tomotopy.LDAModel.infer` method. + "make_doc(self, words, y=[])", + u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and response variables `y` that can be used for `tomotopy.LDAModel.infer` method. 
Parameters ---------- @@ -2630,8 +2630,8 @@ y : Iterable[float] )""); DOC_SIGNATURE_EN_KO(SLDA_get_regression_coef__doc__, - "get_regression_coef(self, var_id=None)", - u8R""(Return the regression coefficient of the response variable `var_id`. + "get_regression_coef(self, var_id=None)", + u8R""(Return the regression coefficient of the response variable `var_id`. Parameters ---------- @@ -2640,7 +2640,7 @@ var_id : int If omitted, the whole regression coefficients with shape `[f, k]` are returned. )"", - u8R""(응답 변수 `var_id`의 회귀 계수를 반환합니다. +u8R""(응답 변수 `var_id`의 회귀 계수를 반환합니다. Parameters ---------- @@ -2651,13 +2651,13 @@ var_id : int )""); DOC_SIGNATURE_EN_KO(SLDA_get_var_type__doc__, - "get_var_type(self, var_id)", - u8R""(Return the type of the response variable `var_id`. 'l' means linear variable, 'b' means binary variable.)"", - u8R""(응답 변수 `var_id`의 종류를 반환합니다. 'l'은 선형 변수, 'b'는 이진 변수를 뜻합니다.)""); + "get_var_type(self, var_id)", + u8R""(Return the type of the response variable `var_id`. 'l' means linear variable, 'b' means binary variable.)"", + u8R""(응답 변수 `var_id`의 종류를 반환합니다. 'l'은 선형 변수, 'b'는 이진 변수를 뜻합니다.)""); DOC_SIGNATURE_EN_KO(SLDA_estimate__doc__, - "estimate(self, doc)", - u8R""(Return the estimated response variable for `doc`. + "estimate(self, doc)", + u8R""(Return the estimated response variable for `doc`. If `doc` is an unseen document instance which is generated by `tomotopy.SLDAModel.make_doc` method, it should be inferred by `tomotopy.LDAModel.infer` method first. Parameters @@ -2665,7 +2665,7 @@ Parameters doc : tomotopy.Document an instance of document or a list of them to be used for estimating response variables )"", - u8R""(`doc`의 추정된 응답 변수를 반환합니다. +u8R""(`doc`의 추정된 응답 변수를 반환합니다. 만약 `doc`이 `tomotopy.SLDAModel.make_doc`에 의해 생성된 인스턴스라면, 먼저 `tomotopy.LDAModel.infer`를 통해 토픽 추론을 실시한 다음 이 메소드를 사용해야 합니다. 
Parameters @@ -2675,15 +2675,15 @@ doc : tomotopy.Document )""); DOC_VARIABLE_EN_KO(SLDA_f__doc__, - u8R""(the number of response variables (read-only))"", - u8R""(응답 변수의 개수 (읽기전용))""); + u8R""(the number of response variables (read-only))"", + u8R""(응답 변수의 개수 (읽기전용))""); /* - class LLDA + class LLDA */ DOC_SIGNATURE_EN_KO(LLDA___init____doc__, - "LLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", - u8R""(This type provides Labeled LDA(L-LDA) topic model and its implementation is based on following papers: + "LLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", + u8R""(This type provides Labeled LDA(L-LDA) topic model and its implementation is based on following papers: > * Ramage, D., Hall, D., Nallapati, R., & Manning, C. D. (2009, August). Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora. In Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 1-Volume 1 (pp. 248-256). Association for Computational Linguistics. @@ -2768,8 +2768,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(LLDA_add_doc__doc__, - "add_doc(self, words, labels=[])", - u8R""(Add a new document into the model instance with `labels` and return an index of the inserted document. + "add_doc(self, words, labels=[])", + u8R""(Add a new document into the model instance with `labels` and return an index of the inserted document. Parameters ---------- @@ -2789,8 +2789,8 @@ labels : Iterable[str] )""); DOC_SIGNATURE_EN_KO(LLDA_make_doc__doc__, - "make_doc(self, words, labels=[])", - u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `labels` that can be used for `tomotopy.LDAModel.infer` method. 
+ "make_doc(self, words, labels=[])", + u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `labels` that can be used for `tomotopy.LDAModel.infer` method. Parameters ---------- @@ -2810,8 +2810,8 @@ labels : Iterable[str] )""); DOC_SIGNATURE_EN_KO(LLDA_get_topic_words__doc__, - "get_topic_words(self, topic_id, top_n=10)", - u8R""(Return the `top_n` words and its probability in the topic `topic_id`. + "get_topic_words(self, topic_id, top_n=10)", + u8R""(Return the `top_n` words and its probability in the topic `topic_id`. The return type is a `list` of (word:`str`, probability:`float`). Parameters @@ -2835,15 +2835,15 @@ topic_id : int DOC_VARIABLE_EN_KO(LLDA_topic_label_dict__doc__, - u8R""(a dictionary of topic labels in type `tomotopy.Dictionary` (read-only))"", - u8R""(`tomotopy.Dictionary` 타입의 토픽 레이블 사전 (읽기전용))""); + u8R""(a dictionary of topic labels in type `tomotopy.Dictionary` (read-only))"", + u8R""(`tomotopy.Dictionary` 타입의 토픽 레이블 사전 (읽기전용))""); /* - class PLDA + class PLDA */ DOC_SIGNATURE_EN_KO(PLDA___init____doc__, - "PLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, latent_topics=0, topics_per_label=1, alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", - u8R""(This type provides Partially Labeled LDA(PLDA) topic model and its implementation is based on following papers: + "PLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, latent_topics=0, topics_per_label=1, alpha=0.1, eta=0.01, seed=None, corpus=None, transform=None)", + u8R""(This type provides Partially Labeled LDA(PLDA) topic model and its implementation is based on following papers: > * Ramage, D., Manning, C. D., & Dumais, S. (2011, August). Partially labeled topic models for interpretable text mining. In Proceedings of the 17th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 457-465). ACM. 
@@ -2927,8 +2927,8 @@ transform : Callable[dict, dict] DOC_SIGNATURE_EN_KO(PLDA_get_topic_words__doc__, - "get_topic_words(self, topic_id, top_n=10)", - u8R""(Return the `top_n` words and its probability in the topic `topic_id`. + "get_topic_words(self, topic_id, top_n=10)", + u8R""(Return the `top_n` words and its probability in the topic `topic_id`. The return type is a `list` of (word:`str`, probability:`float`). Parameters @@ -2952,23 +2952,23 @@ topic_id : int DOC_VARIABLE_EN_KO(PLDA_topic_label_dict__doc__, - u8R""(a dictionary of topic labels in type `tomotopy.Dictionary` (read-only))"", - u8R""(`tomotopy.Dictionary` 타입의 토픽 레이블 사전 (읽기전용))""); + u8R""(a dictionary of topic labels in type `tomotopy.Dictionary` (read-only))"", + u8R""(`tomotopy.Dictionary` 타입의 토픽 레이블 사전 (읽기전용))""); DOC_VARIABLE_EN_KO(PLDA_latent_topics__doc__, - u8R""(the number of latent topics (read-only))"", - u8R""(잠재 토픽의 개수 (읽기전용))""); + u8R""(the number of latent topics (read-only))"", + u8R""(잠재 토픽의 개수 (읽기전용))""); DOC_VARIABLE_EN_KO(PLDA_topics_per_label__doc__, - u8R""(the number of topics per label (read-only))"", - u8R""(레이블별 토픽의 개수 (읽기전용))""); + u8R""(the number of topics per label (read-only))"", + u8R""(레이블별 토픽의 개수 (읽기전용))""); /* - class HLDA + class HLDA */ DOC_SIGNATURE_EN_KO(HLDA___init____doc__, - "HLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, depth=2, alpha=0.1, eta=0.01, gamma=0.1, seed=None, corpus=None, transform=None)", - u8R""(This type provides Hierarchical LDA topic model and its implementation is based on following papers: + "HLDAModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, depth=2, alpha=0.1, eta=0.01, gamma=0.1, seed=None, corpus=None, transform=None)", + u8R""(This type provides Hierarchical LDA topic model and its implementation is based on following papers: > * Griffiths, T. L., Jordan, M. I., Tenenbaum, J. B., & Blei, D. M. (2004). Hierarchical topic models and the nested Chinese restaurant process. 
In Advances in neural information processing systems (pp. 17-24). @@ -3053,8 +3053,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(HLDA_is_live_topic__doc__, - "is_live_topic(self, topic_id)", - u8R""(Return `True` if the topic `topic_id` is alive, otherwise return `False`. + "is_live_topic(self, topic_id)", + u8R""(Return `True` if the topic `topic_id` is alive, otherwise return `False`. Parameters ---------- @@ -3070,8 +3070,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(HLDA_num_docs_of_topic__doc__, - "num_docs_of_topic(self, topic_id)", - u8R""(Return the number of documents belonging to a topic `topic_id`. + "num_docs_of_topic(self, topic_id)", + u8R""(Return the number of documents belonging to a topic `topic_id`. Parameters ---------- @@ -3087,8 +3087,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(HLDA_level__doc__, - "level(self, topic_id)", - u8R""(Return the level of a topic `topic_id`. + "level(self, topic_id)", + u8R""(Return the level of a topic `topic_id`. Parameters ---------- @@ -3104,8 +3104,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(HLDA_parent_topic__doc__, - "parent_topic(self, topic_id)", - u8R""(Return the topic ID of parent of a topic `topic_id`. + "parent_topic(self, topic_id)", + u8R""(Return the topic ID of parent of a topic `topic_id`. Parameters ---------- @@ -3121,8 +3121,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(HLDA_children_topics__doc__, - "children_topics(self, topic_id)", - u8R""(Return a list of topic IDs with children of a topic `topic_id`. + "children_topics(self, topic_id)", + u8R""(Return a list of topic IDs with children of a topic `topic_id`. 
Parameters ---------- @@ -3138,23 +3138,23 @@ topic_id : int )""); DOC_VARIABLE_EN_KO(HLDA_gamma__doc__, - u8R""(the hyperparameter gamma (read-only))"", - u8R""(하이퍼 파라미터 gamma (읽기전용))""); + u8R""(the hyperparameter gamma (read-only))"", + u8R""(하이퍼 파라미터 gamma (읽기전용))""); DOC_VARIABLE_EN_KO(HLDA_live_k__doc__, - u8R""(the number of alive topics (read-only))"", - u8R""(현재 모델 내의 유효한 토픽의 개수 (읽기전용))""); + u8R""(the number of alive topics (read-only))"", + u8R""(현재 모델 내의 유효한 토픽의 개수 (읽기전용))""); DOC_VARIABLE_EN_KO(HLDA_depth__doc__, - u8R""(the number of depth (read-only))"", - u8R""(현재 모델의 총 깊이 (읽기전용))""); + u8R""(the number of depth (read-only))"", + u8R""(현재 모델의 총 깊이 (읽기전용))""); /* - class DT + class DT */ DOC_SIGNATURE_EN_KO(DT___init____doc__, - "DTModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, t=1, alpha_var=0.1, eta_var=0.1, phi_var=0.1, lr_a=0.01, lr_b=0.1, lr_c=0.55, seed=None, corpus=None, transform=None)", - u8R""(This type provides Dynamic Topic model and its implementation is based on following papers: + "DTModel(tw=TermWeight.ONE, min_cf=0, min_df=0, rm_top=0, k=1, t=1, alpha_var=0.1, eta_var=0.1, phi_var=0.1, lr_a=0.01, lr_b=0.1, lr_c=0.55, seed=None, corpus=None, transform=None)", + u8R""(This type provides Dynamic Topic model and its implementation is based on following papers: > * Blei, D. M., & Lafferty, J. D. (2006, June). Dynamic topic models. In Proceedings of the 23rd international conference on Machine learning (pp. 113-120). > * Bhadury, A., Chen, J., Zhu, J., & Liu, S. (2016, April). Scaling up dynamic topic models. In Proceedings of the 25th International Conference on World Wide Web (pp. 381-390). @@ -3245,8 +3245,8 @@ transform : Callable[dict, dict] )""); DOC_SIGNATURE_EN_KO(DT_add_doc__doc__, - "add_doc(self, words, timepoint=0)", - u8R""(Add a new document into the model instance with `timepoint` and return an index of the inserted document. 
+ "add_doc(self, words, timepoint=0)", + u8R""(Add a new document into the model instance with `timepoint` and return an index of the inserted document. Parameters ---------- @@ -3266,8 +3266,8 @@ timepoint : int )""); DOC_SIGNATURE_EN_KO(DT_make_doc__doc__, - "make_doc(self, words, timepoint=0)", - u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `timepoint` that can be used for `tomotopy.LDAModel.infer` method. + "make_doc(self, words, timepoint=0)", + u8R""(Return a new `tomotopy.Document` instance for an unseen document with `words` and `timepoint` that can be used for `tomotopy.LDAModel.infer` method. Parameters ---------- @@ -3287,8 +3287,8 @@ timepoint : int )""); DOC_SIGNATURE_EN_KO(DT_get_alpha__doc__, - "get_alpha(self, timepoint)", - u8R""(Return a `list` of alpha parameters for `timepoint`. + "get_alpha(self, timepoint)", + u8R""(Return a `list` of alpha parameters for `timepoint`. Parameters ---------- @@ -3304,8 +3304,8 @@ timepoint : int )""); DOC_SIGNATURE_EN_KO(DT_get_phi__doc__, - "get_phi(self, timepoint, topic_id)", - u8R""(Return a `list` of phi parameters for `timepoint` and `topic_id`. + "get_phi(self, timepoint, topic_id)", + u8R""(Return a `list` of phi parameters for `timepoint` and `topic_id`. Parameters ---------- @@ -3325,8 +3325,8 @@ topic_id : int )""); DOC_SIGNATURE_EN_KO(DT_get_topic_words__doc__, - "get_topic_words(self, topic_id, timepoint, top_n=10)", - u8R""(Return the `top_n` words and its probability in the topic `topic_id` with `timepoint`. + "get_topic_words(self, topic_id, timepoint, top_n=10)", + u8R""(Return the `top_n` words and its probability in the topic `topic_id` with `timepoint`. The return type is a `list` of (word:`str`, probability:`float`). 
Parameters @@ -3348,8 +3348,8 @@ timepoint : int )""); DOC_SIGNATURE_EN_KO(DT_get_topic_word_dist__doc__, - "get_topic_word_dist(self, topic_id, timepoint, normalize=True)", - u8R""(Return the word distribution of the topic `topic_id` with `timepoint`. + "get_topic_word_dist(self, topic_id, timepoint, normalize=True)", + u8R""(Return the word distribution of the topic `topic_id` with `timepoint`. The returned value is a `list` that has `len(vocabs)` fraction numbers indicating probabilities for each word in the current topic. Parameters @@ -3383,21 +3383,21 @@ DOC_SIGNATURE_EN_KO(DT_get_count_by_topics__doc__, u8R""(Return the number of words allocated to each timepoint and topic in the shape `[num_timepoints, k]`. .. versionadded:: 0.9.0)"", - u8R""(각각의 시점과 토픽에 할당된 단어의 개수를 `[num_timepoints, k]` 모양으로 반환합니다. +u8R""(각각의 시점과 토픽에 할당된 단어의 개수를 `[num_timepoints, k]` 모양으로 반환합니다. .. versionadded:: 0.9.0)""); DOC_VARIABLE_EN_KO(DT_lr_a__doc__, - u8R""(parameter `a` greater than zero for SGLD step size (e_i = a * (b + i) ^ -c))"", - u8R""(SGLD의 스텝 크기를 결정하는 0보다 큰 파라미터 `a` (e_i = a * (b + i) ^ -c))""); + u8R""(parameter `a` greater than zero for SGLD step size (e_i = a * (b + i) ^ -c))"", + u8R""(SGLD의 스텝 크기를 결정하는 0보다 큰 파라미터 `a` (e_i = a * (b + i) ^ -c))""); DOC_VARIABLE_EN_KO(DT_lr_b__doc__, - u8R""(parameter `b` greater than zero or equal to zero for SGLD step size (e_i = a * (b + i) ^ -c))"", - u8R""(SGLD의 스텝 크기를 결정하는 0 이상의 파라미터 `b` (e_i = a * (b + i) ^ -c))""); + u8R""(parameter `b` greater than zero or equal to zero for SGLD step size (e_i = a * (b + i) ^ -c))"", + u8R""(SGLD의 스텝 크기를 결정하는 0 이상의 파라미터 `b` (e_i = a * (b + i) ^ -c))""); DOC_VARIABLE_EN_KO(DT_lr_c__doc__, - u8R""(parameter `c` with range (0.5, 1] for SGLD step size (e_i = a * (b + i) ^ -c))"", - u8R""(SGLD의 스텝 크기를 결정하는 (0.5, 1] 범위의 파라미터 `c` (e_i = a * (b + i) ^ -c))""); + u8R""(parameter `c` with range (0.5, 1] for SGLD step size (e_i = a * (b + i) ^ -c))"", + u8R""(SGLD의 스텝 크기를 결정하는 (0.5, 1] 범위의 파라미터 `c` (e_i 
= a * (b + i) ^ -c))""); DOC_VARIABLE_EN_KO(DT_num_timepoints__doc__, u8R""(the number of timepoints of the model (read-only))"", @@ -3411,7 +3411,7 @@ DOC_VARIABLE_EN_KO(DT_alpha__doc__, u8R""(per-document topic distribution in the shape `[num_timepoints, k]` (read-only) .. versionadded:: 0.9.0)"", - u8R""(문헌별 토픽 분포, `[num_timepoints, k]` 모양 (읽기전용) +u8R""(문헌별 토픽 분포, `[num_timepoints, k]` 모양 (읽기전용) .. versionadded:: 0.9.0)""); @@ -3419,7 +3419,7 @@ DOC_VARIABLE_EN_KO(DT_eta__doc__, u8R""(This property is not available in `DTModel`. Use `DTModel.docs[x].eta` instead. .. versionadded:: 0.9.0)"", - u8R""(이 프로퍼티는 `DTModel`에서 사용불가합니다. 대신 `DTModel.docs[x].eta`를 사용하십시오. +u8R""(이 프로퍼티는 `DTModel`에서 사용불가합니다. 대신 `DTModel.docs[x].eta`를 사용하십시오. .. versionadded:: 0.9.0)""); @@ -3505,4 +3505,4 @@ DOC_VARIABLE_EN_KO(PT_p__doc__, .. versionadded:: 0.11.0)"", u8R""(가상 문헌의 개수 (읽기전용) -.. versionadded:: 0.11.0)""); \ No newline at end of file +.. versionadded:: 0.11.0)""); diff --git a/src/python/py_DMR.cpp b/src/python/py_DMR.cpp index b50d1ff..2474e9d 100644 --- a/src/python/py_DMR.cpp +++ b/src/python/py_DMR.cpp @@ -182,7 +182,7 @@ static PyObject* DMR_getLambda(TopicModelObject* self, void* closure) { if (!self->inst) throw py::RuntimeError{ "inst is null" }; auto* inst = static_cast(self->inst); - npy_intp shapes[2] = { (npy_intp)inst->getK(), (npy_intp)inst->getF() * inst->getMdVecSize() }; + npy_intp shapes[2] = { (npy_intp)inst->getK(), (npy_intp)(inst->getF() * inst->getMdVecSize()) }; PyObject* ret = PyArray_EMPTY(2, shapes, NPY_FLOAT, 0); for (size_t i = 0; i < inst->getK(); ++i) { From ae28a1fbad578389b42a05936356384e78704398 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 25 Apr 2021 22:52:17 +0900 Subject: [PATCH 4/5] fixed compilation errors --- src/python/py_LLDA.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/py_LLDA.cpp b/src/python/py_LLDA.cpp index 51f0974..e5a738b 100644 --- a/src/python/py_LLDA.cpp +++
b/src/python/py_LLDA.cpp @@ -129,7 +129,7 @@ PyObject* Document_labels(DocumentObject* self, void* closure) if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `labels` field!" }; if (!self->doc) throw py::RuntimeError{ "doc is null!" }; - if (auto* ret = docVisit(self->getBoundDoc(), [&](auto* doc) + if (auto* r = docVisit(self->getBoundDoc(), [&](auto* doc) { auto inst = dynamic_cast(self->corpus->tm->inst); auto dict = inst->getTopicLabelDict(); @@ -144,7 +144,7 @@ PyObject* Document_labels(DocumentObject* self, void* closure) } } return py::buildPyValue(ret); - })) return ret; + })) return r; throw py::AttributeError{ "doc doesn't has `labels` field!" }; }); From 6b932d2b0d4855e72d9d25315cab6728ab160e02 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Mon, 26 Apr 2021 01:49:06 +0900 Subject: [PATCH 5/5] fixed DTModel.copy() --- src/TopicModel/DTModel.hpp | 10 ++++++++++ src/python/py_LLDA.cpp | 10 +++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/TopicModel/DTModel.hpp b/src/TopicModel/DTModel.hpp index 1841017..1649622 100644 --- a/src/TopicModel/DTModel.hpp +++ b/src/TopicModel/DTModel.hpp @@ -477,6 +477,16 @@ namespace tomoto return cnt; } + void updateForCopy() + { + BaseClass::updateForCopy(); + size_t docId = 0; + for (auto& doc : this->docs) + { + doc.eta.init((Float*)etaByDoc.col(docId++).data(), this->K, 1); + } + } + public: DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi); diff --git a/src/python/py_LLDA.cpp b/src/python/py_LLDA.cpp index e5a738b..f9189f8 100644 --- a/src/python/py_LLDA.cpp +++ b/src/python/py_LLDA.cpp @@ -129,22 +129,22 @@ PyObject* Document_labels(DocumentObject* self, void* closure) if (self->corpus->isIndependent()) throw py::AttributeError{ "doc doesn't has `labels` field!" }; if (!self->doc) throw py::RuntimeError{ "doc is null!" 
}; - if (auto* r = docVisit(self->getBoundDoc(), [&](auto* doc) + if (auto* ret = docVisit(self->getBoundDoc(), [&](auto* doc) { auto inst = dynamic_cast(self->corpus->tm->inst); auto dict = inst->getTopicLabelDict(); - vector>> ret; + vector>> r; auto topicDist = inst->getTopicsByDoc(doc); for (size_t i = 0; i < dict.size(); ++i) { if (doc->labelMask[i * inst->getNumTopicsPerLabel()]) { - ret.emplace_back(inst->getTopicLabelDict().toWord(i), + r.emplace_back(inst->getTopicLabelDict().toWord(i), vector{ &topicDist[i * inst->getNumTopicsPerLabel()], &topicDist[(i + 1) * inst->getNumTopicsPerLabel()] }); } } - return py::buildPyValue(ret); - })) return r; + return py::buildPyValue(r); + })) return ret; throw py::AttributeError{ "doc doesn't has `labels` field!" }; });