From 706bfa016c2d6831aa777bcdd40a8e1f2580f545 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 14 Jul 2020 21:18:31 +0900 Subject: [PATCH 1/5] preparing 0.8.2 fixed #59, partially fixed #63 --- README.kr.rst | 6 +- README.rst | 5 + examples/dtm_plot.py | 44 +++++++ examples/gdmr_plot.py | 2 +- src/Labeling/FoRelevance.cpp | 2 +- src/TopicModel/CT.h | 11 +- src/TopicModel/CTModel.cpp | 8 +- src/TopicModel/CTModel.hpp | 26 ++-- src/TopicModel/DMR.h | 11 +- src/TopicModel/DMRModel.cpp | 8 +- src/TopicModel/DMRModel.hpp | 26 ++-- src/TopicModel/DT.h | 14 ++- src/TopicModel/DTModel.cpp | 8 +- src/TopicModel/DTModel.hpp | 39 +++--- src/TopicModel/GDMR.h | 9 +- src/TopicModel/GDMRModel.cpp | 8 +- src/TopicModel/GDMRModel.hpp | 13 +- src/TopicModel/HDP.h | 4 +- src/TopicModel/HDPModel.cpp | 8 +- src/TopicModel/HDPModel.hpp | 24 ++-- src/TopicModel/HLDA.h | 4 +- src/TopicModel/HLDAModel.cpp | 8 +- src/TopicModel/HLDAModel.hpp | 26 ++-- src/TopicModel/HPA.h | 4 +- src/TopicModel/HPAModel.cpp | 10 +- src/TopicModel/HPAModel.hpp | 21 ++-- src/TopicModel/LDA.h | 8 +- src/TopicModel/LDACVB0Model.hpp | 14 +-- src/TopicModel/LDAModel.cpp | 8 +- src/TopicModel/LDAModel.hpp | 92 ++++++++------ src/TopicModel/LLDA.h | 4 +- src/TopicModel/LLDAModel.cpp | 8 +- src/TopicModel/LLDAModel.hpp | 14 +-- src/TopicModel/MGLDA.h | 3 +- src/TopicModel/MGLDAModel.cpp | 10 +- src/TopicModel/MGLDAModel.hpp | 16 +-- src/TopicModel/PA.h | 4 +- src/TopicModel/PAModel.cpp | 8 +- src/TopicModel/PAModel.hpp | 20 ++-- src/TopicModel/PLDA.h | 3 +- src/TopicModel/PLDAModel.cpp | 8 +- src/TopicModel/PLDAModel.hpp | 14 +-- src/TopicModel/SLDA.h | 11 +- src/TopicModel/SLDAModel.cpp | 8 +- src/TopicModel/SLDAModel.hpp | 15 +-- src/TopicModel/TopicModel.hpp | 34 ++---- src/Utils/Dictionary.h | 2 +- src/Utils/EigenAddonOps.hpp | 205 -------------------------------- src/Utils/LUT.hpp | 10 +- src/Utils/Utils.hpp | 2 +- src/Utils/avx_mathfun.h | 62 +++++----- src/Utils/math.h | 10 +- src/Utils/sample.hpp | 117 
++++++------------ src/Utils/serializer.hpp | 14 +-- src/Utils/sse_mathfun.h | 62 +++++----- src/python/PyUtils.h | 11 +- src/python/docs.h | 14 ++- src/python/py_CT.cpp | 25 +++- src/python/py_DMR.cpp | 25 +++- src/python/py_DT.cpp | 32 ++++- src/python/py_GDMR.cpp | 26 +++- src/python/py_HDP.cpp | 27 ++++- src/python/py_HLDA.cpp | 27 ++++- src/python/py_HPA.cpp | 27 ++++- src/python/py_LDA.cpp | 26 +++- src/python/py_LLDA.cpp | 27 ++++- src/python/py_MGLDA.cpp | 28 ++++- src/python/py_PA.cpp | 29 ++++- src/python/py_PLDA.cpp | 27 ++++- src/python/py_SLDA.cpp | 49 ++++++-- tomotopy/documentation.kr.rst | 6 +- tomotopy/documentation.rst | 7 +- tomotopy/version.py | 2 +- 73 files changed, 822 insertions(+), 728 deletions(-) create mode 100644 examples/dtm_plot.py diff --git a/README.kr.rst b/README.kr.rst index 6be3782..110e624 100644 --- a/README.kr.rst +++ b/README.kr.rst @@ -35,7 +35,7 @@ tomotopy 란? 더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다. -tomotopy의 가장 최신버전은 0.8.1 입니다. +tomotopy의 가장 최신버전은 0.8.2 입니다. 시작하기 --------------- @@ -240,6 +240,10 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma 역사 ------- +* 0.8.2 (2020-07-14) + * `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다. + * `seed`가 동일해서 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다. + * 0.8.1 (2020-06-08) * `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다. * 이제 `tomotopy.CTModel.prior_cov`가 `[k, k]` 모양의 공분산 행렬을 반환합니다. diff --git a/README.rst b/README.rst index b43fe74..95df717 100644 --- a/README.rst +++ b/README.rst @@ -246,6 +246,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh History ------- +* 0.8.2 (2020-07-14) + * New properties `tomotopy.DTModel.num_timepoints` and `tomotopy.DTModel.num_docs_by_timepoint` have been added. 
+ * A bug which causes different results with the different platform even if `seeds` were the same was partially fixed. + As a result of this fix, now `tomotopy` in 32 bit yields different training results from earlier version. + * 0.8.1 (2020-06-08) * A bug where `tomotopy.LDAModel.used_vocabs` returned an incorrect value was fixed. * Now `tomotopy.CTModel.prior_cov` returns a covariance matrix with shape `[k, k]`. diff --git a/examples/dtm_plot.py b/examples/dtm_plot.py new file mode 100644 index 0000000..ffac27d --- /dev/null +++ b/examples/dtm_plot.py @@ -0,0 +1,44 @@ +import tomotopy as tp +import numpy as np +import nltk + +def data_feeder(input_file): + for line in open(input_file, encoding='utf-8'): + fd = line.strip().split(maxsplit=1) + timepoint = int(fd[0]) + yield fd[1], None, {'timepoint':timepoint} + +porter_stemmer = nltk.PorterStemmer().stem +corpus = tp.utils.Corpus( + tokenizer=tp.utils.SimpleTokenizer(porter_stemmer) +) +corpus.process(data_feeder('../test/sample_tp.txt')) + +num_timepoints = 13 + +mdl = tp.DTModel(min_cf=3, k=10, t=num_timepoints, phi_var=1e-2, corpus=corpus) +mdl.train(0) + +print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format( + len(mdl.docs), len(mdl.used_vocabs), mdl.num_words +)) +print('Removed Top words: ', *mdl.removed_top_words) + +# Let's train the model +for i in range(0, 1000, 20): + print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word)) + mdl.train(20) +print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word)) + +topic_dist_by_time = np.zeros(shape=[num_timepoints, mdl.k], dtype=np.float) +doc_counts_by_time = np.zeros(shape=[num_timepoints], dtype=np.int32) +for doc in mdl.docs: + doc_counts_by_time[doc.timepoint] += 1 + topic_dist_by_time[doc.timepoint] += doc.get_topic_dist() + +topic_dist_by_time /= doc_counts_by_time[:, np.newaxis] + +for k in range(mdl.k): + print('Topic #{}'.format(k), *(w for w, _ in mdl.get_topic_words(k, 0, top_n=5))) + print(topic_dist_by_time[:, 
k]) + \ No newline at end of file diff --git a/examples/gdmr_plot.py b/examples/gdmr_plot.py index 57e1594..e32b5f8 100644 --- a/examples/gdmr_plot.py +++ b/examples/gdmr_plot.py @@ -51,7 +51,7 @@ def __call__(self, value, clip=None): ''' corpus = tp.utils.Corpus() -for line in open('examples/dataset2.txt', encoding='utf-8'): +for line in open('dataset2.txt', encoding='utf-8'): fd = line.strip().split() corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2]))) diff --git a/src/Labeling/FoRelevance.cpp b/src/Labeling/FoRelevance.cpp index b001fbf..1dec18b 100644 --- a/src/Labeling/FoRelevance.cpp +++ b/src/Labeling/FoRelevance.cpp @@ -138,7 +138,7 @@ std::vector PMIExtractor::extract(const tomoto::ITopicModel * tm) con trieNodes[0].traverse_with_keys([&](const TrieEx* node, const std::vector& rkeys) { if (rkeys.size() <= 2 || node->val < candMinCnt) return; - float n = tm->getN(); + float n = (float)tm->getN(); auto pmi = node->val / n; for (auto k : rkeys) { diff --git a/src/TopicModel/CT.h b/src/TopicModel/CT.h index 44cc688..7539996 100644 --- a/src/TopicModel/CT.h +++ b/src/TopicModel/CT.h @@ -3,11 +3,11 @@ namespace tomoto { - template - struct DocumentCTM : public DocumentLDA<_tw, _Flags> + template + struct DocumentCTM : public DocumentLDA<_tw> { - using BaseDocument = DocumentLDA<_tw, _Flags>; - using DocumentLDA<_tw, _Flags>::DocumentLDA; + using BaseDocument = DocumentLDA<_tw>; + using DocumentLDA<_tw>::DocumentLDA; Eigen::Matrix beta; // Dim: (K, betaSample) Eigen::Matrix smBeta; // Dim: K @@ -21,7 +21,8 @@ namespace tomoto using DefaultDocType = DocumentCTM; static ICTModel* create(TermWeight _weight, size_t _K = 1, Float smoothingAlpha = 0.1, Float _eta = 0.01, - const RandGen& _rg = RandGen{ std::random_device{}() }); + size_t seed = std::random_device{}(), + bool scalarRng = false); virtual void setNumBetaSample(size_t numSample) = 0; virtual size_t getNumBetaSample() const = 0; diff --git a/src/TopicModel/CTModel.cpp b/src/TopicModel/CTModel.cpp 
index 8ed2e2a..b4afbba 100644 --- a/src/TopicModel/CTModel.cpp +++ b/src/TopicModel/CTModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class CTModel; + /*template class CTModel; template class CTModel; - template class CTModel; + template class CTModel;*/ - ICTModel* ICTModel::create(TermWeight _weight, size_t _K, Float smoothingAlpha, Float _eta, const RandGen& _rg) + ICTModel* ICTModel::create(TermWeight _weight, size_t _K, Float smoothingAlpha, Float _eta, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, CTModel, _K, smoothingAlpha, _eta, _rg); + TMT_SWITCH_TW(_weight, scalarRng, CTModel, _K, smoothingAlpha, _eta, seed); } } \ No newline at end of file diff --git a/src/TopicModel/CTModel.hpp b/src/TopicModel/CTModel.hpp index 58259be..b523f43 100644 --- a/src/TopicModel/CTModel.hpp +++ b/src/TopicModel/CTModel.hpp @@ -16,18 +16,19 @@ namespace tomoto { }; - template, typename _ModelState = ModelStateCTM<_tw>> - class CTModel : public LDAModel<_tw, _Flags, _Interface, - typename std::conditional::value, CTModel<_tw, _Flags>, _Derived>::type, + class CTModel : public LDAModel<_tw, _RandGen, _Flags, _Interface, + typename std::conditional::value, CTModel<_tw, _RandGen, _Flags>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, CTModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, CTModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -53,12 +54,11 @@ namespace tomoto return &zLikelihood[0]; } - void updateBeta(_DocType& doc, RandGen& rg) const + void updateBeta(_DocType& doc, _RandGen& rg) const { Eigen::Matrix pbeta, lowerBound, upperBound; constexpr Float epsilon = 1e-8; 
constexpr size_t burnIn = 3; - sample::FastRealGenerator frg; pbeta = lowerBound = upperBound = Eigen::Matrix::Zero(this->K); for (size_t i = 0; i < numBetaSample + burnIn; ++i) @@ -71,7 +71,7 @@ namespace tomoto { Float N_k = doc.numByTopic[k] + this->alpha; Float N_nk = doc.getSumWordWeight() + this->alpha * (this->K + 1) - N_k; - Float u1 = frg(rg), u2 = frg(rg); + Float u1 = rg.uniform_real(), u2 = rg.uniform_real(); Float max_uk = epsilon + pow(u1, (Float)1 / N_k) * (pbeta[k] - epsilon); Float min_unk = (1 - pow(u2, (Float)1 / N_nk)) * (1 - pbeta[k]) + pbeta[k]; @@ -111,7 +111,7 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { BaseClass::template sampleDocument<_ps, _infer>(doc, edd, docId, ld, rgs, iterationCnt, partitionId); /*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0) @@ -121,7 +121,7 @@ namespace tomoto } template - void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) const + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) const { if (this->iterated < this->burnIn || !this->optimInterval || (this->iterated + 1) % this->optimInterval != 0) return; @@ -154,7 +154,7 @@ namespace tomoto } } - int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, RandGen* rgs) + int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { std::cerr << "Failed to sample! Reset prior and retry!" 
<< std::endl; const size_t chStride = std::min(pool.getNumWorkers() * 8, this->docs.size()); @@ -175,7 +175,7 @@ namespace tomoto return 0; } - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { std::vector> res; topicPrior = math::MultiNormalDistribution::estimate([this](size_t i) @@ -239,7 +239,7 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numBetaSample, numTMNSample, topicPrior); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numBetaSample, numTMNSample, topicPrior); - CTModel(size_t _K = 1, Float smoothingAlpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }) + CTModel(size_t _K = 1, Float smoothingAlpha = 0.1, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_K, smoothingAlpha, _eta, _rg) { this->optimInterval = 2; diff --git a/src/TopicModel/DMR.h b/src/TopicModel/DMR.h index a0c9362..f29da49 100644 --- a/src/TopicModel/DMR.h +++ b/src/TopicModel/DMR.h @@ -3,11 +3,11 @@ namespace tomoto { - template - struct DocumentDMR : public DocumentLDA<_tw, _Flags> + template + struct DocumentDMR : public DocumentLDA<_tw> { - using BaseDocument = DocumentLDA<_tw, _Flags>; - using DocumentLDA<_tw, _Flags>::DocumentLDA; + using BaseDocument = DocumentLDA<_tw>; + using DocumentLDA<_tw>::DocumentLDA; size_t metadata = 0; DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata); @@ -20,7 +20,8 @@ namespace tomoto using DefaultDocType = DocumentDMR; static IDMRModel* create(TermWeight _weight, size_t _K = 1, Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _eta = 0.01, Float _alphaEps = 1e-10, - const RandGen& _rg = RandGen{ std::random_device{}() }); + size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t addDoc(const std::vector& words, const std::vector& metadata) = 0; virtual 
std::unique_ptr makeDoc(const std::vector& words, const std::vector& metadata) const = 0; diff --git a/src/TopicModel/DMRModel.cpp b/src/TopicModel/DMRModel.cpp index 22182b8..dbff5e2 100644 --- a/src/TopicModel/DMRModel.cpp +++ b/src/TopicModel/DMRModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class DMRModel; + /*template class DMRModel; template class DMRModel; - template class DMRModel; + template class DMRModel;*/ - IDMRModel* IDMRModel::create(TermWeight _weight, size_t _K, Float _defaultAlpha, Float _sigma, Float _eta, Float _alphaEps, const RandGen& _rg) + IDMRModel* IDMRModel::create(TermWeight _weight, size_t _K, Float _defaultAlpha, Float _sigma, Float _eta, Float _alphaEps, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, DMRModel, _K, _defaultAlpha, _sigma, _eta, _alphaEps, _rg); + TMT_SWITCH_TW(_weight, scalarRng, DMRModel, _K, _defaultAlpha, _sigma, _eta, _alphaEps, seed); } } \ No newline at end of file diff --git a/src/TopicModel/DMRModel.hpp b/src/TopicModel/DMRModel.hpp index 25f71c7..a32e12d 100644 --- a/src/TopicModel/DMRModel.hpp +++ b/src/TopicModel/DMRModel.hpp @@ -16,18 +16,19 @@ namespace tomoto Eigen::Matrix tmpK; }; - template, typename _ModelState = ModelStateDMR<_tw>> - class DMRModel : public LDAModel<_tw, _Flags, _Interface, - typename std::conditional::value, DMRModel<_tw, _Flags>, _Derived>::type, + class DMRModel : public LDAModel<_tw, _RandGen, _Flags, _Interface, + typename std::conditional::value, DMRModel<_tw, _RandGen, _Flags>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, DMRModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, DMRModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; 
using WeightType = typename BaseClass::WeightType; @@ -118,7 +119,7 @@ namespace tomoto } } - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { Eigen::Matrix bLambda; Float fx = 0, bestFx = INFINITY; @@ -146,7 +147,7 @@ namespace tomoto expLambda = lambda.array().exp() + alphaEps; } - int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, RandGen* rgs) + int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { std::cerr << "Failed to optimize! Reset prior and retry!" << std::endl; lambda.setZero(); @@ -254,7 +255,7 @@ namespace tomoto DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda); DMRModel(size_t _K = 1, Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _eta = 0.01, - Float _alphaEps = 0, const RandGen& _rg = RandGen{ std::random_device{}() }) + Float _alphaEps = 0, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_K, defaultAlpha, _eta, _rg), sigma(_sigma), alphaEps(_alphaEps) { if (_sigma <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong sigma value (sigma = %f)", _sigma)); @@ -362,11 +363,12 @@ namespace tomoto }; /* This is for preventing 'undefined symbol' problem in compiling by clang. 
*/ - template - constexpr Float DMRModel<_tw, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxLambda; + constexpr Float DMRModel<_tw, _RandGen, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxLambda; - template - constexpr size_t DMRModel<_tw, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxBFGSIteration; + constexpr size_t DMRModel<_tw, _RandGen, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxBFGSIteration; + } diff --git a/src/TopicModel/DT.h b/src/TopicModel/DT.h index 34ec99e..8996178 100644 --- a/src/TopicModel/DT.h +++ b/src/TopicModel/DT.h @@ -4,12 +4,11 @@ namespace tomoto { - template - struct DocumentDTM : public DocumentLDA<_tw, _Flags> + template + struct DocumentDTM : public DocumentLDA<_tw> { - using BaseDocument = DocumentLDA<_tw, _Flags>; - using DocumentLDA<_tw, _Flags>::DocumentLDA; - using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type; + using BaseDocument = DocumentLDA<_tw>; + using DocumentLDA<_tw>::DocumentLDA; size_t timepoint = 0; ShareableVector eta; @@ -27,7 +26,8 @@ namespace tomoto Float _alphaVar = 1.0, Float _etaVar = 1.0, Float _phiVar = 1.0, Float _shapeA = 0.03, Float _shapeB = 0.1, Float _shapeC = 0.55, Float _etaRegL2 = 0, - const RandGen& _rg = RandGen{ std::random_device{}() }); + size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t addDoc(const std::vector& words, size_t timepoint) = 0; virtual std::unique_ptr makeDoc(const std::vector& words, size_t timepoint) const = 0; @@ -45,6 +45,8 @@ namespace tomoto size_t timepoint) const = 0; virtual size_t getT() const = 0; + virtual std::vector getNumDocsByT() const = 0; + virtual Float getAlphaVar() const = 0; virtual Float getEtaVar() const = 0; virtual Float getPhiVar() const = 0; diff --git a/src/TopicModel/DTModel.cpp b/src/TopicModel/DTModel.cpp index 84e64c0..5df744e 100644 --- a/src/TopicModel/DTModel.cpp +++ b/src/TopicModel/DTModel.cpp @@ -2,14 +2,14 @@ namespace tomoto { 
- template class DTModel; + /*template class DTModel; template class DTModel; - template class DTModel; + template class DTModel;*/ IDTModel* IDTModel::create(TermWeight _weight, size_t _K, size_t _T, Float _alphaVar, Float _etaVar, Float _phiVar, - Float _shapeA, Float _shapeB, Float _shapeC, Float _etaRegL2, const RandGen& _rg) + Float _shapeA, Float _shapeB, Float _shapeC, Float _etaRegL2, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, DTModel, _K, _T, _alphaVar, _etaVar, _phiVar, _shapeA, _shapeB, _shapeC, _etaRegL2, _rg); + TMT_SWITCH_TW(_weight, scalarRng, DTModel, _K, _T, _alphaVar, _etaVar, _phiVar, _shapeA, _shapeB, _shapeC, _etaRegL2, seed); } } diff --git a/src/TopicModel/DTModel.hpp b/src/TopicModel/DTModel.hpp index 00f0949..7eeacd2 100644 --- a/src/TopicModel/DTModel.hpp +++ b/src/TopicModel/DTModel.hpp @@ -23,18 +23,19 @@ namespace tomoto DEFINE_SERIALIZER(numByTopic, numByTopicWord); }; - template, typename _ModelState = ModelStateDTM<_tw>> - class DTModel : public LDAModel<_tw, _Flags, _Interface, - typename std::conditional::value, DTModel<_tw, _Flags>, _Derived>::type, + class DTModel : public LDAModel<_tw, _RandGen, _Flags, _Interface, + typename std::conditional::value, DTModel<_tw, _RandGen, _Flags>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, DTModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, DTModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -76,7 +77,7 @@ namespace tomoto - sampling alpha */ - void presampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + void presampleDocument(_DocType& doc, 
size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt) const { const Float eps = shapeA * (std::pow(shapeB + 1 + iterationCnt, -shapeC)); @@ -88,7 +89,7 @@ namespace tomoto auto prior = (alphas.col(doc.timepoint) - doc.eta) / std::max(etaVar, eps * 2); auto grad = doc.numByTopic.template cast() - estimatedCnt; doc.eta.array() += (eps / 2) * (prior.array() + grad.array()) - + Eigen::norm_dist>(this->K, 1, rgs) * eps; + + Eigen::Rand::normal>(this->K, 1, rgs) * eps; } Eigen::Array expEta = (doc.eta.array() - doc.eta.maxCoeff()).exp(); @@ -96,7 +97,7 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) @@ -106,7 +107,6 @@ namespace tomoto } size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? 
edd.vChunkOffset[partitionId - 1] : 0; - sample::FastRealGenerator frg; // sampling zeta for (size_t w = b; w < e; ++w) @@ -122,14 +122,14 @@ namespace tomoto Float acceptance = std::min(1.f, std::exp(phi(v, new_z + this->K * doc.timepoint) - phi(v, doc.Zs[w] + this->K * doc.timepoint)) ); - if (acceptance >= 1 || frg(rgs) < acceptance) doc.Zs[w] = new_z; + if (acceptance >= 1 || rgs.uniform_real() < acceptance) doc.Zs[w] = new_z; // word proposal new_z = wordAliasTables[v + this->realV * doc.timepoint](rgs); acceptance = std::min(1.f, std::exp(doc.eta(new_z) - doc.eta(doc.Zs[w])) ); - if (acceptance >= 1 || frg(rgs) < acceptance) doc.Zs[w] = new_z; + if (acceptance >= 1 || rgs.uniform_real() < acceptance) doc.Zs[w] = new_z; } addWordTo<1>(ld, doc, w, v - vOffset, doc.Zs[w]); @@ -137,7 +137,7 @@ namespace tomoto } template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, _RandGen*, const _ExtraDocData& edd) const { std::vector> res; @@ -194,7 +194,7 @@ namespace tomoto } template - void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) { const auto K = this->K; const Float eps = shapeA * (std::pow(shapeB + 1 + this->iterated, -shapeC)); @@ -211,7 +211,7 @@ namespace tomoto Eigen::Matrix grad = this->globalState.numByTopicWord.row(k + K * t).template cast(); grad -= estimatedCnt; - auto epsNoise = Eigen::norm_dist>(this->realV, 1, *rgs) * eps; + auto epsNoise = Eigen::Rand::normal>(this->realV, 1, *rgs) * eps; if (t == 0) { if (T > 1) @@ -308,18 +308,18 @@ namespace tomoto newAlpha += (alphas.col(t + 1) + alphas.col(t - 1)) / alphaVar; } newAlpha /= lambda; - newAlpha.array() += 
Eigen::norm_dist>(this->K, 1, *rgs) / std::sqrt(lambda); + newAlpha.array() += Eigen::Rand::normal>(this->K, 1, *rgs) / std::sqrt(lambda); } alphas = newAlphas; } template - void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) const + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) const { // do nothing } - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { } @@ -446,7 +446,7 @@ namespace tomoto } template - void updateStateWithDoc(_Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(_Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { auto& z = doc.Zs[i]; auto w = doc.words[i]; @@ -495,6 +495,7 @@ namespace tomoto T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi); GETTER(T, size_t, T); + GETTER(NumDocsByT, std::vector, numDocsByTime); GETTER(AlphaVar, Float, alphaVar); GETTER(EtaVar, Float, etaVar); GETTER(PhiVar, Float, phiVar); @@ -504,7 +505,7 @@ namespace tomoto GETTER(ShapeC, Float, shapeC); DTModel(size_t _K, size_t _T, Float _alphaVar, Float _etaVar, Float _phiVar, - Float _shapeA, Float _shapeB, Float _shapeC, Float _etaRegL2, const RandGen& _rg) + Float _shapeA, Float _shapeB, Float _shapeC, Float _etaRegL2, const _RandGen& _rg) : BaseClass{ _K, _alphaVar, _etaVar, _rg }, T{ _T }, alphaVar{ _alphaVar }, etaVar{ _etaVar }, phiVar{ _phiVar }, shapeA{ _shapeA }, shapeB{ _shapeB }, shapeC{ _shapeC }, etaRegL2{ _etaRegL2 } diff --git a/src/TopicModel/GDMR.h b/src/TopicModel/GDMR.h index e653577..ae1aaac 100644 --- a/src/TopicModel/GDMR.h +++ b/src/TopicModel/GDMR.h @@ -4,10 +4,10 @@ namespace tomoto { template - struct DocumentGDMR : public DocumentDMR<_tw, _Flags> + struct DocumentGDMR : public DocumentDMR<_tw> { - using 
BaseDocument = DocumentDMR<_tw, _Flags>; - using DocumentDMR<_tw, _Flags>::DocumentDMR; + using BaseDocument = DocumentDMR<_tw>; + using DocumentDMR<_tw>::DocumentDMR; std::vector metadataOrg, metadataNormalized; DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadataOrg); @@ -20,7 +20,8 @@ namespace tomoto using DefaultDocType = DocumentDMR; static IGDMRModel* create(TermWeight _weight, size_t _K = 1, const std::vector& _degreeByF = {}, Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _sigma0 = 1.0, Float _eta = 0.01, Float _alphaEps = 1e-10, - const RandGen& _rg = RandGen{ std::random_device{}() }); + size_t seed = std::random_device{}(), + bool scalarRng = false); virtual Float getSigma0() const = 0; virtual void setSigma0(Float) = 0; diff --git a/src/TopicModel/GDMRModel.cpp b/src/TopicModel/GDMRModel.cpp index 6f49f69..d33c61b 100644 --- a/src/TopicModel/GDMRModel.cpp +++ b/src/TopicModel/GDMRModel.cpp @@ -2,13 +2,13 @@ namespace tomoto { - template class GDMRModel; + /*template class GDMRModel; template class GDMRModel; - template class GDMRModel; + template class GDMRModel;*/ IGDMRModel* IGDMRModel::create(TermWeight _weight, size_t _K, const std::vector& degreeByF, - Float _defaultAlpha, Float _sigma, Float _sigma0, Float _eta, Float _alphaEps, const RandGen& _rg) + Float _defaultAlpha, Float _sigma, Float _sigma0, Float _eta, Float _alphaEps, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, GDMRModel, _K, degreeByF, _defaultAlpha, _sigma, _sigma0, _eta, _alphaEps, _rg); + TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, _K, degreeByF, _defaultAlpha, _sigma, _sigma0, _eta, _alphaEps, seed); } } diff --git a/src/TopicModel/GDMRModel.hpp b/src/TopicModel/GDMRModel.hpp index 88ec5b2..f3190ac 100644 --- a/src/TopicModel/GDMRModel.hpp +++ b/src/TopicModel/GDMRModel.hpp @@ -14,18 +14,19 @@ namespace tomoto std::vector ndimCnt;*/ }; - template, typename _ModelState = ModelStateGDMR<_tw>> - class GDMRModel : public DMRModel<_tw, _Flags, _Interface, - 
typename std::conditional::value, GDMRModel<_tw>, _Derived>::type, + class GDMRModel : public DMRModel<_tw, _RandGen, _Flags, _Interface, + typename std::conditional::value, GDMRModel<_tw, _RandGen>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, GDMRModel<_tw>, _Derived>::type; - using BaseClass = DMRModel<_tw, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, GDMRModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = DMRModel<_tw, _RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; friend typename BaseClass::BaseClass::BaseClass; @@ -335,7 +336,7 @@ namespace tomoto GDMRModel(size_t _K = 1, const std::vector& _degreeByF = {}, Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _sigma0 = 1.0, Float _eta = 0.01, - Float _alphaEps = 1e-10, const RandGen& _rg = RandGen{ std::random_device{}() }) + Float _alphaEps = 1e-10, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_K, defaultAlpha, _sigma, _eta, _alphaEps, _rg), sigma0(_sigma0), degreeByF(_degreeByF) { this->F = accumulate(degreeByF.begin(), degreeByF.end(), 1, [](size_t a, size_t b) {return a * (b + 1); }); diff --git a/src/TopicModel/HDP.h b/src/TopicModel/HDP.h index 4065aec..2226461 100644 --- a/src/TopicModel/HDP.h +++ b/src/TopicModel/HDP.h @@ -60,7 +60,9 @@ namespace tomoto { public: using DefaultDocType = DocumentHDP; - static IHDPModel* create(TermWeight _weight, size_t _K = 1, Float alpha = 0.1, Float eta = 0.01, Float gamma = 0.1, const RandGen& _rg = RandGen{ std::random_device{}() }); + static IHDPModel* create(TermWeight _weight, size_t _K = 1, + Float alpha = 0.1, Float eta = 0.01, Float gamma = 0.1, size_t seed = std::random_device{}(), + bool scalarRng = false); virtual Float getGamma() const = 0; virtual size_t getTotalTables() const = 0; diff --git 
a/src/TopicModel/HDPModel.cpp b/src/TopicModel/HDPModel.cpp index 367e555..89c38a9 100644 --- a/src/TopicModel/HDPModel.cpp +++ b/src/TopicModel/HDPModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class HDPModel; + /*template class HDPModel; template class HDPModel; - template class HDPModel; + template class HDPModel;*/ - IHDPModel* IHDPModel::create(TermWeight _weight, size_t _K, Float _alpha , Float _eta, Float _gamma, const RandGen& _rg) + IHDPModel* IHDPModel::create(TermWeight _weight, size_t _K, Float _alpha , Float _eta, Float _gamma, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, HDPModel, _K, _alpha, _eta, _gamma, _rg); + TMT_SWITCH_TW(_weight, scalarRng, HDPModel, _K, _alpha, _eta, _gamma, seed); } } \ No newline at end of file diff --git a/src/TopicModel/HDPModel.hpp b/src/TopicModel/HDPModel.hpp index b931623..3b0423a 100644 --- a/src/TopicModel/HDPModel.hpp +++ b/src/TopicModel/HDPModel.hpp @@ -21,18 +21,18 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable); }; - template, typename _ModelState = ModelStateHDP<_tw>> - class HDPModel : public LDAModel<_tw, 0, _Interface, - typename std::conditional::value, HDPModel<_tw>, _Derived>::type, + class HDPModel : public LDAModel<_tw, _RandGen, 0, _Interface, + typename std::conditional::value, HDPModel<_tw, _RandGen>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, HDPModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, 0, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, HDPModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, 0, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -40,7 +40,7 @@ namespace tomoto Float gamma; template - static Float estimateConcentrationParameter(_NumFunc 
ns, Float tableCnt, size_t size, Float alpha, RandGen& rgs) + static Float estimateConcentrationParameter(_NumFunc ns, Float tableCnt, size_t size, Float alpha, _RandGen& rgs) { Float a = 1, b = 1; for (size_t i = 0; i < 10; ++i) @@ -61,7 +61,7 @@ namespace tomoto return alpha; } - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { size_t tableCnt = 0; for (auto& doc : this->docs) @@ -191,7 +191,7 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { // sample a table for each word for (size_t w = 0; w < doc.words.size(); ++w) @@ -291,7 +291,7 @@ namespace tomoto } template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, _RandGen*, const _ExtraDocData& edd) const { std::vector> res; const size_t V = this->realV; @@ -416,7 +416,7 @@ namespace tomoto } template - void updateStateWithDoc(typename BaseClass::Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(typename BaseClass::Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { // generate tables for each topic when inferring if (_Infer) @@ -481,7 +481,7 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma); - HDPModel(size_t initialK = 2, Float _alpha = 0.1, Float _eta = 0.01, Float _gamma = 0.1, const RandGen& _rg = 
RandGen{ std::random_device{}() }) + HDPModel(size_t initialK = 2, Float _alpha = 0.1, Float _eta = 0.01, Float _gamma = 0.1, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(initialK, _alpha, _eta, _rg), gamma(_gamma) { if (_gamma <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong gamma value (gamma = %f)", _gamma)); @@ -540,7 +540,7 @@ namespace tomoto liveK++; } - auto lda = make_unique>(liveK, 0.1f, this->eta); + auto lda = make_unique>(liveK, 0.1f, this->eta); lda->dict = this->dict; for (auto& doc : this->docs) diff --git a/src/TopicModel/HLDA.h b/src/TopicModel/HLDA.h index 798164b..c2a25e0 100644 --- a/src/TopicModel/HLDA.h +++ b/src/TopicModel/HLDA.h @@ -24,7 +24,9 @@ namespace tomoto { public: using DefaultDocType = DocumentHLDA; - static IHLDAModel* create(TermWeight _weight, size_t levelDepth = 1, Float alpha = 0.1, Float eta = 0.01, Float gamma = 0.1, const RandGen& _rg = RandGen{ std::random_device{}() }); + static IHLDAModel* create(TermWeight _weight, size_t levelDepth = 1, + Float alpha = 0.1, Float eta = 0.01, Float gamma = 0.1, size_t seed = std::random_device{}(), + bool scalarRng = false); virtual Float getGamma() const = 0; virtual size_t getLiveK() const = 0; diff --git a/src/TopicModel/HLDAModel.cpp b/src/TopicModel/HLDAModel.cpp index 1145dc4..d1d01e1 100644 --- a/src/TopicModel/HLDAModel.cpp +++ b/src/TopicModel/HLDAModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class HLDAModel; + /*template class HLDAModel; template class HLDAModel; - template class HLDAModel; + template class HLDAModel;*/ - IHLDAModel* IHLDAModel::create(TermWeight _weight, size_t levelDepth, Float _alpha, Float _eta, Float _gamma, const RandGen& _rg) + IHLDAModel* IHLDAModel::create(TermWeight _weight, size_t levelDepth, Float _alpha, Float _eta, Float _gamma, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, HLDAModel, levelDepth, _alpha, _eta, _gamma, _rg); + TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, levelDepth, 
_alpha, _eta, _gamma, seed); } } \ No newline at end of file diff --git a/src/TopicModel/HLDAModel.hpp b/src/TopicModel/HLDAModel.hpp index 2ce9f32..16a874a 100644 --- a/src/TopicModel/HLDAModel.hpp +++ b/src/TopicModel/HLDAModel.hpp @@ -312,18 +312,18 @@ namespace tomoto } }; - template, typename _ModelState = ModelStateHLDA<_tw>> - class HLDAModel : public LDAModel<_tw, flags::shared_state, _Interface, - typename std::conditional::value, HLDAModel<_tw>, _Derived>::type, + class HLDAModel : public LDAModel<_tw, _RandGen, flags::shared_state, _Interface, + typename std::conditional::value, HLDAModel<_tw, _RandGen>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, HLDAModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, flags::shared_state, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, HLDAModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, flags::shared_state, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -332,7 +332,7 @@ namespace tomoto Float gamma; - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { // for alphas BaseClass::optimizeParameters(pool, localData, rgs); @@ -342,7 +342,7 @@ namespace tomoto // Words of all documents should be sorted by ascending order. 
template - void samplePathes(_DocType& doc, ThreadPool* pool, _ModelState& ld, RandGen& rgs) const + void samplePathes(_DocType& doc, ThreadPool* pool, _ModelState& ld, _RandGen& rgs) const { if(_MakeNewPath) ld.nt->nodes[doc.path.back()].dropPathOne(); ld.nt->template calcNodeLikelihood<_MakeNewPath>(gamma, this->K); @@ -439,7 +439,7 @@ namespace tomoto return &zLikelihood[0]; } - void sampleTopics(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs) const + void sampleTopics(_DocType& doc, size_t docId, _ModelState& ld, _RandGen& rgs) const { for (size_t w = 0; w < doc.words.size(); ++w) { @@ -461,13 +461,13 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { sampleTopics(doc, docId, ld, rgs); } template - void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) { for (auto doc = first; doc != last; ++doc) { @@ -477,7 +477,7 @@ namespace tomoto } template - void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) const + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) const { for (auto doc = first; doc != last; ++doc) { @@ -556,7 +556,7 @@ namespace tomoto } template - void updateStateWithDoc(typename BaseClass::Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(typename BaseClass::Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { if (i == 0) { @@ -599,7 +599,7 @@ namespace tomoto 
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma); - HLDAModel(size_t _levelDepth = 4, Float _alpha = 0.1, Float _eta = 0.01, Float _gamma = 0.1, const RandGen& _rg = RandGen{ std::random_device{}() }) + HLDAModel(size_t _levelDepth = 4, Float _alpha = 0.1, Float _eta = 0.01, Float _gamma = 0.1, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_levelDepth, _alpha, _eta, _rg), gamma(_gamma) { if (_levelDepth == 0 || _levelDepth >= 0x80000000) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong levelDepth value (levelDepth = %zd)", _levelDepth)); diff --git a/src/TopicModel/HPA.h b/src/TopicModel/HPA.h index 4fda870..4c62647 100644 --- a/src/TopicModel/HPA.h +++ b/src/TopicModel/HPA.h @@ -20,6 +20,8 @@ namespace tomoto { public: using DefaultDocType = DocumentHPA; - static IHPAModel* create(TermWeight _weight, bool _exclusive = false, size_t _K1 = 1, size_t _K2 = 1, Float _alpha = 50, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); + static IHPAModel* create(TermWeight _weight, bool _exclusive = false, size_t _K1 = 1, size_t _K2 = 1, + Float _alpha = 50, Float _eta = 0.01, size_t seed = std::random_device{}(), + bool scalarRng = false); }; } diff --git a/src/TopicModel/HPAModel.cpp b/src/TopicModel/HPAModel.cpp index 80d179e..77b504a 100644 --- a/src/TopicModel/HPAModel.cpp +++ b/src/TopicModel/HPAModel.cpp @@ -2,19 +2,19 @@ namespace tomoto { - template class HPAModel; + /*template class HPAModel; template class HPAModel; - template class HPAModel; + template class HPAModel;*/ - IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, size_t _K, size_t _K2, Float _alphaSum, Float _eta, const RandGen& _rg) + IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, size_t _K, size_t _K2, Float _alphaSum, Float _eta, size_t seed, bool scalarRng) { if (_exclusive) { - //SWITCH_TW(_weight, 
HPAModelExclusive, _K, _K2, _alphaSum, _eta, _rg); + //TMT_SWITCH_TW(_weight, HPAModelExclusive, _K, _K2, _alphaSum, _eta, seed); } else { - SWITCH_TW(_weight, HPAModel, _K, _K2, _alphaSum, _eta, _rg); + TMT_SWITCH_TW(_weight, scalarRng, HPAModel, _K, _K2, _alphaSum, _eta, seed); } return nullptr; } diff --git a/src/TopicModel/HPAModel.hpp b/src/TopicModel/HPAModel.hpp index c668e5f..2a8eb91 100644 --- a/src/TopicModel/HPAModel.hpp +++ b/src/TopicModel/HPAModel.hpp @@ -23,19 +23,19 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2); }; - template, typename _ModelState = ModelStateHPA<_tw>> - class HPAModel : public LDAModel<_tw, 0, _Interface, - typename std::conditional::value, HPAModel<_tw, _Exclusive>, _Derived>::type, + class HPAModel : public LDAModel<_tw, _RandGen, 0, _Interface, + typename std::conditional::value, HPAModel<_tw, _RandGen, _Exclusive>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, HPAModel<_tw, _Exclusive>, _Derived>::type; - using BaseClass = LDAModel<_tw, 0, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, HPAModel<_tw, _RandGen, _Exclusive>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, 0, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -49,7 +49,7 @@ namespace tomoto Eigen::Matrix subAlphaSum; // len = K Eigen::Matrix subAlphas; // len = K * (K2 + 1) - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { const auto K = this->K; for (size_t i = 0; i < iteration; ++i) @@ -175,7 +175,7 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t 
iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) @@ -248,7 +248,7 @@ namespace tomoto } template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, _RandGen*, const _ExtraDocData& edd) const { std::vector> res; @@ -418,7 +418,7 @@ namespace tomoto } template - void updateStateWithDoc(Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { auto w = doc.words[i]; switch (g.level(rgs)) @@ -450,7 +450,7 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum); - HPAModel(size_t _K1 = 1, size_t _K2 = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }) + HPAModel(size_t _K1 = 1, size_t _K2 = 1, Float _alpha = 0.1, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_K1, _alpha, _eta, _rg), K2(_K2) { if (_K2 == 0 || _K2 >= 0x80000000) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong K2 value (K2 = %zd)", _K2)); @@ -561,5 +561,4 @@ namespace tomoto } } - template using HPAModelExclusive = HPAModel<_tw, true>; } diff --git a/src/TopicModel/LDA.h b/src/TopicModel/LDA.h index 1239190..a186d47 100644 --- a/src/TopicModel/LDA.h +++ b/src/TopicModel/LDA.h @@ -76,8 +76,8 @@ namespace tomoto } }; - template - struct DocumentLDA : public DocumentBase, SumWordWeight, _tw> + template + 
struct DocumentLDA : public DocumentBase, SumWordWeight, _tw> { public: using DocumentBase::DocumentBase; @@ -113,7 +113,9 @@ namespace tomoto { public: using DefaultDocType = DocumentLDA; - static ILDAModel* create(TermWeight _weight, size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); + static ILDAModel* create(TermWeight _weight, size_t _K = 1, + Float _alpha = 0.1, Float _eta = 0.01, size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t addDoc(const std::vector& words) = 0; virtual std::unique_ptr makeDoc(const std::vector& words) const = 0; diff --git a/src/TopicModel/LDACVB0Model.hpp b/src/TopicModel/LDACVB0Model.hpp index 2342fd2..711ed03 100644 --- a/src/TopicModel/LDACVB0Model.hpp +++ b/src/TopicModel/LDACVB0Model.hpp @@ -51,7 +51,7 @@ namespace tomoto { public: using DefaultDocType = DocumentLDACVB0; - static ILDACVB0Model* create(size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); + static ILDACVB0Model* create(size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }); virtual size_t addDoc(const std::vector& words) = 0; virtual std::unique_ptr makeDoc(const std::vector& words) const = 0; @@ -138,7 +138,7 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { for (size_t w = 0; w < doc.words.size(); ++w) { @@ -155,7 +155,7 @@ namespace tomoto } template - void trainOne(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void trainOne(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { std::vector> res; const size_t chStride = std::min(pool.getNumWorkers() 
* 8, this->docs.size()); @@ -284,7 +284,7 @@ namespace tomoto } template - void updateStateWithDoc(Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { doc.Zs.col(i).setZero(); doc.Zs(g.theta(rgs), i) = 1; @@ -292,7 +292,7 @@ namespace tomoto } template - void initializeDocState(_DocType& doc, Float* topicDocPtr, _Generator& g, _ModelState& ld, RandGen& rgs) const + void initializeDocState(_DocType& doc, Float* topicDocPtr, _Generator& g, _ModelState& ld, _RandGen& rgs) const { std::vector tf(this->realV); static_cast(this)->prepareDoc(doc, topicDocPtr, doc.words.size()); @@ -332,7 +332,7 @@ namespace tomoto DEFINE_SERIALIZER(alpha, eta, K); public: - LDACVB0Model(size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }) + LDACVB0Model(size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_rg), K(_K), alpha(_alpha), eta(_eta) { alphas = Eigen::Matrix::Constant(K, alpha); @@ -434,7 +434,7 @@ namespace tomoto } } - inline ILDACVB0Model* ILDACVB0Model::create(size_t _K, Float _alpha, Float _eta, const RandGen& _rg) + inline ILDACVB0Model* ILDACVB0Model::create(size_t _K, Float _alpha, Float _eta, const _RandGen& _rg) { return new LDACVB0Model<>(_K, _alpha, _eta, _rg); } diff --git a/src/TopicModel/LDAModel.cpp b/src/TopicModel/LDAModel.cpp index c672b7a..7322cca 100644 --- a/src/TopicModel/LDAModel.cpp +++ b/src/TopicModel/LDAModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class LDAModel; + /*template class LDAModel; template class LDAModel; - template class LDAModel; + template class LDAModel;*/ - ILDAModel* ILDAModel::create(TermWeight _weight, size_t _K, Float _alpha, Float _eta, const RandGen& _rg) + ILDAModel* ILDAModel::create(TermWeight _weight, size_t _K, Float _alpha, Float _eta, size_t seed, bool 
scalarRng) { - SWITCH_TW(_weight, LDAModel, _K, _alpha, _eta, _rg); + TMT_SWITCH_TW(_weight, scalarRng, LDAModel, _K, _alpha, _eta, seed); } } diff --git a/src/TopicModel/LDAModel.hpp b/src/TopicModel/LDAModel.hpp index 43804bc..a7df634 100644 --- a/src/TopicModel/LDAModel.hpp +++ b/src/TopicModel/LDAModel.hpp @@ -19,14 +19,26 @@ Term Weighting Scheme is based on following paper: */ -#define SWITCH_TW(TW, MDL, ...) do{ switch (TW)\ - {\ - case TermWeight::one:\ - return new MDL(__VA_ARGS__);\ - case TermWeight::idf:\ - return new MDL(__VA_ARGS__);\ - case TermWeight::pmi:\ - return new MDL(__VA_ARGS__);\ +#define TMT_SWITCH_TW(TW, SRNG, MDL, ...) do{\ + if(SRNG){\ + switch (TW){\ + case TermWeight::one:\ + return new MDL(__VA_ARGS__);\ + case TermWeight::idf:\ + return new MDL(__VA_ARGS__);\ + case TermWeight::pmi:\ + return new MDL(__VA_ARGS__);\ + }\ + }\ + else{\ + switch (TW){\ + case TermWeight::one:\ + return new MDL(__VA_ARGS__);\ + case TermWeight::idf:\ + return new MDL(__VA_ARGS__);\ + case TermWeight::pmi:\ + return new MDL(__VA_ARGS__);\ + }\ }\ return nullptr; } while(0) @@ -116,25 +128,27 @@ namespace tomoto // to make HDP friend of LDA for HDPModel::converToLDA template class HDPModel; - template, + typename _DocType = DocumentLDA<_tw>, typename _ModelState = ModelStateLDA<_tw>> - class LDAModel : public TopicModel<_Flags, _Interface, - typename std::conditional::value, LDAModel<_tw, _Flags>, _Derived>::type, + class LDAModel : public TopicModel<_RandGen, _Flags, _Interface, + typename std::conditional::value, LDAModel<_tw, _RandGen, _Flags>, _Derived>::type, _DocType, _ModelState>, protected TwId<_tw> { protected: using DerivedClass = typename std::conditional::value, LDAModel, _Derived>::type; - using BaseClass = TopicModel<_Flags, _Interface, DerivedClass, _DocType, _ModelState>; + using BaseClass = TopicModel<_RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend EtaHelper; friend EtaHelper; @@ -143,6 +157,7 
@@ namespace tomoto typename, typename, typename, + typename, typename> friend class HDPModel; @@ -178,11 +193,12 @@ namespace tomoto auto dAlpha = math::digammaT(alpha); size_t suggested = (len + 127) / 128; + if (pool && suggested > pool->getNumWorkers()) suggested = pool->getNumWorkers(); if (suggested <= 1 || !pool) { return (math::digammaApprox(listExpr.array() + alpha) - dAlpha).sum(); } - if (suggested > pool->getNumWorkers()) suggested = pool->getNumWorkers(); + std::vector> futures; for (size_t i = 0; i < suggested; ++i) @@ -200,9 +216,9 @@ namespace tomoto } /* - function for optimizing hyperparameters + function for optimizing hyperparameters */ - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { const auto K = this->K; for (size_t i = 0; i < 10; ++i) @@ -268,7 +284,7 @@ namespace tomoto /* called once before sampleDocument */ - void presampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + void presampleDocument(_DocType& doc, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt) const { } @@ -276,7 +292,7 @@ namespace tomoto main sampling procedure (can be called one or more by ParallelScheme) */ template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) @@ -308,20 +324,20 @@ namespace tomoto } template - void performSampling(ThreadPool& pool, _ModelState* localData, RandGen* rgs, std::vector>& res, + void performSampling(ThreadPool& pool, _ModelState* localData, _RandGen* rgs, std::vector>& res, _DocIter docFirst, _DocIter docLast, const 
_ExtraDocData& edd) const { // single-threaded sampling if (_ps == ParallelScheme::none) { - size_t docId = 0; - for (auto doc = docFirst; doc != docLast; ++doc) + forRandom((size_t)std::distance(docFirst, docLast), rgs[0](), [&](size_t id) { - static_cast(this)->presampleDocument(*doc, docId, *localData, *rgs, this->iterated); + static_cast(this)->presampleDocument(docFirst[id], id, *localData, *rgs, this->iterated); static_cast(this)->template sampleDocument<_ps, _infer>( - *doc, edd, docId++, + docFirst[id], edd, id, *localData, *rgs, this->iterated, 0); - } + + }); } // multi-threaded sampling on partition ad update into global else if (_ps == ParallelScheme::partition) @@ -446,7 +462,7 @@ namespace tomoto } template - void trainOne(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void trainOne(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { std::vector> res; try @@ -461,7 +477,7 @@ namespace tomoto static_cast(this)->optimizeParameters(pool, localData, rgs); } } - catch (const exception::TrainingError& e) + catch (const exception::TrainingError&) { for (auto& r : res) if(r.valid()) r.get(); throw; @@ -480,7 +496,7 @@ namespace tomoto merges multithreaded document sampling result */ template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, _RandGen*, const _ExtraDocData& edd) const { std::vector> res; @@ -540,12 +556,12 @@ namespace tomoto * if pool is nullptr, workers has been already pooled and cannot branch works more. 
*/ template - void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) const + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) const { } template - void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) { } @@ -659,7 +675,7 @@ namespace tomoto } template - void updateStateWithDoc(Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { auto& z = doc.Zs[i]; auto w = doc.words[i]; @@ -676,7 +692,7 @@ namespace tomoto } template - void initializeDocState(_DocType& doc, size_t docId, _Generator& g, _ModelState& ld, RandGen& rgs) const + void initializeDocState(_DocType& doc, size_t docId, _Generator& g, _ModelState& ld, _RandGen& rgs) const { std::vector tf(this->realV); static_cast(this)->prepareDoc(doc, docId, doc.words.size()); @@ -750,7 +766,7 @@ namespace tomoto numWorkers = std::min(numWorkers, this->maxThreads[(size_t)_ps]); ThreadPool pool{ numWorkers }; // temporary state variable - RandGen rgc{}; + _RandGen rgc{}; auto tmpState = this->globalState, tState = this->globalState; for (auto d = docFirst; d != docLast; ++d) { @@ -758,7 +774,7 @@ namespace tomoto } std::vector localData((m_flags & flags::shared_state) ? 
0 : pool.getNumWorkers(), tmpState); - std::vector rgs; + std::vector<_RandGen> rgs; for (size_t i = 0; i < pool.getNumWorkers(); ++i) rgs.emplace_back(rgc()); ExtraDocData edd; @@ -789,7 +805,7 @@ namespace tomoto const double gllRest = static_cast(this)->getLLRest(this->globalState); for (auto d = docFirst; d != docLast; ++d) { - RandGen rgc{}; + _RandGen rgc{}; auto tmpState = this->globalState; initializeDocState(*d, -1, generator, tmpState, rgc); for (size_t i = 0; i < maxIter; ++i) @@ -815,7 +831,7 @@ namespace tomoto { res.emplace_back(pool.enqueue([&, d](size_t threadId) { - RandGen rgc{}; + _RandGen rgc{}; auto tmpState = this->globalState; initializeDocState(*d, -1, generator, tmpState, rgc); for (size_t i = 0; i < maxIter; ++i) @@ -844,7 +860,7 @@ namespace tomoto DEFINE_TAGGED_SERIALIZER_WITH_VERSION(1, 0x00010001, vocabWeights, alpha, alphas, eta, K, etaByWord); - LDAModel(size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }) + LDAModel(size_t _K = 1, Float _alpha = 0.1, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_rg), K(_K), alpha(_alpha), eta(_eta) { if (_K == 0 || _K >= 0x80000000) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong K value (K = %zd)", _K)); @@ -1022,9 +1038,9 @@ namespace tomoto }; - template + template template - void DocumentLDA<_tw, _Flags>::update(WeightType* ptr, const _TopicModel& mdl) + void DocumentLDA<_tw>::update(WeightType* ptr, const _TopicModel& mdl) { numByTopic.init(ptr, mdl.getK()); for (size_t i = 0; i < Zs.size(); ++i) diff --git a/src/TopicModel/LLDA.h b/src/TopicModel/LLDA.h index 5e3860f..da090bb 100644 --- a/src/TopicModel/LLDA.h +++ b/src/TopicModel/LLDA.h @@ -19,7 +19,9 @@ namespace tomoto { public: using DefaultDocType = DocumentLLDA; - static ILLDAModel* create(TermWeight _weight, size_t _K = 1, Float alpha = 0.1, Float eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); + static 
ILLDAModel* create(TermWeight _weight, size_t _K = 1, + Float alpha = 0.1, Float eta = 0.01, size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t addDoc(const std::vector& words, const std::vector& label) = 0; virtual std::unique_ptr makeDoc(const std::vector& words, const std::vector& label) const = 0; diff --git a/src/TopicModel/LLDAModel.cpp b/src/TopicModel/LLDAModel.cpp index 05371cf..3e65036 100644 --- a/src/TopicModel/LLDAModel.cpp +++ b/src/TopicModel/LLDAModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class LLDAModel; + /*template class LLDAModel; template class LLDAModel; - template class LLDAModel; + template class LLDAModel;*/ - ILLDAModel* ILLDAModel::create(TermWeight _weight, size_t _K, Float _alpha, Float _eta, const RandGen& _rg) + ILLDAModel* ILLDAModel::create(TermWeight _weight, size_t _K, Float _alpha, Float _eta, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, LLDAModel, _K, _alpha, _eta, _rg); + TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, _K, _alpha, _eta, seed); } } \ No newline at end of file diff --git a/src/TopicModel/LLDAModel.hpp b/src/TopicModel/LLDAModel.hpp index b550344..9e7bb0f 100644 --- a/src/TopicModel/LLDAModel.hpp +++ b/src/TopicModel/LLDAModel.hpp @@ -10,18 +10,18 @@ Implementation of Labeled LDA using Gibbs sampling by bab2min namespace tomoto { - template, typename _ModelState = ModelStateLDA<_tw>> - class LLDAModel : public LDAModel<_tw, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, - typename std::conditional::value, LLDAModel<_tw>, _Derived>::type, + class LLDAModel : public LDAModel<_tw, _RandGen, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, + typename std::conditional::value, LLDAModel<_tw, _RandGen>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, LLDAModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, flags::generator_by_doc | 
flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, LLDAModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -81,7 +81,7 @@ namespace tomoto } template - void updateStateWithDoc(Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { auto& z = doc.Zs[i]; auto w = doc.words[i]; @@ -102,7 +102,7 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict); - LLDAModel(size_t _K = 1, Float _alpha = 1.0, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }) + LLDAModel(size_t _K = 1, Float _alpha = 1.0, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_K, _alpha, _eta, _rg) { } diff --git a/src/TopicModel/MGLDA.h b/src/TopicModel/MGLDA.h index 70f8c99..4a9838d 100644 --- a/src/TopicModel/MGLDA.h +++ b/src/TopicModel/MGLDA.h @@ -34,7 +34,8 @@ namespace tomoto using DefaultDocType = DocumentMGLDA; static IMGLDAModel* create(TermWeight _weight, size_t _KG = 1, size_t _KL = 1, size_t _T = 3, Float _alphaG = 0.1, Float _alphaL = 0.1, Float _alphaMG = 0.1, Float _alphaML = 0.1, - Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, const RandGen& _rg = RandGen{ std::random_device{}() }); + Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t addDoc(const std::vector& words, const std::string& delimiter) = 0; virtual std::unique_ptr makeDoc(const 
std::vector& words, const std::string& delimiter) const = 0; diff --git a/src/TopicModel/MGLDAModel.cpp b/src/TopicModel/MGLDAModel.cpp index 055cb8a..cf6923d 100644 --- a/src/TopicModel/MGLDAModel.cpp +++ b/src/TopicModel/MGLDAModel.cpp @@ -2,16 +2,16 @@ namespace tomoto { - template class MGLDAModel; + /*template class MGLDAModel; template class MGLDAModel; - template class MGLDAModel; + template class MGLDAModel;*/ IMGLDAModel* IMGLDAModel::create(TermWeight _weight, size_t _KG, size_t _KL, size_t _T, Float _alphaG, Float _alphaL, Float _alphaMG, Float _alphaML, - Float _etaG, Float _etaL, Float _gamma, const RandGen& _rg) + Float _etaG, Float _etaL, Float _gamma, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, MGLDAModel, _KG, _KL, _T, + TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, _KG, _KL, _T, _alphaG, _alphaL, _alphaMG, _alphaML, - _etaG, _etaL, _gamma, _rg); + _etaG, _etaL, _gamma, seed); } } \ No newline at end of file diff --git a/src/TopicModel/MGLDAModel.hpp b/src/TopicModel/MGLDAModel.hpp index bbdbc20..f9df8d1 100644 --- a/src/TopicModel/MGLDAModel.hpp +++ b/src/TopicModel/MGLDAModel.hpp @@ -11,18 +11,18 @@ Improved version of java implementation(https://github.com/yinfeiy/MG-LDA) namespace tomoto { - template, typename _ModelState = ModelStateLDA<_tw>> - class MGLDAModel : public LDAModel<_tw, flags::partitioned_multisampling, _Interface, - typename std::conditional::value, MGLDAModel<_tw>, _Derived>::type, + class MGLDAModel : public LDAModel<_tw, _RandGen, flags::partitioned_multisampling, _Interface, + typename std::conditional::value, MGLDAModel<_tw, _RandGen>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, MGLDAModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, MGLDAModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = 
LDAModel<_tw, _RandGen, flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -98,7 +98,7 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) @@ -329,7 +329,7 @@ namespace tomoto } template - void updateStateWithDoc(Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { doc.numBySent[doc.sents[i]] += _tw == TermWeight::one ? 1 : doc.wordWeights[i]; auto w = doc.words[i]; @@ -361,7 +361,7 @@ namespace tomoto MGLDAModel(size_t _KG = 1, size_t _KL = 1, size_t _T = 3, Float _alphaG = 0.1, Float _alphaL = 0.1, Float _alphaMG = 0.1, Float _alphaML = 0.1, - Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, const RandGen& _rg = RandGen{ std::random_device{}() }) + Float _etaG = 0.01, Float _etaL = 0.01, Float _gamma = 0.1, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_KG, _alphaG, _etaG, _rg), KL(_KL), T(_T), alphaL(_alphaL), alphaM(_KG ? 
_alphaMG : 0), alphaML(_alphaML), etaL(_etaL), gamma(_gamma) diff --git a/src/TopicModel/PA.h b/src/TopicModel/PA.h index 0b23651..aac0b89 100644 --- a/src/TopicModel/PA.h +++ b/src/TopicModel/PA.h @@ -23,7 +23,9 @@ namespace tomoto { public: using DefaultDocType = DocumentPA; - static IPAModel* create(TermWeight _weight, size_t _K1 = 1, size_t _K2 = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); + static IPAModel* create(TermWeight _weight, size_t _K1 = 1, size_t _K2 = 1, + Float _alpha = 0.1, Float _eta = 0.01, size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t getDirichletEstIteration() const = 0; virtual void setDirichletEstIteration(size_t iter) = 0; diff --git a/src/TopicModel/PAModel.cpp b/src/TopicModel/PAModel.cpp index 16f874b..9456ab8 100644 --- a/src/TopicModel/PAModel.cpp +++ b/src/TopicModel/PAModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class PAModel; + /*template class PAModel; template class PAModel; - template class PAModel; + template class PAModel;*/ - IPAModel* IPAModel::create(TermWeight _weight, size_t _K, size_t _K2, Float _alpha, Float _eta, const RandGen& _rg) + IPAModel* IPAModel::create(TermWeight _weight, size_t _K, size_t _K2, Float _alpha, Float _eta, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, PAModel, _K, _K2, _alpha, _eta, _rg); + TMT_SWITCH_TW(_weight, scalarRng, PAModel, _K, _K2, _alpha, _eta, seed); } } diff --git a/src/TopicModel/PAModel.hpp b/src/TopicModel/PAModel.hpp index 1225265..81f3728 100644 --- a/src/TopicModel/PAModel.hpp +++ b/src/TopicModel/PAModel.hpp @@ -21,18 +21,18 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2); }; - template, typename _ModelState = ModelStatePA<_tw>> - class PAModel : public LDAModel<_tw, 0, _Interface, - typename std::conditional::value, PAModel<_tw>, _Derived>::type, + class PAModel : public LDAModel<_tw, _RandGen, 0, _Interface, + typename 
std::conditional::value, PAModel<_tw, _RandGen>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, PAModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, 0, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, PAModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, 0, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -43,7 +43,7 @@ namespace tomoto Eigen::Matrix subAlphaSum; // len = K Eigen::Matrix subAlphas; // len = K * K2 - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { const auto K = this->K; std::vector> res; @@ -106,7 +106,7 @@ namespace tomoto } template - void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const + void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { size_t b = 0, e = doc.words.size(); if (_ps == ParallelScheme::partition) @@ -155,7 +155,7 @@ namespace tomoto } template - void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const + void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, _RandGen*, const _ExtraDocData& edd) const { std::vector> res; @@ -322,7 +322,7 @@ namespace tomoto } template - void updateStateWithDoc(Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { auto w = doc.words[i]; doc.Zs[i] 
= g.theta(rgs); @@ -342,7 +342,7 @@ namespace tomoto DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum); - PAModel(size_t _K1 = 1, size_t _K2 = 1, Float _alpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }) + PAModel(size_t _K1 = 1, size_t _K2 = 1, Float _alpha = 0.1, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_K1, _alpha, _eta, _rg), K2(_K2) { if (_K2 == 0 || _K2 >= 0x80000000) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong K2 value (K2 = %zd)", _K2)); diff --git a/src/TopicModel/PLDA.h b/src/TopicModel/PLDA.h index 40d61cc..5b9b80a 100644 --- a/src/TopicModel/PLDA.h +++ b/src/TopicModel/PLDA.h @@ -9,7 +9,8 @@ namespace tomoto public: using DefaultDocType = DocumentLLDA; static IPLDAModel* create(TermWeight _weight, size_t _numLatentTopics = 0, size_t _numTopicsPerLabel = 1, - Float alpha = 0.1, Float eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); + Float alpha = 0.1, Float eta = 0.01, size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t getNumLatentTopics() const = 0; }; diff --git a/src/TopicModel/PLDAModel.cpp b/src/TopicModel/PLDAModel.cpp index 1c7cf6f..1072221 100644 --- a/src/TopicModel/PLDAModel.cpp +++ b/src/TopicModel/PLDAModel.cpp @@ -2,12 +2,12 @@ namespace tomoto { - template class PLDAModel; + /*template class PLDAModel; template class PLDAModel; - template class PLDAModel; + template class PLDAModel;*/ - IPLDAModel* IPLDAModel::create(TermWeight _weight, size_t _numLatentTopics, size_t _numTopicsPerLabel, Float _alpha, Float _eta, const RandGen& _rg) + IPLDAModel* IPLDAModel::create(TermWeight _weight, size_t _numLatentTopics, size_t _numTopicsPerLabel, Float _alpha, Float _eta, size_t seed, bool scalarRng) { - SWITCH_TW(_weight, PLDAModel, _numLatentTopics, 
_numTopicsPerLabel, _alpha, _eta, _rg); + TMT_SWITCH_TW(_weight, scalarRng, PLDAModel, _numLatentTopics, _numTopicsPerLabel, _alpha, _eta, seed); } } diff --git a/src/TopicModel/PLDAModel.hpp b/src/TopicModel/PLDAModel.hpp index c4297b6..893ae14 100644 --- a/src/TopicModel/PLDAModel.hpp +++ b/src/TopicModel/PLDAModel.hpp @@ -10,18 +10,18 @@ Implementation of Labeled LDA using Gibbs sampling by bab2min namespace tomoto { - template, typename _ModelState = ModelStateLDA<_tw>> - class PLDAModel : public LDAModel<_tw, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, - typename std::conditional::value, PLDAModel<_tw>, _Derived>::type, + class PLDAModel : public LDAModel<_tw, _RandGen, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, + typename std::conditional::value, PLDAModel<_tw, _RandGen>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, PLDAModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, PLDAModel<_tw, _RandGen>, _Derived>::type; + using BaseClass = LDAModel<_tw, _RandGen, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -86,7 +86,7 @@ namespace tomoto } template - void updateStateWithDoc(Generator& g, _ModelState& ld, RandGen& rgs, _DocType& doc, size_t i) const + void updateStateWithDoc(Generator& g, _ModelState& ld, _RandGen& rgs, _DocType& doc, size_t i) const { auto& z = doc.Zs[i]; auto w = doc.words[i]; @@ -108,7 +108,7 @@ namespace tomoto DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict, numLatentTopics, numTopicsPerLabel); PLDAModel(size_t _numLatentTopics = 0, 
size_t _numTopicsPerLabel = 1, - Float _alpha = 1.0, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }) + Float _alpha = 1.0, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(1, _alpha, _eta, _rg), numLatentTopics(_numLatentTopics), numTopicsPerLabel(_numTopicsPerLabel) { diff --git a/src/TopicModel/SLDA.h b/src/TopicModel/SLDA.h index a203ed2..7b8d779 100644 --- a/src/TopicModel/SLDA.h +++ b/src/TopicModel/SLDA.h @@ -3,11 +3,11 @@ namespace tomoto { - template - struct DocumentSLDA : public DocumentLDA<_tw, _Flags> + template + struct DocumentSLDA : public DocumentLDA<_tw> { - using BaseDocument = DocumentLDA<_tw, _Flags>; - using DocumentLDA<_tw, _Flags>::DocumentLDA; + using BaseDocument = DocumentLDA<_tw>; + using DocumentLDA<_tw>::DocumentLDA; std::vector y; DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, y); DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, y); @@ -28,7 +28,8 @@ namespace tomoto Float alpha = 0.1, Float _eta = 0.01, const std::vector& _mu = {}, const std::vector& _nuSq = {}, const std::vector& _glmParam = {}, - const RandGen& _rg = RandGen{ std::random_device{}() }); + size_t seed = std::random_device{}(), + bool scalarRng = false); virtual size_t addDoc(const std::vector& words, const std::vector& y) = 0; virtual std::unique_ptr makeDoc(const std::vector& words, const std::vector& y) const = 0; diff --git a/src/TopicModel/SLDAModel.cpp b/src/TopicModel/SLDAModel.cpp index 5809e81..0f7f9fa 100644 --- a/src/TopicModel/SLDAModel.cpp +++ b/src/TopicModel/SLDAModel.cpp @@ -2,16 +2,16 @@ namespace tomoto { - template class SLDAModel; + /*template class SLDAModel; template class SLDAModel; - template class SLDAModel; + template class SLDAModel;*/ ISLDAModel* ISLDAModel::create(TermWeight _weight, size_t _K, const std::vector& vars, Float _alpha, Float _eta, const std::vector& _mu, const std::vector& _nuSq, const std::vector& _glmParam, - const 
RandGen& _rg) + size_t seed, bool scalarRng) { - SWITCH_TW(_weight, SLDAModel, _K, vars, _alpha, _eta, _mu, _nuSq, _glmParam, _rg); + TMT_SWITCH_TW(_weight, scalarRng, SLDAModel, _K, vars, _alpha, _eta, _mu, _nuSq, _glmParam, seed); } } \ No newline at end of file diff --git a/src/TopicModel/SLDAModel.hpp b/src/TopicModel/SLDAModel.hpp index f756348..119e25b 100644 --- a/src/TopicModel/SLDAModel.hpp +++ b/src/TopicModel/SLDAModel.hpp @@ -175,18 +175,19 @@ namespace tomoto }; } - template, typename _ModelState = ModelStateLDA<_tw>> - class SLDAModel : public LDAModel<_tw, _Flags, _Interface, - typename std::conditional::value, SLDAModel<_tw, _Flags>, _Derived>::type, + class SLDAModel : public LDAModel<_tw, _RandGen, _Flags, _Interface, + typename std::conditional::value, SLDAModel<_tw, _RandGen, _Flags>, _Derived>::type, _DocType, _ModelState> { protected: - using DerivedClass = typename std::conditional::value, SLDAModel<_tw>, _Derived>::type; - using BaseClass = LDAModel<_tw, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; + using DerivedClass = typename std::conditional::value, SLDAModel<_tw, _RandGen, _Flags>, _Derived>::type; /* must name the same specialization as the base-class argument above, otherwise BaseClass is not the real base when _Flags is non-default */ + using BaseClass = LDAModel<_tw, _RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -239,7 +240,7 @@ namespace tomoto } } - void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs) + void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { BaseClass::optimizeParameters(pool, localData, rgs); } @@ -325,7 +326,7 @@ namespace tomoto Float _alpha = 0.1, Float _eta = 0.01, const std::vector& _mu = {}, const std::vector& _nuSq = {}, const std::vector& _glmParam = {}, - const RandGen& _rg = RandGen{ std::random_device{}() }) + const _RandGen& _rg = _RandGen{ std::random_device{}() }) : BaseClass(_K, _alpha, _eta, _rg), F(vars.size()), varTypes(vars), glmParam(_glmParam)
{ diff --git a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index 736d106..1a7f291 100644 --- a/src/TopicModel/TopicModel.hpp +++ b/src/TopicModel/TopicModel.hpp @@ -1,4 +1,5 @@ #pragma once +#include #include #include "../Utils/Utils.hpp" #include "../Utils/Dictionary.h" @@ -6,25 +7,12 @@ #include "../Utils/ThreadPool.hpp" #include "../Utils/serializer.hpp" #include "../Utils/exception.h" - +#include namespace tomoto { -#if _WIN32 || _WIN64 -#if _WIN64 - typedef std::mt19937_64 RandGen; -#else - typedef std::mt19937 RandGen; -#endif -#endif - -#if __GNUC__ -#if __x86_64__ || __ppc64__ - typedef std::mt19937_64 RandGen; -#else - typedef std::mt19937 RandGen; -#endif -#endif + using RandGen = Eigen::Rand::P8_mt19937_64; + using ScalarRandGen = Eigen::Rand::UniversalRandomEngine; class DocumentBase { @@ -189,14 +177,16 @@ namespace tomoto }; } - template + template class TopicModel : public _Interface { friend class Document; public: using DocType = _DocType; protected: - RandGen rg; + _RandGen rg; std::vector words; std::vector wOffsetByDoc; @@ -423,13 +413,13 @@ namespace tomoto } } - int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, RandGen* rgs) + int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs) { throw e; } public: - TopicModel(const RandGen& _rg) : rg(_rg) + TopicModel(const _RandGen& _rg) : rg(_rg) { } @@ -494,10 +484,10 @@ namespace tomoto } std::vector<_ModelState> localData; - std::vector localRG; + std::vector<_RandGen> localRG; for (size_t i = 0; i < numWorkers; ++i) { - localRG.emplace_back(RandGen{rg()}); + localRG.emplace_back(_RandGen{rg()}); if(ps == ParallelScheme::copy_merge) localData.emplace_back(static_cast<_Derived*>(this)->globalState); } diff --git a/src/Utils/Dictionary.h b/src/Utils/Dictionary.h index 66c9565..15584f0 100644 --- a/src/Utils/Dictionary.h +++ b/src/Utils/Dictionary.h @@ -27,7 +27,7 
@@ namespace tomoto { dict.emplace(std::make_pair(word, dict.size())); id2word.emplace_back(word); - return dict.size() - 1; + return (Vid)(dict.size() - 1); } return it->second; } diff --git a/src/Utils/EigenAddonOps.hpp b/src/Utils/EigenAddonOps.hpp index 0f1a585..bdeb1a8 100644 --- a/src/Utils/EigenAddonOps.hpp +++ b/src/Utils/EigenAddonOps.hpp @@ -12,33 +12,9 @@ namespace Eigen { typedef PacketType type; }; - - inline float bit_to_ur(uint32_t x) - { - union - { - float f; - uint32_t u; - }; - - u = x; - u = (127 << 23) | (u & 0x7FFFFF); - return f - 1; - } - - template - struct box_muller - { - }; } } -#ifdef __GNUC__ -#if __GNUC__ < 8 -#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1) -#endif -#endif - #ifdef EIGEN_VECTORIZE_AVX #include #include "avx_gamma.h" @@ -57,70 +33,6 @@ namespace Eigen return _mm256_cvtepi32_ps(a); } -#ifdef EIGEN_VECTORIZE_AVX2 - inline Packet8f bit_to_ur(const Packet8i& x) - { - const __m256i lower = _mm256_set1_epi32(0x7FFFFF), - upper = _mm256_set1_epi32(127 << 23); - const __m256 one = _mm256_set1_ps(1); - union - { - __m256 f; - __m256i u; - }; - u = _mm256_or_si256(_mm256_and_si256(x, lower), upper); - return _mm256_sub_ps(f, one); - } -#else - inline Packet8f bit_to_ur(const Packet8i& x) - { - const __m128i lower = _mm_set1_epi32(0x7FFFFF), - upper = _mm_set1_epi32(127 << 23); - const __m256 one = _mm256_set1_ps(1); - union - { - __m256 f; - __m256i u; - }; - u = _mm256_set_m128i( - _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(x, 1), lower), upper), - _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(x, 0), lower), upper) - ); - return _mm256_sub_ps(f, one); - } -#endif - - template - struct box_muller - { - Packet8f operator()(Rng&& rng, Packet8f& cache) - { - __m256 u1, u2; - if (sizeof(decltype(rng())) == 8) - { - u1 = bit_to_ur(_mm256_set_epi64x(rng(), rng(), rng(), rng())); - u2 = bit_to_ur(_mm256_set_epi64x(rng(), rng(), rng(), rng())); - } - else - { - u1 = 
bit_to_ur(_mm256_set_epi32(rng(), rng(), rng(), rng(), rng(), rng(), rng(), rng())); - u2 = bit_to_ur(_mm256_set_epi32(rng(), rng(), rng(), rng(), rng(), rng(), rng(), rng())); - } - - const __m256 twopi = _mm256_set1_ps(2.0f * 3.14159265358979323846f); - const __m256 one = _mm256_set1_ps(1.0f); - const __m256 minustwo = _mm256_set1_ps(-2.0f); - - u1 = _mm256_sub_ps(one, u1); - - __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(minustwo, log_ps(u1))); - __m256 theta = _mm256_mul_ps(twopi, u2); - __m256 sintheta, costheta; - sincos_ps(theta, &sintheta, &costheta); - cache = _mm256_mul_ps(radius, costheta); - return _mm256_mul_ps(radius, sintheta); - } - }; } } @@ -142,52 +54,6 @@ namespace Eigen return _mm_cvtepi32_ps(a); } - inline Packet4f bit_to_ur(const Packet4i& x) - { - const __m128i lower = _mm_set1_epi32(0x7FFFFF), - upper = _mm_set1_epi32(127 << 23); - const __m128 one = _mm_set1_ps(1); - union - { - __m128 f; - __m128i u; - }; - u = _mm_or_si128(_mm_and_si128(x, lower), upper); - return _mm_sub_ps(f, one); - } - - template - struct box_muller - { - Packet4f operator()(Rng&& rng, Packet4f& cache) - { - __m128 u1, u2; - if (sizeof(decltype(rng())) == 8) - { - u1 = bit_to_ur(_mm_set_epi64x(rng(), rng())); - u2 = bit_to_ur(_mm_set_epi64x(rng(), rng())); - } - else - { - u1 = bit_to_ur(_mm_set_epi32(rng(), rng(), rng(), rng())); - u2 = bit_to_ur(_mm_set_epi32(rng(), rng(), rng(), rng())); - } - - const __m128 twopi = _mm_set1_ps(2.0f * 3.14159265358979323846f); - const __m128 one = _mm_set1_ps(1.0f); - const __m128 minustwo = _mm_set1_ps(-2.0f); - - u1 = _mm_sub_ps(one, u1); - - __m128 radius = _mm_sqrt_ps(_mm_mul_ps(minustwo, log_ps(u1))); - __m128 theta = _mm_mul_ps(twopi, u2); - __m128 sintheta, costheta; - sincos_ps(theta, &sintheta, &costheta); - cache = _mm_mul_ps(radius, costheta); - return _mm_mul_ps(radius, sintheta); - } - }; - } } #endif @@ -290,68 +156,6 @@ namespace Eigen evaluator m_argImpl; }; - template struct scalar_norm_dist_op { - Rng rng; - - 
scalar_norm_dist_op(const Rng& _rng) : rng{ _rng } - { - } - - scalar_norm_dist_op(const scalar_norm_dist_op& o) - : rng{ o.rng } - { - } - - scalar_norm_dist_op(scalar_norm_dist_op&& o) - : rng{ std::move(o.rng) } - { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const - { - thread_local Scalar cache; - thread_local bool valid = false; - if (valid) - { - valid = false; - return cache; - } - - Scalar v1, v2, sx; - while(1) - { - v1 = 2 * bit_to_ur(rng()) - 1; - v2 = 2 * bit_to_ur(rng()) - 1; - sx = v1 * v1 + v2 * v2; - if (sx && sx < 1) break; - } - Scalar fx = std::sqrt((Scalar)-2.0 * std::log(sx) / sx); - cache = fx * v2; - valid = true; - return fx * v1; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp() const - { - thread_local Packet cache; - thread_local bool valid = false; - if (valid) - { - valid = false; - return cache; - } - valid = true; - return box_muller{}(rng, cache); - } - }; - - template - struct functor_traits > - { - enum { Cost = HugeCost, PacketAccess = packet_traits::Vectorizable, IsRepeatable = false }; - }; - } template EIGEN_DEVICE_FUNC inline @@ -374,13 +178,4 @@ namespace Eigen y.derived() ); } - - template - inline const CwiseNullaryOp, const Derived> - norm_dist(Index rows, Index cols, Urng&& urng) - { - return CwiseNullaryOp, const Derived>( - rows, cols, internal::scalar_norm_dist_op(std::forward(urng)) - ); - } } diff --git a/src/Utils/LUT.hpp b/src/Utils/LUT.hpp index 9f047dd..a8e45f8 100644 --- a/src/Utils/LUT.hpp +++ b/src/Utils/LUT.hpp @@ -11,16 +11,16 @@ namespace tomoto class LUT3 { protected: - std::array<_Prec, N + M + L> points; - static constexpr auto P = 1.0 / S; - static constexpr auto Q = 1.0 / T; - static constexpr auto R = 1.0 / U; + std::array<_Prec, N + M + L> points = {}; + static constexpr _Prec P = (_Prec)(1. / S); + static constexpr _Prec Q = (_Prec)(1. / T); + static constexpr _Prec R = (_Prec)(1. 
/ U); LUT3() { _Func fun; for (size_t i = 0; i < N; i++) { - points[i] = fun(i ? i * P : 0.0001); + points[i] = fun(i ? i * P : (_Prec)0.0001); } for (size_t i = 0; i < M; i++) { diff --git a/src/Utils/Utils.hpp b/src/Utils/Utils.hpp index a62a7dd..f7cdbc8 100644 --- a/src/Utils/Utils.hpp +++ b/src/Utils/Utils.hpp @@ -149,7 +149,7 @@ namespace tomoto pv[i] = std::make_pair(src[i], i); } - std::sort(pv.begin(), pv.end(), [&cmp](const voPair_t& a, const voPair_t& b) + std::stable_sort(pv.begin(), pv.end(), [&cmp](const voPair_t& a, const voPair_t& b) { return cmp(a.first, b.first); }); diff --git a/src/Utils/avx_mathfun.h b/src/Utils/avx_mathfun.h index 185a9fa..97daa7a 100644 --- a/src/Utils/avx_mathfun.h +++ b/src/Utils/avx_mathfun.h @@ -80,18 +80,18 @@ _PI32_CONST256(2, 2); _PI32_CONST256(4, 4); _PI32_CONST256(0x7f, 0x7f); -_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); -_PS256_CONST(cephes_log_p0, 7.0376836292E-2); -_PS256_CONST(cephes_log_p1, -1.1514610310E-1); -_PS256_CONST(cephes_log_p2, 1.1676998740E-1); -_PS256_CONST(cephes_log_p3, -1.2420140846E-1); -_PS256_CONST(cephes_log_p4, +1.4249322787E-1); -_PS256_CONST(cephes_log_p5, -1.6668057665E-1); -_PS256_CONST(cephes_log_p6, +2.0000714765E-1); -_PS256_CONST(cephes_log_p7, -2.4999993993E-1); -_PS256_CONST(cephes_log_p8, +3.3333331174E-1); -_PS256_CONST(cephes_log_q1, -2.12194440e-4); -_PS256_CONST(cephes_log_q2, 0.693359375); +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2f); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1f); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1f); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1f); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1f); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1f); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1f); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1f); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1f); +_PS256_CONST(cephes_log_q1, -2.12194440e-4f); +_PS256_CONST(cephes_log_q2, 0.693359375f); #ifndef 
__AVX2__ @@ -252,16 +252,16 @@ inline v8sf log_ps(v8sf x) { _PS256_CONST(exp_hi, 88.3762626647949f); _PS256_CONST(exp_lo, -88.3762626647949f); -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341f); +_PS256_CONST(cephes_exp_C1, 0.693359375f); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4f); -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4f); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3f); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3f); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2f); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1f); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1f); inline v8sf exp_ps(v8sf x) { v8sf tmp = _mm256_setzero_ps(), fx; @@ -319,16 +319,16 @@ inline v8sf exp_ps(v8sf x) { return y; } -_PS256_CONST(minus_cephes_DP1, -0.78515625); -_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS256_CONST(sincof_p0, -1.9515295891E-4); -_PS256_CONST(sincof_p1, 8.3321608736E-3); -_PS256_CONST(sincof_p2, -1.6666654611E-1); -_PS256_CONST(coscof_p0, 2.443315711809948E-005); -_PS256_CONST(coscof_p1, -1.388731625493765E-003); -_PS256_CONST(coscof_p2, 4.166664568298827E-002); -_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI +_PS256_CONST(minus_cephes_DP1, -0.78515625f); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); +_PS256_CONST(sincof_p0, -1.9515295891E-4f); +_PS256_CONST(sincof_p1, 8.3321608736E-3f); +_PS256_CONST(sincof_p2, -1.6666654611E-1f); +_PS256_CONST(coscof_p0, 2.443315711809948E-005f); 
+_PS256_CONST(coscof_p1, -1.388731625493765E-003f); +_PS256_CONST(coscof_p2, 4.166664568298827E-002f); +_PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI /* evaluation of 8 sines at onces using AVX intrisics diff --git a/src/Utils/math.h b/src/Utils/math.h index 01f19b9..a1aadb5 100644 --- a/src/Utils/math.h +++ b/src/Utils/math.h @@ -94,15 +94,15 @@ namespace tomoto .424592039e-7f, -.113691296e-7f, .304502217e-8f, -.815684550e-9f, }; - float Tn_1 = 1.0L; - float Tn = x - 2.0L; + float Tn_1 = (float)1.0; + float Tn = (float)(x - 2.0); float resul = Kncoe[0] + Kncoe[1] * Tn; x -= 2.0L; for (size_t n = 2; n < sizeof(Kncoe) / sizeof(float); n++) { - const float Tn1 = 2.0L * x * Tn - Tn_1; + const float Tn1 = (float)(2.0L * x * Tn - Tn_1); resul += Kncoe[n] * Tn1; Tn_1 = Tn; Tn = Tn1; @@ -119,7 +119,7 @@ namespace tomoto struct F_lgamma { float operator()(float x) { return lgamma(x); } - static constexpr float smallThreshold = 0.001; + static constexpr float smallThreshold = (float)(0.001); float forSmall(float x) { if (x == 0) return INFINITY; @@ -137,7 +137,7 @@ namespace tomoto { return digamma(x); } - static constexpr float smallThreshold = 0.001; + static constexpr float smallThreshold = (float)(0.001); float forSmall(float x) { if (x == 0) return -INFINITY; diff --git a/src/Utils/sample.hpp b/src/Utils/sample.hpp index 416dea5..f099997 100644 --- a/src/Utils/sample.hpp +++ b/src/Utils/sample.hpp @@ -3,7 +3,7 @@ #include #ifdef __AVX__ #include -#elif defined(__SSE2__) +#elif defined(__SSE2__) || defined(_WIN64) #include #else @@ -56,50 +56,7 @@ namespace tomoto #endif -#ifdef __AVX__ - inline __m256 scan_AVX(__m256 x) - { - __m256 t0, t1; - //shift1_AVX + add - t0 = _mm256_permute_ps(x, _MM_SHUFFLE(2, 1, 0, 3)); - t1 = _mm256_permute2f128_ps(t0, t0, 41); - x = _mm256_add_ps(x, _mm256_blend_ps(t0, t1, 0x11)); - //shift2_AVX + add - t0 = _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); - t1 = _mm256_permute2f128_ps(t0, t0, 41); - x = _mm256_add_ps(x, 
_mm256_blend_ps(t0, t1, 0x33)); - //shift3_AVX + add - x = _mm256_add_ps(x, _mm256_permute2f128_ps(x, x, 41)); - return x; - } - - inline void prefix_sum_AVX(float *a, const int n) - { - __m256 offset = _mm256_setzero_ps(); - for (int i = 0; i < n; i += 8) - { - __m256 x = _mm256_loadu_ps(&a[i]); - __m256 out = scan_AVX(x); - out = _mm256_add_ps(out, offset); - _mm256_storeu_ps(&a[i], out); - //broadcast last element - __m256 t0 = _mm256_permute2f128_ps(out, out, 0x11); - offset = _mm256_permute_ps(t0, 0xff); - } - } - - inline void prefixSum(float* arr, size_t K) - { - size_t Kf = (K >> 3) << 3; - if (Kf) prefix_sum_AVX(arr, Kf); - else Kf = 1; - for (size_t i = Kf; i < K; ++i) - { - arr[i] += arr[i - 1]; - } - } - -#elif defined(__SSE2__) +#if defined(__SSE2__) || defined(_WIN64) inline __m128 scan_SSE(__m128 x) { x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4))); @@ -107,60 +64,60 @@ namespace tomoto return x; } - inline void prefix_sum_SSE(float *a, const int n) + inline void prefixSum(float* arr, int n) { + int n4 = n & ~3; __m128 offset = _mm_setzero_ps(); - for (int i = 0; i < n; i += 4) + for (int i = 0; i < n4; i += 4) { - __m128 x = _mm_load_ps(&a[i]); + __m128 x = _mm_load_ps(&arr[i]); __m128 out = scan_SSE(x); out = _mm_add_ps(out, offset); - _mm_store_ps(&a[i], out); + _mm_store_ps(&arr[i], out); offset = _mm_shuffle_ps(out, out, _MM_SHUFFLE(3, 3, 3, 3)); } - } - - inline void prefixSum(float* arr, size_t K) - { - size_t Kf = (K >> 2) << 2; - if (Kf) prefix_sum_SSE(arr, Kf); - else Kf = 1; - for (size_t i = Kf; i < K; ++i) + if (!n4) n4 = 1; + for (int i = n4; i < n; ++i) /* signed index: a size_t index made a negative n compare as a huge bound */ { arr[i] += arr[i - 1]; } } #else - inline void prefixSum(float* arr, size_t K) + inline void prefixSum(float* arr, int n) { - for (size_t i = 1; i < K; ++i) + int n4 = n & ~3; + float acc = 0; + for (int i = 0; i < n4; i += 4) { - arr[i] += arr[i - 1]; + // first accumulation + arr[i + 3] += arr[i + 2]; + arr[i + 2] += arr[i + 1]; + arr[i + 1] += arr[i];
+ + // second accumulation + arr[i + 3] += arr[i + 1]; + arr[i + 2] += arr[i]; + + // accumulate offset + arr[i] += acc; + arr[i + 1] += acc; + arr[i + 2] += acc; + arr[i + 3] += acc; + + acc = arr[i + 3]; } - } -#endif - struct FastRealGenerator - { - template - float operator()(Random& rg) + + for (int i = n4; i < n; ++i) { - union - { - float f; - uint32_t u; - }; - - u = rg(); - u = (127 << 23) | (u & 0x7FFFFF); - return f - 1; + acc = (arr[i] += acc); /* carry the running sum through the tail; without this, elements past the last full group of 4 miss their predecessors and n < 4 is a no-op */ } - }; + } +#endif template inline size_t sampleFromDiscrete(RealIt begin, RealIt end, Random& rg) { - FastRealGenerator dist; - auto r = dist(rg) * std::accumulate(begin, end, 0.f); + auto r = rg.uniform_real() * std::accumulate(begin, end, 0.f); size_t K = std::distance(begin, end); size_t z = 0; for (; r > *begin && z < K - 1; ++z, ++begin)
(uint32_t)v.rows()); + writeToStream(ostr, (uint32_t)v.cols()); if (!ostr.write((const char*)v.data(), sizeof(_Ty) * v.size())) throw std::ios_base::failure( std::string("writing type '") + typeid(_Ty).name() + std::string("' is failed") ); } @@ -344,7 +344,7 @@ namespace tomoto template inline void writeToBinStreamImpl(std::ostream& ostr, const std::vector<_Ty>& v) { - writeToStream(ostr, v.size()); + writeToStream(ostr, (uint32_t)v.size()); for (auto& e : v) writeToStream(ostr, e); } @@ -373,7 +373,7 @@ namespace tomoto template inline void writeToBinStreamImpl(std::ostream& ostr, const std::unordered_map<_KeyTy, _ValTy>& v) { - writeToStream(ostr, v.size()); + writeToStream(ostr, (uint32_t)v.size()); for (auto& e : v) writeToStream(ostr, e); } @@ -391,7 +391,7 @@ namespace tomoto template inline void writeToBinStreamImpl(std::ostream& ostr, const std::array<_Ty, _N>& v) { - writeToStream(ostr, v.size()); + writeToStream(ostr, (uint32_t)v.size()); for (auto& e : v) writeToStream(ostr, e); } diff --git a/src/Utils/sse_mathfun.h b/src/Utils/sse_mathfun.h index 824a79f..e47df7b 100644 --- a/src/Utils/sse_mathfun.h +++ b/src/Utils/sse_mathfun.h @@ -75,18 +75,18 @@ _PI32_CONST(2, 2); _PI32_CONST(4, 4); _PI32_CONST(0x7f, 0x7f); -_PS_CONST(cephes_SQRTHF, 0.707106781186547524); -_PS_CONST(cephes_log_p0, 7.0376836292E-2); -_PS_CONST(cephes_log_p1, -1.1514610310E-1); -_PS_CONST(cephes_log_p2, 1.1676998740E-1); -_PS_CONST(cephes_log_p3, -1.2420140846E-1); -_PS_CONST(cephes_log_p4, +1.4249322787E-1); -_PS_CONST(cephes_log_p5, -1.6668057665E-1); -_PS_CONST(cephes_log_p6, +2.0000714765E-1); -_PS_CONST(cephes_log_p7, -2.4999993993E-1); -_PS_CONST(cephes_log_p8, +3.3333331174E-1); -_PS_CONST(cephes_log_q1, -2.12194440e-4); -_PS_CONST(cephes_log_q2, 0.693359375); +_PS_CONST(cephes_SQRTHF, 0.707106781186547524f); +_PS_CONST(cephes_log_p0, 7.0376836292E-2f); +_PS_CONST(cephes_log_p1, -1.1514610310E-1f); +_PS_CONST(cephes_log_p2, 1.1676998740E-1f); +_PS_CONST(cephes_log_p3, 
-1.2420140846E-1f); +_PS_CONST(cephes_log_p4, +1.4249322787E-1f); +_PS_CONST(cephes_log_p5, -1.6668057665E-1f); +_PS_CONST(cephes_log_p6, +2.0000714765E-1f); +_PS_CONST(cephes_log_p7, -2.4999993993E-1f); +_PS_CONST(cephes_log_p8, +3.3333331174E-1f); +_PS_CONST(cephes_log_q1, -2.12194440e-4f); +_PS_CONST(cephes_log_q2, 0.693359375f); #ifndef __SSE2__ typedef union xmm_mm_union { @@ -200,16 +200,16 @@ inline v4sf log_ps(v4sf x) { _PS_CONST(exp_hi, 88.3762626647949f); _PS_CONST(exp_lo, -88.3762626647949f); -_PS_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS_CONST(cephes_exp_C1, 0.693359375); -_PS_CONST(cephes_exp_C2, -2.12194440e-4); +_PS_CONST(cephes_LOG2EF, 1.44269504088896341f); +_PS_CONST(cephes_exp_C1, 0.693359375f); +_PS_CONST(cephes_exp_C2, -2.12194440e-4f); -_PS_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS_CONST(cephes_exp_p5, 5.0000001201E-1); +_PS_CONST(cephes_exp_p0, 1.9875691500E-4f); +_PS_CONST(cephes_exp_p1, 1.3981999507E-3f); +_PS_CONST(cephes_exp_p2, 8.3334519073E-3f); +_PS_CONST(cephes_exp_p3, 4.1665795894E-2f); +_PS_CONST(cephes_exp_p4, 1.6666665459E-1f); +_PS_CONST(cephes_exp_p5, 5.0000001201E-1f); inline v4sf exp_ps(v4sf x) { v4sf tmp = _mm_setzero_ps(), fx; @@ -289,16 +289,16 @@ inline v4sf exp_ps(v4sf x) { return y; } -_PS_CONST(minus_cephes_DP1, -0.78515625); -_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS_CONST(sincof_p0, -1.9515295891E-4); -_PS_CONST(sincof_p1, 8.3321608736E-3); -_PS_CONST(sincof_p2, -1.6666654611E-1); -_PS_CONST(coscof_p0, 2.443315711809948E-005); -_PS_CONST(coscof_p1, -1.388731625493765E-003); -_PS_CONST(coscof_p2, 4.166664568298827E-002); -_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI +_PS_CONST(minus_cephes_DP1, -0.78515625f); +_PS_CONST(minus_cephes_DP2, 
-2.4187564849853515625e-4f); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); +_PS_CONST(sincof_p0, -1.9515295891E-4f); +_PS_CONST(sincof_p1, 8.3321608736E-3f); +_PS_CONST(sincof_p2, -1.6666654611E-1f); +_PS_CONST(coscof_p0, 2.443315711809948E-005f); +_PS_CONST(coscof_p1, -1.388731625493765E-003f); +_PS_CONST(coscof_p2, 4.166664568298827E-002f); +_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so diff --git a/src/python/PyUtils.h b/src/python/PyUtils.h index 0ce75fc..fe5d17f 100644 --- a/src/python/PyUtils.h +++ b/src/python/PyUtils.h @@ -9,7 +9,14 @@ #include #include +#ifdef _DEBUG +#undef _DEBUG #include +#define _DEBUG +#else +#include +#endif + #include #ifdef MAIN_MODULE #else @@ -33,7 +40,7 @@ namespace py UniqueObj(const UniqueObj&) = delete; UniqueObj& operator=(const UniqueObj&) = delete; - UniqueObj(UniqueObj&& o) + UniqueObj(UniqueObj&& o) noexcept { std::swap(obj, o.obj); } @@ -84,7 +91,7 @@ namespace py template<> inline float makeObjectToCType(PyObject *obj) { - float d = PyFloat_AsDouble(obj); + float d = (float)PyFloat_AsDouble(obj); if (d == -1 && PyErr_Occurred()) throw std::bad_exception{}; return d; } diff --git a/src/python/docs.h b/src/python/docs.h index 8f4dac1..5941afd 100644 --- a/src/python/docs.h +++ b/src/python/docs.h @@ -1986,13 +1986,15 @@ y : Iterable[float] )""); DOC_SIGNATURE_EN_KO(SLDA_get_regression_coef__doc__, - "get_regression_coef(self, var_id)", + "get_regression_coef(self, var_id=None)", u8R""(Return the regression coefficient of the response variable `var_id`. Parameters ---------- var_id : int indicating the reponse variable, in range [0, `f`) + + If omitted, the whole regression coefficients with shape `[f, k]` are returned. )"", u8R""(응답 변수 `var_id`의 회귀 계수를 반환합니다. @@ -2000,6 +2002,8 @@ Parameters ---------- var_id : int 응답 변수를 지정하는 [0, `f`) 범위의 정수 + + 생략시, `[f, k]` 모양의 전체 회귀 계수가 반환됩니다. 
)""); DOC_SIGNATURE_EN_KO(SLDA_get_var_type__doc__, @@ -2727,3 +2731,11 @@ DOC_VARIABLE_EN_KO(DT_lr_b__doc__, DOC_VARIABLE_EN_KO(DT_lr_c__doc__, u8R""(parameter `c` with range (0.5, 1] for SGLD step size (e_i = a * (b + i) ^ -c))"", u8R""(SGLD의 스텝 크기를 결정하는 (0.5, 1] 범위의 파라미터 `c` (e_i = a * (b + i) ^ -c))""); + +DOC_VARIABLE_EN_KO(DT_num_timepoints__doc__, + u8R""(the number of timepoints of the model (read-only))"", + u8R""(모델의 시점 개수 (읽기전용))""); + +DOC_VARIABLE_EN_KO(DT_num_docs_by_timepoint__doc__, + u8R""(the number of documents in the model by timepoint (read-only))"", + u8R""(각 시점별 모델 내 문헌 개수 (읽기전용))""); \ No newline at end of file diff --git a/src/python/py_CT.cpp b/src/python/py_CT.cpp index e266c97..cff53c7 100644 --- a/src/python/py_CT.cpp +++ b/src/python/py_CT.cpp @@ -9,11 +9,13 @@ static int CT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1; float alpha = 0.1, eta = 0.01; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "smoothing_alpha", "eta", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &K, &alpha, &eta, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "smoothing_alpha", "eta", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &K, &alpha, &eta, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,7 +23,22 @@ static int CT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." 
}; } - tomoto::ITopicModel* inst = tomoto::ICTModel::create((tomoto::TermWeight)tw, K, alpha, eta, tomoto::RandGen{ seed }); + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + + tomoto::ITopicModel* inst = tomoto::ICTModel::create((tomoto::TermWeight)tw, K, alpha, eta, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_DMR.cpp b/src/python/py_DMR.cpp index d352d7f..aaed399 100644 --- a/src/python/py_DMR.cpp +++ b/src/python/py_DMR.cpp @@ -9,11 +9,13 @@ static int DMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1; float alpha = 0.1, eta = 0.01, sigma = 1, alphaEpsilon = 1e-10; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "alpha", "eta", "sigma", "alpha_epsilon", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnffffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &K, &alpha, &eta, &sigma, &alphaEpsilon, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "alpha", "eta", "sigma", "alpha_epsilon", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnffffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &K, &alpha, &eta, &sigma, &alphaEpsilon, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,7 +23,22 @@ static int DMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ 
"`corpus` must be `tomotopy.utils.Corpus` type." }; } - tomoto::ITopicModel* inst = tomoto::IDMRModel::create((tomoto::TermWeight)tw, K, alpha, sigma, eta, alphaEpsilon, tomoto::RandGen{ seed }); + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + + tomoto::ITopicModel* inst = tomoto::IDMRModel::create((tomoto::TermWeight)tw, K, alpha, sigma, eta, alphaEpsilon, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_DT.cpp b/src/python/py_DT.cpp index 8b487da..15ad60d 100644 --- a/src/python/py_DT.cpp +++ b/src/python/py_DT.cpp @@ -8,17 +8,18 @@ static int DT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1, T = 1; - float alphaVar = 0.1, etaVar = 0.1, phiVar = 0.1; - float lrA = 0.01, lrB = 0.1, lrC = 0.55; + float alphaVar = 0.1f, etaVar = 0.1f, phiVar = 0.1f; + float lrA = 0.01f, lrB = 0.1f, lrC = 0.55f; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "t", "alpha_var", "eta_var", "phi_var", "lr_a", "lr_b", "lr_c", - "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffffffnOO", (char**)kwlist, + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffffffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &K, &T, &alphaVar, &etaVar, &phiVar, &lrA, &lrB, &lrC, - &seed, &objCorpus, &objTransform)) return -1; + &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -26,9 +27,24 @@ static int 
DT_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; } + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + tomoto::ITopicModel* inst = tomoto::IDTModel::create((tomoto::TermWeight)tw, K, T, alphaVar, etaVar, phiVar, lrA, lrB, lrC, - 0, tomoto::RandGen{ seed }); + 0, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; @@ -299,6 +315,8 @@ static PyMethodDef DT_methods[] = DEFINE_GETTER(tomoto::IDTModel, DT, getShapeA); DEFINE_GETTER(tomoto::IDTModel, DT, getShapeB); DEFINE_GETTER(tomoto::IDTModel, DT, getShapeC); +DEFINE_GETTER(tomoto::IDTModel, DT, getT); +DEFINE_GETTER(tomoto::IDTModel, DT, getNumDocsByT); DEFINE_SETTER_CHECKED_FLOAT(tomoto::IDTModel, DT, setShapeA, value > 0); DEFINE_SETTER_CHECKED_FLOAT(tomoto::IDTModel, DT, setShapeB, value >= 0); @@ -308,6 +326,8 @@ static PyGetSetDef DT_getseters[] = { { (char*)"lr_a", (getter)DT_getShapeA, (setter)DT_setShapeA, DT_lr_a__doc__, nullptr }, { (char*)"lr_b", (getter)DT_getShapeB, (setter)DT_setShapeB, DT_lr_b__doc__, nullptr }, { (char*)"lr_c", (getter)DT_getShapeC, (setter)DT_setShapeC, DT_lr_c__doc__, nullptr }, + { (char*)"num_timepoints", (getter)DT_getT, nullptr, DT_num_timepoints__doc__, nullptr }, + { (char*)"num_docs_by_timepoint", (getter)DT_getNumDocsByT, nullptr, DT_num_docs_by_timepoint__doc__, nullptr }, { nullptr }, }; diff --git a/src/python/py_GDMR.cpp b/src/python/py_GDMR.cpp index 068fba5..4cece89 100644 --- a/src/python/py_GDMR.cpp +++ b/src/python/py_GDMR.cpp @@ -8,17 +8,18 @@ static int GDMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1; - float alpha = 0.1, eta = 0.01, sigma = 1, 
sigma0 = 3, alphaEpsilon = 1e-10; + float alpha = 0.1f, eta = 0.01f, sigma = 1, sigma0 = 3, alphaEpsilon = 1e-10f; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr, *objDegrees = nullptr, *objRange = nullptr; static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "degrees", "alpha", "eta", "sigma", "sigma0", "alpha_epsilon", - "metadata_range", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOfffffOnOO", (char**)kwlist, + "metadata_range", "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOfffffOnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop,&K, &objDegrees, &alpha, &eta, &sigma, &sigma0, &alphaEpsilon, - &objRange, &seed, &objCorpus, &objTransform)) return -1; + &objRange, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -26,6 +27,21 @@ static int GDMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; } + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." 
}; + } + vector degrees; if (objDegrees) { @@ -39,7 +55,7 @@ static int GDMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) } tomoto::IGDMRModel* inst = tomoto::IGDMRModel::create((tomoto::TermWeight)tw, K, - degrees, alpha, sigma, sigma0, eta, alphaEpsilon, tomoto::RandGen{ seed }); + degrees, alpha, sigma, sigma0, eta, alphaEpsilon, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_HDP.cpp b/src/python/py_HDP.cpp index e08a706..7515b82 100644 --- a/src/python/py_HDP.cpp +++ b/src/python/py_HDP.cpp @@ -8,12 +8,14 @@ static int HDP_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 2; - float alpha = 0.1, eta = 0.01, gamma = 0.1; + float alpha = 0.1f, eta = 0.01f, gamma = 0.1f; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "initial_k", "alpha", "eta", "gamma", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnfffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &K, &alpha, &eta, &gamma, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "initial_k", "alpha", "eta", "gamma", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnfffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &K, &alpha, &eta, &gamma, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,7 +23,22 @@ static int HDP_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." 
}; } - tomoto::ITopicModel* inst = tomoto::IHDPModel::create((tomoto::TermWeight)tw, K, alpha, eta, gamma, tomoto::RandGen{ seed }); + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + + tomoto::ITopicModel* inst = tomoto::IHDPModel::create((tomoto::TermWeight)tw, K, alpha, eta, gamma, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_HLDA.cpp b/src/python/py_HLDA.cpp index f01cdbb..4307508 100644 --- a/src/python/py_HLDA.cpp +++ b/src/python/py_HLDA.cpp @@ -8,12 +8,14 @@ static int HLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t depth = 2; - float alpha = 0.1, eta = 0.01, gamma = 0.1; + float alpha = 0.1f, eta = 0.01f, gamma = 0.1f; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "depth", "alpha", "eta", "gamma", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnfffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &depth, &alpha, &eta, &gamma, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "depth", "alpha", "eta", "gamma", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnfffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &depth, &alpha, &eta, &gamma, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,7 +23,22 @@ static int HLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw 
runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; } - tomoto::ITopicModel* inst = tomoto::IHLDAModel::create((tomoto::TermWeight)tw, depth, alpha, eta, gamma, tomoto::RandGen{ seed }); + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + + tomoto::ITopicModel* inst = tomoto::IHLDAModel::create((tomoto::TermWeight)tw, depth, alpha, eta, gamma, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_HPA.cpp b/src/python/py_HPA.cpp index 64caa8f..0f1344a 100644 --- a/src/python/py_HPA.cpp +++ b/src/python/py_HPA.cpp @@ -8,12 +8,14 @@ static int HPA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1, K2 = 1; - float alpha = 0.1, eta = 0.01; + float alpha = 0.1f, eta = 0.01f; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k1", "k2", "alpha", "eta", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &K, &K2, &alpha, &eta, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k1", "k2", "alpha", "eta", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &K, &K2, &alpha, &eta, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,8 +23,23 @@ static int HPA_init(TopicModelObject *self, PyObject *args, PyObject 
*kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; } + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + tomoto::ITopicModel* inst = tomoto::IHPAModel::create((tomoto::TermWeight)tw, - false, K, K2, alpha, eta, tomoto::RandGen{ seed }); + false, K, K2, alpha, eta, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_LDA.cpp b/src/python/py_LDA.cpp index 7d5ea3c..bebbc87 100644 --- a/src/python/py_LDA.cpp +++ b/src/python/py_LDA.cpp @@ -11,13 +11,14 @@ static int LDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1; - float alpha = 0.1, eta = 0.01; + float alpha = 0.1f, eta = 0.01f; PyObject* objCorpus = nullptr, *objTransform = nullptr; + const char* rng = "scalar"; size_t seed = random_device{}(); - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "alpha", "eta", "seed", + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "alpha", "eta", "seed", "rng", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnffnOO", (char**)kwlist, - &tw, &minCnt, &minDf, &rmTop, &K, &alpha, &eta, &seed, &objCorpus, &objTransform)) return -1; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnffnsOO", (char**)kwlist, + &tw, &minCnt, &minDf, &rmTop, &K, &alpha, &eta, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -25,7 +26,22 @@ static int LDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." 
}; } - tomoto::ITopicModel* inst = tomoto::ILDAModel::create((tomoto::TermWeight)tw, K, alpha, eta, tomoto::RandGen{ seed }); + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + + tomoto::ITopicModel* inst = tomoto::ILDAModel::create((tomoto::TermWeight)tw, K, alpha, eta, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_LLDA.cpp b/src/python/py_LLDA.cpp index f0bb98e..b3ce959 100644 --- a/src/python/py_LLDA.cpp +++ b/src/python/py_LLDA.cpp @@ -8,12 +8,14 @@ static int LLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1; - float alpha = 0.1, eta = 0.01, sigma = 1, alphaEpsilon = 1e-10; + float alpha = 0.1f, eta = 0.01f, sigma = 1; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "alpha", "eta", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnfffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &K, &alpha, &eta, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "alpha", "eta", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, /* five 'n' units (tw, min_cf, min_df, rm_top, k) then two 'f' units (alpha, eta): `k` is stored in a size_t, so it must be parsed with 'n', not 'f' */ "|nnnnnffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &K, &alpha, &eta, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,7 +23,22 @@ static int LLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." 
}; } - tomoto::ITopicModel* inst = tomoto::ILLDAModel::create((tomoto::TermWeight)tw, K, alpha, eta, tomoto::RandGen{ seed }); + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + + tomoto::ITopicModel* inst = tomoto::ILLDAModel::create((tomoto::TermWeight)tw, K, alpha, eta, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_MGLDA.cpp b/src/python/py_MGLDA.cpp index 80ea3f4..fbb3cc8 100644 --- a/src/python/py_MGLDA.cpp +++ b/src/python/py_MGLDA.cpp @@ -8,14 +8,15 @@ static int MGLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1, KL = 1, T = 3; - float alpha = 0.1, alphaL = 0.1, eta = 0.01, etaL = 0.01, alphaM = 0.1, alphaML = 0.1, gamma = 0.1; + float alpha = 0.1f, alphaL = 0.1f, eta = 0.01f, etaL = 0.01f, alphaM = 0.1f, alphaML = 0.1f, gamma = 0.1f; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k_g", "k_l", "t", "alpha_g", "alpha_l", "alpha_mg", "alpha_ml", - "eta_g", "eta_l", "gamma", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnnfffffffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &K, &KL, &T, - &alpha, &alphaL, &alphaM, &alphaML, &eta, &etaL, &gamma, &seed, &objCorpus, &objTransform)) return -1; + "eta_g", "eta_l", "gamma", "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnnfffffffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &K, &KL, &T, &alpha, &alphaL, &alphaM, &alphaML, &eta, &etaL, &gamma, + &seed, &rng, &objCorpus, &objTransform)) return 
-1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -23,8 +24,23 @@ static int MGLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; } + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + tomoto::ITopicModel* inst = tomoto::IMGLDAModel::create((tomoto::TermWeight)tw, - K, KL, T, alpha, alphaL, alphaM, alphaML, eta, etaL, gamma, tomoto::RandGen{ seed }); + K, KL, T, alpha, alphaL, alphaM, alphaML, eta, etaL, gamma, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_PA.cpp b/src/python/py_PA.cpp index 3818216..be9f9ea 100644 --- a/src/python/py_PA.cpp +++ b/src/python/py_PA.cpp @@ -8,12 +8,14 @@ static int PA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1, K2 = 1; - float alpha = 0.1, eta = 0.01; + float alpha = 0.1f, eta = 0.01f; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k1", "k2", "alpha", "eta", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &K, &K2, &alpha, &eta, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k1", "k2", "alpha", "eta", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &K, &K2, &alpha, &eta, &seed, &rng, &objCorpus, &objTransform)) return -1; 
try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,8 +23,23 @@ static int PA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; } + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." }; + } + tomoto::ITopicModel* inst = tomoto::IPAModel::create((tomoto::TermWeight)tw, - K, K2, alpha, eta, tomoto::RandGen{ seed }); + K, K2, alpha, eta, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; @@ -267,7 +284,7 @@ static PyObject* PA_infer(TopicModelObject* self, PyObject* args, PyObject *kwar { std::vector docs; docs.emplace_back((tomoto::DocumentBase*)doc->doc); - float ll = self->inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together)[0]; + double ll = self->inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together)[0]; return Py_BuildValue("((NN)f)", py::buildPyValue(inst->getTopicsByDoc(doc->doc)), py::buildPyValue(inst->getSubTopicsByDoc(doc->doc)), ll); } diff --git a/src/python/py_PLDA.cpp b/src/python/py_PLDA.cpp index 34a6cff..7e2b19e 100644 --- a/src/python/py_PLDA.cpp +++ b/src/python/py_PLDA.cpp @@ -8,12 +8,14 @@ static int PLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t numLatentTopics = 0, numTopicsPerLabel = 1; - float alpha = 0.1, eta = 0.01, sigma = 1, alphaEpsilon = 1e-10; + float alpha = 0.1f, eta = 0.01f, sigma = 1; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; - static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "latent_topics", "topics_per_label", "alpha", "eta", "seed", 
"corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffnOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, - &numLatentTopics, &numTopicsPerLabel, &alpha, &eta, &seed, &objCorpus, &objTransform)) return -1; + static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "latent_topics", "topics_per_label", "alpha", "eta", + "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnnffnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, + &numLatentTopics, &numTopicsPerLabel, &alpha, &eta, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) @@ -21,8 +23,23 @@ static int PLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; } + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; + } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." 
}; + } + tomoto::ITopicModel* inst = tomoto::IPLDAModel::create((tomoto::TermWeight)tw, - numLatentTopics, numTopicsPerLabel, alpha, eta, tomoto::RandGen{ seed }); + numLatentTopics, numTopicsPerLabel, alpha, eta, seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; diff --git a/src/python/py_SLDA.cpp b/src/python/py_SLDA.cpp index c5363f9..8ef7247 100644 --- a/src/python/py_SLDA.cpp +++ b/src/python/py_SLDA.cpp @@ -8,24 +8,40 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) { size_t tw = 0, minCnt = 0, minDf = 0, rmTop = 0; size_t K = 1; - float alpha = 0.1, eta = 0.01; + float alpha = 0.1f, eta = 0.01f; PyObject *vars = nullptr, *mu = nullptr, *nuSq = nullptr, *glmCoef = nullptr; + const char* rng = "scalar"; size_t seed = random_device{}(); PyObject* objCorpus = nullptr, *objTransform = nullptr; static const char* kwlist[] = { "tw", "min_cf", "min_df", "rm_top", "k", "vars", "alpha", "eta", - "mu", "nu_sq", "glm_param", "seed", "corpus", "transform", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOffOOOnOO", (char**)kwlist, + "mu", "nu_sq", "glm_param", "seed", "rng", "corpus", "transform", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnnnnOffOOOnsOO", (char**)kwlist, &tw, &minCnt, &minDf, &rmTop, &K, &vars, &alpha, &eta, - &mu, &nuSq, &glmCoef, &seed, &objCorpus, &objTransform)) return -1; + &mu, &nuSq, &glmCoef, &seed, &rng, &objCorpus, &objTransform)) return -1; try { if (objCorpus && !PyObject_HasAttrString(objCorpus, corpus_feeder_name)) { throw runtime_error{ "`corpus` must be `tomotopy.utils.Corpus` type." }; + } + string srng = rng; + bool scalarRng = false; + if (srng == "vector8") + { + scalarRng = false; + } + else if (srng == "scalar") + { + scalarRng = true; } + else + { + throw runtime_error{ "Unknown `rng` type '" + srng + "'." 
}; + } + vector varTypes; if (vars) { @@ -46,7 +62,7 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) float fTemp; if (mu) { - if ((fTemp = PyFloat_AsDouble(mu)) == -1 && PyErr_Occurred()) + if ((fTemp = (float)PyFloat_AsDouble(mu)) == -1 && PyErr_Occurred()) { PyErr_Clear(); py::UniqueObj iter; @@ -62,7 +78,7 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) if (nuSq) { - if ((fTemp = PyFloat_AsDouble(nuSq)) == -1 && PyErr_Occurred()) + if ((fTemp = (float)PyFloat_AsDouble(nuSq)) == -1 && PyErr_Occurred()) { PyErr_Clear(); py::UniqueObj iter; @@ -78,7 +94,7 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) if (glmCoef) { - if ((fTemp = PyFloat_AsDouble(glmCoef)) == -1 && PyErr_Occurred()) + if ((fTemp = (float)PyFloat_AsDouble(glmCoef)) == -1 && PyErr_Occurred()) { PyErr_Clear(); py::UniqueObj iter; @@ -94,7 +110,7 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) tomoto::ITopicModel* inst = tomoto::ISLDAModel::create((tomoto::TermWeight)tw, K, varTypes, alpha, eta, vmu, vnuSq, vglmCoef, - tomoto::RandGen{ seed }); + seed, scalarRng); if (!inst) throw runtime_error{ "unknown tw value" }; self->inst = inst; self->isPrepared = false; @@ -251,13 +267,26 @@ static PyObject* SLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject * static PyObject* SLDA_getRegressionCoef(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - size_t varId; + PyObject* argVarId = nullptr; static const char* kwlist[] = { "var_id", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n", (char**)kwlist, &varId)) return nullptr; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", (char**)kwlist, &argVarId)) return nullptr; try { if (!self->inst) throw runtime_error{ "inst is null" }; auto* inst = static_cast(self->inst); + if (!argVarId || argVarId == Py_None) + { + npy_intp shapes[2] = { (npy_intp)inst->getF(), (npy_intp)inst->getK() }; + 
PyObject* ret = PyArray_EMPTY(2, shapes, NPY_FLOAT, 0); + for (size_t i = 0; i < inst->getF(); ++i) + { + auto l = inst->getRegressionCoef(i); + memcpy(PyArray_GETPTR2((PyArrayObject*)ret, i, 0), l.data(), sizeof(float) * l.size()); + } + return ret; + } + + size_t varId = PyLong_AsLong(argVarId); if (varId >= inst->getF()) throw runtime_error{ "'var_id' must be < 'f'" }; return py::buildPyValue(inst->getRegressionCoef(varId)); } diff --git a/tomotopy/documentation.kr.rst b/tomotopy/documentation.kr.rst index 9862fbf..9c84ac6 100644 --- a/tomotopy/documentation.kr.rst +++ b/tomotopy/documentation.kr.rst @@ -18,7 +18,7 @@ tomotopy 란? * Correlated Topic Model (`tomotopy.CTModel`) * Dynamic Topic Model (`tomotopy.DTModel`) -tomotopy의 가장 최신버전은 0.8.1 입니다. +tomotopy의 가장 최신버전은 0.8.2 입니다. .. image:: https://badge.fury.io/py/tomotopy.svg @@ -279,6 +279,10 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma 역사 ------- +* 0.8.2 (2020-07-14) + * `tomotopy.DTModel.num_timepoints`와 `tomotopy.DTModel.num_docs_by_timepoint` 프로퍼티가 추가되었습니다. + * `seed`가 동일해도 플랫폼이 다르면 다른 결과를 내던 문제가 일부 해결되었습니다. 이로 인해 32bit 버전의 모델 학습 결과가 이전 버전과는 달라졌습니다. + * 0.8.1 (2020-06-08) * `tomotopy.LDAModel.used_vocabs`가 잘못된 값을 반환하는 버그가 수정되었습니다. * 이제 `tomotopy.CTModel.prior_cov`가 `[k, k]` 모양의 공분산 행렬을 반환합니다. diff --git a/tomotopy/documentation.rst b/tomotopy/documentation.rst index e998761..f953008 100644 --- a/tomotopy/documentation.rst +++ b/tomotopy/documentation.rst @@ -18,7 +18,7 @@ The current version of `tomoto` supports several major topic models including * Correlated Topic Model (`tomotopy.CTModel`) * Dynamic Topic Model (`tomotopy.DTModel`). -The most recent version of tomotopy is 0.8.1. +The most recent version of tomotopy is 0.8.2. .. 
image:: https://badge.fury.io/py/tomotopy.svg @@ -282,6 +282,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh History ------- +* 0.8.2 (2020-07-14) + * New properties `tomotopy.DTModel.num_timepoints` and `tomotopy.DTModel.num_docs_by_timepoint` have been added. + * A bug that produced different results on different platforms even when the `seed` was the same has been partially fixed. + As a result of this fix, `tomotopy` on 32-bit platforms now yields different training results from earlier versions. + * 0.8.1 (2020-06-08) * A bug where `tomotopy.LDAModel.used_vocabs` returned an incorrect value was fixed. * Now `tomotopy.CTModel.prior_cov` returns a covariance matrix with shape `[k, k]`. diff --git a/tomotopy/version.py b/tomotopy/version.py index 85f0bcd..78bbd49 100644 --- a/tomotopy/version.py +++ b/tomotopy/version.py @@ -1 +1 @@ -__version__ = '0.8.1' \ No newline at end of file +__version__ = '0.8.2' \ No newline at end of file From fbc8014ab73e85168016020050a8bb626625b349 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 14 Jul 2020 21:28:38 +0900 Subject: [PATCH 2/5] add dependency EigenRand --- .github/workflows/deploy.yml | 11 +++++++++++ .github/workflows/deploy_test.yml | 11 +++++++++++ .github/workflows/generate_documentation.yml | 5 +++++ .github/workflows/pull_request_test.yml | 11 +++++++++++ 4 files changed, 38 insertions(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index b4474e9..c84ca6c 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -24,6 +24,11 @@ jobs: mkdir include mv eigen-git-mirror/Eigen/ include/ rm -rf eigen-git-mirror/ + git clone https://github.com/bab2min/EigenRand + cd EigenRand + git checkout tags/v0.2.1 + cd .. 
+ mv EigenRand/EigenRand include/ - name: Deploy continue-on-error: True env: @@ -61,6 +66,9 @@ jobs: wget https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include + wget https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + tar -zxvf v0.2.1.tar.gz + mv EigenRand/EigenRand include/ - name: Deploy continue-on-error: True env: @@ -94,6 +102,9 @@ jobs: Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include + Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + tar -zxvf v0.2.1.tar.gz + mv EigenRand/EigenRand include/ - name: Deploy continue-on-error: True env: diff --git a/.github/workflows/deploy_test.yml b/.github/workflows/deploy_test.yml index a1de2a0..fb486a1 100644 --- a/.github/workflows/deploy_test.yml +++ b/.github/workflows/deploy_test.yml @@ -23,6 +23,11 @@ jobs: mkdir include mv eigen-git-mirror/Eigen/ include/ rm -rf eigen-git-mirror/ + git clone https://github.com/bab2min/EigenRand + cd EigenRand + git checkout tags/v0.2.1 + cd .. 
+ mv EigenRand/EigenRand include/ - name: Deploy continue-on-error: True env: @@ -60,6 +65,9 @@ jobs: wget https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include + wget https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + tar -zxvf v0.2.1.tar.gz + mv EigenRand/EigenRand include/ - name: Deploy continue-on-error: True env: @@ -93,6 +101,9 @@ jobs: Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include + Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + tar -zxvf v0.2.1.tar.gz + mv EigenRand/EigenRand include/ - name: Deploy continue-on-error: True env: diff --git a/.github/workflows/generate_documentation.yml b/.github/workflows/generate_documentation.yml index b5b96f8..f4efb81 100644 --- a/.github/workflows/generate_documentation.yml +++ b/.github/workflows/generate_documentation.yml @@ -27,6 +27,11 @@ jobs: wget https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include + git clone https://github.com/bab2min/EigenRand + cd EigenRand + git checkout tags/v0.2.1 + cd .. + mv EigenRand/EigenRand include/ - name: build run: | python3 -m pip install pdoc3 numpy diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index 886636a..38a9b0b 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -28,6 +28,11 @@ jobs: git checkout tags/3.3.7 cd .. mv eigen-git-mirror include + git clone https://github.com/bab2min/EigenRand + cd EigenRand + git checkout tags/v0.2.1 + cd .. 
+ mv EigenRand/EigenRand include/ - name: Build run: | /opt/python/${{ matrix.cp }}/bin/python -m pip install numpy @@ -58,6 +63,9 @@ jobs: wget https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include + wget https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + tar -zxvf v0.2.1.tar.gz + mv EigenRand/EigenRand include/ - name: Build run: | python -m pip install numpy @@ -91,6 +99,9 @@ jobs: Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include + Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + tar -zxvf v0.2.1.tar.gz + mv EigenRand/EigenRand include/ - name: Build run: | python -m pip install numpy From 6ef49a60665800851ca78ade9c160b418ea4f5f7 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 14 Jul 2020 21:32:59 +0900 Subject: [PATCH 3/5] update workflows --- .github/workflows/deploy.yml | 4 ++-- .github/workflows/deploy_test.yml | 4 ++-- .github/workflows/pull_request_test.yml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index c84ca6c..ab10b8d 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -68,7 +68,7 @@ jobs: mv eigen-git-mirror-3.3.7 include wget https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz - mv EigenRand/EigenRand include/ + mv EigenRand-0.2.1/EigenRand include/ - name: Deploy continue-on-error: True env: @@ -104,7 +104,7 @@ jobs: mv eigen-git-mirror-3.3.7 include Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz - mv EigenRand/EigenRand include/ + mv EigenRand-0.2.1/EigenRand include/ - name: Deploy continue-on-error: True env: diff --git a/.github/workflows/deploy_test.yml 
b/.github/workflows/deploy_test.yml index fb486a1..922ef1d 100644 --- a/.github/workflows/deploy_test.yml +++ b/.github/workflows/deploy_test.yml @@ -67,7 +67,7 @@ jobs: mv eigen-git-mirror-3.3.7 include wget https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz - mv EigenRand/EigenRand include/ + mv EigenRand-0.2.1/EigenRand include/ - name: Deploy continue-on-error: True env: @@ -103,7 +103,7 @@ jobs: mv eigen-git-mirror-3.3.7 include Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz - mv EigenRand/EigenRand include/ + mv EigenRand-0.2.1/EigenRand include/ - name: Deploy continue-on-error: True env: diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index 38a9b0b..310d39d 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -65,7 +65,7 @@ jobs: mv eigen-git-mirror-3.3.7 include wget https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz - mv EigenRand/EigenRand include/ + mv EigenRand-0.2.1/EigenRand include/ - name: Build run: | python -m pip install numpy @@ -101,7 +101,7 @@ jobs: mv eigen-git-mirror-3.3.7 include Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz - mv EigenRand/EigenRand include/ + mv EigenRand-0.2.1/EigenRand include/ - name: Build run: | python -m pip install numpy From e9247d6004ec625690f73f2dcc1dddc0e42b147a Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 14 Jul 2020 21:42:39 +0900 Subject: [PATCH 4/5] fix workflows & clang err --- .github/workflows/deploy.yml | 2 +- .github/workflows/deploy_test.yml | 2 +- .github/workflows/pull_request_test.yml | 2 +- src/TopicModel/DT.h | 2 +- src/TopicModel/DTModel.hpp | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 
ab10b8d..9b92f0e 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -102,7 +102,7 @@ jobs: Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include - Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + Invoke-WebRequest -OutFile v0.2.1.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz mv EigenRand-0.2.1/EigenRand include/ - name: Deploy diff --git a/.github/workflows/deploy_test.yml b/.github/workflows/deploy_test.yml index 922ef1d..0b89a6d 100644 --- a/.github/workflows/deploy_test.yml +++ b/.github/workflows/deploy_test.yml @@ -101,7 +101,7 @@ jobs: Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include - Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + Invoke-WebRequest -OutFile v0.2.1.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz mv EigenRand-0.2.1/EigenRand include/ - name: Deploy diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index 310d39d..c13e5ae 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -99,7 +99,7 @@ jobs: Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/eigenteam/eigen-git-mirror/archive/3.3.7.tar.gz tar -zxvf 3.3.7.tar.gz mv eigen-git-mirror-3.3.7 include - Invoke-WebRequest -OutFile 3.3.7.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz + Invoke-WebRequest -OutFile v0.2.1.tar.gz https://github.com/bab2min/EigenRand/archive/v0.2.1.tar.gz tar -zxvf v0.2.1.tar.gz mv EigenRand-0.2.1/EigenRand include/ - name: Build diff --git a/src/TopicModel/DT.h b/src/TopicModel/DT.h index 8996178..162a003 100644 --- 
a/src/TopicModel/DT.h +++ b/src/TopicModel/DT.h @@ -45,7 +45,7 @@ namespace tomoto size_t timepoint) const = 0; virtual size_t getT() const = 0; - virtual std::vector getNumDocsByT() const = 0; + virtual std::vector getNumDocsByT() const = 0; virtual Float getAlphaVar() const = 0; virtual Float getEtaVar() const = 0; diff --git a/src/TopicModel/DTModel.hpp b/src/TopicModel/DTModel.hpp index 7eeacd2..a7a6df6 100644 --- a/src/TopicModel/DTModel.hpp +++ b/src/TopicModel/DTModel.hpp @@ -48,7 +48,7 @@ namespace tomoto Eigen::Matrix alphas; // Dim: (Topic, Time) Eigen::Matrix etaByDoc; // Dim: (Topic, Docs) : Topic distribution by docs(and time) - std::vector numDocsByTime; // Dim: (Time) + std::vector numDocsByTime; // Dim: (Time) Eigen::Matrix phi; // Dim: (Word, Topic * Time) std::vector> wordAliasTables; // Dim: (Word * Time) @@ -495,7 +495,7 @@ namespace tomoto T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi); GETTER(T, size_t, T); - GETTER(NumDocsByT, std::vector, numDocsByTime); + GETTER(NumDocsByT, std::vector, numDocsByTime); GETTER(AlphaVar, Float, alphaVar); GETTER(EtaVar, Float, etaVar); GETTER(PhiVar, Float, phiVar); From df416990924a938f3ee7381f0891736cd460553d Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 14 Jul 2020 22:16:48 +0900 Subject: [PATCH 5/5] turn off packet random engine --- src/TopicModel/LDAModel.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TopicModel/LDAModel.hpp b/src/TopicModel/LDAModel.hpp index a7df634..5ea0186 100644 --- a/src/TopicModel/LDAModel.hpp +++ b/src/TopicModel/LDAModel.hpp @@ -30,7 +30,7 @@ Term Weighting Scheme is based on following paper: return new MDL(__VA_ARGS__);\ }\ }\ - else{\ + /*else{\ switch (TW){\ case TermWeight::one:\ return new MDL(__VA_ARGS__);\ @@ -39,7 +39,7 @@ Term Weighting Scheme is based on following paper: case TermWeight::pmi:\ return new MDL(__VA_ARGS__);\ }\ - }\ + }*/\ return nullptr; } while(0) #define GETTER(name, type, field) type 
get##name() const override { return field; }