Skip to content

Commit

Permalink
Merge pull request #31 from bab2min/develop
Browse files Browse the repository at this point in the history
bug fixing including #30
  • Loading branch information
bab2min authored Mar 1, 2020
2 parents 7c79eec + f30807f commit bec1011
Show file tree
Hide file tree
Showing 29 changed files with 493 additions and 146 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
build_windows.bat
*.bin
enwiki-stemmed-1000.txt
/venv/
/venv/
.vscode/
5 changes: 5 additions & 0 deletions README.kr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma

역사
-------
* 0.5.2 (2020-03-01)
* `tomotopy.LLDAModel.add_doc` 실행시 segmentation fault가 발생하는 문제를 해결했습니다.
* `tomotopy.HDPModel`에서 `infer` 실행시 종종 프로그램이 종료되는 문제를 해결했습니다.
* `tomotopy.LDAModel.infer`에서 ps=tomotopy.ParallelScheme.PARTITION, together=True로 실행시 발생하는 오류를 해결했습니다.

* 0.5.1 (2020-01-11)
* `tomotopy.SLDAModel.make_doc`에서 결측값을 지원하지 않던 문제를 해결했습니다.
* `tomotopy.SLDAModel`이 이제 결측값을 지원합니다. 결측값을 가진 문헌은 토픽 모델링에는 참여하지만, 응답 변수 회귀에서는 제외됩니다.
Expand Down
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh

History
-------
* 0.5.2 (2020-03-01)
* A segmentation fault problem was fixed in `tomotopy.LLDAModel.add_doc`.
* A bug that caused `infer` of `tomotopy.HDPModel` to sometimes crash the program was fixed.
* A crash issue in `tomotopy.LDAModel.infer` with ps=tomotopy.ParallelScheme.PARTITION, together=True was fixed.

* 0.5.1 (2020-01-11)
* A bug that `tomotopy.SLDAModel.make_doc` did not support missing values for `y` was fixed.
* Now `tomotopy.SLDAModel` fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in modeling topic, but excluded from regression of response variables.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
setup(
name='tomotopy',

version='0.5.1',
version='0.5.2',

description='Tomoto, The Topic Modeling Tool for Python',
long_description=long_description,
Expand Down
12 changes: 7 additions & 5 deletions src/TopicModel/CTModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,16 @@ namespace tomoto
size_t numDocBetaSample = -1;
math::MultiNormalDistribution<FLOAT> topicPrior;

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
zLikelihood = doc.smBeta.array()
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());
sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
}
Expand Down Expand Up @@ -106,10 +108,10 @@ namespace tomoto
doc.smBeta /= doc.smBeta.array().sum();
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
BaseClass::template sampleDocument<_ps>(doc, docId, ld, rgs, iterationCnt, partitionId);
BaseClass::template sampleDocument<_ps, _infer>(doc, edd, docId, ld, rgs, iterationCnt, partitionId);
/*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0)
{
updateBeta(doc, rgs);
Expand Down
6 changes: 4 additions & 2 deletions src/TopicModel/DMRModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,16 @@ namespace tomoto
return 0;
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
zLikelihood = (doc.numByTopic.array().template cast<FLOAT>() + this->expLambda.col(doc.metadata).array())
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());

sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
Expand Down
6 changes: 4 additions & 2 deletions src/TopicModel/GDMRModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,15 +177,17 @@ namespace tomoto
}
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
getTermsFromMd(ld, &doc.metadataC[0], ld.terms);
zLikelihood = (doc.numByTopic.array().template cast<FLOAT>() + (this->lambda * ld.terms).array().exp() + this->alphaEps)
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());

sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
Expand Down
17 changes: 12 additions & 5 deletions src/TopicModel/HDPModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ namespace tomoto
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
for (size_t w = 0; w < doc.words.size(); ++w)
{
Expand All @@ -200,7 +200,7 @@ namespace tomoto
calcWordTopicProb(ld, doc.words[w]);
auto topicDist = getTopicLikelihoods(ld);
auto dist = getTableLikelihoods(ld, doc, doc.words[w]);
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + 1, rgs);
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + (_infer ? 0 : 1), rgs);
if (doc.Zs[w] == doc.numTopicByTable.size()) // create new table
{
size_t K = ld.numByTopic.size();
Expand Down Expand Up @@ -281,8 +281,8 @@ namespace tomoto
for (auto& r : res) r.get();
}

template<ParallelScheme _ps>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const
template<ParallelScheme _ps, typename _ExtraDocData>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const
{
std::vector<std::future<void>> res;
const size_t V = this->realV;
Expand Down Expand Up @@ -457,6 +457,13 @@ namespace tomoto
{
return this->globalState.numTableByTopic[tid];
}

// Returns the per-topic word proportions of `doc`: each entry is the
// (weighted) count of words assigned to that topic divided by the
// document's total word weight, so the entries sum to ~1.
// NOTE(review): assumes doc.numByTopic holds exactly this->K entries
// and doc.getSumWordWeight() is nonzero — confirm against the document type.
std::vector<FLOAT> getTopicsByDoc(const _DocType& doc) const
{
std::vector<FLOAT> ret(this->K);
// Map the output buffer as an Eigen column vector so the cast and
// normalization are done in one vectorized expression.
Eigen::Map<Eigen::Matrix<FLOAT, -1, 1>> { ret.data(), this->K }.array() = doc.numByTopic.array().template cast<FLOAT>() / doc.getSumWordWeight();
return ret;
}
};

template<TermWeight _TW>
Expand Down
16 changes: 13 additions & 3 deletions src/TopicModel/HLDAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,7 @@ namespace tomoto
addWordToOnlyLocal<INC>(ld, doc, pid, vid, level);
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
Expand All @@ -443,14 +444,23 @@ namespace tomoto
{
if (doc.words[w] >= this->realV) continue;
addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w]);
auto dist = static_cast<const DerivedClass*>(this)->getZLikelihoods(ld, doc, docId, doc.words[w]);
FLOAT* dist;
if (this->etaByTopicWord.size())
{
THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features");
}
else
{
dist = static_cast<const DerivedClass*>(this)->template
getZLikelihoods<false>(ld, doc, docId, doc.words[w]);
}
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + this->K, rgs);
addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w]);
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
// Samples topic/level assignments for a single document by delegating to
// sampleTopics. The _ps/_infer template arguments and the edd, iterationCnt
// and partitionId parameters are accepted only to satisfy the common
// sampler interface; they are unused here.
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
sampleTopics(doc, docId, ld, rgs);
}
Expand Down
2 changes: 1 addition & 1 deletion src/TopicModel/HPAModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ namespace tomoto
}
return nullptr;
}
}
}
28 changes: 19 additions & 9 deletions src/TopicModel/HPAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ namespace tomoto
return std::make_pair<size_t, size_t>(ceil(k * (float)K2 / this->K), ceil((k + 1) * (float)K2 / this->K));
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
Expand Down Expand Up @@ -173,24 +174,32 @@ namespace tomoto
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
size_t b = 0, e = doc.words.size();
if (_ps == ParallelScheme::partition)
{
b = this->chunkOffsetByDoc(partitionId, docId);
e = this->chunkOffsetByDoc(partitionId + 1, docId);
b = edd.chunkOffsetByDoc(partitionId, docId);
e = edd.chunkOffsetByDoc(partitionId + 1, docId);
}

size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0;
size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? edd.vChunkOffset[partitionId - 1] : 0;

const auto K = this->K;
for (size_t w = b; w < e; ++w)
{
if (doc.words[w] >= this->realV) continue;
addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]);
auto dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset);
FLOAT* dist;
if (this->etaByTopicWord.size())
{
THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features");
}
else
{
dist = getZLikelihoods<false>(ld, doc, docId, doc.words[w] - vOffset);
}
if (_Exclusive)
{
auto z = sample::sampleFromDiscreteAcc(dist, dist + K2 + K + 1, rgs);
Expand Down Expand Up @@ -233,12 +242,13 @@ namespace tomoto
}
}

void distributePartition(ThreadPool& pool, _ModelState* localData)
// Intentionally a no-op: this model performs no per-partition state
// distribution. Present only to satisfy the partition-scheme interface.
template<typename _ExtraDocData>
void distributePartition(ThreadPool& pool, const _ModelState& globalState, _ModelState* localData, const _ExtraDocData& edd) const
{
}

template<ParallelScheme _ps>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const
template<ParallelScheme _ps, typename _ExtraDocData>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const
{
std::vector<std::future<void>> res;

Expand Down
5 changes: 4 additions & 1 deletion src/TopicModel/LDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@ namespace tomoto
virtual std::vector<size_t> getCountByTopic() const = 0;
virtual size_t getK() const = 0;
virtual FLOAT getAlpha() const = 0;
virtual FLOAT getAlpha(TID k1) const = 0;
virtual FLOAT getAlpha(TID k) const = 0;
virtual FLOAT getEta() const = 0;

virtual std::vector<FLOAT> getWordPrior(const std::string& word) const = 0;
virtual void setWordPrior(const std::string& word, const std::vector<FLOAT>& priors) = 0;
};
}
20 changes: 10 additions & 10 deletions src/TopicModel/LDACVB0Model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ namespace tomoto
virtual size_t getK() const = 0;
virtual FLOAT getAlpha() const = 0;
virtual FLOAT getEta() const = 0;

virtual std::vector<FLOAT> getWordPrior(const std::string& word) const { return {}; }
virtual void setWordPrior(const std::string& word, const std::vector<FLOAT>& priors) {}
};

template<typename _Interface = ILDACVB0Model,
Expand Down Expand Up @@ -90,13 +93,9 @@ namespace tomoto
template<typename _List>
static FLOAT calcDigammaSum(_List list, size_t len, FLOAT alpha)
{
FLOAT ret = 0;
auto listExpr = Eigen::Matrix<FLOAT, -1, 1>::NullaryExpr(len, list);
auto dAlpha = math::digammaT(alpha);
for (size_t i = 0; i < len; ++i)
{
ret += math::digammaT(list(i) + alpha) - dAlpha;
}
return ret;
return (math::digammaApprox(listExpr.array() + alpha) - dAlpha).sum();
}

void optimizeParameters(ThreadPool& pool, _ModelState* localData)
Expand Down Expand Up @@ -138,8 +137,8 @@ namespace tomoto
if (DEC) ld.numByTopicWord.col(vid) = ld.numByTopicWord.col(vid).cwiseMax(0);
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
for (size_t w = 0; w < doc.words.size(); ++w)
{
Expand All @@ -150,7 +149,8 @@ namespace tomoto
}
}

void updatePartition(ThreadPool& pool, _ModelState* localData)
// Intentionally a no-op: the CVB0 sampler does not use vocabulary
// partitioning, so there is no partition state to update. Present only
// to satisfy the common training interface.
template<typename _DocIter, typename _ExtraDocData>
void updatePartition(ThreadPool& pool, _ModelState* localData, _DocIter first, _DocIter last, _ExtraDocData& edd)
{
}

Expand All @@ -166,7 +166,7 @@ namespace tomoto
forRandom((this->docs.size() - 1 - ch) / chStride + 1, rgs[threadId](), [&, this](size_t id)
{
static_cast<DerivedClass*>(this)->template sampleDocument<ParallelScheme::copy_merge>(
this->docs[id * chStride + ch], id * chStride + ch,
this->docs[id * chStride + ch], 0, id * chStride + ch,
localData[threadId], rgs[threadId], this->iterated);
});
}));
Expand Down
Loading

0 comments on commit bec1011

Please sign in to comment.