Merge pull request #26 from bab2min/develop
preparing 0.5.0
bab2min authored Dec 29, 2019
2 parents 89c03df + 03d8677 commit 000b459
Showing 49 changed files with 1,381 additions and 402 deletions.
12 changes: 7 additions & 5 deletions .github/workflows/pull_request_test.yml
@@ -25,11 +25,13 @@ jobs:
git checkout tags/3.3.7
cd ..
mv eigen-git-mirror include
- name: Build & Test
- name: Build
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
/opt/python/${{ matrix.cp }}/bin/python -m pytest test/unit_test.py
- name: Test
run: |
/opt/python/${{ matrix.cp }}/bin/python -m pip install pytest
/opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/unit_test.py
build_macos:
name: Build for macOS
@@ -57,7 +59,7 @@ jobs:
- name: Test
run: |
python -m pip install pytest
python -m pytest test/unit_test.py
python -m pytest --verbose test/unit_test.py
build_windows:
name: Build for Windows
@@ -88,4 +90,4 @@ jobs:
- name: Test
run: |
python -m pip install pytest
python -m pytest test/unit_test.py
python -m pytest --verbose test/unit_test.py
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@
/tomotopy.egg-info
build_windows.bat
*.bin
/venv/
9 changes: 8 additions & 1 deletion README.kr.rst
@@ -30,7 +30,7 @@ What is tomotopy?

Please visit https://bab2min.github.io/tomotopy/index.kr.html for more information.

The latest version of tomotopy is 0.4.2.
The latest version of tomotopy is 0.5.0.

Getting Started
---------------
@@ -197,6 +197,13 @@ Python3 example code for tomotopy is available at https://github.com/bab2min/tomotopy/blob/ma

History
-------
* 0.5.0 (2019-12-30)
* `tomotopy.PAModel.infer` now returns both the topic distribution and the sub-topic distribution.
* New methods get_sub_topics and get_sub_topic_dist were added to `tomotopy.Document`. (for PAModel only)
* A new parameter `parallel` was added to the `tomotopy.LDAModel.train` and `tomotopy.LDAModel.infer` methods. It selects the parallelization algorithm used for training and inference.
* The `tomotopy.ParallelScheme.PARTITION` algorithm was added. It works efficiently even when the number of workers, the number of topics, or the vocabulary size is large.
* Fixed a bug where the `rm_top` option was not applied when `min_cf` < 2 at model creation.

* 0.4.2 (2019-11-30)
* Fixed incorrect topic assignment in the `tomotopy.LLDAModel` and `tomotopy.PLDAModel` models.
* A readable __repr__ was added to the `tomotopy.Document` and `tomotopy.Dictionary` classes.
9 changes: 8 additions & 1 deletion README.rst
@@ -31,7 +31,7 @@ The current version of `tomoto` supports several major topic models including

Please visit https://bab2min.github.io/tomotopy to see more information.

The most recent version of tomotopy is 0.4.2.
The most recent version of tomotopy is 0.5.0.

Getting Started
---------------
@@ -202,6 +202,13 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh

History
-------
* 0.5.0 (2019-12-30)
* Now `tomotopy.PAModel.infer` returns both the topic distribution and the sub-topic distribution.
* New methods get_sub_topics and get_sub_topic_dist were added to `tomotopy.Document`. (for PAModel)
* A new parameter `parallel` was added to the `tomotopy.LDAModel.train` and `tomotopy.LDAModel.infer` methods. You can select the parallelization algorithm by changing this parameter.
* `tomotopy.ParallelScheme.PARTITION`, a new algorithm, was added. It works efficiently even when the number of workers, the number of topics, or the vocabulary size is large.
* A bug where `rm_top` was not applied when `min_cf` < 2 was fixed.
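The `PARTITION` scheme listed above splits both the documents and the vocabulary into blocks, so that in any given round no two workers update the same word-topic counts. A minimal sketch of that scheduling idea follows; it is an illustration only, not tomotopy's actual implementation, and the function name `partition_schedule` is made up:

```python
def partition_schedule(num_workers):
    """For each round, yield the (doc_block, vocab_block) pair assigned to
    each worker. Worker i processes document block i and vocabulary block
    (i + r) % num_workers in round r, so vocabulary blocks never collide."""
    for r in range(num_workers):
        yield [(i, (i + r) % num_workers) for i in range(num_workers)]

P = 4
rounds = list(partition_schedule(P))

# In every round, each worker gets a distinct vocabulary block.
for rnd in rounds:
    assert len({v for _, v in rnd}) == P

# Across all rounds, every (doc block, vocab block) pair is covered once.
covered = {pair for rnd in rounds for pair in rnd}
assert len(covered) == P * P
```

In tomotopy 0.5.0 the scheme itself is chosen per call, e.g. `mdl.train(100, parallel=tomotopy.ParallelScheme.PARTITION)`.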

* 0.4.2 (2019-11-30)
* Wrong topic assignments of `tomotopy.LLDAModel` and `tomotopy.PLDAModel` were fixed.
* Readable __repr__ of `tomotopy.Document` and `tomotopy.Dictionary` was implemented.
2 changes: 1 addition & 1 deletion setup.py
@@ -50,7 +50,7 @@
setup(
name='tomotopy',

version='0.4.2',
version='0.5.0',

description='Tomoto, The Topic Modeling Tool for Python',
long_description=long_description,
61 changes: 49 additions & 12 deletions src/TopicModel/CTModel.hpp
@@ -16,7 +16,7 @@ namespace tomoto
{
};

template<TermWeight _TW, size_t _Flags = 0,
template<TermWeight _TW, size_t _Flags = flags::partitioned_multisampling,
typename _Interface = ICTModel,
typename _Derived = void,
typename _DocType = DocumentCTM<_TW>,
@@ -55,6 +55,8 @@ namespace tomoto
Eigen::Matrix<FLOAT, -1, 1> pbeta, lowerBound, upperBound;
constexpr FLOAT epsilon = 1e-8;
constexpr size_t burnIn = 3;
sample::FastRealGenerator frg;

pbeta = lowerBound = upperBound = Eigen::Matrix<FLOAT, -1, 1>::Zero(this->K);
for (size_t i = 0; i < numBetaSample + burnIn; ++i)
{
@@ -66,7 +68,7 @@
{
FLOAT N_k = doc.numByTopic[k] + this->alpha;
FLOAT N_nk = doc.getSumWordWeight() + this->alpha * (this->K + 1) - N_k;
FLOAT u1 = std::generate_canonical<FLOAT, 32>(rg), u2 = std::generate_canonical<FLOAT, 32>(rg);
FLOAT u1 = frg(rg), u2 = frg(rg);
FLOAT max_uk = epsilon + pow(u1, (FLOAT)1 / N_k) * (pbeta[k] - epsilon);
FLOAT min_unk = (1 - pow(u2, (FLOAT)1 / N_nk))
* (1 - pbeta[k]) + pbeta[k];
@@ -104,12 +106,47 @@
doc.smBeta /= doc.smBeta.array().sum();
}

void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const
template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
BaseClass::sampleDocument(doc, docId, ld, rgs, iterationCnt);
if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0)
BaseClass::template sampleDocument<_ps>(doc, docId, ld, rgs, iterationCnt, partitionId);
/*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0)
{
updateBeta(doc, rgs);
}*/
}

template<typename _DocIter>
void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) const
{
if (this->iterated < this->burnIn || !this->optimInterval || (this->iterated + 1) % this->optimInterval != 0) return;

if (pool)
{
std::vector<std::future<void>> res;
const size_t chStride = pool->getNumWorkers() * 8;
size_t dist = std::distance(first, last);
for (size_t ch = 0; ch < chStride; ++ch)
{
auto b = first, e = first;
std::advance(b, dist * ch / chStride);
std::advance(e, dist * (ch + 1) / chStride);
res.emplace_back(pool->enqueue([&, ch, chStride](size_t threadId, _DocIter b, _DocIter e)
{
for (auto doc = b; doc != e; ++doc)
{
updateBeta(*doc, rgs[threadId]);
}
}, b, e));
}
for (auto& r : res) r.get();
}
else
{
for (auto doc = first; doc != last; ++doc)
{
updateBeta(*doc, rgs[0]);
}
}
}

@@ -130,7 +167,7 @@ namespace tomoto
}
}, ch));
}
for (auto&& r : res) r.get();
for (auto& r : res) r.get();
return 0;
}

@@ -211,39 +248,39 @@
return ret;
}

std::vector<FLOAT> getPriorMean() const
std::vector<FLOAT> getPriorMean() const override
{
return { topicPrior.mean.data(), topicPrior.mean.data() + topicPrior.mean.size() };
}

std::vector<FLOAT> getPriorCov() const
std::vector<FLOAT> getPriorCov() const override
{
return { topicPrior.cov.data(), topicPrior.cov.data() + topicPrior.cov.size() };
}

std::vector<FLOAT> getCorrelationTopic(TID k) const
std::vector<FLOAT> getCorrelationTopic(TID k) const override
{
Eigen::Matrix<FLOAT, -1, 1> ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt();
return { ret.data(), ret.data() + ret.size() };
}

GETTER(NumBetaSample, size_t, numBetaSample);

void setNumBetaSample(size_t _numSample)
void setNumBetaSample(size_t _numSample) override
{
numBetaSample = _numSample;
}

GETTER(NumDocBetaSample, size_t, numDocBetaSample);

void setNumDocBetaSample(size_t _numSample)
void setNumDocBetaSample(size_t _numSample) override
{
numDocBetaSample = _numSample;
}

GETTER(NumTMNSample, size_t, numTMNSample);

void setNumTMNSample(size_t _numSample)
void setNumTMNSample(size_t _numSample) override
{
numTMNSample = _numSample;
}
11 changes: 6 additions & 5 deletions src/TopicModel/DMRModel.hpp
@@ -16,7 +16,7 @@ namespace tomoto
Eigen::Matrix<FLOAT, -1, 1> tmpK;
};

template<TermWeight _TW, size_t _Flags = 0,
template<TermWeight _TW, size_t _Flags = flags::partitioned_multisampling,
typename _Interface = IDMRModel,
typename _Derived = void,
typename _DocType = DocumentDMR<_TW>,
@@ -69,6 +69,7 @@ namespace tomoto
res.emplace_back(pool.enqueue([&](size_t threadId)
{
auto& tmpK = localData[threadId].tmpK;
if (!tmpK.size()) tmpK.resize(this->K);
Eigen::Matrix<FLOAT, -1, 1> val = Eigen::Matrix<FLOAT, -1, 1>::Zero(K * F + 1);
for (size_t docId = ch; docId < this->docs.size(); docId += chStride)
{
@@ -95,7 +96,7 @@
return val;
}));
}
for (auto&& r : res)
for (auto& r : res)
{
auto ret = r.get();
fx += ret[K * F];
@@ -279,12 +280,12 @@
GETTER(AlphaEps, FLOAT, alphaEps);
GETTER(OptimRepeat, size_t, optimRepeat);

void setAlphaEps(FLOAT _alphaEps)
void setAlphaEps(FLOAT _alphaEps) override
{
alphaEps = _alphaEps;
}

void setOptimRepeat(size_t _optimRepeat)
void setOptimRepeat(size_t _optimRepeat) override
{
optimRepeat = _optimRepeat;
}
@@ -312,7 +313,7 @@
return { l.data(), l.data() + F };
}

const Dictionary& getMetadataDict() const { return metadataDict; }
const Dictionary& getMetadataDict() const override { return metadataDict; }
};

/* This is for preventing 'undefined symbol' problem in compiling by clang. */
6 changes: 3 additions & 3 deletions src/TopicModel/GDMRModel.hpp
@@ -14,7 +14,7 @@ namespace tomoto
std::vector<size_t> ndimCnt;
};

template<TermWeight _TW, size_t _Flags = 0,
template<TermWeight _TW, size_t _Flags = flags::partitioned_multisampling,
typename _Interface = IGDMRModel,
typename _Derived = void,
typename _DocType = DocumentGDMR<_TW, _Flags>,
@@ -136,7 +136,7 @@
return ret;
}));
}
for (auto&& r : res)
for (auto& r : res)
{
auto ret = r.get();
fx += ret[K * F];
@@ -310,7 +310,7 @@
GETTER(Fs, const std::vector<size_t>&, degreeByF);
GETTER(Sigma0, FLOAT, sigma0);

void setSigma0(FLOAT _sigma0)
void setSigma0(FLOAT _sigma0) override
{
this->sigma0 = _sigma0;
}
24 changes: 13 additions & 11 deletions src/TopicModel/HDPModel.hpp
@@ -140,7 +140,7 @@ namespace tomoto
const size_t V = this->realV;
const auto K = ld.numByTopic.size();
ld.topicLikelihood.resize(K + 1);
ld.topicLikelihood.head(K) = ld.zLikelihood.array().template cast<FLOAT>() * ld.numTableByTopic.array().template cast<FLOAT>();
ld.topicLikelihood.head(K) = ld.zLikelihood.head(K).array().template cast<FLOAT>() * ld.numTableByTopic.array().template cast<FLOAT>();
ld.topicLikelihood[K] = ld.zLikelihood[K] * gamma;
sample::prefixSum(ld.topicLikelihood.data(), ld.topicLikelihood.size());
return &ld.topicLikelihood[0];
@@ -190,7 +190,8 @@ namespace tomoto
}
}

void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const
template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
for (size_t w = 0; w < doc.words.size(); ++w)
{
@@ -254,7 +255,7 @@

void updateGlobalInfo(ThreadPool& pool, _ModelState* localData)
{
std::vector<std::future<void>> res(pool.getNumWorkers());
std::vector<std::future<void>> res;
auto& K = this->K;
K = 0;
for (size_t i = 0; i < pool.getNumWorkers(); ++i)
@@ -265,7 +266,7 @@
// synchronize topic size of all documents
for (size_t i = 0; i < pool.getNumWorkers(); ++i)
{
res[i] = pool.enqueue([&, this](size_t threadId, size_t b, size_t e)
res.emplace_back(pool.enqueue([&, this](size_t threadId, size_t b, size_t e)
{
for (size_t j = b; j < e; ++j)
{
@@ -275,14 +276,15 @@
doc.numByTopic.conservativeResize(K);
doc.numByTopic.tail(K - oldSize).setZero();
}
}, this->docs.size() * i / pool.getNumWorkers(), this->docs.size() * (i + 1) / pool.getNumWorkers());
}, this->docs.size() * i / pool.getNumWorkers(), this->docs.size() * (i + 1) / pool.getNumWorkers()));
}
for (auto&& r : res) r.get();
for (auto& r : res) r.get();
}

template<ParallelScheme _ps>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const
{
std::vector<std::future<void>> res(pool.getNumWorkers());
std::vector<std::future<void>> res;
const size_t V = this->realV;
auto K = this->K;

@@ -323,12 +325,12 @@

for (size_t i = 0; i < pool.getNumWorkers(); ++i)
{
res[i] = pool.enqueue([&, this, i](size_t threadId)
res.emplace_back(pool.enqueue([&, this, i](size_t threadId)
{
localData[i] = globalState;
});
}));
}
for (auto&& r : res) r.get();
for (auto& r : res) r.get();
}

/* this LL calculation is based on https://github.com/blei-lab/hdp/blob/master/hdp/state.cpp */
@@ -342,7 +344,7 @@
{
auto& doc = *_first;
ll += doc.getNumTable() * log(alpha) - math::lgammaT(doc.getSumWordWeight() + alpha) + math::lgammaT(alpha);
for (auto&& nt : doc.numTopicByTable)
for (auto& nt : doc.numTopicByTable)
{
if (nt) ll += math::lgammaT(nt.num);
}
2 changes: 2 additions & 0 deletions src/TopicModel/HLDA.h
@@ -13,6 +13,8 @@ namespace tomoto
// Zs indicates level in HLDAModel.
std::vector<int32_t> path;

template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);

DEFINE_SERIALIZER_AFTER_BASE(DocumentLDA<_TW>, path);
};

Expand Down
