Merge pull request #29 from bab2min/develop

fixing issue #28 and supporting missing values for S-LDA
bab2min · Jan 11, 2020 · 7c79eec · 7c79eec
2 parents 85307ed + 40133b9
commit 7c79eec
Show file tree

Hide file tree

Showing 10 changed files with 133 additions and 33 deletions.
diff --git a/README.kr.rst b/README.kr.rst
@@ -30,7 +30,7 @@ tomotopy 란?
 
 더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
 
-tomotopy의 가장 최신버전은 0.5.0 입니다.
+tomotopy의 가장 최신버전은 0.5.1 입니다.
 
 시작하기
 ---------------
@@ -210,6 +210,10 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 
 역사
 -------
+* 0.5.1 (2020-01-11)
+    * `tomotopy.SLDAModel.make_doc`에서 결측값을 지원하지 않던 문제를 해결했습니다.
+    * `tomotopy.SLDAModel`이 이제 결측값을 지원합니다. 결측값을 가진 문헌은 토픽 모델링에는 참여하지만, 응답 변수 회귀에서는 제외됩니다.
+
 * 0.5.0 (2019-12-30)
     * `tomotopy.PAModel.infer`가 topic distribution과 sub-topic distribution을 동시에 반환합니다.
     * `tomotopy.Document`에 get_sub_topics, get_sub_topic_dist 메소드가 추가되었습니다. (PAModel 전용)

diff --git a/README.rst b/README.rst
@@ -31,7 +31,7 @@ The current version of `tomoto` supports several major topic models including
 
 Please visit https://bab2min.github.io/tomotopy to see more information.
 
-The most recent version of tomotopy is 0.5.0.
+The most recent version of tomotopy is 0.5.1.
 
 Getting Started
 ---------------
@@ -215,6 +215,10 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 
 History
 -------
+* 0.5.1 (2020-01-11)
+    * Fixed a bug where `tomotopy.SLDAModel.make_doc` did not support missing values for `y`.
+    * Now `tomotopy.SLDAModel` fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in topic modeling, but excluded from the regression of response variables.
+
 * 0.5.0 (2019-12-30)
     * Now `tomotopy.PAModel.infer` returns both topic distribution and sub-topic distribution.
     * New methods get_sub_topics and get_sub_topic_dist were added into `tomotopy.Document`. (for PAModel)

diff --git a/setup.py b/setup.py
@@ -50,7 +50,7 @@
 setup(
     name='tomotopy',
 
-    version='0.5.0',
+    version='0.5.1',
 
     description='Tomoto, The Topic Modeling Tool for Python',
     long_description=long_description,

diff --git a/src/TopicModel/LDACVB0Model.hpp b/src/TopicModel/LDACVB0Model.hpp
@@ -315,6 +315,20 @@ namespace tomoto
 			return { cnt.data(), cnt.data() + K };
 		}
 
+		template<ParallelScheme _ps>
+		size_t estimateMaxThreads() const // judging by the name, an estimate of the maximum thread count for scheme _ps — confirm against callers
+		{
+			if (_ps == ParallelScheme::partition)
+			{
+				return this->realV / 4; // partition scheme: a quarter of realV (presumably the effective vocabulary size — TODO confirm)
+			}
+			if (_ps == ParallelScheme::copy_merge)
+			{
+				return this->docs.size() / 2; // copy_merge scheme: half the number of documents
+			}
+			return (size_t)-1; // any other scheme: largest size_t value, i.e. no limit is imposed here
+		}
+
 		DEFINE_SERIALIZER(alpha, eta, K);
 
 	public:

diff --git a/src/TopicModel/SLDAModel.hpp b/src/TopicModel/SLDAModel.hpp
@@ -30,7 +30,6 @@ namespace tomoto
 
 			virtual void optimizeCoef(
 				const Eigen::Matrix<FLOAT, -1, -1>& normZ,
-				const Eigen::Matrix<FLOAT, -1, -1>& normZZT,
 				FLOAT mu, FLOAT nuSq,
 				Eigen::Block<Eigen::Matrix<FLOAT, -1, -1>, -1, 1, true> ys
 			) = 0;
@@ -83,13 +82,14 @@ namespace tomoto
 
 			void optimizeCoef(
 				const Eigen::Matrix<FLOAT, -1, -1>& normZ,
-				const Eigen::Matrix<FLOAT, -1, -1>& normZZT,
 				FLOAT mu, FLOAT nuSq,
 				Eigen::Block<Eigen::Matrix<FLOAT, -1, -1>, -1, 1, true> ys
 			) override
 			{
-				this->regressionCoef = (normZZT + Eigen::Matrix<FLOAT, -1, -1>::Identity(normZZT.cols(), normZZT.cols()) / nuSq)
-					.colPivHouseholderQr().solve(normZ * ys);
+				Eigen::Matrix<FLOAT, -1, -1> selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast<FLOAT>(); // column i of normZ is multiplied by 0 when ys[i] is NaN and by 1 otherwise
+				Eigen::Matrix<FLOAT, -1, -1> normZZT = selectedNormZ * selectedNormZ.transpose(); // product built from the masked columns only (now computed per response variable)
+				normZZT += Eigen::Matrix<FLOAT, -1, -1>::Identity(normZZT.cols(), normZZT.cols()) / nuSq; // add identity / nuSq to the diagonal
+				this->regressionCoef = normZZT.colPivHouseholderQr().solve(selectedNormZ * ys.array().isNaN().select(0, ys).matrix()); // NaN responses are replaced by 0 so they cannot propagate NaN into the right-hand side
 			}
 
 			double getLL(FLOAT y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic,
@@ -135,19 +135,22 @@ namespace tomoto
 
 			void optimizeCoef(
 				const Eigen::Matrix<FLOAT, -1, -1>& normZ,
-				const Eigen::Matrix<FLOAT, -1, -1>& normZZT,
 				FLOAT mu, FLOAT nuSq,
 				Eigen::Block<Eigen::Matrix<FLOAT, -1, -1>, -1, 1, true> ys
 			) override
 			{
-				this->regressionCoef = ((normZ * Eigen::DiagonalMatrix<FLOAT, -1>{ omega }) * normZ.transpose()
-					+ Eigen::Matrix<FLOAT, -1, -1>::Identity(normZZT.cols(), normZZT.cols()) / nuSq)
-					.colPivHouseholderQr().solve(normZ * (b * (ys - decltype(ys)::Constant(ys.size(), 0.5f)))
-						+ Eigen::Matrix<FLOAT, -1, 1>::Constant(normZ.rows(), mu / nuSq));
+				Eigen::Matrix<FLOAT, -1, -1> selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast<FLOAT>(); // column i of normZ is multiplied by 0 when ys[i] is NaN and by 1 otherwise
+				Eigen::Matrix<FLOAT, -1, -1> normZZT = selectedNormZ * Eigen::DiagonalMatrix<FLOAT, -1>{ omega } * selectedNormZ.transpose(); // masked columns, weighted by the diagonal matrix built from omega
+				normZZT += Eigen::Matrix<FLOAT, -1, -1>::Identity(normZZT.cols(), normZZT.cols()) / nuSq; // add identity / nuSq to the diagonal
+
+				this->regressionCoef = normZZT
+					.colPivHouseholderQr().solve(selectedNormZ * ys.array().isNaN().select(0, b * (ys.array() - 0.5f)).matrix() // NaN responses contribute 0; the others contribute b * (y - 0.5)
+						+ Eigen::Matrix<FLOAT, -1, 1>::Constant(selectedNormZ.rows(), mu / nuSq));
 
 				RandGen rng;
 				for (size_t i = 0; i < omega.size(); ++i)
 				{
+					if (std::isnan(ys[i])) continue; // omega[i] is left unchanged when the i-th response is missing
 					omega[i] = math::drawPolyaGamma(b, (this->regressionCoef.array() * normZ.col(i).array()).sum(), rng);
 				}
 			}
@@ -208,13 +211,12 @@ namespace tomoto
 			zLikelihood = (doc.numByTopic.array().template cast<FLOAT>() + this->alphas.array())
 				* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
 				/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
-			if (docId != (size_t)-1)
+
+			for (size_t f = 0; f < F; ++f)
 			{
-				for (size_t f = 0; f < F; ++f)
-				{
-					responseVars[f]->updateZLL(zLikelihood, doc.y[f], doc.numByTopic,
-						docId, doc.getSumWordWeight());
-				}
+				if (std::isnan(doc.y[f])) continue;
+				responseVars[f]->updateZLL(zLikelihood, doc.y[f], doc.numByTopic,
+					docId, doc.getSumWordWeight());
 			}
 			sample::prefixSum(zLikelihood.data(), this->K);
 			return &zLikelihood[0];
@@ -227,10 +229,10 @@ namespace tomoto
 				normZ.col(i) = this->docs[i].numByTopic.array().template cast<FLOAT>() / 
 					std::max((FLOAT)this->docs[i].getSumWordWeight(), 0.01f);
 			}
-			Eigen::Matrix<FLOAT, -1, -1> normZZT = normZ * normZ.transpose();
+
 			for (size_t f = 0; f < F; ++f)
 			{
-				responseVars[f]->optimizeCoef(normZ, normZZT, mu[f], nuSq[f], Ys.col(f));
+				responseVars[f]->optimizeCoef(normZ, mu[f], nuSq[f], Ys.col(f));
 			}
 		}
 
@@ -256,6 +258,7 @@ namespace tomoto
 				ll -= math::lgammaT(doc.getSumWordWeight() + this->alphas.sum()) - math::lgammaT(this->alphas.sum());
 				for (size_t f = 0; f < F; ++f)
 				{
+					if (std::isnan(doc.y[f])) continue;
 					ll += responseVars[f]->getLL(doc.y[f], doc.numByTopic, doc.getSumWordWeight());
 				}
 				for (TID k = 0; k < K; ++k)
@@ -357,7 +360,14 @@ namespace tomoto
 		std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<FLOAT>& y) const override
 		{
 			auto doc = this->_makeDocWithinVocab(words);
+			if (y.size() > F) throw std::runtime_error{ text::format( // supplying more responses than the F variables is rejected
+				"size of 'y' is greater than the number of vars.\n"
+				"size of 'y' : %zu, number of vars: %zu", y.size(), F) }; // %zu is the unsigned conversion matching size_t (F is assumed to be size_t as well — confirm)
 			doc.y = y;
+			while (doc.y.size() < F) // pad the absent trailing responses with NaN so doc.y ends up with F entries
+			{
+				doc.y.emplace_back(NAN);
+			}
 			return make_unique<_DocType>(doc);
 		}
 

diff --git a/src/python/docs.h b/src/python/docs.h
@@ -32,14 +32,22 @@ DOC_SIGNATURE_EN_KO(Document_get_topic_dist__doc__,
 	u8R""(현재 문헌의 토픽 확률 분포를 `list` 형태로 반환합니다.)"");
 
 DOC_SIGNATURE_EN_KO(Document_get_sub_topics__doc__,
-	"get_topics(self, top_n=10)",
-	u8R""(Return the `top_n` sub topics with its probability of the document. (for only `tomotopy.PAModel`))"",
-	u8R""(현재 문헌의 상위 `top_n`개의 하위 토픽과 그 확률을 `tuple`의 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용))"");
+	"get_sub_topics(self, top_n=10)",
+	u8R""(.. versionadded:: 0.5.0
+
+Return the `top_n` sub topics with its probability of the document. (for only `tomotopy.PAModel`))"",
+	u8R""(.. versionadded:: 0.5.0
+
+현재 문헌의 상위 `top_n`개의 하위 토픽과 그 확률을 `tuple`의 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용))"");
 
 DOC_SIGNATURE_EN_KO(Document_get_sub_topic_dist__doc__,
-	"get_topic_dist(self)",
-	u8R""(Return a distribution of the sub topics in the document. (for only `tomotopy.PAModel`))"",
-	u8R""(현재 문헌의 하위 토픽 확률 분포를 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용))"");
+	"get_sub_topic_dist(self)",
+	u8R""(.. versionadded:: 0.5.0
+
+Return a distribution of the sub topics in the document. (for only `tomotopy.PAModel`))"",
+	u8R""(.. versionadded:: 0.5.0
+
+현재 문헌의 하위 토픽 확률 분포를 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용))"");
 
 DOC_SIGNATURE_EN_KO(Document_get_words__doc__,
 	"get_words(self, top_n=10)",
@@ -1395,6 +1403,10 @@ words : iterable of str
 y : list of float
     response variables of this document. 
     The length of `y` must be equal to the number of response variables of the model (`tomotopy.SLDAModel.f`).
+    
+    .. versionadded:: 0.5.1
+    
+    If you have a missing value, you can set the item as `NaN`. Documents with `NaN` variables are included in modeling topics, but excluded from regression.
 )"",
 u8R""(현재 모델에 응답 변수 `y`를 포함하는 새로운 문헌을 추가하고 추가된 문헌의 인덱스 번호를 반환합니다.
 
@@ -1404,6 +1416,10 @@ words : iterable of str
     문헌의 각 단어를 나열하는 `str` 타입의 iterable
 y : list of float
     문헌의 응답 변수로 쓰일 `float`의 `list`. `y`의 길이는 모델의 응답 변수의 개수인 `tomotopy.SLDAModel.f`와 일치해야 합니다.
+    
+    .. versionadded:: 0.5.1
+    
+    만약 결측값이 있을 경우, 해당 항목을 `NaN`으로 설정할 수 있습니다. 이 경우 `NaN`값을 가진 문헌은 토픽을 모델링하는데에는 포함되지만, 응답 변수 회귀에서는 제외됩니다.
 )"");
 
 DOC_SIGNATURE_EN_KO(SLDA_make_doc__doc__,
@@ -1417,6 +1433,7 @@ words : iterable of str
 y : list of float
     response variables of this document. 
     The length of `y` doesn't have to be equal to the number of response variables of the model (`tomotopy.SLDAModel.f`).
+    If the length of `y` is shorter than `tomotopy.SLDAModel.f`, missing values are automatically filled with `NaN`.
 )"",
 u8R""(`words` 단어를 바탕으로 새로운 문헌인 `tomotopy.Document` 인스턴스를 반환합니다. 이 인스턴스는 `tomotopy.LDAModel.infer` 메소드에 사용될 수 있습니다.
 
@@ -1427,6 +1444,7 @@ words : iterable of str
 y : list of float
     문헌의 응답 변수로 쓰일 `float`의 `list`. 
     `y`의 길이는 모델의 응답 변수의 개수인 `tomotopy.SLDAModel.f`와 꼭 일치할 필요는 없습니다.
+    `y`의 길이가 `tomotopy.SLDAModel.f`보다 짧을 경우, 모자란 값들은 자동으로 `NaN`으로 채워집니다.
 )"");
 
 DOC_SIGNATURE_EN_KO(SLDA_get_regression_coef__doc__,
@@ -1453,21 +1471,21 @@ DOC_SIGNATURE_EN_KO(SLDA_get_var_type__doc__,
 
 DOC_SIGNATURE_EN_KO(SLDA_estimate__doc__,
 	"estimate(self, doc)",
-	u8R""(Return the estimated response variable for `doc`. 
+	u8R""(Return the estimated response variable for `doc`.
 If `doc` is an unseen document instance which is generated by `tomotopy.SLDAModel.make_doc` method, it should be inferred by `tomotopy.LDAModel.infer` method first.
 
 Parameters
 ----------
 doc : tomotopy.Document
-    an instance of document to be used for estimating response variables
+    an instance of document or a list of them to be used for estimating response variables
 )"",
 	u8R""(`doc`의 추정된 응답 변수를 반환합니다.
 만약 `doc`이 `tomotopy.SLDAModel.make_doc`에 의해 생성된 인스턴스라면, 먼저 `tomotopy.LDAModel.infer`를 통해 토픽 추론을 실시한 다음 이 메소드를 사용해야 합니다.
 
 Parameters
 ----------
 doc : tomotopy.Document
-    응답 변수를 추정하려하는 문헌의 인스턴스
+    응답 변수를 추정하려하는 문헌의 인스턴스 혹은 인스턴스들의 list
 )"");
 
 DOC_VARIABLE_EN_KO(SLDA_f__doc__,

diff --git a/src/python/py_SLDA.cpp b/src/python/py_SLDA.cpp
@@ -235,7 +235,27 @@ static PyObject* SLDA_estimateVars(TopicModelObject* self, PyObject* args, PyObj
 	{
 		if (!self->inst) throw runtime_error{ "inst is null" };
 		auto* inst = static_cast<tomoto::ISLDAModel*>(self->inst);
-		if (Py_TYPE(argDoc) != &Document_type) throw runtime_error{ "'doc' must be tomotopy.Document type" };
+		if (py::UniqueObj iter = PyObject_GetIter(argDoc))
+		{
+			py::UniqueObj nextDoc;
+			std::vector<const tomoto::DocumentBase*> docs;
+			while ((nextDoc = PyIter_Next(iter)))
+			{
+				if (Py_TYPE(nextDoc) != &Document_type) throw runtime_error{ "'doc' must be tomotopy.Document or list of tomotopy.Document" };
+				docs.emplace_back(((DocumentObject*)nextDoc.get())->doc);
+			}
+			if (PyErr_Occurred()) return nullptr;
+			return py::buildPyValueTransform(docs.begin(), docs.end(), [&](const tomoto::DocumentBase* d)
+			{
+				return inst->estimateVars(d);
+			});
+		}
+		else
+		{
+			PyErr_Clear();
+		}
+
+		if (Py_TYPE(argDoc) != &Document_type) throw runtime_error{ "'doc' must be tomotopy.Document or list of tomotopy.Document" };
 		auto* doc = (DocumentObject*)argDoc;
 		if (doc->parentModel != self) throw runtime_error{ "'doc' was from another model, not fit to this model" };
 

diff --git a/test/unit_test.py b/test/unit_test.py
@@ -4,7 +4,7 @@
     (tp.LDAModel, 'test/sample.txt', 0, None, {'k':10}, None),
     (tp.LLDAModel, 'test/sample_with_md.txt', 0, None, {'k':5}, None),
     (tp.PLDAModel, 'test/sample_with_md.txt', 0, None, {'latent_topics':2, 'topics_per_label':2}, None),
-	(tp.PLDAModel, 'test/sample_with_md.txt', 1, lambda x:x, {'latent_topics':2, 'topics_per_label':2}, None),
+    (tp.PLDAModel, 'test/sample_with_md.txt', 1, lambda x:x, {'latent_topics':2, 'topics_per_label':2}, None),
     (tp.HLDAModel, 'test/sample.txt', 0, None, {'depth':3}, [tp.ParallelScheme.NONE]),
     (tp.CTModel, 'test/sample.txt', 0, None, {'k':10}, None),
     (tp.HDPModel, 'test/sample.txt', 0, None, {'initial_k':10}, [tp.ParallelScheme.COPY_MERGE]),
@@ -95,6 +95,28 @@ def infer(cls, inputFile, mdFields, f, kargs, ps):
 
     mdl.infer(unseen_docs, parallel=ps)
 
+def test_estimate_SLDA_PARTITION(cls=tp.SLDAModel, inputFile='test/sample_with_md.txt', mdFields=1, f=lambda x:list(map(float, x)), kargs={'k':10, 'vars':'b'}, ps=tp.ParallelScheme.PARTITION):
+    print('Test estimate')
+    tw = 0
+    print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw]))
+    mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs)
+    print('Adding docs...')
+    unseen_docs = []
+    for n, line in enumerate(open(inputFile, encoding='utf-8')):
+        ch = line.strip().split()
+        if len(ch) < mdFields + 1: continue
+        if n < 20: unseen_docs.append(line)
+        else:
+            if mdFields:
+                mdl.add_doc(ch[mdFields:], f(ch[:mdFields]))
+            else:
+                mdl.add_doc(ch)
+    mdl.train(20, parallel=ps)
+    for n, line in enumerate(unseen_docs):
+        unseen_docs[n] = mdl.make_doc(ch)
+
+    mdl.infer(unseen_docs, parallel=ps)
+    mdl.estimate(unseen_docs)
 
 for model_case in model_cases:
     pss = model_case[5]

diff --git a/tomotopy/documentation.kr.rst b/tomotopy/documentation.kr.rst
@@ -16,7 +16,7 @@ tomotopy 란?
 * Hierarchical PA (`tomotopy.HPAModel`)
 * Correlated Topic Model (`tomotopy.CTModel`)
 
-tomotopy의 가장 최신버전은 0.5.0 입니다.
+tomotopy의 가장 최신버전은 0.5.1 입니다.
 
 .. image:: https://badge.fury.io/py/tomotopy.svg
 
@@ -251,6 +251,10 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 
 역사
 -------
+* 0.5.1 (2020-01-11)
+    * `tomotopy.SLDAModel.make_doc`에서 결측값을 지원하지 않던 문제를 해결했습니다.
+    * `tomotopy.SLDAModel`이 이제 결측값을 지원합니다. 결측값을 가진 문헌은 토픽 모델링에는 참여하지만, 응답 변수 회귀에서는 제외됩니다.
+
 * 0.5.0 (2019-12-30)
     * `tomotopy.PAModel.infer`가 topic distribution과 sub-topic distribution을 동시에 반환합니다.
     * `tomotopy.Document`에 get_sub_topics, get_sub_topic_dist 메소드가 추가되었습니다. (PAModel 전용)

diff --git a/tomotopy/documentation.rst b/tomotopy/documentation.rst
@@ -16,7 +16,7 @@ The current version of `tomoto` supports several major topic models including
 * Hierarchical PA (`tomotopy.HPAModel`)
 * Correlated Topic Model (`tomotopy.CTModel`).
 
-The most recent version of tomotopy is 0.5.0.
+The most recent version of tomotopy is 0.5.1.
 
 .. image:: https://badge.fury.io/py/tomotopy.svg
 
@@ -254,6 +254,10 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 
 History
 -------
+* 0.5.1 (2020-01-11)
+    * Fixed a bug where `tomotopy.SLDAModel.make_doc` did not support missing values for `y`.
+    * Now `tomotopy.SLDAModel` fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in topic modeling, but excluded from the regression of response variables.
+
 * 0.5.0 (2019-12-30)
     * Now `tomotopy.PAModel.infer` returns both topic distribution and sub-topic distribution.
     * New methods get_sub_topics and get_sub_topic_dist were added into `tomotopy.Document`. (for PAModel)