Merge pull request #116 from bab2min/dev
Dev
bab2min authored Apr 26, 2021
2 parents 7217579 + 6b932d2 commit ff09183
Showing 56 changed files with 2,692 additions and 2,790 deletions.
54 changes: 54 additions & 0 deletions examples/dmr_multi_label.py
@@ -0,0 +1,54 @@
'''
This example shows how to train a DMR topic model with multi-metadata using tomotopy.
'''
import itertools

import tomotopy as tp
import numpy as np

# You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
corpus = tp.utils.Corpus()
for line in open('text_mining_year_journal.txt', encoding='utf-8'):
    fd = line.strip().split('\t', maxsplit=2)
    corpus.add_doc(fd[2].split(), multi_metadata=['y_' + fd[0], 'j_' + fd[1]])
# We add the prefix 'y_' for year labels and 'j_' for journal labels.

# We set the range of the first metadata to [2000, 2017]
# and that of the second metadata to [0, 1].
mdl = tp.DMRModel(tw=tp.TermWeight.ONE,
    k=20,
    corpus=corpus
)
mdl.optim_interval = 20
mdl.burn_in = 200

mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))

# Let's train the model
for i in range(0, 2000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04} LL per word: {:.4}'.format(2000, mdl.ll_per_word))

mdl.summary()

year_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('y_'))
journal_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('j_'))

# Calculate the topic distribution for each metadata label using get_topic_prior()
print('Topic distributions by year')
for l in year_labels:
    print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n')

print('Topic distributions by journal')
for l in journal_labels:
    print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n')

# We can also estimate topic distributions conditioned on multiple metadata labels
print('Topic distributions by year-journal')
for y, j in itertools.product(year_labels, journal_labels):
    print(y, ',', j, '\n', mdl.get_topic_prior(multi_metadata=[y, j]), '\n')
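get_topic_prior() returns a plain array of per-topic prior weights, so results for many labels can be stacked for further analysis or plotting. A minimal sketch of that, not part of the commit itself, reusing mdl and year_labels from the example above:

# Stack per-year topic priors into a (num_years, k) matrix.
year_prior = np.stack([
    np.asarray(mdl.get_topic_prior(multi_metadata=[y])) for y in year_labels
])
print('shape:', year_prior.shape)  # (len(year_labels), mdl.k)

# Strongest topic for each year label
for y, row in zip(year_labels, year_prior):
    print(y, '-> top topic', int(np.argmax(row)))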
2 changes: 1 addition & 1 deletion examples/dmr_plot.py
@@ -38,7 +38,7 @@
for i in range(0, 2000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
-print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))
+print('Iteration: {:04} LL per word: {:.4}'.format(2000, mdl.ll_per_word))

mdl.summary()

6 changes: 3 additions & 3 deletions setup.py
@@ -26,13 +26,13 @@
    cargs = ['/O2', '/MT', '/Gy']
    arch_levels = {'':'', 'sse2':'/arch:SSE2', 'avx':'/arch:AVX', 'avx2':'/arch:AVX2'}
elif platform.system() == 'Darwin':
-    cargs = ['-std=c++0x', '-O3', '-fpermissive', '-stdlib=libc++', '-Wno-unused-variable', '-Wno-switch']
+    cargs = ['-std=c++1y', '-O3', '-fpermissive', '-stdlib=libc++', '-Wno-unused-variable', '-Wno-switch']
    largs += ['-stdlib=libc++']
    if 'many' not in os.environ.get('AUDITWHEEL_PLAT', ''): arch_levels = {'':'-march=native'}
elif 'many' in os.environ.get('AUDITWHEEL_PLAT', ''):
-    cargs = ['-std=c++0x', '-O3', '-fpermissive', '-g0', '-Wno-unused-variable', '-Wno-switch']
+    cargs = ['-std=c++1y', '-O3', '-fpermissive', '-g0', '-Wno-unused-variable', '-Wno-switch']
else:
-    cargs = ['-std=c++0x', '-O3', '-fpermissive', '-Wno-unused-variable', '-Wno-switch']
+    cargs = ['-std=c++1y', '-O3', '-fpermissive', '-Wno-unused-variable', '-Wno-switch']
    arch_levels = {'':'-march=native'}

if struct.calcsize('P') < 8: arch_levels = {k:v for k, v in arch_levels.items() if k in ('', 'sse2')}
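The -std switch moves from c++0x to c++1y because the sources now rely on C++14 features such as std::make_unique (see the changes below), while the arch_levels table keeps driving the per-instruction-set builds (none, SSE2, AVX, AVX2). Which variant was actually loaded can be checked from Python; a tiny check using the documented tomotopy.isa attribute:

# Print the SIMD instruction set the installed tomotopy build is using.
import tomotopy as tp

print(tp.isa)  # e.g. 'avx2', 'avx', 'sse2' or 'none'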
2 changes: 1 addition & 1 deletion src/Coherence/CoherenceModel.hpp
@@ -25,7 +25,7 @@ namespace tomoto
    void init(size_t windowSize)
    {
        pe_type = _pe;
-       pe = make_unique<ProbEstimator<_pe>>(windowSize);
+       pe = std::make_unique<ProbEstimator<_pe>>(windowSize);
    }

template<ProbEstimation _pe, typename _TargetIter>
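CoherenceModel is the backend of the Python-level tomotopy.coherence module. For context, a short sketch of how it is typically driven for an already-trained model mdl, following the pattern in the tomotopy documentation (treat the exact keyword names as assumptions):

# Score topic coherence for an already-trained model `mdl`.
import tomotopy as tp

coh = tp.coherence.Coherence(mdl, coherence='c_v')  # 'u_mass', 'c_uci', 'c_npmi' also supported
for k in range(mdl.k):
    print('Topic #{}: coherence {:.4f}'.format(k, coh.get_score(topic_id=k)))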
72 changes: 60 additions & 12 deletions src/Labeling/FoRelevance.cpp
@@ -6,6 +6,55 @@

using namespace tomoto::label;

template<bool reverse = false>
class DocWordIterator
{
    const tomoto::DocumentBase* doc = nullptr;
    size_t n = 0;
public:
    DocWordIterator(const tomoto::DocumentBase* _doc = nullptr, size_t _n = 0)
        : doc{ _doc }, n{ _n }
    {
    }

    tomoto::Vid operator[](size_t i) const
    {
        return doc->words[doc->wOrder.empty() ? (n + i) : doc->wOrder[n + i]];
    }

    tomoto::Vid operator*() const
    {
        return doc->words[doc->wOrder.empty() ? n : doc->wOrder[n]];
    }

    bool operator==(const DocWordIterator& o) const
    {
        return doc == o.doc && n == o.n;
    }

    bool operator!=(const DocWordIterator& o) const
    {
        return !operator==(o);
    }

    DocWordIterator& operator++()
    {
        if (reverse) --n;
        else ++n;
        return *this;
    }

    DocWordIterator operator+(ptrdiff_t o) const
    {
        return { doc, (size_t)((ptrdiff_t)n + o) };
    }

    DocWordIterator operator-(ptrdiff_t o) const
    {
        return { doc, (size_t)((ptrdiff_t)n - o) };
    }
};

class DocWrapper
{
    const tomoto::DocumentBase* doc;
@@ -25,24 +74,24 @@ class DocWrapper
        return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
    }

-   auto begin() const -> decltype(doc->words.begin())
+   DocWordIterator<> begin() const
    {
-       return doc->words.begin();
+       return { doc, 0 };
    }

-   auto end() const -> decltype(doc->words.end())
+   DocWordIterator<> end() const
    {
-       return doc->words.end();
+       return { doc, doc->words.size() };
    }

-   auto rbegin() const -> decltype(doc->words.rbegin())
+   DocWordIterator<true> rbegin() const
    {
-       return doc->words.rbegin();
+       return { doc, doc->words.size() };
    }

-   auto rend() const -> decltype(doc->words.rend())
+   DocWordIterator<true> rend() const
    {
-       return doc->words.rend();
+       return { doc, 0 };
    }
};

@@ -99,7 +148,6 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
return candidates;
}


std::vector<Candidate> tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const
{
auto& vocabFreqs = tm->getVocabCf();
@@ -217,11 +265,11 @@ void FoRelevance::estimateContexts()
}
}

-   Eigen::Matrix<Float, -1, -1> wordTopicDist{ tm->getV(), tm->getK() };
+   Matrix wordTopicDist{ tm->getV(), tm->getK() };
    for (size_t i = 0; i < tm->getK(); ++i)
    {
        auto dist = tm->getWidsByTopic(i);
-       wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
+       wordTopicDist.col(i) = Eigen::Map<Vector>{ dist.data(), (Eigen::Index)dist.size() };
    }

size_t totDocCnt = 0;
@@ -256,7 +304,7 @@ void FoRelevance::estimateContexts()
    }

    size_t docCnt = 0;
-   Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
+   Vector wcPMI = Vector::Zero(this->tm->getV());
    for (auto& docId : c.docIds)
    {
        thread_local Eigen::VectorXi bdf(this->tm->getV());
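These changes touch the backend of the topic-labeling API exposed in Python as tomotopy.label; the DocWordIterator added above lets the candidate extractor walk a document's words in their original order (wOrder) rather than in storage order. For context, a short sketch of the labeling pipeline for an already-trained model mdl, following the pattern in the tomotopy documentation (the parameter values are only illustrative):

# Extract label candidates and rank them per topic for a trained model `mdl`.
import tomotopy as tp

extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
cands = extractor.extract(mdl)

labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
for k in range(mdl.k):
    labels = ', '.join(lbl for lbl, score in labeler.get_topic_labels(k, top_n=5))
    print('Topic #{}: {}'.format(k, labels))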
4 changes: 2 additions & 2 deletions src/Labeling/FoRelevance.h
@@ -93,8 +93,8 @@ namespace tomoto
    if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
    if (numWorkers > 1)
    {
-       pool = make_unique<ThreadPool>(numWorkers);
-       mtx = make_unique<std::mutex[]>(numWorkers);
+       pool = std::make_unique<ThreadPool>(numWorkers);
+       mtx = std::make_unique<std::mutex[]>(numWorkers);
    }

for (; candFirst != candEnd; ++candFirst)
4 changes: 2 additions & 2 deletions src/TopicModel/CT.h
@@ -8,8 +8,8 @@ namespace tomoto
    {
        using BaseDocument = DocumentLDA<_tw>;
        using DocumentLDA<_tw>::DocumentLDA;
-       Eigen::Matrix<Float, -1, -1> beta; // Dim: (K, betaSample)
-       Eigen::Matrix<Float, -1, 1> smBeta; // Dim: K
+       Matrix beta; // Dim: (K, betaSample)
+       Vector smBeta; // Dim: K

        DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, smBeta);
        DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, smBeta);
16 changes: 8 additions & 8 deletions src/TopicModel/CTModel.hpp
@@ -56,14 +56,14 @@ namespace tomoto

    void updateBeta(_DocType& doc, _RandGen& rg) const
    {
-       Eigen::Matrix<Float, -1, 1> pbeta, lowerBound, upperBound;
+       Vector pbeta, lowerBound, upperBound;
        constexpr Float epsilon = 1e-8;
        constexpr size_t burnIn = 3;

-       pbeta = lowerBound = upperBound = Eigen::Matrix<Float, -1, 1>::Zero(this->K);
+       pbeta = lowerBound = upperBound = Vector::Zero(this->K);
        for (size_t i = 0; i < numBetaSample + burnIn; ++i)
        {
-           if (i == 0) pbeta = Eigen::Matrix<Float, -1, 1>::Ones(this->K);
+           if (i == 0) pbeta = Vector::Ones(this->K);
            else pbeta = doc.beta.col(i % numBetaSample).array().exp();

            Float betaESum = pbeta.sum() + 1;
@@ -199,7 +199,7 @@ namespace tomoto
    for (; _first != _last; ++_first)
    {
        auto& doc = *_first;
-       Eigen::Matrix<Float, -1, 1> pbeta = doc.smBeta.array().log();
+       Vector pbeta = doc.smBeta.array().log();
        Float last = pbeta[K - 1];
        for (Tid k = 0; k < K; ++k)
        {
@@ -215,16 +215,16 @@
    void prepareDoc(_DocType& doc, size_t docId, size_t wordSize) const
    {
        BaseClass::prepareDoc(doc, docId, wordSize);
-       doc.beta = Eigen::Matrix<Float, -1, -1>::Zero(this->K, numBetaSample);
-       doc.smBeta = Eigen::Matrix<Float, -1, 1>::Constant(this->K, (Float)1 / this->K);
+       doc.beta = Matrix::Zero(this->K, numBetaSample);
+       doc.smBeta = Vector::Constant(this->K, (Float)1 / this->K);
    }

    void updateDocs()
    {
        BaseClass::updateDocs();
        for (auto& doc : this->docs)
        {
-           doc.beta = Eigen::Matrix<Float, -1, -1>::Zero(this->K, numBetaSample);
+           doc.beta = Matrix::Zero(this->K, numBetaSample);
        }
    }

@@ -274,7 +274,7 @@

    std::vector<Float> getCorrelationTopic(Tid k) const override
    {
-       Eigen::Matrix<Float, -1, 1> ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt();
+       Vector ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt();
        return { ret.data(), ret.data() + ret.size() };
    }

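getCorrelationTopic converts the covariance of the logistic-normal topic prior into Pearson correlations, cov(k, k') / sqrt(cov(k, k) * cov(k', k')). At the Python level this presumably backs CTModel.get_correlations; a small sketch under that assumption, reusing a tp.utils.Corpus like the one built in the example at the top:

# Inspect which topics co-occur with topic 0 in a correlated topic model.
import numpy as np
import tomotopy as tp

ct = tp.CTModel(k=20, corpus=corpus)        # `corpus` is any tp.utils.Corpus
ct.train(1000)

corr = np.asarray(ct.get_correlations(0))   # correlations of topic 0 with every topic
most_related = np.argsort(corr)[::-1][1:4]  # skip topic 0 itself
print('Topics most correlated with #0:', most_related, corr[most_related])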
14 changes: 13 additions & 1 deletion src/TopicModel/DMR.h
@@ -11,11 +11,15 @@ namespace tomoto
    using BaseDocument = DocumentLDA<_tw>;
    using DocumentLDA<_tw>::DocumentLDA;
    uint64_t metadata = 0;
+   std::vector<uint64_t> multiMetadata;
+   Vector mdVec;
+   size_t mdHash = (size_t)-1;
+   mutable Matrix cachedAlpha;

    RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;

    DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
-   DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata);
+   DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata, multiMetadata);
};

struct DMRArgs : public LDAArgs
@@ -36,10 +40,18 @@
    virtual void setOptimRepeat(size_t repeat) = 0;
    virtual size_t getOptimRepeat() const = 0;
    virtual size_t getF() const = 0;
+   virtual size_t getMdVecSize() const = 0;
    virtual Float getSigma() const = 0;
    virtual const Dictionary& getMetadataDict() const = 0;
+   virtual const Dictionary& getMultiMetadataDict() const = 0;
    virtual std::vector<Float> getLambdaByMetadata(size_t metadataId) const = 0;
    virtual std::vector<Float> getLambdaByTopic(Tid tid) const = 0;
+
+   virtual std::vector<Float> getTopicPrior(
+       const std::string& metadata,
+       const std::vector<std::string>& multiMetadata,
+       bool raw = false
+   ) const = 0;
};

template<TermWeight _tw>
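The new getTopicPrior virtual takes an ordinary metadata string, a list of multi-metadata labels and a raw flag; it is what the get_topic_prior calls in the new example file resolve to. The multiMetadata and mdVec fields added to the document struct also feed inference on unseen documents. A sketch of that use, assuming DMRModel.make_doc accepts the same multi_metadata keyword as Corpus.add_doc does in the example (mdl, year_labels and journal_labels are reused from there):

# Infer the topic distribution of an unseen document under chosen labels.
unseen_words = ['topic', 'model', 'text', 'mining']
doc = mdl.make_doc(unseen_words, multi_metadata=[year_labels[-1], journal_labels[0]])
topic_dist, ll = mdl.infer(doc)
print('topic distribution:', topic_dist)
print('log-likelihood:', ll)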