Merge pull request #116 from bab2min/dev
Dev
bab2min authored Apr 26, 2021
2 parents 7217579 + 6b932d2 commit ff09183
Showing 56 changed files with 2,692 additions and 2,790 deletions.
54 changes: 54 additions & 0 deletions examples/dmr_multi_label.py
@@ -0,0 +1,54 @@
'''
This example shows how to train a DMR topic model with multi-metadata using tomotopy.
'''
import itertools

import tomotopy as tp
import numpy as np

# You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
corpus = tp.utils.Corpus()
for line in open('text_mining_year_journal.txt', encoding='utf-8'):
    fd = line.strip().split('\t', maxsplit=2)
    corpus.add_doc(fd[2].split(), multi_metadata=['y_' + fd[0], 'j_' + fd[1]])
# We add the prefix 'y_' for year labels and 'j_' for journal labels.

# We set the range of the first metadata to [2000, 2017]
# and that of the second metadata to [0, 1].
mdl = tp.DMRModel(tw=tp.TermWeight.ONE,
    k=20,
    corpus=corpus
)
mdl.optim_interval = 20
mdl.burn_in = 200

mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))

# Let's train the model
for i in range(0, 2000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04} LL per word: {:.4}'.format(2000, mdl.ll_per_word))

mdl.summary()

year_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('y_'))
journal_labels = sorted(l for l in mdl.multi_metadata_dict if l.startswith('j_'))

# Calculate the topic distribution for each metadata label using get_topic_prior()
print('Topic distributions by year')
for l in year_labels:
    print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n')

print('Topic distributions by journal')
for l in journal_labels:
    print(l, '\n', mdl.get_topic_prior(multi_metadata=[l]), '\n')

# We can also estimate topic distributions conditioned on multiple metadata labels
print('Topic distributions by year-journal')
for y, j in itertools.product(year_labels, journal_labels):
    print(y, ',', j, '\n', mdl.get_topic_prior(multi_metadata=[y, j]), '\n')
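get_topic_prior() returns a plain array of per-topic prior weights, so results for many labels can be stacked for further analysis or plotting. A minimal sketch of that, not part of the commit itself, reusing mdl and year_labels from the example above:

# Stack per-year topic priors into a (num_years, k) matrix.
year_prior = np.stack([
    np.asarray(mdl.get_topic_prior(multi_metadata=[y])) for y in year_labels
])
print('shape:', year_prior.shape)  # (len(year_labels), mdl.k)

# Strongest topic for each year label
for y, row in zip(year_labels, year_prior):
    print(y, '-> top topic', int(np.argmax(row)))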
2 changes: 1 addition & 1 deletion examples/dmr_plot.py
@@ -38,7 +38,7 @@
for i in range(0, 2000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
-print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))
+print('Iteration: {:04} LL per word: {:.4}'.format(2000, mdl.ll_per_word))

mdl.summary()

6 changes: 3 additions & 3 deletions setup.py
@@ -26,13 +26,13 @@
    cargs = ['/O2', '/MT', '/Gy']
    arch_levels = {'':'', 'sse2':'/arch:SSE2', 'avx':'/arch:AVX', 'avx2':'/arch:AVX2'}
elif platform.system() == 'Darwin':
-    cargs = ['-std=c++0x', '-O3', '-fpermissive', '-stdlib=libc++', '-Wno-unused-variable', '-Wno-switch']
+    cargs = ['-std=c++1y', '-O3', '-fpermissive', '-stdlib=libc++', '-Wno-unused-variable', '-Wno-switch']
    largs += ['-stdlib=libc++']
    if 'many' not in os.environ.get('AUDITWHEEL_PLAT', ''): arch_levels = {'':'-march=native'}
elif 'many' in os.environ.get('AUDITWHEEL_PLAT', ''):
-    cargs = ['-std=c++0x', '-O3', '-fpermissive', '-g0', '-Wno-unused-variable', '-Wno-switch']
+    cargs = ['-std=c++1y', '-O3', '-fpermissive', '-g0', '-Wno-unused-variable', '-Wno-switch']
else:
-    cargs = ['-std=c++0x', '-O3', '-fpermissive', '-Wno-unused-variable', '-Wno-switch']
+    cargs = ['-std=c++1y', '-O3', '-fpermissive', '-Wno-unused-variable', '-Wno-switch']
    arch_levels = {'':'-march=native'}

if struct.calcsize('P') < 8: arch_levels = {k:v for k, v in arch_levels.items() if k in ('', 'sse2')}
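The -std switch moves from c++0x to c++1y because the sources now rely on C++14 features such as std::make_unique (see the changes below), while the arch_levels table keeps driving the per-instruction-set builds (none, SSE2, AVX, AVX2). Which variant was actually loaded can be checked from Python; a tiny check using the documented tomotopy.isa attribute:

# Print the SIMD instruction set the installed tomotopy build is using.
import tomotopy as tp

print(tp.isa)  # e.g. 'avx2', 'avx', 'sse2' or 'none'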
2 changes: 1 addition & 1 deletion src/Coherence/CoherenceModel.hpp
@@ -25,7 +25,7 @@ namespace tomoto
    void init(size_t windowSize)
    {
        pe_type = _pe;
-       pe = make_unique<ProbEstimator<_pe>>(windowSize);
+       pe = std::make_unique<ProbEstimator<_pe>>(windowSize);
    }

template<ProbEstimation _pe, typename _TargetIter>
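CoherenceModel is the backend of the Python-level tomotopy.coherence module. For context, a short sketch of how it is typically driven for an already-trained model mdl, following the pattern in the tomotopy documentation (treat the exact keyword names as assumptions):

# Score topic coherence for an already-trained model `mdl`.
import tomotopy as tp

coh = tp.coherence.Coherence(mdl, coherence='c_v')  # 'u_mass', 'c_uci', 'c_npmi' also supported
for k in range(mdl.k):
    print('Topic #{}: coherence {:.4f}'.format(k, coh.get_score(topic_id=k)))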
72 changes: 60 additions & 12 deletions src/Labeling/FoRelevance.cpp
@@ -6,6 +6,55 @@

using namespace tomoto::label;

template<bool reverse = false>
class DocWordIterator
{
    const tomoto::DocumentBase* doc = nullptr;
    size_t n = 0;
public:
    DocWordIterator(const tomoto::DocumentBase* _doc = nullptr, size_t _n = 0)
        : doc{ _doc }, n{ _n }
    {
    }

    tomoto::Vid operator[](size_t i) const
    {
        return doc->words[doc->wOrder.empty() ? (n + i) : doc->wOrder[n + i]];
    }

    tomoto::Vid operator*() const
    {
        return doc->words[doc->wOrder.empty() ? n : doc->wOrder[n]];
    }

    bool operator==(const DocWordIterator& o) const
    {
        return doc == o.doc && n == o.n;
    }

    bool operator!=(const DocWordIterator& o) const
    {
        return !operator==(o);
    }

    DocWordIterator& operator++()
    {
        if (reverse) --n;
        else ++n;
        return *this;
    }

    DocWordIterator operator+(ptrdiff_t o) const
    {
        return { doc, (size_t)((ptrdiff_t)n + o) };
    }

    DocWordIterator operator-(ptrdiff_t o) const
    {
        return { doc, (size_t)((ptrdiff_t)n - o) };
    }
};

class DocWrapper
{
    const tomoto::DocumentBase* doc;
@@ -25,24 +74,24 @@ class DocWrapper
        return doc->words[doc->wOrder.empty() ? idx : doc->wOrder[idx]];
    }

-   auto begin() const -> decltype(doc->words.begin())
+   DocWordIterator<> begin() const
    {
-       return doc->words.begin();
+       return { doc, 0 };
    }

-   auto end() const -> decltype(doc->words.end())
+   DocWordIterator<> end() const
    {
-       return doc->words.end();
+       return { doc, doc->words.size() };
    }

-   auto rbegin() const -> decltype(doc->words.rbegin())
+   DocWordIterator<true> rbegin() const
    {
-       return doc->words.rbegin();
+       return { doc, doc->words.size() };
    }

-   auto rend() const -> decltype(doc->words.rend())
+   DocWordIterator<true> rend() const
    {
-       return doc->words.rend();
+       return { doc, 0 };
    }
};

@@ -99,7 +148,6 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel* tm) cons
return candidates;
}


std::vector<Candidate> tomoto::label::PMIBEExtractor::extract(const ITopicModel* tm) const
{
auto& vocabFreqs = tm->getVocabCf();
@@ -217,11 +265,11 @@ void FoRelevance::estimateContexts()
}
}

-   Eigen::Matrix<Float, -1, -1> wordTopicDist{ tm->getV(), tm->getK() };
+   Matrix wordTopicDist{ tm->getV(), tm->getK() };
    for (size_t i = 0; i < tm->getK(); ++i)
    {
        auto dist = tm->getWidsByTopic(i);
-       wordTopicDist.col(i) = Eigen::Map<Eigen::Matrix<Float, -1, 1>>{ dist.data(), (Eigen::Index)dist.size() };
+       wordTopicDist.col(i) = Eigen::Map<Vector>{ dist.data(), (Eigen::Index)dist.size() };
    }

size_t totDocCnt = 0;
@@ -256,7 +304,7 @@ void FoRelevance::estimateContexts()
    }

    size_t docCnt = 0;
-   Eigen::Matrix<Float, -1, 1> wcPMI = Eigen::Matrix<Float, -1, 1>::Zero(this->tm->getV());
+   Vector wcPMI = Vector::Zero(this->tm->getV());
    for (auto& docId : c.docIds)
    {
        thread_local Eigen::VectorXi bdf(this->tm->getV());
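These changes touch the backend of the topic-labeling API exposed in Python as tomotopy.label; the DocWordIterator added above lets the candidate extractor walk a document's words in their original order (wOrder) rather than in storage order. For context, a short sketch of the labeling pipeline for an already-trained model mdl, following the pattern in the tomotopy documentation (the parameter values are only illustrative):

# Extract label candidates and rank them per topic for a trained model `mdl`.
import tomotopy as tp

extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
cands = extractor.extract(mdl)

labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
for k in range(mdl.k):
    labels = ', '.join(lbl for lbl, score in labeler.get_topic_labels(k, top_n=5))
    print('Topic #{}: {}'.format(k, labels))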
4 changes: 2 additions & 2 deletions src/Labeling/FoRelevance.h
@@ -93,8 +93,8 @@ namespace tomoto
    if (!numWorkers) numWorkers = std::thread::hardware_concurrency();
    if (numWorkers > 1)
    {
-       pool = make_unique<ThreadPool>(numWorkers);
-       mtx = make_unique<std::mutex[]>(numWorkers);
+       pool = std::make_unique<ThreadPool>(numWorkers);
+       mtx = std::make_unique<std::mutex[]>(numWorkers);
    }

for (; candFirst != candEnd; ++candFirst)
4 changes: 2 additions & 2 deletions src/TopicModel/CT.h
@@ -8,8 +8,8 @@ namespace tomoto
    {
        using BaseDocument = DocumentLDA<_tw>;
        using DocumentLDA<_tw>::DocumentLDA;
-       Eigen::Matrix<Float, -1, -1> beta; // Dim: (K, betaSample)
-       Eigen::Matrix<Float, -1, 1> smBeta; // Dim: K
+       Matrix beta; // Dim: (K, betaSample)
+       Vector smBeta; // Dim: K

        DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, smBeta);
        DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, smBeta);
16 changes: 8 additions & 8 deletions src/TopicModel/CTModel.hpp
@@ -56,14 +56,14 @@ namespace tomoto

    void updateBeta(_DocType& doc, _RandGen& rg) const
    {
-       Eigen::Matrix<Float, -1, 1> pbeta, lowerBound, upperBound;
+       Vector pbeta, lowerBound, upperBound;
        constexpr Float epsilon = 1e-8;
        constexpr size_t burnIn = 3;

-       pbeta = lowerBound = upperBound = Eigen::Matrix<Float, -1, 1>::Zero(this->K);
+       pbeta = lowerBound = upperBound = Vector::Zero(this->K);
        for (size_t i = 0; i < numBetaSample + burnIn; ++i)
        {
-           if (i == 0) pbeta = Eigen::Matrix<Float, -1, 1>::Ones(this->K);
+           if (i == 0) pbeta = Vector::Ones(this->K);
            else pbeta = doc.beta.col(i % numBetaSample).array().exp();

            Float betaESum = pbeta.sum() + 1;
@@ -199,7 +199,7 @@ namespace tomoto
    for (; _first != _last; ++_first)
    {
        auto& doc = *_first;
-       Eigen::Matrix<Float, -1, 1> pbeta = doc.smBeta.array().log();
+       Vector pbeta = doc.smBeta.array().log();
        Float last = pbeta[K - 1];
        for (Tid k = 0; k < K; ++k)
        {
@@ -215,16 +215,16 @@
    void prepareDoc(_DocType& doc, size_t docId, size_t wordSize) const
    {
        BaseClass::prepareDoc(doc, docId, wordSize);
-       doc.beta = Eigen::Matrix<Float, -1, -1>::Zero(this->K, numBetaSample);
-       doc.smBeta = Eigen::Matrix<Float, -1, 1>::Constant(this->K, (Float)1 / this->K);
+       doc.beta = Matrix::Zero(this->K, numBetaSample);
+       doc.smBeta = Vector::Constant(this->K, (Float)1 / this->K);
    }

    void updateDocs()
    {
        BaseClass::updateDocs();
        for (auto& doc : this->docs)
        {
-           doc.beta = Eigen::Matrix<Float, -1, -1>::Zero(this->K, numBetaSample);
+           doc.beta = Matrix::Zero(this->K, numBetaSample);
        }
    }

@@ -274,7 +274,7 @@

    std::vector<Float> getCorrelationTopic(Tid k) const override
    {
-       Eigen::Matrix<Float, -1, 1> ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt();
+       Vector ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt();
        return { ret.data(), ret.data() + ret.size() };
    }

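getCorrelationTopic converts the covariance of the logistic-normal topic prior into Pearson correlations, cov(k, k') / sqrt(cov(k, k) * cov(k', k')). At the Python level this presumably backs CTModel.get_correlations; a small sketch under that assumption, reusing a tp.utils.Corpus like the one built in the example at the top:

# Inspect which topics co-occur with topic 0 in a correlated topic model.
import numpy as np
import tomotopy as tp

ct = tp.CTModel(k=20, corpus=corpus)        # `corpus` is any tp.utils.Corpus
ct.train(1000)

corr = np.asarray(ct.get_correlations(0))   # correlations of topic 0 with every topic
most_related = np.argsort(corr)[::-1][1:4]  # skip topic 0 itself
print('Topics most correlated with #0:', most_related, corr[most_related])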
14 changes: 13 additions & 1 deletion src/TopicModel/DMR.h
@@ -11,11 +11,15 @@ namespace tomoto
    using BaseDocument = DocumentLDA<_tw>;
    using DocumentLDA<_tw>::DocumentLDA;
    uint64_t metadata = 0;
+   std::vector<uint64_t> multiMetadata;
+   Vector mdVec;
+   size_t mdHash = (size_t)-1;
+   mutable Matrix cachedAlpha;

    RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;

    DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
-   DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata);
+   DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata, multiMetadata);
};

struct DMRArgs : public LDAArgs
@@ -36,10 +40,18 @@
    virtual void setOptimRepeat(size_t repeat) = 0;
    virtual size_t getOptimRepeat() const = 0;
    virtual size_t getF() const = 0;
+   virtual size_t getMdVecSize() const = 0;
    virtual Float getSigma() const = 0;
    virtual const Dictionary& getMetadataDict() const = 0;
+   virtual const Dictionary& getMultiMetadataDict() const = 0;
    virtual std::vector<Float> getLambdaByMetadata(size_t metadataId) const = 0;
    virtual std::vector<Float> getLambdaByTopic(Tid tid) const = 0;
+
+   virtual std::vector<Float> getTopicPrior(
+       const std::string& metadata,
+       const std::vector<std::string>& multiMetadata,
+       bool raw = false
+   ) const = 0;
};

template<TermWeight _tw>
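The new getTopicPrior virtual takes an ordinary metadata string, a list of multi-metadata labels and a raw flag; it is what the get_topic_prior calls in the new example file resolve to. The multiMetadata and mdVec fields added to the document struct also feed inference on unseen documents. A sketch of that use, assuming DMRModel.make_doc accepts the same multi_metadata keyword as Corpus.add_doc does in the example (mdl, year_labels and journal_labels are reused from there):

# Infer the topic distribution of an unseen document under chosen labels.
unseen_words = ['topic', 'model', 'text', 'mining']
doc = mdl.make_doc(unseen_words, multi_metadata=[year_labels[-1], journal_labels[0]])
topic_dist, ll = mdl.infer(doc)
print('topic distribution:', topic_dist)
print('log-likelihood:', ll)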