Skip to content

Commit

Permalink
Merge pull request #31 from bab2min/develop
Browse files Browse the repository at this point in the history
bug fixing including #30
  • Loading branch information
bab2min authored Mar 1, 2020
2 parents 7c79eec + f30807f commit bec1011
Show file tree
Hide file tree
Showing 29 changed files with 493 additions and 146 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
build_windows.bat
*.bin
enwiki-stemmed-1000.txt
/venv/
/venv/
.vscode/
5 changes: 5 additions & 0 deletions README.kr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma

역사
-------
* 0.5.2 (2020-03-01)
* `tomotopy.LLDAModel.add_doc` 실행시 segmentation fault가 발생하는 문제를 해결했습니다.
* `tomotopy.HDPModel`에서 `infer` 실행시 종종 프로그램이 종료되는 문제를 해결했습니다.
* `tomotopy.LDAModel.infer`에서 ps=tomotopy.ParallelScheme.PARTITION, together=True로 실행시 발생하는 오류를 해결했습니다.

* 0.5.1 (2020-01-11)
* `tomotopy.SLDAModel.make_doc`에서 결측값을 지원하지 않던 문제를 해결했습니다.
* `tomotopy.SLDAModel`이 이제 결측값을 지원합니다. 결측값을 가진 문헌은 토픽 모델링에는 참여하지만, 응답 변수 회귀에서는 제외됩니다.
Expand Down
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh

History
-------
* 0.5.2 (2020-03-01)
* A segmentation fault problem was fixed in `tomotopy.LLDAModel.add_doc`.
* A bug that caused `infer` of `tomotopy.HDPModel` to sometimes crash the program was fixed.
* A crash issue in `tomotopy.LDAModel.infer` with ps=tomotopy.ParallelScheme.PARTITION, together=True was fixed.

* 0.5.1 (2020-01-11)
* A bug that `tomotopy.SLDAModel.make_doc` did not support missing values for `y` was fixed.
* Now `tomotopy.SLDAModel` fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in modeling topic, but excluded from regression of response variables.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
setup(
name='tomotopy',

version='0.5.1',
version='0.5.2',

description='Tomoto, The Topic Modeling Tool for Python',
long_description=long_description,
Expand Down
12 changes: 7 additions & 5 deletions src/TopicModel/CTModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,16 @@ namespace tomoto
size_t numDocBetaSample = -1;
math::MultiNormalDistribution<FLOAT> topicPrior;

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
zLikelihood = doc.smBeta.array()
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());
sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
}
Expand Down Expand Up @@ -106,10 +108,10 @@ namespace tomoto
doc.smBeta /= doc.smBeta.array().sum();
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
BaseClass::template sampleDocument<_ps>(doc, docId, ld, rgs, iterationCnt, partitionId);
BaseClass::template sampleDocument<_ps, _infer>(doc, edd, docId, ld, rgs, iterationCnt, partitionId);
/*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0)
{
updateBeta(doc, rgs);
Expand Down
6 changes: 4 additions & 2 deletions src/TopicModel/DMRModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,16 @@ namespace tomoto
return 0;
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
zLikelihood = (doc.numByTopic.array().template cast<FLOAT>() + this->expLambda.col(doc.metadata).array())
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());

sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
Expand Down
6 changes: 4 additions & 2 deletions src/TopicModel/GDMRModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,15 +177,17 @@ namespace tomoto
}
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
getTermsFromMd(ld, &doc.metadataC[0], ld.terms);
zLikelihood = (doc.numByTopic.array().template cast<FLOAT>() + (this->lambda * ld.terms).array().exp() + this->alphaEps)
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());

sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
Expand Down
17 changes: 12 additions & 5 deletions src/TopicModel/HDPModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ namespace tomoto
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
for (size_t w = 0; w < doc.words.size(); ++w)
{
Expand All @@ -200,7 +200,7 @@ namespace tomoto
calcWordTopicProb(ld, doc.words[w]);
auto topicDist = getTopicLikelihoods(ld);
auto dist = getTableLikelihoods(ld, doc, doc.words[w]);
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + 1, rgs);
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + (_infer ? 0 : 1), rgs);
if (doc.Zs[w] == doc.numTopicByTable.size()) // create new table
{
size_t K = ld.numByTopic.size();
Expand Down Expand Up @@ -281,8 +281,8 @@ namespace tomoto
for (auto& r : res) r.get();
}

template<ParallelScheme _ps>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const
template<ParallelScheme _ps, typename _ExtraDocData>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const
{
std::vector<std::future<void>> res;
const size_t V = this->realV;
Expand Down Expand Up @@ -457,6 +457,13 @@ namespace tomoto
{
return this->globalState.numTableByTopic[tid];
}

// Returns the per-topic word proportions of `doc`: each entry is the
// (weighted) count of words assigned to that topic divided by the
// document's total word weight, so the entries sum to ~1.
// NOTE(review): assumes doc.numByTopic holds exactly this->K entries
// and doc.getSumWordWeight() is nonzero — confirm against the document type.
std::vector<FLOAT> getTopicsByDoc(const _DocType& doc) const
{
std::vector<FLOAT> ret(this->K);
// Map the output buffer as an Eigen column vector so the cast and
// normalization are done in one vectorized expression.
Eigen::Map<Eigen::Matrix<FLOAT, -1, 1>> { ret.data(), this->K }.array() = doc.numByTopic.array().template cast<FLOAT>() / doc.getSumWordWeight();
return ret;
}
};

template<TermWeight _TW>
Expand Down
16 changes: 13 additions & 3 deletions src/TopicModel/HLDAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,7 @@ namespace tomoto
addWordToOnlyLocal<INC>(ld, doc, pid, vid, level);
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
Expand All @@ -443,14 +444,23 @@ namespace tomoto
{
if (doc.words[w] >= this->realV) continue;
addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w]);
auto dist = static_cast<const DerivedClass*>(this)->getZLikelihoods(ld, doc, docId, doc.words[w]);
FLOAT* dist;
if (this->etaByTopicWord.size())
{
THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features");
}
else
{
dist = static_cast<const DerivedClass*>(this)->template
getZLikelihoods<false>(ld, doc, docId, doc.words[w]);
}
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + this->K, rgs);
addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w]);
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
// Samples topic/level assignments for a single document by delegating to
// sampleTopics. The _ps/_infer template arguments and the edd, iterationCnt
// and partitionId parameters are accepted only to satisfy the common
// sampler interface; they are unused here.
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
sampleTopics(doc, docId, ld, rgs);
}
Expand Down
2 changes: 1 addition & 1 deletion src/TopicModel/HPAModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ namespace tomoto
}
return nullptr;
}
}
}
28 changes: 19 additions & 9 deletions src/TopicModel/HPAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ namespace tomoto
return std::make_pair<size_t, size_t>(ceil(k * (float)K2 / this->K), ceil((k + 1) * (float)K2 / this->K));
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
Expand Down Expand Up @@ -173,24 +174,32 @@ namespace tomoto
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
size_t b = 0, e = doc.words.size();
if (_ps == ParallelScheme::partition)
{
b = this->chunkOffsetByDoc(partitionId, docId);
e = this->chunkOffsetByDoc(partitionId + 1, docId);
b = edd.chunkOffsetByDoc(partitionId, docId);
e = edd.chunkOffsetByDoc(partitionId + 1, docId);
}

size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0;
size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? edd.vChunkOffset[partitionId - 1] : 0;

const auto K = this->K;
for (size_t w = b; w < e; ++w)
{
if (doc.words[w] >= this->realV) continue;
addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]);
auto dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset);
FLOAT* dist;
if (this->etaByTopicWord.size())
{
THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features");
}
else
{
dist = getZLikelihoods<false>(ld, doc, docId, doc.words[w] - vOffset);
}
if (_Exclusive)
{
auto z = sample::sampleFromDiscreteAcc(dist, dist + K2 + K + 1, rgs);
Expand Down Expand Up @@ -233,12 +242,13 @@ namespace tomoto
}
}

void distributePartition(ThreadPool& pool, _ModelState* localData)
// Intentionally a no-op: this model performs no per-partition state
// distribution. Present only to satisfy the partition-scheme interface.
template<typename _ExtraDocData>
void distributePartition(ThreadPool& pool, const _ModelState& globalState, _ModelState* localData, const _ExtraDocData& edd) const
{
}

template<ParallelScheme _ps>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const
template<ParallelScheme _ps, typename _ExtraDocData>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const
{
std::vector<std::future<void>> res;

Expand Down
5 changes: 4 additions & 1 deletion src/TopicModel/LDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@ namespace tomoto
virtual std::vector<size_t> getCountByTopic() const = 0;
virtual size_t getK() const = 0;
virtual FLOAT getAlpha() const = 0;
virtual FLOAT getAlpha(TID k1) const = 0;
virtual FLOAT getAlpha(TID k) const = 0;
virtual FLOAT getEta() const = 0;

virtual std::vector<FLOAT> getWordPrior(const std::string& word) const = 0;
virtual void setWordPrior(const std::string& word, const std::vector<FLOAT>& priors) = 0;
};
}
20 changes: 10 additions & 10 deletions src/TopicModel/LDACVB0Model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ namespace tomoto
virtual size_t getK() const = 0;
virtual FLOAT getAlpha() const = 0;
virtual FLOAT getEta() const = 0;

virtual std::vector<FLOAT> getWordPrior(const std::string& word) const { return {}; }
virtual void setWordPrior(const std::string& word, const std::vector<FLOAT>& priors) {}
};

template<typename _Interface = ILDACVB0Model,
Expand Down Expand Up @@ -90,13 +93,9 @@ namespace tomoto
template<typename _List>
static FLOAT calcDigammaSum(_List list, size_t len, FLOAT alpha)
{
FLOAT ret = 0;
auto listExpr = Eigen::Matrix<FLOAT, -1, 1>::NullaryExpr(len, list);
auto dAlpha = math::digammaT(alpha);
for (size_t i = 0; i < len; ++i)
{
ret += math::digammaT(list(i) + alpha) - dAlpha;
}
return ret;
return (math::digammaApprox(listExpr.array() + alpha) - dAlpha).sum();
}

void optimizeParameters(ThreadPool& pool, _ModelState* localData)
Expand Down Expand Up @@ -138,8 +137,8 @@ namespace tomoto
if (DEC) ld.numByTopicWord.col(vid) = ld.numByTopicWord.col(vid).cwiseMax(0);
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
for (size_t w = 0; w < doc.words.size(); ++w)
{
Expand All @@ -150,7 +149,8 @@ namespace tomoto
}
}

void updatePartition(ThreadPool& pool, _ModelState* localData)
// Intentionally a no-op: the CVB0 sampler does not use vocabulary
// partitioning, so there is no partition state to update. Present only
// to satisfy the common training interface.
template<typename _DocIter, typename _ExtraDocData>
void updatePartition(ThreadPool& pool, _ModelState* localData, _DocIter first, _DocIter last, _ExtraDocData& edd)
{
}

Expand All @@ -166,7 +166,7 @@ namespace tomoto
forRandom((this->docs.size() - 1 - ch) / chStride + 1, rgs[threadId](), [&, this](size_t id)
{
static_cast<DerivedClass*>(this)->template sampleDocument<ParallelScheme::copy_merge>(
this->docs[id * chStride + ch], id * chStride + ch,
this->docs[id * chStride + ch], 0, id * chStride + ch,
localData[threadId], rgs[threadId], this->iterated);
});
}));
Expand Down
Loading

0 comments on commit bec1011

Please sign in to comment.