Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes, including #30 and #31

Merged
merged 2 commits into from
Mar 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
build_windows.bat
*.bin
enwiki-stemmed-1000.txt
/venv/
/venv/
.vscode/
5 changes: 5 additions & 0 deletions README.kr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma

역사
-------
* 0.5.2 (2020-03-01)
* `tomotopy.LLDAModel.add_doc` 실행시 segmentation fault가 발생하는 문제를 해결했습니다.
* `tomotopy.HDPModel`에서 `infer` 실행시 종종 프로그램이 종료되는 문제를 해결했습니다.
* `tomotopy.LDAModel.infer`에서 ps=tomotopy.ParallelScheme.PARTITION, together=True로 실행시 발생하는 오류를 해결했습니다.

* 0.5.1 (2020-01-11)
* `tomotopy.SLDAModel.make_doc`에서 결측값을 지원하지 않던 문제를 해결했습니다.
* `tomotopy.SLDAModel`이 이제 결측값을 지원합니다. 결측값을 가진 문헌은 토픽 모델링에는 참여하지만, 응답 변수 회귀에서는 제외됩니다.
Expand Down
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh

History
-------
* 0.5.2 (2020-03-01)
* A segmentation fault problem was fixed in `tomotopy.LLDAModel.add_doc`.
* Fixed a bug where `infer` of `tomotopy.HDPModel` sometimes crashed the program.
* Fixed a crash in `tomotopy.LDAModel.infer` when called with ps=tomotopy.ParallelScheme.PARTITION, together=True.

* 0.5.1 (2020-01-11)
* A bug was fixed that `tomotopy.SLDAModel.make_doc` doesn't support missing values for `y`.
* Now `tomotopy.SLDAModel` fully supports missing values for response variables `y`. Documents with missing values (NaN) are included in modeling topic, but excluded from regression of response variables.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
setup(
name='tomotopy',

version='0.5.1',
version='0.5.2',

description='Tomoto, The Topic Modeling Tool for Python',
long_description=long_description,
Expand Down
12 changes: 7 additions & 5 deletions src/TopicModel/CTModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,16 @@ namespace tomoto
size_t numDocBetaSample = -1;
math::MultiNormalDistribution<FLOAT> topicPrior;

// Computes the (unnormalized) per-topic sampling distribution for word `vid`
// of `doc`, as a prefix-summed array ready for discrete sampling.
// Fix: the scraped diff had retained the superseded uniform-eta lines
// (`+ this->eta` / `+ V * this->eta`) alongside their replacements, producing a
// duplicated, syntactically invalid expression; only the etaHelper version
// (which supports asymmetric per-word priors when _asymEta is true) is kept.
template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
	const size_t V = this->realV;
	assert(vid < V);
	// etaHelper abstracts over uniform vs. per-(topic,word) eta priors.
	auto etaHelper = this->template getEtaHelper<_asymEta>();
	auto& zLikelihood = ld.zLikelihood;
	zLikelihood = doc.smBeta.array()
		* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
		/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());
	// Convert per-topic weights into a cumulative array for accumulated sampling.
	sample::prefixSum(zLikelihood.data(), this->K);
	return &zLikelihood[0];
}
Expand Down Expand Up @@ -106,10 +108,10 @@ namespace tomoto
doc.smBeta /= doc.smBeta.array().sum();
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
BaseClass::template sampleDocument<_ps>(doc, docId, ld, rgs, iterationCnt, partitionId);
BaseClass::template sampleDocument<_ps, _infer>(doc, edd, docId, ld, rgs, iterationCnt, partitionId);
/*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0)
{
updateBeta(doc, rgs);
Expand Down
6 changes: 4 additions & 2 deletions src/TopicModel/DMRModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,16 @@ namespace tomoto
return 0;
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
zLikelihood = (doc.numByTopic.array().template cast<FLOAT>() + this->expLambda.col(doc.metadata).array())
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());

sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
Expand Down
6 changes: 4 additions & 2 deletions src/TopicModel/GDMRModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,15 +177,17 @@ namespace tomoto
}
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
assert(vid < V);
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
getTermsFromMd(ld, &doc.metadataC[0], ld.terms);
zLikelihood = (doc.numByTopic.array().template cast<FLOAT>() + (this->lambda * ld.terms).array().exp() + this->alphaEps)
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + this->eta)
/ (ld.numByTopic.array().template cast<FLOAT>() + V * this->eta);
* (ld.numByTopicWord.col(vid).array().template cast<FLOAT>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<FLOAT>() + etaHelper.getEtaSum());

sample::prefixSum(zLikelihood.data(), this->K);
return &zLikelihood[0];
Expand Down
17 changes: 12 additions & 5 deletions src/TopicModel/HDPModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ namespace tomoto
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
for (size_t w = 0; w < doc.words.size(); ++w)
{
Expand All @@ -200,7 +200,7 @@ namespace tomoto
calcWordTopicProb(ld, doc.words[w]);
auto topicDist = getTopicLikelihoods(ld);
auto dist = getTableLikelihoods(ld, doc, doc.words[w]);
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + 1, rgs);
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + doc.numTopicByTable.size() + (_infer ? 0 : 1), rgs);
if (doc.Zs[w] == doc.numTopicByTable.size()) // create new table
{
size_t K = ld.numByTopic.size();
Expand Down Expand Up @@ -281,8 +281,8 @@ namespace tomoto
for (auto& r : res) r.get();
}

template<ParallelScheme _ps>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const
template<ParallelScheme _ps, typename _ExtraDocData>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const
{
std::vector<std::future<void>> res;
const size_t V = this->realV;
Expand Down Expand Up @@ -457,6 +457,13 @@ namespace tomoto
{
return this->globalState.numTableByTopic[tid];
}

std::vector<FLOAT> getTopicsByDoc(const _DocType& doc) const
{
std::vector<FLOAT> ret(this->K);
Eigen::Map<Eigen::Matrix<FLOAT, -1, 1>> { ret.data(), this->K }.array() = doc.numByTopic.array().template cast<FLOAT>() / doc.getSumWordWeight();
return ret;
}
};

template<TermWeight _TW>
Expand Down
16 changes: 13 additions & 3 deletions src/TopicModel/HLDAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,7 @@ namespace tomoto
addWordToOnlyLocal<INC>(ld, doc, pid, vid, level);
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
Expand All @@ -443,14 +444,23 @@ namespace tomoto
{
if (doc.words[w] >= this->realV) continue;
addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w]);
auto dist = static_cast<const DerivedClass*>(this)->getZLikelihoods(ld, doc, docId, doc.words[w]);
FLOAT* dist;
if (this->etaByTopicWord.size())
{
THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features");
}
else
{
dist = static_cast<const DerivedClass*>(this)->template
getZLikelihoods<false>(ld, doc, docId, doc.words[w]);
}
doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + this->K, rgs);
addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w]);
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
sampleTopics(doc, docId, ld, rgs);
}
Expand Down
2 changes: 1 addition & 1 deletion src/TopicModel/HPAModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ namespace tomoto
}
return nullptr;
}
}
}
28 changes: 19 additions & 9 deletions src/TopicModel/HPAModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ namespace tomoto
return std::make_pair<size_t, size_t>(ceil(k * (float)K2 / this->K), ceil((k + 1) * (float)K2 / this->K));
}

template<bool _asymEta>
FLOAT* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
{
const size_t V = this->realV;
Expand Down Expand Up @@ -173,24 +174,32 @@ namespace tomoto
}
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
size_t b = 0, e = doc.words.size();
if (_ps == ParallelScheme::partition)
{
b = this->chunkOffsetByDoc(partitionId, docId);
e = this->chunkOffsetByDoc(partitionId + 1, docId);
b = edd.chunkOffsetByDoc(partitionId, docId);
e = edd.chunkOffsetByDoc(partitionId + 1, docId);
}

size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0;
size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? edd.vChunkOffset[partitionId - 1] : 0;

const auto K = this->K;
for (size_t w = b; w < e; ++w)
{
if (doc.words[w] >= this->realV) continue;
addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]);
auto dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset);
FLOAT* dist;
if (this->etaByTopicWord.size())
{
THROW_ERROR_WITH_INFO(exception::Unimplemented, "Unimplemented features");
}
else
{
dist = getZLikelihoods<false>(ld, doc, docId, doc.words[w] - vOffset);
}
if (_Exclusive)
{
auto z = sample::sampleFromDiscreteAcc(dist, dist + K2 + K + 1, rgs);
Expand Down Expand Up @@ -233,12 +242,13 @@ namespace tomoto
}
}

void distributePartition(ThreadPool& pool, _ModelState* localData)
template<typename _ExtraDocData>
void distributePartition(ThreadPool& pool, const _ModelState& globalState, _ModelState* localData, const _ExtraDocData& edd) const
{
}

template<ParallelScheme _ps>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const
template<ParallelScheme _ps, typename _ExtraDocData>
void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*, const _ExtraDocData& edd) const
{
std::vector<std::future<void>> res;

Expand Down
5 changes: 4 additions & 1 deletion src/TopicModel/LDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@ namespace tomoto
virtual std::vector<size_t> getCountByTopic() const = 0;
virtual size_t getK() const = 0;
virtual FLOAT getAlpha() const = 0;
virtual FLOAT getAlpha(TID k1) const = 0;
virtual FLOAT getAlpha(TID k) const = 0;
virtual FLOAT getEta() const = 0;

virtual std::vector<FLOAT> getWordPrior(const std::string& word) const = 0;
virtual void setWordPrior(const std::string& word, const std::vector<FLOAT>& priors) = 0;
};
}
20 changes: 10 additions & 10 deletions src/TopicModel/LDACVB0Model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ namespace tomoto
virtual size_t getK() const = 0;
virtual FLOAT getAlpha() const = 0;
virtual FLOAT getEta() const = 0;

// Default no-op implementations: getWordPrior reports an empty prior and
// setWordPrior silently ignores its input. NOTE(review): presumably the CVB0
// variant does not support per-word priors — confirm against derived classes.
virtual std::vector<FLOAT> getWordPrior(const std::string& word) const { return {}; }
virtual void setWordPrior(const std::string& word, const std::vector<FLOAT>& priors) {}
};

template<typename _Interface = ILDACVB0Model,
Expand Down Expand Up @@ -90,13 +93,9 @@ namespace tomoto
// Computes sum over i in [0, len) of digamma(list(i) + alpha) - digamma(alpha).
// Fix: the scraped diff retained the superseded scalar loop (`FLOAT ret = 0;`
// + for-loop + `return ret;`) alongside its vectorized replacement, leaving
// the new return unreachable dead code; only the replacement is kept.
// The list elements are produced lazily via an Eigen nullary expression so the
// vectorized digamma approximation runs without materializing an input buffer.
template<typename _List>
static FLOAT calcDigammaSum(_List list, size_t len, FLOAT alpha)
{
	auto listExpr = Eigen::Matrix<FLOAT, -1, 1>::NullaryExpr(len, list);
	auto dAlpha = math::digammaT(alpha);
	return (math::digammaApprox(listExpr.array() + alpha) - dAlpha).sum();
}

void optimizeParameters(ThreadPool& pool, _ModelState* localData)
Expand Down Expand Up @@ -138,8 +137,8 @@ namespace tomoto
if (DEC) ld.numByTopicWord.col(vid) = ld.numByTopicWord.col(vid).cwiseMax(0);
}

template<ParallelScheme _ps>
void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
for (size_t w = 0; w < doc.words.size(); ++w)
{
Expand All @@ -150,7 +149,8 @@ namespace tomoto
}
}

void updatePartition(ThreadPool& pool, _ModelState* localData)
template<typename _DocIter, typename _ExtraDocData>
void updatePartition(ThreadPool& pool, _ModelState* localData, _DocIter first, _DocIter last, _ExtraDocData& edd)
{
}

Expand All @@ -166,7 +166,7 @@ namespace tomoto
forRandom((this->docs.size() - 1 - ch) / chStride + 1, rgs[threadId](), [&, this](size_t id)
{
static_cast<DerivedClass*>(this)->template sampleDocument<ParallelScheme::copy_merge>(
this->docs[id * chStride + ch], id * chStride + ch,
this->docs[id * chStride + ch], 0, id * chStride + ch,
localData[threadId], rgs[threadId], this->iterated);
});
}));
Expand Down
Loading