
Commit

preparing 0.8.2
fixed #59, partially fixed #63
bab2min committed Jul 14, 2020
1 parent 400f060 commit 706bfa0
Showing 73 changed files with 822 additions and 728 deletions.
6 changes: 5 additions & 1 deletion README.kr.rst
@@ -35,7 +35,7 @@ What is tomotopy?

For more details, please see https://bab2min.github.io/tomotopy/index.kr.html.

The latest version of tomotopy is 0.8.1.
The latest version of tomotopy is 0.8.2.

Getting Started
---------------
@@ -240,6 +240,10 @@ Python3 example code for tomotopy can be found at https://github.com/bab2min/tomotopy/blob/ma

History
-------
* 0.8.2 (2020-07-14)
* New properties `tomotopy.DTModel.num_timepoints` and `tomotopy.DTModel.num_docs_by_timepoint` have been added.
* An issue where different platforms produced different results even with the same `seed` has been partially fixed. Because of this fix, training results of the 32-bit build now differ from earlier versions.

* 0.8.1 (2020-06-08)
* A bug where `tomotopy.LDAModel.used_vocabs` returned an incorrect value has been fixed.
* `tomotopy.CTModel.prior_cov` now returns a covariance matrix of shape `[k, k]`.
5 changes: 5 additions & 0 deletions README.rst
@@ -246,6 +246,11 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh

History
-------
* 0.8.2 (2020-07-14)
* New properties `tomotopy.DTModel.num_timepoints` and `tomotopy.DTModel.num_docs_by_timepoint` have been added.
* A bug that caused different results across platforms even when the same `seed` was used has been partially fixed.
    As a result of this fix, the 32-bit build of `tomotopy` now yields training results that differ from earlier versions.

* 0.8.1 (2020-06-08)
* A bug where `tomotopy.LDAModel.used_vocabs` returned an incorrect value was fixed.
* Now `tomotopy.CTModel.prior_cov` returns a covariance matrix with shape `[k, k]`.
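
The two `DTModel` properties listed in the 0.8.2 entry above can be read directly from a trained model. Below is a minimal sketch, not part of this commit; the saved-model file name is illustrative only:

```python
import tomotopy as tp

# load a previously trained dynamic topic model (illustrative path)
mdl = tp.DTModel.load('trained_dtm.bin')

# number of timepoints the model was constructed with (the `t` argument)
print('Timepoints:', mdl.num_timepoints)

# number of documents assigned to each timepoint
print('Docs per timepoint:', mdl.num_docs_by_timepoint)
```
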
44 changes: 44 additions & 0 deletions examples/dtm_plot.py
@@ -0,0 +1,44 @@
import tomotopy as tp
import numpy as np
import nltk

def data_feeder(input_file):
    for line in open(input_file, encoding='utf-8'):
        fd = line.strip().split(maxsplit=1)
        timepoint = int(fd[0])
        yield fd[1], None, {'timepoint':timepoint}

porter_stemmer = nltk.PorterStemmer().stem
corpus = tp.utils.Corpus(
    tokenizer=tp.utils.SimpleTokenizer(porter_stemmer)
)
corpus.process(data_feeder('../test/sample_tp.txt'))

num_timepoints = 13

mdl = tp.DTModel(min_cf=3, k=10, t=num_timepoints, phi_var=1e-2, corpus=corpus)
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

topic_dist_by_time = np.zeros(shape=[num_timepoints, mdl.k], dtype=np.float64)
doc_counts_by_time = np.zeros(shape=[num_timepoints], dtype=np.int32)
# accumulate per-timepoint document counts and topic distributions
for doc in mdl.docs:
    doc_counts_by_time[doc.timepoint] += 1
    topic_dist_by_time[doc.timepoint] += doc.get_topic_dist()

topic_dist_by_time /= doc_counts_by_time[:, np.newaxis]

for k in range(mdl.k):
    print('Topic #{}'.format(k), *(w for w, _ in mdl.get_topic_words(k, 0, top_n=5)))
    print(topic_dist_by_time[:, k])
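
As a side note, the `num_docs_by_timepoint` property added in this release could make the manual per-timepoint counting above unnecessary. A sketch under that assumption (it presumes the property returns one document count per timepoint and reuses `mdl` and `np` from the script above):

```python
# replace the counting loop with the new property (assumed shape: [num_timepoints])
doc_counts_by_time = np.array(mdl.num_docs_by_timepoint)

topic_dist_by_time = np.zeros(shape=[mdl.num_timepoints, mdl.k], dtype=np.float64)
for doc in mdl.docs:
    topic_dist_by_time[doc.timepoint] += doc.get_topic_dist()
topic_dist_by_time /= doc_counts_by_time[:, np.newaxis]
```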

2 changes: 1 addition & 1 deletion examples/gdmr_plot.py
@@ -51,7 +51,7 @@ def __call__(self, value, clip=None):
'''

corpus = tp.utils.Corpus()
for line in open('examples/dataset2.txt', encoding='utf-8'):
for line in open('dataset2.txt', encoding='utf-8'):
    fd = line.strip().split()
    corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2])))

2 changes: 1 addition & 1 deletion src/Labeling/FoRelevance.cpp
@@ -138,7 +138,7 @@ std::vector<Candidate> PMIExtractor::extract(const tomoto::ITopicModel * tm) con
trieNodes[0].traverse_with_keys([&](const TrieEx<Vid, size_t>* node, const std::vector<Vid>& rkeys)
{
if (rkeys.size() <= 2 || node->val < candMinCnt) return;
float n = tm->getN();
float n = (float)tm->getN();
auto pmi = node->val / n;
for (auto k : rkeys)
{
11 changes: 6 additions & 5 deletions src/TopicModel/CT.h
@@ -3,11 +3,11 @@

namespace tomoto
{
template<TermWeight _tw, size_t _Flags = 0>
struct DocumentCTM : public DocumentLDA<_tw, _Flags>
template<TermWeight _tw>
struct DocumentCTM : public DocumentLDA<_tw>
{
using BaseDocument = DocumentLDA<_tw, _Flags>;
using DocumentLDA<_tw, _Flags>::DocumentLDA;
using BaseDocument = DocumentLDA<_tw>;
using DocumentLDA<_tw>::DocumentLDA;
Eigen::Matrix<Float, -1, -1> beta; // Dim: (K, betaSample)
Eigen::Matrix<Float, -1, 1> smBeta; // Dim: K

@@ -21,7 +21,8 @@ namespace tomoto
using DefaultDocType = DocumentCTM<TermWeight::one>;
static ICTModel* create(TermWeight _weight, size_t _K = 1,
Float smoothingAlpha = 0.1, Float _eta = 0.01,
const RandGen& _rg = RandGen{ std::random_device{}() });
size_t seed = std::random_device{}(),
bool scalarRng = false);

virtual void setNumBetaSample(size_t numSample) = 0;
virtual size_t getNumBetaSample() const = 0;
8 changes: 4 additions & 4 deletions src/TopicModel/CTModel.cpp
@@ -2,12 +2,12 @@

namespace tomoto
{
template class CTModel<TermWeight::one>;
/*template class CTModel<TermWeight::one>;
template class CTModel<TermWeight::idf>;
template class CTModel<TermWeight::pmi>;
template class CTModel<TermWeight::pmi>;*/

ICTModel* ICTModel::create(TermWeight _weight, size_t _K, Float smoothingAlpha, Float _eta, const RandGen& _rg)
ICTModel* ICTModel::create(TermWeight _weight, size_t _K, Float smoothingAlpha, Float _eta, size_t seed, bool scalarRng)
{
SWITCH_TW(_weight, CTModel, _K, smoothingAlpha, _eta, _rg);
TMT_SWITCH_TW(_weight, scalarRng, CTModel, _K, smoothingAlpha, _eta, seed);
}
}
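
The factory now takes a plain integer `seed` (plus a `scalarRng` switch) instead of a `RandGen` object, presumably so the Python layer can forward a user-supplied integer seed. A rough reproducibility sketch on the Python side, not taken from this commit (`workers=1` keeps the sampling order deterministic; the toy documents are made up):

```python
import tomotopy as tp

def ll_after_training(seed):
    # the integer seed ends up in the C++ factory, e.g. ICTModel::create(..., seed, scalarRng)
    mdl = tp.CTModel(k=5, seed=seed)
    mdl.add_doc('a small toy document used only for illustration'.split())
    mdl.add_doc('another toy document with a few more words'.split())
    mdl.train(50, workers=1)
    return mdl.ll_per_word

# with the same seed and a single worker, both runs are expected to match on a given platform
print(ll_after_training(42), ll_after_training(42))
```
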
26 changes: 13 additions & 13 deletions src/TopicModel/CTModel.hpp
@@ -16,18 +16,19 @@ namespace tomoto
{
};

template<TermWeight _tw, size_t _Flags = flags::partitioned_multisampling,
template<TermWeight _tw, typename _RandGen,
size_t _Flags = flags::partitioned_multisampling,
typename _Interface = ICTModel,
typename _Derived = void,
typename _DocType = DocumentCTM<_tw>,
typename _ModelState = ModelStateCTM<_tw>>
class CTModel : public LDAModel<_tw, _Flags, _Interface,
typename std::conditional<std::is_same<_Derived, void>::value, CTModel<_tw, _Flags>, _Derived>::type,
class CTModel : public LDAModel<_tw, _RandGen, _Flags, _Interface,
typename std::conditional<std::is_same<_Derived, void>::value, CTModel<_tw, _RandGen, _Flags>, _Derived>::type,
_DocType, _ModelState>
{
protected:
using DerivedClass = typename std::conditional<std::is_same<_Derived, void>::value, CTModel<_tw>, _Derived>::type;
using BaseClass = LDAModel<_tw, _Flags, _Interface, DerivedClass, _DocType, _ModelState>;
using DerivedClass = typename std::conditional<std::is_same<_Derived, void>::value, CTModel<_tw, _RandGen>, _Derived>::type;
using BaseClass = LDAModel<_tw, _RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>;
friend BaseClass;
friend typename BaseClass::BaseClass;
using WeightType = typename BaseClass::WeightType;
@@ -53,12 +54,11 @@ namespace tomoto
return &zLikelihood[0];
}

void updateBeta(_DocType& doc, RandGen& rg) const
void updateBeta(_DocType& doc, _RandGen& rg) const
{
Eigen::Matrix<Float, -1, 1> pbeta, lowerBound, upperBound;
constexpr Float epsilon = 1e-8;
constexpr size_t burnIn = 3;
sample::FastRealGenerator frg;

pbeta = lowerBound = upperBound = Eigen::Matrix<Float, -1, 1>::Zero(this->K);
for (size_t i = 0; i < numBetaSample + burnIn; ++i)
@@ -71,7 +71,7 @@ namespace tomoto
{
Float N_k = doc.numByTopic[k] + this->alpha;
Float N_nk = doc.getSumWordWeight() + this->alpha * (this->K + 1) - N_k;
Float u1 = frg(rg), u2 = frg(rg);
Float u1 = rg.uniform_real(), u2 = rg.uniform_real();
Float max_uk = epsilon + pow(u1, (Float)1 / N_k) * (pbeta[k] - epsilon);
Float min_unk = (1 - pow(u2, (Float)1 / N_nk))
* (1 - pbeta[k]) + pbeta[k];
@@ -111,7 +111,7 @@ namespace tomoto
}

template<ParallelScheme _ps, bool _infer, typename _ExtraDocData>
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
void sampleDocument(_DocType& doc, const _ExtraDocData& edd, size_t docId, _ModelState& ld, _RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const
{
BaseClass::template sampleDocument<_ps, _infer>(doc, edd, docId, ld, rgs, iterationCnt, partitionId);
/*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0)
@@ -121,7 +121,7 @@ namespace tomoto
}

template<typename _DocIter>
void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) const
void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, _RandGen* rgs, _DocIter first, _DocIter last) const
{
if (this->iterated < this->burnIn || !this->optimInterval || (this->iterated + 1) % this->optimInterval != 0) return;

@@ -154,7 +154,7 @@ namespace tomoto
}
}

int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, RandGen* rgs)
int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
{
std::cerr << "Failed to sample! Reset prior and retry!" << std::endl;
const size_t chStride = std::min(pool.getNumWorkers() * 8, this->docs.size());
@@ -175,7 +175,7 @@ namespace tomoto
return 0;
}

void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs)
void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
{
std::vector<std::future<void>> res;
topicPrior = math::MultiNormalDistribution<Float>::estimate([this](size_t i)
@@ -239,7 +239,7 @@ namespace tomoto
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numBetaSample, numTMNSample, topicPrior);
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numBetaSample, numTMNSample, topicPrior);

CTModel(size_t _K = 1, Float smoothingAlpha = 0.1, Float _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() })
CTModel(size_t _K = 1, Float smoothingAlpha = 0.1, Float _eta = 0.01, const _RandGen& _rg = _RandGen{ std::random_device{}() })
: BaseClass(_K, smoothingAlpha, _eta, _rg)
{
this->optimInterval = 2;
11 changes: 6 additions & 5 deletions src/TopicModel/DMR.h
@@ -3,11 +3,11 @@

namespace tomoto
{
template<TermWeight _tw, size_t _Flags = 0>
struct DocumentDMR : public DocumentLDA<_tw, _Flags>
template<TermWeight _tw>
struct DocumentDMR : public DocumentLDA<_tw>
{
using BaseDocument = DocumentLDA<_tw, _Flags>;
using DocumentLDA<_tw, _Flags>::DocumentLDA;
using BaseDocument = DocumentLDA<_tw>;
using DocumentLDA<_tw>::DocumentLDA;
size_t metadata = 0;

DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
@@ -20,7 +20,8 @@ namespace tomoto
using DefaultDocType = DocumentDMR<TermWeight::one>;
static IDMRModel* create(TermWeight _weight, size_t _K = 1,
Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _eta = 0.01, Float _alphaEps = 1e-10,
const RandGen& _rg = RandGen{ std::random_device{}() });
size_t seed = std::random_device{}(),
bool scalarRng = false);

virtual size_t addDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) = 0;
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, const std::vector<std::string>& metadata) const = 0;
8 changes: 4 additions & 4 deletions src/TopicModel/DMRModel.cpp
@@ -2,12 +2,12 @@

namespace tomoto
{
template class DMRModel<TermWeight::one>;
/*template class DMRModel<TermWeight::one>;
template class DMRModel<TermWeight::idf>;
template class DMRModel<TermWeight::pmi>;
template class DMRModel<TermWeight::pmi>;*/

IDMRModel* IDMRModel::create(TermWeight _weight, size_t _K, Float _defaultAlpha, Float _sigma, Float _eta, Float _alphaEps, const RandGen& _rg)
IDMRModel* IDMRModel::create(TermWeight _weight, size_t _K, Float _defaultAlpha, Float _sigma, Float _eta, Float _alphaEps, size_t seed, bool scalarRng)
{
SWITCH_TW(_weight, DMRModel, _K, _defaultAlpha, _sigma, _eta, _alphaEps, _rg);
TMT_SWITCH_TW(_weight, scalarRng, DMRModel, _K, _defaultAlpha, _sigma, _eta, _alphaEps, seed);
}
}
26 changes: 14 additions & 12 deletions src/TopicModel/DMRModel.hpp
@@ -16,18 +16,19 @@ namespace tomoto
Eigen::Matrix<Float, -1, 1> tmpK;
};

template<TermWeight _tw, size_t _Flags = flags::partitioned_multisampling,
template<TermWeight _tw, typename _RandGen,
size_t _Flags = flags::partitioned_multisampling,
typename _Interface = IDMRModel,
typename _Derived = void,
typename _DocType = DocumentDMR<_tw>,
typename _ModelState = ModelStateDMR<_tw>>
class DMRModel : public LDAModel<_tw, _Flags, _Interface,
typename std::conditional<std::is_same<_Derived, void>::value, DMRModel<_tw, _Flags>, _Derived>::type,
class DMRModel : public LDAModel<_tw, _RandGen, _Flags, _Interface,
typename std::conditional<std::is_same<_Derived, void>::value, DMRModel<_tw, _RandGen, _Flags>, _Derived>::type,
_DocType, _ModelState>
{
protected:
using DerivedClass = typename std::conditional<std::is_same<_Derived, void>::value, DMRModel<_tw>, _Derived>::type;
using BaseClass = LDAModel<_tw, _Flags, _Interface, DerivedClass, _DocType, _ModelState>;
using DerivedClass = typename std::conditional<std::is_same<_Derived, void>::value, DMRModel<_tw, _RandGen>, _Derived>::type;
using BaseClass = LDAModel<_tw, _RandGen, _Flags, _Interface, DerivedClass, _DocType, _ModelState>;
friend BaseClass;
friend typename BaseClass::BaseClass;
using WeightType = typename BaseClass::WeightType;
@@ -118,7 +119,7 @@ namespace tomoto
}
}

void optimizeParameters(ThreadPool& pool, _ModelState* localData, RandGen* rgs)
void optimizeParameters(ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
{
Eigen::Matrix<Float, -1, -1> bLambda;
Float fx = 0, bestFx = INFINITY;
@@ -146,7 +147,7 @@ namespace tomoto
expLambda = lambda.array().exp() + alphaEps;
}

int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, RandGen* rgs)
int restoreFromTrainingError(const exception::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
{
std::cerr << "Failed to optimize! Reset prior and retry!" << std::endl;
lambda.setZero();
@@ -254,7 +255,7 @@ namespace tomoto
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda);

DMRModel(size_t _K = 1, Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _eta = 0.01,
Float _alphaEps = 0, const RandGen& _rg = RandGen{ std::random_device{}() })
Float _alphaEps = 0, const _RandGen& _rg = _RandGen{ std::random_device{}() })
: BaseClass(_K, defaultAlpha, _eta, _rg), sigma(_sigma), alphaEps(_alphaEps)
{
if (_sigma <= 0) THROW_ERROR_WITH_INFO(std::runtime_error, text::format("wrong sigma value (sigma = %f)", _sigma));
@@ -362,11 +363,12 @@ namespace tomoto
};

/* This is for preventing 'undefined symbol' problem in compiling by clang. */
template<TermWeight _tw, size_t _Flags,
template<TermWeight _tw, typename _RandGen, size_t _Flags,
typename _Interface, typename _Derived, typename _DocType, typename _ModelState>
constexpr Float DMRModel<_tw, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxLambda;
constexpr Float DMRModel<_tw, _RandGen, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxLambda;

template<TermWeight _tw, size_t _Flags,
template<TermWeight _tw, typename _RandGen, size_t _Flags,
typename _Interface, typename _Derived, typename _DocType, typename _ModelState>
constexpr size_t DMRModel<_tw, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxBFGSIteration;
constexpr size_t DMRModel<_tw, _RandGen, _Flags, _Interface, _Derived, _DocType, _ModelState>::maxBFGSIteration;

}
14 changes: 8 additions & 6 deletions src/TopicModel/DT.h
@@ -4,12 +4,11 @@

namespace tomoto
{
template<TermWeight _tw, size_t _Flags = 0>
struct DocumentDTM : public DocumentLDA<_tw, _Flags>
template<TermWeight _tw>
struct DocumentDTM : public DocumentLDA<_tw>
{
using BaseDocument = DocumentLDA<_tw, _Flags>;
using DocumentLDA<_tw, _Flags>::DocumentLDA;
using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type;
using BaseDocument = DocumentLDA<_tw>;
using DocumentLDA<_tw>::DocumentLDA;

size_t timepoint = 0;
ShareableVector<Float> eta;
@@ -27,7 +26,8 @@ namespace tomoto
Float _alphaVar = 1.0, Float _etaVar = 1.0, Float _phiVar = 1.0,
Float _shapeA = 0.03, Float _shapeB = 0.1, Float _shapeC = 0.55,
Float _etaRegL2 = 0,
const RandGen& _rg = RandGen{ std::random_device{}() });
size_t seed = std::random_device{}(),
bool scalarRng = false);

virtual size_t addDoc(const std::vector<std::string>& words, size_t timepoint) = 0;
virtual std::unique_ptr<DocumentBase> makeDoc(const std::vector<std::string>& words, size_t timepoint) const = 0;
@@ -45,6 +45,8 @@
size_t timepoint) const = 0;

virtual size_t getT() const = 0;
virtual std::vector<size_t> getNumDocsByT() const = 0;

virtual Float getAlphaVar() const = 0;
virtual Float getEtaVar() const = 0;
virtual Float getPhiVar() const = 0;
