preparing 0.8.0
fixed HDP inference bug (#49)
implemented converting HDP to LDA (#50)
added used_vocabs (#54)
added g-DMR model
bab2min committed Jun 4, 2020
1 parent dc370b6 commit f72d8f6
Showing 19 changed files with 1,339 additions and 107 deletions.
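The headline features (#50, #54) surface in the Python API. A minimal usage sketch, assuming the bindings expose the new C++ convertToLDA as convert_to_lda and the kept vocabulary as used_vocabs (names inferred from this commit, not checked against the released docs):

import tomotopy as tp

hdp = tp.HDPModel(min_cf=5)
for line in open('corpus.txt', encoding='utf-8'):
    hdp.add_doc(line.strip().split())
hdp.train(1000)

# Convert the trained HDP to a fixed-K LDA model (#50); topics whose
# assignment share falls below the threshold are dropped and renumbered.
lda, new_topic_ids = hdp.convert_to_lda(topic_threshold=0.01)

# used_vocabs (#54): the vocabulary actually kept after min_cf filtering.
print(len(lda.used_vocabs), 'vocabs,', lda.k, 'topics')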
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-py-cpuinfo
+py-cpuinfo
+numpy>=1.10.0
7 changes: 4 additions & 3 deletions setup.py
@@ -2,6 +2,7 @@
from codecs import open
import os, os.path, struct, re, platform
from setuptools.command.install import install
+import numpy

here = os.path.abspath(os.path.dirname(__file__))

@@ -43,7 +44,7 @@
module_name = '_tomotopy' + ('_' + arch if arch else '')
modules.append(Extension(module_name,
libraries=[],
-include_dirs=['include'],
+include_dirs=['include', numpy.get_include()],
sources=sources,
define_macros=[('MODULE_NAME', 'PyInit_' + module_name)] + lang_macro,
extra_compile_args=cargs + (aopt.split(' ') if aopt else []), extra_link_args=largs))
@@ -52,7 +53,7 @@
setup(
name='tomotopy',

-version='0.7.1',
+version='0.8.0',

description='Tomoto, The Topic Modeling Tool for Python',
long_description=long_description,
@@ -85,7 +86,7 @@
"Operating System :: POSIX",
"Operating System :: MacOS"
],
-install_requires=['py-cpuinfo'],
+install_requires=['py-cpuinfo', 'numpy>=1.10.0'],
keywords='NLP,Topic Model',

packages = ['tomotopy'],
15 changes: 10 additions & 5 deletions src/TopicModel/GDMR.h
@@ -8,24 +8,29 @@ namespace tomoto
{
using BaseDocument = DocumentDMR<_tw, _Flags>;
using DocumentDMR<_tw, _Flags>::DocumentDMR;
-std::vector<Float> metadataC;
+std::vector<Float> metadataOrg, metadataNormalized;

-DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadataC);
-DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadataC);
+DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadataOrg);
+DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadataOrg);
};

class IGDMRModel : public IDMRModel
{
public:
using DefaultDocType = DocumentDMR<TermWeight::one>;
static IGDMRModel* create(TermWeight _weight, size_t _K = 1, const std::vector<size_t>& _degreeByF = {},
-Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _eta = 0.01, Float _alphaEps = 1e-10,
+Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _sigma0 = 1.0, Float _eta = 0.01, Float _alphaEps = 1e-10,
const RandGen& _rg = RandGen{ std::random_device{}() });

+virtual Float getSigma0() const = 0;
+virtual void setSigma0(Float) = 0;
virtual const std::vector<size_t>& getFs() const = 0;
virtual std::vector<Float> getLambdaByTopic(Tid tid) const = 0;

+virtual std::vector<Float> getTDF(const Float* metadata, bool normalize) const = 0;
+virtual std::vector<Float> getTDFBatch(const Float* metadata, size_t stride, size_t cnt, bool normalize) const = 0;

virtual void setMdRange(const std::vector<Float>& vMin, const std::vector<Float>& vMax) = 0;
virtual void getMdRange(std::vector<Float>& vMin, std::vector<Float>& vMax) const = 0;
};
}
}
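For orientation, constructing the g-DMR model from Python might look like this sketch; the keyword names (degrees, sigma0, alpha_epsilon) mirror the C++ factory arguments above and are assumptions rather than confirmed binding signatures:

import tomotopy as tp

# two metadata variables, each fitted with degree-3 Legendre polynomials
mdl = tp.GDMRModel(k=20, degrees=[3, 3], sigma=1.0, sigma0=3.0, alpha_epsilon=1e-10)
# g-DMR metadata is numeric; _updateDoc below parses each value with std::stof
mdl.add_doc('some document text'.split(), metadata=['1997', '0.5'])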
5 changes: 3 additions & 2 deletions src/TopicModel/GDMRModel.cpp
@@ -6,8 +6,9 @@ namespace tomoto
template class GDMRModel<TermWeight::idf>;
template class GDMRModel<TermWeight::pmi>;

-IGDMRModel* IGDMRModel::create(TermWeight _weight, size_t _K, const std::vector<size_t>& degreeByF, Float _defaultAlpha, Float _sigma, Float _eta, Float _alphaEps, const RandGen& _rg)
+IGDMRModel* IGDMRModel::create(TermWeight _weight, size_t _K, const std::vector<size_t>& degreeByF,
+Float _defaultAlpha, Float _sigma, Float _sigma0, Float _eta, Float _alphaEps, const RandGen& _rg)
{
-SWITCH_TW(_weight, GDMRModel, _K, degreeByF, _defaultAlpha, _sigma, _eta, _alphaEps, _rg);
+SWITCH_TW(_weight, GDMRModel, _K, degreeByF, _defaultAlpha, _sigma, _sigma0, _eta, _alphaEps, _rg);
}
}
115 changes: 94 additions & 21 deletions src/TopicModel/GDMRModel.hpp
@@ -32,7 +32,7 @@ namespace tomoto
using WeightType = typename BaseClass::WeightType;

Float sigma0 = 3;
-std::vector<Float> mdCoefs, mdIntercepts;
+std::vector<Float> mdCoefs, mdIntercepts, mdMax;
std::vector<size_t> degreeByF;

Float getIntegratedLambdaSq(const Eigen::Ref<const Eigen::Matrix<Float, -1, 1>, 0, Eigen::InnerStride<>>& lambdas) const
@@ -109,7 +109,7 @@
for (size_t docId = ch; docId < this->docs.size(); docId += chStride)
{
const auto& doc = this->docs[docId];
-const auto& vx = doc.metadataC;
+const auto& vx = doc.metadataNormalized;
getTermsFromMd(&vx[0], terms.data());
for (Tid k = 0; k < K; ++k)
{
@@ -146,7 +146,7 @@
return -fx;
}

-void getTermsFromMd(const Float* vx, Float* out) const
+void getTermsFromMd(const Float* vx, Float* out, bool normalize = false) const
{
thread_local std::vector<size_t> digit(degreeByF.size());
std::fill(digit.begin(), digit.end(), 0);
@@ -165,7 +165,7 @@
{
for (size_t i = 0; i < degreeByF[n]; ++i)
{
-slpCache[n][i] = slp::slpGet(i + 1, vx[n]);
+slpCache[n][i] = slp::slpGet(i + 1, normalize ? ((vx[n] - mdIntercepts[n]) / mdCoefs[n]) : vx[n]);
}
}
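getTermsFromMd expands one metadata point into F = prod(degreeByF[n] + 1) features: the products of shifted Legendre polynomials over every combination of per-variable degrees. A numpy sketch of the same expansion, assuming slp::slpGet(i, x) is the i-th shifted Legendre polynomial on [0, 1]:

import itertools
import numpy as np
from numpy.polynomial import legendre

def slp(i, x):
    # i-th shifted Legendre polynomial on [0, 1]
    return legendre.Legendre.basis(i)(2.0 * x - 1.0)

def terms_from_md(x, degree_by_f):
    # one factor per metadata variable; the enumeration order differs from
    # the C++ digit counter, but the feature set is identical
    return np.array([np.prod([slp(d, xi) for d, xi in zip(digits, x)])
                     for digits in itertools.product(*(range(d + 1) for d in degree_by_f))])

terms_from_md([0.3, 0.7], [3, 3])  # 16 features for degrees (3, 3)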

@@ -194,7 +194,7 @@
auto etaHelper = this->template getEtaHelper<_asymEta>();
auto& zLikelihood = ld.zLikelihood;
thread_local Eigen::Matrix<Float, -1, 1> terms{ this->F };
-getTermsFromMd(&doc.metadataC[0], terms.data());
+getTermsFromMd(&doc.metadataNormalized[0], terms.data());
zLikelihood = (doc.numByTopic.array().template cast<Float>() + (this->lambda * terms).array().exp() + this->alphaEps)
* (ld.numByTopicWord.col(vid).array().template cast<Float>() + etaHelper.getEta(vid))
/ (ld.numByTopic.array().template cast<Float>() + etaHelper.getEtaSum());
@@ -214,7 +214,7 @@
{
auto& doc = *_first;
thread_local Eigen::Matrix<Float, -1, 1> terms{ this->F };
-getTermsFromMd(&doc.metadataC[0], terms.data());
+getTermsFromMd(&doc.metadataNormalized[0], terms.data());
for (Tid k = 0; k < K; ++k)
{
alphas[k] = exp(this->lambda.row(k) * terms) + this->alphaEps;
@@ -256,40 +256,68 @@
return ll;
}

-void normalizeMetadata()
+void collectMinMaxMetadata()
{
size_t s = degreeByF.size();
-if (mdIntercepts.size() < s || mdCoefs.size() < s)
+if (mdIntercepts.size() < s)
{
mdIntercepts.resize(s, FLT_MAX);
-mdCoefs.resize(s, FLT_MIN);
+mdMax.resize(s, FLT_MIN);
}
+mdCoefs.resize(s, 0);

for (auto& doc : this->docs)
{
for (size_t i = 0; i < s; ++i)
{
-mdIntercepts[i] = std::min(mdIntercepts[i], doc.metadataC[i]);
-mdCoefs[i] = std::max(mdCoefs[i], doc.metadataC[i]);
+mdIntercepts[i] = std::min(mdIntercepts[i], doc.metadataOrg[i]);
+mdMax[i] = std::max(mdMax[i], doc.metadataOrg[i]);
}
}
for (size_t i = 0; i < s; ++i)
{
-mdCoefs[i] -= mdIntercepts[i];
+mdCoefs[i] = mdMax[i] - mdIntercepts[i];
if (mdCoefs[i] == 0) mdCoefs[i] = 1;
}
}

+std::vector<Float> normalizeMetadata(const std::vector<Float>& metadata) const
+{
+std::vector<Float> ret(degreeByF.size());
+for (size_t i = 0; i < degreeByF.size(); ++i)
+{
+ret[i] = mdCoefs[i] ? (metadata[i] - mdIntercepts[i]) / mdCoefs[i] : 0;
+}
+return ret;
+}

void prepareDoc(_DocType& doc, size_t docId, size_t wordSize) const
{
BaseClass::prepareDoc(doc, docId, wordSize);
-for (size_t i = 0; i < degreeByF.size(); ++i) doc.metadataC[i] = mdCoefs[i] ? (doc.metadataC[i] - mdIntercepts[i]) / mdCoefs[i] : 0;
+doc.metadataNormalized = normalizeMetadata(doc.metadataOrg);
}

void initGlobalState(bool initDocs)
{
BaseClass::BaseClass::initGlobalState(initDocs);
-normalizeMetadata();
+this->F = accumulate(degreeByF.begin(), degreeByF.end(), 1, [](size_t a, size_t b) {return a * (b + 1); });
+if (initDocs) collectMinMaxMetadata();
+else
+{
+// Old binary files stored the normalized metadata in `metadataOrg`;
+// move the normalized copy to `metadataNormalized` and restore the originals.
+if (this->docs[0].metadataNormalized.empty()
+&& !this->docs[0].metadataOrg.empty())
+{
+for (auto& doc : this->docs)
+{
+doc.metadataNormalized = doc.metadataOrg;
+for (size_t i = 0; i < degreeByF.size(); ++i)
+{
+doc.metadataOrg[i] = mdIntercepts[i] + doc.metadataOrg[i] * mdCoefs[i];
+}
+}
+}
+}

if (initDocs)
{
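The refactor splits the old normalizeMetadata into two steps: collectMinMaxMetadata records the per-variable minimum (mdIntercepts) and maximum (mdMax) and derives mdCoefs = mdMax - mdIntercepts (forced to 1 for empty ranges), while normalizeMetadata maps raw values into [0, 1] on demand, so each document keeps both metadataOrg and metadataNormalized. An equivalent numpy sketch:

import numpy as np

def collect_min_max(metadata):            # metadata: (n_docs, n_vars)
    md = np.asarray(metadata, dtype=np.float32)
    intercepts = md.min(axis=0)           # mdIntercepts
    coefs = md.max(axis=0) - intercepts   # mdCoefs = mdMax - mdIntercepts
    coefs[coefs == 0] = 1                 # avoid division by zero
    return intercepts, coefs

def normalize(x, intercepts, coefs):      # maps each variable into [0, 1]
    return (np.asarray(x) - intercepts) / coefs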
@@ -303,11 +331,12 @@

public:
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma0, degreeByF, mdCoefs, mdIntercepts);
-DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma0, degreeByF, mdCoefs, mdIntercepts);
+DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma0, degreeByF, mdCoefs, mdIntercepts, mdMax);

-GDMRModel(size_t _K = 1, const std::vector<size_t>& _degreeByF = {}, Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _eta = 0.01,
+GDMRModel(size_t _K = 1, const std::vector<size_t>& _degreeByF = {},
+Float defaultAlpha = 1.0, Float _sigma = 1.0, Float _sigma0 = 1.0, Float _eta = 0.01,
Float _alphaEps = 1e-10, const RandGen& _rg = RandGen{ std::random_device{}() })
-: BaseClass(_K, defaultAlpha, _sigma, _eta, _alphaEps, _rg), degreeByF(_degreeByF)
+: BaseClass(_K, defaultAlpha, _sigma, _eta, _alphaEps, _rg), sigma0(_sigma0), degreeByF(_degreeByF)
{
this->F = accumulate(degreeByF.begin(), degreeByF.end(), 1, [](size_t a, size_t b) {return a * (b + 1); });
}
@@ -322,7 +351,7 @@

_DocType& _updateDoc(_DocType& doc, const std::vector<std::string>& metadata) const
{
-std::transform(metadata.begin(), metadata.end(), back_inserter(doc.metadataC), [](const std::string& w)
+std::transform(metadata.begin(), metadata.end(), back_inserter(doc.metadataOrg), [](const std::string& w)
{
return std::stof(w);
});
@@ -345,7 +374,7 @@
{
Eigen::Matrix<Float, -1, 1> alphas(this->K);
thread_local Eigen::Matrix<Float, -1, 1> terms{ this->F };
-getTermsFromMd(&doc.metadataC[0], terms.data());
+getTermsFromMd(&doc.metadataNormalized[0], terms.data());
for (Tid k = 0; k < this->K; ++k)
{
alphas[k] = exp(this->lambda.row(k) * terms) + this->alphaEps;
@@ -366,10 +395,54 @@
return ret;
}

+std::vector<Float> getTDF(const Float* metadata, bool normalize) const override
+{
+Eigen::Matrix<Float, -1, 1> terms{ this->F };
+getTermsFromMd(metadata, terms.data(), true);
+std::vector<Float> ret(this->K);
+Eigen::Map<Eigen::Array<Float, -1, 1>> retMap{ ret.data(), (Eigen::Index)ret.size() };
+retMap = (this->lambda * terms).array();
+if (normalize)
+{
+retMap = (retMap - retMap.maxCoeff()).exp();
+retMap /= retMap.sum();
+}
+return ret;
+}
+
+std::vector<Float> getTDFBatch(const Float* metadata, size_t stride, size_t cnt, bool normalize) const override
+{
+Eigen::Matrix<Float, -1, -1> terms{ this->F, (Eigen::Index)cnt };
+for (size_t i = 0; i < cnt; ++i)
+{
+getTermsFromMd(metadata + stride * i, terms.col(i).data(), true);
+}
+std::vector<Float> ret(this->K * cnt);
+Eigen::Map<Eigen::Array<Float, -1, -1>> retMap{ ret.data(), (Eigen::Index)this->K, (Eigen::Index)cnt };
+retMap = (this->lambda * terms).array();
+if (normalize)
+{
+retMap.rowwise() -= retMap.colwise().maxCoeff();
+retMap = retMap.exp();
+retMap.rowwise() /= retMap.colwise().sum();
+}
+return ret;
+}
void setMdRange(const std::vector<Float>& vMin, const std::vector<Float>& vMax) override
{
mdIntercepts = vMin;
mdCoefs = vMax;
mdMax = vMax;
}

void getMdRange(std::vector<Float>& vMin, std::vector<Float>& vMax) const override
{
vMin = mdIntercepts;
if (mdMax.empty())
{
vMax = mdIntercepts;
for (size_t i = 0; i < vMax.size(); ++i) vMax[i] += mdCoefs[i];
}
else vMax = mdMax;
}
};
}
}
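The new getTDF/getTDFBatch evaluate the topic distribution function at arbitrary metadata points: lambda times the Legendre features per topic, optionally turned into a probability vector with a max-shifted softmax, exactly as the Eigen code above does. The same computation as a numpy sketch:

import numpy as np

def tdf(lam, terms, normalize=True):
    # lam: (K, F) lambda coefficients; terms: (F,) Legendre features
    scores = lam @ terms
    if normalize:
        scores = np.exp(scores - scores.max())  # subtract max for stability
        scores /= scores.sum()
    return scores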
2 changes: 2 additions & 0 deletions src/TopicModel/HDP.h
@@ -66,5 +66,7 @@ namespace tomoto
virtual size_t getTotalTables() const = 0;
virtual size_t getLiveK() const = 0;
virtual bool isLiveTopic(Tid tid) const = 0;

+virtual std::unique_ptr<ILDAModel> convertToLDA(float topicThreshold, std::vector<size_t>& newK) const = 0;
};
}
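How convertToLDA selects surviving topics is not shown in this hunk; a plausible reading of the signature (an assumption, not the verified implementation) is that live topics whose overall share reaches topicThreshold are kept and newK records the old-to-new id mapping:

def lda_topic_mapping(topic_word_counts, is_live, topic_threshold):
    total = sum(topic_word_counts)
    kept = [k for k, c in enumerate(topic_word_counts)
            if is_live[k] and c / total >= topic_threshold]
    # newK: old topic id -> renumbered id for surviving topics
    return {old: new for new, old in enumerate(kept)}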