Merge pull request #56 from bab2min/develop

preparing 0.8.0
bab2min · Jun 6, 2020 · be0107c · be0107c
2 parents 533bcd7 + c063c74
commit be0107c
Show file tree

Hide file tree

Showing 50 changed files with 1,774 additions and 246 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -1,10 +1,10 @@
 name: Publish new package
 
 on:
-  release:
-    types: published
-    branches:
-      - master
+  push:
+    tags:
+      - 'v*.*.*'
+      - '!v*.*.*d'
 
 jobs:
   build_manylinux:
@@ -30,12 +30,12 @@ jobs:
         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       run: |
+        /opt/python/cp35-cp35m/bin/python -m pip install twine wheel numpy
         /opt/python/cp35-cp35m/bin/python setup.py sdist
-        /opt/python/cp35-cp35m/bin/python -m pip install twine wheel
         /opt/python/cp35-cp35m/bin/python -m twine upload dist/*.tar.gz
         for cp in cp35-cp35m cp36-cp36m cp37-cp37m cp38-cp38
         do
-          /opt/python/${cp}/bin/python -m pip install twine wheel
+          /opt/python/${cp}/bin/python -m pip install twine wheel numpy
           /opt/python/${cp}/bin/python setup.py build bdist_wheel
           auditwheel repair dist/*-${cp}-linux_x86_64.whl
         done
@@ -68,7 +68,7 @@ jobs:
         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       run: |
         export AUDITWHEEL_PLAT=many
-        python -m pip install twine wheel
+        python -m pip install twine wheel numpy
         python setup.py bdist_wheel
         twine upload dist/*
 
@@ -100,6 +100,6 @@ jobs:
         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       run: |
-        python -m pip install twine wheel
+        python -m pip install twine wheel numpy
         python setup.py bdist_wheel
         twine upload dist/*
diff --git a/.github/workflows/deploy_test.yml b/.github/workflows/deploy_test.yml
@@ -1,10 +1,9 @@
 name: Publish new package into test pypi
 
 on:
-  release:
-    types: published
-    branches:
-      - develop
+  push:
+    tags:
+      - 'v*.*.*d'
 
 jobs:
   build_manylinux:
@@ -30,12 +29,12 @@ jobs:
         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       run: |
+        /opt/python/cp35-cp35m/bin/python -m pip install twine wheel numpy
         /opt/python/cp35-cp35m/bin/python setup.py sdist
-        /opt/python/cp35-cp35m/bin/python -m pip install twine wheel
-        /opt/python/cp35-cp35m/bin/python -m twine upload dist/*.tar.gz
+        /opt/python/cp35-cp35m/bin/python -m twine upload --repository testpypi dist/*.tar.gz
         for cp in cp35-cp35m cp36-cp36m cp37-cp37m cp38-cp38
         do
-          /opt/python/${cp}/bin/python -m pip install twine wheel
+          /opt/python/${cp}/bin/python -m pip install twine wheel numpy
           /opt/python/${cp}/bin/python setup.py build bdist_wheel
           auditwheel repair dist/*-${cp}-linux_x86_64.whl
         done
@@ -68,7 +67,7 @@ jobs:
         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       run: |
         export AUDITWHEEL_PLAT=many
-        python -m pip install twine wheel
+        python -m pip install twine wheel numpy
         python setup.py bdist_wheel
         twine upload --repository testpypi dist/*
 
@@ -100,6 +99,6 @@ jobs:
         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       run: |
-        python -m pip install twine wheel
+        python -m pip install twine wheel numpy
         python setup.py bdist_wheel
         twine upload --repository testpypi dist/*
diff --git a/.github/workflows/generate_documentation.yml b/.github/workflows/generate_documentation.yml
@@ -1,8 +1,10 @@
 name: Generate the documentation
 
 on:
-  release:
-    types: published
+  push:
+    tags: 
+      - 'v*.*.*'
+      - '!v*.*.*d'
 
 jobs:
   build_manylinux:
@@ -27,7 +29,7 @@ jobs:
         mv eigen-git-mirror-3.3.7 include
     - name: build
       run: |
-        python3 -m pip install pdoc3
+        python3 -m pip install pdoc3 numpy
         export TOMOTOPY_LANG=${{ matrix.language }}
         python3 setup.py install
     - name: gen doc

diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml
@@ -30,6 +30,7 @@ jobs:
         mv eigen-git-mirror include
     - name: Build
       run: |
+        /opt/python/${{ matrix.cp }}/bin/python -m pip install numpy
         /opt/python/${{ matrix.cp }}/bin/python setup.py build install
     - name: Test
       run: |
@@ -59,6 +60,7 @@ jobs:
         mv eigen-git-mirror-3.3.7 include
     - name: Build
       run: |
+        python -m pip install numpy
         python setup.py build install
     - name: Test
       run: |
@@ -91,6 +93,7 @@ jobs:
         mv eigen-git-mirror-3.3.7 include
     - name: Build
       run: |
+        python -m pip install numpy
         python setup.py build install
     - name: Test
       run: |

diff --git a/README.kr.rst b/README.kr.rst
@@ -24,6 +24,7 @@ tomotopy 란?
 * Partially Labeled LDA (`tomotopy.PLDAModel`)
 * Supervised LDA (`tomotopy.SLDAModel`)
 * Dirichlet Multinomial Regression (`tomotopy.DMRModel`)
+* Generalized Dirichlet Multinomial Regression (`tomotopy.GDMRModel`)
 * Hierarchical Dirichlet Process (`tomotopy.HDPModel`)
 * Hierarchical LDA (`tomotopy.HLDAModel`)
 * Multi Grain LDA (`tomotopy.MGLDAModel`) 
@@ -34,7 +35,7 @@ tomotopy 란?
 
 더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다.
 
-tomotopy의 가장 최신버전은 0.7.1 입니다.
+tomotopy의 가장 최신버전은 0.8.0 입니다.
 
 시작하기
 ---------------
@@ -175,7 +176,7 @@ add_doc은 `tomotopy.LDAModel.train`을 시작하기 전까지만 사용할 수
     mdl = tp.LDAModel(k=20)
     # add_doc ...
     mdl.train(100)
-    doc_inst = mdl.make_doc(unseen_words) # doc_inst is an instance of the unseen document
+    doc_inst = mdl.make_doc(unseen_doc) # doc_inst is an instance of the unseen document
 
 새로운 문헌에 대해 추론하기
 ------------------------------
@@ -187,7 +188,7 @@ add_doc은 `tomotopy.LDAModel.train`을 시작하기 전까지만 사용할 수
     mdl = tp.LDAModel(k=20)
     # add_doc ...
     mdl.train(100)
-    doc_inst = mdl.make_doc(unseen_words)
+    doc_inst = mdl.make_doc(unseen_doc)
     topic_dist, ll = mdl.infer(doc_inst)
     print("Topic Distribution for Unseen Docs: ", topic_dist)
     print("Log-likelihood of inference: ", ll)
@@ -239,6 +240,16 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
 
 역사
 -------
+* 0.8.0 (2020-06-06)
+    * NumPy가 tomotopy에 도입됨에 따라 많은 메소드와 프로퍼티들이 `list`가 아니라 `numpy.ndarray`를 반환하도록 변경되었습니다.
+    * Tomotopy에 새 의존관계 `NumPy >= 1.10.0`가 추가되었습니다.
+    * `tomotopy.HDPModel.infer`가 잘못된 추론을 하던 문제가 수정되었습니다.
+    * HDP 모델을 LDA 모델로 변환하는 메소드가 추가되었습니다.
+    * `tomotopy.LDAModel.used_vocabs`, `tomotopy.LDAModel.used_vocab_freq`, `tomotopy.LDAModel.used_vocab_df` 등의 새로운 프로퍼티가 모델에 추가되었습니다.
+    * 새로운 토픽 모델인 g-DMR(`tomotopy.GDMRModel`)가 추가되었습니다.
+    * macOS에서 `tomotopy.label.FoRelevance`를 생성할 때 발생하던 문제가 해결되었습니다.
+    * `tomotopy.utils.Corpus.add_doc`로 `raw`가 없는 문헌을 생성한 뒤 토픽 모델에 입력할 시 발생하는 오류를 수정했습니다.
+
 * 0.7.1 (2020-05-08)
     * `tomotopy.HLDAModel`용으로 `tomotopy.Document.path`가 새로 추가되었습니다.
     * `tomotopy.label.PMIExtractor` 사용시에 발생하던 메모리 문제가 해결되었습니다.

diff --git a/README.rst b/README.rst
@@ -25,6 +25,7 @@ The current version of `tomoto` supports several major topic models including
 * Partially Labeled LDA (`tomotopy.PLDAModel`)
 * Supervised LDA (`tomotopy.SLDAModel`)
 * Dirichlet Multinomial Regression (`tomotopy.DMRModel`)
+* Generalized Dirichlet Multinomial Regression (`tomotopy.GDMRModel`)
 * Hierarchical Dirichlet Process (`tomotopy.HDPModel`)
 * Hierarchical LDA (`tomotopy.HLDAModel`)
 * Multi Grain LDA (`tomotopy.MGLDAModel`) 
@@ -35,7 +36,7 @@ The current version of `tomoto` supports several major topic models including
 
 Please visit https://bab2min.github.io/tomotopy to see more information.
 
-The most recent version of tomotopy is 0.7.1.
+The most recent version of tomotopy is 0.8.0.
 
 Getting Started
 ---------------
@@ -179,7 +180,7 @@ Since `make_doc` returns the instance directly, you can use its return value for
     mdl = tp.LDAModel(k=20)
     # add_doc ...
     mdl.train(100)
-    doc_inst = mdl.make_doc(unseen_words) # doc_inst is an instance of the unseen document
+    doc_inst = mdl.make_doc(unseen_doc) # doc_inst is an instance of the unseen document
 
 Inference for Unseen Documents
 ------------------------------
@@ -191,7 +192,7 @@ Inference for unseen document should be performed using `tomotopy.LDAModel.infer
     mdl = tp.LDAModel(k=20)
     # add_doc ...
     mdl.train(100)
-    doc_inst = mdl.make_doc(unseen_words)
+    doc_inst = mdl.make_doc(unseen_doc)
     topic_dist, ll = mdl.infer(doc_inst)
     print("Topic Distribution for Unseen Docs: ", topic_dist)
     print("Log-likelihood of inference: ", ll)
@@ -245,6 +246,16 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
 
 History
 -------
+* 0.8.0 (2020-06-06)
+    * Since NumPy was introduced in tomotopy, many methods and properties of tomotopy now return `numpy.ndarray` instead of `list`.
+    * Tomotopy has a new dependency `NumPy >= 1.10.0`.
+    * A wrong estimation of `tomotopy.HDPModel.infer` was fixed.
+    * A new method for converting HDPModel to LDAModel was added.
+    * New properties including `tomotopy.LDAModel.used_vocabs`, `tomotopy.LDAModel.used_vocab_freq` and `tomotopy.LDAModel.used_vocab_df` were added into topic models.
+    * A new g-DMR topic model(`tomotopy.GDMRModel`) was added.
+    * An error at initializing `tomotopy.label.FoRelevance` in macOS was fixed.
+    * An error that occurred when using `tomotopy.utils.Corpus` created without `raw` parameters was fixed.
+
 * 0.7.1 (2020-05-08)
     * `tomotopy.Document.path` was added for `tomotopy.HLDAModel`.
     * A memory corruption bug in `tomotopy.label.PMIExtractor` was fixed.

diff --git a/example.py b/example.py
@@ -8,7 +8,7 @@ def lda_example(input_file, save_path):
         mdl.add_doc(ch)
     mdl.burn_in = 100
     mdl.train(0)
-    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
     print('Removed top words:', mdl.removed_top_words)
     print('Training...', file=sys.stderr, flush=True)
     for i in range(0, 1000, 10):
@@ -31,7 +31,7 @@ def hdp_example(input_file, save_path):
         mdl.add_doc(ch)
     mdl.burn_in = 100
     mdl.train(0)
-    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
     print('Removed top words:', mdl.removed_top_words)
     print('Training...', file=sys.stderr, flush=True)
     for i in range(0, 1000, 10):
@@ -63,7 +63,7 @@ def word_prior_example(input_file):
     # Topic 2 for a topic related to 'citi'
     mdl.set_word_prior('citi', [1.0 if k == 2 else 0.1 for k in range(20)])
     mdl.train(0)
-    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
     print('Removed top words:', mdl.removed_top_words)
     for i in range(0, 1000, 10):
         mdl.train(10)
@@ -83,7 +83,7 @@ def corpus_and_labeling_example(input_file):
     # make LDA model and train
     mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
     mdl.train(0)
-    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
     print('Removed top words:', mdl.removed_top_words)
     for i in range(0, 1000, 10):
         mdl.train(10)
@@ -114,7 +114,7 @@ def raw_corpus_and_labeling_example(input_file):
     # make LDA model and train
     mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
     mdl.train(0)
-    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
     print('Removed top words:', mdl.removed_top_words)
     for i in range(0, 1000, 10):
         mdl.train(10)

diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-py-cpuinfo
+py-cpuinfo
+numpy>=1.10.0
diff --git a/setup.py b/setup.py
@@ -2,6 +2,9 @@
 from codecs import open
 import os, os.path, struct, re, platform
 from setuptools.command.install import install
+import numpy
+
+exec(open('tomotopy/version.py').read())
 
 here = os.path.abspath(os.path.dirname(__file__))
 
@@ -43,7 +46,7 @@
     module_name = '_tomotopy' + ('_' + arch if arch else '')
     modules.append(Extension(module_name,
                     libraries=[],
-                    include_dirs=['include'],
+                    include_dirs=['include', numpy.get_include()],
                     sources=sources,
                     define_macros=[('MODULE_NAME', 'PyInit_' + module_name)] + lang_macro,
                     extra_compile_args=cargs + (aopt.split(' ') if aopt else []), extra_link_args=largs))
@@ -52,7 +55,7 @@
 setup(
     name='tomotopy',
 
-    version='0.7.1',
+    version=__version__,
 
     description='Tomoto, The Topic Modeling Tool for Python',
     long_description=long_description,
@@ -85,7 +88,7 @@
         "Operating System :: POSIX",
         "Operating System :: MacOS"
     ],
-    install_requires=['py-cpuinfo'],
+    install_requires=['py-cpuinfo', 'numpy>=1.10.0'],
     keywords='NLP,Topic Model',
 
     packages = ['tomotopy'],

diff --git a/src/TopicModel/CTModel.hpp b/src/TopicModel/CTModel.hpp
@@ -34,9 +34,9 @@ namespace tomoto
 
 		static constexpr char TMID[] = "CTM\0";
 
-		size_t numBetaSample = 10;
-		size_t numTMNSample = 5;
-		size_t numDocBetaSample = -1;
+		uint64_t numBetaSample = 10;
+		uint64_t numTMNSample = 5;
+		uint64_t numDocBetaSample = -1;
 		math::MultiNormalDistribution<Float> topicPrior;
 
 		template<bool _asymEta>

diff --git a/src/TopicModel/DMRModel.hpp b/src/TopicModel/DMRModel.hpp
@@ -37,8 +37,8 @@ namespace tomoto
 		Eigen::Matrix<Float, -1, -1> lambda;
 		Eigen::Matrix<Float, -1, -1> expLambda;
 		Float sigma;
-		size_t F = 0;
-		size_t optimRepeat = 5;
+		uint32_t F = 0;
+		uint32_t optimRepeat = 5;
 		Float alphaEps = 1e-10;
 		Float temperatureScale = 0;
 		static constexpr Float maxLambda = 10;

diff --git a/src/TopicModel/DTModel.hpp b/src/TopicModel/DTModel.hpp
@@ -41,7 +41,7 @@ namespace tomoto
 
 		static constexpr char TMID[] = "DTM\0";
 
-		size_t T;
+		uint64_t T;
 		Float shapeA = 0.03f, shapeB = 0.1f, shapeC = 0.55f;
 		const Float alphaVar = 1.f, etaVar = 1.f, phiVar = 1.f, etaRegL2 = 0.0f;
 
@@ -475,9 +475,9 @@ namespace tomoto
 			return doc;
 		}
 
-		std::vector<size_t> _getTopicsCount() const
+		std::vector<uint64_t> _getTopicsCount() const
 		{
-			std::vector<size_t> cnt(this->K * T);
+			std::vector<uint64_t> cnt(this->K * T);
 			for (auto& doc : this->docs)
 			{
 				for (size_t i = 0; i < doc.Zs.size(); ++i)