diff --git a/README.kr.rst b/README.kr.rst index 2719538..3d7d75f 100644 --- a/README.kr.rst +++ b/README.kr.rst @@ -204,9 +204,28 @@ infer 메소드는 `tomotopy.Document` 인스턴스 하나를 추론하거나 `t .. image:: https://bab2min.github.io/tomotopy/images/algo_comp2.png +어휘 사전분포를 이용하여 주제 고정하기 +-------------------------------------- +0.6.0 버전부터 `tomotopy.LDAModel.set_word_prior`라는 메소드가 추가되었습니다. 이 메소드로 특정 단어의 사전분포를 조절할 수 있습니다. +예를 들어 다음 코드처럼 단어 'church'의 가중치를 Topic 0에 대해서는 1.0, 나머지 Topic에 대해서는 0.1로 설정할 수 있습니다. +이는 단어 'church'가 Topic 0에 할당될 확률이 다른 Topic에 할당될 확률보다 10배 높다는 것을 의미하며, 따라서 대부분의 'church'는 Topic 0에 할당되게 됩니다. +그리고 학습을 거치며 'church'와 관련된 단어들 역시 Topic 0에 모이게 되므로, 최종적으로 Topic 0은 'church'와 관련된 주제가 될 것입니다. +이를 통해 특정 내용의 주제를 원하는 Topic 번호에 고정시킬 수 있습니다. + +:: + + import tomotopy as tp + mdl = tp.LDAModel(k=20) + + # add documents into `mdl` + + # setting word prior + mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)]) + +자세한 내용은 `example.py`의 `word_prior_example` 함수를 참조하십시오. 예제 코드 --------- +--------- tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/example.py 를 확인하시길 바랍니다. 예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다. diff --git a/README.rst b/README.rst index f594839..5958397 100644 --- a/README.rst +++ b/README.rst @@ -208,6 +208,26 @@ The following chart shows the speed difference between the two algorithms based .. image:: https://bab2min.github.io/tomotopy/images/algo_comp2.png +Pinning Topics using Word Priors +-------------------------------- +Since version 0.6.0, a new method `tomotopy.LDAModel.set_word_prior` has been added. It allows you to control the word prior for each topic. +For example, we can set the weight of the word 'church' to 1.0 in topic 0, and the weight to 0.1 in the rest of the topics with the following code. +This means that the probability that the word 'church' is assigned to topic 0 is 10 times higher than the probability of being assigned to another topic. 
+Therefore, most occurrences of 'church' are assigned to topic 0, so topic 0 contains many words related to 'church'. +This allows you to pin a chosen topic to a specific topic number. + +:: + + import tomotopy as tp + mdl = tp.LDAModel(k=20) + + # add documents into `mdl` + + # setting word prior + mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)]) + +See `word_prior_example` in `example.py` for more details. + Examples -------- diff --git a/example.py b/example.py index 33968e9..c44f568 100644 --- a/example.py +++ b/example.py @@ -48,6 +48,33 @@ def hdp_example(input_file, save_path): for word, prob in mdl.get_topic_words(k): print('\t', word, prob, sep='\t') +def word_prior_example(input_file): + corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) + # data_feeder yields a tuple of (raw string, user data) or a str (raw string) + corpus.process(open(input_file, encoding='utf-8')) + + # make LDA model and train + mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus) + # The word 'church' is assigned to Topic 0 with a weight of 1.0 and to the remaining topics with a weight of 0.1. + # Therefore, a topic related to 'church' can be fixed at Topic 0 . 
+ mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)]) + # Topic 1 for a topic related to 'softwar' + mdl.set_word_prior('softwar', [1.0 if k == 1 else 0.1 for k in range(20)]) + # Topic 2 for a topic related to 'citi' + mdl.set_word_prior('citi', [1.0 if k == 2 else 0.1 for k in range(20)]) + mdl.train(0) + print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words) + print('Removed top words:', mdl.removed_top_words) + for i in range(0, 1000, 10): + mdl.train(10) + print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word)) + + for k in range(mdl.k): + print("== Topic #{} ==".format(k)) + for word, prob in mdl.get_topic_words(k, top_n=10): + print(word, prob, sep='\t') + print() + def corpus_and_labeling_example(input_file): corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.']) # data_feeder yields a tuple of (raw string, user data) or a str (raw string) @@ -115,6 +142,9 @@ def raw_corpus_and_labeling_example(input_file): print('Running HDP') hdp_example('enwiki-stemmed-1000.txt', 'test.hdp.bin') +print('Set Word Prior') +word_prior_example('enwiki-stemmed-1000.txt') + print('Running LDA and Labeling') corpus_and_labeling_example('enwiki-stemmed-1000.txt') diff --git a/tomotopy/documentation.kr.rst b/tomotopy/documentation.kr.rst index 6b8e3f8..8e006b4 100644 --- a/tomotopy/documentation.kr.rst +++ b/tomotopy/documentation.kr.rst @@ -17,7 +17,7 @@ tomotopy 란? * Correlated Topic Model (`tomotopy.CTModel`) * Dynamic Topic Model (`tomotopy.DTModel`) -tomotopy의 가장 최신버전은 0.6.2 입니다. +tomotopy의 가장 최신버전은 0.7.0 입니다. .. image:: https://badge.fury.io/py/tomotopy.svg @@ -246,8 +246,28 @@ infer 메소드는 `tomotopy.Document` 인스턴스 하나를 추론하거나 `t .. image:: https://bab2min.github.io/tomotopy/images/algo_comp2.png +어휘 사전분포를 이용하여 주제 고정하기 +-------------------------------------- +0.6.0 버전부터 `tomotopy.LDAModel.set_word_prior`라는 메소드가 추가되었습니다. 이 메소드로 특정 단어의 사전분포를 조절할 수 있습니다. 
+예를 들어 다음 코드처럼 단어 'church'의 가중치를 Topic 0에 대해서는 1.0, 나머지 Topic에 대해서는 0.1로 설정할 수 있습니다. +이는 단어 'church'가 Topic 0에 할당될 확률이 다른 Topic에 할당될 확률보다 10배 높다는 것을 의미하며, 따라서 대부분의 'church'는 Topic 0에 할당되게 됩니다. +그리고 학습을 거치며 'church'와 관련된 단어들 역시 Topic 0에 모이게 되므로, 최종적으로 Topic 0은 'church'와 관련된 주제가 될 것입니다. +이를 통해 특정 내용의 주제를 원하는 Topic 번호에 고정시킬 수 있습니다. + +:: + + import tomotopy as tp + mdl = tp.LDAModel(k=20) + + # add documents into `mdl` + + # setting word prior + mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)]) + +자세한 내용은 `example.py`의 `word_prior_example` 함수를 참조하십시오. + 예제 코드 --------- +--------- tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/example.py 를 확인하시길 바랍니다. 예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다. diff --git a/tomotopy/documentation.rst b/tomotopy/documentation.rst index 8484049..212ee3c 100644 --- a/tomotopy/documentation.rst +++ b/tomotopy/documentation.rst @@ -248,6 +248,26 @@ The following chart shows the speed difference between the two algorithms based .. image:: https://bab2min.github.io/tomotopy/images/algo_comp2.png +Pinning Topics using Word Priors +-------------------------------- +Since version 0.6.0, a new method `tomotopy.LDAModel.set_word_prior` has been added. It allows you to control the word prior for each topic. +For example, we can set the weight of the word 'church' to 1.0 in topic 0, and the weight to 0.1 in the rest of the topics with the following code. +This means that the probability that the word 'church' is assigned to topic 0 is 10 times higher than the probability of being assigned to another topic. +Therefore, most occurrences of 'church' are assigned to topic 0, so topic 0 contains many words related to 'church'. +This allows you to pin a chosen topic to a specific topic number. 
+ +:: + + import tomotopy as tp + mdl = tp.LDAModel(k=20) + + # add documents into `mdl` + + # setting word prior + mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)]) + +See `word_prior_example` in `example.py` for more details. + Examples -------- You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/master/example.py .