update docs & example

bab2min · Dec 19, 2020 · 66ac059 · 66ac059
1 parent d4da45b
commit 66ac059
Show file tree

Hide file tree

Showing 4 changed files with 167 additions and 12 deletions.
diff --git a/examples/extract_ngram.py b/examples/extract_ngram.py
@@ -0,0 +1,44 @@
+import sys
+import tomotopy as tp
+
+def extract_ngrams_example(input_file):
+    from nltk.corpus import stopwords
+    stops = set(stopwords.words('english'))
+    stops.update(['many', 'also', 'would', 'often', 'could'])
+    corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), 
+        stopwords=lambda x: len(x) <= 2 or x in stops)
+    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
+    corpus.process(open(input_file, encoding='utf-8'))
+
+    # extract the n-gram candidates first
+    cands = corpus.extract_ngrams(min_cf=20, min_df=10, max_len=5, max_cand=1000)
+    print('==== extracted n-gram collocations ====')
+    for cand in cands:
+        print(cand)
+
+    # it prints like:
+    # tomotopy.label.Candidate(words=["academic","nobel","prize","laureate"], name="", score=23.376673)
+    # tomotopy.label.Candidate(words=["canadian","ice","hockey","player"], name="", score=21.658447)
+    # tomotopy.label.Candidate(words=["english","race","car","driver"], name="", score=20.356688)
+    # tomotopy.label.Candidate(words=["australian","rugby","league","player"], name="", score=20.124966)
+    # tomotopy.label.Candidate(words=["american","race","car","driver"], name="", score=19.717760)
+    # tomotopy.label.Candidate(words=["new","zealand","rugby","player"], name="", score=18.866398)
+    # tomotopy.label.Candidate(words=["american","ice","hockey","player"], name="", score=17.599983)
+    # tomotopy.label.Candidate(words=["american","actor","director","producer"], name="", score=16.722300)
+    # tomotopy.label.Candidate(words=["nobel","prize","laureate"], name="", score=16.635370)
+    # tomotopy.label.Candidate(words=["eastern","orthodox","liturgics"], name="", score=16.540277)
+    # ...
+
+    # before concat
+    print(corpus[3])
+
+    # concat n-grams in the corpus
+    corpus.concat_ngrams(cands, delimiter='_')
+
+    # after concat
+    print(corpus[3])
+
+# You can get the sample data file 'enwiki-1000.txt'
+# at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
+
+extract_ngrams_example('enwiki-1000.txt')
diff --git a/src/python/label_docs.h b/src/python/label_docs.h
@@ -58,8 +58,8 @@ DOC_SIGNATURE_EN_KO(PMIExtractor___init____doc__,
 `PMIExtractor` exploits multivariate pointwise mutual information to extract collocations. 
 It finds a string of words that often co-occur statistically.
 
-Parameter
----------
+Parameters
+----------
 min_cf : int
     minimum collection frequency of collocations. Collocations with a smaller collection frequency than `min_cf` are excluded from the candidates.
     Set this value large if the corpus is big
@@ -80,8 +80,8 @@ max_cand : int
 
 `PMIExtractor`는 다변수 점별 상호정보량을 활용해 연어를 추출합니다. 이는 통계적으로 자주 함께 등장하는 단어열을 찾아줍니다.
 
-Parameter
----------
+Parameters
+----------
 min_cf : int
     추출하려는 후보의 최소 장서 빈도. 문헌 내 등장하는 빈도수가 `min_cf`보다 작은 연어는 후보에서 제외됩니다.
     분석하려는 코퍼스가 클 경우 이 값을 키우십시오.
@@ -106,17 +106,17 @@ DOC_SIGNATURE_EN_KO(Labeler_get_topic_labels__doc__,
 	"get_topic_labels(self, k, top_n=10)",
 	u8R""(Return the top-n label candidates for the topic `k`
 
-Parameter
----------
+Parameters
+----------
 k : int
     an integer indicating a topic
 top_n : int
     the number of labels
 )"",
 	u8R""(토픽 `k`에 해당하는 레이블 후보 상위 n개를 반환합니다.
 
-Parameter
----------
+Parameters
+----------
 k : int
     토픽을 지정하는 정수
 top_n : int

diff --git a/tomotopy/coherence.py b/tomotopy/coherence.py
@@ -156,3 +156,76 @@ def get_score(self, words=None, topic_id=None):
         if words is None:
             words = (w for w, _ in self._topic_model.get_topic_words(topic_id, top_n=self._top_n))
         return super().get_score(words)
+
+import os
+if os.environ.get('TOMOTOPY_LANG') == 'kr':
+    __doc__ = """..versionadded:: 0.10.0
+
+이 모듈은 다음 논문에 의거한 토픽 coherence 계산법을 제공합니다:
+
+> * Röder, M., Both, A., & Hinneburg, A. (2015, February). Exploring the space of topic coherence measures. In Proceedings of the eighth ACM international conference on Web search and data mining (pp. 399-408).
+> http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
+> https://github.com/dice-group/Palmetto
+"""
+    __pdoc__ = {}
+    __pdoc__['ProbEstimation'] = '''
+논문에 제시된 probability estimator를 위한 열거형
+'''
+    __pdoc__['Segmentation'] = '''
+논문에 제시된 segmentation을 위한 열거형
+'''
+    __pdoc__['ConfirmMeasure'] = '''
+논문에 제시된 direct confirm measure를 위한 열거형
+'''
+    __pdoc__['IndirectMeasure'] = '''
+논문에 제시된 indirect confirm measure를 위한 열거형
+'''
+    __pdoc__['Coherence'] = '''
+`Coherence` 클래스는 coherence를 계산하는 방법을 제공합니다.
+
+주어진 코퍼스를 바탕으로 coherence를 계산하는 인스턴스를 초기화합니다.
+
+Parameters
+----------
+corpus : Union[tomotopy.utils.Corpus, tomotopy.LDAModel]
+    단어 분포 확률을 추정하기 위한 레퍼런스 코퍼스.
+    `tomotopy.utils.Corpus` 타입뿐만 아니라 `tomotopy.LDAModel`를 비롯한 다양한 토픽 모델링 타입의 인스턴스까지 지원합니다.
+    만약 `corpus`가 `tomotopy.utils.Corpus`의 인스턴스라면 `targets`이 반드시 주어져야 합니다.
+coherence : Union[str, Tuple[int, int, int], Tuple[int, int, int, int]]
+    coherence를 계산하는 데 사용될 척도. 척도는 (`tomotopy.coherence.ProbEstimation`, `tomotopy.coherence.Segmentation`, `tomotopy.coherence.ConfirmMeasure`)의 조합이거나
+    (`tomotopy.coherence.ProbEstimation`, `tomotopy.coherence.Segmentation`, `tomotopy.coherence.ConfirmMeasure`, `tomotopy.coherence.IndirectMeasure`)의 조합이어야 합니다.
+    
+    또한 다음과 같이 `str` 타입의 단축표현도 제공됩니다.
+    > * 'u_mass' : (`tomotopy.coherence.ProbEstimation.DOCUMENT`, `tomotopy.coherence.Segmentation.ONE_PRE`, `tomotopy.coherence.ConfirmMeasure.LOGCOND`)
+    > * 'c_uci' : (`tomotopy.coherence.ProbEstimation.SLIDING_WINDOWS`, `tomotopy.coherence.Segmentation.ONE_ONE`, `tomotopy.coherence.ConfirmMeasure.PMI`)
+    > * 'c_npmi' : (`tomotopy.coherence.ProbEstimation.SLIDING_WINDOWS`, `tomotopy.coherence.Segmentation.ONE_ONE`, `tomotopy.coherence.ConfirmMeasure.NPMI`)
+    > * 'c_v' : (`tomotopy.coherence.ProbEstimation.SLIDING_WINDOWS`, `tomotopy.coherence.Segmentation.ONE_SET`, `tomotopy.coherence.ConfirmMeasure.NPMI`, `tomotopy.coherence.IndirectMeasure.COSINE`)
+window_size : int
+    `tomotopy.coherence.ProbEstimation.SLIDING_WINDOWS`가 사용될 경우 쓰일 window 크기.
+    기본값은 'c_uci'와 'c_npmi'의 경우 10, 'c_v'의 경우 110입니다.
+targets : Iterable[str]
+    만약 `corpus`가 `tomotopy.utils.Corpus`의 인스턴스인 경우, 목표 단어가 주어져야 합니다. 
+    `targets`에 주어진 단어 목록에 대해서만 확률 분포가 추정됩니다.
+top_n : int
+    각 토픽에서 추출할 상위 단어의 개수.
+    만약 `corpus`가 `tomotopy.LDAModel`나 기타 토픽 모델의 인스턴스인 경우, 목표 단어는 각 토픽의 상위 단어에서 추출됩니다.
+    만약 `targets`이 주어진 경우 `corpus`가 토픽 모델인 경우에도 `targets`에서 목표 단어를 가져옵니다.
+eps : float
+    계산 과정에서 0으로 나누는 것을 방지하기 위한 epsilon 값
+gamma : float
+    indirect confirm measure 계산에 사용되는 gamma 값
+'''
+    __pdoc__['Coherence.get_score'] = '''주어진 `words` 또는 `topic_id`를 이용해 coherence를 계산합니다.
+
+Parameters
+----------
+words : Iterable[str]
+    coherence가 계산될 단어들.
+    만약 `tomotopy.coherence.Coherence`가 `tomotopy.LDAModel`나 기타 토픽 모델의 인스턴스로 `corpus`를 받아 초기화된 경우 `words`는 생략될 수 있습니다.
+    이 경우 단어들은 토픽 모델의 `topic_id` 토픽에서 추출됩니다.
+topic_id : int
+    단어가 추출될 토픽의 id.
+    이 파라미터는 오직 `tomotopy.coherence.Coherence`가 `tomotopy.LDAModel`나 기타 토픽 모델의 인스턴스로 `corpus`를 받아 초기화된 경우에만 사용 가능합니다.
+    생략시 모든 토픽의 coherence 점수를 평균낸 값이 반환됩니다.
+'''
+del os
diff --git a/tomotopy/utils.py b/tomotopy/utils.py
@@ -173,13 +173,15 @@ def concat_ngrams(self, cands, delimiter='_'):
 
 class SimpleTokenizer:
     '''`SimpleTokenizer` provides a simple word-tokenizing utility with an arbitrary stemmer.'''
-    def __init__(self, stemmer=None, pattern:str=None):
+    def __init__(self, stemmer=None, pattern:str=None, lowercase=True):
         '''Parameters
 ----------
 stemmer : Callable[str, str]
     a callable object for stemming words. If this value is set to `None`, words are not stemmed.
 pattern : str
     a regex pattern for extracting tokens
+lowercase : bool
+    converts the token into lowercase if this is True
 
 Here is an example of using SimpleTokenizer with NLTK for stemming.
 
@@ -190,14 +192,15 @@ def __init__(self, stemmer=None, pattern:str=None):
         if stemmer and not callable(stemmer):
             raise ValueError("`stemmer` must be callable.")
         self._stemmer = stemmer or None
+        self._lowercase = lowercase
 
     def __call__(self, raw:str, user_data=None):
         if self._stemmer:
-            for g in self._pat.finditer(raw.lower()):
+            for g in self._pat.finditer(raw.lower() if self._lowercase else raw):
                 start, end = g.span(0)
                 yield self._stemmer(g.group(0)), start, end - start
         else:
-            for g in self._pat.finditer(raw.lower()):
+            for g in self._pat.finditer(raw.lower() if self._lowercase else raw):
                 start, end = g.span(0)
                 yield g.group(0), start, end - start
 
@@ -258,7 +261,40 @@ def __call__(self, raw:str, user_data=None):
 ----------
 filename : str
     읽어들일 파일의 경로"""
-
+    __pdoc__['Corpus.extract_ngrams'] = '''..versionadded:: 0.10.0
+
+PMI 점수를 이용해 자주 등장하는 n-gram들을 추출합니다.
+
+Parameters
+----------
+min_cf : int
+    추출할 n-gram의 최소 장서빈도
+min_df : int
+    추출할 n-gram의 최소 문헌빈도
+max_len : int
+    추출할 n-gram의 최대 길이
+max_cand : int
+    추출할 n-gram의 개수
+min_score : float
+    추출할 n-gram의 최소 PMI 점수
+
+Returns
+-------
+candidates : List[tomotopy.label.Candidate]
+    추출된 n-gram 후보의 리스트. `tomotopy.label.Candidate` 타입
+'''
+    __pdoc__['Corpus.concat_ngrams'] = '''..versionadded:: 0.10.0
+
+코퍼스 내에서 주어진 n-gram 목록과 일치하는 단어열을 하나의 단어로 합칩니다.
+
+Parameters
+----------
+cands : Iterable[tomotopy.label.Candidate]
+    합칠 n-gram의 List. `tomotopy.utils.Corpus.extract_ngrams`로 생성할 수 있습니다.
+delimiter : str
+    여러 단어들을 연결할 때 사용할 구분자. 기본값은 `'_'`입니다.
+'''
+
     __pdoc__['SimpleTokenizer'] = """`SimpleTokenizer`는 임의의 스테머를 사용할 수 있는 단순한 단어 분리 유틸리티입니다.
 
 Parameters
@@ -267,6 +303,8 @@ def __call__(self, raw:str, user_data=None):
     단어를 스테밍하는데 사용되는 호출가능한 객체. 만약 이 값이 `None`이라면 스테밍은 사용되지 않습니다.
 pattern : str
     토큰을 추출하는데 사용할 정규식 패턴
+lowercase : bool
+    참일 경우 분리된 단어들을 소문자화합니다.
 
 SimpleTokenizer와 NLTK를 사용하여 스테밍을 하는 예제는 다음과 같습니다.