create_pos_patterns.py
"""
Make pos patterns which represent knowledge node
from the downloaded wikipedia topics.
@author TaoPR (github.com/starcolon)
"""

import sys
import argparse
from termcolor import colored
from pylib.text import structure as TextStructure
from pylib.text.pos_tree import PatternCapture
from pylib.knowledge.datasource import MineDB
from nltk.tokenize.punkt import PunktSentenceTokenizer

arguments = argparse.ArgumentParser()
arguments.add_argument('--verbose', dest='verbose', action='store_true', help='Turn verbose output on.')
arguments.add_argument('--start', type=int, default=0, help='Starting index of the crawling record to annotate.')
args = vars(arguments.parse_args(sys.argv[1:]))
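
# After parsing, `args` is a plain dict, e.g. {'verbose': False, 'start': 0}
# when the script is run with no flags.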
"""
Initialise a lazy connection to the crawling record collection
"""
def init_crawl_collection():
crawl_collection = MineDB('localhost','vor','crawl')
return crawl_collection
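
# (Assumption: the MineDB constructor arguments are host, database name, and
#  collection name, so the records are read from the 'crawl' collection of the
#  'vor' database on localhost; adjust these if your store lives elsewhere.)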
"""
Iterate through the unannotated recordset in the crawled collection,
and generate each of the sentence from the topic.
"""
def raw_records(crawl_collection,start):
# Prepare a naive sentence tokeniser utility
pst = PunktSentenceTokenizer()
for rec in crawl_collection.query({'downloaded': True},field=None,skip=start):
_id = rec['_id']
if rec['content'] is None:
continue
content = rec['content']['contents']
# A wiki page may probably comprise of multiple content
for c in content:
# Explode a long topic into list of sentences
sentences = pst.sentences_from_text(c)
for s in sentences:
yield (_id,s)
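
# Each yielded item is an (_id, sentence) pair, so downstream code can trace
# every sentence back to the crawl record it came from.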
"""
Prompt the user to annotate the given text sentence
"""
def cli_annotate(crawl_collection):
# Load existing pos patterns
patterns = PatternCapture()
patterns.load('./pos-patterns')
print(colored('Existing patterns :','green'))
print(patterns.join(' , '))
def annotate(_id,text):
# Analyse the POS structure of the sentence
tokens = text.split(' ')
pos = TextStructure.pos_tag(tokens)
TextStructure.tag_with_color(tokens)
# Test POS pattern parsing and show the result
print(patterns.capture(pos))
# Extract the pure list of POS
pos_ = [tag for t,tag in pos]
# POS token sample form: NN-JJ,NN,NN-NNS
nodes = input(colored("POS token patterns: ","cyan"))
if len(nodes)>0:
# Add patterns to the registry if not yet
nodes = [n.strip() for n in nodes.split(',')]
for n in nodes:
if n not in patterns:
print(colored('New pattern added: ','green'), n)
patterns.append(n)
# Save right away
patterns.save('./pos-patterns')
print("Patterns saved!")
return annotate
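
# cli_annotate returns a closure: the pattern registry is loaded once and then
# shared across every annotated sentence, so each confirmed pattern is visible
# to (and saved with) all subsequent annotations in the same session.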

if __name__ == '__main__':
  # Prepare a connection to the crawled dataset
  crawl_collection = init_crawl_collection()
  # Make an annotator function
  annotate = cli_annotate(crawl_collection)
  # Iterate through each unannotated sentence and prompt for annotation
  for (_id,t) in raw_records(crawl_collection,args['start']):
    annotate(_id,t)