-
Notifications
You must be signed in to change notification settings - Fork 0
/
less01.py
179 lines (147 loc) · 5.18 KB
/
less01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
spacy.prefer_gpu()
def print_doc_analysis(doc):
    """Print one line of lexical attributes for every token in *doc*.

    Each line shows the token's index (``i``), its ``is_alpha``,
    ``is_punct``, ``like_num`` and ``is_title`` flags, its ``pos_`` tag
    and its ``text``.

    Args:
        doc: Iterable of token-like objects exposing the attributes listed
            above (in this script, the result of calling ``nlp`` on a
            string).

    Returns:
        None. The lines are written to stdout.
    """
    line_format = (
        "Index: {} | is_alpha {} | is_punct {} | like_num {} | is_title {} | "
        "POS {} | Text: {}"
    )
    for token in doc:
        print(line_format.format(
            token.i,
            token.is_alpha,
            token.is_punct,
            token.like_num,
            token.is_title,
            token.pos_,
            token.text,
        ))
def print_doc_syn_dep(doc):
    """Print the syntactic-dependency view of every token in *doc*.

    One line per token, space separated: the token's ``text``, its
    ``pos_`` tag, its ``dep_`` label and the ``text`` of its ``head``.

    Args:
        doc: Iterable of token-like objects exposing ``text``, ``pos_``,
            ``dep_`` and ``head.text``.

    Returns:
        None. The lines are written to stdout.
    """
    for token in doc:
        print("{} {} {} {}".format(
            token.text, token.pos_, token.dep_, token.head.text))
def print_doc_named_entities(doc):
    """Print the text and label of every entity in ``doc.ents``.

    Args:
        doc: Object with an ``ents`` iterable whose items expose ``text``
            and ``label_``.

    Returns:
        None. One "<text> <label>" line per entity is written to stdout.
    """
    for ent in doc.ents:
        print(ent.text, ent.label_)
def print_matcher_results(doc, matches):
    """Print the token range and text of every match in *matches*.

    Args:
        doc: Sliceable document; ``doc[start:end]`` must return an object
            with a ``text`` attribute.
        matches: Iterable of ``(match_id, start, end)`` triples, such as
            the value returned by calling the matcher on *doc*.

    Returns:
        None. One "<start> <end> : <matched text>" line per match is
        written to stdout.
    """
    # The match id is not part of the printed output, so it is ignored.
    for _match_id, start, end in matches:
        matched_span = doc[start:end]
        print("{} {} : {}".format(start, end, matched_span.text))
# Build a pipeline straight from the English language class
# (no trained model is loaded at this point).
nlp = English()
doc = nlp("This is a sentence.")
print(doc.text)

# 3 Documents, spans, tokens
doc = nlp("I like three (3) tree kangaroos and narwales.")
first_token = doc[0]
print("first token:", first_token)

# 4 Lexical Attributes
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)
print_doc_analysis(doc)

# Report every number-like token that is directly followed by a "%" token.
for token in doc:
    # Only number-like tokens are candidates.
    if not token.like_num:
        continue
    # A number-like token at the very end of the doc has no successor;
    # indexing doc[token.i + 1] there would raise IndexError.
    next_index = token.i + 1
    if next_index >= len(doc):
        continue
    # Get the next token in the doc and check whether it is "%".
    next_token = doc[next_index]
    if next_token.text == '%':
        print("Percentage:", token.text)
# 5 Statistical Models
# $ python -m spacy download en_core_web_sm

# syntactical dependencies
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the hot delicious pepperoni pizza ravenously.")
print_doc_syn_dep(doc)

# named entities
print("--- named entities ---")
# doesn't pick up: from Facebook
# doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion from Great Hill Partners")
# this one misses iPhone X
doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")
print_doc_named_entities(doc)
print(doc[1:3])

# 10 matcher
# Initialize the matcher with the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
# Token pattern (similar to a regex, but over tokens):
# the exact text "iPhone" followed by the exact text "X".
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]
# Patterns are passed as a list: spaCy v3 rejects the older
# add(key, None, pattern) call; the list form is documented as also
# accepted from spaCy 2.2.2 on.
matcher.add('IPHONE_PATTERN', [pattern])
# Call the matcher once and reuse the result for both printouts.
matches = matcher(doc)
print("Matcher output: {} {}".format(type(matches), matches))
print_matcher_results(doc, matches)
# Another example with a more complex pattern: a digit token, then tokens
# whose lowercase forms are "fifa", "world", "cup", then a punctuation token.
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True},
]
doc = nlp("Jay Duff loves 2018 FIFA World Cup: France won!")
# new matcher
matcher = Matcher(nlp.vocab)
# Patterns are passed as a list: spaCy v3 rejects the older
# add(key, None, pattern) call; the list form is documented as also
# accepted from spaCy 2.2.2 on.
matcher.add('FIFA_PATTERN', [pattern])
print_matcher_results(doc, matcher(doc))
# other matcher using lemmatizer
print("--- 10.c ---")
# - good example of AND, OR conditions:
#   keys inside one dict must all hold for the same token (AND);
#   the second dict describes the token that follows.
pattern = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'POS': 'NOUN'},
]
# doc = nlp("I love you.") # no match, YOU not a noun
# doc = nlp("I loved dogs but now I love cats more.")
# doc = nlp("Jay is in love. I first loved Betty but now I love June.")
doc = nlp("Jay is in love. I first loved hamburgers but now I love chicken sandwiches.")
# new matcher
matcher = Matcher(nlp.vocab)
# Patterns are passed as a list: spaCy v3 rejects the older
# add(key, None, pattern) call; the list form is documented as also
# accepted from spaCy 2.2.2 on.
matcher.add('LOVE_PATTERN', [pattern])
print_doc_syn_dep(doc)
print_matcher_results(doc, matcher(doc))
# 10 - part 1
print('--- 10 - part 1 ---')
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)
# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [
    {"TEXT": 'iOS'},
    {"IS_DIGIT": True},
]
# Add the pattern to the matcher and apply the matcher to the doc.
matcher = Matcher(nlp.vocab)
# Patterns are passed as a list: spaCy v3 rejects the older
# add(key, None, pattern) call; the list form is documented as also
# accepted from spaCy 2.2.2 on.
matcher.add("IOS_VERSION_PATTERN", [pattern])
print_doc_syn_dep(doc)
print_matcher_results(doc, matcher(doc))
# 10 - part 2
print('--- 10 - part 2 ---')
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)
# Write a pattern for a token with the lemma "download" followed by a
# proper noun.
pattern = [
    {"LEMMA": 'download'},
    {"POS": 'PROPN'},
]
# Add the pattern to the matcher and apply the matcher to the doc.
matcher = Matcher(nlp.vocab)
# Patterns are passed as a list: spaCy v3 rejects the older
# add(key, None, pattern) call; the list form is documented as also
# accepted from spaCy 2.2.2 on.
matcher.add("DOWNLOAD_PATTERN", [pattern])
print_doc_syn_dep(doc)
print_matcher_results(doc, matcher(doc))
# 10 - part 3
print('--- 10 - part 3 ---')
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
# Write a pattern for adjective plus one or two nouns.
# "OP": "?" makes the second noun an optional condition.
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
# Add the pattern to the matcher and apply the matcher to the doc.
matcher = Matcher(nlp.vocab)
# The key now names this pattern (it was a copy-pasted "DOWNLOAD_PATTERN");
# the key is never printed, so the script's output is unchanged.
# Patterns are passed as a list: spaCy v3 rejects the older
# add(key, None, pattern) call; the list form is documented as also
# accepted from spaCy 2.2.2 on.
matcher.add("ADJ_NOUN_PATTERN", [pattern])
print_doc_syn_dep(doc)
print_matcher_results(doc, matcher(doc))