-
Notifications
You must be signed in to change notification settings - Fork 3
/
NER_people_location.py
86 lines (62 loc) · 2.29 KB
/
NER_people_location.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import cPickle, numpy as np, pandas as pd , re
from collections import Counter
from polyglot.text import Text
'''
The purpose of this py file is to identify people and locations mentioned in the
SF open data portal
This has two functions, one for people and one for locations.
Afterwards the data is saved into a csv file.
'''
# Load the pre-tagged search terms and pull the processed column into a list.
tagged_search = pd.read_csv('processed_search_term_data/tagged_search_terms.csv')
tagged_search_list = tagged_search['processed_search_term'].tolist()
def removePunctuation(text):
    '''
    Remove ASCII punctuation from `text`, then trim surrounding whitespace
    and lowercase the result.

    Input: a string
    Output: the cleaned, lowercased string
    '''
    for c in '!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~\\':
        text = text.replace(c, "")
    # strip/lower once at the end instead of redundantly on every loop
    # iteration (the original re-ran both per punctuation character).
    return text.strip().lower()
# Flatten the whole search-term list into one string and normalize it
# (str() of a list includes brackets/quotes/commas, all of which
# removePunctuation strips) so polyglot can run NER over it in one pass.
string_word = removePunctuation(str(tagged_search_list))
def people(string):
    '''
    Input: A string of relevant search terms
    Output: a flat list of name tokens for every entity that polyglot's
    Named Entity Recognition tags as a person (I-PER), suitable for
    frequency counting with collections.Counter.
    '''
    entities = Text(string).entities
    # Keep only person entities; drop the 'I-PER' tag text and punctuation
    # from polyglot's string rendering of each entity.
    per = [removePunctuation(re.sub('I-PER', '', str(entity)))
           for entity in entities if entity.tag == "I-PER"]
    # w[1:] drops the leading character of each token -- presumably the
    # 'u' left behind by py2 unicode reprs (u'word') after the quotes are
    # stripped; TODO confirm against polyglot output. set() deduplicates
    # tokens within a single entity.
    per = [' '.join(set([w[1:] for w in word.split(' ')])) for word in per]
    # BUG FIX: the original returned len(ent[0].split(' ')) -- an int --
    # which made the downstream Counter() call raise TypeError, ignored
    # every entity after the first, and raised IndexError when no person
    # entities were found. Return all tokens from all entities instead.
    return [w for phrase in per for w in phrase.split(' ')]
# Tally the person-name tokens and split the (token, frequency) pairs
# into the parallel lists consumed by the people table below.
person_counts = Counter(people(string_word)).most_common()
name = [token for token, _ in person_counts]
count = [freq for _, freq in person_counts]
def LOC(string):
    '''
    Input: A string of relevant search terms
    Output: a flat list of location tokens for every entity that polyglot's
    Named Entity Recognition tags as a location (I-LOC), suitable for
    frequency counting with collections.Counter.
    (Docstring fixed: it previously said "people" for this location extractor.)
    '''
    entities = Text(string).entities
    # Keep only location entities; drop the 'I-LOC' tag text and
    # punctuation from polyglot's string rendering of each entity.
    locs = [removePunctuation(re.sub('I-LOC', '', str(entity)))
            for entity in entities if entity.tag == "I-LOC"]
    # w[1:] drops the leading character of each token -- presumably the
    # 'u' left behind by py2 unicode reprs after quote stripping; TODO
    # confirm. set() deduplicates tokens within a single entity.
    locs = [' '.join(set([w[1:] for w in word.split(' ')])) for word in locs]
    # BUG FIX: the original returned len(ent[0].split(' ')) -- an int --
    # which made the downstream Counter() call raise TypeError, ignored
    # every entity after the first, and raised IndexError when no location
    # entities were found. Return all tokens from all entities instead.
    return [w for phrase in locs for w in phrase.split(' ')]
# Tally the location tokens and split the (token, frequency) pairs
# into the parallel lists consumed by the location table below.
location_counts = Counter(LOC(string_word)).most_common()
loc = [token for token, _ in location_counts]
cnt = [freq for _, freq in location_counts]
# Assemble the two result tables and write them out as CSV.
table1 = pd.DataFrame(columns=['Name','Name_Count'])
table2 = pd.DataFrame(columns=['Location','Location_Count'])
table1['Name'] = name
# BUG FIX: the original wrote the counts to a new 'Count' column, leaving
# the declared 'Name_Count' column entirely NaN in people_count.csv.
table1['Name_Count'] = count
table2['Location'] = loc
table2['Location_Count'] = cnt
table1.to_csv('people_count.csv')
table2.to_csv('location_count.csv')
#link:
# http://polyglot.readthedocs.io/en/latest/NamedEntityRecognition.html