-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsearch_terms.py
102 lines (74 loc) · 2.95 KB
/
search_terms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
This function is help identify and label keywords and phrases that are used in the
SF open data portal. This is aimed to segment the bad search terms such as addresses, the web links,
and the actual relevant keyword search used.
"""
import pandas as pd
import re
# Load San Francisco Street Data
street_names = pd.read_csv('street_names_clean.csv')
street_name_list = list(street_names['Names'].values)
def removePunctuation(text):
'''
Removes Punctuation from Input Text
:param text: Text Input
:return: Text free of punctuation
'''
for c in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
text = text.replace(c,"").strip().lower()
return text
def searchAddress(text):
'''
Identifies if text is an address in San Francisco
:param text: Text Input
:return:
'''
result = [re.findall('\d+\s' + street + '|' + street + '\s\w+', text) for street in street_name_list
if re.findall('\d+\s' + street + '|' + street + '\s\w+', text) ]
return result
def search_term_type(text):
"""
input: keyword search term
output: labels the keyword as good, address, year, link.
"""
#street_type = '\s(aly|ave*|blvd|boulevard|cir*|court|ct|dr*|exp*|hl|hwy|highway|ln|lane|park|path|pl*|ramp|rd|road|row|st*|step*|stps|walk|way)'
processed_text = removePunctuation(text)
# address = [re.findall('\d+\s'+street+'|'+street+'\s\w+',text) for street in street_name_list
# if re.findall('\d+\s'+street+'|'+street+'\s\w+', text)!= []]
# Regex patterns
dates_regex = re.compile("[0-9]*[0-9]/[0-9]*[0-9]/[0-9]*[0-9]")
links_regex = re.compile("http[s]*|www.*")
numbers_regex = re.compile("^[0-9][0-9]*[0-9]$")
valid_numbers = ['311', '911']
address_regex = re.compile('^[0-9]*[0-9] [a-z]*[a-z]')
# Series of checks for categorizing search terms
if bool(re.search(links_regex,text)):
term_category = "Link"
elif bool(re.search(dates_regex,text)):
term_category = "Date"
elif bool(re.search(numbers_regex, text)):
if text in valid_numbers:
term_category = "Search Term"
else:
term_category = "Number"
# Checks for SPELLING ERRORS
# if not one of the previous check if address
else:
if bool(re.search(address_regex, text)):
address = searchAddress(processed_text)
if bool(address):
term_category = "Address"
else:
term_category = "Search Term"
else:
term_category = "Search Term"
#find_links = re.findall(links_regex, text)
return term_category
# http://stackoverflow.com/questions/6718633/python-regular-expression-again-match-url
# http://regexlib.com/REDetails.aspx?regexp_id=430
# For testing
print search_term_type("1190 mission street")
print search_term_type("09/11/2105")
print search_term_type("www.google.com")
print search_term_type("911")
print search_term_type("555 doctor")