InitDatabaseAll.py
# Lan Luo, CPSC 437, Database Project
# This file constructs a database from raw HTML files downloaded from the Yale Facebook for each college,
# compiling basic information on each Yale student (first name, last name, class year, college, address,
# country, major, and birthday) into a single relation.
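# Assumptions (not enforced by the script): each residential college's directory page has been saved
# beforehand as "<College>.html" (e.g. "Berkeley.html") in the working directory, and the script runs
# under Python 3 with bs4, pandas, numpy, and geopy installed.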
# import HTML parser/processor
from bs4 import BeautifulSoup
# import to use for regex
import re
# import to use pandas as data frame tables
import pandas as pd
# import to use when shuffling/permuting each column in the pandas data frame
import numpy as np
# import to use when converting address to latitude, longitude coordinates
from geopy.geocoders import Nominatim
# import to wait between chunks to avoid timing out
import time
# initialize final scraped data list variable
allfin = []
# loop through every college raw html file to get student information
for collegelist in ["Benjamin Franklin", "Berkeley", "Branford", "Davenport", "Ezra Stiles", "Grace Hopper", "Jonathan Edwards", "Morse", "Pauli Murray", "Pierson", "Saybrook", "Silliman", "Timothy Dwight", "Trumbull"]:
    # read the raw html file, then parse it cleanly
    collegeraw = open(collegelist + ".html").read()
    college = BeautifulSoup(collegeraw, 'html.parser')
    # split up the page so that each student has their own container
    classes = college.find_all("div", class_="student_container")
    # make each student into one string variable
    students = []
    for x in classes:
        students.append(str(x))
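    # at this point each element of students is the raw HTML string for one student's directory entry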
    # clean data by making it easier to split/extract information
    for i in range(len(students)):
        # remove extra html tags that won't be used to split the data
        students[i] = students[i].replace("</div>", "")
        # replace the HTML entity &amp; with a literal ampersand
        students[i] = students[i].replace("&amp;", "&")
        # remove phone numbers (extraneous info - location information can be drawn from the home address)
        students[i] = re.sub(r'<br/>(\d{3} ?-)?\d{3}-\d{4} / ', "", students[i])
        # remove strange sequences of numbers (e.g. ending in YS); these are not address information, so extraneous
        students[i] = re.sub(r'<br/>#? ?\d* YS', "", students[i])
        students[i] = re.sub(r'<br/>\d-\d{4} /', "", students[i])
        students[i] = re.sub(r'<br/>\d+ /', "", students[i])
        # if there's an American ZIP code, append "United States" as the country, to be consistent with students whose entries list a foreign home country
        USzip = re.search(r'([A-Z]{2} \d{5}-?(\d{4})?)', students[i])
        if USzip:
            students[i] = students[i].replace(USzip.group(1), USzip.group(1) + "<br/>United States")
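        # e.g. an entry ending in "New Haven, CT 06511" would gain "<br/>United States" (the address here is illustrative)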
    # split by line break to make creating a relation easier
    out = []
    for i in range(len(students)):
        out.append(students[i].split("<br/>"))
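    # each entry of out is now a list of fields; judging from the column names assigned below, a complete
    # record has 7 fields: the name/year/email header, room, street, city/state/ZIP, country, major, birthday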
    # only include students with complete information
    fin = [i for i in out if len(i) == 7]
    # duplicate the list before modifying it (a shallow copy: the outer list is independent, though the inner per-student lists are shared)
    copy_fin = fin[:]
    # loop through each student
    for i in range(len(copy_fin)):
        # initialize the regex match variables
        yearsearch, emailsearch, lastsearch, firstsearch = None, None, None, None
        # extract year by matching with regex
        yearsearch = re.search(r'(\d{2})<div class="year_border">', copy_fin[i][0])
        if yearsearch:
            Year = "20" + yearsearch.group(1)
        # extract email by matching with regex
        emailsearch = re.search(r'href="mailto:(.*)">', copy_fin[i][0])
        if emailsearch:
            Email = emailsearch.group(1)
        # extract last name by matching with regex
        lastsearch = re.search(r'<h5 class="yalehead">(.*),', copy_fin[i][0])
        if lastsearch:
            LastName = lastsearch.group(1)
        # extract first name by matching with regex
        firstsearch = re.search(r'<h5 class="yalehead">.*, (.*) </h5>', copy_fin[i][0])
        if firstsearch:
            FirstName = firstsearch.group(1)
        # delete the first value in the list (the long header string that the fields above were extracted from)
        del copy_fin[i][0]
        # insert the values extracted above; note that if a regex ever fails to match, the value from the previous student would carry over
        copy_fin[i].insert(0, collegelist)
        copy_fin[i].insert(0, Year)
        copy_fin[i].insert(0, Email)
        copy_fin[i].insert(0, LastName)
        copy_fin[i].insert(0, FirstName)
        # remove the college tag at the beginning of the room string (everything up to and including the first '-')
        tagindex = (copy_fin[i][5]).find('-')
        copy_fin[i][5] = copy_fin[i][5][tagindex + 1:]
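        # e.g. a raw room value like "BK-301" would become "301" here (the college-tag format is assumed for illustration)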
    # keep only rows whose room value looks like a real room number (throws out rows with nonsensical room values)
    matchroomfin = []
    for i in range(len(copy_fin)):
        # the length check removes rows where an address ended up in the room-number field
        if len(copy_fin[i][5]) <= 6:
            # use regexes to match the two valid room number formats
            # (elif prevents a row that matches both patterns from being appended twice)
            if re.search(r'[A-Z][0-9]{2}[A-Z]?', copy_fin[i][5]):
                matchroomfin.append(copy_fin[i])
            elif re.search(r'\d{3}[A-Z]?', copy_fin[i][5]):
                matchroomfin.append(copy_fin[i])
    # add this college's rows to the final scraped data list
    allfin.extend(matchroomfin)
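# after the loop, allfin holds one 11-field row per student across all fourteen colleges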
# generate pandas data frame in line with ordering of scraped data list
DFfin = pd.DataFrame(allfin)
DFfin.columns = ['First Name', 'Last Name', 'Email', 'Class Year', 'College', 'Room', 'Street', 'City_State_ZIP', 'Country', 'Major', 'Birthday']
# drop last name to better anonymize the data
DFfin = DFfin.drop(columns=['Last Name'])
# create new last name that's simply a duplicate of the first name
# this is great for anonymity since first names are hardly useful for uniquely identifying a person
DFfin['Last Name'] = DFfin['First Name']
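# e.g. a row for ("Lan", "Luo") would now read ("Lan", "Lan") in the name columns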
# combine all address information into a single field
DFfin['Address'] = DFfin['Street'] + ", " + DFfin['City_State_ZIP'] + ", " + DFfin['Country']
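# e.g. "123 York St", "New Haven, CT 06511", "United States" would combine into
# "123 York St, New Haven, CT 06511, United States" (the values shown are illustrative)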
# drop unnecessary columns
DFfin = DFfin.drop(columns=['Street', 'City_State_ZIP', 'Email'])
# reorder data frame appropriately
DFfin = DFfin[['First Name', 'Last Name', 'Class Year', 'College', 'Address', 'Country', 'Major', 'Birthday', 'Room']]
DFfinv1 = DFfin.copy()
DFfinv2 = DFfin.copy()
# table with non-address info
DFfinv1 = DFfinv1.drop(columns=['Address', 'Country'])
# table with address info
DFfinv2 = DFfinv2.drop(columns=['First Name', 'Last Name', 'Class Year', 'College', 'Major', 'Birthday', 'Room'])
# permute/shuffle/randomize each column independently of the others
# this anonymizes the data so that no single row gives any useful identifying information
DFfinv1 = DFfinv1.apply(np.random.permutation)
# shuffle the address table as whole rows, since Address and Country must move together to preserve the location information;
# the index is reset so that the index-based join below does not simply realign the rows back into their original order
DFfinv2 = DFfinv2.reindex(np.random.permutation(DFfinv2.index)).reset_index(drop=True)
DFlast = DFfinv1.join(DFfinv2, how="left")
DFlast = DFlast[['First Name', 'Last Name', 'Class Year', 'College', 'Major', 'Birthday', 'Room', 'Address', 'Country']]
# spot-check anonymity: print the shuffled rows carrying my first name; none of them should match my actual information
print(DFlast.loc[DFlast['First Name'] == 'Lan'])
# # add a field for address coordinates using the geopy package
# geolocator = Nominatim(user_agent="school project")
# # manually create chunks of 25 rows, since the geocoder times out if the request is too long
# # (176 chunks assumes roughly 4400 rows; adjust the count to match len(DFlast))
# DFchunks = []
# for i in range(176):
#     DFchunks.append(DFlast[i*25: (i+1)*25].copy())
# # geocode the addresses chunk by chunk, sleeping between chunks to avoid timing out
# for i in range(len(DFchunks)):
#     print(i)
#     DFchunks[i]['LatLong'] = DFchunks[i]['Address'].apply(geolocator.geocode)
#     DFchunks[i]['LatLong'] = DFchunks[i]['LatLong'].apply(lambda x: (x.latitude, x.longitude) if x is not None else None)
#     time.sleep(60)
# # concatenate all the chunks together into one large relation
# DFfinish = pd.concat(DFchunks)
DFlast.to_csv("studentdata.csv")
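# note: to_csv writes the DataFrame index as the first column by default; pass index=False to omit it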