-
Notifications
You must be signed in to change notification settings - Fork 15
/
precinct_parser.py
145 lines (131 loc) · 3.68 KB
/
precinct_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from urllib2 import urlopen
# Wabash County precinct labels, one "NNN - Name" string per precinct.
# The three-digit prefix is what the per-precinct summary URL is built from.
precincts = [
    "001 - Noble 1",
    "002 - Noble 2",
    "003 - Noble 3",
    "004 - Noble 4",
    "005 - Noble 5",
    "006 - Noble 6",
    "007 - Noble 7",
    "008 - Noble 8",
    "009 - Noble 9",
    "010 - Noble 10",
    "011 - Noble 11",
    "012 - Chester 1",
    "013 - Chester 2",
    "014 - Chester 3",
    "015 - Chester 4",
    "016 - Chester 5",
    "017 - Chester 6",
    "018 - Pleasant 1",
    "019 - Pleasant 2",
    "020 - Lagro 1",
    "021 - Lagro 2",
    "022 - Lagro 3",
    "023 - Liberty 1",
    "024 - Liberty 2",
    "025 - Paw Paw",
    "026 - Waltz",
]
# Download the summary page for every precinct.
#   pages:       precinct number (int) -> raw HTML of that precinct's page
#   num_to_name: precinct number (int) -> full "NNN - Name" label
from contextlib import closing

pages = {}
num_to_name = {}
for s in precincts:
    # The precinct number is the whole zero-padded three-digit prefix.
    # Taking fewer digits would map 001-009, 010-019 and 020-026 onto the
    # keys 0, 1 and 2, leaving only three pages and three names.
    number = s[:3]
    i = int(number)
    num_to_name[i] = s
    url = "http://clerk.wabashcounty85.us/custom/00000185/elections/election2018g/Wabash_PrecSumm_" + number + ".htm"
    print(url)
    # closing() releases the connection once the page has been read.
    with closing(urlopen(url)) as f:
        pages[i] = f.read()

# Dump everything that was fetched into the file "pages" as the repr() of
# the dict; the with-block guarantees the file is flushed and closed.
with open("pages", "w") as f:
    f.write(repr(pages))
from re import search
def get_class_name(line):
    """
    Go from an HTML line to the class of that line.

    The class is whatever follows the first "class=" up to, but not
    including, the next space or ">". At least one character is required,
    and surrounding quotes (if the markup has any) are kept as part of the
    value.

    Returns the string "None" (not the None object) when the line has no
    such attribute.
    """
    # "." already covers "_" and tab, so a single lazy group is enough; the
    # capture group replaces slicing "class=" and the terminator off by hand.
    res = search(r"class=(.+?)[ >]", line)
    if res is None:
        return "None"
    return res.group(1)
def get_data_from_line(line):
    """
    Go from an HTML line to the data carried by that line.

    The data is everything between the first ">" and the last "<" on the
    line. A line that is exactly "</div>" has no such span and is reported
    as "close_div" instead.

    Raises AttributeError when any other line has no ">...<" span.
    """
    if line == "</div>":
        return "close_div"
    # Greedy match: from the first ">" through the last "<".
    start, end = search(">.*<", line).span()
    # Step inside the delimiters the pattern matched.
    return line[start + 1:end - 1]
def get_precinct_name(num):
    """
    Given precinct number, get precinct name

    The lookup goes through the module-level num_to_name table, so a number
    that is not in that table raises KeyError.
    """
    precinct_label = num_to_name[num]
    return precinct_label
from csv import writer
# Path of the CSV this script produces.
name = "20181106__in__general__wabash__precinct.csv"
# Header row; every data row written later uses this column order.
# KEY: County, Precinct, Machine Ballots, Absentee Ballots, Provisional Ballots, Total Ballots, Race, Candidate
key = [
    "county",
    "precinct",
    "election_day",
    "absentee",
    "provisional",
    "votes",
    "office",
    "candidate",
]
# out_f stays open after this point: rows are appended through `output`.
out_f = open(name, "w")
output = writer(out_f)
output.writerow(key)
# s5_ is the first one we care about
# s5_.f0_ has the race name
# For each s0_ (candidate row):
# f1_ has Machine votes
# f2_ has Absentee votes
# f3_ has Provisional votes
# f16_ has Total votes
# f4_ has candidate (WRITE HERE)
# s6_ has a break
# s2_ means we don't care anymore
# Walk every downloaded precinct page line by line and write one CSV row
# per candidate. "ERROR" is the placeholder for a field that has not been
# seen yet; it ends up in the CSV if a row is written before the field
# shows up.
county = "Wabash"
precinct_name = "ERROR"
candidate = race = total = provisional = absentee = machine = "ERROR"

# sorted() makes the output order deterministic (ascending precinct number)
# instead of depending on dict iteration order.
for precinct in sorted(pages):
    print(precinct)
    page = pages[precinct]
    precinct_name = get_precinct_name(precinct)
    too_early = True
    for line in page.split("\n"):
        # Ignore the lines that're too early - they mess up our system.
        # The first line containing "s5_" ends the skipping and is itself
        # processed below.
        if too_early:
            if "s5_" not in line:
                continue
            print("no longer too early")
            too_early = False
        clazz = get_class_name(line)
        print(clazz)
        if clazz == "f0_":
            race = get_data_from_line(line)
        elif clazz == "f1_":
            machine = get_data_from_line(line)
        elif clazz == "f2_":
            absentee = get_data_from_line(line)
        elif clazz == "f3_":
            provisional = get_data_from_line(line)
        elif clazz == "f16_":
            total = get_data_from_line(line)
        elif clazz == "f4_":
            # The candidate cell is the trigger for writing the row, using
            # whatever vote counts and race name were collected so far.
            print("getting candidate")
            candidate = get_data_from_line(line)
            # KEY: County, Precinct, Machine Ballots, Absentee Ballots, Provisional Ballots, Total Ballots, Race, Candidate
            row = [county, precinct_name, machine, absentee, provisional, total, race, candidate]
            output.writerow(row)
        elif clazz == "s6_":
            # Break marker: forget everything collected so far.
            candidate = race = total = provisional = absentee = machine = "ERROR"
        # Any other class (s2_ included) carries nothing we need.

# Close explicitly so every buffered row is flushed to disk rather than
# relying on interpreter shutdown.
out_f.close()