scrape_problems.py
import urllib.request
import os


def get_line(problem_file, data_points):
    # Read the next valid record: skip blank lines, lines starting with "#",
    # and lines that do not split into the expected number of
    # ".,."-separated fields.
    line = problem_file.readline()
    if line == "":
        return line
    line_items = line.split(".,.")
    if len(line_items) != data_points or line_items[0] == "\n" or line[0] == "#":
        line = get_line(problem_file, data_points)
    return line
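# A sketch of the record format these helpers parse. The shape follows from
# get_items at the bottom of this file, which writes 11 ".,."-separated
# fields per problem: the problem URL, the seven factor_list values, and the
# three hold lists (start, middle, end). The placeholder names here are
# illustrative, not taken from the repository:
#   <url>.,.<factor 1>.,. ... .,.<factor 7>.,.<start holds>.,.<middle holds>.,.<end holds>
# This is why get_existing below passes data_points=11 to get_line.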
def get_existing(problem_file, problem_list=None):
    # Collect the address (first field) of every problem already saved in
    # problems.txt, so the same problem is not scraped twice.
    if problem_list is None:  # avoid sharing a mutable default argument
        problem_list = []
    line = get_line(problem_file, 11)
    while line:
        problem_list.append(line.split(".,.")[0])
        line = get_line(problem_file, 11)
    return problem_list
def get_addresses():
    # Read the list of problem page URLs to scrape, skipping any line whose
    # length (without the trailing newline) is exactly 47 characters.
    address_file = open("problem_list.txt", "r")
    line = get_line(address_file, 1)
    address_list = []
    while line:
        if len(line[:-1]) != 47:
            address_list.append(line[:-1])
        line = get_line(address_file, 1)
    address_file.close()
    return address_list
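# problem_list.txt is expected to hold one problem page URL per line. A
# hypothetical example (the exact URL shape is an assumption, not taken from
# this repository):
#   https://www.moonboard.com/Problems/View/12345/example-problem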
def scrape(url):
    # Download the page at url (with a browser User-Agent so the request is
    # not rejected as a bot) and save it to a temporary file.
    scrape_file = open("temp.txt", "w")
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    f = urllib.request.urlopen(req)
    scrape_file.write(f.read().decode('utf-8'))
    f.close()
    scrape_file.close()
    print("Just scraped " + url)
def get_scrape_line():
    # Find the line of the downloaded page that carries the problem data
    # (the one containing '"Method'); returns "" if no such line exists.
    # The first line is now checked too, instead of being skipped.
    scrape_file = open("temp.txt", "r")
    scrape_line = scrape_file.readline()
    check = scrape_line.find('"Method')
    while scrape_line and check == -1:
        scrape_line = scrape_file.readline()
        check = scrape_line.find('"Method')
    scrape_file.close()
    return scrape_line
def extract_data(name, line):
    # Extract the value that follows `name` in the scraped data: skip past
    # name plus the three characters '":"' that follow it, then keep
    # everything up to the next double quote (looking at most 30 characters
    # ahead).
    length = len(name)
    position = line.find(name) + length + 3
    search_area = line[position:(position + 30)]
    search_list = search_area.split('"')
    item = search_list[0]
    return item
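# Worked example for extract_data, on a hypothetical fragment of scraped
# data (the surrounding field names are made up for illustration):
#   line = '..."Grade":"6B+","UserGrade":null...'
#   extract_data('"Grade', line)
# find() locates '"Grade', the +3 skips the '":"' after it, and the split
# on '"' keeps everything up to the next quote, so the call returns '6B+'.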
def get_hold(line):
    # Split the move data into individual holds and sort them into start,
    # middle and end categories based on their IsStart/IsEnd flags.
    hold_list = line.split("},{")
    middle_hold = ""
    end_hold = ""
    start_hold = ""
    for item in hold_list:
        data_list = item.split(",")
        is_start = data_list[2].split(":")
        if is_start[1] == "true":
            start_hold += (data_list[1].split(":"))[1][1:-1] + ","
        elif (data_list[3].split(":"))[1] == "true":
            end_hold += (data_list[1].split(":"))[1][1:-1] + ","
        else:
            middle_hold += (data_list[1].split(":"))[1][1:-1] + ","
    return start_hold + ".,." + middle_hold + ".,." + end_hold
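# Worked example for get_hold, on hypothetical move data shaped like the
# scraped JSON (the Id/Description/IsStart/IsEnd field order is assumed
# from the index-based parsing above):
#   line = ('"Id":1,"Description":"A5","IsStart":true,"IsEnd":false},'
#           '{"Id":2,"Description":"C10","IsStart":false,"IsEnd":false},'
#           '{"Id":3,"Description":"D18","IsStart":false,"IsEnd":true')
#   get_hold(line)  # -> 'A5,.,.C10,.,.D18,'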
def extract_holds(line):
    # Cut the string down to just the holds information, then categorise
    # the holds with get_hold.
    position = line.find("Moves")
    line = line[(position + 9):]  # skip past 'Moves":[{'
    position = line.find("]")
    line = line[0:(position - 1)]  # drop the closing '}]'
    holds = get_hold(line)
    return holds
def get_items(line, address):
    # Extract each metadata field named in factor_list, then the holds, and
    # append the resulting ".,."-separated record to problems.txt.
    # "IsBenchmar" is presumably truncated on purpose: the +3 skip in
    # extract_data then lands exactly on the unquoted boolean value.
    factor_list = ["Method", "Name", '"Grade', '"Id":1,"Description', 'Nickname', '"Holdsetup":{"Id":15,"Description', "IsBenchmar"]
    save_string = ""
    for item in factor_list:
        extracted = extract_data(item, line)
        save_string += extracted + ".,."
    holds = extract_holds(line)
    save_string += holds + "\r\n"
    save_string = address + ".,." + save_string
    save_file = open("problems.txt", "a")
    save_file.write(save_string)
    save_file.close()
def main():
    # "a+" creates problems.txt on the first run instead of crashing;
    # seek(0) moves back to the start so existing records can be read.
    problem_file = open("problems.txt", "a+")
    problem_file.seek(0)
    existing = get_existing(problem_file)
    problem_file.close()
    addresses = get_addresses()
    existing_number = 0
    new_number = 0
    for address in addresses:
        if address not in existing:
            new_number += 1
            scrape(address)  # writes the page to temp.txt; returns nothing
            line = get_scrape_line()
            get_items(line, address)
            os.remove("temp.txt")
        else:
            existing_number += 1
    print(str(existing_number) + " existing problems, " + str(new_number) + " new problems and " + str(existing_number + new_number) + " total problems")


if __name__ == "__main__":
    main()