-
Notifications
You must be signed in to change notification settings - Fork 0
/
HDU_fetch_mysql.py
195 lines (184 loc) · 6.92 KB
/
HDU_fetch_mysql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# -*- coding: utf-8 -*-
import MySQLdb
import requests
import re
import datetime
import time
import sys
class HDU_fetcher(object):
"""docstring for POJ_fetcher"""
def __init__(self, arg=None, MySQL_info=None, quiet=False):
super
(HDU_fetcher, self).__init__()
self.arg = arg
# Sqlite
self.con = MySQLdb.connect(
host=MySQL_info["host"],
user=MySQL_info["user"],
passwd=MySQL_info["passwd"],
db=MySQL_info["db"],
charset=MySQL_info["charset"]
)
try:
self.create_table()
except Exception, e:
if re.match(r'Table .* already exists', e[1]):
print "table already exists"
else:
raise e
else:
print "create table hdu_data successfully"
# requests
self.s = requests.Session()
self.fileds = ['RunID','User','Problem','Result','Memory','Time','Language','Code_Length','Submit_Time']
self.quiet = quiet
def create_table(self):
cu = self.con.cursor()
cu.execute('''CREATE TABLE `hdu_data` (
`RunID` int(11) NOT NULL,
`User` varchar(45) NOT NULL,
`Problem` int(11) NOT NULL,
`Result` varchar(45) NOT NULL,
`Memory` varchar(45) DEFAULT NULL,
`Time` varchar(45) DEFAULT NULL,
`Language` varchar(45) NOT NULL,
`Code_Length` varchar(45) NOT NULL,
`Submit_Time` datetime NOT NULL,
PRIMARY KEY (`RunID`),
UNIQUE KEY `RunID_UNIQUE` (`RunID`),
KEY `index3` (`User`),
KEY `index4` (`Problem`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
''')
cu.close()
def fetch_html(self, start_at=16668947):
# http://acm.hdu.edu.cn/status.php?first=16668947&user=&pid=&lang=&status=#status
time.sleep(0.1)
url = "http://acm.hdu.edu.cn/status.php?first=" + \
str(start_at) + "&user=&pid=&lang=&status=#status"
while True:
success = True
print "Fetch RunID %9d" % start_at
try:
resp = self.s.get(url, timeout=5)
except Exception, e:
print e
success = False
else:
print "status code=%s" % resp.status_code
if resp.status_code != 200:
success = False
if re.search(r'Please retry after (?P<time>\d+)ms\.Thank you\.', resp.text):
success = False
retry_time = re.search(
r'Please retry after (?P<time>\d+)ms\.Thank you\.', resp.text).group('time')
retry_time = int(retry_time) / 1000.0
print "too_often retry after %.3f s" % retry_time
time.sleep(retry_time)
if success:
break
return resp
def fetch(self, start_at=16668947):
resp = self.fetch_html(start_at)
resp.encoding = "GBK"
patternstr = r'''
<tr\s(bgcolor=\#D7EBFF\s)?align=center\s>
<td.*?>(?P<RunID>\d+)</td>
<td.*?>(?P<Submit_Time>.*?)</td>
<td>(<a.*?>)?<font\scolor=.*?>(?P<Result>.*?)</font>(</a>)?</td>
<td><a.*?>(?P<Problem>\d+)</a></td>
<td>(?P<Time>\d+)MS</td>
<td>(?P<Memory>\d+)K</td>
<td>(?P<Code_Length>\d+)B</td>
<td>(?P<Language>.*?)</td>
<td.*?><a\shref=\"/userstatus.php\?user=(?P<User>.*?)\">.*?</a></td>
</tr>
'''
# patternstr = patternstr.replace('\n','')
# patternstr = re.escape(patternstr)
# print patternstr
pattern = re.compile(patternstr, re.S | re.VERBOSE)
results = []
# with open(r"Z:\resp%s.html" % start_at, 'w') as f:
# f.write(resp.content)
for m in pattern.finditer(resp.text):
line = {
'RunID': m.group('RunID'),
'User': m.group('User'),
'Problem': m.group('Problem'),
'Result': m.group('Result'),
'Memory': m.group('Memory'),
'Time': m.group('Time'),
'Language': m.group('Language'),
'Code_Length': m.group('Code_Length'),
'Submit_Time': m.group('Submit_Time'),
}
results.append(line)
# print line
# print results
print "got %d status" % len(results)
if len(results) == 0:
print resp.text
self.insert(results)
return results
def insert(self, status):
cu = self.con.cursor()
status_array = []
for s in status:
sarr = []
for key in self.fileds:
sarr.append(s[key])
status_array.append(sarr)
if not self.quiet:
print sarr
# print status_array
sql = "REPLACE INTO hdu_data VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cu.executemany(sql, status_array)
self.con.commit()
def make_up(self, detla, only_print=False, verify=False):
cu = self.con.cursor()
cu.execute("SELECT S1.RunID as A, S2.RunID as B from hdu_data as S1,hdu_data as S2 where B - A > ? and B = (select min(RunID) from poj_data where RunID > A) ORDER BY S1.RunID",[detla])
bk = cu.fetchall()
for A, B in bk:
if verify:
print "%8d --> %8d --> %8d : %s" % (A, B-A, B, int(self.fetch(B)[0]['RunID']) == A)
else:
print "%8d --> %8d --> %8d" % (A, B-A, B)
if not only_print:
for i in xrange(A+20, B+20, 20):
self.fetch(i)
def main(self, begin, end, now_time):
if begin is None:
cu = self.con.cursor()
cu.execute("select RunID from hdu_data order by RunID DESC LIMIT 1")
begin = cu.fetchone()
if begin:
begin = int(begin[0])
else:
begin = 1020
if end is None:
end = begin + 100 * 10000
pre_time = datetime.datetime.strptime("2009-06-23 07:06:55", "%Y-%m-%d %H:%M:%S")
for i in xrange(begin, end, 20):
res = self.fetch(i)
rcd_time = datetime.datetime.strptime(res[0]["Submit_Time"], "%Y-%m-%d %H:%M:%S")
print rcd_time
if rcd_time > now_time or rcd_time == pre_time:
print now_time, rcd_time, pre_time
break
pre_time = rcd_time
time.sleep(0.1)
if __name__ == '__main__':
fetcher = HDU_fetcher(
MySQL_info={
"host": "localhost",
"user": "root",
"passwd": "199528",
"db": "OJ_data",
"charset": "utf8"
},
quiet=True
)
fetcher.main(None, None, datetime.datetime.today())
# 合并表的方法
# INSERT OR REPLACE INTO POJ_Status SELECT * FROM POJ_status.POJ_Status