-
Notifications
You must be signed in to change notification settings - Fork 340
/
util.py
56 lines (44 loc) · 1.34 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#coding=utf-8
import urllib2
import gzip
import StringIO
import ConfigParser
def get_content(toUrl,count):
    """Fetch ``toUrl`` with the configured cookie and return the response body.

    The cookie is read on every call from the ``cookie`` option of the
    ``[cookie]`` section of ``config.ini`` (path relative to the current
    working directory).  The request advertises ``Accept-Encoding: gzip``;
    a gzip-encoded response is decompressed before being returned.

    Args:
        toUrl: URL to request.
        count: index of this request; only used to tag the error message.

    Returns:
        The (decompressed) response body on success, or the sentinel
        string ``"FAIL"`` if opening or reading the URL raised.

    Raises:
        ConfigParser.Error: the cookie lookup happens before the guarded
            section, so a missing section/option propagates to the caller.
        IOError: decompression happens after the guarded section, so a
            corrupt gzip body propagates to the caller.
    """
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    cookie = cf.get("cookie", "cookie")

    request_headers = {
        'Cookie': cookie,
        'Host':'www.zhihu.com',
        'Referer':'http://www.zhihu.com/',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept-Encoding':'gzip',
    }
    req = urllib2.Request(url=toUrl, headers=request_headers)

    try:
        # NOTE: install_opener() replaces the process-wide default opener.
        # Kept as-is because other code may rely on that side effect.
        opener = urllib2.build_opener(urllib2.ProxyHandler())
        urllib2.install_opener(opener)
        page = urllib2.urlopen(req, timeout=15)
        try:
            content_encoding = page.info().get('Content-Encoding')
            content = page.read()
        finally:
            # Always release the connection, even when read() fails.
            page.close()
    except Exception as e:
        # Best-effort fetch: report the failure and hand back the sentinel
        # instead of raising.  (The former ``count % 1 == 0`` guard was
        # always true for an integer index, so every failure is logged.)
        print(str(count) + ", Error: " + str(e) + " URL: " + toUrl)
        return "FAIL"

    if content_encoding == 'gzip':
        gz = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        try:
            content = gz.read()
        finally:
            gz.close()
    return content