BigPlan.py

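# Crawls the listing pages under http://jp.xieav.com/one/LLP/ with headless
# PhantomJS: saves each entry's icon image and writes the extracted
# xfplay:// video link to a per-entry text file.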
import os
import random
import re

import lxml.html
import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# User-agent tokens to pick from at random (note: these are fragments of a
# single Firefox UA string rather than complete user-agent strings).
USER_AGENTS = ['Mozilla/5.0', 'Gecko/20100101', 'Firefox/53.0', 'Windows NT 6.1']

# Poll the page every 500 ms for up to 10 seconds, stopping once the
# specified element appears:
# wait = WebDriverWait(browser, 10)
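# A sketch of that explicit-wait variant, kept for reference (untested here;
# `param` stands in for a CSS selector of the element to wait for):
#
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.support import expected_conditions as EC
#   from selenium.webdriver.support.ui import WebDriverWait
#   wait = WebDriverWait(browser, 10)
#   wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
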
def parser(url):
    """Fetch `url` with headless PhantomJS and return the parsed lxml tree,
    or None if the page could not be loaded."""
    # This site has a huge number of pop-up ads, so crawling with a headless
    # PhantomJS instance works best.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    # Randomly pick a browser header from USER_AGENTS to disguise the browser.
    dcap["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENTS)
    # Skip loading images; pages crawl much faster without them.
    dcap["phantomjs.page.settings.loadImages"] = False
    browser = webdriver.PhantomJS(desired_capabilities=dcap)
    doc = None
    try:
        # Give up after 10 seconds if the page has not loaded, similar to
        # requests.get()'s timeout option, which driver.get() lacks.
        # driver.get(url) has been seen to hang forever without raising,
        # stalling the program; this timeout fixes that. It must be set
        # *before* get() so it applies to this load.
        browser.set_page_load_timeout(10)
        browser.get(url)
        # Implicit wait of 5 seconds for element lookups; tune as needed.
        browser.implicitly_wait(5)
        # The explicit-wait approach was abandoned: it has a bug (or
        # something was overlooked) and kept interrupting the crawl.
        # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
        html = browser.page_source
        doc = lxml.html.fromstring(html)
    except Exception as e:
        print(e)
    finally:
        browser.quit()
    return doc
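
# NOTE: PhantomJS support was deprecated in Selenium 3.8 and removed in
# Selenium 4, so parser() above requires an older Selenium release. A rough
# headless-Chrome equivalent (an untested sketch, assuming chromedriver is
# installed) would be:
#
#   from selenium.webdriver.chrome.options import Options
#   opts = Options()
#   opts.add_argument('--headless')
#   opts.add_argument('user-agent=' + random.choice(USER_AGENTS))
#   browser = webdriver.Chrome(options=opts)
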
def getWeLike():
    print('Entering the site...')
    for index in range(200):
        if index == 0:
            doc = parser('http://jp.xieav.com/one/LLP/')
        else:
            doc = parser('http://jp.xieav.com/one/LLP/index{}.shtml'.format(index + 2))
        if doc is None:
            # The page failed to load; skip to the next listing page.
            continue
        filePics = doc.xpath('/html/body/div[3]/div[1]/div/ul/li/div[1]/a/img/@src')
        fileUrls = doc.xpath('/html/body/div[3]/div[1]/div/ul/li/div[1]/a/@href')
        fileRealNames = doc.xpath('/html/body/div[3]/div[1]/div/ul/li/div[1]/a/img/@alt')
        for i, eachImgName in enumerate(fileRealNames):
            print(eachImgName)
            # ------------------------------ create the folder ------------------------------
            folder = os.path.join('image', eachImgName)
            if not os.path.exists(folder):
                print('Creating folder...')
                os.makedirs(folder)
            # ---------------------------- download the icon image --------------------------
            picurl = 'http://jp.xieav.com' + filePics[i]
            try:
                r = requests.get(picurl)
                with open(os.path.join(folder, '1.jpg'), 'wb') as fo:
                    fo.write(r.content)
                # ------------------------ go to the corresponding page ----------------------
                intourl = 'http://jp.xieav.com' + fileUrls[i] + 'player.shtml'
                print(intourl)
                # ---------------------------- go to the JS page -----------------------------
                videodoc = parser(intourl)
                videojs = videodoc.xpath('//*[@id="ccplay"]/script[1]/@src')
                print('http://jp.xieav.com' + videojs[0])
                jscontent = parser('http://jp.xieav.com' + videojs[0]).xpath('/html/body/pre/text()')
                # Extract the xfplay:// link from the script body.
                urlPattern = r'.*(xfplay://\S+\.(mp4|wmv|rmvb)).*'
                match = re.match(urlPattern, jscontent[0])
                if match:
                    xfplay = match.group(1)
                    print(xfplay)
                    with open(os.path.join(folder, '1.txt'), 'w') as f:
                        f.write(xfplay)
            except Exception as e:
                print(e)


if __name__ == '__main__':
    getWeLike()
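
# Usage note (assumptions: the PhantomJS binary is on PATH, and the installed
# Selenium release still ships PhantomJS support, i.e. selenium < 4):
#   pip install "selenium<4" lxml requests
#   python BigPlan.py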