#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 19-7-19 4:10 PM
# @Author : Hubery
# @File : run.py
# @Software: PyCharm
import json
import os

from apscheduler.schedulers.blocking import BlockingScheduler
from concurrent.futures import ThreadPoolExecutor

from crawler import *

BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def run_crawler():
    """
    Run all crawlers concurrently in a thread pool and dump the collected
    results to result/result.json.
    :return:
    """
    crawler_list = [crawler_zhi_hu, crawler_v2ex, crawler_github, crawler_wei_bo, crawler_tie_ba, crawler_dou_ban,
                    crawler_tian_ya, crawler_wang_yi]
    result = []
    with ThreadPoolExecutor(max_workers=4) as pool:
        def get_result(future):
            """
            Callback registered via add_done_callback(); it runs once the
            submitted crawler finishes, and future.result() yields the
            crawler function's return value.
            :param future:
            :return:
            """
            crawler_result = future.result()
            result.append(crawler_result)
            # Alternatively, save each result to its own txt file:
            # hot_name = crawler_result.get('hot_name', '')
            # file_path = r'%s/%s.txt' % (os.path.join(BASE_DIR, 'result'), hot_name)
            # with open(file_path, 'a+', encoding='utf8') as f:
            #     f.write(str(crawler_result) + '\n')

        for crawler_func in crawler_list:
            pool.submit(crawler_func).add_done_callback(get_result)
    # Leaving the with-block waits for every crawler to finish, so result is complete here.
    result_dir = os.path.join(BASE_DIR, 'result')
    os.makedirs(result_dir, exist_ok=True)  # avoid FileNotFoundError if the directory is missing
    file_path = os.path.join(result_dir, 'result.json')
    with open(file_path, 'w', encoding='utf8') as f:
        f.write(json.dumps(result, ensure_ascii=False, indent=4))
    print('done')
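

# A minimal alternative sketch, not part of the original script (the function
# name run_crawler_as_completed is hypothetical): the same fan-out/collect
# step written with concurrent.futures.as_completed, which gathers results in
# the submitting thread instead of appending to a shared list from callbacks.
# It assumes, like run_crawler(), that each crawler_* function returns a dict.
def run_crawler_as_completed():
    from concurrent.futures import as_completed  # local import; only this sketch needs it
    crawler_list = [crawler_zhi_hu, crawler_v2ex, crawler_github, crawler_wei_bo, crawler_tie_ba, crawler_dou_ban,
                    crawler_tian_ya, crawler_wang_yi]
    with ThreadPoolExecutor(max_workers=4) as pool:
        # submit every crawler, then consume each result as its future completes
        futures = [pool.submit(crawler_func) for crawler_func in crawler_list]
        return [future.result() for future in as_completed(futures)]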


def run():
    """
    Crawl on a schedule.
    :return:
    """
    print('Starting the scheduled task')
    scheduler = BlockingScheduler()
    scheduler.add_job(func=run_crawler, trigger='interval', minutes=20)  # crawl every 20 minutes
    scheduler.start()
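

# An aside, not in the original: APScheduler can express a similar cadence
# with a cron trigger, e.g.
#     scheduler.add_job(func=run_crawler, trigger='cron', minute='*/20')
# The difference is that cron fires at clock minutes 0/20/40, while the
# interval trigger above counts 20 minutes from start-up.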


if __name__ == '__main__':
    run_crawler()  # run the crawlers once immediately
    run()  # then keep crawling on the 20-minute schedule