Merge pull request #1 from zhangxu3486432/dev
Dev
zhangxu3486432 authored Feb 21, 2020
2 parents 056c0f4 + a7100ca commit 39b580a
Showing 9 changed files with 136 additions and 37 deletions.
3 changes: 3 additions & 0 deletions crawler/crawler/__init__.py
@@ -1,4 +1,7 @@
import os

if not os.path.exists('error'):
    os.makedirs('error')

if not os.path.exists('log'):
    os.makedirs('log')
5 changes: 4 additions & 1 deletion crawler/crawler/items.py
@@ -87,7 +87,7 @@ def get_category(url):


class CpdItem(Item):
    id = Field(
    request_id = Field(
        output_processor=TakeFirst(),
    )
    url = Field(
@@ -120,3 +120,6 @@ class CpdItem(Item):
    page = Field(
        output_processor=TakeFirst(),
    )
    total_page = Field(
        output_processor=TakeFirst(),
    )
28 changes: 28 additions & 0 deletions crawler/crawler/middlewares.py
@@ -7,6 +7,7 @@

import base64
import logging
import os
import random

from scrapy import signals
@@ -122,6 +123,33 @@ def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)


class SaveHttpErrorMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        return cls(crawler)

    def __init__(self, crawler):
        spider_name = crawler.spider.name

        path = f'error/{spider_name}'

        self.error_file = os.path.join(path, 'error.tsv')
        self.retry_file = os.path.join(path, 'retry.tsv')

        if os.path.exists(self.error_file):
            os.rename(self.error_file, self.retry_file)

    def process_spider_input(self, response, spider):
        if 200 <= response.status < 300:  # common case
            return
        if response.status != 404:
            with open(self.error_file, 'a+') as f:
                line = f'{response.url}\t{response.status}\n'
                f.write(line)
        return


# Proxy server
proxyServer = "http://http-dyn.abuyun.com:9020"

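Reviewer note: the new SaveHttpErrorMiddleware appends every non-2xx, non-404 response to error/<spider>/error.tsv and, at the start of the next run, renames that file to retry.tsv so the spider can re-queue the failed URLs (see the start_requests change in cpd_spider.py below). It is registered at order 49 in settings.py, just ahead of Scrapy's built-in HttpErrorMiddleware at 50, so it should see failing responses before they are filtered out. A minimal sketch of the file round trip outside of Scrapy — file names mirror the middleware, the URL and status code are made up:

import os

path = 'error/cpd'
os.makedirs(path, exist_ok=True)
error_file = os.path.join(path, 'error.tsv')
retry_file = os.path.join(path, 'retry.tsv')

# What process_spider_input does for a failing response (hypothetical URL/status):
with open(error_file, 'a+') as f:
    f.write('http://www.cpd.com.cn/n123/c456/content.html\t503\n')

# What __init__ does on the next run, and what start_requests reads back:
if os.path.exists(error_file):
    os.rename(error_file, retry_file)

with open(retry_file, 'r') as f:
    urls = [line.split('\t')[0] for line in f if line.strip()]
print(urls)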
48 changes: 36 additions & 12 deletions crawler/crawler/pipelines.py
@@ -5,7 +5,6 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
@@ -23,24 +22,49 @@ def __init__(self):
        mysql_user = settings.get('MYSQL_USER', 'root')
        mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler')
        mysql_port = settings.get('MYSQL_PORT', 3306)
        database = 'news'
        self.db = pymysql.connect(host, mysql_user, mysql_pwd, database, mysql_port)
        self.db = pymysql.connect(host=host, user=mysql_user, password=mysql_pwd, port=mysql_port)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        sql = """INSERT INTO
            `cpd_news`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        sql_news = """INSERT INTO
            news.cpd_news (news_id, title, category, source, date, page_total)
            VALUES (%s,%s,%s,%s,%s,%s)
        """
        sql_content = """INSERT INTO
            news.cpd_news_content (news_id, request_id, url, content, page)
            VALUES (%s,%s,%s,%s,%s)
        """
        request_id = item.get('request_id', '')
        url = item.get('url', '')
        title = item.get('title', '')
        content = item.get('content', '')
        category = item.get('category', '')
        source = item.get('source', '')
        date = item.get('date', '')
        news_id = item.get('news_id', '')
        page = item.get('page', 0)
        total_page = item.get('total_page', 0)

        try:
            self.cursor.execute(sql, (
                item.get('id', ''), item.get('url', ''), item.get('title', ''), item.get('content', ''),
                item.get('category', ''),
                item.get('source', ''), item.get('date', ''), item.get('news_id', ''), item.get('page', 0)))
            self.cursor.execute(sql_content, (news_id, request_id, url, content, page))
            self.db.commit()
        except Exception as e:
            spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}')
            self.db.rollback()
            if e.args[0] == 1452:
                try:
                    self.cursor.execute(sql_news, (news_id, title, category, source, date, total_page))
                    self.db.commit()
                except Exception as e:
                    self.db.rollback()
                    spider.logger.error(e)
                try:
                    self.cursor.execute(sql_content, (news_id, request_id, url, content, page))
                    self.db.commit()
                except Exception as e:
                    self.db.rollback()
                    spider.logger.error(e)
            else:
                spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}')
        return

    def close_spider(self, spider):
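Reviewer note: the pipeline now writes each page to news.cpd_news_content first and creates the parent news.cpd_news row lazily — when MySQL rejects the child insert with error 1452 (foreign key constraint fails), the except branch inserts the metadata row and retries the content insert. The explicit database argument to pymysql.connect is dropped in favour of schema-qualified table names. A condensed sketch of that branch, assuming the failure surfaces as a pymysql server error with the MySQL error code in e.args[0]; the statements mirror the diff, the helper name is made up:

import pymysql

SQL_NEWS = """INSERT INTO news.cpd_news
              (news_id, title, category, source, date, page_total)
              VALUES (%s,%s,%s,%s,%s,%s)"""
SQL_CONTENT = """INSERT INTO news.cpd_news_content
                 (news_id, request_id, url, content, page)
                 VALUES (%s,%s,%s,%s,%s)"""

def save_page(db, content_args, news_args):
    cursor = db.cursor()
    try:
        cursor.execute(SQL_CONTENT, content_args)
        db.commit()
    except pymysql.err.MySQLError as e:
        db.rollback()
        if e.args[0] == 1452:  # parent row missing in news.cpd_news
            cursor.execute(SQL_NEWS, news_args)
            db.commit()
            cursor.execute(SQL_CONTENT, content_args)
            db.commit()
        else:
            raise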
21 changes: 15 additions & 6 deletions crawler/crawler/settings.py
@@ -8,6 +8,7 @@
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'crawler'

@@ -53,6 +54,7 @@
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'crawler.middlewares.SaveHttpErrorMiddleware': 49,
    'crawler.middlewares.CrawlerSpiderMiddleware': 543,
}

@@ -95,20 +97,27 @@
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DEPTH_LIMIT = 10

LOG_LEVEL = 'DEBUG'
LOG_STDOUT = True

DUPEFILTER_CLASS = 'crawler.dupefilters.RFPDupeFilter'

RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405]

REDIRECT_ENABLED = False

# Database settings
MYSQL_HOST = '39.99.157.187'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'news_crawler'
MYSQL_PORT = 3306
dev = os.getenv('ScrapyDev', False)
if not dev:
    MYSQL_HOST = '39.99.157.187'
    MYSQL_USER = 'root'
    MYSQL_PASSWORD = 'news_crawler'
    MYSQL_PORT = 3306
else:
    MYSQL_HOST = 'localhost'
    MYSQL_USER = 'root'
    MYSQL_PASSWORD = 'news_crawler'
    MYSQL_PORT = 3306
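Reviewer note: the database host now depends on a ScrapyDev environment variable. Since os.getenv returns a string, any non-empty value — including "0" or "false" — selects the localhost branch; only an unset variable falls through to the production host. A stricter check is sketched below; it is not part of the commit, just an alternative reading of the same switch:

import os

dev = os.getenv('ScrapyDev', '').lower() in ('1', 'true', 'yes')  # explicit allow-list
MYSQL_HOST = 'localhost' if dev else '39.99.157.187'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'news_crawler'
MYSQL_PORT = 3306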

USER_AGENT_LIST = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
3 changes: 3 additions & 0 deletions crawler/crawler/spiders/__init__.py
@@ -5,5 +5,8 @@

import os

if not os.path.exists('error/cpd'):
    os.makedirs('error/cpd')

if not os.path.exists('log/cpd'):
    os.makedirs('log/cpd')
28 changes: 22 additions & 6 deletions crawler/crawler/spiders/cpd_spider.py
@@ -8,6 +8,7 @@


import logging
import os
import re
from datetime import datetime

@@ -35,8 +36,8 @@ def __init__(self, *args, **kwargs):
    }

    database_name = 'news'
    table_name = 'cpd_news'
    filter_name = 'id'
    table_name = 'cpd_news_content'
    filter_name = 'request_id'

    allowed_domains = ["cpd.com.cn"]

@@ -101,6 +102,17 @@ def __init__(self, *args, **kwargs):
    p_path2 = re.compile('(.*?)content.html')

    def start_requests(self):
        try:
            path = f'error/{self.name}'
            retry_file = os.path.join(path, 'retry.tsv')
            with open(retry_file, 'r') as f:
                lines = f.readlines()
                for line in lines:
                    news_url = line.split('\t')[0]
                    yield scrapy.Request(url=news_url, callback=self.parse_news)
        except IOError:
            logger.info('retry.tsv not accessible')

        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_index, dont_filter=True)

@@ -121,11 +133,13 @@ def parse_news(self, response):
        if title is not None:
            next_page_html = response.xpath('//*[@id="autopage"]//script').get()

            page = response.meta.get('page', 1)
            current_page = response.meta.get('page', 1)
            total_page = 1

            next_page1 = self.p_news1.findall(next_page_html)  # [(total pages, current page)], counted from 0
            next_page2 = self.p_news2.findall(next_page_html)  # [(total pages, current page)], counted from 1
            if len(next_page1) == 1 and next_page1[0][0] != '0' and next_page1[0][1] == '0':
                total_page = int(next_page1[0][0])
                url_arr = self.p_path1.findall(url)
                if len(url_arr) == 1:
                    for page in range(1, int(next_page1[0][0])):
@@ -134,9 +148,10 @@ def parse_news(self, response):
                else:
                    self.logger.error(f'NEWS URL with unknown format: {url}')
            elif len(next_page2) == 1 and next_page2[0][0] != '1' and next_page2[0][1] == '1':
                total_page = int(next_page2[0][0])
                url_arr = self.p_path2.findall(url)
                if len(url_arr) == 1:
                    for page in range(2, int(next_page2[0][0] + 1)):
                    for page in range(2, int(next_page2[0][0]) + 1):
                        yield scrapy.Request(url=f'{url_arr[0]}content_{page}.html', callback=self.parse_news,
                                             meta={'page': page})
                else:
@@ -145,15 +160,16 @@ def parse_news(self, response):
            fp = request_fingerprint(response.request)

            cpd_item = ItemLoader(item=CpdItem(), response=response)
            cpd_item.add_value('id', fp)
            cpd_item.add_value('request_id', fp)
            cpd_item.add_value('url', url)
            cpd_item.add_xpath('title', '//*[@id="newslist"]/h1/gettitle/text()')
            cpd_item.add_xpath('content', '//*[@id="fz_test"]/div[1]/table')
            cpd_item.add_value('category', url)
            cpd_item.add_xpath('source', '//*[@id="source_report"]/text()')
            cpd_item.add_xpath('date', '//*[@id="pub_time_report"]/text()')
            cpd_item.add_value('news_id', url)
            cpd_item.add_value('page', page)
            cpd_item.add_value('page', current_page)
            cpd_item.add_value('total_page', total_page)
            yield cpd_item.load_item()

        links = self.link.extract_links(response)
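Reviewer note: parse_news now keeps the page number from response.meta in current_page, records the article's total_page from whichever pagination pattern matched, and fixes the follow-up loop for the 1-based pattern — int(next_page2[0][0] + 1), which would fail on the string match, becomes int(next_page2[0][0]) + 1. A small standalone sketch of the follow-up URL construction using the p_path2 regex from the spider; the example URL is made up:

import re

p_path2 = re.compile('(.*?)content.html')

def follow_up_urls(url, total_page):
    # The first page is the current response; pages 2..total_page are queued.
    url_arr = p_path2.findall(url)
    if len(url_arr) != 1:
        return []
    return [f'{url_arr[0]}content_{page}.html' for page in range(2, total_page + 1)]

# Hypothetical example:
# follow_up_urls('http://www.cpd.com.cn/n123/c456/content.html', 3)
# -> ['http://www.cpd.com.cn/n123/c456/content_2.html',
#     'http://www.cpd.com.cn/n123/c456/content_3.html']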
1 change: 1 addition & 0 deletions deploy/docker-compose-db.yml
@@ -9,6 +9,7 @@ services:
    restart: always
    environment:
      MYSQL_ROOT_PASSWORD: news_crawler
      TZ: "Asia/Shanghai"
    ports:
      - '3306:3306'
    volumes:
36 changes: 24 additions & 12 deletions deploy/init/cpd.sql
@@ -4,18 +4,30 @@ use news;

create table if not exists cpd_news
(
    id varchar(40) not null,
    url varchar(255) not null,
    title varchar(255) not null,
    content text not null,
    category varchar(5) not null,
    source varchar(50) not null,
    date varchar(30) not null,
    news_id varchar(50) not null,
    page int not null,
    news_id varchar(40) not null primary key comment 'news id',
    title varchar(255) not null comment 'news title',
    category varchar(10) not null comment 'news category',
    source varchar(50) not null comment 'news source',
    date varchar(30) not null comment 'news date',
    page_total int not null comment 'total number of pages',
    duplication varchar(40) not null default '',
    entry_time datetime not null default CURRENT_TIMESTAMP comment 'time the row was inserted',
    constraint data_id_uindex
        unique (id)
        unique (news_id)
);

alter table data
    add primary key (id);
create table if not exists cpd_news_content
(
    news_id varchar(40) not null comment 'news id',
    request_id varchar(40) not null primary key comment 'request id',
    url varchar(255) not null comment 'news url',
    content mediumtext not null comment 'news content',
    page int not null comment 'current page number',
    entry_time datetime not null default CURRENT_TIMESTAMP comment 'time the row was inserted',
    constraint data_id_uindex
        unique (request_id),
    FOREIGN KEY fk_news(news_id)
        REFERENCES cpd_news(news_id)
        ON UPDATE CASCADE
        ON DELETE RESTRICT
);
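Reviewer note: the schema split mirrors the item change — cpd_news holds one metadata row per article keyed by news_id, and cpd_news_content holds one row per fetched page keyed by request_id with a foreign key back to the parent. A short sketch of reading an article back out of the two tables; connection values mirror the defaults in settings.py and the news_id is a placeholder:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='news_crawler', port=3306)
with db.cursor() as cursor:
    cursor.execute(
        """
        SELECT n.title, c.page, c.content
        FROM news.cpd_news n
        JOIN news.cpd_news_content c ON c.news_id = n.news_id
        WHERE n.news_id = %s
        ORDER BY c.page
        """,
        ('<news_id>',),  # placeholder value
    )
    for title, page, content in cursor.fetchall():
        print(title, page, len(content))
db.close()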
