From 396aa7cf98baadfb9253c42498e9efe2000e2b48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=97=AD?= <zhangxu3486432@gmail.com> Date: Fri, 21 Feb 2020 21:55:19 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E6=9B=B4=E6=94=B9=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=BA=93=E8=AE=BE=E8=AE=A1=EF=BC=8C=E5=8C=BA=E5=88=86=E5=BC=80?= =?UTF-8?q?=E5=8F=91=E5=92=8C=E5=8F=91=E5=B8=83=E6=A8=A1=E5=BC=8F=EF=BC=8C?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler/crawler/items.py | 5 ++- crawler/crawler/pipelines.py | 49 ++++++++++++++++++++------- crawler/crawler/settings.py | 16 ++++++--- crawler/crawler/spiders/cpd_spider.py | 16 +++++---- deploy/docker-compose-db.yml | 1 + deploy/init/cpd.sql | 26 +++++++++----- 6 files changed, 81 insertions(+), 32 deletions(-) diff --git a/crawler/crawler/items.py b/crawler/crawler/items.py index 32ff55f..a8f7779 100644 --- a/crawler/crawler/items.py +++ b/crawler/crawler/items.py @@ -87,7 +87,7 @@ def get_category(url): class CpdItem(Item): - id = Field( + request_id = Field( output_processor=TakeFirst(), ) url = Field( @@ -120,3 +120,6 @@ class CpdItem(Item): page = Field( output_processor=TakeFirst(), ) + total_page = Field( + output_processor=TakeFirst(), + ) diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py index b4852e0..9d2c5de 100644 --- a/crawler/crawler/pipelines.py +++ b/crawler/crawler/pipelines.py @@ -5,7 +5,6 @@ # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html import pymysql -from scrapy.exceptions import DropItem from scrapy.utils.project import get_project_settings settings = get_project_settings() @@ -23,24 +22,48 @@ def __init__(self): mysql_user = settings.get('MYSQL_USER', 'root') mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler') mysql_port = settings.get('MYSQL_PORT', 3306) - database = 'news' - self.db = pymysql.connect(host, mysql_user, mysql_pwd, database, mysql_port) + self.db = pymysql.connect(host=host, user=mysql_user, password=mysql_pwd, port=mysql_port) self.cursor = self.db.cursor() def process_item(self, item, spider): - sql = """INSERT INTO - `cpd_news`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`) - VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s) - """ + sql_news = """INSERT INTO + news.cpd_news (news_id, title, category, source, date, page_total) + VALUES (%s,%s,%s,%s,%s,%s) + """ + sql_content = """INSERT INTO + news.cpd_news_content (news_id, request_id, url, content, page) + VALUES (%s,%s,%s,%s,%s) + """ + request_id = item.get('request_id', '') + url = item.get('url', '') + title = item.get('title', '') + content = item.get('content', '') + category = item.get('category', '') + source = item.get('source', '') + date = item.get('date', '') + news_id = item.get('news_id', '') + page = item.get('page', 0) + total_page = item.get('total_page', 0) + try: - self.cursor.execute(sql, ( - item.get('id', ''), item.get('url', ''), item.get('title', ''), item.get('content', ''), - item.get('category', ''), - item.get('source', ''), item.get('date', ''), item.get('news_id', ''), item.get('page', 0))) - self.db.commit() + self.cursor.execute(sql_content, (news_id, request_id, url, content, page)) except Exception as e: - spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}') self.db.rollback() + if e.args[0] == 1452: + try: + self.cursor.execute(sql_news, (news_id, title, category, 
source, date, total_page)) + except Exception as e: + self.db.rollback() + spider.logger.error(e) + self.db.commit() + try: + self.cursor.execute(sql_content, (news_id, request_id, url, content, page)) + except Exception as e: + self.db.rollback() + spider.logger.error(e) + self.db.commit() + else: + spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}') return def close_spider(self, spider): diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py index 2dffa8b..9064c90 100644 --- a/crawler/crawler/settings.py +++ b/crawler/crawler/settings.py @@ -8,6 +8,7 @@ # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html +import os BOT_NAME = 'crawler' @@ -105,10 +106,17 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405] # 数据库设置 -MYSQL_HOST = '39.99.157.187' -MYSQL_USER = 'root' -MYSQL_PASSWORD = 'news_crawler' -MYSQL_PORT = 3306 +dev = os.getenv('ScrapyDev', False) +if not dev: + MYSQL_HOST = '39.99.157.187' + MYSQL_USER = 'root' + MYSQL_PASSWORD = 'news_crawler' + MYSQL_PORT = 3306 +else: + MYSQL_HOST = 'localhost' + MYSQL_USER = 'root' + MYSQL_PASSWORD = 'news_crawler' + MYSQL_PORT = 3306 USER_AGENT_LIST = [ 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', diff --git a/crawler/crawler/spiders/cpd_spider.py b/crawler/crawler/spiders/cpd_spider.py index 5f17ee5..729c951 100644 --- a/crawler/crawler/spiders/cpd_spider.py +++ b/crawler/crawler/spiders/cpd_spider.py @@ -35,8 +35,8 @@ def __init__(self, *args, **kwargs): } database_name = 'news' - table_name = 'cpd_news' - filter_name = 'id' + table_name = 'cpd_news_content' + filter_name = 'request_id' allowed_domains = ["cpd.com.cn"] @@ -121,11 +121,13 @@ def parse_news(self, response): if title is not None: next_page_html = response.xpath('//*[@id="autopage"]//script').get() - page = response.meta.get('page', 1) + current_page = response.meta.get('page', 1) + total_page = 1 next_page1 = self.p_news1.findall(next_page_html) # [(总页数 0,当前页数 0)] 从 0 计数 next_page2 = self.p_news2.findall(next_page_html) # [(总页数 1,当前页数 1)] 从 1 计数 if len(next_page1) == 1 and next_page1[0][0] != '0' and next_page1[0][1] == '0': + total_page = int(next_page1[0][0]) url_arr = self.p_path1.findall(url) if len(url_arr) == 1: for page in range(1, int(next_page1[0][0])): @@ -134,9 +136,10 @@ def parse_news(self, response): else: self.logger.error(f'未知格式的 NEWS URL: {url}') elif len(next_page2) == 1 and next_page2[0][0] != '1' and next_page2[0][1] == '1': + total_page = int(next_page2[0][0]) url_arr = self.p_path2.findall(url) if len(url_arr) == 1: - for page in range(2, int(next_page2[0][0] + 1)): + for page in range(2, int(next_page2[0][0]) + 1): yield scrapy.Request(url=f'{url_arr[0]}content_{page}.html', callback=self.parse_news, meta={'page': page}) else: @@ -145,7 +148,7 @@ def parse_news(self, response): fp = request_fingerprint(response.request) cpd_item = ItemLoader(item=CpdItem(), response=response) - cpd_item.add_value('id', fp) + cpd_item.add_value('request_id', fp) cpd_item.add_value('url', url) cpd_item.add_xpath('title', '//*[@id="newslist"]/h1/gettitle/text()') cpd_item.add_xpath('content', '//*[@id="fz_test"]/div[1]/table') @@ -153,7 +156,8 @@ def parse_news(self, response): cpd_item.add_xpath('source', '//*[@id="source_report"]/text()') cpd_item.add_xpath('date', 
'//*[@id="pub_time_report"]/text()') cpd_item.add_value('news_id', url) - cpd_item.add_value('page', page) + cpd_item.add_value('page', current_page) + cpd_item.add_value('total_page', total_page) yield cpd_item.load_item() links = self.link.extract_links(response) diff --git a/deploy/docker-compose-db.yml b/deploy/docker-compose-db.yml index ad8a1ff..fec180c 100644 --- a/deploy/docker-compose-db.yml +++ b/deploy/docker-compose-db.yml @@ -9,6 +9,7 @@ services: restart: always environment: MYSQL_ROOT_PASSWORD: news_crawler + TZ: "Asia/Shanghai" ports: - '3306:3306' volumes: diff --git a/deploy/init/cpd.sql b/deploy/init/cpd.sql index dbc0b36..5a443ee 100644 --- a/deploy/init/cpd.sql +++ b/deploy/init/cpd.sql @@ -4,18 +4,28 @@ use news; create table if not exists cpd_news ( - id varchar(40) not null, - url varchar(255) not null, + news_id varchar(50) not null primary key, title varchar(255) not null, - content text not null, category varchar(5) not null, source varchar(50) not null, date varchar(30) not null, - news_id varchar(50) not null, - page int not null, + page_total int not null, + entry_time datetime not null default CURRENT_TIMESTAMP comment '入库时间', constraint data_id_uindex - unique (id) + unique (news_id) ); -alter table data - add primary key (id); +create table if not exists cpd_news_content +( + news_id varchar(40) not null primary key, + request_id varchar(40) not null, + url varchar(255) not null, + content text not null, + page int not null, + constraint data_id_uindex + unique (request_id), + FOREIGN KEY fk_news(news_id) + REFERENCES cpd_news(news_id) + ON UPDATE CASCADE + ON DELETE RESTRICT +); From 454f63623c329241b1128893e718c81b22f11292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=97=AD?= <zhangxu3486432@gmail.com> Date: Fri, 21 Feb 2020 22:43:36 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E5=8F=96=E6=B6=88=20depth=20limit=20?= =?UTF-8?q?=E9=99=90=E5=88=B6=EF=BC=8C=E6=B7=BB=E5=8A=A0=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=BA=93=E5=AD=97=E6=AE=B5=E6=B3=A8=E9=87=8A=EF=BC=8C=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=20pipline=20=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler/crawler/pipelines.py | 5 +++-- crawler/crawler/settings.py | 2 -- deploy/init/cpd.sql | 24 +++++++++++++----------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py index 9d2c5de..7c079c4 100644 --- a/crawler/crawler/pipelines.py +++ b/crawler/crawler/pipelines.py @@ -47,21 +47,22 @@ def process_item(self, item, spider): try: self.cursor.execute(sql_content, (news_id, request_id, url, content, page)) + self.db.commit() except Exception as e: self.db.rollback() if e.args[0] == 1452: try: self.cursor.execute(sql_news, (news_id, title, category, source, date, total_page)) + self.db.commit() except Exception as e: self.db.rollback() spider.logger.error(e) - self.db.commit() try: self.cursor.execute(sql_content, (news_id, request_id, url, content, page)) + self.db.commit() except Exception as e: self.db.rollback() spider.logger.error(e) - self.db.commit() else: spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}') return diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py index 9064c90..ebdfb3a 100644 --- a/crawler/crawler/settings.py +++ b/crawler/crawler/settings.py @@ -96,8 +96,6 @@ # HTTPCACHE_IGNORE_HTTP_CODES = [] # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' -DEPTH_LIMIT = 10 - LOG_LEVEL 
= 'DEBUG' LOG_STDOUT = True diff --git a/deploy/init/cpd.sql b/deploy/init/cpd.sql index 5a443ee..ce78795 100644 --- a/deploy/init/cpd.sql +++ b/deploy/init/cpd.sql @@ -4,12 +4,13 @@ use news; create table if not exists cpd_news ( - news_id varchar(50) not null primary key, - title varchar(255) not null, - category varchar(5) not null, - source varchar(50) not null, - date varchar(30) not null, - page_total int not null, + news_id varchar(40) not null primary key comment '新闻 id', + title varchar(255) not null comment '新闻标题', + category varchar(10) not null comment '新闻分类', + source varchar(50) not null comment '新闻来源', + date varchar(30) not null comment '新闻日期', + page_total int not null comment '新闻总页数', + duplication varchar(40) not null default '', entry_time datetime not null default CURRENT_TIMESTAMP comment '入库时间', constraint data_id_uindex unique (news_id) @@ -17,11 +18,12 @@ create table if not exists cpd_news create table if not exists cpd_news_content ( - news_id varchar(40) not null primary key, - request_id varchar(40) not null, - url varchar(255) not null, - content text not null, - page int not null, + news_id varchar(40) not null comment '新闻 id', + request_id varchar(40) not null primary key comment '请求 id', + url varchar(255) not null comment '新闻链接', + content mediumtext not null comment '新闻内容', + page int not null comment '当前页数', + entry_time datetime not null default CURRENT_TIMESTAMP comment '入库时间', constraint data_id_uindex unique (request_id), FOREIGN KEY fk_news(news_id) From a7100ca03203e8dca50cb840cb22677324868c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=97=AD?= <zhangxu3486432@gmail.com> Date: Fri, 21 Feb 2020 23:22:50 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E5=A4=84=E7=90=86=EF=BC=8C=E7=A6=81=E6=AD=A2=E9=87=8D=E5=AE=9A?= =?UTF-8?q?=E5=90=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler/crawler/__init__.py | 3 +++ crawler/crawler/middlewares.py | 28 +++++++++++++++++++++++++++ crawler/crawler/settings.py | 3 +++ crawler/crawler/spiders/__init__.py | 3 +++ crawler/crawler/spiders/cpd_spider.py | 12 ++++++++++++ 5 files changed, 49 insertions(+) diff --git a/crawler/crawler/__init__.py b/crawler/crawler/__init__.py index 05a8854..2763120 100644 --- a/crawler/crawler/__init__.py +++ b/crawler/crawler/__init__.py @@ -1,4 +1,7 @@ import os +if not os.path.exists('error'): + os.makedirs('error') + if not os.path.exists('log'): os.makedirs('log') diff --git a/crawler/crawler/middlewares.py b/crawler/crawler/middlewares.py index 30cff0a..03c3f4e 100644 --- a/crawler/crawler/middlewares.py +++ b/crawler/crawler/middlewares.py @@ -7,6 +7,7 @@ import base64 import logging +import os import random from scrapy import signals @@ -122,6 +123,33 @@ def process_request(self, request, spider): request.headers['User-Agent'] = random.choice(self.user_agent_list) +class SaveHttpErrorMiddleware(object): + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ return cls(crawler) + + def __init__(self, crawler): + spider_name = crawler.spider.name + + path = f'error/{spider_name}' + + self.error_file = os.path.join(path, 'error.tsv') + self.retry_file = os.path.join(path, 'retry.tsv') + + if os.path.exists(self.error_file): + os.rename(self.error_file, self.retry_file) + + def process_spider_input(self, response, spider): + if 200 <= response.status < 300: # common case + return + if response.status != 404: + with open(self.error_file, 'a+') as f: + line = f'{response.url}\t{response.status}\n' + f.write(line) + return + + # 代理服务器 proxyServer = "http://http-dyn.abuyun.com:9020" diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py index ebdfb3a..eb2c480 100644 --- a/crawler/crawler/settings.py +++ b/crawler/crawler/settings.py @@ -54,6 +54,7 @@ # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html SPIDER_MIDDLEWARES = { + 'crawler.middlewares.SaveHttpErrorMiddleware': 49, 'crawler.middlewares.CrawlerSpiderMiddleware': 543, } @@ -103,6 +104,8 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405] +REDIRECT_ENABLED = False + # 数据库设置 dev = os.getenv('ScrapyDev', False) if not dev: diff --git a/crawler/crawler/spiders/__init__.py b/crawler/crawler/spiders/__init__.py index 1ee9c1a..e31986d 100644 --- a/crawler/crawler/spiders/__init__.py +++ b/crawler/crawler/spiders/__init__.py @@ -5,5 +5,8 @@ import os +if not os.path.exists('error/cpd'): + os.makedirs('error/cpd') + if not os.path.exists('log/cpd'): os.makedirs('log/cpd') diff --git a/crawler/crawler/spiders/cpd_spider.py b/crawler/crawler/spiders/cpd_spider.py index 729c951..afdf459 100644 --- a/crawler/crawler/spiders/cpd_spider.py +++ b/crawler/crawler/spiders/cpd_spider.py @@ -8,6 +8,7 @@ import logging +import os import re from datetime import datetime @@ -101,6 +102,17 @@ def __init__(self, *args, **kwargs): p_path2 = re.compile('(.*?)content.html') def start_requests(self): + try: + path = f'error/{self.name}' + retry_file = os.path.join(path, 'retry.tsv') + with open(retry_file, 'r') as f: + lines = f.readlines() + for line in lines: + news_url = line.split('\t')[0] + yield scrapy.Request(url=news_url, callback=self.parse_news) + except IOError: + logger.info('retry.tsv not accessible') + for url in self.start_urls: yield scrapy.Request(url=url, callback=self.parse_index, dont_filter=True)
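
Note on the pipeline change carried across PATCH 1 and PATCH 2: the net effect is an insert-with-fallback flow. The child row for news.cpd_news_content is inserted first; if MySQL rejects it with error 1452 (foreign key constraint fails, i.e. the parent news.cpd_news row does not exist yet), the pipeline creates the parent row and retries the content insert. The following is a minimal standalone sketch of that flow, not the pipeline itself: the connection parameters and the plain-dict item are illustrative placeholders, while the table and column names are taken from deploy/init/cpd.sql.

    # Sketch of the insert-with-parent-fallback logic (assumptions: local MySQL,
    # `news` schema created from deploy/init/cpd.sql, item passed as a plain dict).
    import pymysql

    db = pymysql.connect(host='localhost', user='root',
                         password='news_crawler', port=3306)
    cursor = db.cursor()

    SQL_NEWS = """INSERT INTO news.cpd_news
                  (news_id, title, category, source, date, page_total)
                  VALUES (%s,%s,%s,%s,%s,%s)"""
    SQL_CONTENT = """INSERT INTO news.cpd_news_content
                     (news_id, request_id, url, content, page)
                     VALUES (%s,%s,%s,%s,%s)"""

    def save_item(item):
        content_row = (item['news_id'], item['request_id'], item['url'],
                       item['content'], item['page'])
        try:
            # Common case: the parent cpd_news row already exists.
            cursor.execute(SQL_CONTENT, content_row)
            db.commit()
        except pymysql.MySQLError as e:
            db.rollback()
            if e.args[0] == 1452:  # parent row missing in cpd_news
                news_row = (item['news_id'], item['title'], item['category'],
                            item['source'], item['date'], item['total_page'])
                cursor.execute(SQL_NEWS, news_row)
                db.commit()
                # Retry the content insert now that the parent exists.
                cursor.execute(SQL_CONTENT, content_row)
                db.commit()
            else:
                raise

In the patched pipeline the retry steps are each wrapped in their own try/except and failures are logged via spider.logger instead of being re-raised, and PATCH 2 moves every db.commit() inside the corresponding try block so a failed execute is rolled back rather than committed.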