From 396aa7cf98baadfb9253c42498e9efe2000e2b48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=97=AD?= <zhangxu3486432@gmail.com>
Date: Fri, 21 Feb 2020 21:55:19 +0800
Subject: [PATCH 1/3] Change the database design, separate development and release modes, fix bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
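Notes: each item is now split across two tables, news.cpd_news (one row per
article) and news.cpd_news_content (one row per fetched page), joined on
news_id. A rough sketch of the insert order the pipeline aims for, outside
Scrapy and with assumed local credentials (save_page is a made-up helper
name, not part of the code):

    import pymysql

    # Assumed local connection; the real values come from settings.py.
    db = pymysql.connect(host='localhost', user='root',
                         password='news_crawler', port=3306)
    cursor = db.cursor()

    sql_news = ("INSERT INTO news.cpd_news "
                "(news_id, title, category, source, date, page_total) "
                "VALUES (%s,%s,%s,%s,%s,%s)")
    sql_content = ("INSERT INTO news.cpd_news_content "
                   "(news_id, request_id, url, content, page) "
                   "VALUES (%s,%s,%s,%s,%s)")

    def save_page(item):
        # Try the child row first; on FK error 1452 (parent row missing)
        # insert the parent article, then retry the child row.
        content_row = (item['news_id'], item['request_id'], item['url'],
                       item['content'], item['page'])
        try:
            cursor.execute(sql_content, content_row)
            db.commit()
        except pymysql.IntegrityError as e:
            db.rollback()
            if e.args[0] != 1452:
                raise
            cursor.execute(sql_news, (item['news_id'], item['title'],
                                      item['category'], item['source'],
                                      item['date'], item['total_page']))
            db.commit()
            cursor.execute(sql_content, content_row)
            db.commit()

The pipeline below catches a plain Exception instead and still defers some
commits; the commit placement is tightened up in the next patch.
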
 crawler/crawler/items.py              |  5 ++-
 crawler/crawler/pipelines.py          | 49 ++++++++++++++++++++-------
 crawler/crawler/settings.py           | 16 ++++++---
 crawler/crawler/spiders/cpd_spider.py | 16 +++++----
 deploy/docker-compose-db.yml          |  1 +
 deploy/init/cpd.sql                   | 26 +++++++++-----
 6 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/crawler/crawler/items.py b/crawler/crawler/items.py
index 32ff55f..a8f7779 100644
--- a/crawler/crawler/items.py
+++ b/crawler/crawler/items.py
@@ -87,7 +87,7 @@ def get_category(url):
 
 
 class CpdItem(Item):
-    id = Field(
+    request_id = Field(
         output_processor=TakeFirst(),
     )
     url = Field(
@@ -120,3 +120,6 @@ class CpdItem(Item):
     page = Field(
         output_processor=TakeFirst(),
     )
+    total_page = Field(
+        output_processor=TakeFirst(),
+    )
diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py
index b4852e0..9d2c5de 100644
--- a/crawler/crawler/pipelines.py
+++ b/crawler/crawler/pipelines.py
@@ -5,7 +5,6 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 import pymysql
-from scrapy.exceptions import DropItem
 from scrapy.utils.project import get_project_settings
 
 settings = get_project_settings()
@@ -23,24 +22,48 @@ def __init__(self):
         mysql_user = settings.get('MYSQL_USER', 'root')
         mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler')
         mysql_port = settings.get('MYSQL_PORT', 3306)
-        database = 'news'
-        self.db = pymysql.connect(host, mysql_user, mysql_pwd, database, mysql_port)
+        self.db = pymysql.connect(host=host, user=mysql_user, password=mysql_pwd, port=mysql_port)
         self.cursor = self.db.cursor()
 
     def process_item(self, item, spider):
-        sql = """INSERT INTO
-                        `cpd_news`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`)
-                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
-                        """
+        sql_news = """INSERT INTO
+        news.cpd_news (news_id, title, category, source, date, page_total)
+        VALUES (%s,%s,%s,%s,%s,%s)
+        """
+        sql_content = """INSERT INTO
+        news.cpd_news_content (news_id, request_id, url, content, page)
+        VALUES (%s,%s,%s,%s,%s)
+        """
+        request_id = item.get('request_id', '')
+        url = item.get('url', '')
+        title = item.get('title', '')
+        content = item.get('content', '')
+        category = item.get('category', '')
+        source = item.get('source', '')
+        date = item.get('date', '')
+        news_id = item.get('news_id', '')
+        page = item.get('page', 0)
+        total_page = item.get('total_page', 0)
+
         try:
-            self.cursor.execute(sql, (
-                item.get('id', ''), item.get('url', ''), item.get('title', ''), item.get('content', ''),
-                item.get('category', ''),
-                item.get('source', ''), item.get('date', ''), item.get('news_id', ''), item.get('page', 0)))
-            self.db.commit()
+            self.cursor.execute(sql_content, (news_id, request_id, url, content, page))
         except Exception as e:
-            spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}')
             self.db.rollback()
+            if e.args[0] == 1452:
+                try:
+                    self.cursor.execute(sql_news, (news_id, title, category, source, date, total_page))
+                except Exception as e:
+                    self.db.rollback()
+                    spider.logger.error(e)
+                self.db.commit()
+                try:
+                    self.cursor.execute(sql_content, (news_id, request_id, url, content, page))
+                except Exception as e:
+                    self.db.rollback()
+                    spider.logger.error(e)
+                self.db.commit()
+            else:
+                spider.logger.error(f'error committing to db: {e.args[1]}; url: {item.get("url", "")}')
         return
 
     def close_spider(self, spider):
diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py
index 2dffa8b..9064c90 100644
--- a/crawler/crawler/settings.py
+++ b/crawler/crawler/settings.py
@@ -8,6 +8,7 @@
 #     https://docs.scrapy.org/en/latest/topics/settings.html
 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+import os
 
 BOT_NAME = 'crawler'
 
@@ -105,10 +106,17 @@
 RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405]
 
 # 数据库设置
-MYSQL_HOST = '39.99.157.187'
-MYSQL_USER = 'root'
-MYSQL_PASSWORD = 'news_crawler'
-MYSQL_PORT = 3306
+dev = os.getenv('ScrapyDev', False)
+if not dev:
+    MYSQL_HOST = '39.99.157.187'
+    MYSQL_USER = 'root'
+    MYSQL_PASSWORD = 'news_crawler'
+    MYSQL_PORT = 3306
+else:
+    MYSQL_HOST = 'localhost'
+    MYSQL_USER = 'root'
+    MYSQL_PASSWORD = 'news_crawler'
+    MYSQL_PORT = 3306
 
 USER_AGENT_LIST = [
     'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
diff --git a/crawler/crawler/spiders/cpd_spider.py b/crawler/crawler/spiders/cpd_spider.py
index 5f17ee5..729c951 100644
--- a/crawler/crawler/spiders/cpd_spider.py
+++ b/crawler/crawler/spiders/cpd_spider.py
@@ -35,8 +35,8 @@ def __init__(self, *args, **kwargs):
     }
 
     database_name = 'news'
-    table_name = 'cpd_news'
-    filter_name = 'id'
+    table_name = 'cpd_news_content'
+    filter_name = 'request_id'
 
     allowed_domains = ["cpd.com.cn"]
 
@@ -121,11 +121,13 @@ def parse_news(self, response):
         if title is not None:
             next_page_html = response.xpath('//*[@id="autopage"]//script').get()
 
-            page = response.meta.get('page', 1)
+            current_page = response.meta.get('page', 1)
+            total_page = 1
 
             next_page1 = self.p_news1.findall(next_page_html)  # [(总页数 0,当前页数 0)] 从 0 计数
             next_page2 = self.p_news2.findall(next_page_html)  # [(总页数 1,当前页数 1)] 从 1 计数
             if len(next_page1) == 1 and next_page1[0][0] != '0' and next_page1[0][1] == '0':
+                total_page = int(next_page1[0][0])
                 url_arr = self.p_path1.findall(url)
                 if len(url_arr) == 1:
                     for page in range(1, int(next_page1[0][0])):
@@ -134,9 +136,10 @@ def parse_news(self, response):
                 else:
                     self.logger.error(f'未知格式的 NEWS URL: {url}')
             elif len(next_page2) == 1 and next_page2[0][0] != '1' and next_page2[0][1] == '1':
+                total_page = int(next_page2[0][0])
                 url_arr = self.p_path2.findall(url)
                 if len(url_arr) == 1:
-                    for page in range(2, int(next_page2[0][0] + 1)):
+                    for page in range(2, int(next_page2[0][0]) + 1):
                         yield scrapy.Request(url=f'{url_arr[0]}content_{page}.html', callback=self.parse_news,
                                              meta={'page': page})
                 else:
@@ -145,7 +148,7 @@ def parse_news(self, response):
             fp = request_fingerprint(response.request)
 
             cpd_item = ItemLoader(item=CpdItem(), response=response)
-            cpd_item.add_value('id', fp)
+            cpd_item.add_value('request_id', fp)
             cpd_item.add_value('url', url)
             cpd_item.add_xpath('title', '//*[@id="newslist"]/h1/gettitle/text()')
             cpd_item.add_xpath('content', '//*[@id="fz_test"]/div[1]/table')
@@ -153,7 +156,8 @@ def parse_news(self, response):
             cpd_item.add_xpath('source', '//*[@id="source_report"]/text()')
             cpd_item.add_xpath('date', '//*[@id="pub_time_report"]/text()')
             cpd_item.add_value('news_id', url)
-            cpd_item.add_value('page', page)
+            cpd_item.add_value('page', current_page)
+            cpd_item.add_value('total_page', total_page)
             yield cpd_item.load_item()
 
         links = self.link.extract_links(response)
diff --git a/deploy/docker-compose-db.yml b/deploy/docker-compose-db.yml
index ad8a1ff..fec180c 100644
--- a/deploy/docker-compose-db.yml
+++ b/deploy/docker-compose-db.yml
@@ -9,6 +9,7 @@ services:
     restart: always
     environment:
       MYSQL_ROOT_PASSWORD: news_crawler
+      TZ: "Asia/Shanghai"
     ports:
       - '3306:3306'
     volumes:
diff --git a/deploy/init/cpd.sql b/deploy/init/cpd.sql
index dbc0b36..5a443ee 100644
--- a/deploy/init/cpd.sql
+++ b/deploy/init/cpd.sql
@@ -4,18 +4,28 @@ use news;
 
 create table if not exists cpd_news
 (
-	id varchar(40) not null,
-	url varchar(255) not null,
+	news_id varchar(50) not null primary key,
 	title varchar(255) not null,
-	content text not null,
 	category varchar(5) not null,
 	source varchar(50) not null,
 	date varchar(30) not null,
-	news_id varchar(50) not null,
-	page int not null,
+	page_total int not null,
+	entry_time datetime not null default CURRENT_TIMESTAMP  comment '入库时间',
 	constraint data_id_uindex
-		unique (id)
+        unique (news_id)
 );
 
-alter table data
-	add primary key (id);
+create table if not exists cpd_news_content
+(
+	news_id varchar(40) not null primary key,
+	request_id varchar(40) not null,
+	url varchar(255) not null,
+	content text not null,
+	page int not null,
+	constraint data_id_uindex
+        unique (request_id),
+	FOREIGN KEY fk_news(news_id)
+    REFERENCES cpd_news(news_id)
+    ON UPDATE CASCADE
+    ON DELETE RESTRICT
+);

From 454f63623c329241b1128893e718c81b22f11292 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=97=AD?= <zhangxu3486432@gmail.com>
Date: Fri, 21 Feb 2020 22:43:36 +0800
Subject: [PATCH 2/3] Remove the depth limit, add comments to database columns, fix a pipeline bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
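Notes: the pipeline fix pairs every commit with the execute it belongs to, so
a failed execute is only rolled back and is never followed by a stray commit.
The resulting pattern, as a minimal sketch (exec_commit is a hypothetical
name, not something the code defines):

    def exec_commit(db, cursor, sql, args, logger):
        # Commit only if the execute succeeded; otherwise roll back.
        try:
            cursor.execute(sql, args)
            db.commit()
            return True
        except Exception as e:
            db.rollback()
            logger.error(e)
            return False

Previously the inner commits ran after the except blocks, so an insert that
failed and was rolled back was still followed by a commit on the same
connection.
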
 crawler/crawler/pipelines.py |  5 +++--
 crawler/crawler/settings.py  |  2 --
 deploy/init/cpd.sql          | 24 +++++++++++++-----------
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py
index 9d2c5de..7c079c4 100644
--- a/crawler/crawler/pipelines.py
+++ b/crawler/crawler/pipelines.py
@@ -47,21 +47,22 @@ def process_item(self, item, spider):
 
         try:
             self.cursor.execute(sql_content, (news_id, request_id, url, content, page))
+            self.db.commit()
         except Exception as e:
             self.db.rollback()
             if e.args[0] == 1452:
                 try:
                     self.cursor.execute(sql_news, (news_id, title, category, source, date, total_page))
+                    self.db.commit()
                 except Exception as e:
                     self.db.rollback()
                     spider.logger.error(e)
-                self.db.commit()
                 try:
                     self.cursor.execute(sql_content, (news_id, request_id, url, content, page))
+                    self.db.commit()
                 except Exception as e:
                     self.db.rollback()
                     spider.logger.error(e)
-                self.db.commit()
             else:
                 spider.logger.error(f'error committing to db: {e.args[1]}; url: {item.get("url", "")}')
         return
diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py
index 9064c90..ebdfb3a 100644
--- a/crawler/crawler/settings.py
+++ b/crawler/crawler/settings.py
@@ -96,8 +96,6 @@
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
-DEPTH_LIMIT = 10
-
 LOG_LEVEL = 'DEBUG'
 LOG_STDOUT = True
 
diff --git a/deploy/init/cpd.sql b/deploy/init/cpd.sql
index 5a443ee..ce78795 100644
--- a/deploy/init/cpd.sql
+++ b/deploy/init/cpd.sql
@@ -4,12 +4,13 @@ use news;
 
 create table if not exists cpd_news
 (
-	news_id varchar(50) not null primary key,
-	title varchar(255) not null,
-	category varchar(5) not null,
-	source varchar(50) not null,
-	date varchar(30) not null,
-	page_total int not null,
+	news_id varchar(40) not null primary key comment '新闻 id',
+	title varchar(255) not null comment '新闻标题',
+	category varchar(10) not null comment '新闻分类',
+	source varchar(50) not null comment '新闻来源',
+	date varchar(30) not null comment '新闻日期',
+	page_total int not null comment '新闻总页数',
+	duplication varchar(40) not null default '',
 	entry_time datetime not null default CURRENT_TIMESTAMP  comment '入库时间',
 	constraint data_id_uindex
         unique (news_id)
@@ -17,11 +18,12 @@ create table if not exists cpd_news
 
 create table if not exists cpd_news_content
 (
-	news_id varchar(40) not null primary key,
-	request_id varchar(40) not null,
-	url varchar(255) not null,
-	content text not null,
-	page int not null,
+	news_id varchar(40) not null comment '新闻 id',
+	request_id varchar(40) not null primary key comment '请求 id',
+	url varchar(255) not null comment '新闻链接',
+	content mediumtext not null comment '新闻内容',
+	page int not null comment '当前页数',
+	entry_time datetime not null default CURRENT_TIMESTAMP  comment '入库时间',
 	constraint data_id_uindex
         unique (request_id),
 	FOREIGN KEY fk_news(news_id)

From a7100ca03203e8dca50cb840cb22677324868c54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=97=AD?= <zhangxu3486432@gmail.com>
Date: Fri, 21 Feb 2020 23:22:50 +0800
Subject: [PATCH 3/3] Add exception handling, disable redirects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
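Notes: SaveHttpErrorMiddleware runs at priority 49, just before the built-in
HttpErrorMiddleware (50), so it sees non-2xx responses before they are
filtered out. Every status outside 2xx other than 404 is appended to
error/<spider>/error.tsv as "<url>\t<status>"; on the next run the file is
renamed to retry.tsv and start_requests re-queues those URLs ahead of the
normal start URLs. A rough sketch of that handoff, with hypothetical rotate
and requeue helpers (the middleware and the spider inline this logic):

    import os

    def rotate(path='error/cpd'):
        # Last run's failures become this run's retry list.
        error_file = os.path.join(path, 'error.tsv')
        retry_file = os.path.join(path, 'retry.tsv')
        if os.path.exists(error_file):
            os.rename(error_file, retry_file)
        return retry_file

    def requeue(retry_file):
        # Each line is "<url>\t<status>"; only the URL is re-crawled.
        try:
            with open(retry_file) as f:
                for line in f:
                    yield line.rstrip('\n').split('\t')[0]
        except IOError:
            return  # nothing recorded by the previous run

With REDIRECT_ENABLED set to False, 3xx responses are no longer followed;
they reach the middleware like any other error status and are recorded for
the next run as well.
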
 crawler/crawler/__init__.py           |  3 +++
 crawler/crawler/middlewares.py        | 28 +++++++++++++++++++++++++++
 crawler/crawler/settings.py           |  3 +++
 crawler/crawler/spiders/__init__.py   |  3 +++
 crawler/crawler/spiders/cpd_spider.py | 12 ++++++++++++
 5 files changed, 49 insertions(+)

diff --git a/crawler/crawler/__init__.py b/crawler/crawler/__init__.py
index 05a8854..2763120 100644
--- a/crawler/crawler/__init__.py
+++ b/crawler/crawler/__init__.py
@@ -1,4 +1,7 @@
 import os
 
+if not os.path.exists('error'):
+    os.makedirs('error')
+
 if not os.path.exists('log'):
     os.makedirs('log')
diff --git a/crawler/crawler/middlewares.py b/crawler/crawler/middlewares.py
index 30cff0a..03c3f4e 100644
--- a/crawler/crawler/middlewares.py
+++ b/crawler/crawler/middlewares.py
@@ -7,6 +7,7 @@
 
 import base64
 import logging
+import os
 import random
 
 from scrapy import signals
@@ -122,6 +123,33 @@ def process_request(self, request, spider):
         request.headers['User-Agent'] = random.choice(self.user_agent_list)
 
 
+class SaveHttpErrorMiddleware(object):
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Scrapy uses this method to create the middleware.
+        return cls(crawler)
+
+    def __init__(self, crawler):
+        spider_name = crawler.spider.name
+
+        path = f'error/{spider_name}'
+
+        self.error_file = os.path.join(path, 'error.tsv')
+        self.retry_file = os.path.join(path, 'retry.tsv')
+
+        if os.path.exists(self.error_file):
+            os.rename(self.error_file, self.retry_file)
+
+    def process_spider_input(self, response, spider):
+        if 200 <= response.status < 300:  # common case
+            return
+        if response.status != 404:
+            with open(self.error_file, 'a+') as f:
+                line = f'{response.url}\t{response.status}\n'
+                f.write(line)
+        return
+
+
 # 代理服务器
 proxyServer = "http://http-dyn.abuyun.com:9020"
 
diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py
index ebdfb3a..eb2c480 100644
--- a/crawler/crawler/settings.py
+++ b/crawler/crawler/settings.py
@@ -54,6 +54,7 @@
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 SPIDER_MIDDLEWARES = {
+    'crawler.middlewares.SaveHttpErrorMiddleware': 49,
     'crawler.middlewares.CrawlerSpiderMiddleware': 543,
 }
 
@@ -103,6 +104,8 @@
 
 RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405]
 
+REDIRECT_ENABLED = False
+
 # 数据库设置
 dev = os.getenv('ScrapyDev', False)
 if not dev:
diff --git a/crawler/crawler/spiders/__init__.py b/crawler/crawler/spiders/__init__.py
index 1ee9c1a..e31986d 100644
--- a/crawler/crawler/spiders/__init__.py
+++ b/crawler/crawler/spiders/__init__.py
@@ -5,5 +5,8 @@
 
 import os
 
+if not os.path.exists('error/cpd'):
+    os.makedirs('error/cpd')
+
 if not os.path.exists('log/cpd'):
     os.makedirs('log/cpd')
diff --git a/crawler/crawler/spiders/cpd_spider.py b/crawler/crawler/spiders/cpd_spider.py
index 729c951..afdf459 100644
--- a/crawler/crawler/spiders/cpd_spider.py
+++ b/crawler/crawler/spiders/cpd_spider.py
@@ -8,6 +8,7 @@
 
 
 import logging
+import os
 import re
 from datetime import datetime
 
@@ -101,6 +102,17 @@ def __init__(self, *args, **kwargs):
     p_path2 = re.compile('(.*?)content.html')
 
     def start_requests(self):
+        try:
+            path = f'error/{self.name}'
+            retry_file = os.path.join(path, 'retry.tsv')
+            with open(retry_file, 'r') as f:
+                lines = f.readlines()
+                for line in lines:
+                    news_url = line.split('\t')[0]
+                    yield scrapy.Request(url=news_url, callback=self.parse_news)
+        except IOError:
+            self.logger.info('retry.tsv not accessible')
+
         for url in self.start_urls:
             yield scrapy.Request(url=url, callback=self.parse_index, dont_filter=True)