diff --git a/.gitignore b/.gitignore index aa44ee2..e1c19e4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Other +nohup.out +data/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/Pipfile b/Pipfile index 101f62c..dcb1207 100644 --- a/Pipfile +++ b/Pipfile @@ -9,6 +9,8 @@ verify_ssl = true scrapy = "*" ipython = "*" w3lib = "*" +requests = "*" +pymysql = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 8664d55..4cf8a87 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d2eccc5f19c1401e4020195bca631f12ff1a38ed2115b5326dbdcfe9bdf23926" + "sha256": "bf76d091e87932c328db83d455b06c6beefc6291b3f624079218890ae6aaf6fe" }, "pipfile-spec": 6, "requires": { @@ -45,6 +45,13 @@ ], "version": "==0.1.0" }, + "certifi": { + "hashes": [ + "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", + "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" + ], + "version": "==2019.11.28" + }, "cffi": { "hashes": [ "sha256:0b49274afc941c626b605fb59b59c3485c17dc776dc3cc7cc14aca74cc19cc42", @@ -83,6 +90,13 @@ ], "version": "==1.13.2" }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, "constantly": { "hashes": [ "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35", @@ -153,11 +167,11 @@ }, "ipython": { "hashes": [ - "sha256:0f4bcf18293fb666df8511feec0403bdb7e061a5842ea6e88a3177b0ceb34ead", - "sha256:387686dd7fc9caf29d2fddcf3116c4b07a11d9025701d220c589a430b0171d8a" + "sha256:d9459e7237e2e5858738ff9c3e26504b79899b58a6d49e574d352493d80684c6", + "sha256:f6689108b1734501d3b59c84427259fd5ac5141afe2e846cfa8598eb811886c9" ], "index": "pypi", - "version": "==7.11.1" + "version": "==7.12.0" }, "ipython-genutils": { "hashes": [ @@ -168,42 +182,43 @@ }, "jedi": { "hashes": [ - "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064", - "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807" + "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2", + "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5" ], - "version": "==0.15.2" + "version": "==0.16.0" }, "lxml": { "hashes": [ - "sha256:00ac0d64949fef6b3693813fe636a2d56d97a5a49b5bbb86e4cc4cc50ebc9ea2", - "sha256:0571e607558665ed42e450d7bf0e2941d542c18e117b1ebbf0ba72f287ad841c", - "sha256:0e3f04a7615fdac0be5e18b2406529521d6dbdb0167d2a690ee328bef7807487", - "sha256:13cf89be53348d1c17b453867da68704802966c433b2bb4fa1f970daadd2ef70", - "sha256:217262fcf6a4c2e1c7cb1efa08bd9ebc432502abc6c255c4abab611e8be0d14d", - "sha256:223e544828f1955daaf4cefbb4853bc416b2ec3fd56d4f4204a8b17007c21250", - "sha256:277cb61fede2f95b9c61912fefb3d43fbd5f18bf18a14fae4911b67984486f5d", - "sha256:3213f753e8ae86c396e0e066866e64c6b04618e85c723b32ecb0909885211f74", - "sha256:4690984a4dee1033da0af6df0b7a6bde83f74e1c0c870623797cec77964de34d", - "sha256:4fcc472ef87f45c429d3b923b925704aa581f875d65bac80f8ab0c3296a63f78", - "sha256:61409bd745a265a742f2693e4600e4dbd45cc1daebe1d5fad6fcb22912d44145", - "sha256:678f1963f755c5d9f5f6968dded7b245dd1ece8cf53c1aa9d80e6734a8c7f41d", - "sha256:6c6d03549d4e2734133badb9ab1c05d9f0ef4bcd31d83e5d2b4747c85cfa21da", - "sha256:6e74d5f4d6ecd6942375c52ffcd35f4318a61a02328f6f1bd79fcb4ffedf969e", - "sha256:7b4fc7b1ecc987ca7aaf3f4f0e71bbfbd81aaabf87002558f5bc95da3a865bcd", - 
"sha256:7ed386a40e172ddf44c061ad74881d8622f791d9af0b6f5be20023029129bc85", - "sha256:8f54f0924d12c47a382c600c880770b5ebfc96c9fd94cf6f6bdc21caf6163ea7", - "sha256:ad9b81351fdc236bda538efa6879315448411a81186c836d4b80d6ca8217cdb9", - "sha256:bbd00e21ea17f7bcc58dccd13869d68441b32899e89cf6cfa90d624a9198ce85", - "sha256:c3c289762cc09735e2a8f8a49571d0e8b4f57ea831ea11558247b5bdea0ac4db", - "sha256:cf4650942de5e5685ad308e22bcafbccfe37c54aa7c0e30cd620c2ee5c93d336", - "sha256:cfcbc33c9c59c93776aa41ab02e55c288a042211708b72fdb518221cc803abc8", - "sha256:e301055deadfedbd80cf94f2f65ff23126b232b0d1fea28f332ce58137bcdb18", - "sha256:ebbfe24df7f7b5c6c7620702496b6419f6a9aa2fd7f005eb731cc80d7b4692b9", - "sha256:eff69ddbf3ad86375c344339371168640951c302450c5d3e9936e98d6459db06", - "sha256:f6ed60a62c5f1c44e789d2cf14009423cb1646b44a43e40a9cf6a21f077678a1" + "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd", + "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c", + "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081", + "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f", + "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261", + "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a", + "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9", + "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a", + "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb", + "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60", + "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128", + "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a", + "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717", + "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89", + "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72", + "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8", + "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3", + "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7", + "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8", + "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77", + "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1", + "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15", + "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679", + "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012", + "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6", + "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc", + "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca" ], "markers": "python_version != '3.4'", - "version": "==4.4.2" + "version": "==4.5.0" }, "parsel": { "hashes": [ @@ -214,10 +229,10 @@ }, "parso": { "hashes": [ - "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1", - "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3" + "sha256:1376bdc8cb81377ca481976933773295218a2df47d3e1182ba76d372b1acb128", + "sha256:597f36de5102a8db05ffdf7ecdc761838b86565a4a111604c6e78beaedf1b045" ], - "version": "==0.5.2" + "version": "==0.6.0" }, "pexpect": { "hashes": [ @@ -236,10 +251,10 @@ }, "prompt-toolkit": { "hashes": [ - 
"sha256:0278d2f51b5ceba6ea8da39f76d15684e84c996b325475f6e5720edc584326a7", - "sha256:63daee79aa8366c8f1c637f1a4876b890da5fc92a19ebd2f7080ebacb901e990" + "sha256:a402e9bf468b63314e37460b68ba68243d55b2f8c4d0192f85a019af3945050e", + "sha256:c93e53af97f630f12f5f62a3274e79527936ed466f038953dfa379d4941f651a" ], - "version": "==3.0.2" + "version": "==3.0.3" }, "protego": { "hashes": [ @@ -295,6 +310,14 @@ ], "version": "==2.0.0" }, + "pymysql": { + "hashes": [ + "sha256:3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a", + "sha256:d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7" + ], + "index": "pypi", + "version": "==0.9.3" + }, "pyopenssl": { "hashes": [ "sha256:621880965a720b8ece2f1b2f54ea2071966ab00e2970ad2ce11d596102063504", @@ -309,6 +332,14 @@ ], "version": "==1.5.0" }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "index": "pypi", + "version": "==2.22.0" + }, "scrapy": { "hashes": [ "sha256:4352c64c7ffc70148a7988db837bb25bccafb3350ab9c978c1f9a8930521959b", @@ -363,6 +394,13 @@ "markers": "python_version >= '3.5'", "version": "==19.10.0" }, + "urllib3": { + "hashes": [ + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" + ], + "version": "==1.25.8" + }, "w3lib": { "hashes": [ "sha256:847704b837b2b973cddef6938325d466628e6078266bc2e1f7ac49ba85c34823", diff --git a/README.md b/README.md new file mode 100644 index 0000000..e124ef1 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# Traffic_news + +## 反爬 + +经过测试,大部分站点的反爬措施较弱。 + +**站点 `zhian.cpd.com.cn` 的反爬措施较强。** + +### 措施 + +* 降低访问频率 +* 伪造 `User-Agent` + +以上方法均无法有效突破反爬的封锁。 diff --git a/crawler/crawler/dupefilters.py b/crawler/crawler/dupefilters.py new file mode 100644 index 0000000..dedcb1f --- /dev/null +++ b/crawler/crawler/dupefilters.py @@ -0,0 +1,73 @@ +#! 
/usr/bin/env python3 +# -*- coding: utf-8 -*- +# @Author : 张旭 +# @Email : zhangxu3486432@gmail.com +# @Blog : https://zhangxu3486432.github.io +# @FileName: dupefilters.py +# @Time : 2020/2/3 + +from __future__ import print_function + +import logging + +import pymysql +from scrapy.dupefilters import BaseDupeFilter +from scrapy.utils.project import get_project_settings + +settings = get_project_settings() +from scrapy.utils.request import referer_str, request_fingerprint + + +class RFPDupeFilter(BaseDupeFilter): + """Request Fingerprint duplicates filter""" + + def __init__(self, database_name=None, table_name=None, filter_name=None, debug=False): + self.fingerprints = set() + self.logdupes = True + self.debug = debug + self.logger = logging.getLogger(__name__) + self.fingerprints.update() + if database_name and table_name: + host = settings.get('MYSQL_HOST', 'localhost') + mysql_user = settings.get('MYSQL_USER', 'root') + mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler') + mysql_port = settings.get('MYSQL_PORT', 3306) + self.db = pymysql.connect(host, mysql_user, mysql_pwd, database_name, mysql_port) + self.cursor = self.db.cursor() + sql = "SELECT {0} FROM {1} WHERE 1".format(filter_name, table_name) + self.cursor.execute(sql) + ids = self.cursor.fetchall() + self.fingerprints.update(ids) + + @classmethod + def from_crawler(cls, crawler): + debug = settings.getbool('DUPEFILTER_DEBUG') + return cls(crawler.spider.database_name, crawler.spider.table_name, crawler.spider.filter_name, debug) + + def request_seen(self, request): + fp = self.request_fingerprint(request) + if fp in self.fingerprints: + return True + self.fingerprints.add(fp) + + def request_fingerprint(self, request): + return request_fingerprint(request) + + def close(self, reason): + if self.db and self.cursor: + self.db.close() + self.cursor.close() + + def log(self, request, spider): + if self.debug: + msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)" + args = {'request': request, 'referer': referer_str(request)} + self.logger.debug(msg, args, extra={'spider': spider}) + elif self.logdupes: + msg = ("Filtered duplicate request: %(request)s" + " - no more duplicates will be shown" + " (see DUPEFILTER_DEBUG to show all duplicates)") + self.logger.debug(msg, {'request': request}, extra={'spider': spider}) + self.logdupes = False + + spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) diff --git a/crawler/crawler/items.py b/crawler/crawler/items.py index 6303525..32ff55f 100644 --- a/crawler/crawler/items.py +++ b/crawler/crawler/items.py @@ -5,12 +5,15 @@ # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html +import logging import re from scrapy import Item, Field from scrapy.loader.processors import Join, MapCompose, TakeFirst from w3lib.html import remove_tags, remove_tags_with_content +logger = logging.getLogger(__name__) + class CrawlerItem(Item): pass @@ -56,14 +59,12 @@ def clean(value): def remove_style(value): value = remove_tags_with_content(value, which_ones=('style',)) value = remove_tags(value) - value = value.replace('\r\n', '') - value = value.replace('\n', '') - value = value.replace(' \u3000\u3000', '', 1) value = value.replace('\t', ' ') + value = value.strip() return value -def get_id(url): +def get_news_id(url): res = p1.match(url) if res: res = res.groups() @@ -73,17 +74,22 @@ def get_id(url): res = set(res) res.remove(None) return res.pop() - return None + logger.error('Cannot extract news_id from url.') + return '' def 
get_category(url): path = p2.search(url) if path: return category[path[1]] - return None + logger.error('Unknown category.') + return '其他' class CpdItem(Item): + id = Field( + output_processor=TakeFirst(), + ) url = Field( output_processor=TakeFirst(), ) @@ -108,7 +114,7 @@ class CpdItem(Item): output_processor=TakeFirst(), ) news_id = Field( - input_processor=MapCompose(get_id), + input_processor=MapCompose(get_news_id), output_processor=TakeFirst(), ) page = Field( diff --git a/crawler/crawler/middlewares.py b/crawler/crawler/middlewares.py index 0939a19..a73c788 100644 --- a/crawler/crawler/middlewares.py +++ b/crawler/crawler/middlewares.py @@ -5,11 +5,12 @@ # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html +import base64 import logging +import os import random from scrapy import signals -from scrapy.exceptions import IgnoreRequest from scrapy.utils.project import get_project_settings settings = get_project_settings() @@ -116,25 +117,47 @@ class RandomUserAgentMiddleware(object): def __init__(self): self.user_agent_list = settings.get('USER_AGENT_LIST') + self.count = 0 def process_request(self, request, spider): request.headers['User-Agent'] = random.choice(self.user_agent_list) -class HttpError(IgnoreRequest): - """A non-200 response was filtered""" - - def __init__(self, response, *args, **kwargs): - self.response = response - super(HttpError, self).__init__(*args, **kwargs) +class SaveHttpErrorMiddleware(object): + def __init__(self): + if os.path.exists('../data/error/405.tsv'): + os.rename('../data/error/405.tsv', '../data/error/retry.tsv') -class SaveHttpErrorMiddleware(object): + if os.path.exists('../data/error/error.tsv'): + os.remove('../data/error/error.tsv') def process_spider_input(self, response, spider): if 200 <= response.status < 300: # common case return - with open('../data/error/cpd_error.tsv', 'a+') as f: - line = f'{response.url}\t{response.status}\n' - f.write(line) + if response.status == 405: + with open('../data/error/405.tsv', 'a+') as f: + line = f'{response.url}\t{response.status}\n' + f.write(line) + else: + with open('../data/error/error.tsv', 'a+') as f: + line = f'{response.url}\t{response.status}\n' + f.write(line) return + + +# 代理服务器 +proxyServer = "http://http-dyn.abuyun.com:9020" + +# 代理隧道验证信息 +proxyUser = "HQOD0I34O25GCC2D" +proxyPass = "78380759AA00F51F" + +proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((proxyUser + ":" + proxyPass), "ascii")).decode("utf8") + + +class ProxyMiddleware(object): + + def process_request(self, request, spider): + request.meta["proxy"] = proxyServer + request.headers["Proxy-Authorization"] = proxyAuth diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py index 1fadf7e..7b1e14a 100644 --- a/crawler/crawler/pipelines.py +++ b/crawler/crawler/pipelines.py @@ -4,7 +4,11 @@ # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html +import pymysql from scrapy.exceptions import DropItem +from scrapy.utils.project import get_project_settings + +settings = get_project_settings() class CrawlerPipeline(object): @@ -15,15 +19,30 @@ def process_item(self, item, spider): class SavePipeline(object): def __init__(self): - self.f = open('../data/tsv/cpd.tsv', 'a+') + host = settings.get('MYSQL_HOST', 'localhost') + mysql_user = settings.get('MYSQL_USER', 'root') + mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler') + mysql_port = settings.get('MYSQL_PORT', 3306) + database = 'cpd' + 
self.db = pymysql.connect(host, mysql_user, mysql_pwd, database, mysql_port) + self.cursor = self.db.cursor() def process_item(self, item, spider): - if item.__len__() != 8: + if item.__len__() != 9: raise DropItem(item) - # return - self.f.write( - f"{item['news_id']}\t{item['title']}\t{item['category']}\t{item['source']}\t{item['date']}\t{item['page']}\t{item['url']}\t{item['content']}\n") - return item + sql = """INSERT INTO + `data`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`) + VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s) + """ + try: + self.cursor.execute(sql, (item['id'], item['url'], item['title'], item['content'], item['category'], + item['source'], item['date'], item['news_id'], item['page'])) + self.db.commit() + except Exception as e: + spider.logger.error(f'occur error when db commit date: {e.args[1]}') + self.db.rollback() + return def close_spider(self, spider): - self.f.close() + self.cursor.close() + self.db.close() diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py index 9038cba..5a841b4 100644 --- a/crawler/crawler/settings.py +++ b/crawler/crawler/settings.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from datetime import datetime # Scrapy settings for crawler project # @@ -15,6 +16,7 @@ NEWSPIDER_MODULE = 'crawler.spiders' DOWNLOADER_MIDDLEWARES = { + 'crawler.middlewares.ProxyMiddleware': 90, 'crawler.middlewares.RandomUserAgentMiddleware': 543, } @@ -26,12 +28,13 @@ # Configure maximum concurrent requests performed by Scrapy (default: 16) # CONCURRENT_REQUESTS = 32 -CONCURRENT_REQUESTS = 1 +CONCURRENT_REQUESTS = 5 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -# DOWNLOAD_DELAY = 1 +DOWNLOAD_DELAY = 0.2 +RANDOMIZE_DOWNLOAD_DELAY = False # The download delay setting will honor only one of: # CONCURRENT_REQUESTS_PER_DOMAIN = 16 # CONCURRENT_REQUESTS_PER_IP = 16 @@ -94,13 +97,21 @@ # HTTPCACHE_IGNORE_HTTP_CODES = [] # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' -DEPTH_LIMIT = 5 +DEPTH_LIMIT = 10 -LOG_FILE = '../log/crawler.log' +LOG_FILE = f"../log/crawler_{datetime.now().strftime('%Y.%m.%d_%H:%M:%S')}.log" LOG_LEVEL = 'DEBUG' LOG_STDOUT = True -JOBDIR = 'crawler/job_info/001' +DUPEFILTER_CLASS = 'crawler.dupefilters.RFPDupeFilter' + +RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405] + +# 数据库设置 +MYSQL_HOST = 'localhost' +MYSQL_USER = 'root' +MYSQL_PASSWORD = 'news_crawler' +MYSQL_PORT = 3306 USER_AGENT_LIST = [ 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', diff --git a/crawler/crawler/spiders/cpd_spider.py b/crawler/crawler/spiders/cpd_spider.py index 7287290..70930be 100644 --- a/crawler/crawler/spiders/cpd_spider.py +++ b/crawler/crawler/spiders/cpd_spider.py @@ -13,24 +13,38 @@ import scrapy from scrapy.linkextractors import LinkExtractor from scrapy.loader import ItemLoader +from scrapy.utils.project import get_project_settings +from scrapy.utils.request import request_fingerprint from crawler.items import CpdItem +settings = get_project_settings() + logger = logging.getLogger(__name__) class CpdSpider(scrapy.Spider): + def __init__(self, *args, **kwargs): + super(CpdSpider, self).__init__(*args, **kwargs) + name = "cpd" + + database_name = 'news' + table_name = 'cpd_news' + filter_name = 'id' + allowed_domains = ["cpd.com.cn"] start_urls = [ # 
'http://www.cpd.com.cn/', 'http://www.cpd.com.cn/n10216060/n10216158/', + # 'http://news.cpd.com.cn/', 'http://news.cpd.com.cn/n18151/', 'http://news.cpd.com.cn/n3559/', 'http://news.cpd.com.cn/n3569/', 'http://news.cpd.com.cn/n3573/', + # 'http://jt.cpd.com.cn/', 'http://jt.cpd.com.cn/n462015/', 'http://jt.cpd.com.cn/n462009/', @@ -44,10 +58,12 @@ class CpdSpider(scrapy.Spider): 'http://jt.cpd.com.cn/n462061/', 'http://jt.cpd.com.cn/n462053/', 'http://jt.cpd.com.cn/n462027/', + # 'http://zhian.cpd.com.cn/', 'http://zhian.cpd.com.cn/n26237006/', 'http://zhian.cpd.com.cn/n26237008/', 'http://zhian.cpd.com.cn/n26237014/', + # 'http://minsheng.cpd.com.cn/', 'http://minsheng.cpd.com.cn/n1448484/', 'http://minsheng.cpd.com.cn/n1448482/', @@ -56,7 +72,7 @@ class CpdSpider(scrapy.Spider): 'http://minsheng.cpd.com.cn/n1448492/' ] - link = LinkExtractor(allow=start_urls) + link = LinkExtractor(allow=start_urls, deny='%3Cscript%3Edocument.write(location.href);%3C/script%3E') # # http://zhian.cpd.com.cn/n26237006/ # p_index = re.compile('createPageHTML\((\d+), (\d+),') @@ -80,8 +96,18 @@ class CpdSpider(scrapy.Spider): p_path2 = re.compile('(.*?)content.html') def start_requests(self): + try: + with open('../data/error/retry.tsv') as f: + lines = f.readlines() + for line in lines: + news_url = line.split('\t')[0] + yield scrapy.Request(url=news_url, callback=self.parse_news, dont_filter=True) + + except IOError: + logger.info('retry.tsv not accessible') + for url in self.start_urls: - yield scrapy.Request(url=url, callback=self.parse_index) + yield scrapy.Request(url=url, callback=self.parse_index, dont_filter=True) def parse_index(self, response): url = response.url @@ -119,7 +145,10 @@ def parse_news(self, response): else: self.logger.error(f'未知格式的 NEWS URL: {url}') + fp = request_fingerprint(response.request) + cpd_item = ItemLoader(item=CpdItem(), response=response) + cpd_item.add_value('id', fp) cpd_item.add_value('url', url) cpd_item.add_xpath('title', '//*[@id="newslist"]/h1/gettitle/text()') cpd_item.add_xpath('content', '//*[@id="fz_test"]/div[1]/table') @@ -130,6 +159,6 @@ def parse_news(self, response): cpd_item.add_value('page', page) yield cpd_item.load_item() - links = self.link.extract_links(response) + links = self.link.extract_links(response,) for link in links: yield scrapy.Request(url=link.url, callback=self.parse_news) diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml new file mode 100644 index 0000000..1d2b5f9 --- /dev/null +++ b/deploy/docker-compose.yml @@ -0,0 +1,34 @@ +version: '3' + +services: + + db: + image: mysql + container_name: mysql-crawler + command: mysqld --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci #设置utf8字符集 + # command: --default-authentication-plugin=mysql_native_password + restart: always + environment: + MYSQL_ROOT_PASSWORD: news_crawler +# MYSQL_DATABASE: cpd +# MYSQL_USER: user1 +# MYSQL_PASSWORD: password1 + ports: + - '3306:3306' + volumes: + - ../data/mysql/db:/var/lib/mysql + - ../data/mysql/conf:/etc/mysql/conf.d + - ./init/cpd.sql:/docker-entrypoint-initdb.d/init.sql:ro + + gui: + depends_on: + - db + image: phpmyadmin/phpmyadmin + container_name: phpmyadmin-crawler + restart: always + environment: + MYSQL_ROOT_PASSWORD: news_crawler + PMA_HOST: db + ports: + - '8080:80' + diff --git a/deploy/init/cpd.sql b/deploy/init/cpd.sql new file mode 100644 index 0000000..22ace1f --- /dev/null +++ b/deploy/init/cpd.sql @@ -0,0 +1,21 @@ +create schema if not exists news collate utf8mb4_unicode_ci; + +use news; + +create 
table if not exists cpd_news
+(
+    id       varchar(40)  not null,
+    url      varchar(255) not null,
+    title    varchar(50)  not null,
+    content  text         not null,
+    category varchar(5)   not null,
+    source   varchar(50)  not null,
+    date     varchar(30)  not null,
+    news_id  varchar(50)  not null,
+    page     int          not null,
+    constraint cpd_news_id_uindex
+        unique (id)
+);
+
+alter table cpd_news
+    add primary key (id);
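
Note that `SavePipeline` in `pipelines.py` connects to a `cpd` database and inserts into a `data` table, while this init script only provisions `news.cpd_news` (used by the MySQL-backed dupefilter). A minimal sketch of the table the pipeline appears to expect is shown below; the schema name, table name, and column types are assumptions (columns are taken from the pipeline's INSERT statement, types are copied from `cpd_news`), and the sketch is unnecessary if the pipeline is instead pointed at `news.cpd_news`.

```sql
-- Hypothetical companion to deploy/init/cpd.sql: provisions the `cpd`.`data` table
-- that SavePipeline's INSERT statement targets. Column names come from pipelines.py;
-- the types are assumed to mirror `news`.`cpd_news`.
create schema if not exists cpd collate utf8mb4_unicode_ci;

use cpd;

create table if not exists data
(
    id       varchar(40)  not null,
    url      varchar(255) not null,
    title    varchar(50)  not null,
    content  text         not null,
    category varchar(5)   not null,
    source   varchar(50)  not null,
    date     varchar(30)  not null,
    news_id  varchar(50)  not null,
    page     int          not null,
    primary key (id)
);
```

If used, this file could be mounted as a second entry under `/docker-entrypoint-initdb.d/` in the `db` service's volumes, alongside `./init/cpd.sql`, so the MySQL image runs both scripts on first start.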