Commit 2b81d2e (1 parent: 5640f7c)
12 changed files with 347 additions and 73 deletions.
.gitignore
@@ -1,3 +1,7 @@
+# Other
+nohup.out
+data/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
Some generated files are not rendered by default.
README.md
@@ -0,0 +1,14 @@
+# Traffic_news
+
+## Anti-crawling
+
+Testing shows that most sites have weak anti-crawling measures.
+
+**The site `zhian.cpd.com.cn` has strong anti-crawling measures.**
+
+### Countermeasures tried
+
+* Lowering the request rate
+* Faking the `User-Agent`
+
+Neither method effectively gets past the anti-crawling block.
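Both countermeasures named in the README map onto ordinary Scrapy settings. A minimal sketch, assuming the project runs on Scrapy (as the dupefilter below does); the delay, concurrency, and User-Agent values are illustrative, not values tuned against `zhian.cpd.com.cn`:

```python
# settings.py (sketch): lower the request rate and fake the User-Agent.
# Per the README, neither measure was enough for zhian.cpd.com.cn.

# Lower the request rate.
DOWNLOAD_DELAY = 5                  # seconds between requests to a site
RANDOMIZE_DOWNLOAD_DELAY = True     # jitter each delay by 0.5x-1.5x
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # one in-flight request per domain

# Fake the User-Agent with a common desktop-browser string.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/79.0 Safari/537.36'
)
```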
dupefilters.py
@@ -0,0 +1,73 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 张旭
# @Email : [email protected]
# @Blog : https://zhangxu3486432.github.io
# @FileName: dupefilters.py
# @Time : 2020/2/3

from __future__ import print_function

import logging

import pymysql
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.project import get_project_settings
from scrapy.utils.request import referer_str, request_fingerprint

settings = get_project_settings()


class RFPDupeFilter(BaseDupeFilter):
    """Request fingerprint duplicates filter backed by MySQL.

    Fingerprints of previously crawled requests are loaded from a MySQL
    table at start-up, so a resumed crawl skips pages it already has.
    """

    def __init__(self, database_name=None, table_name=None, filter_name=None, debug=False):
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        # Keep connection handles defined so close() works even when no
        # database was configured.
        self.db = None
        self.cursor = None
        if database_name and table_name:
            host = settings.get('MYSQL_HOST', 'localhost')
            mysql_user = settings.get('MYSQL_USER', 'root')
            mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler')
            mysql_port = settings.get('MYSQL_PORT', 3306)
            self.db = pymysql.connect(host=host, user=mysql_user, password=mysql_pwd,
                                      database=database_name, port=mysql_port)
            self.cursor = self.db.cursor()
            # Table and column names cannot be bound as SQL parameters; they
            # come from trusted spider attributes, not user input.
            sql = "SELECT {0} FROM {1} WHERE 1".format(filter_name, table_name)
            self.cursor.execute(sql)
            # fetchall() yields 1-tuples; unpack them so membership tests
            # against plain fingerprint strings succeed.
            self.fingerprints.update(row[0] for row in self.cursor.fetchall())

    @classmethod
    def from_crawler(cls, crawler):
        debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
        return cls(crawler.spider.database_name, crawler.spider.table_name,
                   crawler.spider.filter_name, debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        return False

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        # Close the cursor before the connection that owns it; both may be
        # None when no database was configured.
        if self.cursor:
            self.cursor.close()
        if self.db:
            self.db.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
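Wiring this filter into a project takes one setting plus the three spider attributes that `from_crawler` reads. A minimal sketch; the module path `traffic_news.dupefilters` and the database, table, and column names below are assumptions for illustration, not values taken from this commit:

```python
# settings.py (sketch): replace Scrapy's default dupefilter.
# The module path is assumed, not confirmed by the commit.
DUPEFILTER_CLASS = 'traffic_news.dupefilters.RFPDupeFilter'

# A spider supplying the attributes RFPDupeFilter.from_crawler reads;
# the names below are hypothetical examples.
import scrapy

class NewsSpider(scrapy.Spider):
    name = 'news'
    database_name = 'news_crawler'  # MySQL database holding crawled rows
    table_name = 'news'             # table with one row per crawled page
    filter_name = 'fingerprint'     # column storing request fingerprints
```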