Rewrite dedupfilter; store data in MySQL
kosuke-zhang committed Feb 3, 2020
1 parent 5640f7c commit 2b81d2e
Showing 12 changed files with 347 additions and 73 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
# Other
nohup.out
data/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
2 changes: 2 additions & 0 deletions Pipfile
@@ -9,6 +9,8 @@ verify_ssl = true
scrapy = "*"
ipython = "*"
w3lib = "*"
requests = "*"
pymysql = "*"

[requires]
python_version = "3.7"
118 changes: 78 additions & 40 deletions Pipfile.lock

Some generated files are not rendered by default.

14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
# Traffic_news

## Anti-crawling

Testing shows that most sites have only weak anti-crawling measures.

**The site `zhian.cpd.com.cn` has strong anti-crawling measures.**

### Countermeasures

* Lower the request rate
* Spoof the `User-Agent`

Neither approach gets past this site's anti-crawling blocks (a settings sketch for both is shown below).
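
A minimal sketch of what the two countermeasures look like in plain Scrapy `settings.py`; the concrete values are illustrative, not the project's actual configuration:

```python
# settings.py — illustrative values only

# Lower the request rate
DOWNLOAD_DELAY = 2                 # seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True    # jitter the delay so requests look less robotic
CONCURRENT_REQUESTS_PER_DOMAIN = 1
AUTOTHROTTLE_ENABLED = True        # back off automatically when the site slows down

# Spoof the User-Agent
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
)
```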
73 changes: 73 additions & 0 deletions crawler/crawler/dupefilters.py
@@ -0,0 +1,73 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 张旭
# @Email : [email protected]
# @Blog : https://zhangxu3486432.github.io
# @FileName: dupefilters.py
# @Time : 2020/2/3

from __future__ import print_function

import logging

import pymysql
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.project import get_project_settings
from scrapy.utils.request import referer_str, request_fingerprint

settings = get_project_settings()


class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter backed by MySQL"""

    def __init__(self, database_name=None, table_name=None, filter_name=None, debug=False):
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        self.db = None
        self.cursor = None
        if database_name and table_name:
            host = settings.get('MYSQL_HOST', 'localhost')
            mysql_user = settings.get('MYSQL_USER', 'root')
            mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler')
            mysql_port = settings.getint('MYSQL_PORT', 3306)
            self.db = pymysql.connect(host=host, user=mysql_user, password=mysql_pwd,
                                      db=database_name, port=mysql_port)
            self.cursor = self.db.cursor()
            # seed the in-memory fingerprint set with the values already stored in MySQL
            sql = "SELECT {0} FROM {1}".format(filter_name, table_name)
            self.cursor.execute(sql)
            self.fingerprints.update(row[0] for row in self.cursor.fetchall())

    @classmethod
    def from_crawler(cls, crawler):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(crawler.spider.database_name, crawler.spider.table_name,
                   crawler.spider.filter_name, debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        # close the cursor before the connection it belongs to
        if self.cursor:
            self.cursor.close()
        if self.db:
            self.db.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
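
For context, a sketch of how this filter might be wired in, assuming the import path `crawler.dupefilters` for the file above; the spider, database, table, and column names are hypothetical, chosen only to show which attributes `from_crawler` expects on the spider:

```python
# settings.py — enable the custom filter and the MySQL connection (illustrative values)
DUPEFILTER_CLASS = 'crawler.dupefilters.RFPDupeFilter'  # assumed import path
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'news_crawler'
MYSQL_PORT = 3306

# a spider must expose the attributes that RFPDupeFilter.from_crawler reads
import scrapy

class CpdSpider(scrapy.Spider):
    name = 'cpd'                    # hypothetical spider
    database_name = 'traffic_news'  # hypothetical database
    table_name = 'cpd_news'         # hypothetical table
    filter_name = 'news_id'         # column whose stored values seed the fingerprint set
```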
20 changes: 13 additions & 7 deletions crawler/crawler/items.py
@@ -5,12 +5,15 @@
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import logging
import re

from scrapy import Item, Field
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags, remove_tags_with_content

logger = logging.getLogger(__name__)


class CrawlerItem(Item):
    pass
@@ -56,14 +59,12 @@ def clean(value):
def remove_style(value):
    value = remove_tags_with_content(value, which_ones=('style',))
    value = remove_tags(value)
    value = value.replace('\r\n', '')
    value = value.replace('\n', '')
    value = value.replace(' \u3000\u3000', '', 1)
    value = value.replace('\t', ' ')
    value = value.strip()
    return value


def get_id(url):
def get_news_id(url):
    res = p1.match(url)
    if res:
        res = res.groups()
@@ -73,17 +74,22 @@ def get_id(url):
        res = set(res)
        res.remove(None)
        return res.pop()
    return None
    logger.error('Cannot extract news_id from url.')
    return ''


def get_category(url):
    path = p2.search(url)
    if path:
        return category[path[1]]
    return None
    logger.error('Unknown category.')
    return '其他'


class CpdItem(Item):
    id = Field(
        output_processor=TakeFirst(),
    )
    url = Field(
        output_processor=TakeFirst(),
    )
@@ -108,7 +114,7 @@ class CpdItem(Item):
        output_processor=TakeFirst(),
    )
    news_id = Field(
        input_processor=MapCompose(get_id),
        input_processor=MapCompose(get_news_id),
        output_processor=TakeFirst(),
    )
    page = Field(
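
For reference, a minimal sketch of how the field processors above fire when an item is filled through a Scrapy `ItemLoader`; the URL is made up and the import path for `CpdItem` is assumed:

```python
from scrapy.loader import ItemLoader

from crawler.items import CpdItem  # assumed import path

url = 'http://www.cpd.com.cn/n123456/c789/content.html'  # made-up URL

loader = ItemLoader(item=CpdItem())
# input_processor (MapCompose) runs on each value as it is added;
# output_processor (TakeFirst) runs when load_item() builds the final item.
loader.add_value('url', url)
loader.add_value('news_id', url)  # MapCompose(get_news_id) pulls the id out of the URL, if p1 matches
item = loader.load_item()
```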
