remove save-http-error-middleware; add custom_settings for cpd
kosuke-zhang committed Feb 6, 2020
1 parent f874c9f commit 36668bf
Showing 7 changed files with 24 additions and 88 deletions.
2 changes: 0 additions & 2 deletions crawler/crawler/__init__.py
@@ -1,6 +1,4 @@
 import os
 
-if not os.path.exists('error'):
-    os.makedirs('error')
 if not os.path.exists('log'):
     os.makedirs('log')
28 changes: 0 additions & 28 deletions crawler/crawler/middlewares.py
@@ -7,7 +7,6 @@
 
 import base64
 import logging
-import os
 import random
 
 from scrapy import signals
@@ -123,33 +122,6 @@ def process_request(self, request, spider):
         request.headers['User-Agent'] = random.choice(self.user_agent_list)
 
 
-class SaveHttpErrorMiddleware(object):
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        return cls(crawler)
-
-    def __init__(self, crawler):
-        spider_name = crawler.spider.name
-
-        path = f'error/{spider_name}'
-
-        self.error_file = os.path.join(path, 'error.tsv')
-        self.retry_file = os.path.join(path, 'retry.tsv')
-
-        if os.path.exists(self.error_file):
-            os.rename(self.error_file, self.retry_file)
-
-    def process_spider_input(self, response, spider):
-        if 200 <= response.status < 300:  # common case
-            return
-        if response.status != 404:
-            with open(self.error_file, 'a+') as f:
-                line = f'{response.url}\t{response.status}\n'
-                f.write(line)
-        return
-
-
 # Proxy server
 proxyServer = "http://http-dyn.abuyun.com:9020"
 
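For context: the removed SaveHttpErrorMiddleware appended non-2xx, non-404 responses to error/<spider>/error.tsv and renamed that file to retry.tsv on the next start. This commit drops that mechanism entirely; if similar visibility is ever wanted again without a custom spider middleware, a rough sketch (hypothetical spider, assuming Scrapy's stock HttpErrorMiddleware stays enabled so filtered responses reach the errback as HttpError) could look like:

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError


class ErrorLoggingSpider(scrapy.Spider):
    # Hypothetical spider, only to illustrate errback-based error logging.
    name = "error_logging_example"
    start_urls = ["https://example.com/"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        self.logger.info("fetched %s", response.url)

    def on_error(self, failure):
        # HttpError carries the non-2xx response that HttpErrorMiddleware filtered out.
        if failure.check(HttpError):
            response = failure.value.response
            if response.status != 404:
                self.logger.warning("%s returned %s", response.url, response.status)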
35 changes: 0 additions & 35 deletions crawler/crawler/settings.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-from datetime import datetime
 
 # Scrapy settings for crawler project
 #
@@ -54,7 +53,6 @@
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 SPIDER_MIDDLEWARES = {
-    'crawler.middlewares.SaveHttpErrorMiddleware': 49,
     'crawler.middlewares.CrawlerSpiderMiddleware': 543,
 }
 
@@ -99,7 +97,6 @@
 
 DEPTH_LIMIT = 10
 
-# LOG_FILE = f"log/crawler_{datetime.now().strftime('%Y.%m.%d_%H:%M:%S')}.log"
 LOG_LEVEL = 'DEBUG'
 LOG_STDOUT = True
 
@@ -131,36 +128,4 @@
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
-    # 'zspider/0.9-dev http://feedback.redkolibri.com/',
-    # 'Xaldon_WebSpider/2.0.b1',
-    # 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Speedy Spider (http://www.entireweb.com/about/search_tech/speedy_spider/)',
-    # 'Mozilla/5.0 (compatible; Speedy Spider; http://www.entireweb.com/about/search_tech/speedy_spider/)',
-    # 'Speedy Spider (Entireweb; Beta/1.3; http://www.entireweb.com/about/search_tech/speedyspider/)',
-    # 'Speedy Spider (Entireweb; Beta/1.2; http://www.entireweb.com/about/search_tech/speedyspider/)',
-    # 'Speedy Spider (Entireweb; Beta/1.1; http://www.entireweb.com/about/search_tech/speedyspider/)',
-    # 'Speedy Spider (Entireweb; Beta/1.0; http://www.entireweb.com/about/search_tech/speedyspider/)',
-    # 'Speedy Spider (Beta/1.0; www.entireweb.com)',
-    # 'Speedy Spider (http://www.entireweb.com/about/search_tech/speedy_spider/)',
-    # 'Speedy Spider (http://www.entireweb.com/about/search_tech/speedyspider/)',
-    # 'Speedy Spider (http://www.entireweb.com)',
-    # 'Sosospider+(+http://help.soso.com/webspider.htm)',
-    # 'sogou spider',
-    # 'Nusearch Spider (www.nusearch.com)',
-    # 'nuSearch Spider (compatible; MSIE 4.01; Windows NT)',
-    # 'lmspider ([email protected])',
-    # 'lmspider [email protected]',
-    # 'ldspider (http://code.google.com/p/ldspider/wiki/Robots)',
-    # 'iaskspider/2.0(+http://iask.com/help/help_index.html)',
-    # 'iaskspider',
-    # 'hl_ftien_spider_v1.1',
-    # 'hl_ftien_spider',
-    # 'FyberSpider (+http://www.fybersearch.com/fyberspider.php)',
-    # 'FyberSpider',
-    # 'everyfeed-spider/2.0 (http://www.everyfeed.com)',
-    # 'envolk[ITS]spider/1.6 (+http://www.envolk.com/envolkspider.html)',
-    # 'envolk[ITS]spider/1.6 ( http://www.envolk.com/envolkspider.html)',
-    # 'Baiduspider+(+http://www.baidu.com/search/spider_jp.html)',
-    # 'Baiduspider+(+http://www.baidu.com/search/spider.htm)',
-    # 'BaiDuSpider',
-    # 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0) AddSugarSpiderBot www.idealobserver.com',
 ]
2 changes: 0 additions & 2 deletions crawler/crawler/spiders/__init__.py
@@ -5,7 +5,5 @@
 
 import os
 
-if not os.path.exists('error/cpd'):
-    os.makedirs('error/cpd')
 if not os.path.exists('log/cpd'):
     os.makedirs('log/cpd')
21 changes: 6 additions & 15 deletions crawler/crawler/spiders/cpd_spider.py
@@ -8,8 +8,8 @@
 
 
 import logging
-import os
 import re
+from datetime import datetime
 
 import scrapy
 from scrapy.linkextractors import LinkExtractor
@@ -30,6 +30,10 @@ def __init__(self, *args, **kwargs):
 
     name = "cpd"
 
+    custom_settings = {
+        'LOG_FILE': f"log/{name}/crawler_{datetime.now().strftime('%Y.%m.%d_%H:%M:%S')}.log"
+    }
+
     database_name = 'news'
     table_name = 'cpd_news'
     filter_name = 'id'
@@ -97,19 +101,6 @@
     p_path2 = re.compile('(.*?)content.html')
 
     def start_requests(self):
-        try:
-            path = f'error/{self.name}'
-            retry_file = os.path.join(path, 'retry.tsv')
-
-            with open(retry_file, 'r') as f:
-                lines = f.readlines()
-                for line in lines:
-                    news_url = line.split('\t')[0]
-                    yield scrapy.Request(url=news_url, callback=self.parse_news)
-
-        except IOError:
-            logger.info('retry.tsv not accessible')
-
         for url in self.start_urls:
             yield scrapy.Request(url=url, callback=self.parse_index, dont_filter=True)
 
@@ -165,6 +156,6 @@ def parse_news(self, response):
         cpd_item.add_value('page', page)
         yield cpd_item.load_item()
 
-        links = self.link.extract_links(response, )
+        links = self.link.extract_links(response)
         for link in links:
             yield scrapy.Request(url=link.url, callback=self.parse_news)
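A note on the custom_settings addition above (not part of the diff): Scrapy merges a spider's custom_settings over the project-wide settings for that spider only, and since the class body runs once at import time, the timestamped LOG_FILE path is fixed per process. A minimal sketch of the same pattern, using a hypothetical spider name:

from datetime import datetime

import scrapy


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, only to illustrate a per-spider LOG_FILE override.
    name = "example"

    custom_settings = {
        # name is visible here because the class body executes top to bottom;
        # the timestamp is evaluated once, when the module is imported.
        'LOG_FILE': f"log/{name}/crawler_{datetime.now().strftime('%Y.%m.%d_%H:%M:%S')}.log",
    }

    def parse(self, response):
        pass

The log/<name> directory still has to exist before the run, which is presumably why spiders/__init__.py keeps creating log/cpd.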
15 changes: 15 additions & 0 deletions deploy/docker-compose-crawler.yml
@@ -0,0 +1,15 @@
+version: '3'
+
+services:
+
+  crawler:
+    image: registry.cn-hangzhou.aliyuncs.com/traffic_news/cpd_crawler:latest
+    container_name: crawler
+    command: scrapy crawl cpd
+    volumes:
+      - ./log/:/project/log/
+    networks:
+      - crawler_net
+
+networks:
+  crawler_net:
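The ./log/ bind mount surfaces the spider's log files on the host (for cpd, log/cpd/crawler_<timestamp>.log per the custom_settings above), assuming the image's working directory is /project. With the image available locally, the service can be brought up with something like: docker-compose -f deploy/docker-compose-crawler.yml up -d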
9 changes: 3 additions & 6 deletions deploy/docker-compose.yml → deploy/docker-compose-db.yml
@@ -6,16 +6,14 @@ services:
     image: mysql
     container_name: mysql-crawler
     command: mysqld --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci  # set the utf8 character set
-    # command: --default-authentication-plugin=mysql_native_password
     restart: always
     environment:
       MYSQL_ROOT_PASSWORD: news_crawler
     ports:
       - '3306:3306'
     volumes:
-      - data/mysql/db:/var/lib/mysql
-      # - data/mysql/conf:/etc/mysql/conf.d
-      - init/cpd.sql:/docker-entrypoint-initdb.d/init.sql:ro
+      - ../data/mysql/db:/var/lib/mysql
+      - ./init/cpd.sql:/docker-entrypoint-initdb.d/init.sql:ro
 
   gui:
     depends_on:
@@ -27,5 +25,4 @@ services:
       MYSQL_ROOT_PASSWORD: news_crawler
       PMA_HOST: db
     ports:
-      - '8080:80'
-
+      - '8000:80'
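Note on the volume fix above: in Compose's short syntax, a source that does not start with ./, ../, / or ~ is treated as a named volume rather than a host path, so the old entries never bound the intended directories; relative host paths are resolved against the directory containing the compose file, which is why, with the file now under deploy/, the data directory is reached as ../data and the init script as ./init.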
