
Commit

Database changes; bug fixes
kosuke-zhang committed Feb 4, 2020
1 parent 2b81d2e commit 0756620
Showing 6 changed files with 19 additions and 23 deletions.
1 change: 1 addition & 0 deletions crawler/crawler/dupefilters.py
@@ -37,6 +37,7 @@ def __init__(self, database_name=None, table_name=None, filter_name=None, debug=
sql = "SELECT {0} FROM {1} WHERE 1".format(filter_name, table_name)
self.cursor.execute(sql)
ids = self.cursor.fetchall()
ids = map(lambda x: x[0], ids)
self.fingerprints.update(ids)

@classmethod
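
The one-line fix above matters because pymysql's fetchall() returns each row as a one-element tuple, so the fingerprint set was previously being filled with tuples rather than the fingerprint strings themselves, and lookups never matched. A minimal sketch of the corrected loading pattern; the host and the table and column names are illustrative assumptions, not taken from the repository:

# Minimal sketch: load stored request fingerprints into a set.
# Host, table and column names below are illustrative assumptions.
import pymysql

db = pymysql.connect(host='localhost', user='root',
                     password='news_crawler', database='news', port=3306)
cursor = db.cursor()
cursor.execute("SELECT fingerprint FROM cpd_news_filter WHERE 1")
rows = cursor.fetchall()            # e.g. (('abc123',), ('def456',))

fingerprints = set()
fingerprints.update(row[0] for row in rows)   # unwrap the 1-tuples
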
11 changes: 2 additions & 9 deletions crawler/crawler/middlewares.py
@@ -126,20 +126,13 @@ def process_request(self, request, spider):
 class SaveHttpErrorMiddleware(object):
 
     def __init__(self):
-        if os.path.exists('../data/error/405.tsv'):
-            os.rename('../data/error/405.tsv', '../data/error/retry.tsv')
-
         if os.path.exists('../data/error/error.tsv'):
-            os.remove('../data/error/error.tsv')
+            os.rename('../data/error/error.tsv', '../data/error/retry.tsv')
 
     def process_spider_input(self, response, spider):
         if 200 <= response.status < 300:  # common case
             return
-        if response.status == 405:
-            with open('../data/error/405.tsv', 'a+') as f:
-                line = f'{response.url}\t{response.status}\n'
-                f.write(line)
-        else:
+        if response.status != 404:
             with open('../data/error/error.tsv', 'a+') as f:
                 line = f'{response.url}\t{response.status}\n'
                 f.write(line)
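
The rewritten middleware drops the separate 405 log: 405 responses are now handled by the retry machinery (note 405 in RETRY_HTTP_CODES in settings.py below), every other non-2xx status except 404 is appended to a single error.tsv, and on startup that file is renamed to retry.tsv instead of being deleted, so failed URLs survive for the next run. A hedged sketch of the resulting flow, with the paths taken from the diff:

# Sketch of the consolidated error-log flow after this commit.
import os

ERROR_PATH = '../data/error/error.tsv'
RETRY_PATH = '../data/error/retry.tsv'

def rotate_error_log():
    # On startup, keep last run's failures as the retry queue
    # (the old code simply deleted them).
    if os.path.exists(ERROR_PATH):
        os.rename(ERROR_PATH, RETRY_PATH)

def record_failure(url, status):
    # Log every non-2xx response except 404 for a later retry pass.
    if 200 <= status < 300 or status == 404:
        return
    with open(ERROR_PATH, 'a+') as f:
        f.write(f'{url}\t{status}\n')
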
14 changes: 7 additions & 7 deletions crawler/crawler/pipelines.py
@@ -23,23 +23,23 @@ def __init__(self):
         mysql_user = settings.get('MYSQL_USER', 'root')
         mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler')
         mysql_port = settings.get('MYSQL_PORT', 3306)
-        database = 'cpd'
+        database = 'news'
         self.db = pymysql.connect(host, mysql_user, mysql_pwd, database, mysql_port)
         self.cursor = self.db.cursor()
 
     def process_item(self, item, spider):
         if item.__len__() != 9:
             raise DropItem(item)
         sql = """INSERT INTO
-        `data`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`)
+        `cpd_news`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`)
         VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
         """
         try:
-            self.cursor.execute(sql, (item['id'], item['url'], item['title'], item['content'], item['category'],
-                                      item['source'], item['date'], item['news_id'], item['page']))
+            self.cursor.execute(sql, (
+                item.get('id', ''), item.get('url', ''), item.get('title', ''), item.get('content', ''),
+                item.get('category', ''),
+                item.get('source', ''), item.get('date', ''), item.get('news_id', ''), item.get('page', '')))
             self.db.commit()
         except Exception as e:
-            spider.logger.error(f'occur error when db commit date: {e.args[1]}')
+            spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}')
             self.db.rollback()
         return
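
Beyond retargeting the INSERT from `data` to `cpd_news`, the pipeline now reads fields with item.get(key, ''), so a missing field becomes an empty string instead of raising KeyError mid-pipeline (the nine-field length check above still drops short items first). For illustration, with a hypothetical partial item:

# Hypothetical partial item, to show indexing vs .get():
item = {'id': '42', 'url': 'http://news.cpd.com.cn/example.html'}

# item['title'] would raise KeyError and abort process_item;
# item.get('title', '') returns '' so the INSERT still receives nine values.
title = item.get('title', '')
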
2 changes: 1 addition & 1 deletion crawler/crawler/settings.py
@@ -108,7 +108,7 @@
 RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405]
 
 # Database settings
-MYSQL_HOST = 'localhost'
+MYSQL_HOST = '39.99.157.187'
 MYSQL_USER = 'root'
 MYSQL_PASSWORD = 'news_crawler'
 MYSQL_PORT = 3306
12 changes: 7 additions & 5 deletions crawler/crawler/spiders/cpd_spider.py
@@ -72,7 +72,7 @@ def __init__(self, *args, **kwargs):
         'http://minsheng.cpd.com.cn/n1448492/'
     ]
 
-    link = LinkExtractor(allow=start_urls, deny='%3Cscript%3Edocument.write(location.href);%3C/script%3E')
+    link = LinkExtractor(allow=start_urls, deny='.*?<script>document\.write\(location\.href\);</script>')
 
     # # http://zhian.cpd.com.cn/n26237006/
     # p_index = re.compile('createPageHTML\((\d+), (\d+),')
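
The deny change above swaps a percent-encoded literal for a regex over the decoded script tag: LinkExtractor treats allow and deny as regular expressions matched against each extracted absolute URL, which is why the dot and parentheses are escaped. A hedged sketch of the idea; the allow URL is illustrative:

# Sketch: LinkExtractor's allow/deny take regexes matched against URLs.
from scrapy.linkextractors import LinkExtractor

link = LinkExtractor(
    allow=[r'http://news\.cpd\.com\.cn/'],   # illustrative start URL
    deny=[r'.*?<script>document\.write\(location\.href\);</script>'],
)
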
@@ -101,7 +101,7 @@ def start_requests(self):
             lines = f.readlines()
             for line in lines:
                 news_url = line.split('\t')[0]
-                yield scrapy.Request(url=news_url, callback=self.parse_news, dont_filter=True)
+                yield scrapy.Request(url=news_url, callback=self.parse_news)
 
         except IOError:
             logger.info('retry.tsv not accessible')
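
Requests rebuilt from retry.tsv no longer pass dont_filter=True, so they go through the dupe filter like any other request, and a URL whose fingerprint is already stored is skipped rather than re-fetched. A minimal sketch of the distinction; spider name and URL are hypothetical:

# Hypothetical spider illustrating dont_filter semantics.
import scrapy

class RetrySketchSpider(scrapy.Spider):
    name = 'retry_sketch'

    def start_requests(self):
        url = 'http://news.cpd.com.cn/example'  # illustrative URL
        # New behaviour: goes through the dupe filter; skipped if seen before.
        yield scrapy.Request(url, callback=self.parse)
        # Old behaviour: dont_filter=True always bypassed the dupe filter.
        # yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pass
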
@@ -120,9 +120,11 @@ def parse_index(self, response):

     def parse_news(self, response):
         url = response.url
-        next_page_html = response.xpath('//*[@id="autopage"]//script').get()
+        title = response.xpath('//*[@id="newslist"]/h1/gettitle/text()').get()
 
         # Decide whether this is an index page or a news page
-        if next_page_html is not None:
+        if title is not None:
+            next_page_html = response.xpath('//*[@id="autopage"]//script').get()
 
             page = response.meta.get('page', 1)

@@ -159,6 +161,6 @@ def parse_news(self, response):
             cpd_item.add_value('page', page)
             yield cpd_item.load_item()
 
-        links = self.link.extract_links(response,)
+        links = self.link.extract_links(response, )
         for link in links:
             yield scrapy.Request(url=link.url, callback=self.parse_news)
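
parse_news now decides whether a response is a news page by probing for the article title node, and only then looks for the #autopage pager script, instead of keying off the pager directly. A small sketch of that classification, using the XPaths from the diff:

# Sketch: classify a response as a news page by the presence of its title.
# XPaths are the ones used in the diff; `response` is a scrapy Response.
def is_news_page(response):
    title = response.xpath('//*[@id="newslist"]/h1/gettitle/text()').get()
    return title is not None
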
2 changes: 1 addition & 1 deletion deploy/init/cpd.sql
@@ -6,7 +6,7 @@ create table if not exists cpd_news
 (
     id varchar(40) not null,
     url varchar(255) not null,
-    title varchar(50) not null,
+    title varchar(255) not null,
     content text not null,
     category varchar(5) not null,
     source varchar(50) not null,
