diff --git a/crawler/crawler/dupefilters.py b/crawler/crawler/dupefilters.py
index dedcb1f..f7e0e75 100644
--- a/crawler/crawler/dupefilters.py
+++ b/crawler/crawler/dupefilters.py
@@ -37,6 +37,7 @@ def __init__(self, database_name=None, table_name=None, filter_name=None, debug=
         sql = "SELECT {0} FROM {1} WHERE 1".format(filter_name, table_name)
         self.cursor.execute(sql)
         ids = self.cursor.fetchall()
+        ids = map(lambda x: x[0], ids)
         self.fingerprints.update(ids)
 
     @classmethod
diff --git a/crawler/crawler/middlewares.py b/crawler/crawler/middlewares.py
index a73c788..bf4b10d 100644
--- a/crawler/crawler/middlewares.py
+++ b/crawler/crawler/middlewares.py
@@ -126,20 +126,13 @@ def process_request(self, request, spider):
 
 
 class SaveHttpErrorMiddleware(object):
     def __init__(self):
-        if os.path.exists('../data/error/405.tsv'):
-            os.rename('../data/error/405.tsv', '../data/error/retry.tsv')
-        if os.path.exists('../data/error/error.tsv'):
-            os.remove('../data/error/error.tsv')
+        os.rename('../data/error/error.tsv', '../data/error/retry.tsv')
 
     def process_spider_input(self, response, spider):
         if 200 <= response.status < 300:  # common case
             return
-        if response.status == 405:
-            with open('../data/error/405.tsv', 'a+') as f:
-                line = f'{response.url}\t{response.status}\n'
-                f.write(line)
-        else:
+        if response.status != 404:
             with open('../data/error/error.tsv', 'a+') as f:
                 line = f'{response.url}\t{response.status}\n'
                 f.write(line)
diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py
index 7b1e14a..94d8234 100644
--- a/crawler/crawler/pipelines.py
+++ b/crawler/crawler/pipelines.py
@@ -23,23 +23,23 @@ def __init__(self):
         mysql_user = settings.get('MYSQL_USER', 'root')
         mysql_pwd = settings.get('MYSQL_PASSWORD', 'news_crawler')
         mysql_port = settings.get('MYSQL_PORT', 3306)
-        database = 'cpd'
+        database = 'news'
         self.db = pymysql.connect(host, mysql_user, mysql_pwd, database, mysql_port)
         self.cursor = self.db.cursor()
 
     def process_item(self, item, spider):
-        if item.__len__() != 9:
-            raise DropItem(item)
         sql = """INSERT INTO
-        `data`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`)
+        `cpd_news`(`id`, `url`, `title`, `content`, `category`, `source`, `date`, `news_id`, `page`)
         VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s) """
         try:
-            self.cursor.execute(sql, (item['id'], item['url'], item['title'], item['content'], item['category'],
-                                      item['source'], item['date'], item['news_id'], item['page']))
+            self.cursor.execute(sql, (
+                item.get('id', ''), item.get('url', ''), item.get('title', ''), item.get('content', ''),
+                item.get('category', ''),
+                item.get('source', ''), item.get('date', ''), item.get('news_id', ''), item.get('page', '')))
             self.db.commit()
         except Exception as e:
-            spider.logger.error(f'occur error when db commit date: {e.args[1]}')
+            spider.logger.error(f'occur error when db commit date: {e.args[1]}; url: {item.get("url", "")}')
             self.db.rollback()
         return
diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py
index 5a841b4..a6c01c1 100644
--- a/crawler/crawler/settings.py
+++ b/crawler/crawler/settings.py
@@ -108,7 +108,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 405]
 
 # 数据库设置
-MYSQL_HOST = 'localhost'
+MYSQL_HOST = '39.99.157.187'
 MYSQL_USER = 'root'
 MYSQL_PASSWORD = 'news_crawler'
 MYSQL_PORT = 3306
diff --git a/crawler/crawler/spiders/cpd_spider.py b/crawler/crawler/spiders/cpd_spider.py
index 70930be..930d8f6 100644
--- a/crawler/crawler/spiders/cpd_spider.py
+++ b/crawler/crawler/spiders/cpd_spider.py
@@ -72,7 +72,7 @@ def __init__(self, *args, **kwargs):
         'http://minsheng.cpd.com.cn/n1448492/'
     ]
-    link = LinkExtractor(allow=start_urls, deny='%3Cscript%3Edocument.write(location.href);%3C/script%3E')
+    link = LinkExtractor(allow=start_urls, deny='.*?')
 
     # # http://zhian.cpd.com.cn/n26237006/
     # p_index = re.compile('createPageHTML\((\d+), (\d+),')
@@ -101,7 +101,7 @@ def start_requests(self):
                 lines = f.readlines()
                 for line in lines:
                     news_url = line.split('\t')[0]
-                    yield scrapy.Request(url=news_url, callback=self.parse_news, dont_filter=True)
+                    yield scrapy.Request(url=news_url, callback=self.parse_news)
         except IOError:
             logger.info('retry.tsv not accessible')
@@ -120,9 +120,11 @@ def parse_index(self, response):
 
     def parse_news(self, response):
         url = response.url
-        next_page_html = response.xpath('//*[@id="autopage"]//script').get()
+        title = response.xpath('//*[@id="newslist"]/h1/gettitle/text()').get()
+
         # 判断是 index 页面,还是 news 页面
-        if next_page_html is not None:
+        if title is not None:
+            next_page_html = response.xpath('//*[@id="autopage"]//script').get()
             page = response.meta.get('page', 1)
@@ -159,6 +161,6 @@ def parse_news(self, response):
             cpd_item.add_value('page', page)
 
             yield cpd_item.load_item()
-        links = self.link.extract_links(response,)
+        links = self.link.extract_links(response, )
         for link in links:
             yield scrapy.Request(url=link.url, callback=self.parse_news)
diff --git a/deploy/init/cpd.sql b/deploy/init/cpd.sql
index 22ace1f..dbc0b36 100644
--- a/deploy/init/cpd.sql
+++ b/deploy/init/cpd.sql
@@ -6,7 +6,7 @@ create table if not exists cpd_news
 (
     id varchar(40) not null,
     url varchar(255) not null,
-    title varchar(50) not null,
+    title varchar(255) not null,
     content text not null,
     category varchar(5) not null,
     source varchar(50) not null,