forked from meilihujia/crawler_samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqiushibaike.js
42 lines (40 loc) · 1.1 KB
/
qiushibaike.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// Crawl configuration object; it is passed to `new Crawler(...)` further down in this script.
// Key names and values are consumed by the Crawler runtime, which is not defined in this file.
var configs = {
  // Site host list and the entry URL(s).
  domains: ["www.qiushibaike.com"],
  scanUrls: ["http://www.qiushibaike.com/"],

  // Regex source strings (double backslashes are string escapes for \. \d \?).
  // contentUrlRegex matches article pages; helperUrlRegexes matches the home page
  // with an optional "8hr/page/<n>?s=<n>" suffix.
  contentUrlRegex: "http://www\\.qiushibaike\\.com/article/\\d+",
  helperUrlRegexes: [
    "http://www\\.qiushibaike\\.com/(8hr/page/\\d+\\?s=\\d+)?"
  ],

  enableJS: false,
  // NOTE(review): presumably a delay between requests in milliseconds — confirm against the runtime docs.
  interval: 3000,

  // Fields to extract; each selector is an XPath expression.
  fields: [
    {
      // First text node of the content div; shortened later in afterExtractField.
      name: "article_title",
      selector: "//*[@id='single-next-link']//div[contains(@class,'content')]/text()[1]",
      required: true
    },
    {
      name: "article_content",
      selector: "//*[@id='single-next-link']",
      required: true
    },
    {
      name: "article_author",
      selector: "//div[contains(@class,'author')]//h2"
    },
    {
      // Same selector as article_author: the extracted value is overwritten
      // with the current timestamp in afterExtractField.
      name: "article_publish_time",
      selector: "//div[contains(@class,'author')]//h2"
    }
  ]
};
/**
 * Post-processes one extracted field value and returns the value to keep.
 *
 * - "article_title": string values longer than 10 UTF-16 units are shortened
 *   to at most 10 units followed by "...". Missing or non-string values are
 *   returned unchanged instead of throwing.
 * - "article_publish_time": the extracted value is discarded and replaced by
 *   the current time as a string of whole seconds since the Unix epoch
 *   (i.e. the crawl time, not a time read from the page).
 * - any other field: returned unchanged.
 *
 * @param {string} fieldName - name of the field being processed
 * @param {*} data - extracted value for that field
 * @param {*} page - unused here; kept so the callback signature is unchanged
 * @returns {*} the processed value
 */
configs.afterExtractField = function (fieldName, data, page) {
  var TITLE_MAX_LENGTH = 10;

  if (fieldName === "article_title") {
    // Guard against null/undefined/non-string data, which has no usable
    // length/substring and would otherwise raise a TypeError.
    if (typeof data === "string" && data.length > TITLE_MAX_LENGTH) {
      var cut = TITLE_MAX_LENGTH;
      var lastUnit = data.charCodeAt(cut - 1);
      // If the last kept unit is a high surrogate, its partner would be cut
      // off; step back one unit so no lone surrogate ends up in the title.
      if (lastUnit >= 0xD800 && lastUnit <= 0xDBFF) {
        cut -= 1;
      }
      data = data.substring(0, cut) + "...";
    }
  } else if (fieldName === "article_publish_time") {
    // Whole epoch seconds as a string; same value the original obtained via
    // Date.parse(new Date()) / 1000, without the string round-trip.
    data = Math.floor(Date.now() / 1000) + "";
  }

  return data;
};
// Build a crawler from the `configs` object and start the crawl.
// NOTE(review): `Crawler` is not defined or imported in this file — presumably
// it is a global supplied by the hosting crawler runtime; confirm before running elsewhere.
var crawler = new Crawler(configs);
crawler.start();