// crawlExample.js
// Project setup: run `npm init` to create package.json, then install the dependencies:
//   npm install request --save
//   npm install async --save
// Required packages
var request = require("request");
var fs = require("fs");
var async = require("async");
// Seed URL: a page of the followers listing of the member "bai-dian".
// NOTE(review): the query starts at offset=20, so the first 20 followers of the
// seed member are never fetched — confirm that this is intended.
const url =
  "https://www.zhihu.com/api/v4/members/bai-dian/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20";

// Options passed to `request` for every followers query.
// SECURITY: a bearer token is committed below. Supply a token through the
// ZHIHU_AUTHORIZATION environment variable instead; the literal is kept only as
// a fallback so existing invocations keep working, and should be revoked.
const options = {
  url: url,
  headers: {
    authorization:
      process.env.ZHIHU_AUTHORIZATION ||
      "Bearer Mi4xRGViWEF3QUFBQUFBSU1KQjNudUNEQmNBQUFCaEFsVk4xU1lGV2dERnlQQzRVYXVQNVNqSmZIUnl1WWY1RkFSczJR|1507695061|f575db5eb9910d90c54f8e92ede7bb0fcfe795e0",
  },
};

// Accumulates the collected user records ({ name, img, url_token }).
const users = [];
// Stop collecting once this many matching users have been gathered.
const MAX_USERS = 1000;

// Followers endpoint for an arbitrary member. "demouser" is a placeholder that
// is replaced with a collected user's url_token when the crawl moves on.
const followersUrlTemplate =
  "https://www.zhihu.com/api/v4/members/demouser/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20";

// Index into `users` of the next member whose followers will be crawled.
let nextUserIndex = 0;

// Ends the crawl: hands everything collected so far to downLoadContent as JSON.
function finishCrawl() {
  console.log(`抓取完成`);
  downLoadContent(JSON.stringify(users));
}

/**
 * Fetches one page of a followers listing, records every follower whose
 * `gender` is 0, and then continues the crawl:
 *   - stops once MAX_USERS records have been collected;
 *   - otherwise follows `paging.next` while the listing has more pages;
 *   - otherwise moves on to the followers of the next collected user;
 *   - stops when there is no further user to expand.
 * On a transport error, a non-200 status or an unparsable body the problem is
 * logged and whatever has been collected so far is saved.
 *
 * @param {string} url - Followers API URL to request.
 */
function getDataList(url) {
  // Per-call copy: the shared `options` object is left untouched.
  const requestOptions = Object.assign({}, options, { url: url });

  request.get(requestOptions, function (error, response, body) {
    if (error || !response || response.statusCode !== 200) {
      console.log(
        error || new Error(`Request to ${url} failed with status ${response && response.statusCode}`)
      );
      if (users.length > 0) {
        finishCrawl();
      }
      return;
    }

    let payload;
    try {
      payload = JSON.parse(body);
    } catch (parseError) {
      console.log(parseError);
      if (users.length > 0) {
        finishCrawl();
      }
      return;
    }

    const followers = payload && Array.isArray(payload.data) ? payload.data : [];
    followers.forEach(function (item) {
      // Gender filter: only records with gender code 0 are kept
      // (presumably "female" in this API — TODO confirm).
      if (item.gender === 0 && typeof item.avatar_url === "string") {
        console.log(`正在抓取${item.avatar_url}`);
        users.push({
          name: item.name,
          // Dropping the "_is" suffix selects a different avatar rendition.
          img: item.avatar_url.replace("_is", ""),
          url_token: item.url_token,
        });
      }
    });

    if (users.length >= MAX_USERS) {
      finishCrawl();
      return;
    }

    // `is_end` marks the last page of the current member's followers.
    const paging = (payload && payload.paging) || {};
    if (!paging.is_end && paging.next) {
      getDataList(paging.next);
      return;
    }

    // Listing exhausted: expand the next collected user, if any is left.
    if (nextUserIndex >= users.length) {
      finishCrawl();
      return;
    }
    const nextUser = users[nextUserIndex];
    console.log(`第${nextUserIndex + 1}个用户的数据`);
    nextUserIndex++;
    getDataList(followersUrlTemplate.replace("demouser", nextUser.url_token));
  });
}
/**
 * Saves the collected users to ./data.js as a CommonJS module and then starts
 * the avatar checks/downloads.
 *
 * The file is overwritten rather than appended to: appending a second
 * `module.exports =...` statement on a later run would leave data.js invalid.
 * The downloads are started whether or not the write succeeded, because they
 * only need the in-memory data.
 *
 * @param {string} cont - JSON text of the collected user array.
 */
function downLoadContent(cont) {
  fs.writeFile("./data.js", "module.exports =" + cont, "utf-8", function (err) {
    if (err) {
      console.log(err);
    } else {
      console.log("success");
    }
    startDownLoad(JSON.parse(cont));
  });
}
// Face-check service used to classify avatars before they are downloaded.
// NOTE(review): this endpoint is plain http, so the credentials below travel
// unencrypted — switch to https if the service offers it.
const eyeUrl = "http://api.eyekey.com/face/Check/checking";

// Form fields sent with every face-check request.
// SECURITY: the app id/key are committed below. Supply them through the
// EYEKEY_APP_ID / EYEKEY_APP_KEY environment variables instead; the literals
// are kept only as a fallback so existing invocations keep working.
const config = {
  app_id: process.env.EYEKEY_APP_ID || "f89ae61fd63d4a63842277e9144a6bd2",
  app_key: process.env.EYEKEY_APP_KEY || "af1cd33549c54b27ae24aeb041865da2",
  // Placeholder image; the URL to check is set per request.
  url: "https://pic4.zhimg.com/43fda2d268bd17c561ab94d3cb8c80eb.jpg",
};
// Directory that receives the downloaded avatars.
const IMAGE_DIR = "./beauty_image";
// Maximum number of avatars that are checked/downloaded at the same time.
const DOWNLOAD_CONCURRENCY = 3;
// Give up on a request after this long so a stalled connection cannot hold a
// concurrency slot forever.
const REQUEST_TIMEOUT_MS = 30000;

// Wraps an optional completion callback so that it runs at most once.
function callOnce(done) {
  let called = false;
  return function () {
    if (called) {
      return;
    }
    called = true;
    if (typeof done === "function") {
      done();
    }
  };
}

// Replaces characters that are path separators or invalid in file names.
function safeFileName(name) {
  return String(name).replace(/[\\/:*?"<>|\u0000-\u001f]/g, "_");
}

// Returns the file extension of an image URL (query/fragment ignored), or
// "jpg" when the URL's last path segment has no plain extension.
function imageExtension(imageUrl) {
  const withoutQuery = String(imageUrl).split(/[?#]/)[0];
  const match = /\.([A-Za-z0-9]+)$/.exec(withoutQuery);
  return match ? match[1] : "jpg";
}

/**
 * Sends the user's avatar to the face-check service and downloads the avatar
 * when the first detected face is reported with gender "Female".
 *
 * @param {{name: string, img: string}} item - Collected user record.
 * @param {function(): void} [done] - Optional. Called exactly once, without
 *   arguments, after the check (and the download it may trigger) has finished,
 *   whether it succeeded or not.
 */
function face(item, done) {
  const finish = callOnce(done);
  // Per-request copy, so concurrent checks never share a mutable `url` field.
  const form = Object.assign({}, config, { url: item.img });

  request.post(
    { url: eyeUrl, form: form, timeout: REQUEST_TIMEOUT_MS },
    function (error, response, body) {
      if (error || !response || response.statusCode !== 200) {
        console.log(`验证失败${item.img}~`);
        finish();
        return;
      }
      try {
        const data = JSON.parse(body);
        // Throws (and is reported below) when no face was detected.
        if (data.face[0].attribute.gender === "Female") {
          console.log(`正在下载${item.img}`);
          downLoadImg(item, finish);
          return;
        }
      } catch (e) {
        console.log(`验证失败${item.img}~`);
      }
      finish();
    }
  );
}

/**
 * Downloads `image.img` into IMAGE_DIR as `<name><timestamp>.<extension>`.
 * The directory is created when missing and the user name is sanitised before
 * it becomes part of the path. Stream errors are logged instead of crashing
 * the process.
 *
 * @param {{name: string, img: string}} image - Collected user record.
 * @param {function(): void} [done] - Optional. Called exactly once, without
 *   arguments, when the file has been written or the download has failed.
 */
function downLoadImg(image, done) {
  const finish = callOnce(done);

  try {
    fs.mkdirSync(IMAGE_DIR);
  } catch (err) {
    // An already existing directory is the normal case after the first image.
    if (err.code !== "EEXIST") {
      console.log(err);
      finish();
      return;
    }
  }

  const target =
    IMAGE_DIR + "/" + safeFileName(image.name) + Date.now() + "." + imageExtension(image.img);
  const download = request({ url: image.img, timeout: REQUEST_TIMEOUT_MS });
  const output = fs.createWriteStream(target);

  download.on("error", function (err) {
    console.log(err);
    output.end(); // release the file handle of the partial file
    finish();
  });
  output.on("error", function (err) {
    console.log(err);
    download.abort(); // nothing is consuming the response any more
    finish();
  });
  output.on("finish", finish);
  download.pipe(output);
}

/**
 * Runs the face check (and the resulting download) for every collected user,
 * with at most DOWNLOAD_CONCURRENCY users in flight at a time.
 *
 * @param {Array<{name: string, img: string}>} imgdata - Collected user records.
 */
function startDownLoad(imgdata) {
  console.log(imgdata);
  async.eachLimit(
    imgdata,
    DOWNLOAD_CONCURRENCY,
    function (item, callback) {
      // The slot is released only when the item is really done. No error is
      // forwarded, so one failed avatar does not abort the remaining ones.
      face(item, function () {
        callback();
      });
    },
    function (err) {
      if (err) {
        console.log(err);
      } else {
        console.log("success!");
      }
    }
  );
}
// Entry point: start the crawl from the seed followers URL.
getDataList(url);