// redbook_image.js (forked from kirbystudy/MediaCrawler)
import axios from 'axios';
import fetch from 'node-fetch';
import fs from 'fs'
import { mkdirp } from 'mkdirp';
import cheerio from 'cheerio';
import path from 'path';
import readline from 'readline'
/**
 * Ensure that a directory exists, creating it (and any missing parent
 * directories) when it does not.
 *
 * Creation is synchronous: the directory exists by the time this function
 * returns, and a failure surfaces as a thrown error.
 *
 * @param {string} folderPath - Path of the directory to create.
 * @returns {void}
 * @throws {Error} If the directory cannot be created.
 */
function mkdir(folderPath) {
  if (fs.existsSync(folderPath)) {
    console.log(`文件夹已存在: ${folderPath}`);
    return;
  }

  console.log(`正在创建新文件夹: ${folderPath}`);
  // Created synchronously so the success message below is truthful and files
  // can be written into the directory immediately after this call returns.
  fs.mkdirSync(folderPath, { recursive: true });
  console.log('创建成功!');
}
/**
 * Download a page and return its body as text.
 *
 * Everything from the first `?` onward (the query string) is stripped from
 * `url` before the request is made.
 *
 * @param {string} url - Page address; may carry a query string.
 * @param {object} headers - HTTP request headers to send.
 * @returns {Promise<string>} The response body.
 * @throws {Error} If the request fails or the server answers with a non-2xx
 *   status. The underlying error is attached as `cause`.
 */
async function fetchHtml(url, headers) {
  const newUrl = url.replace(/\?.*/g, '');
  try {
    const response = await fetch(newUrl, { headers });
    // An error page is not the document the caller asked for; fail here
    // instead of handing its HTML to the parser.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    return await response.text();
  } catch (err) {
    throw new Error(`获取 ${newUrl} 数据失败:${err}`, { cause: err });
  }
}
/**
 * Strip every character that is not a CJK ideograph in U+4E00–U+9FA5, an
 * ASCII letter, a digit, or an underscore.
 *
 * @param {?string} title - Raw title; `null`/`undefined` is treated as empty.
 * @returns {string} The title with all other characters removed (possibly '').
 */
function cleanTitle(title) {
  // A missing title yields an empty string instead of a TypeError on `.replace`.
  if (title == null) {
    return '';
  }
  return String(title).replace(/[^\u4e00-\u9fa5\w]/g, '');
}
/**
 * Read the note object embedded in a page's HTML.
 *
 * The note is taken from the inline script whose text starts with
 * `window.__INITIAL_STATE__=`, at `note.note` of the parsed state.
 *
 * @param {string} html - HTML text of the page.
 * @returns {object} The note object; its `imageList` is guaranteed to be an array.
 * @throws {Error} If the state script is missing, its payload cannot be
 *   parsed, or the note has no image list.
 */
function parseNoteFromHtml(html) {
  const $ = cheerio.load(html);
  const stateScript = $('script')
    .get()
    .map(({ children }) => children[0] && children[0].data) // inline script text, if any
    .find((text) => text && text.startsWith('window.__INITIAL_STATE__='));
  if (stateScript === undefined) {
    throw new Error('没有找到对应的 script 标签');
  }

  const jsonStr = stateScript.slice(stateScript.indexOf('=') + 1);
  // `undefined` is not valid JSON, so every occurrence is rewritten to `null`
  // before parsing. Note that this also rewrites the word inside string values.
  const jsonData = jsonStr.replace(/undefined/g, 'null');

  let data;
  try {
    data = JSON.parse(jsonData);
  } catch (error) {
    throw new Error(`解析 JSON 失败:${error}`, { cause: error });
  }

  const note = data && data.note && data.note.note;
  if (!note || !Array.isArray(note.imageList)) {
    throw new Error('未找到图片链接');
  }
  return note;
}

/**
 * Extract the note embedded in a page's HTML and download all of its images.
 *
 * Images are saved as `./images/<cleaned title>/<traceId>.jpg`; files that
 * already exist are skipped. A failed download pass is retried, up to 5 passes
 * in total with a 3 second pause between passes. Problems with the HTML itself
 * are reported immediately, because re-parsing the same string cannot produce
 * a different result.
 *
 * @param {string} html - HTML text of the note page.
 * @returns {Promise<void>} Resolves once every image is present on disk.
 * @throws {Error} If the note cannot be extracted from `html`, or if the
 *   images still cannot be downloaded after 5 passes.
 */
async function getPictures(html) {
  const maxAttempts = 5;
  const retryDelayMs = 3000;

  const note = parseNoteFromHtml(html);
  const imageList = note.imageList;
  console.log(`开始下载 ${imageList.length}张 图片`);
  const folderPath = './images/' + cleanTitle(note.title == null ? '' : note.title);
  mkdir(folderPath);

  let lastError;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      for (const image of imageList) {
        const picUrl = `https://sns-img-qc.xhscdn.com/${image.traceId}`;
        const filename = `${folderPath}/${image.traceId}.jpg`;
        // Files fetched by an earlier pass (or an earlier run) are not re-downloaded.
        if (fs.existsSync(filename)) {
          console.log(`文件 ${filename} 已存在,跳过下载`);
          continue;
        }
        await download(picUrl, filename);
      }
      // Success is only reported after every image has been handled.
      return;
    } catch (error) {
      lastError = error;
      console.log(`下载图片失败:${error}`);
      // No point in waiting after the final pass.
      if (attempt < maxAttempts) {
        await sleep(retryDelayMs);
      }
    }
  }

  throw new Error('重试5次仍未成功获取图片链接', { cause: lastError });
}
/**
 * Pause for a given amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves, with no value, once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Download a file and save it to a local path, retrying on failure.
 *
 * Any error (request failure, non-200 status, or a failed write) counts as a
 * failed attempt. Attempts repeat immediately until the failure count reaches
 * `maxRetries`, at which point the most recent error is rethrown.
 *
 * @param {String} url - URL of the file.
 * @param {String} filePath - Local destination path, including the file name.
 * @param {Number} retries - Number of failed attempts already counted.
 * @param {Number} maxRetries - Failure count at which to give up.
 * @return {Promise<String>} Resolves with `filePath` once the file is written.
 */
async function download(url, filePath, retries = 0, maxRetries = 5) {
  const headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
  };
  const timeout = 10000; // request timeout: 10 seconds

  let failures = retries;
  for (;;) {
    try {
      // Fetch the raw bytes of the file.
      const { status, data } = await axios.get(url, { headers, responseType: 'arraybuffer', timeout });
      if (status !== 200) {
        throw new Error(`下载图片失败 ${filePath}`);
      }
      // Persist the bytes to disk.
      fs.writeFileSync(filePath, data);
      console.log(`图片 ${filePath} 下载成功!`);
      return filePath;
    } catch (error) {
      failures += 1;
      const shortName = path.basename(filePath);
      if (failures >= maxRetries) {
        console.error(`下载图片失败 ${shortName}, 重试次数已达到上限`);
        throw error;
      }
      console.error(`下载图片失败 ${shortName}, 重试(${failures}/${maxRetries})...`);
      // Fall through to the next loop iteration for another attempt.
    }
  }
}
// NOTE(review): this is a session cookie committed to source control. It is
// kept only as the backward-compatible default; prefer passing `cookie`
// explicitly and rotate/remove this value.
const DEFAULT_XHS_COOKIE = 'xhsTrackerId=a32db973-f67d-4842-a047-f60a6dfb64bd; xhsTrackerId.sig=pze-jfxgqNP0jmBwKyjA2awterEQPKQTDa1ZkvvsPIo; xhsTracker=url=explore&searchengine=baidu; xhsTracker.sig=u1cFYHAwm89lKbFLL1Y8vp9JcskioXWTa56RKaAB2ys; a1=18834163b32fq9eg76e7cap0vs5ld3veuvpmhktco50000424130; webId=71199a5d0b387d06f3f1fa825b4071f0; gid=yYYq4yKqYi7iyYYq4yKqDhT9qJiAjdkWKdWSUSl0U8hM26281fqhdx8884J4yq88DS04f42S; gid.sign=W8CEAhgALtsKx2rpcArnuEyWR24=; web_session=040069b5f5e5ce20e2f81088a4364ba65d30f0; webBuild=2.11.5; cache_feeds=[]; xsecappid=xhs-pc-web; websectiga=82e85efc5500b609ac1166aaf086ff8aa4261153a448ef0be5b17417e4512f28; sec_poison_id=cf7a4e0b-b26b-4963-8588-417cf50da70e';

/**
 * Process a list of note pages one after another: fetch each page and
 * download the images it references.
 *
 * Processing stops at the first page that fails.
 *
 * @param {string[]} urls - Page addresses to process, in order.
 * @param {string} [cookie] - Value for the `Cookie` request header; defaults
 *   to the cookie bundled with this script.
 * @returns {Promise<void>}
 * @throws {Error} If fetching or processing any page fails; the message names
 *   the URL and the underlying error is attached as `cause`.
 */
async function loopLink(urls, cookie = DEFAULT_XHS_COOKIE) {
  const headers = {
    // The catch-all media range must be spelled "*/*"; a bare "/*" is not a
    // valid media range.
    'accept':
      'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Cookie': cookie,
    'User-Agent':
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
  };

  for (const url of urls) {
    try {
      const html = await fetchHtml(url, headers);
      await getPictures(html);
    } catch (error) {
      throw new Error(`无法处理 URL ${url}: ${error}`, { cause: error });
    }
  }
}
/**
 * Script entry point: prompt for one or more page links (separated by
 * whitespace), download the images of each, and exit with status 0 on success
 * or 1 on failure.
 */
(async () => {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  const answer = await new Promise((resolve) => {
    rl.question('请输入小红书图片链接:', resolve);
  });
  // The prompt is answered; release stdin.
  rl.close();

  // Split on any run of whitespace and drop empty pieces, so repeated,
  // leading or trailing spaces do not produce empty "URLs".
  const urls = answer.trim().split(/\s+/).filter(Boolean);
  if (urls.length === 0) {
    // Nothing to do must not be reported as a successful download.
    console.error('下载图片失败:', new Error('未输入任何链接'));
    process.exit(1);
  }

  try {
    await loopLink(urls);
    console.log('所有图片下载成功!');
    process.exit(0);
  } catch (error) {
    console.error('下载图片失败:', error);
    process.exit(1);
  }
})();