Skip to content

Commit

Permalink
fix(route): fix bjfu encoding (#13223)
Browse files (browse the repository at this point in the history)
* fix(route): fix bjfu encoding

* fix: use cache.tryGet

---------
  • Loading branch information
JimenezLi authored Sep 6, 2023
1 parent bb524fb commit fa27a51
Show file tree
Hide file tree
Showing 8 changed files with 152 additions and 175 deletions.
58 changes: 22 additions & 36 deletions lib/v2/bjfu/grs.js
Original file line number Diff line number Diff line change
@@ -1,60 +1,46 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const iconv = require('iconv-lite');
const { parseDate } = require('@/utils/parse-date');
const timezone = require('@/utils/timezone');

module.exports = async (ctx) => {
const url = 'http://graduate.bjfu.edu.cn/pygl/pydt/index.html';
const response = await got.get(url, {
responseType: 'buffer',
});
const data = iconv.decode(response.data, 'gb2312');
const response = await got.get(url);
const data = response.data;
const $ = cheerio.load(data);
const list = $('.itemList li')
.slice(0, 10)
.map((i, e) => {
.slice(0, 11)
.toArray()
.map((e) => {
const element = $(e);
const title = element.find('li a').attr('title');
const link = element.find('li a').attr('href');
const date = new Date(
element
.find('li a')
.text()
.match(/\d{4}-\d{2}-\d{2}/)
);
const timeZone = 8;
const serverOffset = date.getTimezoneOffset() / 60;
const pubDate = new Date(date.getTime() - 60 * 60 * 1000 * (timeZone + serverOffset)).toUTCString();
const date = element
.find('li a')
.text()
.match(/\d{4}-\d{2}-\d{2}/);
const pubDate = timezone(parseDate(date), 8);

return {
title,
description: '',
link: 'http://graduate.bjfu.edu.cn/pygl/pydt/' + link,
author: '北京林业大学研究生院培养动态',
pubDate,
};
})
.get();
});

const result = await Promise.all(
list.map(async (item) => {
const link = item.link;
list.map((item) =>
ctx.cache.tryGet(item.link, async () => {
const itemReponse = await got.get(item.link);
const data = itemReponse.data;
const itemElement = cheerio.load(data);

const cache = await ctx.cache.get(link);
if (cache) {
return Promise.resolve(JSON.parse(cache));
}
item.description = itemElement('.articleTxt').html();

const itemReponse = await got.get(link, {
responseType: 'buffer',
});
const data = iconv.decode(itemReponse.data, 'gb2312');
const itemElement = cheerio.load(data);

item.description = itemElement('.articleTxt').html();

ctx.cache.set(link, JSON.stringify(item));
return item;
})
return item;
})
)
);

ctx.state.data = {
Expand Down
16 changes: 9 additions & 7 deletions lib/v2/bjfu/it/index.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const util = require('./utils');
const iconv = require('iconv-lite'); // 转码
const iconv = require('iconv-lite');

module.exports = async (ctx) => {
const type = ctx.params.type;
Expand All @@ -27,16 +27,18 @@ module.exports = async (ctx) => {

const response = await got({
method: 'get',
responseType: 'buffer', // 转码
responseType: 'buffer',
url: base,
});

const data = iconv.decode(response.data, 'gb2312'); // 转码
const $ = cheerio.load(data);

// const list = $('div[item-content]').slice(0, 10).get();
const data = response.data;
let $ = cheerio.load(iconv.decode(data, 'utf-8'));
const charset = $('meta[charset]').attr('charset');
if (charset?.toLowerCase() !== 'utf-8') {
$ = cheerio.load(iconv.decode(data, charset ?? 'utf-8'));
}

const list = $('.item-content').get();
const list = $('.item-content').toArray();

const result = await util.ProcessFeed(base, list, ctx.cache); // 感谢@hoilc指导

Expand Down
67 changes: 38 additions & 29 deletions lib/v2/bjfu/it/utils.js
Original file line number Diff line number Diff line change
@@ -1,62 +1,71 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const iconv = require('iconv-lite'); // 转码
const iconv = require('iconv-lite');
const { parseDate } = require('@/utils/parse-date');
const timezone = require('@/utils/timezone');

// 完整文章页
async function load(link) {
const response = await got.get(link, {
responseType: 'buffer',
});
let response;
try {
response = await got.get(link, {
responseType: 'buffer',
});
} catch (e) {
return { description: '' };
}

const data = iconv.decode(response.data, 'gb2312'); // 转码
const data = response.data; // 不用转码

// 加载文章内容
const $ = cheerio.load(data);

// 解析日期
const pubDate = timezone(
parseDate(
$('.template-head-info')
.text()
.match(/\d{4}-\d{2}-\d{2}/)
),
+8
);
let $ = cheerio.load(iconv.decode(data, 'utf-8'));
const charset = $('meta[charset]').attr('charset');
if (charset?.toLowerCase() !== 'utf-8') {
$ = cheerio.load(iconv.decode(data, charset ?? 'utf-8'));
}

// 提取内容
const description = $('.template-body').html();
const description = ($('.template-body').length ? $('.template-body').html() : '') + ($('.template-tail').length ? $('.template-tail').html() : '');

// 返回解析的结果
return { description, pubDate };
return { description };
}

const ProcessFeed = (base, list, caches) =>
// 使用 Promise.all() 进行 async 并发
Promise.all(
// 遍历每一篇文章
list.map(async (item) => {
list.map((item) => {
const $ = cheerio.load(item);

const $title = $('a');
// 还原相对链接为绝对链接
const itemUrl = new URL($title.attr('href'), base).href; // 感谢@hoilc指导

// 列表上提取到的信息
const single = {
title: $title.text(),
link: itemUrl,
author: '北林信息',
guid: itemUrl,
};
// 解析日期
const pubDate = timezone(
parseDate(
$('span')
.text()
.match(/\d{4}-\d{2}-\d{2}/)
),
+8
);

// 使用tryGet方法从缓存获取内容。
// 当缓存中无法获取到链接内容的时候,则使用load方法加载文章内容。
const other = await caches.tryGet(itemUrl, () => load(itemUrl));
return caches.tryGet(itemUrl, async () => {
const { description } = await load(itemUrl);

// 合并解析后的结果集作为该篇文章最终的输出结果
return { ...single, ...other };
// 列表上提取到的信息
return {
title: $title.text(),
link: itemUrl,
author: '北林信息',
description,
pubDate,
};
});
})
);
module.exports = {
Expand Down
11 changes: 3 additions & 8 deletions lib/v2/bjfu/jwc/index.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const util = require('./utils');
const iconv = require('iconv-lite'); // 转码

module.exports = async (ctx) => {
const type = ctx.params.type;
let title, path;
switch (type) {
case 'jwkx':
title = '教务快讯';
path = 'jwkx/';
break;
case 'jgdt':
title = '教改动态';
path = 'jgdt/';
Expand All @@ -27,6 +22,7 @@ module.exports = async (ctx) => {
title = '图片新闻';
path = 'tpxw/';
break;
case 'jwkx':
default:
title = '教务快讯';
path = 'jwkx/';
Expand All @@ -35,14 +31,13 @@ module.exports = async (ctx) => {

const response = await got({
method: 'get',
responseType: 'buffer', // 转码
url: base,
});

const data = iconv.decode(response.data, 'gb2312'); // 转码
const data = response.data; // 不用转码
const $ = cheerio.load(data);

const list = $('.list_c li').slice(0, 10).get();
const list = $('.list_c li').slice(0, 15).toArray();

const result = await util.ProcessFeed(base, list, ctx.cache); // 感谢@hoilc指导

Expand Down
55 changes: 26 additions & 29 deletions lib/v2/bjfu/jwc/utils.js
Original file line number Diff line number Diff line change
@@ -1,61 +1,58 @@
const got = require('@/utils/got');
const cheerio = require('cheerio');
const iconv = require('iconv-lite'); // 转码
const { parseDate } = require('@/utils/parse-date');
const timezone = require('@/utils/timezone');

// 完整文章页
async function load(link) {
const response = await got.get(link, {
responseType: 'buffer',
});
const response = await got.get(link);

const data = iconv.decode(response.data, 'gb2312'); // 转码
const data = response.data; // 不用转码

// 加载文章内容
const $ = cheerio.load(data);

// 解析日期
const pubDate = timezone(
parseDate(
$('div #con_djl')
.text()
.match(/\d{4}-\d{2}-\d{2}/)
),
+8
);

// 提取内容
const description = $('#con_c').html();
const description = ($('#con_c').length ? $('#con_c').html() : '') + ($('#con_fujian').length ? $('#con_fujian').html() : '');

// 返回解析的结果
return { description, pubDate };
return { description };
}

const ProcessFeed = (base, list, caches) =>
Promise.all(
// 遍历每一篇文章
list.map(async (item) => {
list.map((item) => {
const $ = cheerio.load(item);

const $title = $('a');
// 还原相对链接为绝对链接
const itemUrl = new URL($title.attr('href'), base).href; // 感谢@hoilc指导

// 列表上提取到的信息
const single = {
title: $title.text(),
link: itemUrl,
author: '北林教务处',
guid: itemUrl,
};
// 解析日期
const pubDate = timezone(
parseDate(
$('.datetime')
.text()
.match(/\d{4}-\d{2}-\d{2}/)
),
+8
);

// 使用tryGet方法从缓存获取内容。
// 当缓存中无法获取到链接内容的时候,则使用load方法加载文章内容。
const other = await caches.tryGet(itemUrl, () => load(itemUrl));

// 合并解析后的结果集作为该篇文章最终的输出结果
return { ...single, ...other };
return caches.tryGet(itemUrl, async () => {
const { description } = await load(itemUrl);

// 列表上提取到的信息
return {
title: $title.text(),
link: itemUrl,
author: '北林教务处',
description,
pubDate,
};
});
})
);
module.exports = {
Expand Down
Loading

0 comments on commit fa27a51

Please sign in to comment.