-
Notifications
You must be signed in to change notification settings - Fork 0
/
old.js
80 lines (69 loc) · 2.27 KB
/
old.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
// Gets the list of posts on the page
/*
1. Get array of images within the post, add to object
2. Iterate through each page to create a json file of all the posts
3. Iterate through each post grabbing content and photos
*/
const axios = require('axios');
const cheerio = require('cheerio');
const sanitizeHtml = require('sanitize-html');
// Page fetched by the axios request at the bottom of this file.
// The commented-out listing URL is kept as the alternative target (presumably
// for use with getList — confirm); the active URL points at a single post.
// const url = 'https://www.4vultures.org/news/';
const url = 'https://www.4vultures.org/2019/05/23/griffon-vulture-falls-victim-to-severe-lead-poisoning/';
/**
 * Extracts the post summaries from a blog listing page.
 *
 * @param {string} html - Markup of the listing page.
 * @returns {Array<{title: string, date: string, link: (string|null)}>} One
 *   entry per `.blogselection .j-blogarticle` element, in document order.
 *   `link` is null when the article heading has no `href`. The array is also
 *   logged to the console once.
 */
const getList = (html) => {
  // Local accumulator: a bare `data = []` would create an implicit global.
  const posts = [];
  const $ = cheerio.load(html, {
    normalizeWhitespace: true,
    decodeEntities: true,
  });

  $('.blogselection .j-blogarticle').each((i, elem) => {
    const article = $(elem);
    const header = article.find('h2 a');
    const day = article.find('.datetime-inner .day').text();
    const month = article.find('.datetime-inner .mon').text();
    const year = article.find('.datetime-inner .yr').text();
    const href = header.attr('href');

    posts.push({
      title: header.text(),
      date: `${day} ${month} ${year}`,
      // Resolve the href against the fetched URL instead of cutting a fixed
      // number of characters off it, so any base URL gives a valid link.
      link: href ? new URL(href, url).href : null,
    });
  });

  // Log once after the loop rather than dumping the growing array per article.
  console.log(posts);
  return posts;
};
/**
 * Extracts the title, date, sanitized body and tags of a single blog post.
 *
 * @param {string} html - Markup of the post page.
 * @returns {{title: string, date: string, content: string, tags: string[]}}
 *   The extracted post; it is also logged to the console.
 */
const getPostContent = (html) => {
  const $ = cheerio.load(html, {
    normalizeWhitespace: true,
  });

  const tags = [];
  $('.j-blog-post--tags-list .j-blog-post--tag').each((i, elem) => {
    tags.push($(elem).text());
  });

  // The body is read from the element that follows the post header. Fall back
  // to an empty string when it is missing so the sanitizer gets a string.
  const rawContent = $('.j-blog-post--header').next().html() || '';

  const cleanContent = sanitizeHtml(rawContent, {
    // Defaults plus image/figure tags, minus <div>.
    allowedTags: sanitizeHtml.defaults.allowedTags
      .concat(['img', 'figure', 'figcaption'])
      .filter((tag) => tag !== 'div'),
    allowedAttributes: {
      a: ['href', 'name', 'target'],
      img: ['src'],
    },
    // Collapse line breaks and runs of spaces into a single space. Replacing
    // them with nothing would glue neighbouring words together.
    textFilter: (text) => text.replace(/[\r\n ]+/g, ' '),
    // Drop paragraphs that contain only whitespace.
    exclusiveFilter: (frame) => frame.tag === 'p' && !frame.text.trim(),
  });

  const data = {
    title: $('#content_area .j-blog-post--headline').text(),
    date: $('#content_area .j-blog-post--date').text().trim(),
    // Remove "dimension=<value>...:" segments from the markup (presumably
    // image-resize parameters inside src URLs — confirm against real pages).
    content: cleanContent.replace(/dimension=[\w-]+\S*:/g, ''),
    tags,
  };

  console.log(data);
  return data;
};
// Fetch the configured page and hand its markup to the matching parser.
axios.get(url)
  .then((response) => {
    // getList(response.data);
    getPostContent(response.data);
  })
  .catch((error) => {
    // Report failures on stderr and through the exit status so that a failed
    // run is distinguishable from a successful one.
    console.error(error);
    process.exitCode = 1;
  });