-
Notifications
You must be signed in to change notification settings - Fork 0
/
server.js
67 lines (53 loc) · 1.97 KB
/
server.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
//import libraries
const axios = require('axios');
// axios is a promise-based HTTP client for the browser and Node.js;
// it is used here to make HTTP requests from Node.js (downloading page HTML).
const cheerio = require('cheerio');
// cheerio is a library that lets us use jQuery-style syntax on HTML;
// it is used here to parse and query the downloaded HTML.
/**
 * Crawls the scrapeme.live shop listing: downloads listing pages one at a
 * time, follows the pagination links found on each page, collects the product
 * links, and finally prints the unique product URLs to stdout.
 *
 * A page that fails to download is logged to stderr and skipped; it still
 * counts toward the `maxPages` budget.
 *
 * @param {number} [maxPages=50] - Maximum number of pages to request.
 * @returns {Promise<void>} Resolves after the collected product URLs have
 *   been logged.
 */
async function main(maxPages = 50) {
  console.log("Hello World!");

  // Pending pages. `pop()` takes from the end, so the most recently
  // discovered URL is crawled next (LIFO order).
  const pagesToVisit = ["https://scrapeme.live/shop/"];
  // Every URL that has ever been queued (visited or still pending); gives
  // constant-time duplicate checks instead of scanning two arrays per link.
  const discoveredURLs = new Set(pagesToVisit);
  const productURLs = new Set();
  let requestedPages = 0;

  // Iterate until the page budget is spent or the queue is empty.
  // Strict `<` so that at most `maxPages` pages are requested.
  while (pagesToVisit.length > 0 && requestedPages < maxPages) {
    const pageURL = pagesToVisit.pop();
    requestedPages += 1;

    // Download the HTML; one bad page must not abort the whole crawl.
    let pageHTML;
    try {
      pageHTML = await axios.get(pageURL);
    } catch (err) {
      console.error(`Failed to fetch ${pageURL}:`, err.message);
      continue;
    }

    const $ = cheerio.load(pageHTML.data);

    // Queue pagination links that have not been seen before.
    $(".page-numbers a").each((index, element) => {
      const paginationURL = $(element).attr("href");
      // Anchors without an href yield undefined; skip them.
      if (paginationURL && !discoveredURLs.has(paginationURL)) {
        discoveredURLs.add(paginationURL);
        pagesToVisit.push(paginationURL);
      }
    });

    // Collect product links; the Set removes duplicates across pages.
    $("li.product a.woocommerce-LoopProduct-link").each((index, element) => {
      const productURL = $(element).attr("href");
      if (productURL) {
        productURLs.add(productURL);
      }
    });
  }

  // Log crawling results.
  console.log([...productURLs]);
}
// Entry point: crawl at most 10 pages, then exit explicitly with a status
// code that reflects the outcome.
main(10)
  .then(() => {
    console.log("Crawling completed!");
    process.exit(0);
  })
  .catch((err) => {
    // Without this handler a rejected crawl would surface only as an
    // unhandled promise rejection.
    console.error("Crawling failed:", err);
    process.exit(1);
  });