-
Notifications
You must be signed in to change notification settings - Fork 9
/
index.js
85 lines (79 loc) · 2.82 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const mongoose = require("mongoose");
const Listing = require("./model/Listing");
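// NOTE(review): MongoDB credentials noted here (craigslistuser:SuperStrongPassword1) — secrets should not live in source control; rotate them and load them from configuration instead.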
/**
 * Hand-written example of a fully scraped listing record (title, posting date,
 * neighborhood, url, job description and compensation).
 *
 * NOTE(review): nothing in the visible code reads this constant; it appears to
 * serve as a reference for the record shape — confirm before removing.
 */
const scrapingResults = [
  {
    title: "Entry Level Software Engineer - C or C++",
    // ISO 8601 date-time without an offset is parsed as local time by the
    // language spec; the former "YYYY-MM-DD HH:mm:ss" form relied on
    // implementation-defined parsing.
    datePosted: new Date("2019-07-26T12:00:00"),
    neighborhood: "(palo alto)",
    url:
      "https://sfbay.craigslist.org/pen/sof/d/palo-alto-entry-level-software-engineer/6943135190.html",
    jobDescription:
      "Major Technology company is seeking an Entry Level software Engineer. The ideal candidate will have extensive school project experience with C or C++. Under general supervision...",
    compensation: "Up to US$0.00 per year"
  }
];
/**
 * Opens the mongoose connection used to store listings.
 *
 * The connection string is taken from the MONGODB_URI environment variable
 * when it is set to a non-empty value; otherwise the built-in default below is
 * used, so existing setups keep working unchanged.
 *
 * @returns {Promise<void>} Resolves once mongoose.connect has completed.
 *   A rejection from mongoose.connect propagates to the caller.
 */
async function connectToMongoDb() {
  // NOTE(review): the host part of this default reads "[email protected]",
  // which looks like an obfuscation placeholder rather than a real host —
  // confirm the intended value. Credentials should not stay in source.
  const defaultUri =
    "mongodb://craigslistuser:[email protected]:59377/craigslistlistings";
  const uri = process.env.MONGODB_URI || defaultUri;

  await mongoose.connect(uri, { useNewUrlParser: true });
  console.log("connected to mongodb");
}
/**
 * Opens the Craigslist software-jobs search page and builds one summary
 * record per ".result-info" node found in its HTML.
 *
 * @param {object} page - Object exposing async `goto(url)` and `content()`;
 *   `main` passes the page it obtained from the browser.
 * @returns {Promise<Array<{title: string, url: string, datePosted: Date, neighborhood: string}>>}
 *   The records in document order. `url` is the title link's href attribute
 *   and `datePosted` is built from the date node's "datetime" attribute.
 */
async function scrapeListings(page) {
  await page.goto(
    "https://sfbay.craigslist.org/d/software-qa-dba-etc/search/sof"
  );
  const $ = cheerio.load(await page.content());

  // Converts a single result node into a plain listing record.
  const toListing = (_position, node) => {
    const row = $(node);
    const link = row.find(".result-title");
    const hoodText = row.find(".result-hood").text();
    const postedAt = row.find(".result-date").attr("datetime");

    // After trimming, only the first "(" and the first ")" are removed.
    const neighborhood = hoodText
      .trim()
      .replace("(", "")
      .replace(")", "");

    return {
      title: link.text(),
      url: link.attr("href"),
      datePosted: new Date(postedAt),
      neighborhood
    };
  };

  return $(".result-info")
    .map(toListing)
    .get();
}
/**
 * Visits each listing's page, records its description and compensation on the
 * listing object, and saves it through the Listing model.
 *
 * Listings are processed one at a time, with a one-second pause after each
 * (presumably to avoid hitting the site too quickly — confirm).
 *
 * @param {Array<object>} listings - Records with a `url` property; each one is
 *   mutated in place to gain `jobDescription` and `compensation`.
 * @param {object} page - Object exposing async `goto(url)` and `content()`.
 * @returns {Promise<Array<object>>} The same `listings` array, now enriched,
 *   so callers can use the awaited result directly.
 */
async function scrapeJobDescriptions(listings, page) {
  for (const listing of listings) {
    await page.goto(listing.url);
    const $ = cheerio.load(await page.content());

    listing.jobDescription = $("#postingbody").text();
    listing.compensation = $("p.attrgroup > span:nth-child(1) > b").text();
    console.log(listing.jobDescription);
    console.log(listing.compensation);

    // Save each listing as soon as it is complete, so earlier results are
    // kept even if a later page fails.
    await new Listing(listing).save();
    await sleep(1000); //1 second sleep
  }
  return listings;
}
/**
 * Suspends the awaiting caller for the given duration.
 *
 * @param {number} miliseconds - Delay handed to setTimeout.
 * @returns {Promise<void>} Settles (with no value) once the timer fires.
 */
async function sleep(miliseconds) {
  await new Promise(wake => {
    setTimeout(wake, miliseconds);
  });
}
/**
 * Runs the whole scrape: connects to MongoDB, opens a visible (non-headless)
 * browser, collects the search results, enriches and stores each listing, and
 * logs the final listings.
 *
 * The browser is closed and the mongoose connection is released whether the
 * run succeeds or fails, so the process is not kept alive by open handles.
 *
 * @returns {Promise<void>} Rejects with the first error raised by any step.
 */
async function main() {
  await connectToMongoDb();
  try {
    const browser = await puppeteer.launch({ headless: false });
    try {
      const page = await browser.newPage();
      const listings = await scrapeListings(page);
      // The listing objects are enriched in place, so `listings` itself holds
      // the descriptions and compensation once this call completes.
      await scrapeJobDescriptions(listings, page);
      console.log(listings);
    } finally {
      await browser.close();
    }
  } finally {
    await mongoose.disconnect();
  }
}
// Start the scraper. A failed run is logged and reported through a non-zero
// exit code instead of being left as an unhandled promise rejection.
main().catch(error => {
  console.error(error);
  process.exitCode = 1;
});