-
Notifications
You must be signed in to change notification settings - Fork 32
/
crawl.js
92 lines (71 loc) · 2.67 KB
/
crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
const puppeteer = require('puppeteer');
(async () => {
const crawler = {
url: process.env.ALGOLIA_CRAWLER_URL,
id: process.env.ALGOLIA_CRAWLER_ID,
userId: process.env.ALGOLIA_CRAWLER_USER_ID,
apiKey: process.env.ALGOLIA_CRAWLER_API_KEY,
}
if (!crawler.url || !crawler.id || !crawler.userId || !crawler.apiKey) {
console.log("Missing required env vars")
return
}
/**
 * Sends a new list of extra URLs to the crawler's config endpoint and then
 * requests a reindex.
 *
 * Both requests are always attempted: a failed config update is logged but
 * does not stop the reindex request from being sent. Failures are reported
 * through the returned flags rather than by throwing (network-level errors
 * raised by `fetch` itself still propagate).
 *
 * @param {string[]} urls - URLs sent as the `extraUrls` field of the PATCH body.
 * @returns {Promise<{updateOk: boolean, runOk: boolean}>} The `ok` flag of the
 *   config update response and of the reindex response.
 */
const updateUrls = async (urls) => {
  // Both requests carry the same Basic credentials, so build the headers once.
  const basicToken = Buffer.from(`${crawler.userId}:${crawler.apiKey}`).toString("base64");
  const headers = {
    "Content-Type": "application/json",
    "Authorization": `Basic ${basicToken}`,
  };

  // Read the body as text first: error responses are not guaranteed to be
  // JSON, and calling `.json()` on them would throw and hide the real failure.
  const readBody = async (response) => {
    const text = await response.text();
    try {
      return JSON.parse(text);
    } catch {
      return text;
    }
  };

  console.log("Updating config");
  const updateResponse = await fetch(`${crawler.url}/${crawler.id}/config`, {
    method: "PATCH",
    headers,
    body: JSON.stringify({ extraUrls: urls }),
  });
  const updateBody = await readBody(updateResponse);
  if (!updateResponse.ok) {
    console.log(updateResponse.status, updateBody);
  }

  console.log("Running crawler");
  const runResponse = await fetch(`${crawler.url}/${crawler.id}/reindex`, {
    method: "POST",
    headers,
  });
  if (!runResponse.ok) {
    // Log the status and body instead of the opaque Response object.
    console.log(runResponse.status, await readBody(runResponse));
  }

  return {
    updateOk: updateResponse.ok,
    runOk: runResponse.ok,
  };
};
// Render the API docs page in headless Chromium, collect every API link the
// rendered page exposes, and hand the list to the crawler.
const browser = await puppeteer.launch({
  headless: true,
  args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
try {
  // The schema location is used twice: in the page URL below and as the
  // `schema_url` parameter set on every collected link.
  const schemaUrl = 'https://raw.githubusercontent.com/cloudflare/api-schemas/main/openapi.yaml';

  const page = await browser.newPage();
  // 0 turns the navigation timeout off.
  page.setDefaultNavigationTimeout(0);

  const url = `https://developers.cloudflare.com/api?_expand=true&schema_url=${schemaUrl}`;
  console.log('Navigating to url: ', url);
  await page.goto(url);

  console.log("Waiting for page to render...");
  await page.waitForSelector('text/Cloudflare API');
  // Wait for text from an expanded sidebar item
  await page.waitForSelector('text/List Accounts');

  // Runs inside the page: gather the resolved href of every anchor.
  const links = await page.evaluate(() => {
    const anchors = Array.from(document.querySelectorAll('a[href]'));
    return anchors.map((anchor) => anchor.href);
  });

  // Keep only links under the API docs path, then drop duplicates.
  const filteredLinks = links.filter((link) =>
    link.startsWith('https://developers.cloudflare.com/api/')
  );
  const uniqLinks = [...new Set(filteredLinks)];
  console.log(`Got ${uniqLinks.length} links`);

  // Set the schema_url query parameter on each link.
  const linksWithParam = uniqLinks.map((link) => {
    const baseUrl = new URL(link);
    baseUrl.searchParams.set("schema_url", schemaUrl);
    return baseUrl.toString();
  });

  const resp = await updateUrls(linksWithParam);
  console.log(resp);
} finally {
  // Close the browser even when a step above throws, so a failed run does
  // not leave a Chromium process behind.
  await browser.close();
}
})()