Skip to content

Commit

Permalink
driver: make driver optional, default to just calling loadPage()
Browse files Browse the repository at this point in the history
tests: convert previous driver test to custom selector test with --selectLinks, also test invalid selector (no links extracted)
tests: add new driver test to create PDF instead of using custom selector (no longer customizable via driver params)
  • Loading branch information
ikreymer committed Nov 5, 2024
1 parent 9ef7984 commit 6dd5da5
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 75 deletions.
36 changes: 22 additions & 14 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,14 @@ export class Crawler {

proxyServer?: string;

driver!: (opts: {
page: Page;
data: PageState;
// eslint-disable-next-line no-use-before-define
crawler: Crawler;
}) => Promise<void>;
driver:
| ((opts: {
page: Page;
data: PageState;
// eslint-disable-next-line no-use-before-define
crawler: Crawler;
}) => Promise<void>)
| null = null;

recording: boolean;

Expand Down Expand Up @@ -930,8 +932,12 @@ self.__bx_behaviors.selectMainBehavior();
await page.setExtraHTTPHeaders({});
}

// run custom driver here
await this.driver({ page, data, crawler: this });
// run custom driver here, if any
if (this.driver) {
await this.driver({ page, data, crawler: this });
} else {
await this.loadPage(page, data);
}

data.title = await timedRun(
page.title(),
Expand Down Expand Up @@ -1347,12 +1353,14 @@ self.__bx_behaviors.selectMainBehavior();
);
}

try {
const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl.href)).default;
} catch (e) {
logger.warn(`Error importing driver ${this.params.driver}`, e);
return;
if (this.params.driver) {
try {
const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl.href)).default;
} catch (e) {
logger.warn(`Error importing driver ${this.params.driver}`, e);
return;
}
}

await this.initCrawlState();
Expand Down
15 changes: 0 additions & 15 deletions src/defaultDriver.ts

This file was deleted.

3 changes: 1 addition & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -212,9 +212,8 @@ class ArgParser {
},

driver: {
describe: "JS driver for the crawler",
describe: "Custom driver for the crawler, if any",
type: "string",
default: "./defaultDriver.js",
},

generateCDX: {
Expand Down
45 changes: 4 additions & 41 deletions tests/custom_driver.test.js
Original file line number Diff line number Diff line change
@@ -1,52 +1,15 @@
import child_process from "child_process";
import fs from "fs";

test("ensure custom driver with custom selector crawls JS files as pages", async () => {
test("ensure custom driver creates PDF", async () => {
try {
child_process.execSync(
"docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
"docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs --limit 1",
);
} catch (error) {
console.log(error);
}

const crawledPages = fs.readFileSync(
"test-crawls/collections/custom-driver-1/pages/pages.jsonl",
"utf8",
);
const pages = new Set();

for (const line of crawledPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
pages.add(url);
}

const crawledExtraPages = fs.readFileSync(
"test-crawls/collections/custom-driver-1/pages/extraPages.jsonl",
"utf8",
);
const extraPages = new Set();

for (const line of crawledExtraPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
extraPages.add(url);
}

const expectedPages = new Set([
"https://www.iana.org/",
]);

const expectedExtraPages = new Set([
"https://www.iana.org/_js/jquery.js",
"https://www.iana.org/_js/iana.js",
]);

expect(pages).toEqual(expectedPages);
expect(extraPages).toEqual(expectedExtraPages);
const pdfs = fs.readdirSync("test-crawls/collections/custom-driver-1").filter(x => x.endsWith(".pdf"));
expect(pdfs.length).toBe(1);
});
71 changes: 71 additions & 0 deletions tests/custom_selector.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import child_process from "child_process";
import fs from "fs";

// Verify that a custom link selector ("script[src]->src") makes the crawler
// record linked JS files as extra pages alongside the seed page.
test("test custom selector crawls JS files as pages", async () => {
  try {
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-1 --selectLinks \"script[src]->src\"",
    );
  } catch (error) {
    // crawl failures surface via the assertions below
    console.log(error);
  }

  // Read a JSONL pages file and collect the set of recorded URLs,
  // skipping any record without a url field.
  const readUrls = (path) => {
    const urls = new Set();
    const text = fs.readFileSync(path, "utf8");
    for (const line of text.trim().split("\n")) {
      const url = JSON.parse(line).url;
      if (url) {
        urls.add(url);
      }
    }
    return urls;
  };

  const pages = readUrls(
    "test-crawls/collections/custom-sel-1/pages/pages.jsonl",
  );
  const extraPages = readUrls(
    "test-crawls/collections/custom-sel-1/pages/extraPages.jsonl",
  );

  expect(pages).toEqual(new Set(["https://www.iana.org/"]));
  expect(extraPages).toEqual(
    new Set([
      "https://www.iana.org/_js/jquery.js",
      "https://www.iana.org/_js/iana.js",
    ]),
  );
});


// With a syntactically invalid selector ("script["), no links should be
// extracted: extraPages.jsonl ends up with exactly one line (presumably
// just the header record — confirm against the pages-file format).
test("test invalid selector, no pages extracted", async () => {
  try {
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
    );
  } catch (error) {
    console.log(error);
  }

  const extraPagesText = fs.readFileSync(
    "test-crawls/collections/custom-sel-invalid/pages/extraPages.jsonl",
    "utf8",
  );
  const lineCount = extraPagesText.trim().split("\n").length;
  expect(lineCount).toBe(1);
});


6 changes: 3 additions & 3 deletions tests/fixtures/driver-1.mjs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
export default async ({ data, page, crawler }) => {
await crawler.loadPage(page, data, [
{ selector: "script[src]", extract: "src", isAttribute: false },
]);
await crawler.loadPage(page, data);

await page.pdf({"path": `${crawler.collDir}/${data.pageid}.pdf`});
};

0 comments on commit 6dd5da5

Please sign in to comment.