Skip to content

Commit

Permalink
Merge branch 'main' into add-link-selector
Browse files Browse the repository at this point in the history
  • Loading branch information
ikreymer committed Nov 5, 2024
2 parents c4ae086 + 2a9b152 commit 9ef7984
Show file tree
Hide file tree
Showing 27 changed files with 339 additions and 186 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ RUN ln -s /app/dist/main.js /usr/bin/crawl; \
ln -s /app/dist/main.js /usr/bin/qa; \
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile

RUN mkdir -p /app/behaviors

WORKDIR /crawls

# enable to test custom behaviors build (from browsertrix-behaviors)
Expand Down
8 changes: 5 additions & 3 deletions docs/docs/user-guide/cli-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,11 @@ Options:
ailOnFailedSeed may result in crawl
failing due to non-200 responses
[boolean] [default: false]
--customBehaviors injects a custom behavior file or se
t of behavior files in a directory
[string]
--customBehaviors Custom behavior files to inject. Val
ues can be URLs, paths to individual
behavior files, or paths to a direct
ory of behavior files.
[array] [default: []]
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
ess (for debugging) [boolean]
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.3.4",
"version": "1.4.0-beta.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down Expand Up @@ -30,7 +30,7 @@
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^23.5.1",
"puppeteer-core": "^23.6.0",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
Expand Down
20 changes: 11 additions & 9 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ import {
runWorkers,
} from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";

import { Browser } from "./util/browser.js";

Expand Down Expand Up @@ -175,6 +175,7 @@ export class Crawler {
finalExit = false;
uploadAndDeleteLocal = false;
done = false;
postCrawling = false;

textInPages = false;

Expand Down Expand Up @@ -510,7 +511,7 @@ export class Crawler {
}

if (this.params.customBehaviors) {
this.customBehaviors = this.loadCustomBehaviors(
this.customBehaviors = await this.loadCustomBehaviors(
this.params.customBehaviors,
);
}
Expand Down Expand Up @@ -800,24 +801,24 @@ self.__bx_behaviors.selectMainBehavior();
});
}

loadCustomBehaviors(filename: string) {
async loadCustomBehaviors(sources: string[]) {
let str = "";

for (const { contents } of collectAllFileSources(filename, ".js")) {
for (const { contents } of await collectCustomBehaviors(sources)) {
str += `self.__bx_behaviors.load(${contents});\n`;
}

return str;
}

async checkBehaviorScripts(cdp: CDPSession) {
const filename = this.params.customBehaviors;
const sources = this.params.customBehaviors;

if (!filename) {
if (!sources) {
return;
}

for (const { path, contents } of collectAllFileSources(filename, ".js")) {
for (const { path, contents } of await collectCustomBehaviors(sources)) {
await this.browser.checkScript(cdp, path, contents);
}
}
Expand Down Expand Up @@ -1536,12 +1537,13 @@ self.__bx_behaviors.selectMainBehavior();
}

async postCrawl() {
this.postCrawling = true;
logger.info("Crawling done");

if (this.params.combineWARC && !this.params.dryRun) {
await this.combineWARC();
}

logger.info("Crawling done");

if (
(this.params.generateCDX || this.params.generateWACZ) &&
!this.params.dryRun
Expand Down
6 changes: 4 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -563,8 +563,10 @@ class ArgParser {

customBehaviors: {
describe:
"injects a custom behavior file or set of behavior files in a directory",
type: "string",
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
" to a directory of behavior files",
type: "array",
default: [],
},

debugAccessRedis: {
Expand Down
36 changes: 11 additions & 25 deletions src/util/browser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { Readable } from "node:stream";
import os from "os";
import path from "path";

import { LogContext, logger } from "./logger.js";
import { formatErr, LogContext, logger } from "./logger.js";
import { initStorage } from "./storage.js";

import { DISPLAY, type ServiceWorkerOpt } from "./constants.js";
Expand Down Expand Up @@ -126,7 +126,7 @@ export class Browser {
? undefined
: (target) => this.targetFilter(target),
};
await this._init(launchOpts, ondisconnect, recording);
await this._init(launchOpts, ondisconnect);
}

targetFilter(target: Target) {
Expand Down Expand Up @@ -392,17 +392,14 @@ export class Browser {
launchOpts: PuppeteerLaunchOptions,
// eslint-disable-next-line @typescript-eslint/ban-types
ondisconnect: Function | null = null,
recording: boolean,
) {
this.browser = await puppeteer.launch(launchOpts);

const target = this.browser.target();

this.firstCDP = await target.createCDPSession();

if (recording) {
await this.serviceWorkerFetch();
}
await this.browserContextFetch();

if (ondisconnect) {
this.browser.on("disconnected", (err) => ondisconnect(err));
Expand Down Expand Up @@ -479,35 +476,24 @@ export class Browser {
return { page, cdp };
}

async serviceWorkerFetch() {
async browserContextFetch() {
if (!this.firstCDP) {
return;
}

this.firstCDP.on("Fetch.requestPaused", async (params) => {
const { frameId, requestId, networkId, request } = params;
const { frameId, requestId, request } = params;

const { url } = request;

if (!this.firstCDP) {
throw new Error("CDP missing");
}

if (networkId) {
try {
await this.firstCDP.send("Fetch.continueResponse", { requestId });
} catch (e) {
logger.warn(
"continueResponse failed",
{ url: request.url },
"recorder",
);
}
return;
}

let foundRecorder = null;

for (const recorder of this.recorders) {
if (recorder.swUrls.has(request.url)) {
if (recorder.swUrls.has(url)) {
recorder.swFrameIds.add(frameId);
}

Expand All @@ -520,16 +506,16 @@ export class Browser {
if (!foundRecorder) {
logger.warn(
"Skipping URL from unknown frame",
{ url: request.url, frameId },
{ url, frameId },
"recorder",
);

try {
await this.firstCDP.send("Fetch.continueResponse", { requestId });
} catch (e) {
logger.warn(
logger.debug(
"continueResponse failed",
{ url: request.url },
{ url, ...formatErr(e), from: "serviceWorker" },
"recorder",
);
}
Expand Down
104 changes: 82 additions & 22 deletions src/util/file_reader.ts
Original file line number Diff line number Diff line change
@@ -1,27 +1,83 @@
import fs from "fs";
import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";

import { logger } from "./logger.js";

const MAX_DEPTH = 2;

export function collectAllFileSources(
// Add .ts to allowed extensions when we can support it
const ALLOWED_EXTS = [".js"];

/** A single behavior script collected by this module. */
export type FileSource = {
  // Path of the script file (collectLocalPathBehaviors sets this to the resolved path).
  path: string;
  // NOTE(review): presumably the script's text as read from the file — the assignment is not visible here; confirm.
  contents: string;
};

/** A list of collected behavior scripts. */
export type FileSources = FileSource[];

/**
 * Collect custom behavior scripts from a list of sources.
 *
 * Each source is either an http(s) URL, which is downloaded first, or a
 * local path handed to collectLocalPathBehaviors(). Sources are processed
 * sequentially, in the order given.
 *
 * @param sources - URLs and/or local paths of custom behaviors
 * @returns the behaviors collected from all sources, in source order
 */
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    // Require a full scheme prefix: a bare "http" prefix check would also
    // match local paths such as "http-behaviors/" and try to fetch them.
    const isUrl = /^https?:\/\//i.test(fileSource);
    const newSources = isUrl
      ? await collectOnlineBehavior(fileSource)
      : await collectLocalPathBehaviors(fileSource);
    collectedSources.push(...newSources);
  }

  return collectedSources;
}

/**
 * Download a custom behavior script from a URL and collect it as a local file.
 *
 * The response body is written to a randomly named .js file under
 * /app/behaviors and then collected via collectLocalPathBehaviors().
 * Failures are logged rather than thrown.
 *
 * @param url - URL of the behavior script to download
 * @returns the collected behavior, or an empty array if the download failed
 */
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  const filename = crypto.randomBytes(4).toString("hex") + ".js";
  const behaviorFilepath = `/app/behaviors/${filename}`;

  try {
    const res = await fetch(url);
    // fetch() only rejects on network errors; without this check the body of
    // a 404/500 error page would be saved and injected as a behavior script.
    if (!res.ok) {
      logger.error(
        "Error downloading custom behavior from URL",
        { url, status: res.status },
        "behavior",
      );
      return [];
    }
    const fileContents = await res.text();
    await fsp.writeFile(behaviorFilepath, fileContents);
    logger.info(
      "Custom behavior file downloaded",
      { url, path: behaviorFilepath },
      "behavior",
    );
    return await collectLocalPathBehaviors(behaviorFilepath);
  } catch (e) {
    logger.error(
      "Error downloading custom behavior from URL",
      { url, error: e },
      "behavior",
    );
  }
  return [];
}

async function collectLocalPathBehaviors(
fileOrDir: string,
ext?: string,
depth = 0,
): { path: string; contents: string }[] {
): Promise<FileSources> {
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
logger.warn(
`Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
{},
"behavior",
);
return [];
}

const stat = fs.statSync(resolvedPath);
const stat = await fsp.stat(resolvedPath);

if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
const contents = fs.readFileSync(resolvedPath);
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
const contents = await fsp.readFile(resolvedPath);
return [
{
path: resolvedPath,
Expand All @@ -30,24 +86,28 @@ export function collectAllFileSources(
];
}

if (stat.isDirectory()) {
const files = fs.readdirSync(resolvedPath);
return files.reduce(
(acc: { path: string; contents: string }[], next: string) => {
const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
},
[],
const behaviors: FileSources = [];

const isDir = stat.isDirectory();

if (!isDir && depth === 0) {
logger.warn(
"The provided path is not a .js file or directory",
{ path: resolvedPath },
"behavior",
);
}

if (depth === 0) {
console.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
);
if (isDir) {
const files = await fsp.readdir(resolvedPath);
for (const file of files) {
const filePath = path.join(resolvedPath, file);
const newBehaviors = await collectLocalPathBehaviors(filePath, depth + 1);
behaviors.push(...newBehaviors);
}
}

return [];
return behaviors;
}

export async function getInfoString() {
Expand Down
Loading

0 comments on commit 9ef7984

Please sign in to comment.