diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 3678f49e4..cb1610a98 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -111,6 +111,7 @@ class SettingsResponse(BaseModel):
     defaultPageLoadTimeSeconds: int
 
     maxPagesPerCrawl: int
+    numBrowsers: int
     maxScale: int
 
     billingEnabled: bool
@@ -143,6 +144,7 @@ def main() -> None:
             os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120)
         ),
         maxPagesPerCrawl=int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
+        numBrowsers=int(os.environ.get("NUM_BROWSERS", 1)),
         maxScale=int(os.environ.get("MAX_CRAWL_SCALE", 3)),
         billingEnabled=is_bool(os.environ.get("BILLING_ENABLED")),
         signUpUrl=os.environ.get("SIGN_UP_URL", ""),
diff --git a/backend/test/test_api.py b/backend/test/test_api.py
index 439bfbff3..5c0a1d68b 100644
--- a/backend/test/test_api.py
+++ b/backend/test/test_api.py
@@ -43,6 +43,7 @@ def test_api_settings():
         "jwtTokenLifetime": 86400,
         "defaultBehaviorTimeSeconds": 300,
         "maxPagesPerCrawl": 4,
+        "numBrowsers": 2,
         "maxScale": 3,
         "defaultPageLoadTimeSeconds": 120,
         "billingEnabled": True,
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index fa1c7db64..6026287f7 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -56,6 +56,8 @@ data:
   MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"
 
+  NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}"
+
   MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"
 
   ENABLE_AUTO_RESIZE_CRAWLERS: "{{ .Values.enable_auto_resize_crawlers }}"
diff --git a/docs/user-guide/archived-items.md b/docs/user-guide/archived-items.md
index 719587585..428ce4c38 100644
--- a/docs/user-guide/archived-items.md
+++ b/docs/user-guide/archived-items.md
@@ -56,7 +56,7 @@ For more details on navigating web archives within ReplayWeb.page, see the [Repl
 
 ### Exporting Files
 
-While crawling, Browsertrix will output one or more WACZ files — the crawler aims to output files in consistently sized chunks, and each [crawler instance](workflow-setup.md#crawler-instances) will output separate WACZ files.
+While crawling, Browsertrix will output one or more WACZ files — the crawler aims to output files in consistently sized chunks, and each crawler will output separate WACZ files.
 
 The **WACZ Files** tab lists the individually downloadable WACZ files that make up the archived item as well as their file sizes and backup status.
diff --git a/docs/user-guide/crawl-workflows.md b/docs/user-guide/crawl-workflows.md
index 435621fd7..78bbe3f60 100644
--- a/docs/user-guide/crawl-workflows.md
+++ b/docs/user-guide/crawl-workflows.md
@@ -34,7 +34,7 @@ Run a crawl workflow by clicking _Run Crawl_ in the actions menu of the workflow
 
 While crawling, the **Watch Crawl** section displays a list of queued URLs that will be visited, and streams the current state of the browser windows as they visit pages from the queue. You can [modify the crawl live](./running-crawl.md) by adding URL exclusions or changing the number of crawling instances.
 
-Re-running a crawl workflow can be useful to capture a website as it changes over time, or to run with an updated [crawl scope](workflow-setup.md#scope).
+Re-running a crawl workflow can be useful to capture a website as it changes over time, or to run with an updated [crawl scope](workflow-setup.md#crawl-scope).
 
 ## Status
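The changes above plumb a single new value, `NUM_BROWSERS`, from the Helm chart through the API's `SettingsResponse` and into the test fixtures. A minimal TypeScript sketch of how a client could consume the new field; the `fetchSettings` helper and `API_BASE` constant are illustrative, not part of this diff:

```ts
// Sketch only: mirrors the SettingsResponse fields touched in this diff.
type SettingsResponse = {
  maxPagesPerCrawl: number;
  numBrowsers: number; // browser windows per crawler instance (NUM_BROWSERS)
  maxScale: number; // maximum crawler scale (MAX_CRAWL_SCALE)
};

const API_BASE = "http://localhost:8000"; // illustrative base URL

async function fetchSettings(): Promise<SettingsResponse> {
  const resp = await fetch(`${API_BASE}/api/settings`);
  if (!resp.ok) throw new Error(`settings request failed: ${resp.status}`);
  return (await resp.json()) as SettingsResponse;
}

// With the test fixture values above (numBrowsers: 2, maxScale: 3), the
// largest crawl could open 2 * 3 = 6 browser windows.
fetchSettings().then(({ numBrowsers, maxScale }) => {
  console.log(`max browser windows: ${numBrowsers * maxScale}`);
});
```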
diff --git a/docs/user-guide/overview.md b/docs/user-guide/overview.md
index 6194d5aea..b67b5d6a0 100644
--- a/docs/user-guide/overview.md
+++ b/docs/user-guide/overview.md
@@ -21,7 +21,7 @@ The crawling panel lists the number of currently running and waiting crawls, as
 
 For organizations with a set execution minute limit, the crawling panel displays a graph of how much execution time has been used and how much is currently remaining. Monthly execution time limits reset on the first of each month at 12:00 AM GMT.
 
 ??? Question "How is execution time calculated?"
-    Execution time is the total runtime of all [_Crawler Instances_](workflow-setup.md/#crawler-instances) during a crawl. For instance, if _Crawler Instances_ scale is set to 2× and each crawler instance uses 2 minutes of active crawling time, execution time for the crawl will be 4 minutes. Like elapsed time, this is tracked as the crawl runs so changing the _Crawler Instances_ scale while a crawl is running may change the amount of execution time used in a given time period.
+    Execution time is the total crawl runtime scaled by the [_Browser Windows_](workflow-setup.md/#browser-windows) setting's increment value. Like elapsed time, this is tracked as the crawl runs, so changing the number of _Browser Windows_ while a crawl is running may change the amount of execution time used in a given time period.
 
 ## Collections
diff --git a/docs/user-guide/running-crawl.md b/docs/user-guide/running-crawl.md
index ae31f398d..f899f83fd 100644
--- a/docs/user-guide/running-crawl.md
+++ b/docs/user-guide/running-crawl.md
@@ -23,9 +23,9 @@ If the crawl queue is filled with URLs that should not be crawled, use the _Edit
 
 Exclusions added while crawling are applied to the same exclusion table saved in the workflow's settings and will be used the next time the crawl workflow is run unless they are manually removed.
 
-## Changing the Number of Crawler Instances
+## Changing the Number of Browser Windows
 
-Like exclusions, the [crawler instance](workflow-setup.md#crawler-instances) scale can also be adjusted while crawling. On the Watch Crawl page, press the _Edit Crawler Instances_ button, and set the desired value.
+Like exclusions, the number of [browser windows](workflow-setup.md#browser-windows) can also be adjusted while crawling. On the **Watch Crawl** tab, press the _Edit Browser Windows_ button, and set the desired value.
 
 Unlike exclusions, this change will not be applied to future workflow runs.
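The reworded execution-time note can be made concrete. The sketch below is one illustrative reading of the docs above, not code from this diff; in particular, rounding partial increments up to a whole crawler instance is an assumption:

```ts
// Illustrative: execution time accrues per crawler instance, and the
// Browser Windows setting moves in increments of numBrowsers per instance.
function executionSeconds(
  activeSeconds: number, // active (non-waiting) crawl runtime
  browserWindows: number, // current Browser Windows value
  numBrowsers: number, // browser windows per crawler instance
): number {
  const instances = Math.ceil(browserWindows / numBrowsers); // assumed rounding
  return activeSeconds * instances;
}

// With 2 browsers per instance, 4 windows means 2 instances, so 2 minutes
// of active crawling uses 4 minutes of execution time.
console.log(executionSeconds(120, 4, 2)); // 240
```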
diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md
index 120eea66c..5dae9c82e 100644
--- a/docs/user-guide/workflow-setup.md
+++ b/docs/user-guide/workflow-setup.md
@@ -6,7 +6,7 @@ Changes to a setting will only apply to subsequent crawls.
 
 Crawl settings are shown in the crawl workflow detail **Settings** tab and in the archived item **Crawl Settings** tab.
 
-## Scope
+## Crawl Scope
 
 Specify the range and depth of your crawl. Different settings will be shown depending on whether you chose _Known URLs_ (crawl type of **URL List**) or _Automated Discovery_ (crawl type of **Seeded Crawl**) when creating a new workflow.
@@ -114,10 +114,6 @@ The crawl will be gracefully stopped after this set period of elapsed time.
 
 The crawl will be gracefully stopped after reaching this set size in GB.
 
-### Crawler Instances
-
-Increasing the amount of crawler instances will speed up crawls by using additional browser windows to capture more pages in parallel. This will also increase the amount of traffic sent to the website and may result in a higher chance of getting rate limited.
-
 ### Page Load Timeout
 
 Limits amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.
@@ -146,6 +142,15 @@ Configure the browser used to visit URLs during the crawl.
 
 Sets the [_Browser Profile_](browser-profiles.md) to be used for this crawl.
 
+### Browser Windows
+
+Sets the number of browser windows that are used to visit webpages while crawling. Increasing the number of browser windows will speed up crawls by capturing more pages in parallel.
+
+There are some trade-offs:
+
+- This may result in a higher chance of getting rate limited due to the increase in traffic sent to the website.
+- More execution minutes will be used per crawl.
+
 ### Crawler Release Channel
 
 Sets the release channel of [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) to be used for this crawl. Crawls started by this workflow will use the latest crawler version from the selected release channel. Generally "Default" will be the most stable, however others may have newer features (or bugs)!
diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts
index 78fc14cc7..21d130e87 100644
--- a/frontend/src/components/ui/config-details.ts
+++ b/frontend/src/components/ui/config-details.ts
@@ -166,10 +166,6 @@ export class ConfigDetails extends LiteElement {
             msg("Crawl Size Limit"),
             renderSize(crawlConfig?.maxCrawlSize),
           )}
-          ${this.renderSetting(
-            msg("Crawler Instances"),
-            crawlConfig?.scale ? `${crawlConfig.scale}×` : "",
-          )}

           ${sectionStrings.perPageLimits}

@@ -232,6 +228,12 @@ export class ConfigDetails extends LiteElement {
               >`,
             ),
           )}
+          ${this.renderSetting(
+            msg("Browser Windows"),
+            crawlConfig?.scale && this.appState.settings
+              ? `${crawlConfig.scale * this.appState.settings.numBrowsers}`
+              : "",
+          )}
           ${this.renderSetting(
             msg("Crawler Channel (Exact Crawler Version)"),
             capitalize(crawlConfig?.crawlerChannel || "default") +
diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts
index 260153bf3..cf95e4f0b 100644
--- a/frontend/src/features/crawl-workflows/workflow-editor.ts
+++ b/frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -1254,29 +1254,6 @@ https://archiveweb.page/images/${"logo.svg"}`}
         `)}
         ${this.renderHelpTextCol(infoTextStrings["maxCrawlSizeGB"])}
-        ${inputCol(html`
-          
-            this.updateFormState({
-              scale: +(e.target as SlCheckbox).value,
-            })}
-          >
-            ${map(
-              range(this.defaults.maxScale),
-              (i: number) =>
-                html` ${i + 1}×`,
-            )}
-          
-        `)}
-        ${this.renderHelpTextCol(
-          msg(`Increasing parallel crawler instances can speed up crawls, but may
-          increase the chances of getting rate limited.`),
-        )}
         ${this.renderSectionHeading(sectionStrings.perPageLimits)}
         ${inputCol(html`
         `)}
         ${this.renderHelpTextCol(infoTextStrings["browserProfile"])}
+        ${inputCol(html`
+          
+            this.updateFormState({
+              scale: +(e.target as SlCheckbox).value,
+            })}
+          >
+            ${when(this.appState.settings?.numBrowsers, (numBrowsers) =>
+              map(
+                range(this.defaults.maxScale),
+                (i: number) =>
+                  html` ${(i + 1) * numBrowsers}`,
+              ),
+            )}
+          
+        `)}
+        ${this.renderHelpTextCol(
+          html`${msg(
+            `Increase the number of open browser windows during a crawl. This will speed up your crawl by effectively running more crawlers at the same time.`,
+          )}
+          ${msg("See caveats")}.`,
+        )}
         ${inputCol(html`
           ${msg(
-            "Aggregated time across all crawler instances that the crawler was actively executing a crawl or QA analysis run, i.e. not in a waiting state",
+            "Aggregated time across all browser windows during which the crawler was actively executing a crawl or QA analysis run, i.e. not in a waiting state",
           )}
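The editor change above keeps `scale` as the stored unit and only presents multiples of `numBrowsers` to the user. A standalone sketch of that mapping, with illustrative function names, using the fixture values from the backend test (`numBrowsers: 2`, `maxScale: 3`):

```ts
// Sketch of the options the workflow editor now renders: the form still
// stores scale (1..maxScale), but each label shows total browser windows.
function browserWindowOptions(
  maxScale: number,
  numBrowsers: number,
): { value: number; label: number }[] {
  return Array.from({ length: maxScale }, (_, i) => ({
    value: i + 1, // scale value stored on the workflow
    label: (i + 1) * numBrowsers, // browser windows shown to the user
  }));
}

// [{ value: 1, label: 2 }, { value: 2, label: 4 }, { value: 3, label: 6 }]
console.log(browserWindowOptions(3, 2));
```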
diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts
index 02eb0ad02..1d098b6e4 100644
--- a/frontend/src/pages/org/workflow-detail.ts
+++ b/frontend/src/pages/org/workflow-detail.ts
@@ -493,11 +493,15 @@ export class WorkflowDetail extends LiteElement {
       return html`

${this.tabLabels[this.activePanel]}

             (this.openDialogName = "scale")}
           >
-            
-            ${msg("Edit Crawler Instances")}
+            
+            ${msg("Edit Browser Windows")}
       `;
     }
     if (this.activePanel === "logs") {
@@ -558,17 +562,15 @@
         ${when(
-          !this.isLoading && this.seeds,
-          () => html`
+          !this.isLoading && this.seeds && this.workflow,
+          (workflow) => html`
-              this.navTo(
-                `${this.orgBasePath}/workflows/crawl/${this.workflow!.id}`,
-              )}
+              this.navTo(`${this.orgBasePath}/workflows/crawl/${workflow.id}`)}
             >
           `,
           this.renderLoading,
@@ -675,7 +677,7 @@
           (this.openDialogName = "scale")}>
-            ${msg("Edit Crawler Instances")}
+            ${msg("Edit Browser Windows")}
           (this.openDialogName = "exclusions")}
@@ -732,36 +734,36 @@
         ${this.renderDetailItem(
           msg("Status"),
-          () => html`
+          (workflow) => html`
           `,
         )}
         ${this.renderDetailItem(
           msg("Total Size"),
-          () =>
+          (workflow) =>
             html`
           `,
         )}
-        ${this.renderDetailItem(msg("Schedule"), () =>
-          this.workflow!.schedule
+        ${this.renderDetailItem(msg("Schedule"), (workflow) =>
+          workflow.schedule
             ? html`
-                ${humanizeSchedule(this.workflow!.schedule, {
+                ${humanizeSchedule(workflow.schedule, {
                   length: "short",
                 })}
              ` : html`${msg("No Schedule")}`,
        )}
-        ${this.renderDetailItem(msg("Created By"), () =>
+        ${this.renderDetailItem(msg("Created By"), (workflow) =>
           msg(
-            str`${this.workflow!.createdByName} on ${this.dateFormatter.format(
-              new Date(`${this.workflow!.created}Z`),
+            str`${workflow.createdByName} on ${this.dateFormatter.format(
+              new Date(`${workflow.created}Z`),
             )}`,
           ),
         )}
@@ -771,7 +773,7 @@
   private renderDetailItem(
     label: string | TemplateResult,
-    renderContent: () => TemplateResult | string | number,
+    renderContent: (workflow: Workflow) => TemplateResult | string | number,
   ) {
     return html`
@@ -947,8 +949,10 @@
             >`
           : skeleton,
       )}
-      ${this.renderDetailItem(msg("Crawler Instances"), () =>
-        this.workflow ? this.workflow.scale : skeleton,
+      ${this.renderDetailItem(msg("Browser Windows"), () =>
+        this.workflow && this.appState.settings
+          ? this.workflow.scale * this.appState.settings.numBrowsers
+          : skeleton,
       )}
    `;
  }
@@ -1002,13 +1006,13 @@
            `
          : this.renderInactiveCrawlMessage()}
        ${when(
-          isRunning,
-          () => html`
+          isRunning && this.workflow,
+          (workflow) => html`
@@ -1016,7 +1020,7 @@ export class WorkflowDetail extends LiteElement {
${this.renderExclusions()}
          (this.openDialogName = undefined)}
          @sl-show=${this.showDialog}
@@ -1039,12 +1043,10 @@

        ${when(
-          this.workflow?.lastCrawlId,
-          () => html`
+          this.workflow?.lastCrawlId && this.workflow,
+          (workflow) => html`
+          this.isCrawler && this.workflow,
+          (workflow) => html`
@@ -1254,21 +1254,24 @@
    if (!this.workflow) return;
    const scaleOptions = [];
-    for (let value = 1; value <= this.maxScale; value++) {
-      scaleOptions.push({
-        value,
-        label: `${value}×`,
-      });
+
+    if (this.appState.settings) {
+      for (let value = 1; value <= this.maxScale; value++) {
+        scaleOptions.push({
+          value,
+          label: value * this.appState.settings.numBrowsers,
+        });
+      }
    }
    return html`
-          
+          ${msg(
+            "Change the number of browser windows crawling in parallel. This change will take effect immediately on the currently running crawl and update crawl workflow settings.",
          )}
-        >
+        

+          ${scaleOptions.map(
+            ({ value, label }) => html`
    {
    try {
      const data = await this.apiFetch<{ started: string | null }>(
-        `/orgs/${this.orgId}/crawlconfigs/${this.workflow!.id}/run`,
+        `/orgs/${this.orgId}/crawlconfigs/${this.workflowId}/run`,
        {
          method: "POST",
        },
diff --git a/frontend/src/pages/org/workflows-list.ts b/frontend/src/pages/org/workflows-list.ts
index e2c064ebf..a51473774 100644
--- a/frontend/src/pages/org/workflows-list.ts
+++ b/frontend/src/pages/org/workflows-list.ts
@@ -475,7 +475,7 @@ export class WorkflowsList extends LiteElement {
          )}
        >
-          ${msg("Edit Crawler Instances")}
+          ${msg("Edit Browser Windows")}
diff --git a/frontend/src/utils/app.ts b/frontend/src/utils/app.ts
index d810754d8..f76265632 100644
--- a/frontend/src/utils/app.ts
+++ b/frontend/src/utils/app.ts
@@ -6,6 +6,7 @@ export type AppSettings = {
   defaultBehaviorTimeSeconds: number;
   defaultPageLoadTimeSeconds: number;
   maxPagesPerCrawl: number;
+  numBrowsers: number;
   maxScale: number;
   billingEnabled: boolean;
   signUpUrl: string;
@@ -33,6 +34,7 @@ export async function getAppSettings(): Promise<AppSettings> {
     defaultBehaviorTimeSeconds: 0,
     defaultPageLoadTimeSeconds: 0,
     maxPagesPerCrawl: 0,
+    numBrowsers: 1,
     maxScale: 0,
     billingEnabled: false,
     signUpUrl: "",
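Note that `getAppSettings` falls back to `numBrowsers: 1` while the other numeric defaults are 0. A short sketch of why that choice matters for the new multiplications; `windowsForScale` is a stand-in for the `scale * numBrowsers` expressions used in `config-details.ts` and `workflow-detail.ts` above:

```ts
// Sketch: with a fallback of 0, every derived "browser windows" value
// (scale * numBrowsers) would collapse to 0, so the safe default is 1.
const fallbackSettings = { numBrowsers: 1 };

function windowsForScale(
  scale: number,
  settings: { numBrowsers: number } = fallbackSettings,
): number {
  return scale * settings.numBrowsers;
}

console.log(windowsForScale(2)); // 2 with the fallback, not 0
```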