feat: add failed results to output (#85)
* feat: add failed results to output

* docs: update changelog
Patai5 authored Nov 17, 2024
1 parent dad574a commit 226a7e9
Showing 5 changed files with 47 additions and 28 deletions.
6 changes: 6 additions & 0 deletions code/src/configuration.ts
@@ -4,10 +4,12 @@ import { Cookie, RequestList, log } from 'crawlee';
 import { Page } from 'playwright';
 
 import { getModelConfigByName } from './models/models.js';
+import { LABELS } from './routes/router.js';
 import { Config } from './types/config.js';
 import { Input, PAGE_FORMAT } from './types/input.js';
 import { ModelConfig } from './types/model.js';
 import { OpenAIModelSettings } from './types/models.js';
+import { CrawlRouteUserData } from './types/user-data.js';
 
 // eslint-disable-next-line new-cap
 const ajv = new Ajv.default();
@@ -44,6 +46,10 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
     const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);
 
     const { requests } = await RequestList.open({ sources: startUrls });
+    requests.forEach((request) => {
+        request.userData = { depth: 0, startUrl: request.url } satisfies CrawlRouteUserData;
+        request.label = LABELS.CRAWL;
+    });
 
     const totalMaxItems = Number(process.env.ACTOR_MAX_PAID_DATASET_ITEMS) || Number.POSITIVE_INFINITY;
     const maxPagesPerCrawl = Math.min(input.maxPagesPerCrawl || Number.POSITIVE_INFINITY, totalMaxItems);
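Each start request is now seeded with `depth: 0`, its own URL as `startUrl`, and the `CRAWL` label before the crawler runs. A minimal sketch of how a route handler could read that seeded data (the handler body and log message are illustrative, not necessarily the actor's actual route):

```typescript
import { createPlaywrightRouter } from 'crawlee';

import { LABELS } from './routes/router.js';
import { CrawlRouteUserData } from './types/user-data.js';

export const router = createPlaywrightRouter();

// Illustrative handler: reads the userData seeded in parseConfiguration.
router.addHandler(LABELS.CRAWL, async ({ request, log }) => {
    const { depth = 0, startUrl } = request.userData as CrawlRouteUserData;
    log.info(`Crawling ${request.url} (depth ${depth}, start URL: ${startUrl})`);
});
```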
42 changes: 16 additions & 26 deletions code/src/crawler.ts
@@ -1,17 +1,11 @@
-import {
-    Dataset,
-    NonRetryableError,
-    PlaywrightCrawler,
-    PlaywrightCrawlingContext,
-    createRequestDebugInfo,
-    log,
-} from 'crawlee';
+import { NonRetryableError, PlaywrightCrawler, PlaywrightCrawlingContext, createRequestDebugInfo } from 'crawlee';
 
 import { initialCookiesHook } from './hooks/initial-cookies.js';
 import { LABELS, router } from './routes/router.js';
 import { Config } from './types/config.js';
 import { CrawlerState } from './types/crawler-state.js';
-import { ERROR_TYPE } from './utils.js';
+import { UserData } from './types/user-data.js';
+import { ERROR_TYPE, saveErrorResult } from './utils.js';
 
 export const createCrawler = async (config: Config) => {
     const { maxPagesPerCrawl, proxyConfiguration, requests } = config;
@@ -59,23 +53,19 @@ export const createCrawler = async (config: Config) => {
             },
         ],
 
-        async failedRequestHandler({ request }, error: Error) {
-            if (error.name === ERROR_TYPE.LIMIT_ERROR) {
-                return;
-            }
-            const errorMessage = error.message || 'no error';
-            const url = request.loadedUrl || request.url;
-            log.error(`Request ${url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
-            if (error.name === 'UserFacedError') {
-                await Dataset.pushData({
-                    url,
-                    answer: `ERROR: ${errorMessage}`,
-                });
-                return;
-            }
-            await Dataset.pushData({
-                '#error': true,
-                '#debug': createRequestDebugInfo(request),
+        async failedRequestHandler(context, error: Error) {
+            const { request } = context;
+
+            if (error.name === ERROR_TYPE.LIMIT_ERROR) return;
+
+            const state = await crawler.useState<CrawlerState>();
+            if (state.pagesOpened >= maxPagesPerCrawl) return;
+
+            state.pagesOpened++;
+            await saveErrorResult(context as PlaywrightCrawlingContext<UserData>, {
+                error: 'failed_to_load_page',
+                errorDescription: 'The page failed to load, reaching the maximum number of retries.',
+                debugInfo: createRequestDebugInfo(request),
             });
         },
     });
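With this change, a request that exhausts its retries still takes up one of the `maxPagesPerCrawl` slots and is written to the dataset as an error item, replacing the old `#error`/`#debug` record. A sketch of how a consumer of the default dataset could separate failed pages from scraped ones (field names follow `saveErrorResult` below; the filtering itself is illustrative):

```typescript
import { Dataset } from 'crawlee';

// Illustrative post-run check on the default dataset.
const dataset = await Dataset.open();
const { items } = await dataset.getData();

const failedPages = items.filter((item) => item.error === 'failed_to_load_page');
const scrapedPages = items.filter((item) => item.error === undefined);

console.log(`${scrapedPages.length} pages scraped, ${failedPages.length} failed to load.`);
```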
2 changes: 1 addition & 1 deletion code/src/types/user-data.ts
@@ -4,7 +4,7 @@ export type UserData = {
 
 export type CrawlRouteUserData = UserData & {
     depth?: number;
-    wasOpenedKey: string;
+    wasOpenedKey?: string;
 };
 
 export type GptRequestUserData = {
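`wasOpenedKey` becomes optional because the start requests seeded in `configuration.ts` set only `depth` and `startUrl`; the key is presumably attached later during crawling. A sketch of the assumed shape after this change (the full `UserData` definition is outside the visible diff; `startUrl` is inferred from its use in `saveErrorResult`):

```typescript
// Assumed shape, not the complete definitions from the repository.
export type UserData = {
    startUrl?: string;
};

export type CrawlRouteUserData = UserData & {
    depth?: number;
    // Optional: the initial requests only carry { depth, startUrl },
    // so wasOpenedKey is absent until it is assigned during the crawl.
    wasOpenedKey?: string;
};
```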
20 changes: 19 additions & 1 deletion code/src/utils.ts
@@ -1,6 +1,8 @@
-import { GlobInput } from 'crawlee';
+import { Dictionary, GlobInput, PlaywrightCrawlingContext } from 'crawlee';
 import { minimatch } from 'minimatch';
 
+import { UserData } from './types/user-data.js';
+
 export const doesUrlMatchGlobs = (url: string, globs: GlobInput[]): boolean => {
     return globs.some((glob) => doesUrlMatchGlob(url, glob));
 };
@@ -14,3 +16,19 @@ const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => {
 export enum ERROR_TYPE {
     LIMIT_ERROR = 'LimitError',
 }
+
+export const saveErrorResult = async (
+    context: PlaywrightCrawlingContext<UserData>,
+    additionalData: { error: string; errorDescription: string; debugInfo: Dictionary },
+) => {
+    const { request, crawler } = context;
+    const { startUrl } = request.userData;
+
+    const errorItem = {
+        url: request.loadedUrl || request.url,
+        startUrl,
+        ...additionalData,
+    };
+
+    await crawler.pushData(errorItem);
+};
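`saveErrorResult` pushes a single dataset item combining the request URL, the originating `startUrl`, and the caller-supplied error fields. A minimal usage sketch (the wrapper name, error code, and failing step are hypothetical; only `saveErrorResult` and `createRequestDebugInfo` come from the diff above):

```typescript
import { PlaywrightCrawlingContext, createRequestDebugInfo } from 'crawlee';

import { UserData } from './types/user-data.js';
import { saveErrorResult } from './utils.js';

// Hypothetical wrapper: record a step failure as an error item instead of rethrowing.
const runStepOrSaveError = async (
    context: PlaywrightCrawlingContext<UserData>,
    step: () => Promise<void>,
) => {
    try {
        await step();
    } catch (error) {
        await saveErrorResult(context, {
            error: 'processing_failed',
            errorDescription: `The page could not be processed: ${(error as Error).message}`,
            debugInfo: createRequestDebugInfo(context.request),
        });
    }
};
```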
5 changes: 5 additions & 0 deletions shared/CHANGELOG.md
@@ -1,5 +1,10 @@
 This changelog tracks updates to both GPT Scraper and Extended GPT Scraper actors.
 
+# 2024-11-17
+*Features*
+- Improved GPT call handling, which should parallelize the calls together with the crawling better.
+- Added error results to output, which will contain the failed website URL to help with debugging and error handling.
+
 # 2024-10-07
 *Fixes*
 - Fixed initial cookies not being set correctly from input.
