feat: add failed results to output (#85)
* feat: add failed results to output

* docs: update changelog
Patai5 authored Nov 17, 2024
1 parent dad574a commit 226a7e9
Showing 5 changed files with 47 additions and 28 deletions.
6 changes: 6 additions & 0 deletions code/src/configuration.ts
@@ -4,10 +4,12 @@ import { Cookie, RequestList, log } from 'crawlee';
 import { Page } from 'playwright';
 
 import { getModelConfigByName } from './models/models.js';
+import { LABELS } from './routes/router.js';
 import { Config } from './types/config.js';
 import { Input, PAGE_FORMAT } from './types/input.js';
 import { ModelConfig } from './types/model.js';
 import { OpenAIModelSettings } from './types/models.js';
+import { CrawlRouteUserData } from './types/user-data.js';
 
 // eslint-disable-next-line new-cap
 const ajv = new Ajv.default();
@@ -44,6 +46,10 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
     const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);
 
     const { requests } = await RequestList.open({ sources: startUrls });
+    requests.forEach((request) => {
+        request.userData = { depth: 0, startUrl: request.url } satisfies CrawlRouteUserData;
+        request.label = LABELS.CRAWL;
+    });
 
     const totalMaxItems = Number(process.env.ACTOR_MAX_PAID_DATASET_ITEMS) || Number.POSITIVE_INFINITY;
     const maxPagesPerCrawl = Math.min(input.maxPagesPerCrawl || Number.POSITIVE_INFINITY, totalMaxItems);
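Each start request is now seeded with `depth: 0`, its own URL as `startUrl`, and the `CRAWL` label before the crawler runs. A minimal sketch of how a route handler could read that seeded data (the handler body and log message are illustrative, not necessarily the actor's actual route):

```typescript
import { createPlaywrightRouter } from 'crawlee';

import { LABELS } from './routes/router.js';
import { CrawlRouteUserData } from './types/user-data.js';

export const router = createPlaywrightRouter();

// Illustrative handler: reads the userData seeded in parseConfiguration.
router.addHandler(LABELS.CRAWL, async ({ request, log }) => {
    const { depth = 0, startUrl } = request.userData as CrawlRouteUserData;
    log.info(`Crawling ${request.url} (depth ${depth}, start URL: ${startUrl})`);
});
```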
42 changes: 16 additions & 26 deletions code/src/crawler.ts
@@ -1,17 +1,11 @@
-import {
-    Dataset,
-    NonRetryableError,
-    PlaywrightCrawler,
-    PlaywrightCrawlingContext,
-    createRequestDebugInfo,
-    log,
-} from 'crawlee';
+import { NonRetryableError, PlaywrightCrawler, PlaywrightCrawlingContext, createRequestDebugInfo } from 'crawlee';
 
 import { initialCookiesHook } from './hooks/initial-cookies.js';
 import { LABELS, router } from './routes/router.js';
 import { Config } from './types/config.js';
 import { CrawlerState } from './types/crawler-state.js';
-import { ERROR_TYPE } from './utils.js';
+import { UserData } from './types/user-data.js';
+import { ERROR_TYPE, saveErrorResult } from './utils.js';
 
 export const createCrawler = async (config: Config) => {
     const { maxPagesPerCrawl, proxyConfiguration, requests } = config;
@@ -59,23 +53,19 @@ export const createCrawler = async (config: Config) => {
             },
         ],
 
-        async failedRequestHandler({ request }, error: Error) {
-            if (error.name === ERROR_TYPE.LIMIT_ERROR) {
-                return;
-            }
-            const errorMessage = error.message || 'no error';
-            const url = request.loadedUrl || request.url;
-            log.error(`Request ${url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
-            if (error.name === 'UserFacedError') {
-                await Dataset.pushData({
-                    url,
-                    answer: `ERROR: ${errorMessage}`,
-                });
-                return;
-            }
-            await Dataset.pushData({
-                '#error': true,
-                '#debug': createRequestDebugInfo(request),
+        async failedRequestHandler(context, error: Error) {
+            const { request } = context;
+
+            if (error.name === ERROR_TYPE.LIMIT_ERROR) return;
+
+            const state = await crawler.useState<CrawlerState>();
+            if (state.pagesOpened >= maxPagesPerCrawl) return;
+
+            state.pagesOpened++;
+            await saveErrorResult(context as PlaywrightCrawlingContext<UserData>, {
+                error: 'failed_to_load_page',
+                errorDescription: 'The page failed to load, reaching the maximum number of retries.',
+                debugInfo: createRequestDebugInfo(request),
             });
         },
     });
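With this change, a request that exhausts its retries still takes up one of the `maxPagesPerCrawl` slots and is written to the dataset as an error item, replacing the old `#error`/`#debug` record. A sketch of how a consumer of the default dataset could separate failed pages from scraped ones (field names follow `saveErrorResult` below; the filtering itself is illustrative):

```typescript
import { Dataset } from 'crawlee';

// Illustrative post-run check on the default dataset.
const dataset = await Dataset.open();
const { items } = await dataset.getData();

const failedPages = items.filter((item) => item.error === 'failed_to_load_page');
const scrapedPages = items.filter((item) => item.error === undefined);

console.log(`${scrapedPages.length} pages scraped, ${failedPages.length} failed to load.`);
```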
2 changes: 1 addition & 1 deletion code/src/types/user-data.ts
@@ -4,7 +4,7 @@ export type UserData = {
 
 export type CrawlRouteUserData = UserData & {
     depth?: number;
-    wasOpenedKey: string;
+    wasOpenedKey?: string;
 };
 
 export type GptRequestUserData = {
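`wasOpenedKey` becomes optional because the start requests seeded in `configuration.ts` set only `depth` and `startUrl`; the key is presumably attached later during crawling. A sketch of the assumed shape after this change (the full `UserData` definition is outside the visible diff; `startUrl` is inferred from its use in `saveErrorResult`):

```typescript
// Assumed shape, not the complete definitions from the repository.
export type UserData = {
    startUrl?: string;
};

export type CrawlRouteUserData = UserData & {
    depth?: number;
    // Optional: the initial requests only carry { depth, startUrl },
    // so wasOpenedKey is absent until it is assigned during the crawl.
    wasOpenedKey?: string;
};
```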
20 changes: 19 additions & 1 deletion code/src/utils.ts
@@ -1,6 +1,8 @@
-import { GlobInput } from 'crawlee';
+import { Dictionary, GlobInput, PlaywrightCrawlingContext } from 'crawlee';
 import { minimatch } from 'minimatch';
 
+import { UserData } from './types/user-data.js';
+
 export const doesUrlMatchGlobs = (url: string, globs: GlobInput[]): boolean => {
     return globs.some((glob) => doesUrlMatchGlob(url, glob));
 };
@@ -14,3 +16,19 @@ const doesUrlMatchGlob = (url: string, glob: GlobInput): boolean => {
 export enum ERROR_TYPE {
     LIMIT_ERROR = 'LimitError',
 }
+
+export const saveErrorResult = async (
+    context: PlaywrightCrawlingContext<UserData>,
+    additionalData: { error: string; errorDescription: string; debugInfo: Dictionary },
+) => {
+    const { request, crawler } = context;
+    const { startUrl } = request.userData;
+
+    const errorItem = {
+        url: request.loadedUrl || request.url,
+        startUrl,
+        ...additionalData,
+    };
+
+    await crawler.pushData(errorItem);
+};
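`saveErrorResult` pushes a single dataset item combining the request URL, the originating `startUrl`, and the caller-supplied error fields. A minimal usage sketch (the wrapper name, error code, and failing step are hypothetical; only `saveErrorResult` and `createRequestDebugInfo` come from the diff above):

```typescript
import { PlaywrightCrawlingContext, createRequestDebugInfo } from 'crawlee';

import { UserData } from './types/user-data.js';
import { saveErrorResult } from './utils.js';

// Hypothetical wrapper: record a step failure as an error item instead of rethrowing.
const runStepOrSaveError = async (
    context: PlaywrightCrawlingContext<UserData>,
    step: () => Promise<void>,
) => {
    try {
        await step();
    } catch (error) {
        await saveErrorResult(context, {
            error: 'processing_failed',
            errorDescription: `The page could not be processed: ${(error as Error).message}`,
            debugInfo: createRequestDebugInfo(context.request),
        });
    }
};
```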
5 changes: 5 additions & 0 deletions shared/CHANGELOG.md
@@ -1,5 +1,10 @@
 This changelog tracks updates to both GPT Scraper and Extended GPT Scraper actors.
 
+# 2024-11-17
+*Features*
+- Improved GPT call handling, which should parallelize the calls together with the crawling better.
+- Added error results to output, which will contain the failed website URL to help with debugging and error handling.
+
 # 2024-10-07
 *Fixes*
 - Fixed initial cookies not being set correctly from input.
