diff --git a/INTEGRATION.md b/INTEGRATION.md
index 6ea72ddd..619d404a 100644
--- a/INTEGRATION.md
+++ b/INTEGRATION.md
@@ -197,7 +197,7 @@ Create cypress.config.js
with the following contents, and change yo
return await purpleA11y.pushScanResults(res, metadata, elementsToClick);
},
returnResultsDir() {
- return `results/${purpleA11y.randomToken}_${purpleA11y.scanDetails.urlsCrawled.scanned.length}pages/reports/report.html`;
+ return `results/${purpleA11y.randomToken}_${purpleA11y.scanDetails.urlsCrawled.scanned.length}pages/report.html`;
},
finishPurpleA11yTestCase() {
purpleA11y.testThresholds();
diff --git a/__tests__/mergeAxeResults.test.ts b/__tests__/mergeAxeResults.test.ts
index 0a64ef3a..5f1110a6 100644
--- a/__tests__/mergeAxeResults.test.ts
+++ b/__tests__/mergeAxeResults.test.ts
@@ -56,7 +56,7 @@ beforeEach(() => {
// Reports storagePath, expected report and compiled result files
htmlFilename = 'report';
- expectedHTMLFilename = `${expectedStoragePath}/reports/${htmlFilename}.html`;
+ expectedHTMLFilename = `${expectedStoragePath}/${htmlFilename}.html`;
// Mock the JSON result generated from the issues
dateTimeStamp = getFormattedTime();
diff --git a/gitlab-pipeline-template.yml b/gitlab-pipeline-template.yml
index f8ee8da3..2899d0dc 100644
--- a/gitlab-pipeline-template.yml
+++ b/gitlab-pipeline-template.yml
@@ -42,13 +42,13 @@ a11y-scan:
artifacts:
paths:
# Stores the report CSV, HTML, summary PDF only to save storage space
- - artifacts/reports/report.csv
- - artifacts/reports/report.html
- - artifacts/reports/scanDetails.csv
- - artifacts/reports/summary.pdf
+ - artifacts/report.csv
+ - artifacts/report.html
+ - artifacts/scanDetails.csv
+ - artifacts/summary.pdf
# Include screenshots folder
- # - artifacts/reports/elemScreenshots/
+ # - artifacts/elemScreenshots/
# Stores the reports folder so it can be accessed through Browse
- # - artifacts/reports
+ # - artifacts/
# Uploads the results as zipped file
# - $A11Y_SCAN_ARTIFACT_NAME
diff --git a/src/cli.ts b/src/cli.ts
index a12e19cc..91acf3a1 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -268,7 +268,7 @@ const scanInit = async (argvs: Answers): Promise => {
printMessage([statuses.systemError.message], messageOptions);
process.exit(res.status);
case statuses.invalidUrl.code:
- if (argvs.scanner !== ScannerTypes.SITEMAP) {
+ if (argvs.scanner !== ScannerTypes.SITEMAP && argvs.scanner !== ScannerTypes.LOCALFILE) {
printMessage([statuses.invalidUrl.message], messageOptions);
process.exit(res.status);
}
@@ -277,7 +277,7 @@ const scanInit = async (argvs: Answers): Promise => {
file is a sitemap */
const finalFilePath = getFileSitemap(argvs.url);
if (finalFilePath) {
- argvs.isLocalSitemap = true;
+ argvs.isLocalFileScan = true;
argvs.finalUrl = finalFilePath;
if (process.env.VALIDATE_URL_PH_GUI) {
console.log('Url is valid');
@@ -367,7 +367,7 @@ const optionsAnswer: Answers = {
followRobots: options['followRobots'],
customFlowLabel: options['customFlowLabel'],
viewportWidth: options['viewportWidth'],
- isLocalSitemap: options['isLocalSitemap'],
+ isLocalFileScan: options['isLocalFileScan'],
exportDirectory: options['exportDirectory'],
clonedBrowserDataDir: options['clonedBrowserDataDir'],
specifiedMaxConcurrency: options['specifiedMaxConcurrency'],
diff --git a/src/combine.ts b/src/combine.ts
index e5916eca..5d8d5c11 100644
--- a/src/combine.ts
+++ b/src/combine.ts
@@ -1,6 +1,7 @@
import printMessage from 'print-message';
import crawlSitemap from './crawlers/crawlSitemap.js';
import crawlDomain from './crawlers/crawlDomain.js';
+import crawlLocalFile from './crawlers/crawlLocalFile.js';
import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
import { generateArtifacts } from './mergeAxeResults.js';
import { getHost, createAndUpdateResultsFolders, createDetailsAndLogs } from './utils.js';
@@ -10,6 +11,7 @@ import { consoleLogger, silentLogger } from './logs.js';
import runCustom from './crawlers/runCustom.js';
import { alertMessageOptions } from './constants/cliFunctions.js';
import { Data } from './index.js';
+import { fileURLToPath, pathToFileURL } from 'url';
// Class exports
@@ -42,7 +44,7 @@ const combineRun = async (details:Data, deviceToScan:string) => {
viewportWidth,
playwrightDeviceDetailsObject,
maxRequestsPerCrawl,
- isLocalSitemap,
+ isLocalFileScan,
browser,
userDataDirectory,
strategy,
@@ -60,7 +62,11 @@ const combineRun = async (details:Data, deviceToScan:string) => {
process.env.CRAWLEE_LOG_LEVEL = 'ERROR';
process.env.CRAWLEE_STORAGE_DIR = randomToken;
- const host = type === ScannerTypes.SITEMAP && isLocalSitemap ? '' : getHost(url);
+ const host =
+   (type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE) && isLocalFileScan
+     ? ''
+     : getHost(url);
let blacklistedPatterns:string[] | null = null;
try {
@@ -72,7 +78,10 @@ const combineRun = async (details:Data, deviceToScan:string) => {
}
// remove basic-auth credentials from URL
- let finalUrl = urlWithoutAuth(url);
+ let finalUrl =
+   (type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE) && isLocalFileScan
+     ? new URL(pathToFileURL(url))
+     : urlWithoutAuth(url);
+
+ // Use the string version of finalUrl to reduce logic at submitForm
+ let finalUrlString = finalUrl.toString();
const scanDetails = {
startTime: new Date(),
@@ -80,7 +89,6 @@ const combineRun = async (details:Data, deviceToScan:string) => {
crawlType: type,
requestUrl: finalUrl,
urlsCrawled: new UrlsCrawled(),
-
};
const viewportSettings:ViewportSettingsClass = new ViewportSettingsClass(
@@ -119,6 +127,23 @@ const combineRun = async (details:Data, deviceToScan:string) => {
);
break;
+ case ScannerTypes.LOCALFILE:
+ urlsCrawledObj = await crawlLocalFile(
+ url,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ extraHTTPHeaders,
+ );
+ break;
+
case ScannerTypes.INTELLIGENT:
urlsCrawledObj = await crawlIntelligentSitemap(
url,
@@ -168,6 +193,7 @@ const combineRun = async (details:Data, deviceToScan:string) => {
scanDetails.endTime = new Date();
scanDetails.urlsCrawled = urlsCrawledObj;
await createDetailsAndLogs(randomToken);
+ if (scanDetails.urlsCrawled) {
if (scanDetails.urlsCrawled.scanned.length > 0) {
await createAndUpdateResultsFolders(randomToken);
const pagesNotScanned = [
@@ -192,7 +218,7 @@ const combineRun = async (details:Data, deviceToScan:string) => {
browser,
userDataDirectory,
url, // scannedUrl
- finalUrl.href, //entryUrl
+ new URL(finalUrlString).href, //entryUrl
type,
email,
name,
@@ -202,7 +228,8 @@ const combineRun = async (details:Data, deviceToScan:string) => {
pagesNotScanned.length,
metadata,
);
- } else {
+ }
+  } else {
printMessage([`No pages were scanned.`], alertMessageOptions);
}
};
diff --git a/src/constants/cliFunctions.ts b/src/constants/cliFunctions.ts
index 1b69603d..696c487b 100644
--- a/src/constants/cliFunctions.ts
+++ b/src/constants/cliFunctions.ts
@@ -16,10 +16,10 @@ export const alertMessageOptions = {
export const cliOptions: { [key: string]: Options } = {
c: {
alias: 'scanner',
- describe: 'Type of scan, 1) sitemap, 2) website crawl, 3) custom flow, 4) intelligent',
+ describe: 'Type of scan, 1) sitemap, 2) website crawl, 3) custom flow, 4) intelligent, 5) local file',
requiresArg: true,
coerce: option => {
- const choices = ['sitemap', 'website', 'custom', 'intelligent'];
+ const choices = ['sitemap', 'website', 'custom', 'intelligent', 'localfile'];
if (typeof option === 'number') {
// Will also allow integer choices
if (Number.isInteger(option) && option > 0 && option <= choices.length) {
@@ -34,6 +34,8 @@ export const cliOptions: { [key: string]: Options } = {
return ScannerTypes.WEBSITE;
case 'custom':
return ScannerTypes.CUSTOM;
+ case 'localfile':
+ return ScannerTypes.LOCALFILE;
case 'intelligent':
return ScannerTypes.INTELLIGENT;
default:
diff --git a/src/constants/common.ts b/src/constants/common.ts
index 5b25c894..9c407465 100644
--- a/src/constants/common.ts
+++ b/src/constants/common.ts
@@ -15,7 +15,7 @@ import * as https from 'https';
import os from 'os';
import { minimatch } from 'minimatch';
import { globSync } from 'glob';
-import { LaunchOptions, devices, webkit } from 'playwright';
+import { LaunchOptions, devices, request, webkit } from 'playwright';
import printMessage from 'print-message';
import constants, {
getDefaultChromeDataDir,
@@ -30,6 +30,7 @@ import { silentLogger } from '../logs.js';
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
import { randomThreeDigitNumberString } from '../utils.js';
import { Answers, Data } from '#root/index.js';
+import { fileURLToPath, pathToFileURL } from 'url';
// validateDirPath validates a provided directory path
// returns null if no error
@@ -228,8 +229,8 @@ export const getFileSitemap = (filePath: string): string | null => {
}
const file = fs.readFileSync(filePath, 'utf8');
- const isLocalSitemap = isSitemapContent(file);
- return isLocalSitemap ? filePath : null;
+ const isLocalFileScan = isSitemapContent(file);
+ return isLocalFileScan || (file != undefined) ? filePath : null;
};
export const getUrlMessage = (scanner: ScannerTypes): string => {
@@ -239,7 +240,8 @@ export const getUrlMessage = (scanner: ScannerTypes): string => {
return 'Please enter URL of website: ';
case ScannerTypes.SITEMAP:
return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: ';
-
+ case ScannerTypes.LOCALFILE:
+ return 'Please enter file path: ';
default:
return 'Invalid option';
}
@@ -525,7 +527,10 @@ export const checkUrl = async (
}
}
- if (res.status === constants.urlCheckStatuses.success.code && scanner === ScannerTypes.SITEMAP) {
+ if (
+   res.status === constants.urlCheckStatuses.success.code &&
+   (scanner === ScannerTypes.SITEMAP || scanner === ScannerTypes.LOCALFILE)
+ ) {
const isSitemap = isSitemapContent(res.content);
if (!isSitemap) {
@@ -551,7 +556,7 @@ export const prepareData = async (argv: Answers): Promise => {
playwrightDeviceDetailsObject,
maxpages,
strategy,
- isLocalSitemap,
+ isLocalFileScan,
finalUrl,
browserToRun,
nameEmail,
@@ -568,7 +573,7 @@ export const prepareData = async (argv: Answers): Promise => {
// construct filename for scan results
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
- const domain = argv.isLocalSitemap ? 'custom' : new URL(argv.url).hostname;
+ const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname;
const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
let resultFilename: string;
const randomThreeDigitNumber = randomThreeDigitNumberString();
@@ -594,7 +599,7 @@ export const prepareData = async (argv: Answers): Promise => {
playwrightDeviceDetailsObject,
maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
strategy,
- isLocalSitemap,
+ isLocalFileScan,
browser: browserToRun,
nameEmail,
customFlowLabel,
@@ -740,6 +745,7 @@ export const getLinksFromSitemap = async (
username: string,
password: string,
) => {
+ const scannedSitemaps = new Set();
const urls = {}; // dictionary of requests to urls to be scanned
const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
@@ -753,7 +759,14 @@ export const getLinksFromSitemap = async (
? (url = addBasicAuthCredentials(url, username, password))
: url;
- const request = new Request({ url: url });
+ url = convertPathToLocalFile(url);
+
+ let request;
+ try {
+ request = new Request({ url: url });
+ } catch (e) {
+ console.log('Error creating request', e);
+ }
if (isUrlPdf(url)) {
request.skipNavigation = true;
}
@@ -837,17 +850,41 @@ export const getLinksFromSitemap = async (
let sitemapType;
let isBasicAuth = false;
- const parsedUrl = new URL(url);
let username = '';
let password = '';
- if (parsedUrl.username !== '' && parsedUrl.password !== '') {
- isBasicAuth = true;
- username = decodeURIComponent(parsedUrl.username);
- password = decodeURIComponent(parsedUrl.password);
- parsedUrl.username = '';
- parsedUrl.password = '';
- }
+ let parsedUrl;
+
+ if (scannedSitemaps.has(url)) {
+ // Skip processing if the sitemap has already been scanned
+ return;
+ }
+
+ scannedSitemaps.add(url);
+
+ // Convert a file:// URL into a plain local file path
+ url = convertLocalFileToPath(url);
+
+ // Check whether it's a file path or a URL
+ if (isFilePath(url)) {
+ if (!fs.existsSync(url)) {
+ return;
+ }
+ parsedUrl = url;
+ } else if (isValidHttpUrl(url)) {
+ parsedUrl = new URL(url);
+
+ if (parsedUrl.username !== '' && parsedUrl.password !== '') {
+ isBasicAuth = true;
+ username = decodeURIComponent(parsedUrl.username);
+ password = decodeURIComponent(parsedUrl.password);
+ parsedUrl.username = '';
+ parsedUrl.password = '';
+ }
+ } else {
+   printMessage([`Invalid URL or file path: ${url}`], messageOptions);
+ return;
+ }
const getDataUsingPlaywright = async () => {
const browserContext = await constants.launcher.launchPersistentContext(
@@ -859,9 +896,7 @@ export const getLinksFromSitemap = async (
);
const page = await browserContext.newPage();
-
await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
-
if (constants.launcher === webkit) {
data = await page.locator('body').innerText();
} else {
@@ -904,7 +939,11 @@ export const getLinksFromSitemap = async (
password: password,
},
});
+ try {
data = await (await instance.get(url, { timeout: 80000 })).data;
+ } catch (error) {
+   return; // skip this sitemap if it cannot be fetched
+ }
} catch (error) {
if (error.code === 'ECONNABORTED') {
await getDataUsingPlaywright();
@@ -912,6 +951,7 @@ export const getLinksFromSitemap = async (
}
}
} else {
+ url = convertLocalFileToPath(url);
data = fs.readFileSync(url, 'utf8');
}
const $ = cheerio.load(data, { xml: true });
@@ -944,11 +984,15 @@ export const getLinksFromSitemap = async (
case constants.xmlSitemapTypes.xmlIndex:
silentLogger.info(`This is a XML format sitemap index.`);
for (const childSitemapUrl of $('loc')) {
+ const childSitemapUrlText = $(childSitemapUrl).text();
if (isLimitReached()) {
break;
}
-
- await fetchUrls($(childSitemapUrl).text());
+ if (childSitemapUrlText.endsWith('.xml')) {
+ await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
+ } else {
+ addToUrlList(childSitemapUrlText); // Add regular URLs to the list
+ }
}
break;
case constants.xmlSitemapTypes.xml:
@@ -1735,3 +1779,26 @@ export const waitForPageLoaded = async (page, timeout = 10000) => {
new Promise((resolve) => setTimeout(resolve, timeout))
]);
}
+
+function isValidHttpUrl(urlString: string): boolean {
+ const pattern = /^(http|https):\/\/[^ "]+$/;
+ return pattern.test(urlString);
+}
+
+export const isFilePath = (url: string): boolean => {
+ return url.startsWith('file://') || url.startsWith('/');
+};
+
+export function convertLocalFileToPath(url: string): string {
+ if (url.startsWith('file://')) {
+ url = fileURLToPath(url);
+ }
+ return url;
+}
+
+export function convertPathToLocalFile(filePath: string): string {
+ if (filePath.startsWith('/')) {
+ filePath = pathToFileURL(filePath).toString();
+ }
+ return filePath;
+}
\ No newline at end of file
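
The four helpers added above lean on Node's built-in `url` module. A standalone sketch of the intended round-trip behaviour, with illustrative values that are not part of the patch:

```ts
import { fileURLToPath, pathToFileURL } from 'url';

// Round-trip between a POSIX path and a file:// URL, mirroring
// convertPathToLocalFile / convertLocalFileToPath above.
const asFileUrl = pathToFileURL('/tmp/pages/index.html').toString();
// -> 'file:///tmp/pages/index.html'

const backToPath = fileURLToPath(asFileUrl);
// -> '/tmp/pages/index.html'

// isFilePath accepts either form as local input.
const isLocal = asFileUrl.startsWith('file://') || backToPath.startsWith('/');
// -> true
```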
diff --git a/src/constants/constants.ts b/src/constants/constants.ts
index 36671090..940026c2 100644
--- a/src/constants/constants.ts
+++ b/src/constants/constants.ts
@@ -34,7 +34,7 @@ export const blackListedFileExtensions = [
export const getIntermediateScreenshotsPath = (datasetsPath: string): string =>
`${datasetsPath}/screenshots`;
export const destinationPath = (storagePath: string): string =>
- `${storagePath}/reports/screenshots`;
+ `${storagePath}/screenshots`;
/** Get the path to Default Profile in the Chrome Data Directory
* as per https://chromium.googlesource.com/chromium/src/+/master/docs/user_data_dir.md
@@ -210,6 +210,7 @@ export enum ScannerTypes {
WEBSITE = 'Website',
CUSTOM = 'Custom',
INTELLIGENT = 'Intelligent',
+ LOCALFILE = 'LocalFile',
}
export const guiInfoStatusTypes = {
diff --git a/src/constants/questions.ts b/src/constants/questions.ts
index 108301fa..2d971e9b 100644
--- a/src/constants/questions.ts
+++ b/src/constants/questions.ts
@@ -29,6 +29,7 @@ const startScanQuestions = [
{ name: 'Website', value: ScannerTypes.WEBSITE },
{ name: 'Custom', value: ScannerTypes.CUSTOM },
{ name: 'Intelligent', value: ScannerTypes.INTELLIGENT },
+ { name: 'Local file', value: ScannerTypes.LOCALFILE },
],
},
{
@@ -104,7 +105,7 @@ const startScanQuestions = [
case statuses.systemError.code:
return statuses.systemError.message;
case statuses.invalidUrl.code:
- if (answers.scanner !== ScannerTypes.SITEMAP) {
+ if (answers.scanner !== ScannerTypes.SITEMAP && answers.scanner !== ScannerTypes.LOCALFILE) {
return statuses.invalidUrl.message;
}
@@ -113,7 +114,7 @@ const startScanQuestions = [
file is a sitemap */
const finalFilePath = getFileSitemap(url);
if (finalFilePath) {
- answers.isLocalSitemap = true;
+ answers.isLocalFileScan = true;
answers.finalUrl = finalFilePath;
return true;
} else {
diff --git a/src/crawlers/commonCrawlerFunc.ts b/src/crawlers/commonCrawlerFunc.ts
index f8f63f3c..1d57d916 100644
--- a/src/crawlers/commonCrawlerFunc.ts
+++ b/src/crawlers/commonCrawlerFunc.ts
@@ -5,6 +5,7 @@ import axe, { resultGroups } from 'axe-core';
import { axeScript, guiInfoStatusTypes, saflyIconSelector } from '../constants/constants.js';
import { guiInfoLog } from '../logs.js';
import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
+import { isFilePath } from '../constants/common.js';
// types
type RuleDetails = {
@@ -221,8 +222,12 @@ export const failedRequestHandler = async ({ request }) => {
};
export const isUrlPdf = url => {
- const parsedUrl = new URL(url);
- return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href);
+ if (isFilePath(url)) {
+ return /\.pdf$/i.test(url);
+ } else {
+ const parsedUrl = new URL(url);
+ return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href);
+ }
};
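
With the file-path branch added, `isUrlPdf` no longer throws on plain paths or `file://` URLs. A quick sketch of expected outcomes for a few representative inputs (import path illustrative only):

```ts
import { isUrlPdf } from './src/crawlers/commonCrawlerFunc.js';

console.log(isUrlPdf('/downloads/report.pdf'));               // true  — file-path branch, extension check
console.log(isUrlPdf('file:///downloads/report.pdf'));        // true  — file:// URLs take the same branch
console.log(isUrlPdf('https://example.com/report.pdf?v=1'));  // true  — URL branch, pathname check
console.log(isUrlPdf('https://example.com/page.html'));       // false
```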
diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index 40a46dde..b3954f11 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -318,7 +318,7 @@ const crawlDomain = async (
launchContext: {
launcher: constants.launcher,
launchOptions: getPlaywrightLaunchOptions(browser),
- // Bug in Chrome which causes brwoser pool crash when userDataDirectory is set in non-headless mode
+ // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
userDataDir: userDataDirectory
? process.env.CRAWLEE_HEADLESS !== '0'
? userDataDirectory
@@ -356,7 +356,7 @@ const crawlDomain = async (
preNavigationHooks(extraHTTPHeaders);
},
],
- requestHandlerTimeoutSecs: 90, // Alow each page to be processed by up from default 60 seconds
+ requestHandlerTimeoutSecs: 90, // Allow each page up to 90 seconds to be processed (default is 60)
requestHandler: async ({
page,
request,
diff --git a/src/crawlers/crawlIntelligentSitemap.ts b/src/crawlers/crawlIntelligentSitemap.ts
index 78204ecb..fe8f484f 100644
--- a/src/crawlers/crawlIntelligentSitemap.ts
+++ b/src/crawlers/crawlIntelligentSitemap.ts
@@ -128,6 +128,7 @@ import {chromium} from 'playwright';
url,
dataset, //for crawlSitemap to add on to
urlsCrawled, //for crawlSitemap to add on to
+ false,
)
if (urlsCrawled.scanned.length < maxRequestsPerCrawl){
diff --git a/src/crawlers/crawlLocalFile.ts b/src/crawlers/crawlLocalFile.ts
new file mode 100644
index 00000000..6478732e
--- /dev/null
+++ b/src/crawlers/crawlLocalFile.ts
@@ -0,0 +1,201 @@
+import crawlee, { Request, RequestList } from 'crawlee';
+import printMessage from 'print-message';
+import {
+ createCrawleeSubFolders,
+ preNavigationHooks,
+ runAxeScript,
+ failedRequestHandler,
+ isUrlPdf,
+} from './commonCrawlerFunc.js';
+
+import constants, { guiInfoStatusTypes, basicAuthRegex } from '../constants/constants.js';
+import {
+ getLinksFromSitemap,
+ getPlaywrightLaunchOptions,
+ messageOptions,
+ isSkippedUrl,
+ isFilePath,
+ convertLocalFileToPath,
+ convertPathToLocalFile,
+} from '../constants/common.js';
+import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
+import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
+import fs from 'fs';
+import { guiInfoLog } from '../logs.js';
+import playwright from 'playwright';
+import path from 'path';
+import crawlSitemap from './crawlSitemap.js';
+
+const crawlLocalFile = async (
+ sitemapUrl: string,
+ randomToken: string,
+ host: string,
+ viewportSettings: any,
+ maxRequestsPerCrawl: number,
+ browser: string,
+ userDataDirectory: string,
+ specifiedMaxConcurrency: number,
+ fileTypes: string,
+ blacklistedPatterns: string[],
+ includeScreenshots: boolean,
+ extraHTTPHeaders: any,
+ fromCrawlIntelligentSitemap: boolean = false, //optional
+ userUrlInputFromIntelligent: any = null, //optional
+ datasetFromIntelligent: any = null, //optional
+ urlsCrawledFromIntelligent: any = null, //optional
+) => {
+ let dataset: any;
+ let urlsCrawled: any;
+ let linksFromSitemap = [];
+
+ // Boolean to omit axe scan for basic auth URL
+ let isBasicAuth: boolean;
+ let basicAuthPage: number = 0;
+ let finalLinks: Request[] = [];
+
+ if (fromCrawlIntelligentSitemap) {
+ dataset = datasetFromIntelligent;
+ urlsCrawled = urlsCrawledFromIntelligent;
+ } else {
+ ({ dataset } = await createCrawleeSubFolders(randomToken));
+ urlsCrawled = { ...constants.urlsCrawledObj };
+
+ if (!fs.existsSync(randomToken)) {
+ fs.mkdirSync(randomToken);
+ }
+ }
+
+ // Check if the sitemapUrl is a local file and if it exists
+ if (!(isFilePath(sitemapUrl)) || !fs.existsSync(sitemapUrl)) {
+ return;
+ }
+
+ // Convert a file:// URL into a plain local path before deciding how to handle it
+ sitemapUrl = convertLocalFileToPath(sitemapUrl);
+
+ // Non-XML files are scanned as a single local page
+ if (!sitemapUrl.match(/\.xml$/i)) {
+ linksFromSitemap = [new Request({ url: sitemapUrl })];
+
+ // XML files are treated as sitemaps and delegated to crawlSitemap below
+ } else {
+ const username = '';
+ const password = '';
+
+ // Hand XML sitemap files over to crawlSitemap
+ const updatedUrlsCrawled = await crawlSitemap(
+ sitemapUrl,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ extraHTTPHeaders,
+ false, // fromCrawlIntelligentSitemap
+ null, // userUrlInputFromIntelligent
+ null, // datasetFromIntelligent
+ null, // urlsCrawledFromIntelligent
+ true, // crawledFromLocalFile
+ );
+
+ urlsCrawled = { ...urlsCrawled, ...updatedUrlsCrawled };
+ return urlsCrawled;
+ }
+
+ try {
+ sitemapUrl = encodeURI(sitemapUrl);
+ } catch (e) {
+ console.log(e);
+ }
+
+ if (basicAuthRegex.test(sitemapUrl)) {
+ isBasicAuth = true;
+ // request to basic auth URL to authenticate for browser session
+ finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
+ const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
+ // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
+ finalLinks.push(new Request({ url: finalUrl }));
+ basicAuthPage = -2;
+ }
+
+ let uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
+ const isScanHtml: boolean = ['all', 'html-only'].includes(fileTypes);
+
+ printMessage(['Fetching URLs. This might take some time...'], { border: false });
+
+ finalLinks = [...finalLinks, ...linksFromSitemap];
+
+ const requestList = await RequestList.open({
+ sources: finalLinks,
+ });
+
+ printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
+
+ const request = linksFromSitemap[0];
+ const pdfFileName = path.basename(request.url);
+ const trimmedUrl: string = request.url;
+ const destinationFilePath: string = `${randomToken}/${pdfFileName}`;
+ const data: Buffer = fs.readFileSync(trimmedUrl);
+ fs.writeFileSync(destinationFilePath, data);
+ uuidToPdfMapping[pdfFileName] = trimmedUrl;
+
+ if (!isUrlPdf(request.url)) {
+ let browserUsed;
+ // Playwright only supports chromium, firefox and webkit; 'chrome' falls back to chromium below
+ if (browser === 'chromium') {
+ browserUsed = await playwright.chromium.launch();
+ } else if (browser === 'firefox') {
+ browserUsed = await playwright.firefox.launch();
+ } else if (browser === 'webkit') {
+ browserUsed = await playwright.webkit.launch();
+ } else if (browser === 'chrome') {
+ browserUsed = await playwright.chromium.launch(); //chrome not supported, default to chromium
+ } else {
+ console.log('Browser not supported, please use chrome, chromium, firefox, webkit');
+ console.log(' ');
+ return;
+ }
+ const context = await browserUsed.newContext();
+ const page = await context.newPage();
+ request.url = convertPathToLocalFile(request.url);
+ await page.goto(request.url);
+ const results = await runAxeScript(includeScreenshots, page, randomToken, null);
+
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+
+ urlsCrawled.scanned.push({
+ url: request.url,
+ pageTitle: results.pageTitle,
+ actualUrl: request.loadedUrl, // i.e. actualUrl
+ });
+
+ urlsCrawled.scannedRedirects.push({
+ fromUrl: request.url,
+ toUrl: request.loadedUrl, // i.e. actualUrl
+ });
+
+ results.url = request.url;
+ // results.actualUrl = request.loadedUrl;
+
+ await dataset.pushData(results);
+ } else {
+ urlsCrawled.scanned.push({ url: trimmedUrl, pageTitle: pdfFileName });
+
+ await runPdfScan(randomToken);
+ // transform result format
+ const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
+
+ // push results for each pdf document to key value store
+ await Promise.all(pdfResults.map(result => dataset.pushData(result)));
+ }
+ return urlsCrawled;
+};
+export default crawlLocalFile;
\ No newline at end of file
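
For a single non-XML, non-PDF file, the essence of the flow above is: convert the path to a `file://` URL, open it in a Playwright browser, and run the axe scan against the loaded page. A minimal standalone sketch of that navigation step, using plain Playwright rather than the project's `runAxeScript` wrapper, with a hypothetical input path:

```ts
import { chromium } from 'playwright';
import { pathToFileURL } from 'url';

// Open a local HTML file in headless Chromium via a file:// URL,
// the same navigation crawlLocalFile performs before scanning.
const openLocalFile = async (filePath: string): Promise<string> => {
  const browser = await chromium.launch();
  const page = await browser.newPage();
  await page.goto(pathToFileURL(filePath).toString());
  const title = await page.title();
  await browser.close();
  return title;
};

// Usage (hypothetical path):
// openLocalFile('/tmp/pages/index.html').then(console.log);
```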
diff --git a/src/crawlers/crawlSitemap.ts b/src/crawlers/crawlSitemap.ts
index aa110847..485c51b6 100644
--- a/src/crawlers/crawlSitemap.ts
+++ b/src/crawlers/crawlSitemap.ts
@@ -1,4 +1,4 @@
-import crawlee, { Request,RequestList } from 'crawlee';
+import crawlee, { Request, RequestList } from 'crawlee';
import printMessage from 'print-message';
import {
createCrawleeSubFolders,
@@ -16,6 +16,7 @@ import {
isSkippedUrl,
urlWithoutAuth,
waitForPageLoaded,
+ isFilePath,
} from '../constants/common.js';
import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
@@ -27,7 +28,7 @@ const crawlSitemap = async (
randomToken,
host,
viewportSettings,
- maxRequestsPerCrawl,
+ maxRequestsPerCrawl,
browser,
userDataDirectory,
specifiedMaxConcurrency,
@@ -39,71 +40,86 @@ const crawlSitemap = async (
userUrlInputFromIntelligent = null, //optional
datasetFromIntelligent = null, //optional
urlsCrawledFromIntelligent = null, //optional
-
+ crawledFromLocalFile = false, //optional
) => {
let dataset;
let urlsCrawled;
- let linksFromSitemap
+ let linksFromSitemap;
-
// Boolean to omit axe scan for basic auth URL
let isBasicAuth;
let basicAuthPage = 0;
- let finalLinks = [];
- let authHeader = "";
-
- if (fromCrawlIntelligentSitemap){
- dataset=datasetFromIntelligent;
+ let finalLinks = [];
+ let authHeader = '';
+
+ if (fromCrawlIntelligentSitemap) {
+ dataset = datasetFromIntelligent;
urlsCrawled = urlsCrawledFromIntelligent;
-
} else {
({ dataset } = await createCrawleeSubFolders(randomToken));
urlsCrawled = { ...constants.urlsCrawledObj };
-
+
if (!fs.existsSync(randomToken)) {
fs.mkdirSync(randomToken);
}
}
- const parsedUrl = new URL(sitemapUrl);
- let username = ""
- let password = "";
- if (parsedUrl.username !=="" && parsedUrl.password !=="") {
- isBasicAuth = true;
- username = decodeURIComponent(parsedUrl.username);
- password = decodeURIComponent(parsedUrl.password);
+ let parsedUrl;
+ let username = '';
+ let password = '';
- // Create auth header
- authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
+ if (!crawledFromLocalFile && isFilePath(sitemapUrl)) {
+ console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
+ return;
+ }
+
+ if (isFilePath(sitemapUrl)) {
+ parsedUrl = sitemapUrl;
+ } else {
+ parsedUrl = new URL(sitemapUrl);
+ if (parsedUrl.username !== '' && parsedUrl.password !== '') {
+ isBasicAuth = true;
+ username = decodeURIComponent(parsedUrl.username);
+ password = decodeURIComponent(parsedUrl.password);
- parsedUrl.username = "";
- parsedUrl.password = "";
+ // Create auth header
+ authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
+ parsedUrl.username = '';
+ parsedUrl.password = '';
+ }
}
- linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, username, password)
-
+ linksFromSitemap = await getLinksFromSitemap(
+ sitemapUrl,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ userUrlInputFromIntelligent,
+ fromCrawlIntelligentSitemap,
+ username,
+ password,
+ );
/**
* Regex to match http://username:password@hostname.com
* utilised in scan strategy to ensure subsequent URLs within the same domain are scanned.
* First time scan with original `url` containing credentials is strictly to authenticate for browser session
* subsequent URLs are without credentials.
* basicAuthPage is set to -1 for basic auth URL to ensure it is not counted towards maxRequestsPerCrawl
- */
+ */
+
+ sitemapUrl = encodeURI(sitemapUrl);
- sitemapUrl = encodeURI(sitemapUrl)
-
if (isBasicAuth) {
// request to basic auth URL to authenticate for browser session
finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
-
+
// obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
finalLinks.push(new Request({ url: finalUrl }));
basicAuthPage = -2;
- }
-
-
+ }
+
let pdfDownloads = [];
let uuidToPdfMapping = {};
const isScanHtml = ['all', 'html-only'].includes(fileTypes);
@@ -111,10 +127,7 @@ const crawlSitemap = async (
const { playwrightDeviceDetailsObject } = viewportSettings;
const { maxConcurrency } = constants;
-
-
printMessage(['Fetching URLs. This might take some time...'], { border: false });
-
finalLinks = [...finalLinks, ...linksFromSitemap];
@@ -127,8 +140,12 @@ const crawlSitemap = async (
launchContext: {
launcher: constants.launcher,
launchOptions: getPlaywrightLaunchOptions(browser),
- // Bug in Chrome which causes brwoser pool crash when userDataDirectory is set in non-headless mode
- userDataDir: userDataDirectory ? (process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '') : '',
+ // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
+ userDataDir: userDataDirectory
+ ? process.env.CRAWLEE_HEADLESS !== '0'
+ ? userDataDirectory
+ : ''
+ : '',
},
retryOnBlocked: true,
browserPoolOptions: {
@@ -147,36 +164,34 @@ const crawlSitemap = async (
requestList,
preNavigationHooks: isBasicAuth
? [
- async ({ page, request }) => {
- await page.setExtraHTTPHeaders({
- Authorization: authHeader,
- ...extraHTTPHeaders,
- });
- },
- ]
+ async ({ page, request }) => {
+ await page.setExtraHTTPHeaders({
+ Authorization: authHeader,
+ ...extraHTTPHeaders,
+ });
+ },
+ ]
: [
- async ({ page, request }) => {
- preNavigationHooks(extraHTTPHeaders)
- //insert other code here
- },
- ],
+ async ({ page, request }) => {
+ preNavigationHooks(extraHTTPHeaders);
+ //insert other code here
+ },
+ ],
requestHandlerTimeoutSecs: 90,
requestHandler: async ({ page, request, response, sendRequest }) => {
-
await waitForPageLoaded(page, 10000);
// Set basic auth header if needed
if (isBasicAuth) {
await page.setExtraHTTPHeaders({
- 'Authorization': authHeader
+ Authorization: authHeader,
});
const currentUrl = new URL(request.url);
currentUrl.username = username;
currentUrl.password = password;
request.url = currentUrl.href;
}
-
-
+
const actualUrl = request.loadedUrl || request.url;
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
@@ -241,13 +256,13 @@ const crawlSitemap = async (
numScanned: urlsCrawled.scanned.length,
urlScanned: request.url,
});
-
+
const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
if (isRedirected) {
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
item => (item.actualUrl || item.url) === request.loadedUrl,
);
-
+
if (isLoadedUrlInCrawledUrls) {
urlsCrawled.notScannedRedirects.push({
fromUrl: request.url,
@@ -255,22 +270,25 @@ const crawlSitemap = async (
});
return;
}
-
+
urlsCrawled.scanned.push({
url: urlWithoutAuth(request.url),
pageTitle: results.pageTitle,
actualUrl: request.loadedUrl, // i.e. actualUrl
});
-
+
urlsCrawled.scannedRedirects.push({
fromUrl: urlWithoutAuth(request.url),
toUrl: request.loadedUrl, // i.e. actualUrl
});
-
+
results.url = request.url;
results.actualUrl = request.loadedUrl;
} else {
- urlsCrawled.scanned.push({ url: urlWithoutAuth(request.url), pageTitle: results.pageTitle });
+ urlsCrawled.scanned.push({
+ url: urlWithoutAuth(request.url),
+ pageTitle: results.pageTitle,
+ });
}
await dataset.pushData(results);
} else {
@@ -278,22 +296,23 @@ const crawlSitemap = async (
numScanned: urlsCrawled.scanned.length,
urlScanned: request.url,
});
-
+
isScanHtml && urlsCrawled.invalid.push(actualUrl);
}
}
},
failedRequestHandler: async ({ request }) => {
-
- if (isBasicAuth){
- request.url ? request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}` : null;
+ if (isBasicAuth) {
+ request.url
+ ? (request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`)
+ : null;
}
// check if scanned pages have reached limit due to multi-instances of handler running
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
return;
}
-
+
guiInfoLog(guiInfoStatusTypes.ERROR, {
numScanned: urlsCrawled.scanned.length,
urlScanned: request.url,
@@ -309,9 +328,6 @@ const crawlSitemap = async (
await requestList.isFinished();
-
-
-
if (pdfDownloads.length > 0) {
// wait for pdf downloads to complete
await Promise.all(pdfDownloads);
@@ -333,13 +349,11 @@ const crawlSitemap = async (
await Promise.all(pdfResults.map(result => dataset.pushData(result)));
}
-
- if (!fromCrawlIntelligentSitemap){
+ if (!fromCrawlIntelligentSitemap) {
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
}
return urlsCrawled;
-
};
export default crawlSitemap;
diff --git a/src/crawlers/pdfScanFunc.ts b/src/crawlers/pdfScanFunc.ts
index 3f48aef5..100c0604 100644
--- a/src/crawlers/pdfScanFunc.ts
+++ b/src/crawlers/pdfScanFunc.ts
@@ -8,6 +8,7 @@ import { createRequire } from 'module';
import os from 'os';
import path from 'path';
import { getPageFromContext } from '../screenshotFunc/pdfScreenshotFunc.js';
+import { isFilePath, convertLocalFileToPath } from '../constants/common.js';
const require = createRequire(import.meta.url);
@@ -144,10 +145,18 @@ export const handlePdfDownload = (randomToken, pdfDownloads, request, sendReques
pdfDownloads.push(
new Promise(async resolve => {
- const pdfResponse = await sendRequest({ responseType: 'buffer', isStream: true });
- pdfResponse.setEncoding('binary');
-
- const bufs = []; // to check for pdf validity
+ let bufs = [];
+ let pdfResponse;
+
+ if (isFilePath(url)) {
+ // Read the PDF from the local file system (handles both file:// URLs and plain paths)
+ const filePath = convertLocalFileToPath(url);
+ pdfResponse = fs.createReadStream(filePath, { encoding: 'binary' });
+ } else {
+ // Send HTTP/HTTPS request
+ pdfResponse = await sendRequest({ responseType: 'buffer', isStream: true });
+ pdfResponse.setEncoding('binary');
+ }
const downloadFile = fs.createWriteStream(`${randomToken}/${pdfFileName}.pdf`, {
flags: 'a',
});
@@ -216,17 +225,24 @@ export const mapPdfScanResults = async (randomToken, uuidToUrlMapping) => {
const intermediateFolder = randomToken;
const intermediateResultPath = `${intermediateFolder}/${constants.pdfScanResultFileName}`;
- const rawdata = fs.readFileSync(intermediateResultPath);
- const output = JSON.parse(rawdata.toString());
+ const rawdata = fs.readFileSync(intermediateResultPath, 'utf-8');
+
+ let parsedJsonData;
+ try {
+ parsedJsonData = JSON.parse(rawdata);
+ } catch (err) {
+ consoleLogger.log(err);
+ }
const errorMeta = require('../constants/errorMeta.json');
const resultsList = [];
+ if (parsedJsonData) {
// jobs: files that are scanned
const {
report: { jobs },
- } = output;
+ } = parsedJsonData;
// loop through all jobs
for (let jobIdx = 0; jobIdx < jobs.length; jobIdx++) {
@@ -277,6 +293,7 @@ export const mapPdfScanResults = async (randomToken, uuidToUrlMapping) => {
resultsList.push(translated);
}
+}
return resultsList;
};
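
On the local-file branch of `handlePdfDownload`, the path handed to `fs.createReadStream` comes from a `file://` URL. A small illustration of the difference between `URL.pathname` and `fileURLToPath` when turning such a URL into a filesystem path, assuming a POSIX path with a space in the name:

```ts
import { fileURLToPath } from 'url';

const url = 'file:///tmp/annual%20report.pdf'; // hypothetical file URL

console.log(new URL(url).pathname); // '/tmp/annual%20report.pdf' — still percent-encoded
console.log(fileURLToPath(url));    // '/tmp/annual report.pdf'   — decoded, usable with fs
```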
diff --git a/src/index.ts b/src/index.ts
index d47d76b5..22d1f228 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -39,7 +39,7 @@ export type Answers = {
metadata: string;
maxpages: number;
strategy: string;
- isLocalSitemap: boolean;
+ isLocalFileScan: boolean;
finalUrl: string;
customFlowLabel: string;
specifiedMaxConcurrency: number;
@@ -63,7 +63,7 @@ export type Data = {
playwrightDeviceDetailsObject: Object;
maxRequestsPerCrawl: number;
strategy: string;
- isLocalSitemap: boolean;
+ isLocalFileScan: boolean;
browser: string;
nameEmail: string;
customFlowLabel: string;
diff --git a/src/mergeAxeResults.ts b/src/mergeAxeResults.ts
index bbeb10b0..d9a8ec36 100644
--- a/src/mergeAxeResults.ts
+++ b/src/mergeAxeResults.ts
@@ -106,7 +106,7 @@ const parseContentToJson = async rPath =>
const writeCsv = async (allIssues, storagePath) => {
- const csvOutput = createWriteStream(`${storagePath}/reports/report.csv`, { encoding: 'utf8' });
+ const csvOutput = createWriteStream(`${storagePath}/report.csv`, { encoding: 'utf8' });
const formatPageViolation = pageNum => {
if (pageNum < 0) return 'Document';
return `Page ${pageNum}`;
@@ -201,7 +201,7 @@ const writeHTML = async (allIssues, storagePath, htmlFilename = 'report') => {
filename: path.join(__dirname, './static/ejs/report.ejs'),
});
const html = template(allIssues);
- fs.writeFileSync(`${storagePath}/reports/${htmlFilename}.html`, html);
+ fs.writeFileSync(`${storagePath}/${htmlFilename}.html`, html);
};
const writeSummaryHTML = async (allIssues, storagePath, htmlFilename = 'summary') => {
@@ -210,7 +210,7 @@ const writeSummaryHTML = async (allIssues, storagePath, htmlFilename = 'summary'
filename: path.join(__dirname, './static/ejs/summary.ejs'),
});
const html = template(allIssues);
- fs.writeFileSync(`${storagePath}/reports/${htmlFilename}.html`, html);
+ fs.writeFileSync(`${storagePath}/${htmlFilename}.html`, html);
};
// Proper base64 encoding function using Buffer
@@ -230,7 +230,7 @@ const writeBase64 = async (allIssues, storagePath, htmlFilename = 'report.html')
const encodedScanItems = base64Encode(items);
const encodedScanData = base64Encode(rest);
- const filePath = path.join(storagePath, 'reports', 'scanDetails.csv');
+ const filePath = path.join(storagePath, 'scanDetails.csv');
const directoryPath = path.dirname(filePath);
if (!fs.existsSync(directoryPath)) {
@@ -239,7 +239,7 @@ const writeBase64 = async (allIssues, storagePath, htmlFilename = 'report.html')
await fs.promises.writeFile(filePath, `scanData_base64,scanItems_base64\n${encodedScanData},${encodedScanItems}`);
- const htmlFilePath = path.join(storagePath, 'reports', htmlFilename);
+ const htmlFilePath = path.join(storagePath, htmlFilename);
let htmlContent = fs.readFileSync(htmlFilePath, 'utf8');
const allIssuesJson = JSON.stringify(allIssues);
@@ -282,8 +282,8 @@ if (os.platform() === 'linux') {
}
const writeSummaryPdf = async (storagePath, filename = 'summary') => {
- const htmlFilePath = `${storagePath}/reports/${filename}.html`;
- const fileDestinationPath = `${storagePath}/reports/${filename}.pdf`;
+ const htmlFilePath = `${storagePath}/${filename}.html`;
+ const fileDestinationPath = `${storagePath}/${filename}.pdf`;
const browser = await chromium.launch({
headless: true,
channel: browserChannel,
@@ -468,7 +468,7 @@ const createRuleIdJson = allIssues => {
const moveElemScreenshots = (randomToken, storagePath) => {
const currentScreenshotsPath = `${randomToken}/elemScreenshots`;
- const resultsScreenshotsPath = `${storagePath}/reports/elemScreenshots`;
+ const resultsScreenshotsPath = `${storagePath}/elemScreenshots`;
if (fs.existsSync(currentScreenshotsPath)) {
fs.moveSync(currentScreenshotsPath, resultsScreenshotsPath);
}
@@ -490,7 +490,7 @@ export const generateArtifacts = async (
const storagePath = getStoragePath(randomToken);
- urlScanned = urlWithoutAuth(urlScanned);
+ urlScanned =
+   scanType === ScannerTypes.SITEMAP || scanType === ScannerTypes.LOCALFILE
+     ? urlScanned
+     : urlWithoutAuth(urlScanned);
const formatAboutStartTime = dateString => {
const utcStartTimeDate = new Date(dateString);
diff --git a/src/screenshotFunc/htmlScreenshotFunc.ts b/src/screenshotFunc/htmlScreenshotFunc.ts
index e8a20667..2812722c 100644
--- a/src/screenshotFunc/htmlScreenshotFunc.ts
+++ b/src/screenshotFunc/htmlScreenshotFunc.ts
@@ -156,7 +156,7 @@ const saveImageBufferToFile = (buffer, fileName) => {
// export const takeScreenshotForHTMLElements = async (screenshotData, storagePath, browserToRun) => {
-// const screenshotDir = `${storagePath}/reports/screenshots`;
+// const screenshotDir = `${storagePath}/screenshots`;
// let screenshotItems = [];
// let randomToken = `cloned-${Date.now()}`;
// const clonedDir = getClonedProfilesWithRandomToken(browserToRun, randomToken);
diff --git a/src/screenshotFunc/pdfScreenshotFunc.ts b/src/screenshotFunc/pdfScreenshotFunc.ts
index 3eaaf4d1..d6a32795 100644
--- a/src/screenshotFunc/pdfScreenshotFunc.ts
+++ b/src/screenshotFunc/pdfScreenshotFunc.ts
@@ -337,6 +337,7 @@ export const getSelectedPageByLocation = bboxLocation => {
};
export const getPageFromContext = async (context, pdfFilePath) => {
+ try {
const loadingTask = pdfjs.getDocument({
url: pdfFilePath,
// canvasFactory,
@@ -348,6 +349,9 @@ export const getPageFromContext = async (context, pdfFilePath) => {
const structureTree = await pdf._pdfInfo.structureTree;
const page = getBboxPage({ location: context }, structureTree);
return page;
+} catch (error) {
+  // Ignore PDFs that cannot be parsed; getPageFromContext returns undefined in that case
+}
};
export const getBboxPages = (bboxes, structure) => {
diff --git a/src/utils.ts b/src/utils.ts
index 0a59d29f..1cc079e5 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -122,7 +122,7 @@ export const writeToUserDataTxt = async (key, value) => {
export const createAndUpdateResultsFolders = async randomToken => {
const storagePath = getStoragePath(randomToken);
- await fs.ensureDir(`${storagePath}/reports`);
+ await fs.ensureDir(storagePath);
const intermediatePdfResultsPath = `${randomToken}/${constants.pdfScanResultFileName}`;