Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fix on sitemap adds localFileScan, move report directory up one level #364

Merged
merged 15 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ const scanInit = async (argvs: Answers): Promise<string> => {
printMessage([statuses.systemError.message], messageOptions);
process.exit(res.status);
case statuses.invalidUrl.code:
if (argvs.scanner !== ScannerTypes.SITEMAP) {
if (argvs.scanner !== ScannerTypes.SITEMAP && argvs.scanner !== ScannerTypes.LOCALFILE) {
printMessage([statuses.invalidUrl.message], messageOptions);
process.exit(res.status);
}
Expand All @@ -277,7 +277,7 @@ const scanInit = async (argvs: Answers): Promise<string> => {
file is a sitemap */
const finalFilePath = getFileSitemap(argvs.url);
if (finalFilePath) {
argvs.isLocalSitemap = true;
argvs.isLocalFileScan = true;
argvs.finalUrl = finalFilePath;
if (process.env.VALIDATE_URL_PH_GUI) {
console.log('Url is valid');
Expand Down Expand Up @@ -367,7 +367,7 @@ const optionsAnswer: Answers = {
followRobots: options['followRobots'],
customFlowLabel: options['customFlowLabel'],
viewportWidth: options['viewportWidth'],
isLocalSitemap: options['isLocalSitemap'],
isLocalFileScan: options['isLocalFileScan'],
exportDirectory: options['exportDirectory'],
clonedBrowserDataDir: options['clonedBrowserDataDir'],
specifiedMaxConcurrency: options['specifiedMaxConcurrency'],
Expand Down
39 changes: 33 additions & 6 deletions src/combine.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import printMessage from 'print-message';
import crawlSitemap from './crawlers/crawlSitemap.js';
import crawlDomain from './crawlers/crawlDomain.js';
import crawlLocalFile from './crawlers/crawlLocalFile.js';
import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
import { generateArtifacts } from './mergeAxeResults.js';
import { getHost, createAndUpdateResultsFolders, createDetailsAndLogs } from './utils.js';
Expand All @@ -10,6 +11,7 @@ import { consoleLogger, silentLogger } from './logs.js';
import runCustom from './crawlers/runCustom.js';
import { alertMessageOptions } from './constants/cliFunctions.js';
import { Data } from './index.js';
import { fileURLToPath, pathToFileURL } from 'url';


// Class exports
Expand Down Expand Up @@ -42,7 +44,7 @@ const combineRun = async (details:Data, deviceToScan:string) => {
viewportWidth,
playwrightDeviceDetailsObject,
maxRequestsPerCrawl,
isLocalSitemap,
isLocalFileScan,
browser,
userDataDirectory,
strategy,
Expand All @@ -60,7 +62,11 @@ const combineRun = async (details:Data, deviceToScan:string) => {
process.env.CRAWLEE_LOG_LEVEL = 'ERROR';
process.env.CRAWLEE_STORAGE_DIR = randomToken;

const host = type === ScannerTypes.SITEMAP && isLocalSitemap ? '' : getHost(url);
const host =
(type === ScannerTypes.SITEMAP && isLocalFileScan) ||
(type === ScannerTypes.LOCALFILE && isLocalFileScan)
? ''
: getHost(url);

let blacklistedPatterns:string[] | null = null;
try {
Expand All @@ -72,15 +78,17 @@ const combineRun = async (details:Data, deviceToScan:string) => {
}

// remove basic-auth credentials from URL
let finalUrl = urlWithoutAuth(url);
let finalUrl = (!(type === ScannerTypes.SITEMAP && isLocalFileScan || type === ScannerTypes.LOCALFILE && isLocalFileScan)) ? urlWithoutAuth(url) : new URL(pathToFileURL(url));

//Use the string version of finalUrl to reduce logic at submitForm
let finalUrlString = finalUrl.toString();

const scanDetails = {
startTime: new Date(),
endTime: new Date(),
crawlType: type,
requestUrl: finalUrl,
urlsCrawled: new UrlsCrawled(),

};

const viewportSettings:ViewportSettingsClass = new ViewportSettingsClass(
Expand Down Expand Up @@ -119,6 +127,23 @@ const combineRun = async (details:Data, deviceToScan:string) => {
);
break;

case ScannerTypes.LOCALFILE:
urlsCrawledObj = await crawlLocalFile(
url,
randomToken,
host,
viewportSettings,
maxRequestsPerCrawl,
browser,
userDataDirectory,
specifiedMaxConcurrency,
fileTypes,
blacklistedPatterns,
includeScreenshots,
extraHTTPHeaders,
);
break;

case ScannerTypes.INTELLIGENT:
urlsCrawledObj = await crawlIntelligentSitemap(
url,
Expand Down Expand Up @@ -168,6 +193,7 @@ const combineRun = async (details:Data, deviceToScan:string) => {
scanDetails.endTime = new Date();
scanDetails.urlsCrawled = urlsCrawledObj;
await createDetailsAndLogs(randomToken);
if (scanDetails.urlsCrawled) {
if (scanDetails.urlsCrawled.scanned.length > 0) {
await createAndUpdateResultsFolders(randomToken);
const pagesNotScanned = [
Expand All @@ -192,7 +218,7 @@ const combineRun = async (details:Data, deviceToScan:string) => {
browser,
userDataDirectory,
url, // scannedUrl
finalUrl.href, //entryUrl
new URL(finalUrlString).href, //entryUrl
type,
email,
name,
Expand All @@ -202,7 +228,8 @@ const combineRun = async (details:Data, deviceToScan:string) => {
pagesNotScanned.length,
metadata,
);
} else {
}
}else {
printMessage([`No pages were scanned.`], alertMessageOptions);
}
};
Expand Down
6 changes: 4 additions & 2 deletions src/constants/cliFunctions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ export const alertMessageOptions = {
export const cliOptions: { [key: string]: Options } = {
c: {
alias: 'scanner',
describe: 'Type of scan, 1) sitemap, 2) website crawl, 3) custom flow, 4) intelligent',
describe: 'Type of scan, 1) sitemap, 2) website crawl, 3) custom flow, 4) intelligent 5) local file',
requiresArg: true,
coerce: option => {
const choices = ['sitemap', 'website', 'custom', 'intelligent'];
const choices = ['sitemap', 'website', 'custom', 'intelligent', 'localfile'];
if (typeof option === 'number') {
// Will also allow integer choices
if (Number.isInteger(option) && option > 0 && option <= choices.length) {
Expand All @@ -34,6 +34,8 @@ export const cliOptions: { [key: string]: Options } = {
return ScannerTypes.WEBSITE;
case 'custom':
return ScannerTypes.CUSTOM;
case 'localfile':
return ScannerTypes.LOCALFILE;
case 'intelligent':
return ScannerTypes.INTELLIGENT;
default:
Expand Down
109 changes: 88 additions & 21 deletions src/constants/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import * as https from 'https';
import os from 'os';
import { minimatch } from 'minimatch';
import { globSync } from 'glob';
import { LaunchOptions, devices, webkit } from 'playwright';
import { LaunchOptions, devices, request, webkit } from 'playwright';
import printMessage from 'print-message';
import constants, {
getDefaultChromeDataDir,
Expand All @@ -30,6 +30,7 @@ import { silentLogger } from '../logs.js';
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
import { randomThreeDigitNumberString } from '../utils.js';
import { Answers, Data } from '#root/index.js';
import { fileURLToPath, pathToFileURL } from 'url';

// validateDirPath validates a provided directory path
// returns null if no error
Expand Down Expand Up @@ -228,8 +229,8 @@ export const getFileSitemap = (filePath: string): string | null => {
}

const file = fs.readFileSync(filePath, 'utf8');
const isLocalSitemap = isSitemapContent(file);
return isLocalSitemap ? filePath : null;
const isLocalFileScan = isSitemapContent(file);
return isLocalFileScan || (file != undefined) ? filePath : null;
};

export const getUrlMessage = (scanner: ScannerTypes): string => {
Expand All @@ -239,7 +240,8 @@ export const getUrlMessage = (scanner: ScannerTypes): string => {
return 'Please enter URL of website: ';
case ScannerTypes.SITEMAP:
return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: ';

case ScannerTypes.LOCALFILE:
return 'Please enter file path: ';
default:
return 'Invalid option';
}
Expand Down Expand Up @@ -525,7 +527,10 @@ export const checkUrl = async (
}
}

if (res.status === constants.urlCheckStatuses.success.code && scanner === ScannerTypes.SITEMAP) {
if (
(res.status === constants.urlCheckStatuses.success.code && scanner === ScannerTypes.SITEMAP) ||
(res.status === constants.urlCheckStatuses.success.code && scanner === ScannerTypes.LOCALFILE)
) {
const isSitemap = isSitemapContent(res.content);

if (!isSitemap) {
Expand All @@ -551,7 +556,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
playwrightDeviceDetailsObject,
maxpages,
strategy,
isLocalSitemap,
isLocalFileScan,
finalUrl,
browserToRun,
nameEmail,
Expand All @@ -568,7 +573,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {

// construct filename for scan results
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
const domain = argv.isLocalSitemap ? 'custom' : new URL(argv.url).hostname;
const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname;
const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
let resultFilename: string;
const randomThreeDigitNumber = randomThreeDigitNumberString();
Expand All @@ -594,7 +599,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
playwrightDeviceDetailsObject,
maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
strategy,
isLocalSitemap,
isLocalFileScan,
browser: browserToRun,
nameEmail,
customFlowLabel,
Expand Down Expand Up @@ -740,6 +745,7 @@ export const getLinksFromSitemap = async (
username: string,
password: string,
) => {
const scannedSitemaps = new Set<string>();
const urls = {}; // dictionary of requests to urls to be scanned

const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
Expand All @@ -753,7 +759,14 @@ export const getLinksFromSitemap = async (
? (url = addBasicAuthCredentials(url, username, password))
: url;

const request = new Request({ url: url });
url = convertPathToLocalFile(url);

let request;
try {
request = new Request({ url: url });
} catch (e) {
console.log('Error creating request', e);
}
if (isUrlPdf(url)) {
request.skipNavigation = true;
}
Expand Down Expand Up @@ -837,17 +850,41 @@ export const getLinksFromSitemap = async (
let sitemapType;
let isBasicAuth = false;

const parsedUrl = new URL(url);
let username = '';
let password = '';

if (parsedUrl.username !== '' && parsedUrl.password !== '') {
isBasicAuth = true;
username = decodeURIComponent(parsedUrl.username);
password = decodeURIComponent(parsedUrl.password);
parsedUrl.username = '';
parsedUrl.password = '';
}
let parsedUrl;

if (scannedSitemaps.has(url)) {
// Skip processing if the sitemap has already been scanned
return;
}

scannedSitemaps.add(url);

// Convert file if its not local file path
url = convertLocalFileToPath(url)

// Check whether its a file path or a URL
if (isFilePath(url)) {
if (!fs.existsSync(url)) {
return;
}
parsedUrl = url;
} else if(isValidHttpUrl(url)){
parsedUrl = new URL(url);

if (parsedUrl.username !== '' && parsedUrl.password !== '') {
isBasicAuth = true;
username = decodeURIComponent(parsedUrl.username);
password = decodeURIComponent(parsedUrl.password);
parsedUrl.username = '';
parsedUrl.password = '';
}
} else{
printMessage([`Invalid Url/Filepath: ${url}`], messageOptions);
return;
}

const getDataUsingPlaywright = async () => {
const browserContext = await constants.launcher.launchPersistentContext(
Expand All @@ -859,9 +896,7 @@ export const getLinksFromSitemap = async (
);

const page = await browserContext.newPage();

await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

if (constants.launcher === webkit) {
data = await page.locator('body').innerText();
} else {
Expand Down Expand Up @@ -904,14 +939,19 @@ export const getLinksFromSitemap = async (
password: password,
},
});
try{
data = await (await instance.get(url, { timeout: 80000 })).data;
} catch(error){
return; //to skip the error
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
await getDataUsingPlaywright();
}
}
}
} else {
url = convertLocalFileToPath(url);
data = fs.readFileSync(url, 'utf8');
}
const $ = cheerio.load(data, { xml: true });
Expand Down Expand Up @@ -944,11 +984,15 @@ export const getLinksFromSitemap = async (
case constants.xmlSitemapTypes.xmlIndex:
silentLogger.info(`This is a XML format sitemap index.`);
for (const childSitemapUrl of $('loc')) {
const childSitemapUrlText = $(childSitemapUrl).text();
if (isLimitReached()) {
break;
}

await fetchUrls($(childSitemapUrl).text());
if (childSitemapUrlText.endsWith('.xml')) {
await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
} else {
addToUrlList(childSitemapUrlText); // Add regular URLs to the list
}
}
break;
case constants.xmlSitemapTypes.xml:
Expand Down Expand Up @@ -1735,3 +1779,26 @@ export const waitForPageLoaded = async (page, timeout = 10000) => {
new Promise((resolve) => setTimeout(resolve, timeout))
]);
}

/**
 * Checks whether a string looks like an absolute http/https URL.
 *
 * Uses a lightweight regex (scheme + a non-empty remainder containing no
 * spaces or double quotes) rather than full WHATWG URL parsing, so it
 * intentionally rejects URLs with embedded spaces.
 *
 * @param urlString - candidate URL text
 * @returns true when the string starts with http:// or https:// and has a valid remainder
 */
function isValidHttpUrl(urlString: string): boolean {
  // Original declared the parameter without a type (implicit `any` under
  // strict mode); annotating it keeps the same runtime behavior.
  const pattern = /^(http|https):\/\/[^ "]+$/;
  return pattern.test(urlString);
}

/**
 * Determines whether the given string refers to a local file rather than a
 * remote URL: a file:// URL, an absolute POSIX path, or a Windows
 * drive-letter path.
 *
 * @param url - candidate URL or filesystem path
 * @returns true when the string should be treated as a local file reference
 */
export const isFilePath = (url: string): boolean => {
  // The original matched only 'file://' and a leading '/', which
  // misclassifies Windows paths such as 'C:\sitemap.xml' as remote URLs;
  // the drive-letter test below is a backward-compatible generalization.
  return (
    url.startsWith('file://') ||
    url.startsWith('/') ||
    /^[A-Za-z]:[\\/]/.test(url)
  );
};

/**
 * Normalises a file:// URL to its plain filesystem path; any other string is
 * returned unchanged.
 *
 * @param url - a file:// URL or an arbitrary string
 * @returns the decoded filesystem path, or the input untouched
 */
export function convertLocalFileToPath(url: string): string {
  const isFileUrl = url.startsWith('file://');
  return isFileUrl ? fileURLToPath(url) : url;
}

/**
 * Converts an absolute POSIX path into its file:// URL string form; anything
 * else (an existing URL, a relative path, an empty string) passes through
 * untouched.
 *
 * @param filePath - candidate filesystem path or URL
 * @returns a file:// URL string for absolute paths, otherwise the input as-is
 */
export function convertPathToLocalFile(filePath: string): string {
  if (!filePath.startsWith('/')) {
    return filePath;
  }
  return pathToFileURL(filePath).toString();
}
1 change: 1 addition & 0 deletions src/constants/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ export enum ScannerTypes {
WEBSITE = 'Website',
CUSTOM = 'Custom',
INTELLIGENT = 'Intelligent',
LOCALFILE = 'LocalFile',
}

export const guiInfoStatusTypes = {
Expand Down
Loading