Skip to content

Commit

Permalink
fix: shrink HTML with cheerio (#74)
Browse files Browse the repository at this point in the history
* fix: shrink HTML with cheerio

* test(unit): add shrink HTML tests

* docs: update changelog
  • Loading branch information
Patai5 authored Sep 22, 2024
1 parent 5ff9938 commit dee78cb
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 81 deletions.
103 changes: 54 additions & 49 deletions code/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions code/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"ajv-formats": "^2.1.1",
"apify": "^3.1.16",
"apify-client": "^2.9.3",
"cheerio": "^1.0.0",
"crawlee": "^3.8.1",
"gpt-3-encoder": "^1.1.4",
"joplin-turndown-plugin-gfm": "^1.0.12",
Expand Down
44 changes: 13 additions & 31 deletions code/src/processors.ts
Original file line number Diff line number Diff line change
@@ -1,51 +1,33 @@
import { load } from 'cheerio';
import { encode } from 'gpt-3-encoder';
import { Page } from 'playwright';

import { htmlToMarkdownProcessor } from './markdown.js';

const JSON_REGEX = /\{(?:[^{}]|())*\}/;

/**
* Shrinks HTML by removing elements matched by a CSS selector, optionally stripping link URLs,
* and collapsing extra whitespace.
* @param html HTML source to shrink
*/
export const shrinkHtml = async (
html: string,
page: Page,
options: { removeLinkUrls: boolean; removeElementsCssSelector?: string },
) => {
const { removeElementsCssSelector, removeLinkUrls } = options;

const stripped = await page.evaluate(
// eslint-disable-next-line @typescript-eslint/no-shadow
([unstripped, removeSelector, removeLinkUrls]) => {
const doc = new DOMParser().parseFromString(unstripped, 'text/html');
if (removeSelector) {
const elements = doc.querySelectorAll(removeSelector);
for (const element of elements) {
// there have been some cases when the page's own scripts cause errors and running this line
// causes them to reemerge, so wrap it in try/catch
try {
element.remove();
} catch (err) {
/* ignore */
}
}
}
const $ = load(html);

if (removeLinkUrls) {
const linkEls = doc.querySelectorAll('a');
for (const linkEl of linkEls) {
linkEl.removeAttribute('href');
}
}
if (removeElementsCssSelector) {
$(removeElementsCssSelector).map((_, el) => $(el).remove());
}
if (removeLinkUrls) {
$('a').map((_, el) => $(el).removeAttr('href'));
}

return doc.documentElement.outerHTML;
},
[html, removeElementsCssSelector, removeLinkUrls] as const,
);
return stripped.replace(/\s{2,}/g, ' ') // remove extra spaces
.replace(/>\s+</g, '><'); // remove all spaces between tags
const stripped = $.html();
return stripped
.replace(/\s{2,}/g, ' ') // remove extra spaces
.replace(/>\s+</g, '><') // remove all spaces between tags
.replace(/^<!DOCTYPE[^>]*>/i, ''); // remove doctype
};

/**
Expand Down
2 changes: 1 addition & 1 deletion code/src/routes/crawl-route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
originContentHtml = await page.content();
}

const shrunkHtml = await shrinkHtml(originContentHtml, page, { removeLinkUrls, removeElementsCssSelector });
const shrunkHtml = await shrinkHtml(originContentHtml, { removeLinkUrls, removeElementsCssSelector });
const originPageContent = pageFormat === PAGE_FORMAT.MARKDOWN ? htmlToMarkdown(shrunkHtml) : shrunkHtml;

const instructionTokenLength = getNumberOfTextTokens(instructions);
Expand Down
35 changes: 35 additions & 0 deletions code/test/unit/processors.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { describe, expect, test } from 'vitest';

import { shrinkHtml } from '../../src/processors';

/**
 * Unit tests for `shrinkHtml`, covering whitespace collapsing, doctype removal,
 * link URL stripping and removal of elements selected by a CSS selector.
 */
describe('shrinkHtml', () => {
    test('should shrink additional spaces', async () => {
        const input = `<html> <head> <title>Title1 end</title> </head> <body> <p>text 1 </p> </body></html>`;
        const expected = `<html><head><title>Title1 end</title></head><body><p>text 1 </p></body></html>`;

        const actual = await shrinkHtml(input, { removeLinkUrls: false });

        // Whitespace between tags is dropped; whitespace inside text nodes is kept.
        expect(actual).toBe(expected);
    });

    test('should ignore doctype', async () => {
        const input = `<!DOCTYPE html><html><head></head><body><p>Test</p></body></html>`;
        const expected = `<html><head></head><body><p>Test</p></body></html>`;

        const actual = await shrinkHtml(input, { removeLinkUrls: false });

        // The leading doctype declaration must not appear in the output.
        expect(actual).toBe(expected);
    });

    test('should remove link urls', async () => {
        const input = `<html><a href="http://example.com">Link</a><p class="test">Test</p></html>`;
        // The expected output contains <head></head><body> wrappers that the input lacks
        // (presumably added by the HTML parser when it normalizes the document).
        const expected = `<html><head></head><body><a>Link</a><p class="test">Test</p></body></html>`;

        const actual = await shrinkHtml(input, { removeLinkUrls: true });

        // Only the href attribute is stripped; other attributes such as class stay intact.
        expect(actual).toBe(expected);
    });

    test('should remove elements by css selector', async () => {
        const input = `<html><a href="http://example.com">Link</a><p class="not-remove">Test1</p><p class="remove">Test2</p></html>`;
        const expected = `<html><head></head><body><a href="http://example.com">Link</a><p class="not-remove">Test1</p></body></html>`;

        const actual = await shrinkHtml(input, {
            removeLinkUrls: false,
            removeElementsCssSelector: '.remove',
        });

        // Only the element matching `.remove` is dropped; the link keeps its href.
        expect(actual).toBe(expected);
    });
});
4 changes: 4 additions & 0 deletions shared/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
This changelog tracks updates to both GPT Scraper and Extended GPT Scraper actors.

# 2024-09-22
*Fixes*
- Fixed a bug where HTML minimization was failing on some specific websites.

# 2024-08-12
*Features*
- Added support for GPT-4o-mini model. (Extended GPT scraper)
Expand Down

0 comments on commit dee78cb

Please sign in to comment.