diff --git a/deno.json b/deno.json deleted file mode 100644 index 3092e4a3..00000000 --- a/deno.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "imports": { - "cross-fetch": "./src/deno/cross-fetch.js", - "linkedom": "https://deno.land/x/deno_dom@v0.1.45/deno-dom-wasm.ts", - "@mozilla/readability": "https://esm.sh/@mozilla/readability@0.5.0", - "sanitize-html": "https://esm.sh/sanitize-html@2.13.0", - "bellajs": "https://esm.sh/bellajs@11.1.3" - } -} diff --git a/examples/browser-article-parser/package.json b/examples/browser-article-parser/package.json index 972e49c5..fd76eb47 100644 --- a/examples/browser-article-parser/package.json +++ b/examples/browser-article-parser/package.json @@ -6,7 +6,7 @@ "start": "node server" }, "dependencies": { - "express": "^4.18.2", - "got": "^14.2.0" + "express": "latest", + "got": "latest" } } diff --git a/examples/bun-article-parser/package.json b/examples/bun-article-parser/package.json index 98c4e46a..74b6ec35 100644 --- a/examples/bun-article-parser/package.json +++ b/examples/bun-article-parser/package.json @@ -5,10 +5,10 @@ "start": "bun run index.ts" }, "devDependencies": { - "bun-types": "^1.0.26" + "bun-types": "latest" }, "dependencies": { "@extractus/article-extractor": "latest", - "hono": "^4.0.1" + "hono": "latest" } } diff --git a/examples/deno-article-parser/deno.json b/examples/deno-article-parser/deno.json index b8a77bea..2243c409 100644 --- a/examples/deno-article-parser/deno.json +++ b/examples/deno-article-parser/deno.json @@ -3,7 +3,7 @@ "version": "1.0.0", "imports": { "serve": "https://deno.land/std/http/server.ts", - "hono": "https://deno.land/x/hono@v3.11.2/mod.ts", + "hono": "https://deno.land/x/hono/mod.ts", "article-extractor": "https://esm.sh/@extractus/article-extractor" }, "tasks": { diff --git a/examples/node-article-parser/package.json b/examples/node-article-parser/package.json index 309e3518..ff22efb0 100644 --- a/examples/node-article-parser/package.json +++ b/examples/node-article-parser/package.json @@ -8,6 +8,6 @@ }, "dependencies": { "@extractus/article-extractor": "latest", - "express": "^4.18.2" + "express": "latest" } } diff --git a/examples/pupperteer/README.md b/examples/pupperteer/README.md new file mode 100644 index 00000000..86aad2e4 --- /dev/null +++ b/examples/pupperteer/README.md @@ -0,0 +1,19 @@ +# node-article-parser with Pupperteer + +Install dependencies: + +```bash +npm i + +# or pnpm, yarn +``` + +Start server: + +```bash +npm start +``` + +Open `http://localhost:3100/?url=https://client-side-rendering.pages.dev/lorem-ipsum` to see the result. + +--- diff --git a/examples/pupperteer/index.js b/examples/pupperteer/index.js new file mode 100644 index 00000000..5fa9ffdd --- /dev/null +++ b/examples/pupperteer/index.js @@ -0,0 +1,64 @@ +import puppeteer from 'puppeteer' +import express from 'express' +import { extractFromHtml } from '@extractus/article-extractor' + +const app = express() + +const meta = { + service: 'article-parser-pupperteer', + lang: 'javascript', + server: 'express', + platform: 'node', +} + +const loadHtml = async (url) => { + let browser = null + try { + console.log('Initialize puppeteer engine') + browser = await puppeteer.launch() + const page = await browser.newPage() + await page.setDefaultNavigationTimeout(6e4) + console.log(`Start rendering target page "${url}"`) + await page.goto(url, { + waitUntil: 'networkidle0', + }) + console.log(`Load html content from target page ${url}`) + const html = await page.content() + return html + } catch (err) { + console.error(err) + return null + } finally { + if (browser) { + await browser.close() + } + } +} + +app.get('/', async (req, res) => { + const url = req.query.url + if (!url) { + return res.json(meta) + } + try { + const html = await loadHtml(url) + const data = await extractFromHtml(html, url) + return res.json({ + error: 0, + message: 'article has been extracted successfully', + data, + meta, + }) + } catch (err) { + return res.json({ + error: 1, + message: err.message, + data: null, + meta, + }) + } +}) + +app.listen(3100, () => { + console.log('Server is running at http://localhost:3100') +}) diff --git a/examples/pupperteer/package.json b/examples/pupperteer/package.json new file mode 100644 index 00000000..23df8454 --- /dev/null +++ b/examples/pupperteer/package.json @@ -0,0 +1,14 @@ +{ + "name": "node-pupperteer", + "version": "1.0.0", + "main": "index.js", + "type": "module", + "scripts": { + "start": "node index.js" + }, + "dependencies": { + "@extractus/article-extractor": "latest", + "express": "latest", + "puppeteer": "latest" + } +} diff --git a/examples/tsnode-article-parser/package.json b/examples/tsnode-article-parser/package.json index a5eab79b..ca586db0 100644 --- a/examples/tsnode-article-parser/package.json +++ b/examples/tsnode-article-parser/package.json @@ -8,10 +8,10 @@ "start": "node dist/index.js" }, "devDependencies": { - "typescript": "^5.3.3" + "typescript": "latest" }, "dependencies": { "@extractus/article-extractor": "latest", - "express": "^4.18.2" + "express": "latest" } } diff --git a/package.json b/package.json index defbd965..b75783c0 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "version": "8.0.8", + "version": "8.0.9", "name": "@extractus/article-extractor", "description": "To extract main article from given URL", "homepage": "https://github.com/extractus/article-extractor", @@ -11,7 +11,8 @@ "main": "./src/main.js", "type": "module", "imports": { - "cross-fetch": "./src/deno/cross-fetch.js" + "cross-fetch": "./src/deno/cross-fetch.js", + "linkedom": "https://deno.land/x/deno_dom@v0.1.45/deno-dom-wasm.ts" }, "browser": { "cross-fetch": "./src/deno/cross-fetch.js", @@ -19,7 +20,7 @@ }, "types": "./index.d.ts", "engines": { - "node": ">= 16" + "node": ">= 18" }, "scripts": { "lint": "eslint .", @@ -38,8 +39,8 @@ }, "devDependencies": { "@types/sanitize-html": "^2.11.0", - "eslint": "^9.1.1", - "globals": "^15.0.0", + "eslint": "^9.2.0", + "globals": "^15.1.0", "https-proxy-agent": "^7.0.4", "jest": "^29.7.0", "nock": "^13.5.4" diff --git a/src/utils/parseFromHtml.js b/src/utils/parseFromHtml.js index 71e13c74..2375e742 100644 --- a/src/utils/parseFromHtml.js +++ b/src/utils/parseFromHtml.js @@ -30,8 +30,8 @@ const summarize = (desc, txt, threshold, maxlen) => { // eslint-disable-line } export default async (inputHtml, inputUrl = '', parserOptions = {}) => { - const html = purify(inputHtml) - const meta = extractMetaData(html) + const pureHtml = purify(inputHtml) + const meta = extractMetaData(pureHtml) let title = meta.title @@ -57,7 +57,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => { // gather title if (!title) { - title = extractTitleWithReadability(html, inputUrl) + title = extractTitleWithReadability(pureHtml, inputUrl) } if (!title) { return null @@ -95,7 +95,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => { } ) - const content = fns(html) + const content = fns(inputHtml) if (!content) { return null