diff --git a/apps/converter/htmlToMarkdown.js b/apps/converter/htmlToMarkdown.js index 9b920de46..df9e45cb7 100644 --- a/apps/converter/htmlToMarkdown.js +++ b/apps/converter/htmlToMarkdown.js @@ -1,3 +1,4 @@ +import { extract } from '@extractus/article-extractor' import crypto from 'crypto'; import fs from 'fs-extra'; import imageType from 'image-type'; @@ -14,6 +15,18 @@ const __dirname = isLagoon ? '/app/web/sites/default/files/converted' : path.dirname(__filename); + +async function extractMainContentFromUrl(url) { + try { + const mainContent = await extract(url); + console.log("🚀 ~ extractMainContentFromUrl ~ mainContent:", mainContent) + return mainContent ? mainContent.content : ''; + } catch (err) { + console.error(err) + } + return ''; + } + async function extractMainContent(htmlString) { const bodyRegex = /]*>([\s\S]*?)<\/body>/i; const match = htmlString.match(bodyRegex); @@ -69,6 +82,7 @@ export async function htmlToMarkdown(url) { } // Fetch HTML content + /* const response = await fetch(url); if (!response.ok) { throw new Error(`Failed to fetch page: ${response.statusText}`); @@ -76,6 +90,8 @@ export async function htmlToMarkdown(url) { const fullHtml = await response.text(); const html = await extractMainContent(fullHtml); + */ + const html = await extractMainContentFromUrl(url); // Generate folder name based on HTML content const folderName = generateFolderName(html); const outputDir = path.join(__dirname, folderName); diff --git a/apps/converter/index.js b/apps/converter/index.js index 43d17a1bf..318b478c6 100644 --- a/apps/converter/index.js +++ b/apps/converter/index.js @@ -239,13 +239,12 @@ app.get('/html-convert', async (req, res) => { element.htmlValue = html; }); - const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir); + const flatten = await flattenMdastNodesRecursive(enhanced); // Return the processed content along with conversion info res.json({ - //content: mdast.children, - content: enhanced.children, + content: flatten.children, outputDirectory: outputDir, warnings: warnings, }); diff --git a/apps/converter/package.json b/apps/converter/package.json index 681fe488f..7220d6e3f 100644 --- a/apps/converter/package.json +++ b/apps/converter/package.json @@ -12,6 +12,7 @@ "dev": "node --watch index.js" }, "dependencies": { + "@extractus/article-extractor": "^8.0.16", "@textlint/markdown-to-ast": "^14.3.0", "express": "^4.21.1", "fs-extra": "^11.2.0", diff --git a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module index a32eaec0d..35e798d70 100644 --- a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module +++ b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module @@ -136,13 +136,15 @@ function _silverback_ai_import_form_submit(array $form, FormStateInterface $form $file = \Drupal::entityTypeManager()->getStorage('file')->load($fid); $ast = $service->getAstFromFilePath($file); $content->create($ast->content, $entity); - $flatten = $service->flattenAst($ast->content); - $content->create($flatten, $entity); + + // $flatten = $service->flattenAst($ast->content); + // $content->create($flatten, $entity); } elseif (!empty($url_value) && $type == 'url') { $ast = $service->getAstFromUrl($url_value); $content->create($ast->content, $entity); - $flatten = $service->flattenAst($ast->content); - $content->create($flatten, $entity); + + // $flatten = $service->flattenAst($ast->content); + // $content->create($flatten, $entity); } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5e2f90f48..14a4b83d1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -61,6 +61,9 @@ importers: apps/converter: dependencies: + '@extractus/article-extractor': + specifier: ^8.0.16 + version: 8.0.16 '@textlint/markdown-to-ast': specifier: ^14.3.0 version: 14.4.0 @@ -4141,6 +4144,19 @@ packages: levn: 0.4.1 dev: true + /@extractus/article-extractor@8.0.16: + resolution: {integrity: sha512-amxCKO2uerY0UPxDVSoTDdcTny0otpKsAIGC2q2CUDEhUX6EfxmpURttlKLx9uWFT9DRlNX9LSyMSP/2p7kFLg==} + engines: {node: '>= 18'} + dependencies: + '@mozilla/readability': 0.5.0 + bellajs: 11.2.0 + cross-fetch: 4.0.0 + linkedom: 0.18.5 + sanitize-html: 2.13.1 + transitivePeerDependencies: + - encoding + dev: false + /@fastify/accept-negotiator@1.1.0: resolution: {integrity: sha512-OIHZrb2ImZ7XG85HXOONLcJWGosv7sIvM2ifAPQVhg9Lv7qdmMBNVaai4QTdyuaqbKM5eO6sLSQOYI7wEQeCJQ==} engines: {node: '>=14'} @@ -6104,6 +6120,11 @@ packages: got: 11.8.6 os-filter-obj: 2.0.0 + /@mozilla/readability@0.5.0: + resolution: {integrity: sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==} + engines: {node: '>=14.0.0'} + dev: false + /@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.2: resolution: {integrity: sha512-9bfjwDxIDWmmOKusUcqdS4Rw+SETlp9Dy39Xui9BEGEk19dDwH0jhipwFzEff/pFg95NKymc6TOTbRKcWeRqyQ==} cpu: [arm64] @@ -12743,6 +12764,11 @@ packages: resolution: {integrity: sha512-NzUnlZexiaH/46WDhANlyR2bXRopNg4F/zuSA3OpZnllCUgRaOF2znDioDWrmbNVsuZk6l9pMquQB38cfBZwkQ==} dev: false + /bellajs@11.2.0: + resolution: {integrity: sha512-Wjss+Bc674ZABPr+SCKWTqA4V1pyYFhzDTjNBJy4jdmgOv0oGIGXeKBRJyINwP5tIy+iIZD9SfgZpztduzQ5QA==} + engines: {node: '>= 18.4'} + dev: false + /better-ajv-errors@1.2.0(ajv@8.12.0): resolution: {integrity: sha512-UW+IsFycygIo7bclP9h5ugkNH8EjCSgqyFB/yQ4Hqqa1OEYDtb0uFIkYE0b6+CjkgJYVM5UKI/pJPxjYe9EZlA==} engines: {node: '>= 12.13.0'} @@ -14294,6 +14320,14 @@ packages: transitivePeerDependencies: - encoding + /cross-fetch@4.0.0: + resolution: {integrity: sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==} + dependencies: + node-fetch: 2.7.0 + transitivePeerDependencies: + - encoding + dev: false + /cross-inspect@1.0.0: resolution: {integrity: sha512-4PFfn4b5ZN6FMNGSZlyb7wUhuN8wvj8t/VQHZdM4JsDcruGJ8L2kf9zao98QIrBPFCpdk27qst/AGTl7pL3ypQ==} engines: {node: '>=16.0.0'} @@ -14598,6 +14632,10 @@ packages: dependencies: css-tree: 2.2.1 + /cssom@0.5.0: + resolution: {integrity: sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==} + dev: false + /cssstyle@4.1.0: resolution: {integrity: sha512-h66W1URKpBS5YMI/V8PyXvTMFT8SupJ1IzoIV8IeBC/ji8WVmrO8dGlTi+2dh6whmdk6BiKJLD/ZBkhWbcg6nA==} engines: {node: '>=18'} @@ -21197,6 +21235,10 @@ packages: resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} dev: true + /html-escaper@3.0.3: + resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==} + dev: false + /html-tags@3.3.1: resolution: {integrity: sha512-ztqyC3kLto0e9WbNp0aeP+M3kTt+nbaIveGmUxAtZa+8iFgKLUOD4YKM5j+f3QD89bra7UeumolZHKuOXnTmeQ==} engines: {node: '>=8'} @@ -21240,7 +21282,15 @@ packages: domhandler: 5.0.3 domutils: 3.1.0 entities: 4.5.0 - dev: true + + /htmlparser2@9.1.0: + resolution: {integrity: sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==} + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.1.0 + entities: 4.5.0 + dev: false /http-cache-semantics@4.1.1: resolution: {integrity: sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ==} @@ -23468,6 +23518,16 @@ packages: /lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} + /linkedom@0.18.5: + resolution: {integrity: sha512-JGLaGGtqtu+eOhYrC1wkWYTBcpVWL4AsnwAtMtgO1Q0gI0PuPJKI0zBBE+a/1BrhOE3Uw8JI/ycByAv5cLrAuQ==} + dependencies: + css-select: 5.1.0 + cssom: 0.5.0 + html-escaper: 3.0.3 + htmlparser2: 9.1.0 + uhyphen: 0.2.0 + dev: false + /linkfs@2.1.0: resolution: {integrity: sha512-kmsGcmpvjStZ0ATjuHycBujtNnXiZR28BTivEu0gAMDTT7GEyodcK6zSRtu6xsrdorrPZEIN380x7BD7xEYkew==} @@ -26758,6 +26818,10 @@ packages: dependencies: protocols: 2.0.1 + /parse-srcset@1.0.2: + resolution: {integrity: sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q==} + dev: false + /parse-url@8.1.0: resolution: {integrity: sha512-xDvOoLU5XRrcOZvnI6b8zA6n9O9ejNk/GExuz1yBuWUGn9KA97GI6HTs6u02wKara1CeVmZhH+0TZFdWScR89w==} dependencies: @@ -29972,6 +30036,17 @@ packages: dependencies: truncate-utf8-bytes: 1.0.2 + /sanitize-html@2.13.1: + resolution: {integrity: sha512-ZXtKq89oue4RP7abL9wp/9URJcqQNABB5GGJ2acW1sdO8JTVl92f4ygD7Yc9Ze09VAZhnt2zegeU0tbNsdcLYg==} + dependencies: + deepmerge: 4.3.1 + escape-string-regexp: 4.0.0 + htmlparser2: 8.0.2 + is-plain-object: 5.0.0 + parse-srcset: 1.0.2 + postcss: 8.4.49 + dev: false + /sax@1.3.0: resolution: {integrity: sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==} dev: false @@ -32543,6 +32618,10 @@ packages: dev: true optional: true + /uhyphen@0.2.0: + resolution: {integrity: sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==} + dev: false + /uid-safe@2.1.5: resolution: {integrity: sha512-KPHm4VL5dDXKz01UuEd88Df+KzynaohSL9fBh096KWAxSKZQDI2uBrVqtvRM4rwrIrRRKsdLNML/lnaaVSRioA==} engines: {node: '>= 0.8'}