Skip to content

Commit

Permalink
feat(slb-495): update html parsing library
Browse files Browse the repository at this point in the history
  • Loading branch information
dspachos committed Dec 8, 2024
1 parent dda7654 commit 3b7f08e
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 8 deletions.
16 changes: 16 additions & 0 deletions apps/converter/htmlToMarkdown.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { extract } from '@extractus/article-extractor'
import crypto from 'crypto';
import fs from 'fs-extra';
import imageType from 'image-type';
Expand All @@ -14,6 +15,18 @@ const __dirname = isLagoon
? '/app/web/sites/default/files/converted'
: path.dirname(__filename);


/**
 * Extracts the main article content of the page at `url`.
 *
 * Delegates to `extract` from `@extractus/article-extractor` and returns the
 * `content` field of its result. This is a best-effort helper: it never
 * rejects, and always resolves to a string so callers can use the result
 * directly.
 *
 * @param {string} url - Address of the page to extract content from.
 * @returns {Promise<string>} The extracted content, or an empty string when
 *   the extractor yields no result, the result has no `content`, or the
 *   extraction throws.
 */
async function extractMainContentFromUrl(url) {
  try {
    const article = await extract(url);
    // Guard both a missing result and a result without `content`, so the
    // return value is always a string rather than `undefined`.
    return article?.content ?? '';
  } catch (err) {
    // Deliberately not rethrown: report the failure with the offending URL
    // and fall back to empty content.
    console.error(`Failed to extract main content from ${url}:`, err);
    return '';
  }
}

async function extractMainContent(htmlString) {
const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/i;
const match = htmlString.match(bodyRegex);
Expand Down Expand Up @@ -69,13 +82,16 @@ export async function htmlToMarkdown(url) {
}

// Fetch HTML content
/*
const response = await fetch(url);
if (!response.ok) {
throw new Error(`Failed to fetch page: ${response.statusText}`);
}
const fullHtml = await response.text();
const html = await extractMainContent(fullHtml);
*/
const html = await extractMainContentFromUrl(url);
// Generate folder name based on HTML content
const folderName = generateFolderName(html);
const outputDir = path.join(__dirname, folderName);
Expand Down
5 changes: 2 additions & 3 deletions apps/converter/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -239,13 +239,12 @@ app.get('/html-convert', async (req, res) => {
element.htmlValue = html;
});


const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir);
const flatten = await flattenMdastNodesRecursive(enhanced);

// Return the processed content along with conversion info
res.json({
//content: mdast.children,
content: enhanced.children,
content: flatten.children,
outputDirectory: outputDir,
warnings: warnings,
});
Expand Down
1 change: 1 addition & 0 deletions apps/converter/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"dev": "node --watch index.js"
},
"dependencies": {
"@extractus/article-extractor": "^8.0.16",
"@textlint/markdown-to-ast": "^14.3.0",
"express": "^4.21.1",
"fs-extra": "^11.2.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,15 @@ function _silverback_ai_import_form_submit(array $form, FormStateInterface $form
$file = \Drupal::entityTypeManager()->getStorage('file')->load($fid);
$ast = $service->getAstFromFilePath($file);
$content->create($ast->content, $entity);
$flatten = $service->flattenAst($ast->content);
$content->create($flatten, $entity);

// $flatten = $service->flattenAst($ast->content);
// $content->create($flatten, $entity);
}
elseif (!empty($url_value) && $type == 'url') {
$ast = $service->getAstFromUrl($url_value);
$content->create($ast->content, $entity);
$flatten = $service->flattenAst($ast->content);
$content->create($flatten, $entity);

// $flatten = $service->flattenAst($ast->content);
// $content->create($flatten, $entity);
}
}
81 changes: 80 additions & 1 deletion pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3b7f08e

Please sign in to comment.