chore(slb-495): major updates on parsers and more

AmazeeLabs · Dec 18, 2024 · 90951ef · 90951ef
1 parent 3ada473
commit 90951ef
Show file tree

Hide file tree

Showing 13 changed files with 2,020 additions and 66 deletions.
diff --git a/apps/converter/htmlToMarkdown.js b/apps/converter/htmlToMarkdown.js
@@ -1,33 +1,151 @@
+import { extract } from '@extractus/article-extractor'
 import crypto from 'crypto';
 import fs from 'fs-extra';
 import imageType from 'image-type';
 import { JSDOM } from 'jsdom';
+import { applyFixes } from "markdownlint";
+import { lint as lintSync } from "markdownlint/sync";
 import fetch from 'node-fetch';
 import path from 'path';
 import TurndownService from 'turndown';
 import { fileURLToPath } from 'url';
 
+/**
+ * Extracts images from markdown content while preserving their positions
+ * @param {string} markdown - Original markdown content
+ * @returns {{cleanMarkdown: string, extractedImages: Array<{alt: string, url: string, position: number, placeholder: string}>}}
+ */
+function extractImagesWithPositions(markdown) {
+  const imageRegex = /!\[(.*?)\]\((.*?)\)/g;
+  const extractedImages = [];
+  let match;
+  let cleanMarkdown = markdown;
+  let index = 0;
+
+  while ((match = imageRegex.exec(markdown)) !== null) {
+      const placeholder = `__IMAGE_PLACEHOLDER_${index}__`;
+      extractedImages.push({
+          alt: match[1] || '',
+          url: match[2],
+          position: match.index,
+          placeholder
+      });
+      index++;
+  }
+
+  // Replace images with placeholders
+  extractedImages.forEach(image => {
+      cleanMarkdown = cleanMarkdown.replace(
+          `![${image.alt}](${image.url})`,
+          image.placeholder
+      );
+  });
+
+  return {
+      cleanMarkdown,
+      extractedImages
+  };
+}
+
+/**
+* Reinserts images just above their original link positions
+* @param {string} markdown - Markdown content with placeholders
+* @param {Array<{alt: string, url: string, placeholder: string}>} images - Extracted images
+* @returns {string} - Markdown with images reinserted
+*/
+function reinsertImages(markdown, images) {
+  let result = markdown;
+
+  // Sort images by their position in reverse order to maintain correct positions
+  const sortedImages = [...images].sort((a, b) => b.position - a.position);
+
+  for (const image of sortedImages) {
+      const imageMarkdown = `![${image.alt}](${image.url})\n\n`;
+      const placeholderPosition = result.indexOf(image.placeholder);
+
+      if (placeholderPosition !== -1) {
+          // Find the start of the line containing the placeholder
+          let lineStart = result.lastIndexOf('\n', placeholderPosition);
+          lineStart = lineStart === -1 ? 0 : lineStart + 1;
+
+          // Insert the image above the line containing the placeholder
+          result =
+              result.slice(0, lineStart) +
+              imageMarkdown +
+              result.slice(lineStart);
+
+          // Remove the placeholder
+          result = result.replace(image.placeholder, '');
+      }
+  }
+
+  // Clean up any double blank lines created during the process
+  result = result.replace(/\n{3,}/g, '\n\n');
+
+  return result.trim();
+}
+
+function validateAndFixMarkdown(markdown) {
+  const warnings = [];
+
+  // Regex to match the entire image syntax
+  const imageRegex = /!\[.*?\]\(.*?\)/g;
+
+  markdown = markdown.replace(imageRegex, (match) => {
+    // Parse the components of the Markdown image syntax
+    const altMatch = match.match(/!\[(.*?)\]/); // Match alt text
+    const urlMatch = match.match(/\((.*?)(?=\s|$)/); // Match URL
+    const titleMatch = match.match(/"([^"]*?)"\)$/); // Match title (if it exists)
+
+    let altText = altMatch ? altMatch[1] : '';
+    let url = urlMatch ? urlMatch[1] : '';
+    let title = titleMatch ? titleMatch[1] : null;
+
+    // Fix double quotes in alt text
+    if (altText.includes('"')) {
+      warnings.push(`Double quotes in alt text fixed: "${altText}"`);
+      altText = altText.replace(/"/g, "'");
+    }
+
+    // Fix double quotes in title
+    if (title && title.includes('"')) {
+      warnings.push(`Double quotes in title fixed: "${title}"`);
+      title = title.replace(/"/g, "'");
+    }
+
+    // Rebuild the image syntax
+    return title
+      ? `![${altText}](${url} "${title}")`
+      : `![${altText}](${url})`;
+  });
+
+  // Trim leading and trailing whitespace
+  const trimmedMarkdown = markdown.trim();
+  if (markdown !== trimmedMarkdown) {
+    warnings.push("Leading or trailing whitespace detected and removed.");
+    markdown = trimmedMarkdown;
+  }
+
+  return { markdown, warnings };
+}
+
 // @todo Fix this to work locally and live
 const isLagoon = !!process.env.LAGOON;
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = isLagoon
   ? '/app/web/sites/default/files/converted'
   : path.dirname(__filename);
 
-async function extractMainContent(htmlString) {
-  const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/i;
-  const match = htmlString.match(bodyRegex);
-  // Return the captured group (content between tags) or null if no match
-  const html = match ? match[1] : null;
-
-  if (html) {
-    // Create a new JSDOM instance and parse the HTML string
-    const dom = new JSDOM(html);
-    const mainElement = dom.window.document.querySelector('main, article');
-    // Return the inner HTML of the <main> tag, or an empty string if not found
-    return mainElement ? mainElement.innerHTML : '';
+
+async function extractMainContentFromUrl(url) {
+      try {
+        const mainContent = await extract(url);
+        return mainContent ? mainContent.content : '';
+      } catch (err) {
+        console.error(err)
+      }
+      return '';
   }
-}
 
 async function getImageExtension(buffer) {
   const type = await imageType(buffer);
@@ -68,14 +186,7 @@ export async function htmlToMarkdown(url) {
     throw new Error('Invalid URL provided: ' + url);
   }
 
-  // Fetch HTML content
-  const response = await fetch(url);
-  if (!response.ok) {
-    throw new Error(`Failed to fetch page: ${response.statusText}`);
-  }
-  const fullHtml = await response.text();
-
-  const html = await extractMainContent(fullHtml);
+  const html = await extractMainContentFromUrl(url);
   // Generate folder name based on HTML content
   const folderName = generateFolderName(html);
   const outputDir = path.join(__dirname, folderName);
@@ -156,13 +267,33 @@ export async function htmlToMarkdown(url) {
     .replace(/!\[\]\(/g, '![image](')
     .trim();
 
-  // Save markdown file
+    const results = lintSync({ "strings": { "content": markdown } });
+    const fixed = applyFixes(markdown, results.content);
+    const { markdown: fixedMarkdown, warnings } = validateAndFixMarkdown(fixed);
+
+    const { cleanMarkdown, extractedImages } = extractImagesWithPositions(fixedMarkdown);
+    const correctedMarkdown = reinsertImages(cleanMarkdown, extractedImages);
+
+    const fixEmptyMarkdownLinks = (markdown) => {
+      // Regular expression to match markdown links with empty URL but with title
+      // Captures: []("title")
+      const emptyLinkRegex = /\[\]\(([^)]+)\s+"([^"]+)"\)/g;
+
+      // Replace empty links with their title text as link text
+      return markdown.replace(emptyLinkRegex, (match, url, title) => {
+        return `[${title}](${url} "${title}")`;
+      });
+    };
+
+    const fixedLinksMarkdown = fixEmptyMarkdownLinks(correctedMarkdown);
+
+    // Save markdown file
   const mdPath = path.join(outputDir, 'content.md');
-  await fs.writeFile(mdPath, markdown);
+  await fs.writeFile(mdPath, fixedLinksMarkdown);
 
   return {
     markdownPath: mdPath,
-    warnings: [], // You could add warnings for failed image downloads etc.
+    warnings: warnings, // You could add warnings for failed image downloads etc.
     outputDir,
   };
 }