Skip to content

Commit

Permalink
chore(slb-495): major updates on parsers and more
Browse files Browse the repository at this point in the history
  • Loading branch information
dspachos committed Dec 18, 2024
1 parent 3ada473 commit 90951ef
Show file tree
Hide file tree
Showing 13 changed files with 2,020 additions and 66 deletions.
179 changes: 155 additions & 24 deletions apps/converter/htmlToMarkdown.js
Original file line number Diff line number Diff line change
@@ -1,33 +1,151 @@
import { extract } from '@extractus/article-extractor'
import crypto from 'crypto';
import fs from 'fs-extra';
import imageType from 'image-type';
import { JSDOM } from 'jsdom';
import { applyFixes } from "markdownlint";
import { lint as lintSync } from "markdownlint/sync";
import fetch from 'node-fetch';
import path from 'path';
import TurndownService from 'turndown';
import { fileURLToPath } from 'url';

/**
* Extracts images from markdown content while preserving their positions
* @param {string} markdown - Original markdown content
* @returns {{cleanMarkdown: string, extractedImages: Array<{alt: string, url: string, position: number, placeholder: string}>}}
*/
function extractImagesWithPositions(markdown) {
const imageRegex = /!\[(.*?)\]\((.*?)\)/g;
const extractedImages = [];
let match;
let cleanMarkdown = markdown;
let index = 0;

while ((match = imageRegex.exec(markdown)) !== null) {
const placeholder = `__IMAGE_PLACEHOLDER_${index}__`;
extractedImages.push({
alt: match[1] || '',
url: match[2],
position: match.index,
placeholder
});
index++;
}

// Replace images with placeholders
extractedImages.forEach(image => {
cleanMarkdown = cleanMarkdown.replace(
`![${image.alt}](${image.url})`,
image.placeholder
);
});

return {
cleanMarkdown,
extractedImages
};
}

/**
* Reinserts images just above their original link positions
* @param {string} markdown - Markdown content with placeholders
* @param {Array<{alt: string, url: string, placeholder: string}>} images - Extracted images
* @returns {string} - Markdown with images reinserted
*/
function reinsertImages(markdown, images) {
let result = markdown;

// Sort images by their position in reverse order to maintain correct positions
const sortedImages = [...images].sort((a, b) => b.position - a.position);

for (const image of sortedImages) {
const imageMarkdown = `![${image.alt}](${image.url})\n\n`;
const placeholderPosition = result.indexOf(image.placeholder);

if (placeholderPosition !== -1) {
// Find the start of the line containing the placeholder
let lineStart = result.lastIndexOf('\n', placeholderPosition);
lineStart = lineStart === -1 ? 0 : lineStart + 1;

// Insert the image above the line containing the placeholder
result =
result.slice(0, lineStart) +
imageMarkdown +
result.slice(lineStart);

// Remove the placeholder
result = result.replace(image.placeholder, '');
}
}

// Clean up any double blank lines created during the process
result = result.replace(/\n{3,}/g, '\n\n');

return result.trim();
}

function validateAndFixMarkdown(markdown) {
const warnings = [];

// Regex to match the entire image syntax
const imageRegex = /!\[.*?\]\(.*?\)/g;

markdown = markdown.replace(imageRegex, (match) => {
// Parse the components of the Markdown image syntax
const altMatch = match.match(/!\[(.*?)\]/); // Match alt text
const urlMatch = match.match(/\((.*?)(?=\s|$)/); // Match URL
const titleMatch = match.match(/"([^"]*?)"\)$/); // Match title (if it exists)

let altText = altMatch ? altMatch[1] : '';
let url = urlMatch ? urlMatch[1] : '';
let title = titleMatch ? titleMatch[1] : null;

// Fix double quotes in alt text
if (altText.includes('"')) {
warnings.push(`Double quotes in alt text fixed: "${altText}"`);
altText = altText.replace(/"/g, "'");
}

// Fix double quotes in title
if (title && title.includes('"')) {
warnings.push(`Double quotes in title fixed: "${title}"`);
title = title.replace(/"/g, "'");
}

// Rebuild the image syntax
return title
? `![${altText}](${url} "${title}")`
: `![${altText}](${url})`;
});

// Trim leading and trailing whitespace
const trimmedMarkdown = markdown.trim();
if (markdown !== trimmedMarkdown) {
warnings.push("Leading or trailing whitespace detected and removed.");
markdown = trimmedMarkdown;
}

return { markdown, warnings };
}

// @todo Fix this to work locally and live
const isLagoon = !!process.env.LAGOON;
const __filename = fileURLToPath(import.meta.url);
const __dirname = isLagoon
? '/app/web/sites/default/files/converted'
: path.dirname(__filename);

async function extractMainContent(htmlString) {
const bodyRegex = /<body[^>]*>([\s\S]*?)<\/body>/i;
const match = htmlString.match(bodyRegex);
// Return the captured group (content between tags) or null if no match
const html = match ? match[1] : null;

if (html) {
// Create a new JSDOM instance and parse the HTML string
const dom = new JSDOM(html);
const mainElement = dom.window.document.querySelector('main, article');
// Return the inner HTML of the <main> tag, or an empty string if not found
return mainElement ? mainElement.innerHTML : '';

async function extractMainContentFromUrl(url) {
try {
const mainContent = await extract(url);
return mainContent ? mainContent.content : '';
} catch (err) {
console.error(err)
}
return '';
}
}

async function getImageExtension(buffer) {
const type = await imageType(buffer);
Expand Down Expand Up @@ -68,14 +186,7 @@ export async function htmlToMarkdown(url) {
throw new Error('Invalid URL provided: ' + url);
}

// Fetch HTML content
const response = await fetch(url);
if (!response.ok) {
throw new Error(`Failed to fetch page: ${response.statusText}`);
}
const fullHtml = await response.text();

const html = await extractMainContent(fullHtml);
const html = await extractMainContentFromUrl(url);
// Generate folder name based on HTML content
const folderName = generateFolderName(html);
const outputDir = path.join(__dirname, folderName);
Expand Down Expand Up @@ -156,13 +267,33 @@ export async function htmlToMarkdown(url) {
.replace(/!\[\]\(/g, '![image](')
.trim();

// Save markdown file
const results = lintSync({ "strings": { "content": markdown } });
const fixed = applyFixes(markdown, results.content);
const { markdown: fixedMarkdown, warnings } = validateAndFixMarkdown(fixed);

const { cleanMarkdown, extractedImages } = extractImagesWithPositions(fixedMarkdown);
const correctedMarkdown = reinsertImages(cleanMarkdown, extractedImages);

const fixEmptyMarkdownLinks = (markdown) => {
// Regular expression to match markdown links with empty URL but with title
// Captures: []("title")
const emptyLinkRegex = /\[\]\(([^)]+)\s+"([^"]+)"\)/g;

// Replace empty links with their title text as link text
return markdown.replace(emptyLinkRegex, (match, url, title) => {
return `[${title}](${url} "${title}")`;
});
};

const fixedLinksMarkdown = fixEmptyMarkdownLinks(correctedMarkdown);

// Save markdown file
const mdPath = path.join(outputDir, 'content.md');
await fs.writeFile(mdPath, markdown);
await fs.writeFile(mdPath, fixedLinksMarkdown);

return {
markdownPath: mdPath,
warnings: [], // You could add warnings for failed image downloads etc.
warnings: warnings, // You could add warnings for failed image downloads etc.
outputDir,
};
}
Loading

0 comments on commit 90951ef

Please sign in to comment.