diff --git a/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.default.yml b/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.default.yml index 2a17c799a..746f92506 100644 --- a/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.default.yml +++ b/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.default.yml @@ -28,3 +28,4 @@ content: region: content hidden: langcode: true + search_api_excerpt: true diff --git a/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.reusable_block.yml b/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.reusable_block.yml index f8fce79d3..3bc9e6382 100644 --- a/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.reusable_block.yml +++ b/apps/cms/config/sync/core.entity_view_display.block_content.reusable_block.reusable_block.yml @@ -29,3 +29,4 @@ content: region: content hidden: langcode: true + search_api_excerpt: true diff --git a/apps/cms/config/sync/core.entity_view_display.config_pages.website_settings.default.yml b/apps/cms/config/sync/core.entity_view_display.config_pages.website_settings.default.yml index c220f97a1..7e7e53369 100644 --- a/apps/cms/config/sync/core.entity_view_display.config_pages.website_settings.default.yml +++ b/apps/cms/config/sync/core.entity_view_display.config_pages.website_settings.default.yml @@ -27,4 +27,5 @@ content: third_party_settings: { } weight: 2 region: content -hidden: { } +hidden: + search_api_excerpt: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.document.default.yml b/apps/cms/config/sync/core.entity_view_display.media.document.default.yml index b7d8b56ef..ba90023c0 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.document.default.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.document.default.yml @@ -24,5 +24,6 @@ hidden: created: true langcode: true name: true + 
search_api_excerpt: true thumbnail: true uid: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.document.media_library.yml b/apps/cms/config/sync/core.entity_view_display.media.document.media_library.yml index ce1302c66..43923dd9b 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.document.media_library.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.document.media_library.yml @@ -30,4 +30,5 @@ hidden: field_media_file: true langcode: true name: true + search_api_excerpt: true uid: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.image.default.yml b/apps/cms/config/sync/core.entity_view_display.media.image.default.yml index b8d76d4ac..15a948e2b 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.image.default.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.image.default.yml @@ -28,5 +28,6 @@ hidden: created: true langcode: true name: true + search_api_excerpt: true thumbnail: true uid: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.image.media_library.yml b/apps/cms/config/sync/core.entity_view_display.media.image.media_library.yml index fc1b68cdf..da28f0a39 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.image.media_library.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.image.media_library.yml @@ -30,4 +30,5 @@ hidden: field_media_image: true langcode: true name: true + search_api_excerpt: true uid: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.image.quote.yml b/apps/cms/config/sync/core.entity_view_display.media.image.quote.yml index b88657103..9919ff166 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.image.quote.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.image.quote.yml @@ -29,5 +29,6 @@ hidden: created: true langcode: true name: true + search_api_excerpt: true thumbnail: true uid: true diff --git 
a/apps/cms/config/sync/core.entity_view_display.media.remote_video.default.yml b/apps/cms/config/sync/core.entity_view_display.media.remote_video.default.yml index 838ecbf13..9012b9232 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.remote_video.default.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.remote_video.default.yml @@ -27,5 +27,6 @@ hidden: created: true langcode: true name: true + search_api_excerpt: true thumbnail: true uid: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.remote_video.media_library.yml b/apps/cms/config/sync/core.entity_view_display.media.remote_video.media_library.yml index 1e4ba5e9a..35881528c 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.remote_video.media_library.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.remote_video.media_library.yml @@ -30,4 +30,5 @@ hidden: field_media_oembed_video: true langcode: true name: true + search_api_excerpt: true uid: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.video.default.yml b/apps/cms/config/sync/core.entity_view_display.media.video.default.yml index 7a60da306..88b9e323e 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.video.default.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.video.default.yml @@ -30,5 +30,6 @@ hidden: created: true langcode: true name: true + search_api_excerpt: true thumbnail: true uid: true diff --git a/apps/cms/config/sync/core.entity_view_display.media.video.media_library.yml b/apps/cms/config/sync/core.entity_view_display.media.video.media_library.yml index 86a8689b0..187e035de 100644 --- a/apps/cms/config/sync/core.entity_view_display.media.video.media_library.yml +++ b/apps/cms/config/sync/core.entity_view_display.media.video.media_library.yml @@ -30,4 +30,5 @@ hidden: field_media_video_file: true langcode: true name: true + search_api_excerpt: true uid: true diff --git 
a/apps/cms/config/sync/core.entity_view_display.node.page.teaser.yml b/apps/cms/config/sync/core.entity_view_display.node.page.teaser.yml index e699294f3..a5eb67340 100644 --- a/apps/cms/config/sync/core.entity_view_display.node.page.teaser.yml +++ b/apps/cms/config/sync/core.entity_view_display.node.page.teaser.yml @@ -47,3 +47,4 @@ content: hidden: field_metatags: true langcode: true + search_api_excerpt: true diff --git a/apps/cms/config/sync/core.extension.yml b/apps/cms/config/sync/core.extension.yml index 248a09dfb..569fb5f75 100644 --- a/apps/cms/config/sync/core.extension.yml +++ b/apps/cms/config/sync/core.extension.yml @@ -20,6 +20,7 @@ module: datetime_range: 0 dblog: 0 default_content: 0 + devel: 0 dropzonejs: 0 dynamic_entity_reference: 0 dynamic_page_cache: 0 @@ -70,6 +71,7 @@ module: serialization: 0 shortcut: 0 silverback_ai: 0 + silverback_ai_import: 0 silverback_autosave: 0 silverback_campaign_urls: 0 silverback_cloudinary: 0 diff --git a/apps/cms/config/sync/devel.settings.yml b/apps/cms/config/sync/devel.settings.yml new file mode 100644 index 000000000..976cc91f3 --- /dev/null +++ b/apps/cms/config/sync/devel.settings.yml @@ -0,0 +1,12 @@ +_core: + default_config_hash: Aqx6J0yYT6mVqT0fbjeP4JkoL-700nmudVF5d6Pq2Yo +page_alter: false +raw_names: false +error_handlers: + 1: 1 +rebuild_theme: false +debug_mail_file_format: '%to-%subject-%datetime.mail.txt' +debug_mail_directory: 'temporary://devel-mails' +devel_dumper: var_dumper +debug_logfile: 'temporary://drupal_debug.txt' +debug_pre: true diff --git a/apps/cms/config/sync/devel.toolbar.settings.yml b/apps/cms/config/sync/devel.toolbar.settings.yml new file mode 100644 index 000000000..76ada43ce --- /dev/null +++ b/apps/cms/config/sync/devel.toolbar.settings.yml @@ -0,0 +1,10 @@ +_core: + default_config_hash: IQjf_ytthngZTAk_MU8-74VecArWD3G5g0oEH6PM6GA +toolbar_items: + - devel.admin_settings_link + - devel.cache_clear + - devel.container_info.service + - devel.menu_rebuild + - devel.reinstall + 
- devel.route_info + - devel.run_cron diff --git a/apps/cms/config/sync/language/de/metatag.metatag_defaults.403.yml b/apps/cms/config/sync/language/de/metatag.metatag_defaults.403.yml index aa3cd2039..2e9d8b41b 100644 --- a/apps/cms/config/sync/language/de/metatag.metatag_defaults.403.yml +++ b/apps/cms/config/sync/language/de/metatag.metatag_defaults.403.yml @@ -1,5 +1 @@ label: '403 Zugriff verweigert' -tags: - robots: noindex - canonical_url: '[site:url]' - shortlink: '[site:url]' diff --git a/apps/cms/config/sync/language/de/metatag.metatag_defaults.404.yml b/apps/cms/config/sync/language/de/metatag.metatag_defaults.404.yml index 99a896b07..7ea4b8877 100644 --- a/apps/cms/config/sync/language/de/metatag.metatag_defaults.404.yml +++ b/apps/cms/config/sync/language/de/metatag.metatag_defaults.404.yml @@ -1,4 +1 @@ label: '404 Seite nicht gefunden' -tags: - canonical_url: '[site:url]' - shortlink: '[site:url]' diff --git a/apps/cms/config/sync/language/de/metatag.metatag_defaults.front.yml b/apps/cms/config/sync/language/de/metatag.metatag_defaults.front.yml index c6c25af7c..e1983d0be 100644 --- a/apps/cms/config/sync/language/de/metatag.metatag_defaults.front.yml +++ b/apps/cms/config/sync/language/de/metatag.metatag_defaults.front.yml @@ -1,4 +1 @@ label: Startseite -tags: - canonical_url: '[site:url]' - shortlink: '[site:url]' diff --git a/apps/cms/config/sync/language/de/metatag.metatag_defaults.global.yml b/apps/cms/config/sync/language/de/metatag.metatag_defaults.global.yml index e7ad00f8b..85a1911f0 100644 --- a/apps/cms/config/sync/language/de/metatag.metatag_defaults.global.yml +++ b/apps/cms/config/sync/language/de/metatag.metatag_defaults.global.yml @@ -1,4 +1 @@ label: Global -tags: - canonical_url: '[current-page:url]' - title: '[current-page:title] | [site:name]' diff --git a/apps/cms/config/sync/language/de/metatag.metatag_defaults.node.yml b/apps/cms/config/sync/language/de/metatag.metatag_defaults.node.yml index 973386d02..876680127 100644 --- 
a/apps/cms/config/sync/language/de/metatag.metatag_defaults.node.yml +++ b/apps/cms/config/sync/language/de/metatag.metatag_defaults.node.yml @@ -1,5 +1 @@ label: Inhalt -tags: - title: '[node:title] | [site:name]' - description: '[node:summary]' - canonical_url: '[node:url]' diff --git a/apps/cms/config/sync/language/de/metatag.metatag_defaults.taxonomy_term.yml b/apps/cms/config/sync/language/de/metatag.metatag_defaults.taxonomy_term.yml index c4971135a..def38fb79 100644 --- a/apps/cms/config/sync/language/de/metatag.metatag_defaults.taxonomy_term.yml +++ b/apps/cms/config/sync/language/de/metatag.metatag_defaults.taxonomy_term.yml @@ -1,5 +1 @@ label: Taxonomie-Begriff -tags: - canonical_url: '[term:url]' - description: '[term:description]' - title: '[term:name] | [site:name]' diff --git a/apps/cms/config/sync/language/de/metatag.metatag_defaults.user.yml b/apps/cms/config/sync/language/de/metatag.metatag_defaults.user.yml index ffbd18825..edb74f587 100644 --- a/apps/cms/config/sync/language/de/metatag.metatag_defaults.user.yml +++ b/apps/cms/config/sync/language/de/metatag.metatag_defaults.user.yml @@ -1,5 +1,3 @@ label: Benutzer tags: - canonical_url: '[user:url]' description: '[site:name]' - title: '[user:display-name] | [site:name]' diff --git a/apps/cms/config/sync/language/de/system.menu.devel.yml b/apps/cms/config/sync/language/de/system.menu.devel.yml new file mode 100644 index 000000000..f5f5ba1b1 --- /dev/null +++ b/apps/cms/config/sync/language/de/system.menu.devel.yml @@ -0,0 +1 @@ +label: Entwicklung diff --git a/apps/cms/config/sync/system.menu.devel.yml b/apps/cms/config/sync/system.menu.devel.yml new file mode 100644 index 000000000..747046d46 --- /dev/null +++ b/apps/cms/config/sync/system.menu.devel.yml @@ -0,0 +1,13 @@ +uuid: 60d996af-2c98-4b34-90cd-6d2daf7bface +langcode: en +status: true +dependencies: + enforced: + module: + - devel +_core: + default_config_hash: 3V-l1uuTcyirYOGLPZV5HWaDfr02uEbWZJIwc8Byz-c +id: devel +label: Development 
+description: 'Links related to Devel module.' +locked: true diff --git a/apps/cms/config/sync/views.view.global_search.yml b/apps/cms/config/sync/views.view.global_search.yml index 3ce03fb92..5a2ed19f6 100644 --- a/apps/cms/config/sync/views.view.global_search.yml +++ b/apps/cms/config/sync/views.view.global_search.yml @@ -129,6 +129,7 @@ display: type: full options: offset: 0 + pagination_heading_level: h4 items_per_page: 10 total_pages: null id: 0 diff --git a/apps/converter/htmlToMarkdown.js b/apps/converter/htmlToMarkdown.js index df9e45cb7..1d623c534 100644 --- a/apps/converter/htmlToMarkdown.js +++ b/apps/converter/htmlToMarkdown.js @@ -3,11 +3,132 @@ import crypto from 'crypto'; import fs from 'fs-extra'; import imageType from 'image-type'; import { JSDOM } from 'jsdom'; +import { applyFixes } from "markdownlint"; +import { lint as lintSync } from "markdownlint/sync"; import fetch from 'node-fetch'; import path from 'path'; import TurndownService from 'turndown'; import { fileURLToPath } from 'url'; +/** + * Extracts images from markdown content while preserving their positions + * @param {string} markdown - Original markdown content + * @returns {{cleanMarkdown: string, extractedImages: Array<{alt: string, url: string, position: number, placeholder: string}>}} + */ +function extractImagesWithPositions(markdown) { + const imageRegex = /!\[(.*?)\]\((.*?)\)/g; + const extractedImages = []; + let match; + let cleanMarkdown = markdown; + let index = 0; + + while ((match = imageRegex.exec(markdown)) !== null) { + const placeholder = `__IMAGE_PLACEHOLDER_${index}__`; + extractedImages.push({ + alt: match[1] || '', + url: match[2], + position: match.index, + placeholder + }); + index++; + } + + // Replace images with placeholders + extractedImages.forEach(image => { + cleanMarkdown = cleanMarkdown.replace( + `![${image.alt}](${image.url})`, + image.placeholder + ); + }); + + return { + cleanMarkdown, + extractedImages + }; +} + +/** +* Reinserts images just above 
their original link positions +* @param {string} markdown - Markdown content with placeholders +* @param {Array<{alt: string, url: string, placeholder: string}>} images - Extracted images +* @returns {string} - Markdown with images reinserted +*/ +function reinsertImages(markdown, images) { + let result = markdown; + + // Sort images by their position in reverse order to maintain correct positions + const sortedImages = [...images].sort((a, b) => b.position - a.position); + + for (const image of sortedImages) { + const imageMarkdown = `![${image.alt}](${image.url})\n\n`; + const placeholderPosition = result.indexOf(image.placeholder); + + if (placeholderPosition !== -1) { + // Find the start of the line containing the placeholder + let lineStart = result.lastIndexOf('\n', placeholderPosition); + lineStart = lineStart === -1 ? 0 : lineStart + 1; + + // Insert the image above the line containing the placeholder + result = + result.slice(0, lineStart) + + imageMarkdown + + result.slice(lineStart); + + // Remove the placeholder + result = result.replace(image.placeholder, ''); + } + } + + // Clean up any double blank lines created during the process + result = result.replace(/\n{3,}/g, '\n\n'); + + return result.trim(); +} + +function validateAndFixMarkdown(markdown) { + const warnings = []; + + // Regex to match the entire image syntax + const imageRegex = /!\[.*?\]\(.*?\)/g; + + markdown = markdown.replace(imageRegex, (match) => { + // Parse the components of the Markdown image syntax + const altMatch = match.match(/!\[(.*?)\]/); // Match alt text + const urlMatch = match.match(/\((.*?)(?=\s|$)/); // Match URL + const titleMatch = match.match(/"([^"]*?)"\)$/); // Match title (if it exists) + + let altText = altMatch ? altMatch[1] : ''; + let url = urlMatch ? urlMatch[1] : ''; + let title = titleMatch ? 
titleMatch[1] : null; + + // Fix double quotes in alt text + if (altText.includes('"')) { + warnings.push(`Double quotes in alt text fixed: "${altText}"`); + altText = altText.replace(/"/g, "'"); + } + + // Fix double quotes in title + if (title && title.includes('"')) { + warnings.push(`Double quotes in title fixed: "${title}"`); + title = title.replace(/"/g, "'"); + } + + // Rebuild the image syntax + return title + ? `![${altText}](${url} "${title}")` + : `![${altText}](${url})`; + }); + + // Trim leading and trailing whitespace + const trimmedMarkdown = markdown.trim(); + if (markdown !== trimmedMarkdown) { + warnings.push("Leading or trailing whitespace detected and removed."); + markdown = trimmedMarkdown; + } + + return { markdown, warnings }; +} + // @todo Fix this to work locally and live const isLagoon = !!process.env.LAGOON; const __filename = fileURLToPath(import.meta.url); @@ -19,7 +140,6 @@ const __dirname = isLagoon async function extractMainContentFromUrl(url) { try { const mainContent = await extract(url); - console.log("🚀 ~ extractMainContentFromUrl ~ mainContent:", mainContent) return mainContent ? mainContent.content : ''; } catch (err) { console.error(err) @@ -27,21 +147,6 @@ async function extractMainContentFromUrl(url) { return ''; } -async function extractMainContent(htmlString) { - const bodyRegex = /]*>([\s\S]*?)<\/body>/i; - const match = htmlString.match(bodyRegex); - // Return the captured group (content between tags) or null if no match - const html = match ? match[1] : null; - - if (html) { - // Create a new JSDOM instance and parse the HTML string - const dom = new JSDOM(html); - const mainElement = dom.window.document.querySelector('main, article,div.sectionInnerLeft'); - // Return the inner HTML of the
tag, or an empty string if not found - return mainElement ? mainElement.innerHTML : ''; - } -} - async function getImageExtension(buffer) { const type = await imageType(buffer); return type ? `.${type.ext}` : '.png'; @@ -81,16 +186,6 @@ export async function htmlToMarkdown(url) { throw new Error('Invalid URL provided: ' + url); } - // Fetch HTML content - /* - const response = await fetch(url); - if (!response.ok) { - throw new Error(`Failed to fetch page: ${response.statusText}`); - } - const fullHtml = await response.text(); - - const html = await extractMainContent(fullHtml); - */ const html = await extractMainContentFromUrl(url); // Generate folder name based on HTML content const folderName = generateFolderName(html); @@ -172,13 +267,33 @@ export async function htmlToMarkdown(url) { .replace(/!\[\]\(/g, '![image](') .trim(); - // Save markdown file + const results = lintSync({ "strings": { "content": markdown } }); + const fixed = applyFixes(markdown, results.content); + const { markdown: fixedMarkdown, warnings } = validateAndFixMarkdown(fixed); + + const { cleanMarkdown, extractedImages } = extractImagesWithPositions(fixedMarkdown); + const correctedMarkdown = reinsertImages(cleanMarkdown, extractedImages); + + const fixEmptyMarkdownLinks = (markdown) => { + // Regular expression to match markdown links with empty URL but with title + // Captures: []("title") + const emptyLinkRegex = /\[\]\(([^)]+)\s+"([^"]+)"\)/g; + + // Replace empty links with their title text as link text + return markdown.replace(emptyLinkRegex, (match, url, title) => { + return `[${title}](${url} "${title}")`; + }); + }; + + const fixedLinksMarkdown = fixEmptyMarkdownLinks(correctedMarkdown); + + // Save markdown file const mdPath = path.join(outputDir, 'content.md'); - await fs.writeFile(mdPath, markdown); + await fs.writeFile(mdPath, fixedLinksMarkdown); return { markdownPath: mdPath, - warnings: [], // You could add warnings for failed image downloads etc. 
+ warnings: warnings, // You could add warnings for failed image downloads etc. outputDir, }; } diff --git a/apps/converter/index.js b/apps/converter/index.js index 318b478c6..fbcde2fcd 100644 --- a/apps/converter/index.js +++ b/apps/converter/index.js @@ -6,25 +6,25 @@ import { fromMarkdown } from 'mdast-util-from-markdown'; import { toHast } from 'mdast-util-to-hast'; import { htmlToMarkdown } from './htmlToMarkdown.js'; +import { pdfToMarkdown } from './pdfToMarkdown.js'; import { wordToMarkdown } from './wordToMarkdown.js'; const app = express(); const PORT = 3000; async function enhanceMdastNodesRecursive(tree, outputDir) { - // Process a single node and its children async function processNode(node) { // First process all children recursively to ensure they have htmlValue if (node.children && Array.isArray(node.children)) { - await Promise.all(node.children.map(child => processNode(child))); + await Promise.all(node.children.map((child) => processNode(child))); } const hast = toHast(node, { allowDangerousHtml: true }); const html = toHtml(hast, { allowDangerousHtml: true }); const type = node.type; - node.type = type.charAt(0).toUpperCase() + type.slice(1) + node.type = type.charAt(0).toUpperCase() + type.slice(1); node.outputDir = outputDir; if (!node.htmlValue) { @@ -42,75 +42,42 @@ async function enhanceMdastNodesRecursive(tree, outputDir) { return node; } - - // Helper function to generate HTML for each node type - function generateHtml(node) { - switch (node.type.toLowerCase()) { - case 'paragraph': - return `

${node.children?.map(child => child.htmlValue || '').join('')}

`; - case 'heading': - return `${node.children?.map(child => child.htmlValue || '').join('')}`; - case 'text': - return node.value; - case 'emphasis': - return `${node.children?.map(child => child.htmlValue || '').join('')}`; - case 'strong': - return `${node.children?.map(child => child.htmlValue || '').join('')}`; - case 'link': - return `${node.children?.map(child => child.htmlValue || '').join('')}`; - case 'image': - return `${node.alt || ''}`; - case 'list': - const tag = node.ordered ? 'ol' : 'ul'; - return `<${tag}>${node.children?.map(child => child.raw || '').join('')}`; - case 'listItem': - return `
  • ${node.children?.map(child => child.htmlValue || '').join('')}
  • `; - case 'blockquote': - return `
    ${node.children?.map(child => child.htmlValue || '').join('')}
    `; - case 'code': - return `
    ${node.value}
    `; - case 'inlineCode': - return `${node.value}`; - case 'thematicBreak': - return '
    '; - default: - return ''; - } -} - - - return processNode(tree); -} - -async function flattenMdastNodesRecursive(tree) { - async function flattenNode(node) { - // Base case: if no node or no children, return the node as is - if (!node || !node.children) { - return node; - } - - // Recursively flatten all children first - const flattenedChildren = await Promise.all( - node.children.map(child => flattenNode(child)) - ); - - // Update node's children with flattened results - node.children = flattenedChildren; - - // Handle the special case of Paragraph with single Image - if ( - node.type === 'Paragraph' && - Array.isArray(node.children) && - node.children.length === 1 && - node.children[0].type === 'Image' - ) { - return node.children[0]; + // Helper function to generate HTML for each node type + function generateHtml(node) { + switch (node.type.toLowerCase()) { + case 'paragraph': + return `

    ${node.children?.map((child) => child.htmlValue || '').join('')}

    `; + case 'heading': + return `${node.children?.map((child) => child.htmlValue || '').join('')}`; + case 'text': + return node.value; + case 'emphasis': + return `${node.children?.map((child) => child.htmlValue || '').join('')}`; + case 'strong': + return `${node.children?.map((child) => child.htmlValue || '').join('')}`; + case 'link': + return `${node.children?.map((child) => child.htmlValue || '').join('')}`; + case 'image': + return `${node.alt || ''}`; + case 'list': + const tag = node.ordered ? 'ol' : 'ul'; + return `<${tag}>${node.children?.map((child) => child.raw || '').join('')}`; + case 'listItem': + return `
  • ${node.children?.map((child) => child.htmlValue || '').join('')}
  • `; + case 'blockquote': + return `
    ${node.children?.map((child) => child.htmlValue || '').join('')}
    `; + case 'code': + return `
    ${node.value}
    `; + case 'inlineCode': + return `${node.value}`; + case 'thematicBreak': + return '
    '; + default: + return ''; } - - return node; } - return flattenNode(tree); + return processNode(tree); } function markdownToHtmlTable(markdownTable) { @@ -173,7 +140,7 @@ app.get('/convert', async (req, res) => { const { markdownPath, warnings, outputDir } = await wordToMarkdown(filePath); - // Then read and process the Markdown + // Then read and process the Markdown const markdown = readFileSync(markdownPath, 'utf-8'); const mdast = fromMarkdown(markdown); const md = readFileSync(markdownPath, 'utf-8'); @@ -188,13 +155,11 @@ app.get('/convert', async (req, res) => { element.htmlValue = html; }); - // Flatten images const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir); - const flatten = await flattenMdastNodesRecursive(enhanced); // Return the processed content along with conversion info res.json({ - content: flatten.children, + content: enhanced.children, outputDirectory: outputDir, warnings: warnings, }); @@ -240,11 +205,9 @@ app.get('/html-convert', async (req, res) => { }); const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir); - const flatten = await flattenMdastNodesRecursive(enhanced); - // Return the processed content along with conversion info res.json({ - content: flatten.children, + content: enhanced.children, outputDirectory: outputDir, warnings: warnings, }); @@ -260,6 +223,54 @@ app.get('/html-convert', async (req, res) => { } }); +app.get('/pdf-convert', async (req, res) => { + const filePath = req.query.path; + + if (!filePath) { + return res.status(400).json({ + error: "Please provide a URLas 'path' query parameter", + }); + } + + try { + // First convert Word to Markdown + const { markdownPath, outputDir } = await pdfToMarkdown(filePath); + + // Then read and process the Markdown + const markdown = readFileSync(markdownPath, 'utf-8'); + const mdast = fromMarkdown(markdown); + + const md = readFileSync(markdownPath, 'utf-8'); + const ast = parse(md); + + mdast.children.forEach(async (element, index) => { + const 
hast = toHast(element, { allowDangerousHtml: true }); + const html = toHtml(hast, { allowDangerousHtml: true }); + element.type = ast.children[index].type; + element.raw = ast.children[index].raw; + element.htmlValue = html; + }); + + const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir); + + // Return the processed content along with conversion info + res.json({ + content: enhanced.children, + outputDirectory: outputDir, + // warnings: warnings, + }); + } catch (error) { + if (error.code === 'ENOENT') { + res.status(404).json({ error: `File not found: ${filePath}` }); + } else { + res.status(500).json({ + error: 'Error processing document', + details: error.message, + }); + } + } +}); + app.listen(PORT, () => { console.log(`Server running on http://localhost:${PORT}`); }); diff --git a/apps/converter/package-lock.json b/apps/converter/package-lock.json index a8534d2e9..fbc3bb1b4 100644 --- a/apps/converter/package-lock.json +++ b/apps/converter/package-lock.json @@ -9,26 +9,115 @@ "version": "1.0.0", "license": "ISC", "dependencies": { + "@extractus/article-extractor": "^8.0.16", "@textlint/markdown-to-ast": "^14.3.0", + "dotenv": "^16.4.7", "express": "^4.21.1", "fs-extra": "^11.2.0", "hast-util-to-html": "^9.0.3", "image-type": "^5.2.0", "jsdom": "^25.0.1", + "langchain": "^0.3.6", "mammoth": "^1.8.0", "mdast-util-from-markdown": "^2.0.2", "mdast-util-to-hast": "^13.2.0", "node-fetch": "^3.3.2", + "openai": "^4.76.1", + "pdf-parse": "github:iamh2o/pdf-parse#1.1.3", + "pdf2json": "^3.1.4", "sanitize-filename": "^1.6.3", "turndown": "^7.2.0", "unist-util-visit": "^5.0.0" } }, + "node_modules/@cfworker/json-schema": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.0.3.tgz", + "integrity": "sha512-ZykIcDTVv5UNmKWSTLAs3VukO6NDJkkSKxrgUTDPBkAlORVT3H9n5DbRjRl8xIotklscHdbLIa0b9+y3mQq73g==", + "peer": true + }, + "node_modules/@extractus/article-extractor": { + "version": "8.0.16", + "resolved": 
"https://registry.npmjs.org/@extractus/article-extractor/-/article-extractor-8.0.16.tgz", + "integrity": "sha512-amxCKO2uerY0UPxDVSoTDdcTny0otpKsAIGC2q2CUDEhUX6EfxmpURttlKLx9uWFT9DRlNX9LSyMSP/2p7kFLg==", + "dependencies": { + "@mozilla/readability": "^0.5.0", + "bellajs": "^11.2.0", + "cross-fetch": "^4.0.0", + "linkedom": "^0.18.5", + "sanitize-html": "2.13.1" + }, + "engines": { + "node": ">= 18" + } + }, + "node_modules/@langchain/core": { + "version": "0.3.23", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.23.tgz", + "integrity": "sha512-Aut43dEJYH/ibccSErFOLQzymkBG4emlN16P0OHWwx02bDosOR9ilZly4JJiCSYcprn2X2H8nee6P/4VMg1oQA==", + "peer": true, + "dependencies": { + "@cfworker/json-schema": "^4.0.2", + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.2.8", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/openai": { + "version": "0.3.14", + "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.3.14.tgz", + "integrity": "sha512-lNWjUo1tbvsss45IF7UQtMu1NJ6oUKvhgPYWXnX9f/d6OmuLu7D99HQ3Y88vLcUo9XjjOy417olYHignMduMjA==", + "dependencies": { + "js-tiktoken": "^1.0.12", + "openai": "^4.71.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.26 <0.4.0" + } + }, + "node_modules/@langchain/textsplitters": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.1.0.tgz", + "integrity": "sha512-djI4uw9rlkAb5iMhtLED+xJebDdAG935AdP4eRTB02R7OB/act55Bj9wsskhZsvuyQRpO4O1wQOp85s6T6GWmw==", + "dependencies": { + "js-tiktoken": "^1.0.12" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.21 <0.4.0" + } + }, 
"node_modules/@mixmark-io/domino": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz", "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==" }, + "node_modules/@mozilla/readability": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz", + "integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@textlint/ast-node-types": { "version": "14.3.0", "resolved": "https://registry.npmjs.org/@textlint/ast-node-types/-/ast-node-types-14.3.0.tgz", @@ -84,11 +173,38 @@ "resolved": "https://registry.npmjs.org/@types/ms/-/ms-0.7.34.tgz", "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g==" }, + "node_modules/@types/node": { + "version": "18.19.67", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.67.tgz", + "integrity": "sha512-wI8uHusga+0ZugNp0Ol/3BqQfEcCCNfojtO6Oou9iVNGPTL6QNSdnUdqq85fRgIorLhLMuPIKpsN98QE9Nh+KQ==", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.12", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.12.tgz", + "integrity": "sha512-8nneRWKCg3rMtF69nLQJnOYUcbafYeFSjqkw3jCRLsqkWFlHaoQrr5mXmofFGOx3DKn7UfmBMyov8ySvLRVldA==", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/@types/retry": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz", + "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==" + }, "node_modules/@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", "integrity": 
"sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==" }, + "node_modules/@types/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==" + }, "node_modules/@ungap/structured-clone": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.2.0.tgz", @@ -102,6 +218,17 @@ "node": ">=10.0.0" } }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, "node_modules/accepts": { "version": "1.3.8", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", @@ -122,6 +249,29 @@ "node": ">= 14" } }, + "node_modules/agentkeepalive": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, + "node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "peer": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, "node_modules/argparse": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", @@ -168,6 +318,14 @@ } ] }, + "node_modules/bellajs": { + "version": 
"11.2.0", + "resolved": "https://registry.npmjs.org/bellajs/-/bellajs-11.2.0.tgz", + "integrity": "sha512-Wjss+Bc674ZABPr+SCKWTqA4V1pyYFhzDTjNBJy4jdmgOv0oGIGXeKBRJyINwP5tIy+iIZD9SfgZpztduzQ5QA==", + "engines": { + "node": ">= 18.4" + } + }, "node_modules/bluebird": { "version": "3.4.7", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", @@ -209,6 +367,11 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" + }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -235,6 +398,18 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/camelcase": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", + "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", + "peer": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/ccount": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", @@ -300,6 +475,14 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/commander": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-10.0.1.tgz", + "integrity": "sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug==", + "engines": { + "node": ">=14" + } + }, "node_modules/content-disposition": { "version": "0.5.4", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", @@ -337,6 +520,83 @@ 
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==" }, + "node_modules/cross-fetch": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-4.0.0.tgz", + "integrity": "sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==", + "dependencies": { + "node-fetch": "^2.6.12" + } + }, + "node_modules/cross-fetch/node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/cross-fetch/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, + "node_modules/cross-fetch/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, + "node_modules/cross-fetch/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": 
"https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/cssom": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz", + "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==" + }, "node_modules/cssstyle": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.1.0.tgz", @@ -384,6 +644,15 @@ } } }, + "node_modules/decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==", + "peer": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/decimal.js": { "version": "10.4.3", "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz", @@ -401,6 +670,14 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/deepmerge": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", + "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/define-data-property": { "version": "1.1.4", "resolved": 
"https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", @@ -467,6 +744,68 @@ "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==" }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ] + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz", + "integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/dotenv": { + "version": 
"16.4.7", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.7.tgz", + "integrity": "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/duck": { "version": "0.1.12", "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", @@ -542,6 +881,19 @@ "node": ">= 0.6" } }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "engines": { + "node": ">=6" + } + }, + "node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==" + }, "node_modules/express": { "version": "4.21.1", "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz", @@ -694,6 +1046,11 @@ "node": ">= 6" } }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==" + }, "node_modules/format": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", @@ -702,6 +1059,26 @@ "node": ">=0.4.x" } }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + 
"node_modules/formdata-node/node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "engines": { + "node": ">= 14" + } + }, "node_modules/formdata-polyfill": { "version": "4.0.10", "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", @@ -873,6 +1250,11 @@ "node": ">=18" } }, + "node_modules/html-escaper": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz", + "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==" + }, "node_modules/html-void-elements": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/html-void-elements/-/html-void-elements-3.0.0.tgz", @@ -882,6 +1264,24 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/htmlparser2": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "entities": "^4.5.0" + } + }, "node_modules/http-errors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", @@ -921,6 +1321,14 @@ "node": ">= 14" } }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "dependencies": { + "ms": "^2.0.0" 
+ } + }, "node_modules/iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -1053,6 +1461,14 @@ "node": ">=8" } }, + "node_modules/is-plain-object": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz", + "integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-potential-custom-element-name": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", @@ -1063,6 +1479,30 @@ "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" }, + "node_modules/js-tiktoken": { + "version": "1.0.15", + "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.15.tgz", + "integrity": "sha512-65ruOWWXDEZHHbAo7EjOcNxOGasQKbL4Fq3jEr2xsCqSsoOo6VVSqzWQb6PRIqypFSDcma4jO90YP0w5X8qVXQ==", + "dependencies": { + "base64-js": "^1.5.1" + } + }, + "node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/js-yaml/node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" + }, "node_modules/jsdom": { "version": "25.0.1", "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-25.0.1.tgz", @@ -1113,6 +1553,14 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jsonpointer": { + "version": "5.0.1", 
+ "resolved": "https://registry.npmjs.org/jsonpointer/-/jsonpointer-5.0.1.tgz", + "integrity": "sha512-p/nXbhSEcu3pZRdkW1OfJhpsVtW1gd4Wa1fnQc9YLiTfAjn0312eMKimbdIQzuZl9aa9xUGaRlP9T/CJE/ditQ==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/jszip": { "version": "3.10.1", "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", @@ -1124,6 +1572,106 @@ "setimmediate": "^1.0.5" } }, + "node_modules/langchain": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.3.6.tgz", + "integrity": "sha512-erZOIKXzwCOrQHqY9AyjkQmaX62zUap1Sigw1KrwMUOnVoLKkVNRmAyxFlNZDZ9jLs/58MaQcaT9ReJtbj3x6w==", + "dependencies": { + "@langchain/openai": ">=0.1.0 <0.4.0", + "@langchain/textsplitters": ">=0.0.0 <0.2.0", + "js-tiktoken": "^1.0.12", + "js-yaml": "^4.1.0", + "jsonpointer": "^5.0.1", + "langsmith": "^0.2.0", + "openapi-types": "^12.1.3", + "p-retry": "4", + "uuid": "^10.0.0", + "yaml": "^2.2.1", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/anthropic": "*", + "@langchain/aws": "*", + "@langchain/cohere": "*", + "@langchain/core": ">=0.2.21 <0.4.0", + "@langchain/google-genai": "*", + "@langchain/google-vertexai": "*", + "@langchain/groq": "*", + "@langchain/mistralai": "*", + "@langchain/ollama": "*", + "axios": "*", + "cheerio": "*", + "handlebars": "^4.7.8", + "peggy": "^3.0.2", + "typeorm": "*" + }, + "peerDependenciesMeta": { + "@langchain/anthropic": { + "optional": true + }, + "@langchain/aws": { + "optional": true + }, + "@langchain/cohere": { + "optional": true + }, + "@langchain/google-genai": { + "optional": true + }, + "@langchain/google-vertexai": { + "optional": true + }, + "@langchain/groq": { + "optional": true + }, + "@langchain/mistralai": { + "optional": true + }, + "@langchain/ollama": { + "optional": true + }, + "axios": { + "optional": true + }, + "cheerio": { + "optional": true + }, + "handlebars": { + "optional": true 
+ }, + "peggy": { + "optional": true + }, + "typeorm": { + "optional": true + } + } + }, + "node_modules/langsmith": { + "version": "0.2.11", + "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.2.11.tgz", + "integrity": "sha512-rVPUN/jQEHjTuYaoVKGjfb3NsYNLGTQT9LXcgJvka5M0EDcXciC598A+DsAQrl6McdfSJCFJDelgRPqVoF2xNA==", + "dependencies": { + "@types/uuid": "^10.0.0", + "commander": "^10.0.1", + "p-queue": "^6.6.2", + "p-retry": "4", + "semver": "^7.6.3", + "uuid": "^10.0.0" + }, + "peerDependencies": { + "openai": "*" + }, + "peerDependenciesMeta": { + "openai": { + "optional": true + } + } + }, "node_modules/lie": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", @@ -1132,6 +1680,18 @@ "immediate": "~3.0.5" } }, + "node_modules/linkedom": { + "version": "0.18.5", + "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.5.tgz", + "integrity": "sha512-JGLaGGtqtu+eOhYrC1wkWYTBcpVWL4AsnwAtMtgO1Q0gI0PuPJKI0zBBE+a/1BrhOE3Uw8JI/ycByAv5cLrAuQ==", + "dependencies": { + "css-select": "^5.1.0", + "cssom": "^0.5.0", + "html-escaper": "^3.0.3", + "htmlparser2": "^9.1.0", + "uhyphen": "^0.2.0" + } + }, "node_modules/longest-streak": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-2.0.4.tgz", @@ -2126,6 +2686,32 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" }, + "node_modules/mustache": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/mustache/-/mustache-4.2.0.tgz", + "integrity": "sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==", + "peer": true, + "bin": { + "mustache": "bin/mustache" + } + }, + "node_modules/nanoid": { + "version": "3.3.8", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz", + "integrity": 
"sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, "node_modules/negotiator": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", @@ -2160,6 +2746,11 @@ "node": ">=10.5.0" } }, + "node_modules/node-ensure": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz", + "integrity": "sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==" + }, "node_modules/node-fetch": { "version": "3.3.2", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", @@ -2177,6 +2768,17 @@ "url": "https://opencollective.com/node-fetch" } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/nwsapi": { "version": "2.2.16", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.16.tgz", @@ -2204,11 +2806,125 @@ "node": ">= 0.8" } }, + "node_modules/openai": { + "version": "4.76.1", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.76.1.tgz", + "integrity": "sha512-ci63/WFEMd6QjjEVeH0pV7hnFS6CCqhgJydSti4Aak/8uo2SpgzKjteUDaY+OkwziVj11mi6j+0mRUIiGKUzWw==", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7" + }, + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + 
"zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/openai/node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/openai/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, + "node_modules/openai/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, + "node_modules/openai/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, + "node_modules/openapi-types": { + "version": "12.1.3", + "resolved": "https://registry.npmjs.org/openapi-types/-/openapi-types-12.1.3.tgz", + "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==" + }, "node_modules/option": { "version": "0.2.4", "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==" }, + "node_modules/p-finally": { + 
"version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": "sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", + "engines": { + "node": ">=4" + } + }, + "node_modules/p-queue": { + "version": "6.6.2", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-6.6.2.tgz", + "integrity": "sha512-RwFpb72c/BhQLEXIZ5K2e+AhgNVmIejGlTgiB9MzZ0e93GRvqZ7uSi0dvRF7/XIXDeNkra2fNHBxTyPDGySpjQ==", + "dependencies": { + "eventemitter3": "^4.0.4", + "p-timeout": "^3.2.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-retry": { + "version": "4.6.2", + "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz", + "integrity": "sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==", + "dependencies": { + "@types/retry": "0.12.0", + "retry": "^0.13.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/p-timeout": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-3.2.0.tgz", + "integrity": "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg==", + "dependencies": { + "p-finally": "^1.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/pako": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", @@ -2240,6 +2956,11 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/parse-srcset": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz", + "integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q==" + }, "node_modules/parse5": { "version": "7.2.1", "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz", @@ -2272,6 +2993,52 @@ "resolved": 
"https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz", "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==" }, + "node_modules/pdf-parse": { + "version": "1.1.3", + "resolved": "git+ssh://git@github.com/iamh2o/pdf-parse.git#d7a41d5aaed1503bee2d7ea50bf89588d3b2d2cf", + "license": "MIT", + "dependencies": { + "debug": "^3.1.0", + "node-ensure": "^0.0.0" + }, + "engines": { + "node": ">=6.8.1" + } + }, + "node_modules/pdf-parse/node_modules/debug": { + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", + "dependencies": { + "ms": "^2.1.1" + } + }, + "node_modules/pdf2json": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/pdf2json/-/pdf2json-3.1.4.tgz", + "integrity": "sha512-rS+VapXpXZr+5lUpHmRh3ugXdFXp24p1RyG24yP1DMpqP4t0mrYNGpLtpSbWD42PnQ59GIXofxF+yWb7M+3THg==", + "bundleDependencies": [ + "@xmldom/xmldom" + ], + "dependencies": { + "@xmldom/xmldom": "^0.8.10" + }, + "bin": { + "pdf2json": "bin/pdf2json.js" + }, + "engines": { + "node": ">=18.12.1", + "npm": ">=8.19.2" + } + }, + "node_modules/pdf2json/node_modules/@xmldom/xmldom": { + "version": "0.8.10", + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/peek-readable": { "version": "5.3.1", "resolved": "https://registry.npmjs.org/peek-readable/-/peek-readable-5.3.1.tgz", @@ -2284,6 +3051,38 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==" + }, + "node_modules/postcss": { + "version": "8.4.49", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz", + "integrity": 
"sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, "node_modules/process-nextick-args": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", @@ -2529,6 +3328,14 @@ "node": ">=0.10" } }, + "node_modules/retry": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", + "integrity": "sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==", + "engines": { + "node": ">= 4" + } + }, "node_modules/rrweb-cssom": { "version": "0.7.1", "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.7.1.tgz", @@ -2566,6 +3373,37 @@ "truncate-utf8-bytes": "^1.0.0" } }, + "node_modules/sanitize-html": { + "version": "2.13.1", + "resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.13.1.tgz", + "integrity": "sha512-ZXtKq89oue4RP7abL9wp/9URJcqQNABB5GGJ2acW1sdO8JTVl92f4ygD7Yc9Ze09VAZhnt2zegeU0tbNsdcLYg==", + "dependencies": { + "deepmerge": "^4.2.2", + "escape-string-regexp": "^4.0.0", + "htmlparser2": "^8.0.0", + "is-plain-object": "^5.0.0", + "parse-srcset": "^1.0.2", + "postcss": "^8.3.11" + } + }, + "node_modules/sanitize-html/node_modules/htmlparser2": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", + "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", 
+ "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1", + "entities": "^4.4.0" + } + }, "node_modules/saxes": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", @@ -2577,6 +3415,17 @@ "node": ">=v12.22.7" } }, + "node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/send": { "version": "0.19.0", "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", @@ -2678,6 +3527,14 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/space-separated-tokens": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz", @@ -2864,11 +3721,21 @@ "node": ">= 0.6" } }, + "node_modules/uhyphen": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz", + "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==" + }, "node_modules/underscore": { "version": "1.13.7", "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==" }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": 
"sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "node_modules/unified": { "version": "9.2.2", "resolved": "https://registry.npmjs.org/unified/-/unified-9.2.2.tgz", @@ -3055,6 +3922,18 @@ "node": ">= 0.4.0" } }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/vary": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", @@ -3199,6 +4078,33 @@ "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==" }, + "node_modules/yaml": { + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.6.1.tgz", + "integrity": "sha512-7r0XPzioN/Q9kXBro/XPnA6kznR73DHq+GXh5ON7ZozRO6aMjbmiBuKste2wslTFkC5d1dw0GooOCepZXJ2SAg==", + "bin": { + "yaml": "bin.mjs" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/zod": { + "version": "3.24.1", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.24.1.tgz", + "integrity": "sha512-muH7gBL9sI1nciMZV67X5fTKKBLtwpZ5VBp1vsOQzj1MhrBZ4wlVCm3gedKZWLp0Oyel8sIGfeiz54Su+OVT+A==", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } + }, + "node_modules/zod-to-json-schema": { + "version": "3.23.5", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.5.tgz", + "integrity": "sha512-5wlSS0bXfF/BrL4jPAbz9da5hDlDptdEppYfe+x4eIJ7jioqKG9uUxOwPzqof09u/XeVdrgFu29lZi+8XNDJtA==", + "peerDependencies": { + "zod": "^3.23.3" + } + }, "node_modules/zwitch": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz", 
diff --git a/apps/converter/package.json b/apps/converter/package.json
index 7220d6e3f..f109e1fcc 100644
--- a/apps/converter/package.json
+++ b/apps/converter/package.json
@@ -14,15 +14,20 @@
   "dependencies": {
     "@extractus/article-extractor": "^8.0.16",
     "@textlint/markdown-to-ast": "^14.3.0",
+    "dotenv": "^16.4.7",
     "express": "^4.21.1",
     "fs-extra": "^11.2.0",
     "hast-util-to-html": "^9.0.3",
     "image-type": "^5.2.0",
     "jsdom": "^25.0.1",
     "mammoth": "^1.8.0",
+    "markdownlint": "^0.37.1",
     "mdast-util-from-markdown": "^2.0.2",
     "mdast-util-to-hast": "^13.2.0",
     "node-fetch": "^3.3.2",
+    "openai": "^4.76.1",
+    "pdf-parse": "github:iamh2o/pdf-parse#1.1.3",
+    "pdf2json": "^3.1.4",
     "sanitize-filename": "^1.6.3",
     "turndown": "^7.2.0",
     "unist-util-visit": "^5.0.0"
diff --git a/apps/converter/pdfToMarkdown.js b/apps/converter/pdfToMarkdown.js
new file mode 100644
index 000000000..fcab436bb
--- /dev/null
+++ b/apps/converter/pdfToMarkdown.js
@@ -0,0 +1,131 @@
+import crypto from 'crypto';
+import fs from 'fs';
+import path from 'path';
+import pdf from 'pdf-parse';
+import PDFParser from 'pdf2json';
+
+/**
+ * Builds a short, content-derived folder name for a file.
+ *
+ * The name is the first 12 hex characters of the MD5 digest of the file
+ * contents, so the same bytes always map to the same folder name.
+ *
+ * @param {string} filePath - Path of the file to fingerprint.
+ * @returns {string} A 12-character hexadecimal string.
+ */
+export function generateFolderName(filePath) {
+  const fileContent = fs.readFileSync(filePath);
+  const hash = crypto.createHash('md5').update(fileContent).digest('hex');
+  return hash.substring(0, 12);
+}
+
+/**
+ * Extracts the text and embedded images of a PDF into a folder next to it.
+ *
+ * The text returned by pdf-parse is written unchanged to `content.md`; image
+ * entries reported by pdf2json are written to an `images` sub-folder.
+ *
+ * @param {string} pdfPath - Path to an existing file ending in `.pdf`.
+ * @returns {Promise<{outputDir: string, markdownPath: string,
+ *   imagesDir: string, totalPages: number, totalImages: number}>}
+ *   Locations of the generated files plus page and image counts.
+ * @throws {Error} "PDF conversion failed: ..." when validation, directory
+ *   creation or text extraction fails. Failures reported by the pdf2json
+ *   parser reject the returned promise with the parser's own error value.
+ */
+export async function pdfToMarkdown(pdfPath) {
+  try {
+    // Validate input file exists and is a PDF
+    if (!fs.existsSync(pdfPath) || !pdfPath.toLowerCase().endsWith('.pdf')) {
+      throw new Error('Invalid PDF file path');
+    }
+
+    // Generate output folder name
+    const folderName = generateFolderName(pdfPath);
+    const outputDir = path.join(path.dirname(pdfPath), folderName);
+    const imagesDir = path.join(outputDir, 'images');
+
+    // Create the output folder and its images sub-folder in one call;
+    // `recursive` makes this a no-op when they already exist.
+    fs.mkdirSync(imagesDir, { recursive: true });
+
+    // Extract text content from all pages
+    const dataBuffer = fs.readFileSync(pdfPath);
+    const data = await pdf(dataBuffer);
+    // Combine text from all pages
+    const markdownContent = data.text;
+
+    // Save markdown content
+    const markdownPath = path.join(outputDir, 'content.md');
+    fs.writeFileSync(markdownPath, markdownContent);
+
+    // Extract images
+    // NOTE(review): the 2nd argument appears to be pdf2json's raw-text flag - confirm it affects image data.
+    const pdfParser = new PDFParser(null, 1);
+
+    return new Promise((resolve, reject) => {
+      pdfParser.on('pdfParser_dataReady', (pdfData) => {
+        try {
+          // A result without a `Pages` array is reported as zero pages
+          // instead of throwing when the summary below is built.
+          const pages = pdfData.Pages || [];
+          let imageCount = 0;
+
+          pages.forEach((page, pageIndex) => {
+            // Handle both Images and Bg (background) images
+            const images = [...(page.Images || []), ...(page.Bg || [])];
+
+            images.forEach((image) => {
+              try {
+                // Entries without data are skipped silently.
+                if (!image.data) {
+                  return;
+                }
+
+                let imageBuffer;
+                // Handle different image data formats
+                if (Buffer.isBuffer(image.data)) {
+                  imageBuffer = image.data;
+                } else if (typeof image.data === 'string') {
+                  imageBuffer = Buffer.from(image.data, 'base64');
+                } else {
+                  console.warn(`Skipping invalid image data at page ${pageIndex + 1}`);
+                  return;
+                }
+
+                // imageCount runs across all pages, so file names stay unique.
+                const imagePath = path.join(
+                  imagesDir,
+                  `image_${pageIndex + 1}_${++imageCount}.png`
+                );
+
+                fs.writeFileSync(imagePath, imageBuffer);
+              } catch (imageError) {
+                // One bad image must not abort the rest of the export.
+                console.warn(`Error processing image at page ${pageIndex + 1}:`, imageError);
+              }
+            });
+          });
+
+          resolve({
+            outputDir,
+            markdownPath,
+            imagesDir,
+            totalPages: pages.length,
+            totalImages: imageCount
+          });
+        } catch (extractError) {
+          reject(extractError);
+        }
+      });
+
+      pdfParser.on('pdfParser_dataError', (error) => {
+        reject(error);
+      });
+
+      pdfParser.loadPDF(pdfPath);
+    });
+  } catch (error) {
+    throw new Error(`PDF conversion failed: ${error.message}`);
+  }
+}
diff --git a/package.json b/package.json
index 2dcbb5752..326825e5e 100644
--- a/package.json
+++ b/package.json
@@ -35,7 +35,8 @@
   "resolutions": {
"gatsby-plugin-sharp": "5.13.1", "sharp": "0.33.1", - "graphql": "16.8.1" + "graphql": "16.8.1", + "pdf-parse@1.1.1": "patch:pdf-parse@npm%3A1.1.1#./.yarn/patches/pdf-parse-npm-1.1.1-04a6109b2a.patch" }, "pnpm": { "patchedDependencies": {} diff --git a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install index f4a635f72..3e3acebf5 100644 --- a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install +++ b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install @@ -2,5 +2,84 @@ /** * @file - * Install, update and uninstall functions for the silverback_ai_import module. + * Install, update and uninstall functions for the Silverback AI module. */ + +use Drupal\Core\Entity\EntityTypeInterface; + +/** + * Implements hook_schema(). + */ +function silverback_ai_import_schema() { + + $db_schema = \Drupal::database()->schema(); + if ($db_schema->tableExists('silverback_ai_import')) { + $db_schema->dropTable('silverback_ai_import'); + } + + $schema['silverback_ai_import'] = [ + 'description' => 'Import log for the Silverback AI import module.', + 'fields' => [ + 'id' => [ + 'type' => 'serial', + 'not null' => TRUE, + 'description' => 'Primary Key.', + ], + 'uid' => [ + 'description' => 'Foreign key to {users}.uid; uniquely identifies a Drupal user executed the ai fetch action.', + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => TRUE, + ], + 'timestamp' => [ + 'description' => 'Date/time if the import, as Unix timestamp.', + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => TRUE, + ], + 'target_entity_type_id' => [ + 'type' => 'varchar_ascii', + 'length' => EntityTypeInterface::ID_MAX_LENGTH, + 'not null' => FALSE, + 'default' => '', + 'description' => 'The ID of the associated entity type.', + ], + 'target_entity_id' => [ + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => 
FALSE, + 'description' => 'The ID of the associated entity.', + ], + 'target_entity_revision_id' => [ + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => FALSE, + 'description' => 'The revision ID of the associated entity.', + ], + 'source' => [ + 'type' => 'text', + 'not null' => TRUE, + 'size' => 'normal', + 'description' => 'The source of the import.', + ], + 'output_folder' => [ + 'type' => 'text', + 'not null' => TRUE, + 'size' => 'small', + 'description' => 'The name of the folder exported.', + ], + 'data' => [ + 'type' => 'blob', + 'not null' => TRUE, + 'size' => 'big', + 'description' => 'The import response *usually a text series of gutenberg formatted blocks.', + ], + 'primary key' => ['id'], + 'indexes' => [ + 'uid' => ['uid'], + 'timestamp' => ['timestamp'], + ], + ]; + + return $schema; +} diff --git a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module index 35e798d70..4c92de170 100644 --- a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module +++ b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module @@ -12,43 +12,6 @@ use Drupal\Core\Form\FormStateInterface; */ function silverback_ai_import_form_alter(&$form, &$form_state, $form_id) { // @todo - if ($form_id == 'views_exposed_form') { - $service = \Drupal::service('silverback_ai_import.content'); - $url_value = 'https://www.tcs.ch/de/camping-reisen/camping-insider/community/backen-mit-dem-omnia.php'; - $ast = $service->getAstFromUrl($url_value); - $flatten = $service->flattenAst($ast->content); - // dpm($flatten); - // $file = \Drupal::entityTypeManager()->getStorage('file')->load(11); - // $ast = $service->getAstFromFilePath($file); - $flatten = $service->flattenAst($ast->content); - // dpm($ast->content);. 
- foreach ($flatten as $el) { - // dpm($el); - } - // dpm($flatten); - $arr = json_decode(json_encode($ast), TRUE); - $data = $arr['content']; - /* $callback = function (&$item, $depth) { - $item['xxx'] = 'xxx'; - }; */ - - $service->iterateArray($data); - dpm($data); - // Example usage: - /* - foreach ($ast->content as $obj) { - dpm($obj); - $iterator = function ($key, $value) { - $times = (int) $key * 1 + 1; - $repeat = str_repeat(' ', (int) $times); - if (isset($value->type) && !empty($value->type)) { - // dpm($repeat . $value->type);. - } - }; - $service->iterateObject($obj, $iterator); - } - */ - } } /** @@ -135,16 +98,14 @@ function _silverback_ai_import_form_submit(array $form, FormStateInterface $form if ($fid && $type == 'docx') { $file = \Drupal::entityTypeManager()->getStorage('file')->load($fid); $ast = $service->getAstFromFilePath($file); - $content->create($ast->content, $entity); - - // $flatten = $service->flattenAst($ast->content); - // $content->create($flatten, $entity); + // $content->create($ast->content, $entity); + $flatten = $service->flattenAst($ast->content); + $content->create($flatten, $entity); } elseif (!empty($url_value) && $type == 'url') { $ast = $service->getAstFromUrl($url_value); - $content->create($ast->content, $entity); - - // $flatten = $service->flattenAst($ast->content); - // $content->create($flatten, $entity); + // $content->create($ast->content, $entity); + $flatten = $service->flattenAst($ast->content); + $content->create($flatten, $entity); } } diff --git a/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php b/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php index e2d2bd35e..84bcd1704 100644 --- a/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php +++ b/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php @@ -274,7 +274,22 @@ public function getPlugin(array $chunk) { 
} /** + * Flattens a hierarchical AST (Abstract Syntax Tree) into a linear array of nodes. * + * This function converts a nested AST structure into a flat array where each node + * is assigned a unique ID and maintains a reference to its parent. It processes + * specific node types differently and handles recursive traversal of child nodes. + * + * @param array|null $ast + * The AST structure to flatten. + * @param int|null $parent + * The ID of the parent node (used in recursion) + * + * @return array An array of flattened nodes, where each node contains: + * - type: The capitalized node type + * - id: A unique identifier + * - parent: Reference to the parent node's ID + * - Additional properties specific to each node type */ public function flattenAst($ast, $parent = NULL) { @@ -287,18 +302,25 @@ public function flattenAst($ast, $parent = NULL) { } foreach ($ast as $chunk) { - if (isset($chunk['type']) && in_array($chunk['type'], [ 'Strong', 'Text', 'ListItem', 'Emphasis', - 'Link', ])) { continue; } + if (isset($chunk['type']) + && $chunk['type'] == 'Link' + && isset($chunk['children']) + && count($chunk['children']) == 1 + && $chunk['children'][0]['type'] !== 'Image' + ) { + continue; + } + $children = $chunk['children'] ?? []; // unset($chunk['children']); // Chunk preprocessing.
@@ -323,13 +345,109 @@ public function iterateArray(array &$data, int $depth = 0): void {
     foreach ($data as &$item) {
       // Process item here.
       if (isset($item['type'])) {
-        $item['gutenberg'] = $this->processChunk($item);
+        if ($item['type'] == 'Image') {
+          $item['gutenberg'] = $this->processChunk($item);
+        }
       }
-
       if (isset($item['children']) && is_array($item['children'])) {
         $this->iterateArray($item['children'], $depth + 1);
       }
     }
+
+  }
+
+  /**
+   * Extracts various metadata from a given URL by fetching and parsing its HTML content.
+   *
+   * This function attempts to retrieve the HTML content of a URL and extract key information
+   * including title, path, meta tags, and language settings. It includes error handling for
+   * various failure scenarios. Only http and https URLs are fetched.
+   *
+   * @param string $url
+   *   The URL to extract data from.
+   *
+   * @return array An associative array containing:
+   *   - title: string|null The page title if found
+   *   - path: string The URL path component, defaults to "/" if not found
+   *   - metatags: array Meta tag name-content pairs
+   *   - language: string|null The page language if specified
+   *   - error: string|null Error message if any error occurred, null otherwise
+   *
+   * @throws \Exception Caught internally and returned as error in result array
+   */
+  public function extractPageDataFromUrl($url) {
+    $data = [
+      'title' => NULL,
+      'path' => NULL,
+      'metatags' => [],
+      'language' => NULL,
+      // Add an error key.
+      'error' => NULL,
+    ];
+
+    // Validate URL. Only http(s) is accepted so that file:// or other stream
+    // wrappers supported by file_get_contents() are never opened.
+    if (!filter_var($url, FILTER_VALIDATE_URL)
+      || !in_array(strtolower((string) parse_url($url, PHP_URL_SCHEME)), ['http', 'https'], TRUE)
+    ) {
+      $data['error'] = "Invalid URL";
+      return $data;
+    }
+
+    try {
+      // Use file_get_contents with a user agent to avoid being blocked by some servers.
+      $options = [
+        'http' => [
+          'method' => 'GET',
+          // Example user agent.
+          'user_agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
+          // Timeout in seconds.
+          'timeout' => 10,
+        ],
+      ];
+      $context = stream_context_create($options);
+      // Use @ to suppress warnings for invalid URLs or network issues.
+      $html = @file_get_contents($url, FALSE, $context);
+
+      if ($html === FALSE) {
+        $error = error_get_last();
+        $data['error'] = "Failed to fetch URL: " . ($error ? $error['message'] : "Unknown error");
+        return $data;
+      }
+
+      // Extract Title (content of the <title> element; "s" lets it span several lines).
+      if (preg_match('/<title[^>]*>(.*?)<\/title>/is', $html, $matches)) {
+        $data['title'] = trim(html_entity_decode($matches[1]));
+      }
+
+      // Extract Path.
+      $data['path'] = parse_url($url, PHP_URL_PATH);
+      if ($data['path'] === NULL) {
+        // Handle cases where there's no path.
+        $data['path'] = "/";
+      }
+
+      // Extract Meta Tags.
+      preg_match_all('/<meta\s+(?:name|http-equiv)="([^"]*)"\s+content="([^"]*)"/i', $html, $matches);
+      for ($i = 0; $i < count($matches[0]); $i++) {
+        $name = strtolower($matches[1][$i]);
+        $data['metatags'][$name] = trim(html_entity_decode($matches[2][$i]));
+      }
+
+      // Extract Language.
+      if (preg_match('/<html.*?lang="([^"]*)"/i', $html, $matches)) {
+        $data['language'] = $matches[1];
+      }
+      elseif (preg_match('/<meta\s+http-equiv="Content-Language"\s+content="([^"]*)"/i', $html, $matches)) {
+        $data['language'] = $matches[1];
+      }
+
+    }
+    catch (\Exception $e) {
+      $data['error'] = "An error occurred: " . $e->getMessage();
+    }
+
+    return $data;
   }
 
 }
diff --git a/packages/drupal/silverback_ai/silverback_ai.install b/packages/drupal/silverback_ai/silverback_ai.install
index 207ca0eee..8f927c10c 100644
--- a/packages/drupal/silverback_ai/silverback_ai.install
+++ b/packages/drupal/silverback_ai/silverback_ai.install
@@ -33,7 +33,7 @@ function silverback_ai_schema() {
         'not null' => TRUE,
       ],
       'timestamp' => [
-        'description' => 'Date/time when the form submission failed, as Unix timestamp.',
+        'description' => 'Date/time of the AI request, as Unix timestamp.',
         'type' => 'int',
         'unsigned' => TRUE,
         'not null' => TRUE,
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 14a4b83d1..8a8d283a1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,6 +8,7 @@ overrides: gatsby-plugin-sharp: 5.13.1 sharp: 0.33.1 graphql: 16.8.1 + pdf-parse@1.1.1: patch:pdf-parse@npm%3A1.1.1#./.yarn/patches/pdf-parse-npm-1.1.1-04a6109b2a.patch importers: @@ -67,6 +68,9 @@ importers: '@textlint/markdown-to-ast': specifier: ^14.3.0 version: 14.4.0 + dotenv: + specifier: ^16.4.7 + version: 16.4.7 express: specifier: ^4.21.1 version: 4.21.1 @@ -85,6 +89,9 @@ importers: mammoth: specifier: ^1.8.0 version: 1.8.0 + markdownlint: + specifier: ^0.37.1 + version: 0.37.1 mdast-util-from-markdown: specifier: ^2.0.2 version: 2.0.2 @@ -94,6 +101,15 @@ importers: node-fetch: specifier: ^3.3.2 version: 3.3.2 +
openai: + specifier: ^4.76.1 + version: 4.76.1 + pdf-parse: + specifier: github:iamh2o/pdf-parse#1.1.3 + version: github.com/iamh2o/pdf-parse/d7a41d5aaed1503bee2d7ea50bf89588d3b2d2cf + pdf2json: + specifier: ^3.1.4 + version: 3.1.4 sanitize-filename: specifier: ^1.6.3 version: 1.6.3 @@ -1442,7 +1458,7 @@ packages: '@babel/traverse': 7.24.1 '@babel/types': 7.24.0 convert-source-map: 2.0.0 - debug: 4.3.4 + debug: 4.3.7 gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -5054,7 +5070,7 @@ packages: '@types/json-stable-stringify': 1.0.36 '@whatwg-node/fetch': 0.9.17 chalk: 4.1.2 - debug: 4.3.4 + debug: 4.3.7 dotenv: 16.4.5 graphql: 16.8.1 graphql-request: 6.1.0(graphql@16.8.1) @@ -9233,6 +9249,10 @@ packages: resolution: {integrity: sha512-R5M+SYhMbwBeQcNXYWNCZkl09vkVfAtcPIaCGdzIkkbeaTrVbGQ7HVgi4s+EmM/M1K4ZuWQH0jGcvMvNePfxYA==} dev: true + /@types/katex@0.16.7: + resolution: {integrity: sha512-HMwFiRujE5PjrgwHQ25+bsLJgowjGjm5Z8FVSf0N6PwgJrwxH0QxzHYDcKsTfV3wva0vzrpqMTJS2jXPr5BMEQ==} + dev: false + /@types/keyv@3.1.4: resolution: {integrity: sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==} dependencies: @@ -9288,7 +9308,7 @@ packages: /@types/node-fetch@2.6.11: resolution: {integrity: sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==} dependencies: - '@types/node': 18.15.13 + '@types/node': 22.9.0 form-data: 4.0.0 /@types/node@17.0.45: @@ -9321,7 +9341,6 @@ packages: resolution: {integrity: sha512-vuyHg81vvWA1Z1ELfvLko2c8f34gyA0zaic0+Rllc5lbCnbSyuvb2Oxpm6TAUAC/2xZN3QGqxBNggD1nNR2AfQ==} dependencies: undici-types: 6.19.8 - dev: true /@types/node@8.10.66: resolution: {integrity: sha512-tktOkFUA4kXx2hhhrB8bIFb5TbwzS4uOhKEmwiD+NoiL0qtP2OQ9mFldbgD4dV1djrlBYP6eBuQZiWjuHUpqFw==} @@ -11653,7 +11672,6 @@ packages: dependencies: humanize-ms: 1.2.1 dev: false - optional: true /aggregate-error@3.1.0: resolution: {integrity: 
sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA==} @@ -13412,6 +13430,10 @@ packages: resolution: {integrity: sha512-mKKUkUbhPpQlCOfIuZkvSEgktjPFIsZKRRbC6KWVEMvlzblj3i3asQv5ODsrwt0N3pHAEvjP8KTQPHkp0+6jOg==} dev: false + /character-reference-invalid@2.0.1: + resolution: {integrity: sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==} + dev: false + /chardet@0.7.0: resolution: {integrity: sha512-mT8iDcrh03qDGRRmoA2hmBJnxpllMR+0/0qlzjqZES6NdiWDcZkCNAk4rPFZ9Q85r27unkiNNg8ZOiwZXBHwcA==} @@ -13842,6 +13864,11 @@ packages: resolution: {integrity: sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==} engines: {node: '>= 10'} + /commander@8.3.0: + resolution: {integrity: sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==} + engines: {node: '>= 12'} + dev: false + /commander@9.5.0: resolution: {integrity: sha512-KRs7WVDKg86PWiuAqhDrAQnTXZKraVcCc6vFdL14qrZ/DcWwuRo7VoiYXalXO7S5GKpqYiVEwCbgFDfxNHKJBQ==} engines: {node: ^12.20.0 || >=14} @@ -16111,6 +16138,11 @@ packages: resolution: {integrity: sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==} engines: {node: '>=12'} + /dotenv@16.4.7: + resolution: {integrity: sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==} + engines: {node: '>=12'} + dev: false + /dotenv@7.0.0: resolution: {integrity: sha512-M3NhsLbV1i6HuGzBUH8vXrtxOk+tWmzWKDMbAVSUp3Zsjm7ywFeuwrUXhmhQyRK1q5B5GGy7hcXPbj3bnfZg2g==} engines: {node: '>=6'} @@ -18775,6 +18807,10 @@ packages: typescript: 5.6.3 webpack: 5.91.0 + /form-data-encoder@1.7.2: + resolution: {integrity: sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==} + dev: false + /form-data-encoder@2.1.4: resolution: {integrity: 
sha512-yDYSgNMraqvnxiEXO4hi88+YZxaHC6QKzb5N84iRCTDeRO7ZALpir/lVmf/uXUhnwUr2O4HU8s/n6x+yNjQkHw==} engines: {node: '>= 14.17'} @@ -18792,6 +18828,14 @@ packages: engines: {node: '>=0.4.x'} dev: false + /formdata-node@4.4.1: + resolution: {integrity: sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==} + engines: {node: '>= 12.20'} + dependencies: + node-domexception: 1.0.0 + web-streams-polyfill: 4.0.0-beta.3 + dev: false + /formdata-polyfill@4.0.10: resolution: {integrity: sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==} engines: {node: '>=12.20.0'} @@ -21480,7 +21524,6 @@ packages: dependencies: ms: 2.1.3 dev: false - optional: true /husky@8.0.3: resolution: {integrity: sha512-+dQSyqPh4x1hlO1swXBiNb2HzTDN1I2IGLQx1GrBuiqFJfoMrnZWwVmatvSiO+Iz8fBUnf+lekwNo4c2LlXItg==} @@ -21840,6 +21883,10 @@ packages: resolution: {integrity: sha512-DwzsA04LQ10FHTZuL0/grVDk4rFoVH1pjAToYwBrHSxcrBIGQuXrQMtD5U1b0U2XVgKZCTLLP8u2Qxqhy3l2Vg==} dev: false + /is-alphabetical@2.0.1: + resolution: {integrity: sha512-FWyyY60MeTNyeSRpkM2Iry0G9hpr7/9kD40mD/cGQEuilcZYS4okz8SN2Q6rLCJ8gbCt6fN+rC+6tMGS99LaxQ==} + dev: false + /is-alphanumeric@1.0.0: resolution: {integrity: sha512-ZmRL7++ZkcMOfDuWZuMJyIVLr2keE1o/DeNWh1EmgqGhUcV+9BIVsx0BcSBOHTZqzjs4+dISzr2KAeBEWGgXeA==} engines: {node: '>=0.10.0'} @@ -21852,6 +21899,13 @@ packages: is-decimal: 1.0.4 dev: false + /is-alphanumerical@2.0.1: + resolution: {integrity: sha512-hmbYhX/9MUMF5uh7tOXyK/n0ZvWpad5caBA17GsC6vyuCqaWliRG5K1qS9inmUhEMaOBIW7/whAnSwveW/LtZw==} + dependencies: + is-alphabetical: 2.0.1 + is-decimal: 2.0.1 + dev: false + /is-arguments@1.1.1: resolution: {integrity: sha512-8Q7EARjzEnKpt/PCD7e1cgUS0a6X8u5tdSiMqXhojOdoV9TsMsiO+9VLC5vAmO8N7/GmXn7yjR8qnA6bVAEzfA==} engines: {node: '>= 0.4'} @@ -21965,6 +22019,10 @@ packages: resolution: {integrity: sha512-RGdriMmQQvZ2aqaQq3awNA6dCGtKpiDFcOzrTWrDAT2MiWrKQVPmxLGHl7Y2nNu6led0kEyoX0enY0qXYsv9zw==} dev: false + 
/is-decimal@2.0.1: + resolution: {integrity: sha512-AAB9hiomQs5DXWcRB1rqsxGUstbRroFOPPVAomNk/3XHR5JyEZChOyTWe2oayKnsSsr/kcGqF+z6yuH6HHpN0A==} + dev: false + /is-descriptor@0.1.7: resolution: {integrity: sha512-C3grZTvObeN1xud4cRWl366OMXZTj0+HGyk4hvfpx4ZHt1Pb60ANSXqCK7pdOTeUQpRzECBSTphqvD7U+l22Eg==} engines: {node: '>= 0.4'} @@ -22056,6 +22114,10 @@ packages: resolution: {integrity: sha512-gyPJuv83bHMpocVYoqof5VDiZveEoGoFL8m3BXNb2VW8Xs+rz9kqO8LOQ5DH6EsuvilT1ApazU0pyl+ytbPtlw==} dev: false + /is-hexadecimal@2.0.1: + resolution: {integrity: sha512-DgZQp241c8oO6cA1SbTEWiXeoxV42vlcJxgH+B3hi1AiqqKruZR3ZGF8In3fj4+/y/7rHvlOZLZtgJ/4ttYGZg==} + dev: false + /is-hotkey@0.1.8: resolution: {integrity: sha512-qs3NZ1INIS+H+yeo7cD9pDfwYV/jqRh1JG9S9zYrNudkoUQg7OL7ziXqRKu+InFjUIDoP2o6HIkLYMh1pcWgyQ==} dev: false @@ -22496,7 +22558,7 @@ packages: resolution: {integrity: sha512-n3s8EwkdFIJCG3BPKBYvskgXGoy88ARzvegkitk60NxRdwltLOTaH7CUiMRXvwYorl0Q712iEjcWB+fK/MrWVw==} engines: {node: '>=10'} dependencies: - debug: 4.3.4 + debug: 4.3.7 istanbul-lib-coverage: 3.2.2 source-map: 0.6.1 transitivePeerDependencies: @@ -23369,6 +23431,13 @@ packages: resolution: {integrity: sha512-UfpWE/VZn0iP50d8cz9NrZLM9lSWhcJ+0Gt/nm4by88UL+J1SiKN8/5dkjMmbEzwL2CAe+67GsegCbIKtbp75A==} dev: false + /katex@0.16.17: + resolution: {integrity: sha512-OyzSrXBllz+Jdc9Auiw0kt21gbZ4hkz8Q5srVAb2U9INcYIfGKbxe+bvNvEz1bQ/NrDeRRho5eLCyk/L03maAw==} + hasBin: true + dependencies: + commander: 8.3.0 + dev: false + /kebab-hash@0.1.2: resolution: {integrity: sha512-BTZpq3xgISmQmAVzkISy4eUutsUA7s4IEFlCwOBJjvSFOwyR7I+fza+tBc/rzYWK/NrmFHjfU1IhO3lu29Ib/w==} dependencies: @@ -23531,6 +23600,12 @@ packages: /linkfs@2.1.0: resolution: {integrity: sha512-kmsGcmpvjStZ0ATjuHycBujtNnXiZR28BTivEu0gAMDTT7GEyodcK6zSRtu6xsrdorrPZEIN380x7BD7xEYkew==} + /linkify-it@5.0.0: + resolution: {integrity: sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==} + dependencies: + uc.micro: 2.1.0 + dev: false + 
/listhen@1.7.2: resolution: {integrity: sha512-7/HamOm5YD9Wb7CFgAZkKgVPA96WwhcTQoqtm2VTZGVbVVn3IWKRBTgrU7cchA3Q8k9iCsG8Osoi9GX4JsGM9g==} hasBin: true @@ -24118,6 +24193,18 @@ packages: resolution: {integrity: sha512-8z4efJYk43E0upd0NbVXwgSTQs6cT3T06etieCMEg7dRbzCbxUCK/GHlX8mhHRDcp+OLlHkPKsvqQTCvsRl2cg==} dev: false + /markdown-it@14.1.0: + resolution: {integrity: sha512-a54IwgWPaeBCAAsv13YgmALOF1elABB08FxO9i+r4VFk5Vl4pKokRPeX8u5TCgSsPi6ec1otfLjdOpVcgbpshg==} + hasBin: true + dependencies: + argparse: 2.0.1 + entities: 4.5.0 + linkify-it: 5.0.0 + mdurl: 2.0.0 + punycode.js: 2.3.1 + uc.micro: 2.1.0 + dev: false + /markdown-table@1.1.3: resolution: {integrity: sha512-1RUZVgQlpJSPWYbFSpmudq5nHY1doEIv89gBtF0s4gW1GF2XorxcA/70M5vq7rLv0a6mhOUccRsqkwhwLCIQ2Q==} dev: false @@ -24141,6 +24228,22 @@ packages: react: 18.2.0 dev: true + /markdownlint@0.37.1: + resolution: {integrity: sha512-Q7JexBa4ZB1rXJ2HGJF/fYUMeo4oBe5Zn1ZTlbwJ7BFC9V8lOZQoB3acBWW1f4FXmrlrwRYLLpWRfLCZXLo7kw==} + engines: {node: '>=18'} + dependencies: + markdown-it: 14.1.0 + micromark: 4.0.1 + micromark-extension-directive: 3.0.2 + micromark-extension-gfm-autolink-literal: 2.1.0 + micromark-extension-gfm-footnote: 2.1.0 + micromark-extension-gfm-table: 2.1.0 + micromark-extension-math: 3.1.0 + micromark-util-types: 2.0.1 + transitivePeerDependencies: + - supports-color + dev: false + /material-colors@1.2.6: resolution: {integrity: sha512-6qE4B9deFBIa9YSpOc9O0Sgc43zTeVYbgDT5veRKSlB2+ZuHNoVVxA1L/ckMUayV9Ay9y7Z/SZCLcGteW9i7bg==} dev: false @@ -24535,6 +24638,10 @@ packages: resolution: {integrity: sha512-/sKlQJCBYVY9Ers9hqzKou4H6V5UWc/M59TH2dvkt+84itfnq7uFOMLpOiOS4ujvHP4etln18fmIxA5R5fll0g==} dev: false + /mdurl@2.0.0: + resolution: {integrity: sha512-Lf+9+2r+Tdp5wXDXC4PcIBjTDtq4UKjCPMQhKIuzpJNW0b96kVqSwW0bT7FhRSfmAiFYgP+SCRvdrDozfh0U5w==} + dev: false + /meant@1.0.3: resolution: {integrity: sha512-88ZRGcNxAq4EH38cQ4D85PM57pikCwS8Z99EWHODxN7KBY+UuPiqzRTtZzS8KTXO/ywSWbdjjJST2Hly/EQxLw==} @@ -24741,6 +24848,18 @@ 
packages: micromark-util-symbol: 2.0.0 micromark-util-types: 2.0.0 + /micromark-extension-directive@3.0.2: + resolution: {integrity: sha512-wjcXHgk+PPdmvR58Le9d7zQYWy+vKEU9Se44p2CrCDPiLr2FMyiT4Fyb5UFKFC66wGB3kPlgD7q3TnoqPS7SZA==} + dependencies: + devlop: 1.1.0 + micromark-factory-space: 2.0.0 + micromark-factory-whitespace: 2.0.0 + micromark-util-character: 2.1.0 + micromark-util-symbol: 2.0.0 + micromark-util-types: 2.0.1 + parse-entities: 4.0.2 + dev: false + /micromark-extension-footnote@0.3.2: resolution: {integrity: sha512-gr/BeIxbIWQoUm02cIfK7mdMZ/fbroRpLsck4kvFtjbzP4yi+OPVbnukTc/zy0i7spC2xYE/dbX1Sur8BEDJsQ==} dependencies: @@ -24769,8 +24888,7 @@ packages: micromark-util-character: 2.1.0 micromark-util-sanitize-uri: 2.0.0 micromark-util-symbol: 2.0.0 - micromark-util-types: 2.0.0 - dev: true + micromark-util-types: 2.0.1 /micromark-extension-gfm-footnote@2.1.0: resolution: {integrity: sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==} @@ -24782,8 +24900,7 @@ packages: micromark-util-normalize-identifier: 2.0.0 micromark-util-sanitize-uri: 2.0.0 micromark-util-symbol: 2.0.0 - micromark-util-types: 2.0.0 - dev: true + micromark-util-types: 2.0.1 /micromark-extension-gfm-strikethrough@0.6.5: resolution: {integrity: sha512-PpOKlgokpQRwUesRwWEp+fHjGGkZEejj83k9gU5iXCbDG+XBA92BqnRKYJdfqfkrRcZRgGuPuXb7DaK/DmxOhw==} @@ -24819,8 +24936,7 @@ packages: micromark-factory-space: 2.0.0 micromark-util-character: 2.1.0 micromark-util-symbol: 2.0.0 - micromark-util-types: 2.0.0 - dev: true + micromark-util-types: 2.0.1 /micromark-extension-gfm-tagfilter@0.3.0: resolution: {integrity: sha512-9GU0xBatryXifL//FJH+tAZ6i240xQuFrSL7mYi8f4oZSbc+NvXjkrHemeYP0+L4ZUT+Ptz3b95zhUZnMtoi/Q==} @@ -24876,6 +24992,18 @@ packages: micromark-util-types: 2.0.0 dev: true + /micromark-extension-math@3.1.0: + resolution: {integrity: sha512-lvEqd+fHjATVs+2v/8kg9i5Q0AP2k85H0WUOwpIVvUML8BapsMvh1XAogmQjOCsLpoKRCVQqEkQBB3NhVBcsOg==} + dependencies: + 
'@types/katex': 0.16.7 + devlop: 1.1.0 + katex: 0.16.17 + micromark-factory-space: 2.0.0 + micromark-util-character: 2.1.0 + micromark-util-symbol: 2.0.0 + micromark-util-types: 2.0.1 + dev: false + /micromark-factory-destination@1.1.0: resolution: {integrity: sha512-XaNDROBgx9SgSChd69pjiGKbV+nfHGDPVYFs5dOoDd7ZnMAE+Cuu91BCpsY8RT2NP9vo/B8pds2VQNCLiu0zhg==} dependencies: @@ -25117,6 +25245,9 @@ packages: /micromark-util-types@2.0.0: resolution: {integrity: sha512-oNh6S2WMHWRZrmutsRmDDfkzKtxF+bc2VxLC9dvtrDIRFln627VsFP6fLMgTryGDljgLPjkrzQSDcPrjPyDJ5w==} + /micromark-util-types@2.0.1: + resolution: {integrity: sha512-534m2WhVTddrcKVepwmVEVnUAmtrx9bfIjNoQHRqfnvdaHQiFytEhJoTgpWJvDEXCO5gLTQh3wYC1PgOJA4NSQ==} + /micromark@2.11.4: resolution: {integrity: sha512-+WoovN/ppKolQOFIAajxi7Lu9kInbPxFuTBVEavFcL8eAfVstoc5MocPmqBeAdBOJV00uaVjegzH4+MA0DN/uA==} dependencies: @@ -25173,6 +25304,30 @@ packages: transitivePeerDependencies: - supports-color + /micromark@4.0.1: + resolution: {integrity: sha512-eBPdkcoCNvYcxQOAKAlceo5SNdzZWfF+FcSupREAzdAh9rRmE239CEQAiTwIgblwnoM8zzj35sZ5ZwvSEOF6Kw==} + dependencies: + '@types/debug': 4.1.12 + debug: 4.3.7 + decode-named-character-reference: 1.0.2 + devlop: 1.1.0 + micromark-core-commonmark: 2.0.1 + micromark-factory-space: 2.0.0 + micromark-util-character: 2.1.0 + micromark-util-chunked: 2.0.0 + micromark-util-combine-extensions: 2.0.0 + micromark-util-decode-numeric-character-reference: 2.0.1 + micromark-util-encode: 2.0.0 + micromark-util-normalize-identifier: 2.0.0 + micromark-util-resolve-all: 2.0.0 + micromark-util-sanitize-uri: 2.0.0 + micromark-util-subtokenize: 2.0.1 + micromark-util-symbol: 2.0.0 + micromark-util-types: 2.0.1 + transitivePeerDependencies: + - supports-color + dev: false + /micromatch@3.1.10: resolution: {integrity: sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==} engines: {node: '>=0.10.0'} @@ -25890,6 +26045,10 @@ packages: engines: {node: '>=10.5.0'} dev: false + 
/node-ensure@0.0.0: + resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==} + dev: false + /node-fetch-native@1.6.4: resolution: {integrity: sha512-IhOigYzAKHd244OC0JIMIUrjzctirCmPkaIfhDeGcEETWof5zKYUW7e7MYvChGWh/4CJeXEgsRyGzuF334rOOQ==} dev: false @@ -26405,6 +26564,26 @@ packages: is-docker: 2.2.1 is-wsl: 2.2.0 + /openai@4.76.1: + resolution: {integrity: sha512-ci63/WFEMd6QjjEVeH0pV7hnFS6CCqhgJydSti4Aak/8uo2SpgzKjteUDaY+OkwziVj11mi6j+0mRUIiGKUzWw==} + hasBin: true + peerDependencies: + zod: ^3.23.8 + peerDependenciesMeta: + zod: + optional: true + dependencies: + '@types/node': 18.15.13 + '@types/node-fetch': 2.6.11 + abort-controller: 3.0.0 + agentkeepalive: 4.5.0 + form-data-encoder: 1.7.2 + formdata-node: 4.4.1 + node-fetch: 2.7.0 + transitivePeerDependencies: + - encoding + dev: false + /opentracing@0.14.7: resolution: {integrity: sha512-vz9iS7MJ5+Bp1URw8Khvdyw1H/hGvzHWlKQ7eRrQojSCDL1/SrWfrY9QebLw97n2deyRtzHRC3MkQfVNUCo91Q==} engines: {node: '>=0.10'} @@ -26762,6 +26941,18 @@ packages: is-hexadecimal: 1.0.4 dev: false + /parse-entities@4.0.2: + resolution: {integrity: sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw==} + dependencies: + '@types/unist': 2.0.10 + character-entities-legacy: 3.0.0 + character-reference-invalid: 2.0.1 + decode-named-character-reference: 1.0.2 + is-alphanumerical: 2.0.1 + is-decimal: 2.0.1 + is-hexadecimal: 2.0.1 + dev: false + /parse-filepath@1.0.2: resolution: {integrity: sha512-FwdRXKCohSVeXqwtYonZTXtbGJKrn+HNyWDYVcp5yuJlesTwNH4rsmRZ+GrKAPJ5bLpRxESMeS+Rl0VCHRvB2Q==} engines: {node: '>=0.8'} @@ -26983,6 +27174,14 @@ packages: resolve-protobuf-schema: 2.1.0 dev: false + /pdf2json@3.1.4: + resolution: {integrity: sha512-rS+VapXpXZr+5lUpHmRh3ugXdFXp24p1RyG24yP1DMpqP4t0mrYNGpLtpSbWD42PnQ59GIXofxF+yWb7M+3THg==} + engines: {node: '>=18.12.1', npm: '>=8.19.2'} + hasBin: true + dev: false + bundledDependencies: + - 
'@xmldom/xmldom' + /peek-readable@4.1.0: resolution: {integrity: sha512-ZI3LnwUv5nOGbQzD9c2iDG6toheuXSZP5esSHBjopsXH4dg19soufvpUGA3uohi5anFtGb2lhAVdHzH6R/Evvg==} engines: {node: '>=8'} @@ -28249,6 +28448,11 @@ packages: end-of-stream: 1.4.4 once: 1.4.0 + /punycode.js@2.3.1: + resolution: {integrity: sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA==} + engines: {node: '>=6'} + dev: false + /punycode@1.4.1: resolution: {integrity: sha512-jmYNElW7yvO7TV33CjSmvSiE2yco3bV2czu/OzDKdMNVZQWfxCblURLhf+47syQRBntjfLdd/H0egrzIG+oaFQ==} @@ -32607,6 +32811,10 @@ packages: /ua-parser-js@1.0.37: resolution: {integrity: sha512-bhTyI94tZofjo+Dn8SN6Zv8nBDvyXTymAdM3LDI/0IboIUwTu1rEhW7v2TfiVsoYWgkQ4kOVqnI8APUFbIQIFQ==} + /uc.micro@2.1.0: + resolution: {integrity: sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==} + dev: false + /ufo@1.5.3: resolution: {integrity: sha512-Y7HYmWaFwPUmkoQCUIAYpKqkOf+SbVj/2fJJZ4RJMCfZp0rTGwRbzQD+HghfnhKOjL9E01okqz+ncJskGYfBNw==} @@ -32667,7 +32875,6 @@ packages: /undici-types@6.19.8: resolution: {integrity: sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==} - dev: true /unenv@1.9.0: resolution: {integrity: sha512-QKnFNznRxmbOF1hDgzpqrlIf6NC5sbZ2OJ+5Wl3OX8uM+LUJXbj4TXvLJCtwbPTmbMHCLIz6JLKNinNsMShK9g==} @@ -34087,6 +34294,11 @@ packages: resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==} engines: {node: '>= 8'} + /web-streams-polyfill@4.0.0-beta.3: + resolution: {integrity: sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==} + engines: {node: '>= 14'} + dev: false + /web-worker@1.3.0: resolution: {integrity: sha512-BSR9wyRsy/KOValMgd5kMyr3JzpdeoR9KVId8u5GVlTTAtNChlsE4yTxeY7zMdNSyOmoKBv8NH2qeRY9Tg+IaA==} dev: false @@ -34334,7 +34546,7 @@ packages: dependencies: cheerio: 1.0.0-rc.12 css-url-parser: 1.1.3 - 
debug: 4.3.4 + debug: 4.3.7 fs-extra: 10.1.0 got: 12.6.1 normalize-url: 7.2.0 @@ -34937,3 +35149,15 @@ packages: /zwitch@2.0.4: resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==} + + github.com/iamh2o/pdf-parse/d7a41d5aaed1503bee2d7ea50bf89588d3b2d2cf: + resolution: {tarball: https://codeload.github.com/iamh2o/pdf-parse/tar.gz/d7a41d5aaed1503bee2d7ea50bf89588d3b2d2cf} + name: pdf-parse + version: 1.1.3 + engines: {node: '>=6.8.1'} + dependencies: + debug: 3.2.7 + node-ensure: 0.0.0 + transitivePeerDependencies: + - supports-color + dev: false