${node.children?.map(child => child.htmlValue || '').join('')}
`; - case 'heading': - return `${node.children?.map(child => child.htmlValue || '').join('')}`; - case 'code': - return `
${node.value}
`;
- case 'inlineCode':
- return `${node.value}
`;
- case 'thematicBreak':
- return ''; - default: - return ''; - } -} - - - return processNode(tree); -} - -async function flattenMdastNodesRecursive(tree) { - async function flattenNode(node) { - // Base case: if no node or no children, return the node as is - if (!node || !node.children) { - return node; - } - - // Recursively flatten all children first - const flattenedChildren = await Promise.all( - node.children.map(child => flattenNode(child)) - ); - - // Update node's children with flattened results - node.children = flattenedChildren; - - // Handle the special case of Paragraph with single Image - if ( - node.type === 'Paragraph' && - Array.isArray(node.children) && - node.children.length === 1 && - node.children[0].type === 'Image' - ) { - return node.children[0]; + // Helper function to generate HTML for each node type + function generateHtml(node) { + switch (node.type.toLowerCase()) { + case 'paragraph': + return `
${node.children?.map((child) => child.htmlValue || '').join('')}
`; + case 'heading': + return `${node.children?.map((child) => child.htmlValue || '').join('')}`; + case 'code': + return `
${node.value}
`;
+ case 'inlineCode':
+ return `${node.value}
`;
+ case 'thematicBreak':
+ return ''; + default: + return ''; } - - return node; } - return flattenNode(tree); + return processNode(tree); } function markdownToHtmlTable(markdownTable) { @@ -173,7 +140,7 @@ app.get('/convert', async (req, res) => { const { markdownPath, warnings, outputDir } = await wordToMarkdown(filePath); - // Then read and process the Markdown + // Then read and process the Markdown const markdown = readFileSync(markdownPath, 'utf-8'); const mdast = fromMarkdown(markdown); const md = readFileSync(markdownPath, 'utf-8'); @@ -188,13 +155,11 @@ app.get('/convert', async (req, res) => { element.htmlValue = html; }); - // Flatten images const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir); - const flatten = await flattenMdastNodesRecursive(enhanced); // Return the processed content along with conversion info res.json({ - content: flatten.children, + content: enhanced.children, outputDirectory: outputDir, warnings: warnings, }); @@ -240,11 +205,9 @@ app.get('/html-convert', async (req, res) => { }); const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir); - const flatten = await flattenMdastNodesRecursive(enhanced); - // Return the processed content along with conversion info res.json({ - content: flatten.children, + content: enhanced.children, outputDirectory: outputDir, warnings: warnings, }); @@ -260,6 +223,54 @@ app.get('/html-convert', async (req, res) => { } }); +app.get('/pdf-convert', async (req, res) => { + const filePath = req.query.path; + + if (!filePath) { + return res.status(400).json({ + error: "Please provide a URLas 'path' query parameter", + }); + } + + try { + // First convert Word to Markdown + const { markdownPath, outputDir } = await pdfToMarkdown(filePath); + + // Then read and process the Markdown + const markdown = readFileSync(markdownPath, 'utf-8'); + const mdast = fromMarkdown(markdown); + + const md = readFileSync(markdownPath, 'utf-8'); + const ast = parse(md); + + mdast.children.forEach(async (element, index) => { + const hast = toHast(element, { allowDangerousHtml: true }); + const html = toHtml(hast, { allowDangerousHtml: true }); + element.type = ast.children[index].type; + element.raw = ast.children[index].raw; + element.htmlValue = html; + }); + + const enhanced = await enhanceMdastNodesRecursive(mdast, outputDir); + + // Return the processed content along with conversion info + res.json({ + content: enhanced.children, + outputDirectory: outputDir, + // warnings: warnings, + }); + } catch (error) { + if (error.code === 'ENOENT') { + res.status(404).json({ error: `File not found: ${filePath}` }); + } else { + res.status(500).json({ + error: 'Error processing document', + details: error.message, + }); + } + } +}); + app.listen(PORT, () => { console.log(`Server running on http://localhost:${PORT}`); }); diff --git a/apps/converter/package-lock.json b/apps/converter/package-lock.json index a8534d2e9..fbc3bb1b4 100644 --- a/apps/converter/package-lock.json +++ b/apps/converter/package-lock.json @@ -9,26 +9,115 @@ "version": "1.0.0", "license": "ISC", "dependencies": { + "@extractus/article-extractor": "^8.0.16", "@textlint/markdown-to-ast": "^14.3.0", + "dotenv": "^16.4.7", "express": "^4.21.1", "fs-extra": "^11.2.0", "hast-util-to-html": "^9.0.3", "image-type": "^5.2.0", "jsdom": "^25.0.1", + "langchain": "^0.3.6", "mammoth": "^1.8.0", "mdast-util-from-markdown": "^2.0.2", "mdast-util-to-hast": "^13.2.0", "node-fetch": "^3.3.2", + "openai": "^4.76.1", + "pdf-parse": "github:iamh2o/pdf-parse#1.1.3", + "pdf2json": "^3.1.4", "sanitize-filename": "^1.6.3", "turndown": "^7.2.0", "unist-util-visit": "^5.0.0" } }, + "node_modules/@cfworker/json-schema": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.0.3.tgz", + "integrity": "sha512-ZykIcDTVv5UNmKWSTLAs3VukO6NDJkkSKxrgUTDPBkAlORVT3H9n5DbRjRl8xIotklscHdbLIa0b9+y3mQq73g==", + "peer": true + }, + "node_modules/@extractus/article-extractor": { + "version": "8.0.16", + "resolved": "https://registry.npmjs.org/@extractus/article-extractor/-/article-extractor-8.0.16.tgz", + "integrity": "sha512-amxCKO2uerY0UPxDVSoTDdcTny0otpKsAIGC2q2CUDEhUX6EfxmpURttlKLx9uWFT9DRlNX9LSyMSP/2p7kFLg==", + "dependencies": { + "@mozilla/readability": "^0.5.0", + "bellajs": "^11.2.0", + "cross-fetch": "^4.0.0", + "linkedom": "^0.18.5", + "sanitize-html": "2.13.1" + }, + "engines": { + "node": ">= 18" + } + }, + "node_modules/@langchain/core": { + "version": "0.3.23", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.23.tgz", + "integrity": "sha512-Aut43dEJYH/ibccSErFOLQzymkBG4emlN16P0OHWwx02bDosOR9ilZly4JJiCSYcprn2X2H8nee6P/4VMg1oQA==", + "peer": true, + "dependencies": { + "@cfworker/json-schema": "^4.0.2", + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.2.8", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/openai": { + "version": "0.3.14", + "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.3.14.tgz", + "integrity": "sha512-lNWjUo1tbvsss45IF7UQtMu1NJ6oUKvhgPYWXnX9f/d6OmuLu7D99HQ3Y88vLcUo9XjjOy417olYHignMduMjA==", + "dependencies": { + "js-tiktoken": "^1.0.12", + "openai": "^4.71.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.26 <0.4.0" + } + }, + "node_modules/@langchain/textsplitters": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.1.0.tgz", + "integrity": "sha512-djI4uw9rlkAb5iMhtLED+xJebDdAG935AdP4eRTB02R7OB/act55Bj9wsskhZsvuyQRpO4O1wQOp85s6T6GWmw==", + "dependencies": { + "js-tiktoken": "^1.0.12" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.21 <0.4.0" + } + }, "node_modules/@mixmark-io/domino": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz", "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==" }, + "node_modules/@mozilla/readability": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz", + "integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@textlint/ast-node-types": { "version": "14.3.0", "resolved": "https://registry.npmjs.org/@textlint/ast-node-types/-/ast-node-types-14.3.0.tgz", @@ -84,11 +173,38 @@ "resolved": "https://registry.npmjs.org/@types/ms/-/ms-0.7.34.tgz", "integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g==" }, + "node_modules/@types/node": { + "version": "18.19.67", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.67.tgz", + "integrity": "sha512-wI8uHusga+0ZugNp0Ol/3BqQfEcCCNfojtO6Oou9iVNGPTL6QNSdnUdqq85fRgIorLhLMuPIKpsN98QE9Nh+KQ==", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.12", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.12.tgz", + "integrity": "sha512-8nneRWKCg3rMtF69nLQJnOYUcbafYeFSjqkw3jCRLsqkWFlHaoQrr5mXmofFGOx3DKn7UfmBMyov8ySvLRVldA==", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/@types/retry": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz", + "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==" + }, "node_modules/@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==" }, + "node_modules/@types/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==" + }, "node_modules/@ungap/structured-clone": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.2.0.tgz", @@ -102,6 +218,17 @@ "node": ">=10.0.0" } }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, "node_modules/accepts": { "version": "1.3.8", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", @@ -122,6 +249,29 @@ "node": ">= 14" } }, + "node_modules/agentkeepalive": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, + "node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "peer": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, "node_modules/argparse": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", @@ -168,6 +318,14 @@ } ] }, + "node_modules/bellajs": { + "version": "11.2.0", + "resolved": "https://registry.npmjs.org/bellajs/-/bellajs-11.2.0.tgz", + "integrity": "sha512-Wjss+Bc674ZABPr+SCKWTqA4V1pyYFhzDTjNBJy4jdmgOv0oGIGXeKBRJyINwP5tIy+iIZD9SfgZpztduzQ5QA==", + "engines": { + "node": ">= 18.4" + } + }, "node_modules/bluebird": { "version": "3.4.7", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", @@ -209,6 +367,11 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" + }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -235,6 +398,18 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/camelcase": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", + "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", + "peer": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/ccount": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", @@ -300,6 +475,14 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/commander": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-10.0.1.tgz", + "integrity": "sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug==", + "engines": { + "node": ">=14" + } + }, "node_modules/content-disposition": { "version": "0.5.4", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", @@ -337,6 +520,83 @@ "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==" }, + "node_modules/cross-fetch": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-4.0.0.tgz", + "integrity": "sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==", + "dependencies": { + "node-fetch": "^2.6.12" + } + }, + "node_modules/cross-fetch/node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/cross-fetch/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, + "node_modules/cross-fetch/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, + "node_modules/cross-fetch/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/cssom": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz", + "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==" + }, "node_modules/cssstyle": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.1.0.tgz", @@ -384,6 +644,15 @@ } } }, + "node_modules/decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==", + "peer": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/decimal.js": { "version": "10.4.3", "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz", @@ -401,6 +670,14 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/deepmerge": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", + "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/define-data-property": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", @@ -467,6 +744,68 @@ "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==" }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ] + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz", + "integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/dotenv": { + "version": "16.4.7", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.7.tgz", + "integrity": "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/duck": { "version": "0.1.12", "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", @@ -542,6 +881,19 @@ "node": ">= 0.6" } }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "engines": { + "node": ">=6" + } + }, + "node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==" + }, "node_modules/express": { "version": "4.21.1", "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz", @@ -694,6 +1046,11 @@ "node": ">= 6" } }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==" + }, "node_modules/format": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", @@ -702,6 +1059,26 @@ "node": ">=0.4.x" } }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + "node_modules/formdata-node/node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "engines": { + "node": ">= 14" + } + }, "node_modules/formdata-polyfill": { "version": "4.0.10", "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", @@ -873,6 +1250,11 @@ "node": ">=18" } }, + "node_modules/html-escaper": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz", + "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==" + }, "node_modules/html-void-elements": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/html-void-elements/-/html-void-elements-3.0.0.tgz", @@ -882,6 +1264,24 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/htmlparser2": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "entities": "^4.5.0" + } + }, "node_modules/http-errors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", @@ -921,6 +1321,14 @@ "node": ">= 14" } }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "dependencies": { + "ms": "^2.0.0" + } + }, "node_modules/iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -1053,6 +1461,14 @@ "node": ">=8" } }, + "node_modules/is-plain-object": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz", + "integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-potential-custom-element-name": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", @@ -1063,6 +1479,30 @@ "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" }, + "node_modules/js-tiktoken": { + "version": "1.0.15", + "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.15.tgz", + "integrity": "sha512-65ruOWWXDEZHHbAo7EjOcNxOGasQKbL4Fq3jEr2xsCqSsoOo6VVSqzWQb6PRIqypFSDcma4jO90YP0w5X8qVXQ==", + "dependencies": { + "base64-js": "^1.5.1" + } + }, + "node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/js-yaml/node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" + }, "node_modules/jsdom": { "version": "25.0.1", "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-25.0.1.tgz", @@ -1113,6 +1553,14 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jsonpointer": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/jsonpointer/-/jsonpointer-5.0.1.tgz", + "integrity": "sha512-p/nXbhSEcu3pZRdkW1OfJhpsVtW1gd4Wa1fnQc9YLiTfAjn0312eMKimbdIQzuZl9aa9xUGaRlP9T/CJE/ditQ==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/jszip": { "version": "3.10.1", "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", @@ -1124,6 +1572,106 @@ "setimmediate": "^1.0.5" } }, + "node_modules/langchain": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.3.6.tgz", + "integrity": "sha512-erZOIKXzwCOrQHqY9AyjkQmaX62zUap1Sigw1KrwMUOnVoLKkVNRmAyxFlNZDZ9jLs/58MaQcaT9ReJtbj3x6w==", + "dependencies": { + "@langchain/openai": ">=0.1.0 <0.4.0", + "@langchain/textsplitters": ">=0.0.0 <0.2.0", + "js-tiktoken": "^1.0.12", + "js-yaml": "^4.1.0", + "jsonpointer": "^5.0.1", + "langsmith": "^0.2.0", + "openapi-types": "^12.1.3", + "p-retry": "4", + "uuid": "^10.0.0", + "yaml": "^2.2.1", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/anthropic": "*", + "@langchain/aws": "*", + "@langchain/cohere": "*", + "@langchain/core": ">=0.2.21 <0.4.0", + "@langchain/google-genai": "*", + "@langchain/google-vertexai": "*", + "@langchain/groq": "*", + "@langchain/mistralai": "*", + "@langchain/ollama": "*", + "axios": "*", + "cheerio": "*", + "handlebars": "^4.7.8", + "peggy": "^3.0.2", + "typeorm": "*" + }, + "peerDependenciesMeta": { + "@langchain/anthropic": { + "optional": true + }, + "@langchain/aws": { + "optional": true + }, + "@langchain/cohere": { + "optional": true + }, + "@langchain/google-genai": { + "optional": true + }, + "@langchain/google-vertexai": { + "optional": true + }, + "@langchain/groq": { + "optional": true + }, + "@langchain/mistralai": { + "optional": true + }, + "@langchain/ollama": { + "optional": true + }, + "axios": { + "optional": true + }, + "cheerio": { + "optional": true + }, + "handlebars": { + "optional": true + }, + "peggy": { + "optional": true + }, + "typeorm": { + "optional": true + } + } + }, + "node_modules/langsmith": { + "version": "0.2.11", + "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.2.11.tgz", + "integrity": "sha512-rVPUN/jQEHjTuYaoVKGjfb3NsYNLGTQT9LXcgJvka5M0EDcXciC598A+DsAQrl6McdfSJCFJDelgRPqVoF2xNA==", + "dependencies": { + "@types/uuid": "^10.0.0", + "commander": "^10.0.1", + "p-queue": "^6.6.2", + "p-retry": "4", + "semver": "^7.6.3", + "uuid": "^10.0.0" + }, + "peerDependencies": { + "openai": "*" + }, + "peerDependenciesMeta": { + "openai": { + "optional": true + } + } + }, "node_modules/lie": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", @@ -1132,6 +1680,18 @@ "immediate": "~3.0.5" } }, + "node_modules/linkedom": { + "version": "0.18.5", + "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.5.tgz", + "integrity": "sha512-JGLaGGtqtu+eOhYrC1wkWYTBcpVWL4AsnwAtMtgO1Q0gI0PuPJKI0zBBE+a/1BrhOE3Uw8JI/ycByAv5cLrAuQ==", + "dependencies": { + "css-select": "^5.1.0", + "cssom": "^0.5.0", + "html-escaper": "^3.0.3", + "htmlparser2": "^9.1.0", + "uhyphen": "^0.2.0" + } + }, "node_modules/longest-streak": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-2.0.4.tgz", @@ -2126,6 +2686,32 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" }, + "node_modules/mustache": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/mustache/-/mustache-4.2.0.tgz", + "integrity": "sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==", + "peer": true, + "bin": { + "mustache": "bin/mustache" + } + }, + "node_modules/nanoid": { + "version": "3.3.8", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz", + "integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, "node_modules/negotiator": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", @@ -2160,6 +2746,11 @@ "node": ">=10.5.0" } }, + "node_modules/node-ensure": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz", + "integrity": "sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==" + }, "node_modules/node-fetch": { "version": "3.3.2", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", @@ -2177,6 +2768,17 @@ "url": "https://opencollective.com/node-fetch" } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/nwsapi": { "version": "2.2.16", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.16.tgz", @@ -2204,11 +2806,125 @@ "node": ">= 0.8" } }, + "node_modules/openai": { + "version": "4.76.1", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.76.1.tgz", + "integrity": "sha512-ci63/WFEMd6QjjEVeH0pV7hnFS6CCqhgJydSti4Aak/8uo2SpgzKjteUDaY+OkwziVj11mi6j+0mRUIiGKUzWw==", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7" + }, + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/openai/node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/openai/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, + "node_modules/openai/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, + "node_modules/openai/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, + "node_modules/openapi-types": { + "version": "12.1.3", + "resolved": "https://registry.npmjs.org/openapi-types/-/openapi-types-12.1.3.tgz", + "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==" + }, "node_modules/option": { "version": "0.2.4", "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==" }, + "node_modules/p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": "sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", + "engines": { + "node": ">=4" + } + }, + "node_modules/p-queue": { + "version": "6.6.2", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-6.6.2.tgz", + "integrity": "sha512-RwFpb72c/BhQLEXIZ5K2e+AhgNVmIejGlTgiB9MzZ0e93GRvqZ7uSi0dvRF7/XIXDeNkra2fNHBxTyPDGySpjQ==", + "dependencies": { + "eventemitter3": "^4.0.4", + "p-timeout": "^3.2.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-retry": { + "version": "4.6.2", + "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz", + "integrity": "sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==", + "dependencies": { + "@types/retry": "0.12.0", + "retry": "^0.13.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/p-timeout": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-3.2.0.tgz", + "integrity": "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg==", + "dependencies": { + "p-finally": "^1.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/pako": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", @@ -2240,6 +2956,11 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/parse-srcset": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz", + "integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q==" + }, "node_modules/parse5": { "version": "7.2.1", "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz", @@ -2272,6 +2993,52 @@ "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz", "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==" }, + "node_modules/pdf-parse": { + "version": "1.1.3", + "resolved": "git+ssh://git@github.com/iamh2o/pdf-parse.git#d7a41d5aaed1503bee2d7ea50bf89588d3b2d2cf", + "license": "MIT", + "dependencies": { + "debug": "^3.1.0", + "node-ensure": "^0.0.0" + }, + "engines": { + "node": ">=6.8.1" + } + }, + "node_modules/pdf-parse/node_modules/debug": { + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", + "dependencies": { + "ms": "^2.1.1" + } + }, + "node_modules/pdf2json": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/pdf2json/-/pdf2json-3.1.4.tgz", + "integrity": "sha512-rS+VapXpXZr+5lUpHmRh3ugXdFXp24p1RyG24yP1DMpqP4t0mrYNGpLtpSbWD42PnQ59GIXofxF+yWb7M+3THg==", + "bundleDependencies": [ + "@xmldom/xmldom" + ], + "dependencies": { + "@xmldom/xmldom": "^0.8.10" + }, + "bin": { + "pdf2json": "bin/pdf2json.js" + }, + "engines": { + "node": ">=18.12.1", + "npm": ">=8.19.2" + } + }, + "node_modules/pdf2json/node_modules/@xmldom/xmldom": { + "version": "0.8.10", + "inBundle": true, + "license": "MIT", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/peek-readable": { "version": "5.3.1", "resolved": "https://registry.npmjs.org/peek-readable/-/peek-readable-5.3.1.tgz", @@ -2284,6 +3051,38 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==" + }, + "node_modules/postcss": { + "version": "8.4.49", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz", + "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, "node_modules/process-nextick-args": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", @@ -2529,6 +3328,14 @@ "node": ">=0.10" } }, + "node_modules/retry": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", + "integrity": "sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==", + "engines": { + "node": ">= 4" + } + }, "node_modules/rrweb-cssom": { "version": "0.7.1", "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.7.1.tgz", @@ -2566,6 +3373,37 @@ "truncate-utf8-bytes": "^1.0.0" } }, + "node_modules/sanitize-html": { + "version": "2.13.1", + "resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.13.1.tgz", + "integrity": "sha512-ZXtKq89oue4RP7abL9wp/9URJcqQNABB5GGJ2acW1sdO8JTVl92f4ygD7Yc9Ze09VAZhnt2zegeU0tbNsdcLYg==", + "dependencies": { + "deepmerge": "^4.2.2", + "escape-string-regexp": "^4.0.0", + "htmlparser2": "^8.0.0", + "is-plain-object": "^5.0.0", + "parse-srcset": "^1.0.2", + "postcss": "^8.3.11" + } + }, + "node_modules/sanitize-html/node_modules/htmlparser2": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", + "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1", + "entities": "^4.4.0" + } + }, "node_modules/saxes": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", @@ -2577,6 +3415,17 @@ "node": ">=v12.22.7" } }, + "node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/send": { "version": "0.19.0", "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", @@ -2678,6 +3527,14 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/space-separated-tokens": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz", @@ -2864,11 +3721,21 @@ "node": ">= 0.6" } }, + "node_modules/uhyphen": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz", + "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==" + }, "node_modules/underscore": { "version": "1.13.7", "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==" }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "node_modules/unified": { "version": "9.2.2", "resolved": "https://registry.npmjs.org/unified/-/unified-9.2.2.tgz", @@ -3055,6 +3922,18 @@ "node": ">= 0.4.0" } }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/vary": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", @@ -3199,6 +4078,33 @@ "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==" }, + "node_modules/yaml": { + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.6.1.tgz", + "integrity": "sha512-7r0XPzioN/Q9kXBro/XPnA6kznR73DHq+GXh5ON7ZozRO6aMjbmiBuKste2wslTFkC5d1dw0GooOCepZXJ2SAg==", + "bin": { + "yaml": "bin.mjs" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/zod": { + "version": "3.24.1", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.24.1.tgz", + "integrity": "sha512-muH7gBL9sI1nciMZV67X5fTKKBLtwpZ5VBp1vsOQzj1MhrBZ4wlVCm3gedKZWLp0Oyel8sIGfeiz54Su+OVT+A==", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } + }, + "node_modules/zod-to-json-schema": { + "version": "3.23.5", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.5.tgz", + "integrity": "sha512-5wlSS0bXfF/BrL4jPAbz9da5hDlDptdEppYfe+x4eIJ7jioqKG9uUxOwPzqof09u/XeVdrgFu29lZi+8XNDJtA==", + "peerDependencies": { + "zod": "^3.23.3" + } + }, "node_modules/zwitch": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz", diff --git a/apps/converter/package.json b/apps/converter/package.json index 7220d6e3f..f109e1fcc 100644 --- a/apps/converter/package.json +++ b/apps/converter/package.json @@ -14,15 +14,20 @@ "dependencies": { "@extractus/article-extractor": "^8.0.16", "@textlint/markdown-to-ast": "^14.3.0", + "dotenv": "^16.4.7", "express": "^4.21.1", "fs-extra": "^11.2.0", "hast-util-to-html": "^9.0.3", "image-type": "^5.2.0", "jsdom": "^25.0.1", "mammoth": "^1.8.0", + "markdownlint": "^0.37.1", "mdast-util-from-markdown": "^2.0.2", "mdast-util-to-hast": "^13.2.0", "node-fetch": "^3.3.2", + "openai": "^4.76.1", + "pdf-parse": "github:iamh2o/pdf-parse#1.1.3", + "pdf2json": "^3.1.4", "sanitize-filename": "^1.6.3", "turndown": "^7.2.0", "unist-util-visit": "^5.0.0" diff --git a/apps/converter/pdfToMarkdown.js b/apps/converter/pdfToMarkdown.js new file mode 100644 index 000000000..fcab436bb --- /dev/null +++ b/apps/converter/pdfToMarkdown.js @@ -0,0 +1,108 @@ +import crypto from 'crypto'; +import fs from 'fs'; +import path from 'path'; +import pdf from 'pdf-parse'; +import PDFParser from 'pdf2json'; + +export function generateFolderName(filePath) { + const fileContent = fs.readFileSync(filePath); + const hash = crypto.createHash('md5').update(fileContent).digest('hex'); + return hash.substring(0, 12); +} + +export async function pdfToMarkdown(pdfPath) { + try { + // Validate input file exists and is a PDF + if (!fs.existsSync(pdfPath) || !pdfPath.toLowerCase().endsWith('.pdf')) { + throw new Error('Invalid PDF file path'); + } + + // Generate output folder name + const folderName = generateFolderName(pdfPath); + const outputDir = path.join(path.dirname(pdfPath), folderName); + const imagesDir = path.join(outputDir, 'images'); + + // Create output directories + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir); + } + if (!fs.existsSync(imagesDir)) { + fs.mkdirSync(imagesDir); + } + + // Extract text content from all pages + const dataBuffer = fs.readFileSync(pdfPath); + const data = await pdf(dataBuffer); + // Combine text from all pages + const markdownContent = data.text; + + // Save markdown content + fs.writeFileSync(path.join(outputDir, 'content.md'), markdownContent); + + // Extract images + const pdfParser = new PDFParser(null, 1); // Added parameter to preserve images + + return new Promise((resolve, reject) => { + pdfParser.on('pdfParser_dataReady', (pdfData) => { + try { + // Extract and save images + let imageCount = 0; + + if (pdfData.Pages) { + pdfData.Pages.forEach((page, pageIndex) => { + console.log("🚀 ~ pdfData.Pages.forEach ~ page:1"); + // Handle both Images and Bg (background) images + const images = [...(page.Images || []), ...(page.Bg || [])]; + + images.forEach((image) => { + try { + // Check if image data exists and is valid + if (image.data) { + let imageBuffer; + + // Handle different image data formats + if (Buffer.isBuffer(image.data)) { + imageBuffer = image.data; + } else if (typeof image.data === 'string') { + imageBuffer = Buffer.from(image.data, 'base64'); + } else { + console.warn(`Skipping invalid image data at page ${pageIndex + 1}`); + return; + } + + const imagePath = path.join( + imagesDir, + `image_${pageIndex + 1}_${++imageCount}.png` + ); + + fs.writeFileSync(imagePath, imageBuffer); + } + } catch (imageError) { + console.warn(`Error processing image at page ${pageIndex + 1}:`, imageError); + } + }); + }); + } + + resolve({ + outputDir, + markdownPath: path.join(outputDir, 'content.md'), + imagesDir, + totalPages: pdfData.Pages.length, + totalImages: imageCount + }); + } catch (extractError) { + reject(extractError); + } + }); + + pdfParser.on('pdfParser_dataError', (error) => { + reject(error); + }); + + pdfParser.loadPDF(pdfPath); + }); + } catch (error) { + throw new Error(`PDF conversion failed: ${error.message}`); + } +} diff --git a/package.json b/package.json index 2dcbb5752..326825e5e 100644 --- a/package.json +++ b/package.json @@ -35,7 +35,8 @@ "resolutions": { "gatsby-plugin-sharp": "5.13.1", "sharp": "0.33.1", - "graphql": "16.8.1" + "graphql": "16.8.1", + "pdf-parse@1.1.1": "patch:pdf-parse@npm%3A1.1.1#./.yarn/patches/pdf-parse-npm-1.1.1-04a6109b2a.patch" }, "pnpm": { "patchedDependencies": {} diff --git a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install index f4a635f72..3e3acebf5 100644 --- a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install +++ b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.install @@ -2,5 +2,84 @@ /** * @file - * Install, update and uninstall functions for the silverback_ai_import module. + * Install, update and uninstall functions for the Silverback AI module. */ + +use Drupal\Core\Entity\EntityTypeInterface; + +/** + * Implements hook_schema(). + */ +function silverback_ai_import_schema() { + + $db_schema = \Drupal::database()->schema(); + if ($db_schema->tableExists('silverback_ai_import')) { + $db_schema->dropTable('silverback_ai_import'); + } + + $schema['silverback_ai_import'] = [ + 'description' => 'Import log for the Silverback AI import module.', + 'fields' => [ + 'id' => [ + 'type' => 'serial', + 'not null' => TRUE, + 'description' => 'Primary Key.', + ], + 'uid' => [ + 'description' => 'Foreign key to {users}.uid; uniquely identifies a Drupal user executed the ai fetch action.', + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => TRUE, + ], + 'timestamp' => [ + 'description' => 'Date/time if the import, as Unix timestamp.', + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => TRUE, + ], + 'target_entity_type_id' => [ + 'type' => 'varchar_ascii', + 'length' => EntityTypeInterface::ID_MAX_LENGTH, + 'not null' => FALSE, + 'default' => '', + 'description' => 'The ID of the associated entity type.', + ], + 'target_entity_id' => [ + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => FALSE, + 'description' => 'The ID of the associated entity.', + ], + 'target_entity_revision_id' => [ + 'type' => 'int', + 'unsigned' => TRUE, + 'not null' => FALSE, + 'description' => 'The revision ID of the associated entity.', + ], + 'source' => [ + 'type' => 'text', + 'not null' => TRUE, + 'size' => 'normal', + 'description' => 'The source of the import.', + ], + 'output_folder' => [ + 'type' => 'text', + 'not null' => TRUE, + 'size' => 'small', + 'description' => 'The name of the folder exported.', + ], + 'data' => [ + 'type' => 'blob', + 'not null' => TRUE, + 'size' => 'big', + 'description' => 'The import response *usually a text series of gutenberg formatted blocks.', + ], + 'primary key' => ['id'], + 'indexes' => [ + 'uid' => ['uid'], + 'timestamp' => ['timestamp'], + ], + ]; + + return $schema; +} diff --git a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module index 35e798d70..4c92de170 100644 --- a/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module +++ b/packages/drupal/silverback_ai/modules/silverback_ai_import/silverback_ai_import.module @@ -12,43 +12,6 @@ use Drupal\Core\Form\FormStateInterface; */ function silverback_ai_import_form_alter(&$form, &$form_state, $form_id) { // @todo - if ($form_id == 'views_exposed_form') { - $service = \Drupal::service('silverback_ai_import.content'); - $url_value = 'https://www.tcs.ch/de/camping-reisen/camping-insider/community/backen-mit-dem-omnia.php'; - $ast = $service->getAstFromUrl($url_value); - $flatten = $service->flattenAst($ast->content); - // dpm($flatten); - // $file = \Drupal::entityTypeManager()->getStorage('file')->load(11); - // $ast = $service->getAstFromFilePath($file); - $flatten = $service->flattenAst($ast->content); - // dpm($ast->content);. - foreach ($flatten as $el) { - // dpm($el); - } - // dpm($flatten); - $arr = json_decode(json_encode($ast), TRUE); - $data = $arr['content']; - /* $callback = function (&$item, $depth) { - $item['xxx'] = 'xxx'; - }; */ - - $service->iterateArray($data); - dpm($data); - // Example usage: - /* - foreach ($ast->content as $obj) { - dpm($obj); - $iterator = function ($key, $value) { - $times = (int) $key * 1 + 1; - $repeat = str_repeat(' ', (int) $times); - if (isset($value->type) && !empty($value->type)) { - // dpm($repeat . $value->type);. - } - }; - $service->iterateObject($obj, $iterator); - } - */ - } } /** @@ -135,16 +98,14 @@ function _silverback_ai_import_form_submit(array $form, FormStateInterface $form if ($fid && $type == 'docx') { $file = \Drupal::entityTypeManager()->getStorage('file')->load($fid); $ast = $service->getAstFromFilePath($file); - $content->create($ast->content, $entity); - - // $flatten = $service->flattenAst($ast->content); - // $content->create($flatten, $entity); + // $content->create($ast->content, $entity); + $flatten = $service->flattenAst($ast->content); + $content->create($flatten, $entity); } elseif (!empty($url_value) && $type == 'url') { $ast = $service->getAstFromUrl($url_value); - $content->create($ast->content, $entity); - - // $flatten = $service->flattenAst($ast->content); - // $content->create($flatten, $entity); + // $content->create($ast->content, $entity); + $flatten = $service->flattenAst($ast->content); + $content->create($flatten, $entity); } } diff --git a/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php b/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php index e2d2bd35e..84bcd1704 100644 --- a/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php +++ b/packages/drupal/silverback_ai/modules/silverback_ai_import/src/ContentImportAiService.php @@ -274,7 +274,22 @@ public function getPlugin(array $chunk) { } /** + * Flattens a hierarchical AST (Abstract Syntax Tree) into a linear array of nodes. * + * This function converts a nested AST structure into a flat array where each node + * is assigned a unique ID and maintains a reference to its parent. It processes + * specific node types differently and handles recursive traversal of child nodes. + * + * @param array|null $ast + * The AST structure to flatten. + * @param int|null $parent + * The ID of the parent node (used in recursion) + * + * @return array An array of flattened nodes, where each node contains: + * - type: The capitalized node type + * - id: A unique identifier + * - parent: Reference to the parent node's ID + * - Additional properties specific to each node type */ public function flattenAst($ast, $parent = NULL) { @@ -287,18 +302,25 @@ public function flattenAst($ast, $parent = NULL) { } foreach ($ast as $chunk) { - if (isset($chunk['type']) && in_array($chunk['type'], [ 'Strong', 'Text', 'ListItem', 'Emphasis', - 'Link', ])) { continue; } + if (isset($chunk['type']) + && $chunk['type'] == 'Link' + && isset($chunk['children']) + && count($chunk['children']) == 1 + && $chunk['children'][0]['type'] !== 'Image' + ) { + continue; + } + $children = $chunk['children'] ?? []; // unset($chunk['children']); // Chunk preprocessing. @@ -323,13 +345,106 @@ public function iterateArray(array &$data, int $depth = 0): void { foreach ($data as &$item) { // Process item here. if (isset($item['type'])) { - $item['gutenberg'] = $this->processChunk($item); + if ($item['type'] == 'Image') { + $item['gutenberg'] = $this->processChunk($item); + } } - if (isset($item['children']) && is_array($item['children'])) { $this->iterateArray($item['children'], $depth + 1); } } + + } + + /** + * Extracts various metadata from a given URL by fetching and parsing its HTML content. + * + * This function attempts to retrieve the HTML content of a URL and extract key information + * including title, path, meta tags, and language settings. It includes error handling for + * various failure scenarios. + * + * @param string $url + * The URL to extract data from. + * + * @return array An associative array containing: + * - title: string|null The page title if found + * - path: string The URL path component, defaults to "/" if not found + * - metatags: array Meta tag name-content pairs + * - language: string|null The page language if specified + * - error: string|null Error message if any error occurred, null otherwise + * + * @throws \Exception Caught internally and returned as error in result array + */ + public function extractPageDataFromUrl($url) { + $data = [ + 'title' => NULL, + 'path' => NULL, + 'metatags' => [], + 'language' => NULL, + // Add an error key. + 'error' => NULL, + ]; + + // Validate URL. + if (!filter_var($url, FILTER_VALIDATE_URL)) { + $data['error'] = "Invalid URL"; + return $data; + } + + try { + // Use file_get_contents with a user agent to avoid being blocked by some servers. + $options = [ + 'http' => [ + 'method' => 'GET', + // Example user agent. + 'user_agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', + // Timeout in seconds. + 'timeout' => 10, + ], + ]; + $context = stream_context_create($options); + // Use @ to suppress warnings for invalid URLs or network issues. + $html = @file_get_contents($url, FALSE, $context); + + if ($html === FALSE) { + $error = error_get_last(); + $data['error'] = "Failed to fetch URL: " . ($error ? $error['message'] : "Unknown error"); + return $data; + } + + // Extract Title. + if (preg_match('/