From 1e0c75d4da595732651a51c1568b0b3f76af5b48 Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 2 Dec 2024 01:50:00 +0000 Subject: [PATCH 01/12] Install rehype-stringify --- apps/app/package.json | 1 + pnpm-lock.yaml | 106 +++++++++++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 38 deletions(-) diff --git a/apps/app/package.json b/apps/app/package.json index ffbd501e4dd..44ddf9ec064 100644 --- a/apps/app/package.json +++ b/apps/app/package.json @@ -200,6 +200,7 @@ "rehype-raw": "^7.0.0", "rehype-sanitize": "^6.0.0", "rehype-slug": "^6.0.0", + "rehype-stringify": "^10.0.1", "rehype-toc": "^3.0.2", "remark-breaks": "^4.0.0", "remark-directive": "^3.0.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 77af8f0cea0..6f96108f05e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -606,6 +606,9 @@ importers: rehype-slug: specifier: ^6.0.0 version: 6.0.0 + rehype-stringify: + specifier: ^10.0.1 + version: 10.0.1 rehype-toc: specifier: ^3.0.2 version: 3.0.2 @@ -7292,6 +7295,9 @@ packages: hast-util-select@6.0.2: resolution: {integrity: sha512-hT/SD/d/Meu+iobvgkffo1QecV8WeKWxwsNMzcTJsKw1cKTQKSR/7ArJeURLNJF9HDjp9nVoORyNNJxrvBye8Q==} + hast-util-to-html@9.0.3: + resolution: {integrity: sha512-M17uBDzMJ9RPCqLMO92gNNUDuBSq10a25SDBI08iCCxmorf4Yy6sYHK57n9WAbRAAaU+DuR4W6GN9K4DFZesYg==} + hast-util-to-jsx-runtime@2.3.0: resolution: {integrity: sha512-H/y0+IWPdsLLS738P8tDnrQ8Z+dj12zQQ6WC11TIM21C8WFVoIxcqWXf2H3hiTVZjF1AWqoimGwrTWecWrnmRQ==} @@ -10229,6 +10235,9 @@ packages: rehype-slug@6.0.0: resolution: {integrity: sha512-lWyvf/jwu+oS5+hL5eClVd3hNdmwM1kAC0BUvEGD19pajQMIzcNUd/k9GsfQ+FfECvX+JE+e9/btsKH0EjJT6A==} + rehype-stringify@10.0.1: + resolution: {integrity: sha512-k9ecfXHmIPuFVI61B9DeLPN0qFHfawM6RsuX48hoqlaKSF61RskNjSm1lI8PhBEM0MRdLxVVm4WmTqJQccH9mA==} + rehype-toc@3.0.2: resolution: {integrity: sha512-DMt376+4i1KJGgHJL7Ezd65qKkJ7Eqp6JSB47BJ90ReBrohI9ufrornArM6f4oJjP2E2DVZZHufWucv/9t7GUQ==} engines: {node: '>=10'} @@ -12268,6 +12277,9 @@ packages: zwitch@2.0.2: resolution: {integrity: sha512-JZxotl7SxAJH0j7dN4pxsTV6ZLXoLdGME+PsjkL/DaBrVryK9kTGq06GfKrwcSOqypP+fdXGoCHE36b99fWVoA==} + zwitch@2.0.4: + resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==} + snapshots: '@adobe/css-tools@4.4.0': {} @@ -13389,7 +13401,7 @@ snapshots: '@babel/traverse': 7.24.6 '@babel/types': 7.25.6 convert-source-map: 2.0.0 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -13580,7 +13592,7 @@ snapshots: '@babel/helper-split-export-declaration': 7.24.6 '@babel/parser': 7.25.6 '@babel/types': 7.25.6 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -14085,7 +14097,7 @@ snapshots: '@elastic/elasticsearch@7.17.13': dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) hpagent: 0.1.2 ms: 2.1.3 secure-json-parse: 2.7.0 @@ -14101,7 +14113,7 @@ snapshots: '@elastic/transport@8.6.1': dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) hpagent: 1.2.0 ms: 2.1.3 secure-json-parse: 2.7.0 @@ -14212,7 +14224,7 @@ snapshots: '@eslint/eslintrc@2.0.3': dependencies: ajv: 6.12.6 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) espree: 9.6.1 globals: 13.24.0 ignore: 5.3.1 @@ -14285,7 +14297,7 @@ snapshots: '@humanwhocodes/config-array@0.11.8': dependencies: '@humanwhocodes/object-schema': 1.2.1 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) minimatch: 3.1.2 transitivePeerDependencies: - supports-color @@ -14301,7 +14313,7 @@ snapshots: '@antfu/install-pkg': 0.4.1 '@antfu/utils': 0.7.10 '@iconify/types': 2.0.0 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) kolorist: 1.8.0 local-pkg: 0.5.0 mlly: 1.7.1 @@ -15760,7 +15772,7 @@ snapshots: '@swc-node/sourcemap-support': 0.5.0 '@swc/core': 1.5.25(@swc/helpers@0.5.11) colorette: 2.0.20 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) pirates: 4.0.6 tslib: 2.8.0 typescript: 5.4.2 @@ -16361,7 +16373,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.59.7 '@typescript-eslint/type-utils': 5.59.7(eslint@8.41.0)(typescript@5.4.2) '@typescript-eslint/utils': 5.59.7(eslint@8.41.0)(typescript@5.4.2) - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) eslint: 8.41.0 grapheme-splitter: 1.0.4 ignore: 5.3.1 @@ -16391,7 +16403,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.59.7 '@typescript-eslint/types': 5.59.7 '@typescript-eslint/typescript-estree': 5.59.7(typescript@5.4.2) - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) eslint: 8.41.0 optionalDependencies: typescript: 5.4.2 @@ -16420,7 +16432,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 5.59.7(typescript@5.4.2) '@typescript-eslint/utils': 5.59.7(eslint@8.41.0)(typescript@5.4.2) - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) eslint: 8.41.0 tsutils: 3.21.0(typescript@5.4.2) optionalDependencies: @@ -16449,7 +16461,7 @@ snapshots: dependencies: '@typescript-eslint/types': 5.59.7 '@typescript-eslint/visitor-keys': 5.59.7 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) globby: 11.1.0 is-glob: 4.0.3 semver: 7.6.3 @@ -16835,13 +16847,13 @@ snapshots: agent-base@6.0.2: dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color agent-base@7.1.1: dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -17915,7 +17927,7 @@ snapshots: connect-mongo@4.6.0(express-session@1.18.0)(mongodb@4.17.2(@aws-sdk/client-sso-oidc@3.600.0)): dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) express-session: 1.18.0 kruptein: 3.0.6 mongodb: 4.17.2(@aws-sdk/client-sso-oidc@3.600.0) @@ -18395,10 +18407,6 @@ snapshots: dependencies: ms: 2.1.3 - debug@4.3.7: - dependencies: - ms: 2.1.3 - debug@4.3.7(supports-color@5.5.0): dependencies: ms: 2.1.3 @@ -18693,7 +18701,7 @@ snapshots: engine.io-client@6.6.2: dependencies: '@socket.io/component-emitter': 3.1.2 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) engine.io-parser: 5.2.3 ws: 8.17.1 xmlhttprequest-ssl: 2.1.2 @@ -18713,7 +18721,7 @@ snapshots: base64id: 2.0.0 cookie: 0.7.2 cors: 2.8.5 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) engine.io-parser: 5.2.3 ws: 8.17.1 transitivePeerDependencies: @@ -19159,7 +19167,7 @@ snapshots: ajv: 6.12.6 chalk: 4.1.2 cross-spawn: 7.0.3 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) doctrine: 3.0.0 escape-string-regexp: 4.0.0 eslint-scope: 7.2.0 @@ -19499,7 +19507,7 @@ snapshots: follow-redirects@1.15.9(debug@4.3.7): optionalDependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) follow-redirects@1.5.10: dependencies: @@ -20005,6 +20013,20 @@ snapshots: unist-util-visit: 5.0.0 zwitch: 2.0.2 + hast-util-to-html@9.0.3: + dependencies: + '@types/hast': 3.0.4 + '@types/unist': 3.0.3 + ccount: 2.0.1 + comma-separated-tokens: 2.0.2 + hast-util-whitespace: 3.0.0 + html-void-elements: 3.0.0 + mdast-util-to-hast: 13.2.0 + property-information: 6.1.1 + space-separated-tokens: 2.0.1 + stringify-entities: 4.0.4 + zwitch: 2.0.4 + hast-util-to-jsx-runtime@2.3.0: dependencies: '@types/estree': 1.0.6 @@ -20147,14 +20169,14 @@ snapshots: dependencies: '@tootallnate/once': 2.0.0 agent-base: 6.0.2 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -20177,14 +20199,14 @@ snapshots: https-proxy-agent@5.0.1: dependencies: agent-base: 6.0.2 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color https-proxy-agent@7.0.5: dependencies: agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -20543,7 +20565,7 @@ snapshots: istanbul-lib-source-maps@4.0.1: dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) istanbul-lib-coverage: 3.2.2 source-map: 0.6.1 transitivePeerDependencies: @@ -21942,7 +21964,7 @@ snapshots: micromark@4.0.0: dependencies: '@types/debug': 4.1.7 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) decode-named-character-reference: 1.0.2 devlop: 1.1.0 micromark-core-commonmark: 2.0.1 @@ -22105,7 +22127,7 @@ snapshots: dependencies: async-mutex: 0.4.1 camelcase: 6.3.0 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) find-cache-dir: 3.3.2 follow-redirects: 1.15.9(debug@4.3.7) https-proxy-agent: 7.0.5 @@ -22213,7 +22235,7 @@ snapshots: mquery@4.0.3: dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -22303,7 +22325,7 @@ snapshots: new-find-package-json@2.0.0: dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -22834,7 +22856,7 @@ snapshots: passport-saml@3.2.4: dependencies: '@xmldom/xmldom': 0.7.13 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) passport-strategy: 1.0.0 xml-crypto: 2.1.5 xml-encryption: 2.0.0 @@ -23640,6 +23662,12 @@ snapshots: hast-util-to-string: 3.0.1 unist-util-visit: 5.0.0 + rehype-stringify@10.0.1: + dependencies: + '@types/hast': 3.0.4 + hast-util-to-html: 9.0.3 + unified: 11.0.5 + rehype-toc@3.0.2: dependencies: '@jsdevtools/rehype-toc': 3.0.2 @@ -23821,7 +23849,7 @@ snapshots: retry-request@4.2.2: dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) extend: 3.0.2 transitivePeerDependencies: - supports-color @@ -24211,7 +24239,7 @@ snapshots: socket.io-adapter@2.5.5: dependencies: - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) ws: 8.17.1 transitivePeerDependencies: - bufferutil @@ -24221,7 +24249,7 @@ snapshots: socket.io-client@4.8.1: dependencies: '@socket.io/component-emitter': 3.1.2 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) engine.io-client: 6.6.2 socket.io-parser: 4.2.4 transitivePeerDependencies: @@ -24232,7 +24260,7 @@ snapshots: socket.io-parser@4.2.4: dependencies: '@socket.io/component-emitter': 3.1.2 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -24241,7 +24269,7 @@ snapshots: accepts: 1.3.8 base64id: 2.0.0 cors: 2.8.5 - debug: 4.3.7 + debug: 4.3.7(supports-color@5.5.0) engine.io: 6.6.2 socket.io-adapter: 2.5.5 socket.io-parser: 4.2.4 @@ -26037,3 +26065,5 @@ snapshots: zwitch@1.0.5: {} zwitch@2.0.2: {} + + zwitch@2.0.4: {} From 95eb6ec6ccb5e079b4db68abf4a315fa663724d7 Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 2 Dec 2024 01:50:38 +0000 Subject: [PATCH 02/12] Install rehype-meta --- apps/app/package.json | 1 + pnpm-lock.yaml | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/apps/app/package.json b/apps/app/package.json index 44ddf9ec064..592d97fd22c 100644 --- a/apps/app/package.json +++ b/apps/app/package.json @@ -197,6 +197,7 @@ "reconnecting-websocket": "^4.4.0", "redis": "^3.0.2", "rehype-katex": "^7.0.1", + "rehype-meta": "^4.0.1", "rehype-raw": "^7.0.0", "rehype-sanitize": "^6.0.0", "rehype-slug": "^6.0.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6f96108f05e..40507391d88 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -597,6 +597,9 @@ importers: rehype-katex: specifier: ^7.0.1 version: 7.0.1 + rehype-meta: + specifier: ^4.0.1 + version: 4.0.1 rehype-raw: specifier: ^7.0.0 version: 7.0.0 @@ -7271,6 +7274,9 @@ packages: hast-util-from-parse5@8.0.1: resolution: {integrity: sha512-Er/Iixbc7IEa7r/XLtuG52zoqn/b3Xng/w6aZQ0xGVxzhw5xUFxcRqdPzP6yFi/4HBYRaifaI5fQ1RH8n0ZeOQ==} + hast-util-from-selector@3.0.1: + resolution: {integrity: sha512-CA2dwcsAS6a7DNZq8HT5fNP4FzUq2PUpQpKnAtOCmfTk429jR0RtasLSMlFA1FNKd8lgfeCIAFl3/vD95be8Lg==} + hast-util-has-property@3.0.0: resolution: {integrity: sha512-MNilsvEKLFpV604hwfhVStK0usFY/QmM5zX16bo7EjnAEGofr5YyI37kzopBlZJkHD4t887i+q/C8/tr5Q94cA==} @@ -7319,6 +7325,9 @@ packages: hastscript@8.0.0: resolution: {integrity: sha512-dMOtzCEd3ABUeSIISmrETiKuyydk1w0pa+gE/uormcTpSYuaNJPbX1NU3JLyscSLjwAQM8bWMhhIlnCqnRvDTw==} + hastscript@9.0.0: + resolution: {integrity: sha512-jzaLBGavEDKHrc5EfFImKN7nZKKBdSLIdGvCwDZ9TfzbF2ffXiov8CKE445L2Z1Ek2t/m4SKQ2j6Ipv7NyUolw==} + he@1.2.0: resolution: {integrity: sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==} hasBin: true @@ -10222,6 +10231,9 @@ packages: rehype-katex@7.0.1: resolution: {integrity: sha512-OiM2wrZ/wuhKkigASodFoo8wimG3H12LWQaH8qSPVJn9apWKFSH3YOCtbKpBorTVw/eI7cuT21XBbvwEswbIOA==} + rehype-meta@4.0.1: + resolution: {integrity: sha512-nLwA17+GbtBYi3C1KSrFR8JlqXv76mz185U//xDEAYgzE3g/bSD6WKSXva1W95ttzouUCJwA09X3AQZIi3R+Nw==} + rehype-raw@7.0.0: resolution: {integrity: sha512-/aE8hCfKlQeA8LmyeyQvQF3eBiLRGNlfBJEvWH7ivp9sBqs7TNqBL5X3v157rM4IFETqDnIOO+z5M/biZbo9Ww==} @@ -19954,6 +19966,13 @@ snapshots: vfile-location: 5.0.3 web-namespaces: 2.0.1 + hast-util-from-selector@3.0.1: + dependencies: + '@types/hast': 3.0.4 + css-selector-parser: 3.0.5 + devlop: 1.1.0 + hastscript: 9.0.0 + hast-util-has-property@3.0.0: dependencies: '@types/hast': 3.0.4 @@ -20088,6 +20107,14 @@ snapshots: property-information: 6.1.1 space-separated-tokens: 2.0.1 + hastscript@9.0.0: + dependencies: + '@types/hast': 3.0.4 + comma-separated-tokens: 2.0.2 + hast-util-parse-selector: 4.0.0 + property-information: 6.1.1 + space-separated-tokens: 2.0.1 + he@1.2.0: {} header-case@2.0.4: @@ -23637,6 +23664,14 @@ snapshots: unist-util-visit-parents: 6.0.1 vfile: 6.0.3 + rehype-meta@4.0.1: + dependencies: + '@types/hast': 3.0.4 + hast-util-from-selector: 3.0.1 + hast-util-select: 6.0.2 + hastscript: 9.0.0 + vfile: 6.0.3 + rehype-raw@7.0.0: dependencies: '@types/hast': 3.0.4 From f87304f15b7e243c94daf9b6b7acd84a9ad09bad Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 2 Dec 2024 02:46:47 +0000 Subject: [PATCH 03/12] Modified markdown to html conversion logic --- .../features/openai/server/services/openai.ts | 16 +++-- .../openai/server/utils/sanitize-markdown.ts | 65 +++++++++++++++---- 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/apps/app/src/features/openai/server/services/openai.ts b/apps/app/src/features/openai/server/services/openai.ts index 608f6576278..ecd3d0ed9c3 100644 --- a/apps/app/src/features/openai/server/services/openai.ts +++ b/apps/app/src/features/openai/server/services/openai.ts @@ -2,6 +2,7 @@ import assert from 'node:assert'; import { Readable, Transform } from 'stream'; import { pipeline } from 'stream/promises'; +import type { IPagePopulatedToShowRevision } from '@growi/core'; import { PageGrant, isPopulated } from '@growi/core'; import type { HydratedDocument, Types } from 'mongoose'; import mongoose from 'mongoose'; @@ -20,7 +21,7 @@ import { createBatchStream } from '~/server/util/batch-stream'; import loggerFactory from '~/utils/logger'; import { OpenaiServiceTypes } from '../../interfaces/ai'; -import { sanitizeMarkdown } from '../utils/sanitize-markdown'; +import { convertMarkdownToHtml } from '../utils/sanitize-markdown'; import { getClient } from './client-delegator'; // import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter'; @@ -157,9 +158,10 @@ class OpenaiService implements IOpenaiService { // } // } - private async uploadFile(pageId: Types.ObjectId, body: string): Promise { - const sanitizedMarkdown = await sanitizeMarkdown(body); - const file = await toFile(Readable.from(sanitizedMarkdown), `${pageId}.md`); + private async uploadFile(page: HydratedDocument | IPagePopulatedToShowRevision): Promise { + const convertedHtml = await convertMarkdownToHtml(page); + console.log('convertedHtml', convertedHtml); + const file = await toFile(Readable.from(convertedHtml), `${page._id}.html`); const uploadedFile = await this.client.uploadFile(file); return uploadedFile; } @@ -183,17 +185,17 @@ class OpenaiService implements IOpenaiService { async createVectorStoreFile(pages: Array>): Promise { const vectorStore = await this.getOrCreateVectorStoreForPublicScope(); const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map(); - const processUploadFile = async(page: PageDocument) => { + const processUploadFile = async(page: HydratedDocument) => { if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) { if (isPopulated(page.revision) && page.revision.body.length > 0) { - const uploadedFile = await this.uploadFile(page._id, page.revision.body); + const uploadedFile = await this.uploadFile(page); prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap); return; } const pagePopulatedToShowRevision = await page.populateDataToShowRevision(); if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) { - const uploadedFile = await this.uploadFile(page._id, pagePopulatedToShowRevision.revision.body); + const uploadedFile = await this.uploadFile(pagePopulatedToShowRevision); prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap); } } diff --git a/apps/app/src/features/openai/server/utils/sanitize-markdown.ts b/apps/app/src/features/openai/server/utils/sanitize-markdown.ts index f604be5e058..8cea762d2c4 100644 --- a/apps/app/src/features/openai/server/utils/sanitize-markdown.ts +++ b/apps/app/src/features/openai/server/utils/sanitize-markdown.ts @@ -1,48 +1,82 @@ import { dynamicImport } from '@cspell/dynamic-import'; +import { isPopulated } from '@growi/core'; +import type { IPagePopulatedToShowRevision } from '@growi/core/dist/interfaces'; import type { Root, Code } from 'mdast'; +import type { HydratedDocument } from 'mongoose'; +import type * as RehypeMeta from 'rehype-meta'; +import type * as RehypeStringify from 'rehype-stringify'; import type * as RemarkParse from 'remark-parse'; -import type * as RemarkStringify from 'remark-stringify'; +import type * as RemarkRehype from 'remark-rehype'; import type * as Unified from 'unified'; import type * as UnistUtilVisit from 'unist-util-visit'; +import type { PageDocument } from '~/server/models/page'; + + interface ModuleCache { remarkParse?: typeof RemarkParse.default; - remarkStringify?: typeof RemarkStringify.default; unified?: typeof Unified.unified; visit?: typeof UnistUtilVisit.visit; + remarkRehype?: typeof RemarkRehype.default; + rehypeMeta?: typeof RehypeMeta.default; + rehypeStringify?: typeof RehypeStringify.default; } let moduleCache: ModuleCache = {}; const initializeModules = async(): Promise => { - if (moduleCache.remarkParse != null && moduleCache.remarkStringify != null && moduleCache.unified != null && moduleCache.visit != null) { + if (moduleCache.remarkParse != null + && moduleCache.unified != null + && moduleCache.visit != null + && moduleCache.remarkRehype != null + && moduleCache.rehypeMeta != null + && moduleCache.rehypeStringify != null + ) { return; } - const [{ default: remarkParse }, { default: remarkStringify }, { unified }, { visit }] = await Promise.all([ + const [ + { default: remarkParse }, + { unified }, { visit }, + { default: remarkRehype }, + { default: rehypeMeta }, + { default: rehypeStringify }, + ] = await Promise.all([ dynamicImport('remark-parse', __dirname), - dynamicImport('remark-stringify', __dirname), dynamicImport('unified', __dirname), dynamicImport('unist-util-visit', __dirname), + dynamicImport('remark-rehype', __dirname), + dynamicImport('rehype-meta', __dirname), + dynamicImport('rehype-stringify', __dirname), ]); moduleCache = { remarkParse, - remarkStringify, unified, visit, + remarkRehype, + rehypeMeta, + rehypeStringify, }; }; -export const sanitizeMarkdown = async(markdown: string): Promise => { +export const convertMarkdownToHtml = async(page: HydratedDocument | IPagePopulatedToShowRevision): Promise => { await initializeModules(); const { - remarkParse, remarkStringify, unified, visit, + remarkParse, + unified, visit, + remarkRehype, + rehypeMeta, + rehypeStringify, } = moduleCache; - - if (remarkParse == null || remarkStringify == null || unified == null || visit == null) { + if (remarkParse == null + || unified == null + || visit == null + || remarkRehype == null + || rehypeMeta == null + || rehypeStringify == null) { throw new Error('Failed to initialize required modules'); } @@ -56,10 +90,17 @@ export const sanitizeMarkdown = async(markdown: string): Promise => { }; }; + + const revisionBody = page.revision != null && isPopulated(page.revision) ? page.revision.body : undefined; + const processor = unified() .use(remarkParse) .use(sanitize) - .use(remarkStringify); + .use(remarkRehype) + .use(rehypeMeta, { + title: page.path, + }) + .use(rehypeStringify); - return processor.processSync(markdown).toString(); + return processor.processSync(revisionBody).toString(); }; From 1b05ae3b1b601c12d6725b572faa354b6cac7e02 Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 2 Dec 2024 02:50:17 +0000 Subject: [PATCH 04/12] sanitize-markdown.ts -> convert-markdown-to-html.ts --- apps/app/src/features/openai/server/services/openai.ts | 2 +- .../utils/{sanitize-markdown.ts => convert-markdown-to-html.ts} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename apps/app/src/features/openai/server/utils/{sanitize-markdown.ts => convert-markdown-to-html.ts} (100%) diff --git a/apps/app/src/features/openai/server/services/openai.ts b/apps/app/src/features/openai/server/services/openai.ts index ecd3d0ed9c3..7b38351f063 100644 --- a/apps/app/src/features/openai/server/services/openai.ts +++ b/apps/app/src/features/openai/server/services/openai.ts @@ -21,7 +21,7 @@ import { createBatchStream } from '~/server/util/batch-stream'; import loggerFactory from '~/utils/logger'; import { OpenaiServiceTypes } from '../../interfaces/ai'; -import { convertMarkdownToHtml } from '../utils/sanitize-markdown'; +import { convertMarkdownToHtml } from '../utils/convert-markdown-to-html'; import { getClient } from './client-delegator'; // import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter'; diff --git a/apps/app/src/features/openai/server/utils/sanitize-markdown.ts b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts similarity index 100% rename from apps/app/src/features/openai/server/utils/sanitize-markdown.ts rename to apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts From 20ecef066bfa0d03604ab30d1244968fc96e2f23 Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 2 Dec 2024 07:29:44 +0000 Subject: [PATCH 05/12] sanitize -> sanitizeMarkdown --- .../features/openai/server/utils/convert-markdown-to-html.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts index 8cea762d2c4..42e1965b596 100644 --- a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts +++ b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts @@ -80,7 +80,7 @@ export const convertMarkdownToHtml = async(page: HydratedDocument throw new Error('Failed to initialize required modules'); } - const sanitize = () => { + const sanitizeMarkdown = () => { return (tree: Root) => { visit(tree, 'code', (node: Code) => { if (node.lang === 'drawio') { @@ -95,7 +95,7 @@ export const convertMarkdownToHtml = async(page: HydratedDocument const processor = unified() .use(remarkParse) - .use(sanitize) + .use(sanitizeMarkdown) .use(remarkRehype) .use(rehypeMeta, { title: page.path, From 23c9dbd4f1655ccbfb9265a827313b2d06e64e9e Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 2 Dec 2024 07:38:00 +0000 Subject: [PATCH 06/12] clean code --- .../server/utils/convert-markdown-to-html.ts | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts index 42e1965b596..531099402e9 100644 --- a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts +++ b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts @@ -12,11 +12,10 @@ import type * as UnistUtilVisit from 'unist-util-visit'; import type { PageDocument } from '~/server/models/page'; - interface ModuleCache { - remarkParse?: typeof RemarkParse.default; unified?: typeof Unified.unified; visit?: typeof UnistUtilVisit.visit; + remarkParse?: typeof RemarkParse.default; remarkRehype?: typeof RemarkRehype.default; rehypeMeta?: typeof RehypeMeta.default; rehypeStringify?: typeof RehypeStringify.default; @@ -25,9 +24,9 @@ interface ModuleCache { let moduleCache: ModuleCache = {}; const initializeModules = async(): Promise => { - if (moduleCache.remarkParse != null - && moduleCache.unified != null + if (moduleCache.unified != null && moduleCache.visit != null + && moduleCache.remarkParse != null && moduleCache.remarkRehype != null && moduleCache.rehypeMeta != null && moduleCache.rehypeStringify != null @@ -36,24 +35,25 @@ const initializeModules = async(): Promise => { } const [ + { unified }, + { visit }, { default: remarkParse }, - { unified }, { visit }, { default: remarkRehype }, { default: rehypeMeta }, { default: rehypeStringify }, ] = await Promise.all([ - dynamicImport('remark-parse', __dirname), dynamicImport('unified', __dirname), dynamicImport('unist-util-visit', __dirname), + dynamicImport('remark-parse', __dirname), dynamicImport('remark-rehype', __dirname), dynamicImport('rehype-meta', __dirname), dynamicImport('rehype-stringify', __dirname), ]); moduleCache = { - remarkParse, unified, visit, + remarkParse, remarkRehype, rehypeMeta, rehypeStringify, @@ -64,19 +64,10 @@ export const convertMarkdownToHtml = async(page: HydratedDocument await initializeModules(); const { - remarkParse, - unified, visit, - remarkRehype, - rehypeMeta, - rehypeStringify, + unified, visit, remarkParse, remarkRehype, rehypeMeta, rehypeStringify, } = moduleCache; - if (remarkParse == null - || unified == null - || visit == null - || remarkRehype == null - || rehypeMeta == null - || rehypeStringify == null) { + if (unified == null || visit == null || remarkParse == null || remarkRehype == null || rehypeMeta == null || rehypeStringify == null) { throw new Error('Failed to initialize required modules'); } @@ -90,8 +81,9 @@ export const convertMarkdownToHtml = async(page: HydratedDocument }; }; - - const revisionBody = page.revision != null && isPopulated(page.revision) ? page.revision.body : undefined; + const revisionBody = page.revision != null && isPopulated(page.revision) + ? page.revision.body + : undefined; const processor = unified() .use(remarkParse) From 621907ae24d079908f60bbe9f0afa18f821306cf Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 2 Dec 2024 12:48:36 +0000 Subject: [PATCH 07/12] rm debug log --- apps/app/src/features/openai/server/services/openai.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/app/src/features/openai/server/services/openai.ts b/apps/app/src/features/openai/server/services/openai.ts index 7b38351f063..f9dfa82e1de 100644 --- a/apps/app/src/features/openai/server/services/openai.ts +++ b/apps/app/src/features/openai/server/services/openai.ts @@ -160,7 +160,6 @@ class OpenaiService implements IOpenaiService { private async uploadFile(page: HydratedDocument | IPagePopulatedToShowRevision): Promise { const convertedHtml = await convertMarkdownToHtml(page); - console.log('convertedHtml', convertedHtml); const file = await toFile(Readable.from(convertedHtml), `${page._id}.html`); const uploadedFile = await this.client.uploadFile(file); return uploadedFile; From c7068fbde9bae6ae8883299d99917d5a4e4e122e Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Fri, 20 Dec 2024 05:39:03 +0000 Subject: [PATCH 08/12] Simplify cvertMarkdownToHtml argument types --- .../src/features/openai/server/services/openai.ts | 15 +++++++++++++-- .../server/utils/convert-markdown-to-html.ts | 12 ++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/apps/app/src/features/openai/server/services/openai.ts b/apps/app/src/features/openai/server/services/openai.ts index f9dfa82e1de..3c35ba0b89b 100644 --- a/apps/app/src/features/openai/server/services/openai.ts +++ b/apps/app/src/features/openai/server/services/openai.ts @@ -36,6 +36,13 @@ let isVectorStoreForPublicScopeExist = false; type VectorStoreFileRelationsMap = Map +const isPagePopulatedToShowRevision = (page: HydratedDocument): page is IPagePopulatedToShowRevision & PageDocument => { + if (page?.revision != null && !isPopulated(page.revision)) { + return false; + } + + return true; +}; export interface IOpenaiService { getOrCreateThread(userId: string, vectorStoreId?: string, threadId?: string): Promise; getOrCreateVectorStoreForPublicScope(): Promise; @@ -158,7 +165,7 @@ class OpenaiService implements IOpenaiService { // } // } - private async uploadFile(page: HydratedDocument | IPagePopulatedToShowRevision): Promise { + private async uploadFile(page: IPagePopulatedToShowRevision): Promise { const convertedHtml = await convertMarkdownToHtml(page); const file = await toFile(Readable.from(convertedHtml), `${page._id}.html`); const uploadedFile = await this.client.uploadFile(file); @@ -186,7 +193,11 @@ class OpenaiService implements IOpenaiService { const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map(); const processUploadFile = async(page: HydratedDocument) => { if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) { - if (isPopulated(page.revision) && page.revision.body.length > 0) { + if (isPagePopulatedToShowRevision(page)) { + if (page.revision.body.length > 0) { + return; + } + const uploadedFile = await this.uploadFile(page); prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap); return; diff --git a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts index 531099402e9..22e270f585b 100644 --- a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts +++ b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts @@ -1,8 +1,6 @@ import { dynamicImport } from '@cspell/dynamic-import'; -import { isPopulated } from '@growi/core'; import type { IPagePopulatedToShowRevision } from '@growi/core/dist/interfaces'; import type { Root, Code } from 'mdast'; -import type { HydratedDocument } from 'mongoose'; import type * as RehypeMeta from 'rehype-meta'; import type * as RehypeStringify from 'rehype-stringify'; import type * as RemarkParse from 'remark-parse'; @@ -10,8 +8,6 @@ import type * as RemarkRehype from 'remark-rehype'; import type * as Unified from 'unified'; import type * as UnistUtilVisit from 'unist-util-visit'; -import type { PageDocument } from '~/server/models/page'; - interface ModuleCache { unified?: typeof Unified.unified; visit?: typeof UnistUtilVisit.visit; @@ -60,7 +56,7 @@ const initializeModules = async(): Promise => { }; }; -export const convertMarkdownToHtml = async(page: HydratedDocument | IPagePopulatedToShowRevision): Promise => { +export const convertMarkdownToHtml = async(page: IPagePopulatedToShowRevision): Promise => { await initializeModules(); const { @@ -81,10 +77,6 @@ export const convertMarkdownToHtml = async(page: HydratedDocument }; }; - const revisionBody = page.revision != null && isPopulated(page.revision) - ? page.revision.body - : undefined; - const processor = unified() .use(remarkParse) .use(sanitizeMarkdown) @@ -94,5 +86,5 @@ export const convertMarkdownToHtml = async(page: HydratedDocument }) .use(rehypeStringify); - return processor.processSync(revisionBody).toString(); + return processor.processSync(page.revision?.body).toString(); }; From 3e17890b25f0c7985c2e8e7dfab62f99a1165c6b Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Fri, 20 Dec 2024 06:08:28 +0000 Subject: [PATCH 09/12] Fix logic error in page revision body length check --- apps/app/src/features/openai/server/services/openai.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/app/src/features/openai/server/services/openai.ts b/apps/app/src/features/openai/server/services/openai.ts index 3c35ba0b89b..e44b0a16278 100644 --- a/apps/app/src/features/openai/server/services/openai.ts +++ b/apps/app/src/features/openai/server/services/openai.ts @@ -194,7 +194,7 @@ class OpenaiService implements IOpenaiService { const processUploadFile = async(page: HydratedDocument) => { if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) { if (isPagePopulatedToShowRevision(page)) { - if (page.revision.body.length > 0) { + if (page.revision.body.length < 0) { return; } From bac5032c56b15e4d73378f30d5c9a27eec079dea Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Fri, 20 Dec 2024 06:29:24 +0000 Subject: [PATCH 10/12] add comment --- apps/app/src/features/openai/server/services/openai.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/app/src/features/openai/server/services/openai.ts b/apps/app/src/features/openai/server/services/openai.ts index e44b0a16278..597226a6984 100644 --- a/apps/app/src/features/openai/server/services/openai.ts +++ b/apps/app/src/features/openai/server/services/openai.ts @@ -36,6 +36,7 @@ let isVectorStoreForPublicScopeExist = false; type VectorStoreFileRelationsMap = Map +// type guard const isPagePopulatedToShowRevision = (page: HydratedDocument): page is IPagePopulatedToShowRevision & PageDocument => { if (page?.revision != null && !isPopulated(page.revision)) { return false; From 1bebb75be0cb35009755466a4a9bf849f0b65bac Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 23 Dec 2024 09:49:45 +0000 Subject: [PATCH 11/12] Refactor uploadFile and convertMarkdownToHtml to use simplified parameters --- .../features/openai/server/services/openai.ts | 24 +++++-------------- .../server/utils/convert-markdown-to-html.ts | 12 ++++++---- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/apps/app/src/features/openai/server/services/openai.ts b/apps/app/src/features/openai/server/services/openai.ts index 597226a6984..bf9c4dcc765 100644 --- a/apps/app/src/features/openai/server/services/openai.ts +++ b/apps/app/src/features/openai/server/services/openai.ts @@ -36,14 +36,6 @@ let isVectorStoreForPublicScopeExist = false; type VectorStoreFileRelationsMap = Map -// type guard -const isPagePopulatedToShowRevision = (page: HydratedDocument): page is IPagePopulatedToShowRevision & PageDocument => { - if (page?.revision != null && !isPopulated(page.revision)) { - return false; - } - - return true; -}; export interface IOpenaiService { getOrCreateThread(userId: string, vectorStoreId?: string, threadId?: string): Promise; getOrCreateVectorStoreForPublicScope(): Promise; @@ -166,9 +158,9 @@ class OpenaiService implements IOpenaiService { // } // } - private async uploadFile(page: IPagePopulatedToShowRevision): Promise { - const convertedHtml = await convertMarkdownToHtml(page); - const file = await toFile(Readable.from(convertedHtml), `${page._id}.html`); + private async uploadFile(pageId: Types.ObjectId, pagePath: string, revisionBody: string): Promise { + const convertedHtml = await convertMarkdownToHtml({ pagePath, revisionBody }); + const file = await toFile(Readable.from(convertedHtml), `${pageId}.html`); const uploadedFile = await this.client.uploadFile(file); return uploadedFile; } @@ -194,19 +186,15 @@ class OpenaiService implements IOpenaiService { const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map(); const processUploadFile = async(page: HydratedDocument) => { if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) { - if (isPagePopulatedToShowRevision(page)) { - if (page.revision.body.length < 0) { - return; - } - - const uploadedFile = await this.uploadFile(page); + if (isPopulated(page.revision) && page.revision.body.length > 0) { + const uploadedFile = await this.uploadFile(page._id, page.path, page.revision.body); prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap); return; } const pagePopulatedToShowRevision = await page.populateDataToShowRevision(); if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) { - const uploadedFile = await this.uploadFile(pagePopulatedToShowRevision); + const uploadedFile = await this.uploadFile(page._id, page.path, pagePopulatedToShowRevision.revision.body); prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap); } } diff --git a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts index 22e270f585b..804ce534852 100644 --- a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts +++ b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts @@ -1,5 +1,4 @@ import { dynamicImport } from '@cspell/dynamic-import'; -import type { IPagePopulatedToShowRevision } from '@growi/core/dist/interfaces'; import type { Root, Code } from 'mdast'; import type * as RehypeMeta from 'rehype-meta'; import type * as RehypeStringify from 'rehype-stringify'; @@ -56,7 +55,12 @@ const initializeModules = async(): Promise => { }; }; -export const convertMarkdownToHtml = async(page: IPagePopulatedToShowRevision): Promise => { +type ConvertMarkdownToHtmlParams = { + pagePath: string; + revisionBody: string; +}; + +export const convertMarkdownToHtml = async({ pagePath, revisionBody }: ConvertMarkdownToHtmlParams): Promise => { await initializeModules(); const { @@ -82,9 +86,9 @@ export const convertMarkdownToHtml = async(page: IPagePopulatedToShowRevision): .use(sanitizeMarkdown) .use(remarkRehype) .use(rehypeMeta, { - title: page.path, + title: pagePath, }) .use(rehypeStringify); - return processor.processSync(page.revision?.body).toString(); + return processor.processSync(revisionBody).toString(); }; From 35618a67d947ce042831255f98de1c091ca43daa Mon Sep 17 00:00:00 2001 From: Shun Miyazawa Date: Mon, 23 Dec 2024 09:54:44 +0000 Subject: [PATCH 12/12] Simplify parameter type definition in convertMarkdownToHtml function --- .../openai/server/utils/convert-markdown-to-html.ts | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts index 804ce534852..fa0dcf4dbe4 100644 --- a/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts +++ b/apps/app/src/features/openai/server/utils/convert-markdown-to-html.ts @@ -55,12 +55,7 @@ const initializeModules = async(): Promise => { }; }; -type ConvertMarkdownToHtmlParams = { - pagePath: string; - revisionBody: string; -}; - -export const convertMarkdownToHtml = async({ pagePath, revisionBody }: ConvertMarkdownToHtmlParams): Promise => { +export const convertMarkdownToHtml = async({ pagePath, revisionBody }: { pagePath: string, revisionBody: string }): Promise => { await initializeModules(); const {