Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ai): Save file to VectorStore in HTML format #9462

Merged
merged 13 commits into from
Dec 23, 2024
2 changes: 2 additions & 0 deletions apps/app/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,11 @@
"reconnecting-websocket": "^4.4.0",
"redis": "^3.0.2",
"rehype-katex": "^7.0.1",
"rehype-meta": "^4.0.1",
"rehype-raw": "^7.0.0",
"rehype-sanitize": "^6.0.0",
"rehype-slug": "^6.0.0",
"rehype-stringify": "^10.0.1",
"rehype-toc": "^3.0.2",
"remark-breaks": "^4.0.0",
"remark-directive": "^3.0.0",
Expand Down
15 changes: 8 additions & 7 deletions apps/app/src/features/openai/server/services/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import assert from 'node:assert';
import { Readable, Transform } from 'stream';
import { pipeline } from 'stream/promises';

import type { IPagePopulatedToShowRevision } from '@growi/core';
import { PageGrant, isPopulated } from '@growi/core';
import type { HydratedDocument, Types } from 'mongoose';
import mongoose from 'mongoose';
Expand All @@ -20,7 +21,7 @@ import { createBatchStream } from '~/server/util/batch-stream';
import loggerFactory from '~/utils/logger';

import { OpenaiServiceTypes } from '../../interfaces/ai';
import { sanitizeMarkdown } from '../utils/sanitize-markdown';
import { convertMarkdownToHtml } from '../utils/convert-markdown-to-html';

import { getClient } from './client-delegator';
// import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
Expand Down Expand Up @@ -157,9 +158,9 @@ class OpenaiService implements IOpenaiService {
// }
// }

private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
const sanitizedMarkdown = await sanitizeMarkdown(body);
const file = await toFile(Readable.from(sanitizedMarkdown), `${pageId}.md`);
private async uploadFile(pageId: Types.ObjectId, pagePath: string, revisionBody: string): Promise<OpenAI.Files.FileObject> {
const convertedHtml = await convertMarkdownToHtml({ pagePath, revisionBody });
const file = await toFile(Readable.from(convertedHtml), `${pageId}.html`);
const uploadedFile = await this.client.uploadFile(file);
return uploadedFile;
}
Expand All @@ -183,17 +184,17 @@ class OpenaiService implements IOpenaiService {
async createVectorStoreFile(pages: Array<HydratedDocument<PageDocument>>): Promise<void> {
const vectorStore = await this.getOrCreateVectorStoreForPublicScope();
const vectorStoreFileRelationsMap: VectorStoreFileRelationsMap = new Map();
const processUploadFile = async(page: PageDocument) => {
const processUploadFile = async(page: HydratedDocument<PageDocument>) => {
if (page._id != null && page.grant === PageGrant.GRANT_PUBLIC && page.revision != null) {
if (isPopulated(page.revision) && page.revision.body.length > 0) {
const uploadedFile = await this.uploadFile(page._id, page.revision.body);
const uploadedFile = await this.uploadFile(page._id, page.path, page.revision.body);
prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
return;
}

const pagePopulatedToShowRevision = await page.populateDataToShowRevision();
if (pagePopulatedToShowRevision.revision != null && pagePopulatedToShowRevision.revision.body.length > 0) {
const uploadedFile = await this.uploadFile(page._id, pagePopulatedToShowRevision.revision.body);
const uploadedFile = await this.uploadFile(page._id, page.path, pagePopulatedToShowRevision.revision.body);
prepareVectorStoreFileRelations(vectorStore._id, page._id, uploadedFile.id, vectorStoreFileRelationsMap);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import { dynamicImport } from '@cspell/dynamic-import';
import type { Root, Code } from 'mdast';
import type * as RehypeMeta from 'rehype-meta';
import type * as RehypeStringify from 'rehype-stringify';
import type * as RemarkParse from 'remark-parse';
import type * as RemarkRehype from 'remark-rehype';
import type * as Unified from 'unified';
import type * as UnistUtilVisit from 'unist-util-visit';

/**
 * Shape of the holder for the packages this file loads at runtime through
 * `dynamicImport`. Every slot is optional because the holder starts out empty
 * and is only filled in by `initializeModules()`.
 */
type ModuleCache = Partial<{
  unified: typeof Unified.unified;
  visit: typeof UnistUtilVisit.visit;
  remarkParse: typeof RemarkParse.default;
  remarkRehype: typeof RemarkRehype.default;
  rehypeMeta: typeof RehypeMeta.default;
  rehypeStringify: typeof RehypeStringify.default;
}>;

// Module-level holder; replaced wholesale once all imports have resolved.
let moduleCache: ModuleCache = {};

/**
 * Loads every package the markdown -> HTML pipeline needs and stores the
 * resolved entry points in `moduleCache`.
 *
 * Returns immediately when all six slots of `moduleCache` are already set;
 * otherwise all six packages are imported in parallel and the cache is replaced.
 */
const initializeModules = async(): Promise<void> => {
  const cachedSlots = [
    moduleCache.unified,
    moduleCache.visit,
    moduleCache.remarkParse,
    moduleCache.remarkRehype,
    moduleCache.rehypeMeta,
    moduleCache.rehypeStringify,
  ];

  // Nothing to do once every slot has been populated.
  const isFullyLoaded = cachedSlots.every(slot => slot != null);
  if (isFullyLoaded) {
    return;
  }

  const [
    unifiedModule,
    unistUtilVisitModule,
    remarkParseModule,
    remarkRehypeModule,
    rehypeMetaModule,
    rehypeStringifyModule,
  ] = await Promise.all([
    dynamicImport<typeof Unified>('unified', __dirname),
    dynamicImport<typeof UnistUtilVisit>('unist-util-visit', __dirname),
    dynamicImport<typeof RemarkParse>('remark-parse', __dirname),
    dynamicImport<typeof RemarkRehype>('remark-rehype', __dirname),
    dynamicImport<typeof RehypeMeta>('rehype-meta', __dirname),
    dynamicImport<typeof RehypeStringify>('rehype-stringify', __dirname),
  ]);

  // `unified` and `visit` are named exports; the plugins are default exports.
  moduleCache = {
    unified: unifiedModule.unified,
    visit: unistUtilVisitModule.visit,
    remarkParse: remarkParseModule.default,
    remarkRehype: remarkRehypeModule.default,
    rehypeMeta: rehypeMetaModule.default,
    rehypeStringify: rehypeStringifyModule.default,
  };
};

/**
 * Converts a page's markdown body into an HTML string.
 *
 * Pipeline: remark-parse -> drawio scrubber -> remark-rehype -> rehype-meta -> rehype-stringify.
 *
 * @param pagePath - handed to rehype-meta as the `title` option
 * @param revisionBody - markdown source to convert
 * @returns the stringified result of the pipeline
 * @throws Error when any required module is still missing after `initializeModules()`
 */
export const convertMarkdownToHtml = async({ pagePath, revisionBody }: { pagePath: string, revisionBody: string }): Promise<string> => {
  await initializeModules();

  const {
    unified, visit, remarkParse, remarkRehype, rehypeMeta, rehypeStringify,
  } = moduleCache;

  // Narrow the optional cache slots before use.
  if (unified == null || visit == null || remarkParse == null || remarkRehype == null || rehypeMeta == null || rehypeStringify == null) {
    throw new Error('Failed to initialize required modules');
  }

  // remark plugin: overwrite the content of ```drawio code blocks with a placeholder comment.
  const sanitizeMarkdown = () => {
    return (tree: Root) => {
      visit(tree, 'code', (node: Code) => {
        if (node.lang === 'drawio') {
          node.value = '<!-- drawio content replaced -->';
        }
      });
    };
  };

  const processor = unified()
    .use(remarkParse)
    .use(sanitizeMarkdown)
    .use(remarkRehype)
    .use(rehypeMeta, {
      title: pagePath,
    })
    .use(rehypeStringify);

  // Use the asynchronous runner: processSync() throws as soon as any plugin in
  // the chain is asynchronous, and this function is already async.
  const file = await processor.process(revisionBody);
  return file.toString();
};
65 changes: 0 additions & 65 deletions apps/app/src/features/openai/server/utils/sanitize-markdown.ts

This file was deleted.

69 changes: 69 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading