feat(RabbitHole): add metadata in endpoints

zAlweNy26 · Jul 28, 2024 · f3d2eb4 · f3d2eb4
1 parent 20f2f12
commit f3d2eb4
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 48 deletions.
diff --git a/src/rabbit-hole.ts b/src/rabbit-hole.ts
@@ -123,13 +123,14 @@ export class RabbitHole {
 	 * @param stray The StrayCat instance.
 	 * @param content The textual content to ingest.
 	 * @param source The source of the content (default: 'unknown').
+	 * @param metadata Additional metadata to store with the content.
 	 */
-	async ingestContent(stray: StrayCat, content: string | string[], source = 'unknown') {
+	async ingestContent(stray: StrayCat, content: string | string[], source = 'unknown', metadata?: Record<string, any>) {
 		log.info('Ingesting textual content...')
 		content = Array.isArray(content) ? content : [content]
 		let docs = content.map(c => new Document({ pageContent: c }))
 		docs = await this.splitDocs(stray, docs)
-		await this.storeDocuments(stray, docs, source)
+		await this.storeDocuments(stray, docs, source, metadata)
 	}
 
 	/**
@@ -138,9 +139,10 @@ export class RabbitHole {
 	 * @param file The file to ingest.
 	 * @param chunkSize The size of each chunk for splitting the content.
 	 * @param chunkOverlap The overlap between chunks.
+	 * @param metadata Additional metadata to store with the content.
 	 * @throws An error if the file type is not supported.
 	 */
-	async ingestFile(stray: StrayCat, file: File, chunkSize?: number, chunkOverlap?: number) {
+	async ingestFile(stray: StrayCat, file: File, chunkSize?: number, chunkOverlap?: number, metadata?: Record<string, any>) {
 		const mime = file.type as keyof typeof this.fileHandlers
 		if (!Object.keys(this.fileHandlers).includes(mime))
 			throw new Error(`The file type "${file.type}" is not supported. Skipping ingestion...`)
@@ -151,7 +153,7 @@ export class RabbitHole {
 		const content = await loader.load()
 		stray.send({ type: 'notification', content: 'Parsing completed. Starting now the reading process...' })
 		const docs = await this.splitDocs(stray, content, chunkSize, chunkOverlap)
-		await this.storeDocuments(stray, docs, file.name)
+		await this.storeDocuments(stray, docs, file.name, metadata)
 	}
 
 	/**
@@ -162,9 +164,10 @@ export class RabbitHole {
 	 * @param path The path or URL to ingest.
 	 * @param chunkSize The size of each chunk for splitting the content.
 	 * @param chunkOverlap The overlap between chunks.
+	 * @param metadata Additional metadata to store with the content.
 	 * @throws If the URL doesn't match any web handler or the path doesn't exist.
 	 */
-	async ingestPathOrURL(stray: StrayCat, path: string, chunkSize?: number, chunkOverlap?: number) {
+	async ingestPathOrURL(stray: StrayCat, path: string, chunkSize?: number, chunkOverlap?: number, metadata?: Record<string, any>) {
 		try {
 			const url = new URL(path)
 			log.info('Ingesting URL...')
@@ -175,15 +178,15 @@ export class RabbitHole {
 			const content = await loader.load()
 			stray.send({ type: 'notification', content: 'Parsing completed. Starting now the reading process...' })
 			const docs = await this.splitDocs(stray, content, chunkSize, chunkOverlap)
-			await this.storeDocuments(stray, docs, url.href)
+			await this.storeDocuments(stray, docs, url.href, metadata)
 		}
 		catch (error) {
 			if (error instanceof TypeError) log.info('The string is not a valid URL, trying with a file-system path...')
 			else if (error instanceof Error) log.error(error.message)
 			if (!(await Bun.file(path).exists())) throw new Error('The file path does not exist. Skipping ingestion...')
 			const data = await Bun.file(resolve(path)).text()
 			const file = new File([data], basename(path), { type: extname(path) })
-			await this.ingestFile(stray, file, chunkSize, chunkOverlap)
+			await this.ingestFile(stray, file, chunkSize, chunkOverlap, metadata)
 		}
 	}
 
@@ -194,8 +197,9 @@ export class RabbitHole {
 	 * @param stray The StrayCat instance.
 	 * @param docs An array of documents to store.
 	 * @param source The source of the documents.
+	 * @param metadata Additional metadata to store with the content.
 	 */
-	async storeDocuments(stray: StrayCat, docs: Document[], source: string) {
+	async storeDocuments(stray: StrayCat, docs: Document[], source: string, metadata?: Record<string, any>) {
 		log.info(`Preparing to store ${docs.length} documents`)
 		docs = await madHatter.executeHook('beforeStoreDocuments', docs, stray)
 		for (let [i, doc] of docs.entries()) {
@@ -204,8 +208,11 @@ export class RabbitHole {
 			const readMsg = `Read ${percRead}% of ${source}`
 			stray.send({ type: 'notification', content: readMsg })
 			log.info(readMsg)
-			doc.metadata.source = source
-			doc.metadata.when = Date.now()
+			doc.metadata = {
+				...metadata,
+				source,
+				when: Date.now(),
+			}
 			doc = await madHatter.executeHook('beforeInsertInMemory', doc, stray)
 			const interaction: EmbedderInteraction = {
 				model: 'embedder',

diff --git a/src/routes/rabbit_hole.ts b/src/routes/rabbit_hole.ts
@@ -22,10 +22,10 @@ export const rabbitHoleRoutes = new Elysia({
 		},
 	})
 	.post('/chunk', async ({ rh, body, query, stray, log, HttpError }) => {
-		const { sync, source } = query
+		const { sync, source } = query, { chunk, metadata } = body
 		try {
-			if (sync) await rh.ingestContent(stray, body.chunk, source)
-			else rh.ingestContent(stray, body.chunk).catch(log.error)
+			if (sync) await rh.ingestContent(stray, chunk, source, metadata)
+			else rh.ingestContent(stray, chunk, source, metadata).catch(log.error)
 		}
 		catch (error) {
 			log.error('Error while ingesting chunk:', error)
@@ -37,6 +37,7 @@ export const rabbitHoleRoutes = new Elysia({
 	}, {
 		body: t.Object({
 			chunk: t.Union([t.String(), t.Array(t.String())]),
+			metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
 		}),
 		query: t.Object({
 			sync: t.Boolean({ default: true }),
@@ -54,10 +55,10 @@ export const rabbitHoleRoutes = new Elysia({
 		},
 	})
 	.post('/file', async ({ rh, body, query, stray, log, HttpError }) => {
-		const { file } = body, { sync, chunkOverlap, chunkSize } = query
+		const { file, metadata } = body, { sync, chunkOverlap, chunkSize } = query
 		try {
-			if (sync) await rh.ingestFile(stray, file, chunkSize, chunkOverlap)
-			else rh.ingestFile(stray, file, chunkSize, chunkOverlap).catch(log.error)
+			if (sync) await rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata)
+			else rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata).catch(log.error)
 		}
 		catch (error) {
 			log.error('Error while ingesting file:', error)
@@ -69,6 +70,7 @@ export const rabbitHoleRoutes = new Elysia({
 	}, {
 		body: t.Object({
 			file: t.File(),
+			metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
 		}),
 		query: t.Object({
 			sync: t.Boolean({ default: true }),
@@ -87,15 +89,15 @@ export const rabbitHoleRoutes = new Elysia({
 		},
 	})
 	.post('/files', async ({ rh, body, query, stray, log, HttpError }) => {
-		const { files } = body, { sync, chunkOverlap, chunkSize } = query
+		const { content } = body, { sync, chunkOverlap, chunkSize } = query
 		try {
 			if (sync) {
-				for (const file of files)
-					await rh.ingestFile(stray, file, chunkSize, chunkOverlap)
+				for (const { file, metadata } of content)
+					await rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata)
 			}
 			else {
-				for (const file of files)
-					rh.ingestFile(stray, file, chunkSize, chunkOverlap).catch(log.error)
+				for (const { file, metadata } of content)
+					rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata).catch(log.error)
 			}
 		}
 		catch (error) {
@@ -107,7 +109,10 @@ export const rabbitHoleRoutes = new Elysia({
 		}
 	}, {
 		body: t.Object({
-			files: t.Files(),
+			content: t.Array(t.Object({
+				file: t.File(),
+				metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
+			})),
 		}),
 		query: t.Object({
 			sync: t.Boolean({ default: true }),
@@ -125,29 +130,36 @@ export const rabbitHoleRoutes = new Elysia({
 			400: 'error',
 		},
 	})
-	.post('/memory', async ({ rh, body, query, log, HttpError }) => {
-		const { file } = body, { sync } = query
+	.post('/web', async ({ rh, body, query, stray, log, HttpError }) => {
+		const { webUrl, metadata } = body, { sync, chunkOverlap, chunkSize } = query
 		try {
-			if (sync) await rh.ingestMemory(file)
-			else rh.ingestMemory(file).catch(log.error)
+			if (sync) await rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap, metadata)
+			else rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap, metadata).catch(log.error)
 		}
 		catch (error) {
-			log.error('Error while ingesting memory file:', error)
-			throw HttpError.InternalServer('Error while ingesting the passed memory file')
+			log.error('Error while ingesting web url:', error)
+			throw HttpError.InternalServer('Error while ingesting the passed url')
 		}
 		return {
-			info: sync ? 'Memory file has been ingested successfully.' : 'Memory file is being ingested asynchronously...',
+			info: sync ? 'Web page has been ingested successfully.' : 'Web page is being ingested asynchronously...',
 		}
 	}, {
 		body: t.Object({
-			file: t.File(),
+			webUrl: t.String({
+				format: 'uri',
+				default: 'https://example.com',
+				description: 'URL of the website or the path of the file to ingest.',
+			}),
+			metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
 		}),
 		query: t.Object({
 			sync: t.Boolean({ default: true }),
+			chunkSize: t.Number({ default: 256 }),
+			chunkOverlap: t.Number({ default: 64 }),
 		}),
 		detail: {
-			description: 'Upload a memory json file to the cat memory.',
-			summary: 'Upload memory',
+			description: 'Upload a website whose content will be extracted and segmented into chunks. Chunks will be then vectorized and stored into documents memory.',
+			summary: 'Upload URL',
 		},
 		response: {
 			200: t.Object({
@@ -156,35 +168,29 @@ export const rabbitHoleRoutes = new Elysia({
 			400: 'error',
 		},
 	})
-	.post('/web', async ({ rh, body, query, stray, log, HttpError }) => {
-		const { webUrl } = body, { sync, chunkOverlap, chunkSize } = query
+	.post('/memory', async ({ rh, body, query, log, HttpError }) => {
+		const { file } = body, { sync } = query
 		try {
-			if (sync) await rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap)
-			else rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap).catch(log.error)
+			if (sync) await rh.ingestMemory(file)
+			else rh.ingestMemory(file).catch(log.error)
 		}
 		catch (error) {
-			log.error('Error while ingesting web url:', error)
-			throw HttpError.InternalServer('Error while ingesting the passed url')
+			log.error('Error while ingesting memory file:', error)
+			throw HttpError.InternalServer('Error while ingesting the passed memory file')
 		}
 		return {
-			info: sync ? 'Web page has been ingested successfully.' : 'Web page is being ingested asynchronously...',
+			info: sync ? 'Memory file has been ingested successfully.' : 'Memory file is being ingested asynchronously...',
 		}
 	}, {
 		body: t.Object({
-			webUrl: t.String({
-				format: 'uri',
-				default: 'https://example.com',
-				description: 'URL of the website or the path of the file to ingest.',
-			}),
+			file: t.File({ description: 'Memory file to ingest. It must be a JSON.' }),
 		}),
 		query: t.Object({
 			sync: t.Boolean({ default: true }),
-			chunkSize: t.Number({ default: 256 }),
-			chunkOverlap: t.Number({ default: 64 }),
 		}),
 		detail: {
-			description: 'Upload a website whose content will be extracted and segmented into chunks. Chunks will be then vectorized and stored into documents memory.',
-			summary: 'Upload URL',
+			description: 'Upload a memory json file to the cat memory.',
+			summary: 'Upload memory',
 		},
 		response: {
 			200: t.Object({