Skip to content

Commit

Permalink
feat(RabbitHole): add metadata in endpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
zAlweNy26 committed Jul 28, 2024
1 parent 20f2f12 commit f3d2eb4
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 48 deletions.
27 changes: 17 additions & 10 deletions src/rabbit-hole.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,14 @@ export class RabbitHole {
* @param stray The StrayCat instance.
* @param content The textual content to ingest.
* @param source The source of the content (default: 'unknown').
* @param metadata Additional metadata to store with the content.
*/
async ingestContent(stray: StrayCat, content: string | string[], source = 'unknown') {
async ingestContent(stray: StrayCat, content: string | string[], source = 'unknown', metadata?: Record<string, any>) {
log.info('Ingesting textual content...')
content = Array.isArray(content) ? content : [content]
let docs = content.map(c => new Document({ pageContent: c }))
docs = await this.splitDocs(stray, docs)
await this.storeDocuments(stray, docs, source)
await this.storeDocuments(stray, docs, source, metadata)
}

/**
Expand All @@ -138,9 +139,10 @@ export class RabbitHole {
* @param file The file to ingest.
* @param chunkSize The size of each chunk for splitting the content.
* @param chunkOverlap The overlap between chunks.
* @param metadata Additional metadata to store with the content.
* @throws An error if the file type is not supported.
*/
async ingestFile(stray: StrayCat, file: File, chunkSize?: number, chunkOverlap?: number) {
async ingestFile(stray: StrayCat, file: File, chunkSize?: number, chunkOverlap?: number, metadata?: Record<string, any>) {
const mime = file.type as keyof typeof this.fileHandlers
if (!Object.keys(this.fileHandlers).includes(mime))
throw new Error(`The file type "${file.type}" is not supported. Skipping ingestion...`)
Expand All @@ -151,7 +153,7 @@ export class RabbitHole {
const content = await loader.load()
stray.send({ type: 'notification', content: 'Parsing completed. Starting now the reading process...' })
const docs = await this.splitDocs(stray, content, chunkSize, chunkOverlap)
await this.storeDocuments(stray, docs, file.name)
await this.storeDocuments(stray, docs, file.name, metadata)
}

/**
Expand All @@ -162,9 +164,10 @@ export class RabbitHole {
* @param path The path or URL to ingest.
* @param chunkSize The size of each chunk for splitting the content.
* @param chunkOverlap The overlap between chunks.
* @param metadata Additional metadata to store with the content.
* @throws If the URL doesn't match any web handler or the path doesn't exist.
*/
async ingestPathOrURL(stray: StrayCat, path: string, chunkSize?: number, chunkOverlap?: number) {
async ingestPathOrURL(stray: StrayCat, path: string, chunkSize?: number, chunkOverlap?: number, metadata?: Record<string, any>) {
try {
const url = new URL(path)
log.info('Ingesting URL...')
Expand All @@ -175,15 +178,15 @@ export class RabbitHole {
const content = await loader.load()
stray.send({ type: 'notification', content: 'Parsing completed. Starting now the reading process...' })
const docs = await this.splitDocs(stray, content, chunkSize, chunkOverlap)
await this.storeDocuments(stray, docs, url.href)
await this.storeDocuments(stray, docs, url.href, metadata)
}
catch (error) {
if (error instanceof TypeError) log.info('The string is not a valid URL, trying with a file-system path...')
else if (error instanceof Error) log.error(error.message)
if (!(await Bun.file(path).exists())) throw new Error('The file path does not exist. Skipping ingestion...')
const data = await Bun.file(resolve(path)).text()
const file = new File([data], basename(path), { type: extname(path) })
await this.ingestFile(stray, file, chunkSize, chunkOverlap)
await this.ingestFile(stray, file, chunkSize, chunkOverlap, metadata)
}
}

Expand All @@ -194,8 +197,9 @@ export class RabbitHole {
* @param stray The StrayCat instance.
* @param docs An array of documents to store.
* @param source The source of the documents.
* @param metadata Additional metadata to store with each document; the `source` and `when` keys are always set by this method and override same-named entries.
*/
async storeDocuments(stray: StrayCat, docs: Document[], source: string) {
async storeDocuments(stray: StrayCat, docs: Document[], source: string, metadata?: Record<string, any>) {
log.info(`Preparing to store ${docs.length} documents`)
docs = await madHatter.executeHook('beforeStoreDocuments', docs, stray)
for (let [i, doc] of docs.entries()) {
Expand All @@ -204,8 +208,11 @@ export class RabbitHole {
const readMsg = `Read ${percRead}% of ${source}`
stray.send({ type: 'notification', content: readMsg })
log.info(readMsg)
doc.metadata.source = source
doc.metadata.when = Date.now()
doc.metadata = {
...metadata,
source,
when: Date.now(),
}
doc = await madHatter.executeHook('beforeInsertInMemory', doc, stray)
const interaction: EmbedderInteraction = {
model: 'embedder',
Expand Down
82 changes: 44 additions & 38 deletions src/routes/rabbit_hole.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ export const rabbitHoleRoutes = new Elysia({
},
})
.post('/chunk', async ({ rh, body, query, stray, log, HttpError }) => {
const { sync, source } = query
const { sync, source } = query, { chunk, metadata } = body
try {
if (sync) await rh.ingestContent(stray, body.chunk, source)
else rh.ingestContent(stray, body.chunk).catch(log.error)
if (sync) await rh.ingestContent(stray, chunk, source, metadata)
else rh.ingestContent(stray, chunk, source, metadata).catch(log.error)
}
catch (error) {
log.error('Error while ingesting chunk:', error)
Expand All @@ -37,6 +37,7 @@ export const rabbitHoleRoutes = new Elysia({
}, {
body: t.Object({
chunk: t.Union([t.String(), t.Array(t.String())]),
metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
}),
query: t.Object({
sync: t.Boolean({ default: true }),
Expand All @@ -54,10 +55,10 @@ export const rabbitHoleRoutes = new Elysia({
},
})
.post('/file', async ({ rh, body, query, stray, log, HttpError }) => {
const { file } = body, { sync, chunkOverlap, chunkSize } = query
const { file, metadata } = body, { sync, chunkOverlap, chunkSize } = query
try {
if (sync) await rh.ingestFile(stray, file, chunkSize, chunkOverlap)
else rh.ingestFile(stray, file, chunkSize, chunkOverlap).catch(log.error)
if (sync) await rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata)
else rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata).catch(log.error)
}
catch (error) {
log.error('Error while ingesting file:', error)
Expand All @@ -69,6 +70,7 @@ export const rabbitHoleRoutes = new Elysia({
}, {
body: t.Object({
file: t.File(),
metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
}),
query: t.Object({
sync: t.Boolean({ default: true }),
Expand All @@ -87,15 +89,15 @@ export const rabbitHoleRoutes = new Elysia({
},
})
.post('/files', async ({ rh, body, query, stray, log, HttpError }) => {
const { files } = body, { sync, chunkOverlap, chunkSize } = query
const { content } = body, { sync, chunkOverlap, chunkSize } = query
try {
if (sync) {
for (const file of files)
await rh.ingestFile(stray, file, chunkSize, chunkOverlap)
for (const { file, metadata } of content)
await rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata)
}
else {
for (const file of files)
rh.ingestFile(stray, file, chunkSize, chunkOverlap).catch(log.error)
for (const { file, metadata } of content)
rh.ingestFile(stray, file, chunkSize, chunkOverlap, metadata).catch(log.error)
}
}
catch (error) {
Expand All @@ -107,7 +109,10 @@ export const rabbitHoleRoutes = new Elysia({
}
}, {
body: t.Object({
files: t.Files(),
content: t.Array(t.Object({
file: t.File(),
metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
})),
}),
query: t.Object({
sync: t.Boolean({ default: true }),
Expand All @@ -125,29 +130,36 @@ export const rabbitHoleRoutes = new Elysia({
400: 'error',
},
})
.post('/memory', async ({ rh, body, query, log, HttpError }) => {
const { file } = body, { sync } = query
.post('/web', async ({ rh, body, query, stray, log, HttpError }) => {
const { webUrl, metadata } = body, { sync, chunkOverlap, chunkSize } = query
try {
if (sync) await rh.ingestMemory(file)
else rh.ingestMemory(file).catch(log.error)
if (sync) await rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap, metadata)
else rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap, metadata).catch(log.error)
}
catch (error) {
log.error('Error while ingesting memory file:', error)
throw HttpError.InternalServer('Error while ingesting the passed memory file')
log.error('Error while ingesting web url:', error)
throw HttpError.InternalServer('Error while ingesting the passed url')
}
return {
info: sync ? 'Memory file has been ingested successfully.' : 'Memory file is being ingested asynchronously...',
info: sync ? 'Web page has been ingested successfully.' : 'Web page is being ingested asynchronously...',
}
}, {
body: t.Object({
file: t.File(),
webUrl: t.String({
format: 'uri',
default: 'https://example.com',
description: 'URL of the website or the path of the file to ingest.',
}),
metadata: t.Optional(t.Record(t.String(), t.Any(), { description: 'Metadata to attach to the ingested content.' })),
}),
query: t.Object({
sync: t.Boolean({ default: true }),
chunkSize: t.Number({ default: 256 }),
chunkOverlap: t.Number({ default: 64 }),
}),
detail: {
description: 'Upload a memory json file to the cat memory.',
summary: 'Upload memory',
description: 'Upload a website whose content will be extracted and segmented into chunks. Chunks will be then vectorized and stored into documents memory.',
summary: 'Upload URL',
},
response: {
200: t.Object({
Expand All @@ -156,35 +168,29 @@ export const rabbitHoleRoutes = new Elysia({
400: 'error',
},
})
.post('/web', async ({ rh, body, query, stray, log, HttpError }) => {
const { webUrl } = body, { sync, chunkOverlap, chunkSize } = query
.post('/memory', async ({ rh, body, query, log, HttpError }) => {
const { file } = body, { sync } = query
try {
if (sync) await rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap)
else rh.ingestPathOrURL(stray, webUrl, chunkSize, chunkOverlap).catch(log.error)
if (sync) await rh.ingestMemory(file)
else rh.ingestMemory(file).catch(log.error)
}
catch (error) {
log.error('Error while ingesting web url:', error)
throw HttpError.InternalServer('Error while ingesting the passed url')
log.error('Error while ingesting memory file:', error)
throw HttpError.InternalServer('Error while ingesting the passed memory file')
}
return {
info: sync ? 'Web page has been ingested successfully.' : 'Web page is being ingested asynchronously...',
info: sync ? 'Memory file has been ingested successfully.' : 'Memory file is being ingested asynchronously...',
}
}, {
body: t.Object({
webUrl: t.String({
format: 'uri',
default: 'https://example.com',
description: 'URL of the website or the path of the file to ingest.',
}),
file: t.File({ description: 'Memory file to ingest. It must be a JSON.' }),
}),
query: t.Object({
sync: t.Boolean({ default: true }),
chunkSize: t.Number({ default: 256 }),
chunkOverlap: t.Number({ default: 64 }),
}),
detail: {
description: 'Upload a website whose content will be extracted and segmented into chunks. Chunks will be then vectorized and stored into documents memory.',
summary: 'Upload URL',
description: 'Upload a memory json file to the cat memory.',
summary: 'Upload memory',
},
response: {
200: t.Object({
Expand Down

0 comments on commit f3d2eb4

Please sign in to comment.