diff --git a/examples/src/indexes/vector_stores/vectara.ts b/examples/src/indexes/vector_stores/vectara.ts index 62f45a876fd8..af6d2ad44f3a 100644 --- a/examples/src/indexes/vector_stores/vectara.ts +++ b/examples/src/indexes/vector_stores/vectara.ts @@ -10,7 +10,7 @@ const store = new VectaraStore({ }); // Add two documents with some metadata. -await store.addDocuments([ +const doc_ids = await store.addDocuments([ new Document({ pageContent: "Do I dare to eat a peach?", metadata: { @@ -58,3 +58,6 @@ console.log(JSON.stringify(resultsWithScore, null, 2)); // 0.38169062 // ] // ] + +// Delete the documents. +await store.deleteDocuments(doc_ids); diff --git a/langchain/src/vectorstores/tests/vectara.int.test.ts b/langchain/src/vectorstores/tests/vectara.int.test.ts index 63e7cb099851..576e78ef1cc4 100644 --- a/langchain/src/vectorstores/tests/vectara.int.test.ts +++ b/langchain/src/vectorstores/tests/vectara.int.test.ts @@ -4,16 +4,10 @@ import fs from "fs"; import { expect, beforeAll } from "@jest/globals"; import { FakeEmbeddings } from "../../embeddings/fake.js"; import { Document } from "../../document.js"; +import { insecureHash } from "../../util/js-sha1/hash.js"; import { VectaraFile, VectaraLibArgs, VectaraStore } from "../vectara.js"; const getDocs = (): Document[] => { - const hashCode = (s: string) => - s.split("").reduce((a, b) => { - // eslint-disable-next-line no-param-reassign - a = ((a << 5) - a + b.charCodeAt(0)) | 0; - return a; - }, 0); - // Some text from Lord of the Rings const englishOne = `It all depends on what you want. You can trust us to stick to you through thick and thin to the bitter end. And you can trust us to keep any secret of yours - closer than you keep it yourself. @@ -33,7 +27,7 @@ const getDocs = (): Document[] => { new Document({ pageContent: englishOne, metadata: { - document_id: hashCode(englishOne).toString(), // Generate a hashcode for document id based on the text + document_id: insecureHash(englishOne), // Generate a hashcode for document id based on the text title: "Lord of the Rings", author: "Tolkien", genre: "fiction", @@ -43,7 +37,7 @@ const getDocs = (): Document[] => { new Document({ pageContent: englishTwo, metadata: { - document_id: hashCode(englishTwo).toString(), // Generate a hashcode for document id based on the text + document_id: insecureHash(englishTwo), // Generate a hashcode for document id based on the text title: "Lord of the Rings", author: "Tolkien", genre: "fiction", @@ -53,7 +47,7 @@ const getDocs = (): Document[] => { new Document({ pageContent: frenchOne, metadata: { - document_id: hashCode(frenchOne).toString(), // Generate a hashcode for document id based on the text + document_id: insecureHash(frenchOne), // Generate a hashcode for document id based on the text title: "The hitchhiker's guide to the galaxy", author: "Douglas Adams", genre: "fiction", @@ -117,6 +111,7 @@ describe("VectaraStore", () => { describe("access operations", () => { let store: VectaraStore; + let doc_ids: string[] = []; beforeAll(async () => { store = new VectaraStore({ @@ -124,10 +119,7 @@ describe("VectaraStore", () => { corpusId, apiKey: process.env.VECTARA_API_KEY || "", }); - }); - - test.skip("addDocuments", async () => { - await store.addDocuments(getDocs()); + doc_ids = await store.addDocuments(getDocs()); }); test.skip("similaritySearchWithScore", async () => { @@ -138,12 +130,7 @@ describe("VectaraStore", () => { ); expect(resultsWithScore.length).toBeGreaterThan(0); expect(resultsWithScore[0][0].pageContent.length).toBeGreaterThan(0); - expect(resultsWithScore[0][0].metadata.length).toBeGreaterThan(0); - expect( - resultsWithScore[0][0].metadata.find( - (item: { name: string }) => item.name === "title" - ).value - ).toBe("Lord of the Rings"); + expect(resultsWithScore[0][0].metadata.title).toBe("Lord of the Rings"); expect(resultsWithScore[0][1]).toBeGreaterThan(0); }); @@ -161,7 +148,7 @@ describe("VectaraStore", () => { ); expect(results.length).toBeGreaterThan(0); expect(results[0].pageContent.length).toBeGreaterThan(0); - expect(results[0].metadata.length).toBeGreaterThan(0); + expect(results[0].metadata.title).toBe("Lord of the Rings"); }); test.skip("similaritySearch with filter", async () => { @@ -181,7 +168,7 @@ describe("VectaraStore", () => { expect(hasEnglish).toBe(false); }); - it("addFiles", async () => { + test.skip("addFiles", async () => { const docs = getDocs(); const englishOneContent = docs[0].pageContent; const frenchOneContent = docs[2].pageContent; @@ -210,18 +197,29 @@ describe("VectaraStore", () => { fileName: "bitcoin.pdf", }); - const results = await store.addFiles(vectaraFiles); + const file_doc_ids = await store.addFiles(vectaraFiles); + doc_ids = [...doc_ids, ...file_doc_ids]; for (const file of files) { fs.unlinkSync(file.filename); } - expect(results).toEqual(3); + expect(file_doc_ids.length).toEqual(3); const searchResults = await store.similaritySearch("What is bitcoin"); expect(searchResults.length).toBeGreaterThan(0); expect(searchResults[0].pageContent).toContain( "A Peer-to-Peer Electronic Cash System" ); }); + + // delete documents added in the test + afterAll(async () => { + store = new VectaraStore({ + customerId: Number(process.env.VECTARA_CUSTOMER_ID) || 0, + corpusId, + apiKey: process.env.VECTARA_API_KEY || "", + }); + await store.deleteDocuments(doc_ids); + }); }); }); diff --git a/langchain/src/vectorstores/vectara.ts b/langchain/src/vectorstores/vectara.ts index 7690521e61d2..1a35a0f7c4ba 100644 --- a/langchain/src/vectorstores/vectara.ts +++ b/langchain/src/vectorstores/vectara.ts @@ -1,3 +1,5 @@ +import * as uuid from "uuid"; + import { Document } from "../document.js"; import { Embeddings } from "../embeddings/base.js"; import { FakeEmbeddings } from "../embeddings/fake.js"; @@ -179,24 +181,73 @@ export class VectaraStore extends VectorStore { ); } + /** + * Method to delete data from the Vectara corpus. + * @param params an array of document IDs to be deleted + * @returns Promise that resolves when the deletion is complete. + */ + async deleteDocuments(ids: string[]): Promise { + if (ids && ids.length > 0) { + const headers = await this.getJsonHeader(); + for (const id of ids) { + const data = { + customer_id: this.customerId, + corpus_id: this.corpusId[0], + document_id: id, + }; + + try { + const controller = new AbortController(); + const timeout = setTimeout( + () => controller.abort(), + this.vectaraApiTimeoutSeconds * 1000 + ); + const response = await fetch( + `https://${this.apiEndpoint}/v1/delete-doc`, + { + method: "POST", + headers: headers?.headers, + body: JSON.stringify(data), + signal: controller.signal, + } + ); + clearTimeout(timeout); + if (response.status !== 200) { + throw new Error( + `Vectara API returned status code ${response.status} when deleting document ${id}` + ); + } + } catch (e) { + const error = new Error(`Error ${(e as Error).message}`); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (error as any).code = 500; + throw error; + } + } + } else { + throw new Error(`no "ids" specified for deletion`); + } + } + /** * Adds documents to the Vectara store. * @param documents An array of Document objects to add to the Vectara store. - * @returns A Promise that resolves when the documents have been added. + * @returns A Promise that resolves to an array of document IDs indexed in Vectara. */ - async addDocuments(documents: Document[]): Promise { + async addDocuments(documents: Document[]): Promise { if (this.corpusId.length > 1) throw new Error("addDocuments does not support multiple corpus ids"); const headers = await this.getJsonHeader(); + const doc_ids: string[] = []; let countAdded = 0; - for (const [index, document] of documents.entries()) { + for (const document of documents) { + const doc_id: string = document.metadata?.document_id ?? uuid.v4(); const data = { customer_id: this.customerId, corpus_id: this.corpusId[0], document: { - document_id: - document.metadata?.document_id ?? `${Date.now()}${index}`, + document_id: doc_id, title: document.metadata?.title ?? "", metadata_json: JSON.stringify(document.metadata ?? {}), section: [ @@ -235,6 +286,7 @@ export class VectaraStore extends VectorStore { throw error; } else { countAdded += 1; + doc_ids.push(doc_id); } } catch (e) { const error = new Error( @@ -248,6 +300,8 @@ export class VectaraStore extends VectorStore { if (this.verbose) { console.log(`Added ${countAdded} documents to Vectara`); } + + return doc_ids; } /** @@ -266,7 +320,7 @@ export class VectaraStore extends VectorStore { if (this.corpusId.length > 1) throw new Error("addFiles does not support multiple corpus ids"); - let numDocs = 0; + const doc_ids: string[] = []; for (const [index, file] of files.entries()) { const md = metadatas ? metadatas[index] : {}; @@ -276,7 +330,7 @@ export class VectaraStore extends VectorStore { data.append("doc-metadata", JSON.stringify(md)); const response = await fetch( - `https://api.vectara.io/v1/upload?c=${this.customerId}&o=${this.corpusId[0]}`, + `https://api.vectara.io/v1/upload?c=${this.customerId}&o=${this.corpusId[0]}&d=true`, { method: "POST", headers: { @@ -293,7 +347,9 @@ export class VectaraStore extends VectorStore { } else if (status !== 200) { throw new Error(`Vectara API returned status code ${status}`); } else { - numDocs += 1; + const result = await response.json(); + const doc_id = result.document.documentId; + doc_ids.push(doc_id); } } @@ -301,7 +357,7 @@ export class VectaraStore extends VectorStore { console.log(`Uploaded ${files.length} files to Vectara`); } - return numDocs; + return doc_ids; } /**