Skip to content

Commit

Permalink
Updates to Vectara Implementation (#3332)
Browse files Browse the repository at this point in the history
* updated documentation
added X-Source to header

* added deleteDocuments() method
updated generation of random ID from date to UUID-like
tests now fully executed and fixed to work properly

* added deleteDocuments to docs
fixes from yarn lint
keeping test.skip
  • Loading branch information
ofermend authored Nov 20, 2023
1 parent 1dc75e3 commit 84fd5f2
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 34 deletions.
5 changes: 4 additions & 1 deletion examples/src/indexes/vector_stores/vectara.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ const store = new VectaraStore({
});

// Add two documents with some metadata.
await store.addDocuments([
const doc_ids = await store.addDocuments([
new Document({
pageContent: "Do I dare to eat a peach?",
metadata: {
Expand Down Expand Up @@ -58,3 +58,6 @@ console.log(JSON.stringify(resultsWithScore, null, 2));
// 0.38169062
// ]
// ]

// Delete the documents.
await store.deleteDocuments(doc_ids);
46 changes: 22 additions & 24 deletions langchain/src/vectorstores/tests/vectara.int.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,10 @@ import fs from "fs";
import { expect, beforeAll } from "@jest/globals";
import { FakeEmbeddings } from "../../embeddings/fake.js";
import { Document } from "../../document.js";
import { insecureHash } from "../../util/js-sha1/hash.js";
import { VectaraFile, VectaraLibArgs, VectaraStore } from "../vectara.js";

const getDocs = (): Document[] => {
const hashCode = (s: string) =>
s.split("").reduce((a, b) => {
// eslint-disable-next-line no-param-reassign
a = ((a << 5) - a + b.charCodeAt(0)) | 0;
return a;
}, 0);

// Some text from Lord of the Rings
const englishOne = `It all depends on what you want. You can trust us to stick to you through thick and thin to the
bitter end. And you can trust us to keep any secret of yours - closer than you keep it yourself.
Expand All @@ -33,7 +27,7 @@ const getDocs = (): Document[] => {
new Document({
pageContent: englishOne,
metadata: {
document_id: hashCode(englishOne).toString(), // Generate a hashcode for document id based on the text
document_id: insecureHash(englishOne), // Generate a hashcode for document id based on the text
title: "Lord of the Rings",
author: "Tolkien",
genre: "fiction",
Expand All @@ -43,7 +37,7 @@ const getDocs = (): Document[] => {
new Document({
pageContent: englishTwo,
metadata: {
document_id: hashCode(englishTwo).toString(), // Generate a hashcode for document id based on the text
document_id: insecureHash(englishTwo), // Generate a hashcode for document id based on the text
title: "Lord of the Rings",
author: "Tolkien",
genre: "fiction",
Expand All @@ -53,7 +47,7 @@ const getDocs = (): Document[] => {
new Document({
pageContent: frenchOne,
metadata: {
document_id: hashCode(frenchOne).toString(), // Generate a hashcode for document id based on the text
document_id: insecureHash(frenchOne), // Generate a hashcode for document id based on the text
title: "The hitchhiker's guide to the galaxy",
author: "Douglas Adams",
genre: "fiction",
Expand Down Expand Up @@ -117,17 +111,15 @@ describe("VectaraStore", () => {

describe("access operations", () => {
let store: VectaraStore;
let doc_ids: string[] = [];

beforeAll(async () => {
store = new VectaraStore({
customerId: Number(process.env.VECTARA_CUSTOMER_ID) || 0,
corpusId,
apiKey: process.env.VECTARA_API_KEY || "",
});
});

test.skip("addDocuments", async () => {
await store.addDocuments(getDocs());
doc_ids = await store.addDocuments(getDocs());
});

test.skip("similaritySearchWithScore", async () => {
Expand All @@ -138,12 +130,7 @@ describe("VectaraStore", () => {
);
expect(resultsWithScore.length).toBeGreaterThan(0);
expect(resultsWithScore[0][0].pageContent.length).toBeGreaterThan(0);
expect(resultsWithScore[0][0].metadata.length).toBeGreaterThan(0);
expect(
resultsWithScore[0][0].metadata.find(
(item: { name: string }) => item.name === "title"
).value
).toBe("Lord of the Rings");
expect(resultsWithScore[0][0].metadata.title).toBe("Lord of the Rings");
expect(resultsWithScore[0][1]).toBeGreaterThan(0);
});

Expand All @@ -161,7 +148,7 @@ describe("VectaraStore", () => {
);
expect(results.length).toBeGreaterThan(0);
expect(results[0].pageContent.length).toBeGreaterThan(0);
expect(results[0].metadata.length).toBeGreaterThan(0);
expect(results[0].metadata.title).toBe("Lord of the Rings");
});

test.skip("similaritySearch with filter", async () => {
Expand All @@ -181,7 +168,7 @@ describe("VectaraStore", () => {
expect(hasEnglish).toBe(false);
});

it("addFiles", async () => {
test.skip("addFiles", async () => {
const docs = getDocs();
const englishOneContent = docs[0].pageContent;
const frenchOneContent = docs[2].pageContent;
Expand Down Expand Up @@ -210,18 +197,29 @@ describe("VectaraStore", () => {
fileName: "bitcoin.pdf",
});

const results = await store.addFiles(vectaraFiles);
const file_doc_ids = await store.addFiles(vectaraFiles);
doc_ids = [...doc_ids, ...file_doc_ids];

for (const file of files) {
fs.unlinkSync(file.filename);
}

expect(results).toEqual(3);
expect(file_doc_ids.length).toEqual(3);
const searchResults = await store.similaritySearch("What is bitcoin");
expect(searchResults.length).toBeGreaterThan(0);
expect(searchResults[0].pageContent).toContain(
"A Peer-to-Peer Electronic Cash System"
);
});

// Clean up: delete every document that the tests in this suite indexed.
afterAll(async () => {
  // When no test populated doc_ids (for example, all of them are skipped),
  // there is nothing to remove, and deleteDocuments rejects an empty ID list.
  if (doc_ids.length === 0) {
    return;
  }

  // Build a fresh store so cleanup does not depend on beforeAll having run.
  store = new VectaraStore({
    customerId: Number(process.env.VECTARA_CUSTOMER_ID) || 0,
    corpusId,
    apiKey: process.env.VECTARA_API_KEY || "",
  });
  await store.deleteDocuments(doc_ids);
});
});
});
74 changes: 65 additions & 9 deletions langchain/src/vectorstores/vectara.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import * as uuid from "uuid";

import { Document } from "../document.js";
import { Embeddings } from "../embeddings/base.js";
import { FakeEmbeddings } from "../embeddings/fake.js";
Expand Down Expand Up @@ -179,24 +181,73 @@ export class VectaraStore extends VectorStore {
);
}

/**
 * Deletes documents from the Vectara corpus, issuing one `delete-doc`
 * request per ID, sequentially. Only the first configured corpus ID
 * (`this.corpusId[0]`) is targeted.
 *
 * Deletion stops at the first failure; documents whose requests already
 * succeeded stay deleted.
 *
 * @param ids An array of document IDs to be deleted. Must be non-empty.
 * @returns Promise that resolves when every deletion has completed.
 * @throws Error if `ids` is missing or empty.
 * @throws Error (with a `code` property set to 500) if a request fails,
 * times out, or returns a status other than 200.
 */
async deleteDocuments(ids: string[]): Promise<void> {
  if (!ids || ids.length === 0) {
    throw new Error(`no "ids" specified for deletion`);
  }

  const headers = await this.getJsonHeader();
  for (const id of ids) {
    const data = {
      customer_id: this.customerId,
      corpus_id: this.corpusId[0],
      document_id: id,
    };

    // Abort the request if it runs longer than the configured API timeout.
    const controller = new AbortController();
    const timeout = setTimeout(
      () => controller.abort(),
      this.vectaraApiTimeoutSeconds * 1000
    );

    try {
      const response = await fetch(
        `https://${this.apiEndpoint}/v1/delete-doc`,
        {
          method: "POST",
          headers: headers?.headers,
          body: JSON.stringify(data),
          signal: controller.signal,
        }
      );
      if (response.status !== 200) {
        throw new Error(
          `Vectara API returned status code ${response.status} when deleting document ${id}`
        );
      }
    } catch (e) {
      // Normalise every failure (network error, abort, bad status) into a
      // single Error shape carrying a 500 code.
      const message = e instanceof Error ? e.message : String(e);
      const error = new Error(`Error ${message}`);
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      (error as any).code = 500;
      throw error;
    } finally {
      // Always release the timer, including when fetch rejects; otherwise it
      // stays pending until the timeout elapses.
      clearTimeout(timeout);
    }
  }
}

/**
* Adds documents to the Vectara store.
* @param documents An array of Document objects to add to the Vectara store.
* @returns A Promise that resolves when the documents have been added.
* @returns A Promise that resolves to an array of document IDs indexed in Vectara.
*/
async addDocuments(documents: Document[]): Promise<void> {
async addDocuments(documents: Document[]): Promise<string[]> {
if (this.corpusId.length > 1)
throw new Error("addDocuments does not support multiple corpus ids");

const headers = await this.getJsonHeader();
const doc_ids: string[] = [];
let countAdded = 0;
for (const [index, document] of documents.entries()) {
for (const document of documents) {
const doc_id: string = document.metadata?.document_id ?? uuid.v4();
const data = {
customer_id: this.customerId,
corpus_id: this.corpusId[0],
document: {
document_id:
document.metadata?.document_id ?? `${Date.now()}${index}`,
document_id: doc_id,
title: document.metadata?.title ?? "",
metadata_json: JSON.stringify(document.metadata ?? {}),
section: [
Expand Down Expand Up @@ -235,6 +286,7 @@ export class VectaraStore extends VectorStore {
throw error;
} else {
countAdded += 1;
doc_ids.push(doc_id);
}
} catch (e) {
const error = new Error(
Expand All @@ -248,6 +300,8 @@ export class VectaraStore extends VectorStore {
if (this.verbose) {
console.log(`Added ${countAdded} documents to Vectara`);
}

return doc_ids;
}

/**
Expand All @@ -266,7 +320,7 @@ export class VectaraStore extends VectorStore {
if (this.corpusId.length > 1)
throw new Error("addFiles does not support multiple corpus ids");

let numDocs = 0;
const doc_ids: string[] = [];

for (const [index, file] of files.entries()) {
const md = metadatas ? metadatas[index] : {};
Expand All @@ -276,7 +330,7 @@ export class VectaraStore extends VectorStore {
data.append("doc-metadata", JSON.stringify(md));

const response = await fetch(
`https://api.vectara.io/v1/upload?c=${this.customerId}&o=${this.corpusId[0]}`,
`https://api.vectara.io/v1/upload?c=${this.customerId}&o=${this.corpusId[0]}&d=true`,
{
method: "POST",
headers: {
Expand All @@ -293,15 +347,17 @@ export class VectaraStore extends VectorStore {
} else if (status !== 200) {
throw new Error(`Vectara API returned status code ${status}`);
} else {
numDocs += 1;
const result = await response.json();
const doc_id = result.document.documentId;
doc_ids.push(doc_id);
}
}

if (this.verbose) {
console.log(`Uploaded ${files.length} files to Vectara`);
}

return numDocs;
return doc_ids;
}

/**
Expand Down

0 comments on commit 84fd5f2

Please sign in to comment.