diff --git a/docs/api_refs/typedoc.json b/docs/api_refs/typedoc.json index fa0fda258ea8..d92ae9104b00 100644 --- a/docs/api_refs/typedoc.json +++ b/docs/api_refs/typedoc.json @@ -230,6 +230,7 @@ "./langchain/src/retrievers/self_query/pinecone.ts", "./langchain/src/retrievers/self_query/supabase.ts", "./langchain/src/retrievers/self_query/weaviate.ts", + "./langchain/src/retrievers/self_query/vectara.ts", "./langchain/src/retrievers/vespa.ts", "./langchain/src/cache/index.ts", "./langchain/src/cache/cloudflare_kv.ts", diff --git a/docs/core_docs/docs/modules/data_connection/retrievers/how_to/self_query/vectara-self-query.mdx b/docs/core_docs/docs/modules/data_connection/retrievers/how_to/self_query/vectara-self-query.mdx new file mode 100644 index 000000000000..4afdef301f6b --- /dev/null +++ b/docs/core_docs/docs/modules/data_connection/retrievers/how_to/self_query/vectara-self-query.mdx @@ -0,0 +1,39 @@ +# Vectara Self Query Retriever + +This example shows how to use a self query retriever with a [Vectara](https://vectara.com/) vector store. + +If you haven't already set up Vectara, please [follow the instructions here](/docs/integrations/vectorstores/vectara.mdx). + +## Usage + +This example shows how to initialize a `SelfQueryRetriever` with a vector store: + +import CodeBlock from "@theme/CodeBlock"; +import Example from "@examples/retrievers/vectara_self_query.ts"; + +<CodeBlock language="typescript">{Example}</CodeBlock> + +You can also initialize the retriever with default search parameters that apply in +addition to the generated query: + +```typescript +const selfQueryRetriever = await SelfQueryRetriever.fromLLM({ + llm, + vectorStore, + documentContents, + attributeInfo, + /** + * We need to use a translator that translates the queries into a + * filter format that the vector store can understand. LangChain provides one here.
+ */ + structuredQueryTranslator: new VectaraTranslator(), + searchParams: { + filter: { + filter: "( doc.genre = 'science fiction' ) and ( doc.rating > 8.5 )", + }, + mergeFiltersOperator: "and", + }, +}); +``` + +See the [official docs](https://docs.vectara.com/) for more on how to construct metadata filters. diff --git a/examples/src/retrievers/vectara_self_query.ts b/examples/src/retrievers/vectara_self_query.ts new file mode 100644 index 000000000000..53e3bd6ec760 --- /dev/null +++ b/examples/src/retrievers/vectara_self_query.ts @@ -0,0 +1,137 @@ +import { AttributeInfo } from "langchain/schema/query_constructor"; +import { Document } from "langchain/document"; +import { SelfQueryRetriever } from "langchain/retrievers/self_query"; + +import { OpenAI } from "langchain/llms/openai"; +import { VectaraStore } from "langchain/vectorstores/vectara"; +import { VectaraTranslator } from "langchain/retrievers/self_query/vectara"; +import { FakeEmbeddings } from "langchain/embeddings/fake"; +/** + * First, we create a bunch of documents. You can load your own documents here instead. + * Each document has a pageContent and a metadata field. Make sure your metadata matches the AttributeInfo below.
+ */ +const docs = [ + new Document({ + pageContent: + "A bunch of scientists bring back dinosaurs and mayhem breaks loose", + metadata: { year: 1993, rating: 7.7, genre: "science fiction" }, + }), + new Document({ + pageContent: + "Leo DiCaprio gets lost in a dream within a dream within a dream within a ...", + metadata: { year: 2010, director: "Christopher Nolan", rating: 8.2 }, + }), + new Document({ + pageContent: + "A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea", + metadata: { year: 2006, director: "Satoshi Kon", rating: 8.6 }, + }), + new Document({ + pageContent: + "A bunch of normal-sized women are supremely wholesome and some men pine after them", + metadata: { year: 2019, director: "Greta Gerwig", rating: 8.3 }, + }), + new Document({ + pageContent: "Toys come alive and have a blast doing so", + metadata: { year: 1995, genre: "animated" }, + }), + new Document({ + pageContent: "Three men walk into the Zone, three men walk out of the Zone", + metadata: { + year: 1979, + rating: 9.9, + director: "Andrei Tarkovsky", + genre: "science fiction", + }, + }), +]; + +/** + * Next, we define the attributes we want to be able to query on. + * In this case, we want to be able to query on the genre, year, director, and rating of the movie. + * We also provide a description of each attribute and the type of the attribute. + * This is used to generate the query prompts. + * + * The same attributes must also be set up as filter attributes in Vectara, otherwise filtering won't work. + * To set up the filters in Vectara, go to Data -> {your_created_corpus} -> Overview. + * In the overview section, edit the filters section and add all of the following attributes to + * the filters.
+ */ +const attributeInfo: AttributeInfo[] = [ + { + name: "genre", + description: "The genre of the movie", + type: "string or array of strings", + }, + { + name: "year", + description: "The year the movie was released", + type: "number", + }, + { + name: "director", + description: "The director of the movie", + type: "string", + }, + { + name: "rating", + description: "The rating of the movie (1-10)", + type: "number", + }, +]; + +/** + * Next, we instantiate a vector store. This is where the documents are stored and searched. + * fromDocuments also takes an embeddings object; a FakeEmbeddings instance is passed here. + * The connection settings are read from the VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and + * VECTARA_API_KEY environment variables. + */ + +const config = { + customerId: Number(process.env.VECTARA_CUSTOMER_ID), + corpusId: Number(process.env.VECTARA_CORPUS_ID), + apiKey: String(process.env.VECTARA_API_KEY), + verbose: true, +}; + +const vectorStore = await VectaraStore.fromDocuments( + docs, + new FakeEmbeddings(), + config +); + +const llm = new OpenAI(); +const documentContents = "Brief summary of a movie"; + +const selfQueryRetriever = await SelfQueryRetriever.fromLLM({ + llm, + vectorStore, + documentContents, + attributeInfo, + /** + * We need a translator that converts the structured query into a + * filter format that the vector store can understand. VectaraTranslator is used + * here, but you can create your own translator by extending the BaseTranslator + * abstract class. Note that the vector store needs to support filtering on the metadata + * attributes you want to query on. + */ + structuredQueryTranslator: new VectaraTranslator(), +}); + +/** + * Now we can query the vector store. + * We can ask questions like "Which movies are directed by Greta Gerwig?" or "Which movies are rated higher than 8.5?". + * We can also ask questions like "Which movies are either comedy or science fiction and are rated higher than 8.5?". + * The retriever will automatically convert these questions into queries that can be used to retrieve documents.
+ */ +const query1 = await selfQueryRetriever.getRelevantDocuments( + "What are some movies about dinosaurs" +); +const query2 = await selfQueryRetriever.getRelevantDocuments( + "I want to watch a movie rated higher than 8.5" +); +const query3 = await selfQueryRetriever.getRelevantDocuments( + "Which movies are directed by Greta Gerwig?" +); +const query4 = await selfQueryRetriever.getRelevantDocuments( + "Which movies are either comedy or science fiction and are rated higher than 8.5?" +); +console.log(query1, query2, query3, query4); diff --git a/langchain/.gitignore b/langchain/.gitignore index fc2be83f13e7..6b112fee5412 100644 --- a/langchain/.gitignore +++ b/langchain/.gitignore @@ -634,6 +634,9 @@ retrievers/self_query/supabase.d.ts retrievers/self_query/weaviate.cjs retrievers/self_query/weaviate.js retrievers/self_query/weaviate.d.ts +retrievers/self_query/vectara.cjs +retrievers/self_query/vectara.js +retrievers/self_query/vectara.d.ts retrievers/vespa.cjs retrievers/vespa.js retrievers/vespa.d.ts diff --git a/langchain/package.json b/langchain/package.json index f6aaedad24b1..cdecf18d9256 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -646,6 +646,9 @@ "retrievers/self_query/weaviate.cjs", "retrievers/self_query/weaviate.js", "retrievers/self_query/weaviate.d.ts", + "retrievers/self_query/vectara.cjs", + "retrievers/self_query/vectara.js", + "retrievers/self_query/vectara.d.ts", "retrievers/vespa.cjs", "retrievers/vespa.js", "retrievers/vespa.d.ts", @@ -2468,6 +2471,11 @@ "import": "./retrievers/self_query/weaviate.js", "require": "./retrievers/self_query/weaviate.cjs" }, + "./retrievers/self_query/vectara": { + "types": "./retrievers/self_query/vectara.d.ts", + "import": "./retrievers/self_query/vectara.js", + "require": "./retrievers/self_query/vectara.cjs" + }, "./retrievers/vespa": { "types": "./retrievers/vespa.d.ts", "import": "./retrievers/vespa.js", diff --git a/langchain/scripts/create-entrypoints.js 
b/langchain/scripts/create-entrypoints.js index 98200151910d..d29b2151219f 100644 --- a/langchain/scripts/create-entrypoints.js +++ b/langchain/scripts/create-entrypoints.js @@ -249,6 +249,7 @@ const entrypoints = { "retrievers/self_query/pinecone": "retrievers/self_query/pinecone", "retrievers/self_query/supabase": "retrievers/self_query/supabase", "retrievers/self_query/weaviate": "retrievers/self_query/weaviate", + "retrievers/self_query/vectara": "retrievers/self_query/vectara", "retrievers/vespa": "retrievers/vespa", // cache cache: "cache/index", @@ -459,6 +460,7 @@ const requiresOptionalDependency = [ "retrievers/self_query/pinecone", "retrievers/self_query/supabase", "retrievers/self_query/weaviate", + "retrievers/self_query/vectara", "output_parsers/expression", "chains/query_constructor", "chains/query_constructor/ir", diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts index a214f4fed76c..c98adf9c6451 100644 --- a/langchain/src/load/import_constants.ts +++ b/langchain/src/load/import_constants.ts @@ -131,6 +131,7 @@ export const optionalImportEntrypoints = [ "langchain/retrievers/self_query/pinecone", "langchain/retrievers/self_query/supabase", "langchain/retrievers/self_query/weaviate", + "langchain/retrievers/self_query/vectara", "langchain/cache/cloudflare_kv", "langchain/cache/momento", "langchain/cache/redis", diff --git a/langchain/src/load/import_type.d.ts b/langchain/src/load/import_type.d.ts index 248bc5f9eafb..ff96bdc124d3 100644 --- a/langchain/src/load/import_type.d.ts +++ b/langchain/src/load/import_type.d.ts @@ -391,6 +391,9 @@ export interface OptionalImportMap { "langchain/retrievers/self_query/weaviate"?: | typeof import("../retrievers/self_query/weaviate.js") | Promise<typeof import("../retrievers/self_query/weaviate.js")>; + "langchain/retrievers/self_query/vectara"?: + | typeof import("../retrievers/self_query/vectara.js") + | Promise<typeof import("../retrievers/self_query/vectara.js")>; "langchain/cache/cloudflare_kv"?: | typeof import("../cache/cloudflare_kv.js") | Promise<typeof import("../cache/cloudflare_kv.js")>; diff --git
a/langchain/src/retrievers/self_query/tests/vectara_self_query.int.test.ts b/langchain/src/retrievers/self_query/tests/vectara_self_query.int.test.ts new file mode 100644 index 000000000000..22a20532237d --- /dev/null +++ b/langchain/src/retrievers/self_query/tests/vectara_self_query.int.test.ts @@ -0,0 +1,113 @@ +/* eslint-disable no-process-env */ +import { test } from "@jest/globals"; +import { Document } from "../../../document.js"; +import { AttributeInfo } from "../../../schema/query_constructor.js"; +import { SelfQueryRetriever } from "../index.js"; +import { OpenAI } from "../../../llms/openai.js"; +import { VectaraTranslator } from "../vectara.js"; +import { FakeEmbeddings } from "../../../embeddings/fake.js"; +import { VectaraStore } from "../../../vectorstores/vectara.js"; + +test.skip("Vectara Self Query Retriever Test", async () => { + const docs = [ + new Document({ + pageContent: + "A bunch of scientists bring back dinosaurs and mayhem breaks loose", + metadata: { year: 1993, rating: 7.7, genre: "science fiction" }, + }), + new Document({ + pageContent: + "Leo DiCaprio gets lost in a dream within a dream within a dream within a ...", + metadata: { year: 2010, director: "Christopher Nolan", rating: 8.2 }, + }), + new Document({ + pageContent: + "A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea", + metadata: { year: 2006, director: "Satoshi Kon", rating: 8.6 }, + }), + new Document({ + pageContent: + "A bunch of normal-sized women are supremely wholesome and some men pine after them", + metadata: { year: 2019, director: "Greta Gerwig", rating: 8.3 }, + }), + new Document({ + pageContent: "Toys come alive and have a blast doing so", + metadata: { year: 1995, genre: "animated" }, + }), + new Document({ + pageContent: + "Three men walk into the Zone, three men walk out of the Zone", + metadata: { + year: 1979, + rating: 9.9, + director: "Andrei Tarkovsky", + genre: "science fiction", + }, 
+ }), + ]; + + const attributeInfo: AttributeInfo[] = [ + { + name: "genre", + description: "The genre of the movie", + type: "string or array of strings", + }, + { + name: "year", + description: "The year the movie was released", + type: "number", + }, + { + name: "director", + description: "The director of the movie", + type: "string", + }, + { + name: "rating", + description: "The rating of the movie (1-10)", + type: "number", + }, + ]; + const config = { + customerId: Number(process.env.VECTARA_CUSTOMER_ID), + corpusId: Number(process.env.VECTARA_CORPUS_ID), + apiKey: String(process.env.VECTARA_API_KEY), + verbose: true, + }; + + const vectorStore = await VectaraStore.fromDocuments( + docs, + new FakeEmbeddings(), + config + ); + + const llm = new OpenAI(); + const documentContents = "Brief summary of a movie"; + + const selfQueryRetriever = await SelfQueryRetriever.fromLLM({ + llm, + vectorStore, + documentContents, + attributeInfo, + + structuredQueryTranslator: new VectaraTranslator(), + }); + + const query1 = await selfQueryRetriever.getRelevantDocuments( + "I want to watch a movie rated higher than 8.5" + ); + const query2 = await selfQueryRetriever.getRelevantDocuments( + "Which movies are directed by Greta Gerwig?" + ); + const query3 = await selfQueryRetriever.getRelevantDocuments( + "Which movies are either comedy or science fiction and are rated higher than 8.5?" + ); + const query4 = await selfQueryRetriever.getRelevantDocuments( + "Wau wau wau wau hello gello hello?" 
+ ); + console.log(query1, query2, query3, query4); + expect(query1.length).toBe(2); + expect(query2.length).toBe(1); + expect(query3.length).toBe(1); + expect(query4.length).toBe(0); +}); diff --git a/langchain/src/retrievers/self_query/vectara.ts b/langchain/src/retrievers/self_query/vectara.ts new file mode 100644 index 000000000000..8dec60c8a17a --- /dev/null +++ b/langchain/src/retrievers/self_query/vectara.ts @@ -0,0 +1,181 @@ +import { + Comparator, + Comparators, + Comparison, + NOT, + Operation, + Operator, + Operators, + StructuredQuery, + Visitor, +} from "../../chains/query_constructor/ir.js"; +import { VectaraFilter, VectaraStore } from "../../vectorstores/vectara.js"; +import { BaseTranslator } from "./base.js"; +import { isFilterEmpty } from "./utils.js"; + +/** Every query operator except NOT; used to index the operator/comparator spelling table. */ +type AllowedOperator = Exclude<Operator, NOT>; + +/** Union of the values the translator's visit methods can produce. */ +export type VectaraVisitorResult = + | VectaraOperationResult + | VectaraComparisonResult + | VectaraVisitorStructuredQueryResult; +// eslint-disable-next-line @typescript-eslint/ban-types +export type VectaraOperationResult = String; +// eslint-disable-next-line @typescript-eslint/ban-types +export type VectaraComparisonResult = String; +/** Result of visiting a structured query: the filter expression is nested under filter.filter. */ +export type VectaraVisitorStructuredQueryResult = { + filter?: { filter?: VectaraOperationResult | VectaraComparisonResult }; +}; + +type Value = number | string; +function processValue(value: Value): string { + /** Convert a value to a string and add single quotes if it is a string.
*/ + if (typeof value === "string") { + return `'${value}'`; + } else { + return String(value); + } +} + +/** + * Translates structured queries into Vectara filter expression strings such as + * "( doc.rating > 8.5 )". Only the and/or operators and the eq, ne, lt, lte, gt and + * gte comparators are allowed. + */ +export class VectaraTranslator< + T extends VectaraStore +> extends BaseTranslator<T> { + declare VisitOperationOutput: VectaraOperationResult; + + declare VisitComparisonOutput: VectaraComparisonResult; + + allowedOperators: Operator[] = [Operators.and, Operators.or]; + + allowedComparators: Comparator[] = [ + Comparators.eq, + Comparators.ne, + Comparators.lt, + Comparators.lte, + Comparators.gt, + Comparators.gte, + ]; + + /** + * Maps an operator or comparator to its spelling in a filter expression. + * An empty allowed list disables the corresponding check. + * @param func The operator or comparator to format. + * @returns The filter token, e.g. " and " for and, ">=" for gte. + * @throws Error if func is not allowed, or is neither a known comparator nor a known operator. + */ + formatFunction(func: Operator | Comparator): string { + if (func in Comparators) { + if ( + this.allowedComparators.length > 0 && + !this.allowedComparators.includes(func as Comparator) + ) { + throw new Error( + `Comparator ${func} not allowed. Allowed comparators: ${this.allowedComparators.join( + ", " + )}` + ); + } + } else if (func in Operators) { + if ( + this.allowedOperators.length > 0 && + !this.allowedOperators.includes(func as Operator) + ) { + throw new Error( + `Operator ${func} not allowed. Allowed operators: ${this.allowedOperators.join( + ", " + )}` + ); + } + } else { + throw new Error("Unknown comparator or operator"); + } + + const mapDict = { + and: " and ", + or: " or ", + eq: "=", + ne: "!=", + lt: "<", + lte: "<=", + gt: ">", + gte: ">=", + }; + return mapDict[func as Comparator | AllowedOperator]; + } + + /** + * Visits an operation and returns a VectaraOperationResult. The + * operation's arguments are visited and the operator is formatted. + * @param operation The operation to visit. + * @returns A VectaraOperationResult. + */ + visitOperation(operation: Operation): this["VisitOperationOutput"] { + const args = operation.args?.map((arg) => + arg.accept(this as Visitor) + ) as VectaraVisitorResult[]; + const operator = this.formatFunction(operation.operator); + return `( ${args.join(operator)} )`; + } + + /** + * Visits a comparison and returns a VectaraComparisonResult.
The + * comparison's value is wrapped in single quotes when it is a string and the comparator is formatted. + * Throws an error if the comparator is not allowed. + * @param comparison The comparison to visit. + * @returns A VectaraComparisonResult. + */ + visitComparison(comparison: Comparison): this["VisitComparisonOutput"] { + const comparator = this.formatFunction(comparison.comparator); + return `( doc.${comparison.attribute} ${comparator} ${processValue( + comparison.value + )} )`; + } + + /** + * Visits a structured query and returns a VectaraStructuredQueryResult. + * If the query has a filter, it is visited and the resulting expression is + * nested under filter.filter; otherwise an empty object is returned. + * @param query The structured query to visit. + * @returns A VectaraStructuredQueryResult. + */ + visitStructuredQuery( + query: StructuredQuery + ): this["VisitStructuredQueryOutput"] { + let nextArg = {}; + if (query.filter) { + nextArg = { + filter: { filter: query.filter.accept(this as Visitor) }, + }; + } + return nextArg; + } + + /** + * Merges a default filter with the filter generated from the query. + * Both arguments are filter objects whose filter property holds the expression + * string, so the expression strings (not the objects) are combined. Each side is + * parenthesised so and/or precedence cannot change the meaning of either expression. + * NOTE(review): assumes every non-empty filter carries a filter expression string; confirm against isFilterEmpty. + * @param defaultFilter The default filter, if any. + * @param generatedFilter The filter generated from the query, if any. + * @param mergeType "and", "or" or "replace"; "replace" returns the generated filter only. + * @param forceDefaultFilter When the generated filter is empty, return the default filter regardless of mergeType. + * @returns The merged filter, or undefined when there is nothing to filter on + * (both filters empty, or the generated filter is empty with an "and" merge and no forced default). + * @throws Error if both filters are non-empty and mergeType is not "and", "or" or "replace". + */ + mergeFilters( + defaultFilter: VectaraFilter | undefined, + generatedFilter: VectaraFilter | undefined, + mergeType = "and", + forceDefaultFilter = false + ): VectaraFilter | undefined { + if (isFilterEmpty(defaultFilter) && isFilterEmpty(generatedFilter)) { + return undefined; + } + if (isFilterEmpty(defaultFilter) || mergeType === "replace") { + if (isFilterEmpty(generatedFilter)) { + return undefined; + } + return generatedFilter; + } + if (isFilterEmpty(generatedFilter)) { + if (forceDefaultFilter) { + return defaultFilter; + } + if (mergeType === "and") { + return undefined; + } + return defaultFilter; + } + + if (mergeType !== "and" && mergeType !== "or") { + throw new Error("Unknown merge type"); + } + return { + filter: `( ${defaultFilter?.filter} ) ${mergeType} ( ${generatedFilter?.filter} )`, + }; + } +}