From 3c2e26faad9371d19ff6ff3b2be0c5f7548321fd Mon Sep 17 00:00:00 2001 From: Raj Mehta Date: Tue, 10 Sep 2024 15:38:42 -0700 Subject: [PATCH] feat(community): BM25 Keyword Search Retriever (#6719) Co-authored-by: jacoblee93 --- .../docs/integrations/retrievers/bm25.ipynb | 101 ++++++++++++++++++ libs/langchain-community/.gitignore | 4 + libs/langchain-community/langchain.config.js | 1 + libs/langchain-community/package.json | 13 +++ .../src/load/import_map.ts | 1 + .../src/retrievers/bm25.ts | 58 ++++++++++ .../src/retrievers/tests/bm25.test.ts | 27 +++++ .../src/utils/@furkantoprak/bm25/BM25.ts | 100 +++++++++++++++++ .../src/utils/@furkantoprak/bm25/LICENSE.md | 21 ++++ 9 files changed, 326 insertions(+) create mode 100644 docs/core_docs/docs/integrations/retrievers/bm25.ipynb create mode 100644 libs/langchain-community/src/retrievers/bm25.ts create mode 100644 libs/langchain-community/src/retrievers/tests/bm25.test.ts create mode 100644 libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts create mode 100644 libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md diff --git a/docs/core_docs/docs/integrations/retrievers/bm25.ipynb b/docs/core_docs/docs/integrations/retrievers/bm25.ipynb new file mode 100644 index 000000000000..b106554c6b40 --- /dev/null +++ b/docs/core_docs/docs/integrations/retrievers/bm25.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BM25\n", + "\n", + "BM25, also known as [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25), is a ranking function used in information retrieval systems to estimate the relevance of documents to a given search query.\n", + "\n", + "You can use it as part of your retrieval pipeline to rerank documents as a postprocessing step after retrieving an initial set of documents from another source.\n", + "\n", + "## Setup\n", + "\n", + "The `BM25Retriever` is exported from `@langchain/community`. 
You'll need to install it like this:\n", + "\n", + "```{=mdx}\n", + "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\";\n", + "\n", + "<IntegrationInstallTooltip></IntegrationInstallTooltip>\n", + "\n", + "<Npm2Yarn>\n", + " @langchain/community @langchain/core\n", + "</Npm2Yarn>\n", + "```\n", + "\n", + "This retriever uses code from [`this implementation`](https://github.com/FurkanToprak/OkapiBM25) of Okapi BM25.\n", + "\n", + "## Usage\n", + "\n", + "You can now create a new retriever with previously retrieved documents:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " { pageContent: 'mitochondria is made of lipids', metadata: {} },\n", + " {\n", + " pageContent: 'mitochondria is the powerhouse of the cell',\n", + " metadata: {}\n", + " },\n", + " { pageContent: 'Buildings are made out of brick', metadata: {} },\n", + " { pageContent: 'Buildings are made out of wood', metadata: {} }\n", + "]\n" + ] + } + ], + "source": [ + "import { BM25Retriever } from \"@langchain/community/retrievers/bm25\";\n", + "\n", + "const retriever = BM25Retriever.fromDocuments([\n", + " { pageContent: \"Buildings are made out of brick\", metadata: {} },\n", + " { pageContent: \"Buildings are made out of wood\", metadata: {} },\n", + " { pageContent: \"Buildings are made out of stone\", metadata: {} },\n", + " { pageContent: \"Cars are made out of metal\", metadata: {} },\n", + " { pageContent: \"Cars are made out of plastic\", metadata: {} },\n", + " { pageContent: \"mitochondria is the powerhouse of the cell\", metadata: {} },\n", + " { pageContent: \"mitochondria is made of lipids\", metadata: {} },\n", + "], { k: 4 });\n", + "\n", + "// Will return the 4 documents reranked by the BM25 algorithm\n", + "await retriever.invoke(\"mitochondria\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index 299b955d097a..1bb10382f52e 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -626,6 +626,10 @@ retrievers/amazon_knowledge_base.cjs retrievers/amazon_knowledge_base.js retrievers/amazon_knowledge_base.d.ts retrievers/amazon_knowledge_base.d.cts +retrievers/bm25.cjs +retrievers/bm25.js +retrievers/bm25.d.ts +retrievers/bm25.d.cts retrievers/chaindesk.cjs retrievers/chaindesk.js retrievers/chaindesk.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 5c9818185cec..c4a964f747c4 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -196,6 +196,7 @@ export const config = { // retrievers "retrievers/amazon_kendra": "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base", + "retrievers/bm25": "retrievers/bm25", "retrievers/chaindesk": "retrievers/chaindesk", "retrievers/databerry": "retrievers/databerry", "retrievers/dria": "retrievers/dria", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 85444a2936a0..f8d2288a20c6 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -2118,6 +2118,15 @@ "import": "./retrievers/amazon_knowledge_base.js", "require": "./retrievers/amazon_knowledge_base.cjs" }, + "./retrievers/bm25": { + "types": { + "import": "./retrievers/bm25.d.ts", + "require": 
"./retrievers/bm25.d.cts", + "default": "./retrievers/bm25.d.ts" + }, + "import": "./retrievers/bm25.js", + "require": "./retrievers/bm25.cjs" + }, "./retrievers/chaindesk": { "types": { "import": "./retrievers/chaindesk.d.ts", @@ -3713,6 +3722,10 @@ "retrievers/amazon_knowledge_base.js", "retrievers/amazon_knowledge_base.d.ts", "retrievers/amazon_knowledge_base.d.cts", + "retrievers/bm25.cjs", + "retrievers/bm25.js", + "retrievers/bm25.d.ts", + "retrievers/bm25.d.cts", "retrievers/chaindesk.cjs", "retrievers/chaindesk.js", "retrievers/chaindesk.d.ts", diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts index 59efd56b760e..5bbd9e4d0a01 100644 --- a/libs/langchain-community/src/load/import_map.ts +++ b/libs/langchain-community/src/load/import_map.ts @@ -54,6 +54,7 @@ export * as chat_models__moonshot from "../chat_models/moonshot.js"; export * as chat_models__ollama from "../chat_models/ollama.js"; export * as chat_models__togetherai from "../chat_models/togetherai.js"; export * as chat_models__yandex from "../chat_models/yandex.js"; +export * as retrievers__bm25 from "../retrievers/bm25.js"; export * as retrievers__chaindesk from "../retrievers/chaindesk.js"; export * as retrievers__databerry from "../retrievers/databerry.js"; export * as retrievers__remote from "../retrievers/remote/index.js"; diff --git a/libs/langchain-community/src/retrievers/bm25.ts b/libs/langchain-community/src/retrievers/bm25.ts new file mode 100644 index 000000000000..dfc04709cba1 --- /dev/null +++ b/libs/langchain-community/src/retrievers/bm25.ts @@ -0,0 +1,58 @@ +import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers"; +import { Document } from "@langchain/core/documents"; + +import { BM25 } from "../utils/@furkantoprak/bm25/BM25.js"; + +export type BM25RetrieverOptions = { + docs: Document[]; + k: number; +} & BaseRetrieverInput; + +/** + * A retriever that uses the BM25 algorithm to rank documents based 
/**
 * A retriever that ranks a fixed, in-memory list of documents against a
 * query with the Okapi BM25 scoring function and returns the best matches.
 *
 * Scoring is delegated to the inlined `BM25` helper (adapted from the
 * OkapiBM25 project). Matching is case-insensitive: both the query and the
 * document text are lower-cased before they are scored.
 *
 * The `k` option determines the maximum number of documents returned for
 * each query.
 */
export class BM25Retriever extends BaseRetriever {
  static lc_name() {
    return "BM25Retriever";
  }

  lc_namespace = ["langchain", "retrievers", "bm25_retriever"];

  /**
   * Convenience factory: builds a retriever over `documents`.
   * @param documents The documents that will be ranked on every query.
   * @param options Every retriever option except `docs` (notably `k`).
   */
  static fromDocuments(
    documents: Document[],
    options: Omit<BM25RetrieverOptions, "docs">
  ) {
    return new this({ ...options, docs: documents });
  }

  /** The documents that are scored against each query. */
  docs: Document[];

  /** Maximum number of documents returned per query. */
  k: number;

  constructor(options: BM25RetrieverOptions) {
    super(options);
    this.docs = options.docs;
    this.k = options.k;
  }

  /**
   * Lower-cases `text` and splits it into whitespace-separated tokens.
   * Empty tokens (produced by leading or trailing whitespace) are dropped,
   * because an empty search term would match at every position of every
   * document and distort the scores.
   */
  private preprocessFunc(text: string): string[] {
    return text
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token.length > 0);
  }

  /**
   * Scores every stored document against `query` and returns at most `k`
   * documents, highest score first. Documents with equal scores keep their
   * original relative order.
   * @param query Free-text query; tokenised on whitespace.
   * @returns The top-ranked documents (the original `Document` objects).
   */
  async _getRelevantDocuments(query: string): Promise<Document[]> {
    const processedQuery = this.preprocessFunc(query);
    // The query tokens are lower-cased, so the document text has to be
    // lower-cased as well; otherwise capitalised words never match.
    const documents = this.docs.map((doc) => doc.pageContent.toLowerCase());
    const scores = BM25(documents, processedQuery) as number[];

    const scoredDocs = this.docs.map((doc, index) => ({
      document: doc,
      score: scores[index],
    }));

    scoredDocs.sort((a, b) => b.score - a.score);

    return scoredDocs.slice(0, this.k).map((item) => item.document);
  }
}
/**
 * Okapi BM25 scoring helpers.
 *
 * Adapted from
 * https://github.com/FurkanToprak/OkapiBM25
 * (MIT License, Copyright (c) 2020 Furkan Toprak).
 *
 * Inlined due to CJS import issues.
 */

/** Escapes every regular-expression metacharacter so `text` is matched literally. */
const escapeRegExp = (text: string): string =>
  text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/** Gets word count: the number of `\w+` runs in `corpus` (0 for empty input). */
export const getWordCount = (corpus: string): number => {
  return ((corpus || "").match(/\w+/g) || []).length;
};

/**
 * Number of occurrences of a term in a string.
 *
 * The term is matched literally (regex metacharacters such as `+`, `.` or `(`
 * are escaped) and as a substring, so "fox" is counted inside "foxes".
 * An empty term never matches and yields 0.
 */
export const getTermFrequency = (term: string, corpus: string): number => {
  if (!term) {
    // An empty pattern would match at every position of the corpus.
    return 0;
  }
  const pattern = new RegExp(escapeRegExp(term), "g");
  return ((corpus || "").match(pattern) || []).length;
};

/**
 * Inverse document frequency of `term` over `documents`.
 *
 * A document counts as relevant when it contains the term as a substring,
 * which mirrors how `getTermFrequency` counts occurrences.
 */
export const getIDF = (term: string, documents: string[]): number => {
  // Number of relevant documents.
  const relevantDocuments = documents.filter((document: string) =>
    document.includes(term)
  ).length;
  return Math.log(
    (documents.length - relevantDocuments + 0.5) / (relevantDocuments + 0.5) + 1
  );
};

/** Represents a document; useful when sorting results. */
export interface BMDocument {
  /** The document that was originally scored. */
  document: string;
  /** The score that the document receives. */
  score: number;
}

/** Free parameters of the BM25 formula (term-frequency saturation and length normalisation). */
export interface BMConstants {
  /** Free parameter. Is 0.75 by default. 0 disables document-length normalisation. */
  b?: number;
  /** Free parameter. Is 1.2 by default. Generally in range [1.2, 2.0] */
  k1?: number;
}

/** If returns positive, the sorting results in secondEl coming before firstEl, else, firstEl comes before secondEl */
export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;

/** Implementation of Okapi BM25 algorithm.
 * @param documents Collection of documents.
 * @param keywords Query terms.
 * @param constants Contains free parameters k1 and b. b=0.75 and k1=1.2 by default; an explicit 0 is honoured.
 * @param sorter A function that allows you to sort results by a given rule. If not provided, returns scores corresponding to the original document order.
 * If this option is provided, the return type will not be an array of scores but an array of documents with their scores.
 * @returns One score per document (input order), or sorted `BMDocument`s when `sorter` is given.
 */
export function BM25(
  documents: string[],
  keywords: string[],
  constants?: BMConstants,
  sorter?: BMSorter
): number[] | BMDocument[] {
  // `??` rather than a truthiness test: 0 is a legitimate value for both parameters.
  const b = constants?.b ?? 0.75;
  const k1 = constants?.k1 ?? 1.2;
  const documentLengths = documents.map((document: string) =>
    getWordCount(document)
  );
  const totalLength = documentLengths.reduce(
    (sum: number, length: number) => sum + length,
    0
  );
  // Guard against an empty corpus (0 / 0).
  const averageDocumentLength =
    documents.length > 0 ? totalLength / documents.length : 0;
  const idfByKeyword = keywords.reduce((obj, keyword) => {
    obj.set(keyword, getIDF(keyword, documents));
    return obj;
  }, new Map<string, number>());

  const scores = documents.map((document: string, index: number) => {
    // When no document contains a word the average length is 0; treat the
    // length ratio as 0 instead of producing NaN.
    const lengthRatio =
      averageDocumentLength > 0
        ? documentLengths[index] / averageDocumentLength
        : 0;
    const score = keywords
      .map((keyword: string) => {
        const inverseDocumentFrequency = idfByKeyword.get(keyword);
        if (inverseDocumentFrequency === undefined) {
          throw new Error("Missing keyword.");
        }
        const termFrequency = getTermFrequency(keyword, document);
        if (termFrequency === 0) {
          // The numerator is 0; returning early also avoids 0 / 0 when the
          // denominator collapses to 0 (e.g. b = 1 and an empty document).
          return 0;
        }
        return (
          (inverseDocumentFrequency * (termFrequency * (k1 + 1))) /
          (termFrequency + k1 * (1 - b + b * lengthRatio))
        );
      })
      .reduce((a: number, b: number) => a + b, 0);
    if (sorter) {
      return { score, document } as BMDocument;
    }
    return score;
  });
  // sort the results
  if (sorter) {
    return (scores as BMDocument[]).sort(sorter);
  }
  return scores as number[];
}