Merge pull request #1 from langchain-ai/erick/init-pkg

init pkg
langchain-ai · Mar 26, 2024 · 74ef417 · 74ef417
2 parents 9189ad6 + 8c88097
commit 74ef417
Show file tree

Hide file tree

Showing 33 changed files with 5,302 additions and 26 deletions.
diff --git a/.github/scripts/check_diff.py b/.github/scripts/check_diff.py
@@ -2,7 +2,7 @@
 import sys
 from typing import Dict
 
-LIB_DIRS = ["libs/{lib}"]
+LIB_DIRS = ["libs/elasticsearch"]
 
 if __name__ == "__main__":
     files = sys.argv[1:]

diff --git a/.github/workflows/_release.yml b/.github/workflows/_release.yml
@@ -12,7 +12,7 @@ on:
       working-directory:
         required: true
         type: string
-        default: 'libs/'
+        default: 'libs/elasticsearch'
 
 env:
   PYTHON_VERSION: "3.11"
@@ -158,7 +158,9 @@ jobs:
 
       - name: Run integration tests
         env:
-          PARTNER_API_KEY: ${{ secrets.PARTNER_API_KEY }}
+          ES_API_KEY: ${{ secrets.ES_API_KEY }}
+          ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
+          ES_URL: ${{ secrets.ES_URL }}
         run: make integration_tests
         working-directory: ${{ inputs.working-directory }}
 

diff --git a/README.md b/README.md
@@ -1,25 +1,5 @@
-# 🦜️🔗 LangChain {partner}
+# 🦜️🔗 LangChain Elastic
 
-This repository contains {n} packages with {partner} integrations with LangChain:
+This repository contains 1 package with Elasticsearch integrations for LangChain:
 
-- [langchain-{package}](https://pypi.org/project/langchain-{package}/) integrates [{product}}]({product_link}).
-{- ... if more packages}
-
-## Initial Repo Checklist (Remove this section after completing)
-
-This setup assumes that the partner package is already split. For those instructions,
-see [these docs](https://python.langchain.com/docs/contributing/integrations#partner-packages).
-
-- [ ] Fill out the readme above (for folks that follow pypi link)
-- [ ] Copy package into /libs folder
-- [ ] Update these fields in /libs/*/pyproject.toml
-
-    - `tool.poetry.repository`
-    - `tool.poetry.urls["Source Code"]`
-
-- [ ] Add integration testing secrets in Github (ask Erick for help)
-- [ ] Add secrets as env vars in .github/workflows/_release.yml
-- [ ] Configure `LIB_DIRS` in .github/scripts/check_diff.py
-- [ ] Add partner collaborators in Github (ask Erick for help)
-- [ ] Add new repo to test-pypi and pypi trusted publishing (ask Erick for help)
-- [ ] Populate .github/workflows/_release.yml with `on.workflow_dispatch.inputs.working-directory.default`
+- [langchain-elasticsearch](https://pypi.org/project/langchain-elasticsearch/) integrates [Elasticsearch](https://www.elastic.co/elasticsearch).
diff --git a/libs/elasticsearch/.gitignore b/libs/elasticsearch/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/libs/elasticsearch/LICENSE b/libs/elasticsearch/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 LangChain, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/libs/elasticsearch/Makefile b/libs/elasticsearch/Makefile
@@ -0,0 +1,60 @@
+# Every target in this file is a command name, not a file it produces.
+.PHONY: all install help
+.PHONY: test tests integration_test integration_tests docker_tests extended_tests
+.PHONY: lint lint_diff lint_package lint_tests format format_diff
+.PHONY: spell_check spell_fix check_imports
+
+# Default target executed when no arguments are given to make.
+all: help
+
+install:
+	poetry install
+
+# Path handed to pytest. Unit tests by default; the integration targets
+# switch it to the integration test directory.
+TEST_FILE ?= tests/unit_tests/
+integration_test integration_tests: TEST_FILE=tests/integration_tests/
+
+test tests integration_test integration_tests:
+	poetry run pytest $(TEST_FILE)
+
+
+######################
+# LINTING AND FORMATTING
+######################
+
+# Files handed to ruff and mypy; narrowed per target below.
+PYTHON_FILES=.
+MYPY_CACHE=.mypy_cache
+# Ref that lint_diff/format_diff compare the working tree against.
+DIFF_BASE ?= master
+lint format: PYTHON_FILES=.
+# --relative must name this package's directory inside the repository so the
+# reported paths are relative to where this Makefile runs.
+lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/elasticsearch --name-only --diff-filter=d $(DIFF_BASE) | grep -E '\.py$$|\.ipynb$$')
+lint_package: PYTHON_FILES=langchain_elasticsearch
+lint_tests: PYTHON_FILES=tests
+lint_tests: MYPY_CACHE=.mypy_cache_test
+
+# Each step is skipped when PYTHON_FILES is empty (e.g. lint_diff with no
+# changed Python files) so the tools are never invoked without targets.
+lint lint_diff lint_package lint_tests:
+	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff $(PYTHON_FILES)
+	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
+	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)
+	[ "$(PYTHON_FILES)" = "" ] || (mkdir -p $(MYPY_CACHE) && poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE))
+
+format format_diff:
+	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)
+	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES)
+
+spell_check:
+	poetry run codespell --toml pyproject.toml
+
+spell_fix:
+	poetry run codespell --toml pyproject.toml -w
+
+# The package's Python files are listed as prerequisites only so that $^
+# expands to them on the command line.
+check_imports: $(shell find langchain_elasticsearch -name '*.py')
+	poetry run python ./scripts/check_imports.py $^
+
+######################
+# HELP
+######################
+
+help:
+	@echo '----'
+	@echo 'check_imports                - check imports'
+	@echo 'format                       - run code formatters'
+	@echo 'lint                         - run linters'
+	@echo 'test                         - run unit tests'
+	@echo 'tests                        - run unit tests'
+	@echo 'integration_tests            - run integration tests'
+	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
diff --git a/libs/elasticsearch/README.md b/libs/elasticsearch/README.md
@@ -0,0 +1,81 @@
+# langchain-elasticsearch
+
+This package contains the LangChain integration with Elasticsearch.
+
+## Installation
+
+```bash
+pip install -U langchain-elasticsearch
+```
+
+## Elasticsearch setup
+
+### Elastic Cloud
+
+You need a running Elasticsearch deployment. The easiest way to start one is through [Elastic Cloud](https://cloud.elastic.co/).
+You can sign up for a [free trial](https://www.elastic.co/cloud/cloud-trial-overview).
+
+1. [Create a deployment](https://www.elastic.co/guide/en/cloud/current/ec-create-deployment.html)
+2. Get your Cloud ID:
+    1. In the [Elastic Cloud console](https://cloud.elastic.co), click "Manage" next to your deployment
+    2. Copy the Cloud ID and paste it into the `es_cloud_id` parameter below
+3. Create an API key:
+    1. In the [Elastic Cloud console](https://cloud.elastic.co), click "Open" next to your deployment
+    2. In the left-hand side menu, go to "Stack Management", then to "API Keys"
+    3. Click "Create API key"
+    4. Enter a name for the API key and click "Create"
+    5. Copy the API key and paste it into the `es_api_key` parameter below
+
+### Docker
+
+Alternatively, you can run Elasticsearch via Docker as described in the [docs](https://python.langchain.com/docs/integrations/vectorstores/elasticsearch).
+
+## Usage
+
+### ElasticsearchStore
+
+The `ElasticsearchStore` class exposes Elasticsearch as a vector store.
+
+```python
+from langchain_elasticsearch import ElasticsearchStore
+
+embeddings = ... # use a LangChain Embeddings class or ElasticsearchEmbeddings
+
+vectorstore = ElasticsearchStore(
+    es_cloud_id="your-cloud-id",
+    es_api_key="your-api-key",
+    index_name="your-index-name",
+    embeddings=embeddings,
+)
+```
+
+### ElasticsearchEmbeddings
+
+The `ElasticsearchEmbeddings` class provides an interface to generate embeddings using a model
+deployed in an Elasticsearch cluster.
+
+```python
+from langchain_elasticsearch import ElasticsearchEmbeddings
+
+embeddings = ElasticsearchEmbeddings.from_credentials(
+    model_id="your-model-id",
+    input_field="your-input-field",
+    es_cloud_id="your-cloud-id",
+    es_api_key="your-api-key",
+)
+```
+
+### ElasticsearchChatMessageHistory
+
+The `ElasticsearchChatMessageHistory` class stores chat histories in Elasticsearch.
+
+```python
+from langchain_elasticsearch import ElasticsearchChatMessageHistory
+
+chat_history = ElasticsearchChatMessageHistory(
+    index="your-index-name",
+    session_id="your-session-id",
+    es_cloud_id="your-cloud-id",
+    es_api_key="your-api-key",
+)
+```
diff --git a/libs/elasticsearch/langchain_elasticsearch/__init__.py b/libs/elasticsearch/langchain_elasticsearch/__init__.py
@@ -0,0 +1,19 @@
+from langchain_elasticsearch.chat_history import ElasticsearchChatMessageHistory
+from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings
+from langchain_elasticsearch.retrievers import ElasticsearchRetriever
+from langchain_elasticsearch.vectorstores import (
+    ApproxRetrievalStrategy,
+    ElasticsearchStore,
+    ExactRetrievalStrategy,
+    SparseRetrievalStrategy,
+)
+
+# Names exported by `from langchain_elasticsearch import *`; each one is
+# imported from its submodule at the top of this file.
+__all__ = [
+    "ApproxRetrievalStrategy",
+    "ElasticsearchChatMessageHistory",
+    "ElasticsearchEmbeddings",
+    "ElasticsearchRetriever",
+    "ElasticsearchStore",
+    "ExactRetrievalStrategy",
+    "SparseRetrievalStrategy",
+]
diff --git a/libs/elasticsearch/langchain_elasticsearch/_utilities.py b/libs/elasticsearch/langchain_elasticsearch/_utilities.py
@@ -0,0 +1,108 @@
+from enum import Enum
+from typing import List, Union
+
+import numpy as np
+from elasticsearch import BadRequestError, ConflictError, Elasticsearch, NotFoundError
+from langchain_core import __version__ as langchain_version
+
+Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
+
+
+class DistanceStrategy(str, Enum):
+    """Names of the strategies for calculating distances between vectors.
+
+    The class derives from both ``str`` and ``Enum``, so every member compares
+    equal to its plain string value, which is identical to the member name.
+    """
+
+    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
+    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
+    DOT_PRODUCT = "DOT_PRODUCT"
+    JACCARD = "JACCARD"
+    COSINE = "COSINE"
+
+
+def with_user_agent_header(client: Elasticsearch, header_prefix: str) -> Elasticsearch:
+    headers = dict(client._headers)
+    headers.update({"user-agent": f"{header_prefix}/{langchain_version}"})
+    return client.options(headers=headers)
+
+
+def maximal_marginal_relevance(
+    query_embedding: np.ndarray,
+    embedding_list: list,
+    lambda_mult: float = 0.5,
+    k: int = 4,
+) -> List[int]:
+    """Calculate maximal marginal relevance."""
+    if min(k, len(embedding_list)) <= 0:
+        return []
+    if query_embedding.ndim == 1:
+        query_embedding = np.expand_dims(query_embedding, axis=0)
+    similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
+    most_similar = int(np.argmax(similarity_to_query))
+    idxs = [most_similar]
+    selected = np.array([embedding_list[most_similar]])
+    while len(idxs) < min(k, len(embedding_list)):
+        best_score = -np.inf
+        idx_to_add = -1
+        similarity_to_selected = cosine_similarity(embedding_list, selected)
+        for i, query_score in enumerate(similarity_to_query):
+            if i in idxs:
+                continue
+            redundant_score = max(similarity_to_selected[i])
+            equation_score = (
+                lambda_mult * query_score - (1 - lambda_mult) * redundant_score
+            )
+            if equation_score > best_score:
+                best_score = equation_score
+                idx_to_add = i
+        idxs.append(idx_to_add)
+        selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
+    return idxs
+
+
+def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
+    """Row-wise cosine similarity between two equal-width matrices.
+
+    Args:
+        X: Rows to compare; converted with ``np.array``.
+        Y: Rows to compare against; must have as many columns as ``X``.
+
+    Returns:
+        An empty 1-D array when either input has length 0. Otherwise, in the
+        NumPy fallback, an array of shape ``(len(X), len(Y))`` whose ``[i, j]``
+        entry is the cosine similarity of ``X[i]`` and ``Y[j]``.
+
+    Raises:
+        ValueError: If ``X`` and ``Y`` have a different number of columns.
+    """
+    if len(X) == 0 or len(Y) == 0:
+        return np.array([])
+
+    X = np.array(X)
+    Y = np.array(Y)
+    if X.shape[1] != Y.shape[1]:
+        raise ValueError(
+            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
+            f"and Y has shape {Y.shape}."
+        )
+    try:
+        # Optional dependency: used when importable, otherwise the NumPy
+        # implementation in the ``except ImportError`` branch runs.
+        import simsimd as simd  # type: ignore
+
+        X = np.array(X, dtype=np.float32)
+        Y = np.array(Y, dtype=np.float32)
+        # NOTE(review): presumably cdist returns cosine *distances*, hence the
+        # ``1 -`` to turn them into similarities; confirm against simsimd.
+        Z = 1 - simd.cdist(X, Y, metric="cosine")
+        # A bare float result is wrapped so callers always get an ndarray.
+        if isinstance(Z, float):
+            return np.array([Z])
+        return np.array(Z)
+    except ImportError:
+        X_norm = np.linalg.norm(X, axis=1)
+        Y_norm = np.linalg.norm(Y, axis=1)
+        # Ignore divide by zero errors run time warnings as those are handled below.
+        with np.errstate(divide="ignore", invalid="ignore"):
+            similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
+        # Entries that came out as NaN or inf are reported as 0.0 similarity.
+        similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
+        return similarity
+
+
+def check_if_model_deployed(client: Elasticsearch, model_id: str) -> None:
+    try:
+        dummy = {"x": "y"}
+        client.ml.infer_trained_model(model_id=model_id, docs=[dummy])
+    except NotFoundError as err:
+        raise err
+    except ConflictError as err:
+        raise NotFoundError(
+            f"model '{model_id}' not found, please deploy it first",
+            meta=err.meta,
+            body=err.body,
+        ) from err
+    except BadRequestError:
+        # This error is expected because we do not know the expected document
+        # shape and just use a dummy doc above.
+        pass