generated from langchain-ai/integration-repo-template
-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from langchain-ai/erick/init-pkg
init pkg
- Loading branch information
Showing
33 changed files
with
5,302 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,5 @@ | ||
# 🦜️🔗 LangChain {partner} | ||
# 🦜️🔗 LangChain Elastic | ||
|
||
This repository contains {n} packages with {partner} integrations with LangChain: | ||
This repository contains 1 package with Elasticsearch integrations with LangChain: | ||
|
||
- [langchain-{package}](https://pypi.org/project/langchain-{package}/) integrates [{product}}]({product_link}). | ||
{- ... if more packages} | ||
|
||
## Initial Repo Checklist (Remove this section after completing) | ||
|
||
This setup assumes that the partner package is already split. For those instructions, | ||
see [these docs](https://python.langchain.com/docs/contributing/integrations#partner-packages). | ||
|
||
- [ ] Fill out the readme above (for folks that follow pypi link) | ||
- [ ] Copy package into /libs folder | ||
- [ ] Update these fields in /libs/*/pyproject.toml | ||
|
||
- `tool.poetry.repository` | ||
- `tool.poetry.urls["Source Code"]` | ||
|
||
- [ ] Add integration testing secrets in Github (ask Erick for help) | ||
- [ ] Add secrets as env vars in .github/workflows/_release.yml | ||
- [ ] Configure `LIB_DIRS` in .github/scripts/check_diff.py | ||
- [ ] Add partner collaborators in Github (ask Erick for help) | ||
- [ ] Add new repo to test-pypi and pypi trusted publishing (ask Erick for help) | ||
- [ ] Populate .github/workflows/_release.yml with `on.workflow_dispatch.inputs.working-directory.default` | ||
- [langchain-elasticsearch](https://pypi.org/project/langchain-elasticsearch/) integrates [Elasticsearch](https://www.elastic.co/elasticsearch). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2024 LangChain, Inc. | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Every target in this file is a command, not a file. Declaring them phony
# keeps a same-named file or directory (e.g. tests/) from making a target look
# up to date. docker_tests and extended_tests are kept from the original list.
.PHONY: all install help
.PHONY: test tests integration_test integration_tests docker_tests extended_tests
.PHONY: format format_diff lint lint_diff lint_package lint_tests
.PHONY: spell_check spell_fix check_imports

# Default target executed when no arguments are given to make.
all: help

install:
	poetry install

# Define a variable for the test file path.
# Unit tests run by default; the integration targets override the path.
TEST_FILE ?= tests/unit_tests/
integration_test integration_tests: TEST_FILE=tests/integration_tests/

test tests integration_test integration_tests:
	poetry run pytest $(TEST_FILE)


######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
# --relative must name this package's directory inside the repository
# (libs/elasticsearch); with any other path the diff lists no files.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/elasticsearch --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_elasticsearch
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

lint lint_diff lint_package lint_tests:
	poetry run ruff .
	poetry run ruff format $(PYTHON_FILES) --diff
	poetry run ruff --select I $(PYTHON_FILES)
	# -p: do not complain when the cache directory already exists.
	mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)

format format_diff:
	poetry run ruff format $(PYTHON_FILES)
	poetry run ruff --select I --fix $(PYTHON_FILES)

spell_check:
	poetry run codespell --toml pyproject.toml

spell_fix:
	poetry run codespell --toml pyproject.toml -w

# $^ expands to every .py file found under the package directory.
check_imports: $(shell find langchain_elasticsearch -name '*.py')
	poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
	@echo '----'
	@echo 'check_imports				- check imports'
	@echo 'format                       - run code formatters'
	@echo 'lint                         - run linters'
	@echo 'test                         - run unit tests'
	@echo 'tests                        - run unit tests'
	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# langchain-elasticsearch | ||
|
||
This package contains the LangChain integration with Elasticsearch. | ||
|
||
## Installation | ||
|
||
```bash | ||
pip install -U langchain-elasticsearch | ||
``` | ||
|
||
## Elasticsearch setup | ||
|
||
### Elastic Cloud | ||
|
||
You need a running Elasticsearch deployment. The easiest way to start one is through [Elastic Cloud](https://cloud.elastic.co/). | ||
You can sign up for a [free trial](https://www.elastic.co/cloud/cloud-trial-overview). | ||
|
||
1. [Create a deployment](https://www.elastic.co/guide/en/cloud/current/ec-create-deployment.html) | ||
2. Get your Cloud ID: | ||
1. In the [Elastic Cloud console](https://cloud.elastic.co), click "Manage" next to your deployment | ||
2. Copy the Cloud ID and paste it into the `es_cloud_id` parameter below | ||
3. Create an API key: | ||
1. In the [Elastic Cloud console](https://cloud.elastic.co), click "Open" next to your deployment | ||
2. In the left-hand side menu, go to "Stack Management", then to "API Keys" | ||
3. Click "Create API key" | ||
4. Enter a name for the API key and click "Create" | ||
5. Copy the API key and paste it into the `es_api_key` parameter below | ||
|
||
### Docker | ||
|
||
Alternatively, you can run Elasticsearch via Docker as described in the [docs](https://python.langchain.com/docs/integrations/vectorstores/elasticsearch). | ||
|
||
## Usage | ||
|
||
### ElasticsearchStore | ||
|
||
The `ElasticsearchStore` class exposes Elasticsearch as a vector store. | ||
|
||
```python | ||
from langchain_elasticsearch import ElasticsearchStore | ||
|
||
embeddings = ... # use a LangChain Embeddings class or ElasticsearchEmbeddings | ||
|
||
vectorstore = ElasticsearchStore( | ||
es_cloud_id="your-cloud-id", | ||
es_api_key="your-api-key", | ||
index_name="your-index-name", | ||
embeddings=embeddings, | ||
) | ||
``` | ||
|
||
### ElasticsearchEmbeddings | ||
|
||
The `ElasticsearchEmbeddings` class provides an interface to generate embeddings using a model | ||
deployed in an Elasticsearch cluster. | ||
|
||
```python | ||
from langchain_elasticsearch import ElasticsearchEmbeddings | ||
|
||
embeddings = ElasticsearchEmbeddings.from_credentials( | ||
model_id="your-model-id", | ||
input_field="your-input-field", | ||
es_cloud_id="your-cloud-id", | ||
es_api_key="your-api-key", | ||
) | ||
``` | ||
|
||
### ElasticsearchChatMessageHistory | ||
|
||
The `ElasticsearchChatMessageHistory` class stores chat histories in Elasticsearch. | ||
|
||
```python | ||
from langchain_elasticsearch import ElasticsearchChatMessageHistory | ||
|
||
chat_history = ElasticsearchChatMessageHistory( | ||
index="your-index-name", | ||
session_id="your-session-id", | ||
es_cloud_id="your-cloud-id", | ||
es_api_key="your-api-key", | ||
) | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from langchain_elasticsearch.chat_history import ElasticsearchChatMessageHistory | ||
from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings | ||
from langchain_elasticsearch.retrievers import ElasticsearchRetriever | ||
from langchain_elasticsearch.vectorstores import ( | ||
ApproxRetrievalStrategy, | ||
ElasticsearchStore, | ||
ExactRetrievalStrategy, | ||
SparseRetrievalStrategy, | ||
) | ||
|
||
__all__ = [ | ||
"ApproxRetrievalStrategy", | ||
"ElasticsearchChatMessageHistory", | ||
"ElasticsearchEmbeddings", | ||
"ElasticsearchRetriever", | ||
"ElasticsearchStore", | ||
"ExactRetrievalStrategy", | ||
"SparseRetrievalStrategy", | ||
] |
108 changes: 108 additions & 0 deletions
108
libs/elasticsearch/langchain_elasticsearch/_utilities.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
from enum import Enum | ||
from typing import List, Union | ||
|
||
import numpy as np | ||
from elasticsearch import BadRequestError, ConflictError, Elasticsearch, NotFoundError | ||
from langchain_core import __version__ as langchain_version | ||
|
||
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] | ||
|
||
|
||
class DistanceStrategy(str, Enum):
    """Names of the strategies available for measuring distance between vectors.

    The enum also subclasses ``str`` and every member's value is its own name,
    so a member compares equal to the plain string (for example
    ``DistanceStrategy.COSINE == "COSINE"``).
    """

    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
    DOT_PRODUCT = "DOT_PRODUCT"
    JACCARD = "JACCARD"
    COSINE = "COSINE"
|
||
|
||
def with_user_agent_header(client: Elasticsearch, header_prefix: str) -> Elasticsearch:
    """Return ``client.options(...)`` with a LangChain ``user-agent`` header set.

    Args:
        client: Elasticsearch client whose current headers are copied.
        header_prefix: Text placed before the ``/`` in the user-agent value.

    Returns:
        The result of ``client.options(headers=...)``, where the headers are
        the client's existing ones plus
        ``"user-agent": "<header_prefix>/<langchain_core version>"``.
    """
    user_agent = f"{header_prefix}/{langchain_version}"
    # Build a fresh mapping so the client's own header object is not mutated.
    merged_headers = {**client._headers, "user-agent": user_agent}
    return client.options(headers=merged_headers)
|
||
|
||
def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Select embeddings by maximal marginal relevance (MMR).

    The first pick is the embedding most cosine-similar to the query. Each
    later pick maximises
    ``lambda_mult * sim(query, c) - (1 - lambda_mult) * max_sim(c, picked)``
    over the candidates ``c`` that have not been picked yet.

    Args:
        query_embedding: Query vector; a 1-D array is promoted to one row.
        embedding_list: Candidate embeddings to choose from.
        lambda_mult: Weight of query similarity against redundancy.
        k: Maximum number of indices to return.

    Returns:
        Indices into ``embedding_list`` in selection order, at most
        ``min(k, len(embedding_list))`` of them; an empty list when that
        bound is not positive.
    """
    target_count = min(k, len(embedding_list))
    if target_count <= 0:
        return []

    if query_embedding.ndim == 1:
        query_embedding = query_embedding[np.newaxis, :]
    relevance = cosine_similarity(query_embedding, embedding_list)[0]

    first_pick = int(np.argmax(relevance))
    chosen = [first_pick]
    chosen_vectors = np.array([embedding_list[first_pick]])

    while len(chosen) < target_count:
        # Similarity of every candidate to every embedding picked so far.
        overlap = cosine_similarity(embedding_list, chosen_vectors)
        top_score = -np.inf
        top_index = -1
        for candidate, query_score in enumerate(relevance):
            if candidate in chosen:
                continue
            penalty = max(overlap[candidate])
            score = lambda_mult * query_score - (1 - lambda_mult) * penalty
            if score > top_score:
                top_score, top_index = score, candidate
        chosen.append(top_index)
        chosen_vectors = np.append(
            chosen_vectors, [embedding_list[top_index]], axis=0
        )
    return chosen
|
||
|
||
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Compute cosine similarity between each row of ``X`` and each row of ``Y``.

    Args:
        X: Matrix whose rows are vectors.
        Y: Matrix whose rows are vectors, with as many columns as ``X``.

    Returns:
        An empty array when either input has length zero. Otherwise the
        similarities; on the NumPy fallback path this has shape
        ``(len(X), len(Y))`` and NaN or infinite entries (e.g. from
        zero-norm rows) are replaced by ``0.0``.

    Raises:
        ValueError: If ``X`` and ``Y`` have different numbers of columns.
    """
    if not (len(X) and len(Y)):
        return np.array([])

    lhs = np.array(X)
    rhs = np.array(Y)
    if lhs.shape[1] != rhs.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {lhs.shape} "
            f"and Y has shape {rhs.shape}."
        )

    try:
        # Optional accelerator; when it is not installed NumPy is used instead.
        import simsimd as simd  # type: ignore

        lhs = np.array(lhs, dtype=np.float32)
        rhs = np.array(rhs, dtype=np.float32)
        result = 1 - simd.cdist(lhs, rhs, metric="cosine")
        return np.array([result]) if isinstance(result, float) else np.array(result)
    except ImportError:
        lhs_norm = np.linalg.norm(lhs, axis=1)
        rhs_norm = np.linalg.norm(rhs, axis=1)
        # Division by a zero norm is expected here; the resulting non-finite
        # values are zeroed right after, so the runtime warnings are silenced.
        with np.errstate(divide="ignore", invalid="ignore"):
            similarity = np.dot(lhs, rhs.T) / np.outer(lhs_norm, rhs_norm)
        similarity[~np.isfinite(similarity)] = 0.0
        return similarity
|
||
|
||
def check_if_model_deployed(client: Elasticsearch, model_id: str) -> None:
    """Probe the cluster to check that ``model_id`` can serve inference.

    Sends one placeholder document to ``client.ml.infer_trained_model``.

    Args:
        client: Elasticsearch client used for the probe request.
        model_id: Identifier of the trained model to check.

    Raises:
        NotFoundError: Propagated unchanged from the client, or raised in
            place of a ``ConflictError`` with a message asking for the model
            to be deployed first.
    """
    probe_docs = [{"x": "y"}]
    try:
        client.ml.infer_trained_model(model_id=model_id, docs=probe_docs)
    except NotFoundError:
        raise
    except ConflictError as conflict:
        # Report a conflict to the caller as "not found / not deployed".
        raise NotFoundError(
            f"model '{model_id}' not found, please deploy it first",
            meta=conflict.meta,
            body=conflict.body,
        ) from conflict
    except BadRequestError:
        # Expected: the real document shape is unknown, so the placeholder
        # document sent above may be rejected as a bad request.
        pass
Oops, something went wrong.