Skip to content

Commit

Permalink
Merge pull request #1 from langchain-ai/erick/init-pkg
Browse files Browse the repository at this point in the history
init pkg
  • Loading branch information
efriis authored Mar 26, 2024
2 parents 9189ad6 + 8c88097 commit 74ef417
Show file tree
Hide file tree
Showing 33 changed files with 5,302 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/scripts/check_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
from typing import Dict

LIB_DIRS = ["libs/{lib}"]
LIB_DIRS = ["libs/elasticsearch"]

if __name__ == "__main__":
files = sys.argv[1:]
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
working-directory:
required: true
type: string
default: 'libs/'
default: 'libs/elasticsearch'

env:
PYTHON_VERSION: "3.11"
Expand Down Expand Up @@ -158,7 +158,9 @@ jobs:

- name: Run integration tests
env:
PARTNER_API_KEY: ${{ secrets.PARTNER_API_KEY }}
ES_API_KEY: ${{ secrets.ES_API_KEY }}
ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
ES_URL: ${{ secrets.ES_URL }}
run: make integration_tests
working-directory: ${{ inputs.working-directory }}

Expand Down
26 changes: 3 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,5 @@
# 🦜️🔗 LangChain {partner}
# 🦜️🔗 LangChain Elastic

This repository contains {n} packages with {partner} integrations with LangChain:
This repository contains 1 package with ElasticSearch integrations with LangChain:

- [langchain-{package}](https://pypi.org/project/langchain-{package}/) integrates [{product}}]({product_link}).
{- ... if more packages}

## Initial Repo Checklist (Remove this section after completing)

This setup assumes that the partner package is already split. For those instructions,
see [these docs](https://python.langchain.com/docs/contributing/integrations#partner-packages).

- [ ] Fill out the readme above (for folks that follow pypi link)
- [ ] Copy package into /libs folder
- [ ] Update these fields in /libs/*/pyproject.toml

- `tool.poetry.repository`
- `tool.poetry.urls["Source Code"]`

- [ ] Add integration testing secrets in Github (ask Erick for help)
- [ ] Add secrets as env vars in .github/workflows/_release.yml
- [ ] Configure `LIB_DIRS` in .github/scripts/check_diff.py
- [ ] Add partner collaborators in Github (ask Erick for help)
- [ ] Add new repo to test-pypi and pypi trusted publishing (ask Erick for help)
- [ ] Populate .github/workflows/_release.yml with `on.workflow_dispatch.inputs.working-directory.default`
- [langchain-elasticsearch](https://pypi.org/project/langchain-elasticsearch/) integrates [ElasticSearch](https://www.elastic.co/elasticsearch).
1 change: 1 addition & 0 deletions libs/elasticsearch/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__
21 changes: 21 additions & 0 deletions libs/elasticsearch/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 LangChain, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
60 changes: 60 additions & 0 deletions libs/elasticsearch/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests

# Default target executed when no arguments are given to make.
all: help

install:
poetry install

# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
integration_test integration_tests: TEST_FILE=tests/integration_tests/

test tests integration_test integration_tests:
poetry run pytest $(TEST_FILE)


######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/elasticsearch --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_elasticsearch
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

lint lint_diff lint_package lint_tests:
poetry run ruff .
poetry run ruff format $(PYTHON_FILES) --diff
poetry run ruff --select I $(PYTHON_FILES)
mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)

format format_diff:
poetry run ruff format $(PYTHON_FILES)
poetry run ruff --select I --fix $(PYTHON_FILES)

spell_check:
poetry run codespell --toml pyproject.toml

spell_fix:
poetry run codespell --toml pyproject.toml -w

check_imports: $(shell find langchain_elasticsearch -name '*.py')
poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
@echo '----'
@echo 'check_imports - check imports'
@echo 'format - run code formatters'
@echo 'lint - run linters'
@echo 'test - run unit tests'
@echo 'tests - run unit tests'
@echo 'test TEST_FILE=<test_file> - run all tests in file'
81 changes: 81 additions & 0 deletions libs/elasticsearch/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# langchain-elasticsearch

This package contains the LangChain integration with Elasticsearch.

## Installation

```bash
pip install -U langchain-elasticsearch
```

## Elasticsearch setup

### Elastic Cloud

You need a running Elasticsearch deployment. The easiest way to start one is through [Elastic Cloud](https://cloud.elastic.co/).
You can sign up for a [free trial](https://www.elastic.co/cloud/cloud-trial-overview).

1. [Create a deployment](https://www.elastic.co/guide/en/cloud/current/ec-create-deployment.html)
2. Get your Cloud ID:
1. In the [Elastic Cloud console](https://cloud.elastic.co), click "Manage" next to your deployment
2. Copy the Cloud ID and paste it into the `es_cloud_id` parameter below
3. Create an API key:
1. In the [Elastic Cloud console](https://cloud.elastic.co), click "Open" next to your deployment
2. In the left-hand side menu, go to "Stack Management", then to "API Keys"
3. Click "Create API key"
4. Enter a name for the API key and click "Create"
5. Copy the API key and paste it into the `es_api_key` parameter below

### Elastic Cloud

Alternatively, you can run Elasticsearch via Docker as described in the [docs](https://python.langchain.com/docs/integrations/vectorstores/elasticsearch).

## Usage

### ElasticsearchStore

The `ElasticsearchStore` class exposes Elasticsearch as a vector store.

```python
from langchain_elasticsearch import ElasticsearchStore

embeddings = ... # use a LangChain Embeddings class or ElasticsearchEmbeddings

vectorstore = ElasticsearchStore(
es_cloud_id="your-cloud-id",
es_api_key="your-api-key",
index_name="your-index-name",
embeddings=embeddings,
)
```

### ElasticsearchEmbeddings

The `ElasticsearchEmbeddings` class provides an interface to generate embeddings using a model
deployed in an Elasticsearch cluster.

```python
from langchain_elasticsearch import ElasticsearchEmbeddings

embeddings = ElasticsearchEmbeddings.from_credentials(
model_id="your-model-id",
input_field="your-input-field",
es_cloud_id="your-cloud-id",
es_api_key="your-api-key",
)
```

### ElasticsearchChatMessageHistory

The `ElasticsearchChatMessageHistory` class stores chat histories in Elasticsearch.

```python
from langchain_elasticsearch import ElasticsearchChatMessageHistory

chat_history = ElasticsearchChatMessageHistory(
index="your-index-name",
session_id="your-session-id",
es_cloud_id="your-cloud-id",
es_api_key="your-api-key",
)
```
19 changes: 19 additions & 0 deletions libs/elasticsearch/langchain_elasticsearch/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from langchain_elasticsearch.chat_history import ElasticsearchChatMessageHistory
from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings
from langchain_elasticsearch.retrievers import ElasticsearchRetriever
from langchain_elasticsearch.vectorstores import (
ApproxRetrievalStrategy,
ElasticsearchStore,
ExactRetrievalStrategy,
SparseRetrievalStrategy,
)

__all__ = [
"ApproxRetrievalStrategy",
"ElasticsearchChatMessageHistory",
"ElasticsearchEmbeddings",
"ElasticsearchRetriever",
"ElasticsearchStore",
"ExactRetrievalStrategy",
"SparseRetrievalStrategy",
]
108 changes: 108 additions & 0 deletions libs/elasticsearch/langchain_elasticsearch/_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from enum import Enum
from typing import List, Union

import numpy as np
from elasticsearch import BadRequestError, ConflictError, Elasticsearch, NotFoundError
from langchain_core import __version__ as langchain_version

Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]


class DistanceStrategy(str, Enum):
"""Enumerator of the Distance strategies for calculating distances
between vectors."""

EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
DOT_PRODUCT = "DOT_PRODUCT"
JACCARD = "JACCARD"
COSINE = "COSINE"


def with_user_agent_header(client: Elasticsearch, header_prefix: str) -> Elasticsearch:
headers = dict(client._headers)
headers.update({"user-agent": f"{header_prefix}/{langchain_version}"})
return client.options(headers=headers)


def maximal_marginal_relevance(
query_embedding: np.ndarray,
embedding_list: list,
lambda_mult: float = 0.5,
k: int = 4,
) -> List[int]:
"""Calculate maximal marginal relevance."""
if min(k, len(embedding_list)) <= 0:
return []
if query_embedding.ndim == 1:
query_embedding = np.expand_dims(query_embedding, axis=0)
similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
most_similar = int(np.argmax(similarity_to_query))
idxs = [most_similar]
selected = np.array([embedding_list[most_similar]])
while len(idxs) < min(k, len(embedding_list)):
best_score = -np.inf
idx_to_add = -1
similarity_to_selected = cosine_similarity(embedding_list, selected)
for i, query_score in enumerate(similarity_to_query):
if i in idxs:
continue
redundant_score = max(similarity_to_selected[i])
equation_score = (
lambda_mult * query_score - (1 - lambda_mult) * redundant_score
)
if equation_score > best_score:
best_score = equation_score
idx_to_add = i
idxs.append(idx_to_add)
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
return idxs


def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices."""
if len(X) == 0 or len(Y) == 0:
return np.array([])

X = np.array(X)
Y = np.array(Y)
if X.shape[1] != Y.shape[1]:
raise ValueError(
f"Number of columns in X and Y must be the same. X has shape {X.shape} "
f"and Y has shape {Y.shape}."
)
try:
import simsimd as simd # type: ignore

X = np.array(X, dtype=np.float32)
Y = np.array(Y, dtype=np.float32)
Z = 1 - simd.cdist(X, Y, metric="cosine")
if isinstance(Z, float):
return np.array([Z])
return np.array(Z)
except ImportError:
X_norm = np.linalg.norm(X, axis=1)
Y_norm = np.linalg.norm(Y, axis=1)
# Ignore divide by zero errors run time warnings as those are handled below.
with np.errstate(divide="ignore", invalid="ignore"):
similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
return similarity


def check_if_model_deployed(client: Elasticsearch, model_id: str) -> None:
try:
dummy = {"x": "y"}
client.ml.infer_trained_model(model_id=model_id, docs=[dummy])
except NotFoundError as err:
raise err
except ConflictError as err:
raise NotFoundError(
f"model '{model_id}' not found, please deploy it first",
meta=err.meta,
body=err.body,
) from err
except BadRequestError:
# This error is expected because we do not know the expected document
# shape and just use a dummy doc above.
pass
Loading

0 comments on commit 74ef417

Please sign in to comment.