Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chroma vector store #1198

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -213,16 +213,16 @@ jobs:
pip install --upgrade pip
if [ $RAY = "ENABLED" ]; then
if [ $PY_VERSION != "3.11" ]; then
pip install ".[dev,ray,qdrant]"
pip install ".[dev,ray,qdrant,pinecone,chromadb]"
else
pip install ".[dev]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
pip install ".[dev,pinecone,chromadb]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
fi
python -c "import yaml;f = open('evadb/evadb.yml', 'r+');config_obj = yaml.load(f, Loader=yaml.FullLoader);config_obj['experimental']['ray'] = True;f.seek(0);f.write(yaml.dump(config_obj));f.truncate();"
else
if [ $PY_VERSION != "3.11" ]; then
pip install ".[dev,ludwig,qdrant]"
pip install ".[dev,ludwig,qdrant,pinecone,chromadb]"
else
pip install ".[dev]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
pip install ".[dev,pinecone,chromadb]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
fi
fi

Expand Down Expand Up @@ -449,7 +449,7 @@ jobs:
source test_evadb/bin/activate
pip install --upgrade pip
pip debug --verbose
pip install ".[dev,ludwig,qdrant,forecasting]"
pip install ".[dev,ludwig,qdrant,forecasting,pinecone,chromadb]"
source test_evadb/bin/activate
bash script/test/test.sh -m "<< parameters.mode >>"

Expand Down
1 change: 1 addition & 0 deletions evadb/catalog/catalog_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ class VectorStoreType(EvaDBEnum):
QDRANT # noqa: F821
PINECONE # noqa: F821
PGVECTOR # noqa: F821
CHROMADB # noqa: F821


class VideoColumnName(EvaDBEnum):
Expand Down
2 changes: 2 additions & 0 deletions evadb/executor/executor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ def handle_vector_store_params(
return {"index_path": index_path}
elif vector_store_type == VectorStoreType.QDRANT:
return {"index_db": str(Path(index_path).parent)}
elif vector_store_type == VectorStoreType.CHROMADB:
return {"index_path": index_path}
elif vector_store_type == VectorStoreType.PINECONE:
return {}
else:
Expand Down
2 changes: 1 addition & 1 deletion evadb/interfaces/relational/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def create_vector_index(
index_name (str): Name of the index.
table_name (str): Name of the table.
expr (str): Expression used to build the vector index.
using (str): Method used for indexing, can be `FAISS` or `QDRANT` or `PINECONE`.
using (str): Method used for indexing, can be `FAISS` or `QDRANT` or `PINECONE` or `CHROMADB`.

Returns:
EvaDBCursor: The EvaDBCursor object.
Expand Down
3 changes: 2 additions & 1 deletion evadb/parser/evadb.lark
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ function_metadata_key: uid

function_metadata_value: string_literal | decimal_literal

vector_store_type: USING (FAISS | QDRANT | PINECONE | PGVECTOR )
vector_store_type: USING (FAISS | QDRANT | PINECONE | PGVECTOR | CHROMADB)

index_elem: ("(" uid_list ")"
| "(" function_call ")")
Expand Down Expand Up @@ -416,6 +416,7 @@ FAISS: "FAISS"i
QDRANT: "QDRANT"i
PINECONE: "PINECONE"i
PGVECTOR: "PGVECTOR"i
CHROMADB: "CHROMADB"i

// Computer vision tasks

Expand Down
2 changes: 2 additions & 0 deletions evadb/parser/lark_visitor/_create_statements.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,8 @@ def vector_store_type(self, tree):
vector_store_type = VectorStoreType.PINECONE
elif str.upper(token) == "PGVECTOR":
vector_store_type = VectorStoreType.PGVECTOR
elif str.upper(token) == "CHROMADB":
vector_store_type = VectorStoreType.CHROMADB
return vector_store_type


Expand Down
81 changes: 81 additions & 0 deletions evadb/third_party/vector_stores/chromadb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

from evadb.third_party.vector_stores.types import (
FeaturePayload,
VectorIndexQuery,
VectorIndexQueryResult,
VectorStore,
)
from evadb.utils.generic_utils import try_to_import_chromadb_client

# Persistent ChromaDB clients created so far, keyed by their storage path.
# A single shared client would pin every index to whichever path happened to
# be requested first, so one client is cached per distinct path instead.
_chromadb_client_instances = {}

required_params = ["index_path"]


def get_chromadb_client(index_path: str):
    """Return the persistent ChromaDB client for ``index_path``.

    The client is created on first use and reused for later calls with the
    same path. Calls with a different path get their own client rather than
    the one created for an earlier path.

    Args:
        index_path (str): Directory in which ChromaDB persists its data.

    Returns:
        The ``chromadb.PersistentClient`` bound to ``index_path``.

    Raises:
        ValueError: If the ``chromadb`` package is not installed.
    """
    cache_key = str(index_path)
    client = _chromadb_client_instances.get(cache_key)
    if client is None:
        # Import lazily so chromadb stays an optional dependency.
        try_to_import_chromadb_client()
        import chromadb  # noqa: F401

        # creating a local client
        client = chromadb.PersistentClient(path=index_path)
        _chromadb_client_instances[cache_key] = client
    return client


class ChromaDBVectorStore(VectorStore):
    """Vector store that keeps an index in a ChromaDB collection.

    The collection is named after the index and is accessed through the
    client returned by ``get_chromadb_client``.
    """

    def __init__(self, index_name: str, index_path: str) -> None:
        """Bind the store to a collection name and a ChromaDB client.

        Args:
            index_name (str): Name used for the ChromaDB collection.
            index_path (str): Path handed to ``get_chromadb_client``.
        """
        self._client = get_chromadb_client(index_path)
        self._collection_name = index_name

    def create(self, vector_dim: int):
        """Create the collection with cosine distance as its HNSW space."""
        # NOTE(review): the vector dimension is passed as the HNSW
        # `construction_ef` setting; confirm this is intentional.
        collection_metadata = {
            "hnsw:construction_ef": vector_dim,
            "hnsw:space": "cosine",
        }
        self._client.create_collection(
            name=self._collection_name,
            metadata=collection_metadata,
        )

    def add(self, payload: List[FeaturePayload]):
        """Insert every row of ``payload`` into the collection.

        Row ids are stored as strings and each embedding is flattened to a
        one-dimensional list before insertion.
        """
        row_ids = [str(entry.id) for entry in payload]
        vectors = [entry.embedding.reshape(-1).tolist() for entry in payload]
        collection = self._client.get_collection(self._collection_name)
        collection.add(ids=row_ids, embeddings=vectors)

    def delete(self) -> None:
        """Delete the collection backing this store."""
        self._client.delete_collection(name=self._collection_name)

    def query(
        self,
        query: VectorIndexQuery,
    ) -> VectorIndexQueryResult:
        """Look up the ``query.top_k`` nearest entries to ``query.embedding``.

        Returns:
            VectorIndexQueryResult: Built from the returned distances and
            the matching ids converted back to ``int``. Both lists are
            empty when the response carries no ``"ids"`` key.
        """
        collection = self._client.get_collection(self._collection_name)
        flat_embedding = query.embedding.reshape(-1).tolist()
        response = collection.query(
            query_embeddings=flat_embedding,
            n_results=query.top_k,
        )

        if "ids" not in response:
            return VectorIndexQueryResult([], [])

        # Only the first result list is read; presumably ChromaDB returns
        # one list per query embedding (to be confirmed against its docs).
        matched_ids = [int(raw_id) for raw_id in response["ids"][0]]
        matched_distances = list(response["distances"][0])
        return VectorIndexQueryResult(matched_distances, matched_ids)
7 changes: 7 additions & 0 deletions evadb/third_party/vector_stores/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from evadb.catalog.catalog_type import VectorStoreType
from evadb.third_party.vector_stores.chromadb import ChromaDBVectorStore
from evadb.third_party.vector_stores.faiss import FaissVectorStore
from evadb.third_party.vector_stores.pinecone import PineconeVectorStore
from evadb.third_party.vector_stores.qdrant import QdrantVectorStore
Expand Down Expand Up @@ -42,5 +43,11 @@ def init_vector_store(
validate_kwargs(kwargs, required_params, required_params)
return PineconeVectorStore(index_name, **kwargs)

elif vector_store_type == VectorStoreType.CHROMADB:
from evadb.third_party.vector_stores.chromadb import required_params

validate_kwargs(kwargs, required_params, required_params)
return ChromaDBVectorStore(index_name, **kwargs)

else:
raise Exception(f"Vector store {vector_store_type} not supported")
18 changes: 18 additions & 0 deletions evadb/utils/generic_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,16 @@ def try_to_import_pinecone_client():
)


def try_to_import_chromadb_client():
    """Check that the optional ``chromadb`` package can be imported.

    Raises:
        ValueError: If importing ``chromadb`` fails. The message includes
            the pip command that installs it, and the original
            ``ImportError`` is attached as the cause.
    """
    try:
        import chromadb  # noqa: F401
    except ImportError as import_err:
        raise ValueError(
            """Could not import chromadb python package.
                Please install it with 'pip install chromadb'."""
        ) from import_err


def is_qdrant_available() -> bool:
try:
try_to_import_qdrant_client()
Expand All @@ -516,6 +526,14 @@ def is_pinecone_available() -> bool:
return False


def is_chromadb_available() -> bool:
    """Return True when ``try_to_import_chromadb_client`` succeeds.

    A ``ValueError`` from that check is treated as "not available" and
    yields False; any other exception propagates.
    """
    try:
        try_to_import_chromadb_client()
    except ValueError:
        return False
    else:
        return True


##############################
## UTILS
##############################
Expand Down
2 changes: 2 additions & 0 deletions script/formatting/spelling.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ CatalogManagerTests
CatalogModelsTest
ChatGPT
ChatGPTTest
CHROMADB
ChromaDB
ColConstraintInfo
Colab
ColorSpace
Expand Down
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ def read(path, encoding="utf-8"):

pinecone_libs = ["pinecone-client"]

chromadb_libs = ["chromadb"]

postgres_libs = [
"psycopg2",
]
Expand Down Expand Up @@ -154,6 +156,8 @@ def read(path, encoding="utf-8"):
"function": function_libs,
"notebook": notebook_libs,
"qdrant": qdrant_libs,
"pinecone": pinecone_libs,
"chromadb": chromadb_libs,
"postgres": postgres_libs,
"ludwig": ludwig_libs,
"forecasting": forecasting_libs,
Expand Down
20 changes: 19 additions & 1 deletion test/integration_tests/long/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import os
import time
import unittest
from test.markers import pinecone_skip_marker, qdrant_skip_marker
from test.markers import chromadb_skip_marker, pinecone_skip_marker, qdrant_skip_marker
from test.util import (
create_sample_image,
get_evadb_for_testing,
Expand Down Expand Up @@ -411,6 +411,24 @@ def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_qdrant(sel
# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testQdrantIndexImageDataset")

@chromadb_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_chromadb(
    self,
):
    """Build a CHROMADB index and check the top similarity match.

    Creates the index over ``DummyFeatureExtractor(data)`` on the image
    dataset, runs a ``LIMIT 1`` similarity query against ``self.img_path``
    and expects ``_row_id`` 5 as the closest row.
    """
    create_index_query = """CREATE INDEX testChromaDBIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING CHROMADB;"""
    execute_query_fetch_all(self.evadb, create_index_query)

    select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
        self.img_path
    )

    res_batch = execute_query_fetch_all(self.evadb, select_query)
    self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)

    # Cleanup, mirroring the Qdrant variant of this test, so the index does
    # not stay registered in the catalog for the tests that follow.
    self.evadb.catalog().drop_index_catalog_entry("testChromaDBIndexImageDataset")

@pinecone_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_pinecone(
self,
Expand Down
8 changes: 7 additions & 1 deletion test/markers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import pytest

from evadb.utils.generic_utils import (
is_chromadb_available,
is_forecast_available,
is_gpu_available,
is_ludwig_available,
Expand All @@ -37,7 +38,12 @@

pinecone_skip_marker = pytest.mark.skipif(
is_pinecone_available() is False,
reason="skipping since pinecone is not installed",
reason="Skipping since pinecone is not installed",
)

# Skips the decorated test when is_chromadb_available() reports False.
chromadb_skip_marker = pytest.mark.skipif(
    not is_chromadb_available(),
    reason="Skipping since chromadb is not installed",
)

windows_skip_marker = pytest.mark.skipif(
Expand Down