From ba8108ef39c12c9a3f89643c13ba0739343d44c0 Mon Sep 17 00:00:00 2001 From: Brad Edwards Date: Fri, 6 Oct 2023 23:34:39 -0700 Subject: [PATCH 1/4] Fix: ChromaVectorStore can attempt to add in excess of chromadb batch size --- llama_index/vector_stores/chroma.py | 63 ++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/llama_index/vector_stores/chroma.py b/llama_index/vector_stores/chroma.py index d2373fe206ab2..3cf9f2b35d683 100644 --- a/llama_index/vector_stores/chroma.py +++ b/llama_index/vector_stores/chroma.py @@ -1,7 +1,7 @@ """Chroma vector store.""" import logging import math -from typing import Any, Dict, List, Optional, cast +from typing import Any, Dict, Generator, List, Optional, cast from llama_index.bridge.pydantic import Field, PrivateAttr from llama_index.schema import BaseNode, MetadataMode, TextNode @@ -32,6 +32,22 @@ def _to_chroma_filter(standard_filters: MetadataFilters) -> dict: import_err_msg = "`chromadb` package not found, please run `pip install chromadb`" +def chunk_list( + lst: List[BaseNode], max_chunk_size: int +) -> Generator[List[BaseNode], None, None]: + """Yield successive max_chunk_size-sized chunks from lst. + + Args: + lst (List[BaseNode]): list of nodes with embeddings + max_chunk_size (int): max chunk size + + Yields: + Generator[List[BaseNode], None, None]: list of nodes with embeddings + """ + for i in range(0, len(lst), max_chunk_size): + yield lst[i : i + max_chunk_size] + + class ChromaVectorStore(BasePydanticVectorStore): """Chroma vector store. @@ -129,27 +145,34 @@ def add(self, nodes: List[BaseNode]) -> List[str]: if not self._collection: raise ValueError("Collection not initialized") - embeddings = [] - metadatas = [] - ids = [] - documents = [] - for node in nodes: - embeddings.append(node.get_embedding()) - metadatas.append( - node_to_metadata_dict( - node, remove_text=True, flat_metadata=self.flat_metadata + max_chunk_size = 41665 # One less than the max chunk size for ChromaDB + node_chunks = chunk_list(nodes, max_chunk_size) + + all_ids = [] + for node_chunk in node_chunks: + embeddings = [] + metadatas = [] + ids = [] + documents = [] + for node in node_chunk: + embeddings.append(node.get_embedding()) + metadatas.append( + node_to_metadata_dict( + node, remove_text=True, flat_metadata=self.flat_metadata + ) ) + ids.append(node.node_id) + documents.append(node.get_content(metadata_mode=MetadataMode.NONE)) + + self._collection.add( + embeddings=embeddings, + ids=ids, + metadatas=metadatas, + documents=documents, ) - ids.append(node.node_id) - documents.append(node.get_content(metadata_mode=MetadataMode.NONE)) - - self._collection.add( - embeddings=embeddings, - ids=ids, - metadatas=metadatas, - documents=documents, - ) - return ids + all_ids.extend(ids) + + return all_ids def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: """ From 92ebef5247196db12e4cd58c745a16b604f24726 Mon Sep 17 00:00:00 2001 From: Brad Edwards Date: Sat, 7 Oct 2023 21:33:29 -0700 Subject: [PATCH 2/4] Move magic number to constant --- llama_index/vector_stores/chroma.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_index/vector_stores/chroma.py b/llama_index/vector_stores/chroma.py index 3cf9f2b35d683..f4d5008d81e07 100644 --- a/llama_index/vector_stores/chroma.py +++ b/llama_index/vector_stores/chroma.py @@ -62,6 +62,8 @@ class ChromaVectorStore(BasePydanticVectorStore): """ + MAX_CHUNK_SIZE = 41665 # One less than the max chunk size for ChromaDB + stores_text: bool = True flat_metadata: bool = True @@ -145,7 +147,7 @@ def add(self, nodes: List[BaseNode]) -> List[str]: if not self._collection: raise ValueError("Collection not initialized") - max_chunk_size = 41665 # One less than the max chunk size for ChromaDB + max_chunk_size = MAX_CUNK_SIZE node_chunks = chunk_list(nodes, max_chunk_size) all_ids = [] From e7717a4aae9ae3541f47926aff55981bc00dec73 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Sun, 8 Oct 2023 20:48:00 -0600 Subject: [PATCH 3/4] move constant outside of class --- llama_index/vector_stores/chroma.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_index/vector_stores/chroma.py b/llama_index/vector_stores/chroma.py index f4d5008d81e07..138d24cd7f583 100644 --- a/llama_index/vector_stores/chroma.py +++ b/llama_index/vector_stores/chroma.py @@ -31,6 +31,8 @@ def _to_chroma_filter(standard_filters: MetadataFilters) -> dict: import_err_msg = "`chromadb` package not found, please run `pip install chromadb`" +MAX_CHUNK_SIZE = 41665 # One less than the max chunk size for ChromaDB + def chunk_list( lst: List[BaseNode], max_chunk_size: int @@ -62,8 +64,6 @@ class ChromaVectorStore(BasePydanticVectorStore): """ - MAX_CHUNK_SIZE = 41665 # One less than the max chunk size for ChromaDB - stores_text: bool = True flat_metadata: bool = True From 8e91c18349d34f180ce09ba51179431057e09b78 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Sun, 8 Oct 2023 20:54:04 -0600 Subject: [PATCH 4/4] typo --- llama_index/vector_stores/chroma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_index/vector_stores/chroma.py b/llama_index/vector_stores/chroma.py index 138d24cd7f583..4b7bd70b90e67 100644 --- a/llama_index/vector_stores/chroma.py +++ b/llama_index/vector_stores/chroma.py @@ -147,7 +147,7 @@ def add(self, nodes: List[BaseNode]) -> List[str]: if not self._collection: raise ValueError("Collection not initialized") - max_chunk_size = MAX_CUNK_SIZE + max_chunk_size = MAX_CHUNK_SIZE node_chunks = chunk_list(nodes, max_chunk_size) all_ids = []