Skip to content

Commit

Permalink
feat: CIP-5: Large Batch Handling Improvements Proposal
Browse files Browse the repository at this point in the history
- Updated CIP
- Implementation done
- Added a new test in test_add

Refs: chroma-core#1049
  • Loading branch information
tazarov committed Sep 15, 2023
1 parent 83df816 commit b38caae
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 9 deletions.
2 changes: 2 additions & 0 deletions chromadb/api/fastapi.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from typing import Optional, cast, Tuple
import logging
from typing import Optional, cast, Tuple
from typing import Sequence
Expand Down Expand Up @@ -35,6 +36,7 @@
from urllib.parse import urlparse, urlunparse, quote

logger = logging.getLogger(__name__)
from chromadb.utils.batch_utils import create_batches


class FastAPI(API):
Expand Down
15 changes: 7 additions & 8 deletions chromadb/utils/batch_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from typing import Optional, Tuple, List

from chromadb.api import API
from chromadb.api.types import (
Documents,
Embeddings,
Expand All @@ -10,7 +9,7 @@


def create_batches(
api: API,
max_batch_size: int,
ids: IDs,
embeddings: Optional[Embeddings] = None,
metadatas: Optional[Metadatas] = None,
Expand All @@ -19,15 +18,15 @@ def create_batches(
_batches: List[
Tuple[IDs, Embeddings, Optional[Metadatas], Optional[Documents]]
] = []
if len(ids) > api.max_batch_size:
if len(ids) > max_batch_size:
# create split batches
for i in range(0, len(ids), api.max_batch_size):
for i in range(0, len(ids), max_batch_size):
_batches.append(
( # type: ignore
ids[i : i + api.max_batch_size],
embeddings[i : i + api.max_batch_size] if embeddings else None,
metadatas[i : i + api.max_batch_size] if metadatas else None,
documents[i : i + api.max_batch_size] if documents else None,
ids[i : i + max_batch_size],
embeddings[i : i + max_batch_size] if embeddings else None,
metadatas[i : i + max_batch_size] if metadatas else None,
documents[i : i + max_batch_size] if documents else None,
)
)
else:
Expand Down
4 changes: 3 additions & 1 deletion docs/CIP_5_Large_Batch_Handling_Improvements.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,6 @@ New tests:

## **Rejected Alternatives**

N/A
Exposing `max_batch_size` and throwing an exception - We decided against this because submitting
batches (especially large ones) comes with a monetary cost or, at the very least, a compute cost. Instead, we want
the API to gracefully handle large batches by splitting them up.

0 comments on commit b38caae

Please sign in to comment.