-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ENH] Metric batching and more metrics (#1163)
## Description of changes

This PR accomplishes two things:

- Adds batching to metrics to decrease load on Posthog
- Adds more metric instrumentation

Each `TelemetryEvent` type now has a `batch_size` member defining how many of that Event to include in a batch. `TelemetryEvent`s with `batch_size > 1` must also define `can_batch()` and `batch()` methods to do the actual batching -- our posthog client can't do this itself since different `TelemetryEvent`s use different count fields. The Posthog client combines events until they hit their `batch_size` then fires them off as one event.

NB: this means we can drop up to `batch_size` events -- since we only batch `add()` calls right now this seems fine, though we may want to address it in the future.

As for the additional telemetry, I pretty much copied Anton's draft #859 with some minor changes.

Other considerations: Maybe we should implement `can_batch()` and `batch()` on all events, even those which don't currently use them? I'd prefer not to leave dead code hanging around but happy to go either way.

I created a ticket for the type ignores: #1169

## Test plan

pytest passes modulo a couple unrelated failures

With `print(event.properties)` in posthog client's `_direct_capture()`:

```
>>> import chromadb
>>> client = chromadb.Client()
{'batch_size': 1}
>>> collection = client.create_collection("sample_collection")
{'batch_size': 1, 'collection_uuid': 'bb19d790-4ec7-436c-b781-46dab047625d', 'embedding_function': 'ONNXMiniLM_L6_V2'}
>>> collection.add(
...     documents=["This is document1", "This is document2"], # we embed for you, or bring your own
...     metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on arbitrary metadata!
...     ids=["doc1", "doc2"], # must be unique for each doc
... )
{'batch_size': 1, 'collection_uuid': 'bb19d790-4ec7-436c-b781-46dab047625d', 'add_amount': 2, 'with_documents': 2, 'with_metadata': 2}
>>> for i in range(50):
...     collection.add(documents=[str(i)], ids=[str(i)])
...
{'batch_size': 20, 'collection_uuid': 'bb19d790-4ec7-436c-b781-46dab047625d', 'add_amount': 20, 'with_documents': 20, 'with_metadata': 0}
{'batch_size': 20, 'collection_uuid': 'bb19d790-4ec7-436c-b781-46dab047625d', 'add_amount': 20, 'with_documents': 20, 'with_metadata': 0}
>>> for i in range(50):
...     collection.add(documents=[str(i) + ' ' + str(n) for n in range(20)], ids=[str(i) + ' ' + str(n) for n in range(20)])
...
{'batch_size': 20, 'collection_uuid': 'bb19d790-4ec7-436c-b781-46dab047625d', 'add_amount': 210, 'with_documents': 210, 'with_metadata': 0}
{'batch_size': 20, 'collection_uuid': 'bb19d790-4ec7-436c-b781-46dab047625d', 'add_amount': 400, 'with_documents': 400, 'with_metadata': 0}
{'batch_size': 20, 'collection_uuid': 'bb19d790-4ec7-436c-b781-46dab047625d', 'add_amount': 400, 'with_documents': 400, 'with_metadata': 0}
```

## Documentation Changes

chroma-core/docs#139
chroma-core/docs@a4fd57d
- Loading branch information
Showing
6 changed files
with
258 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,153 @@ | ||
from dataclasses import dataclass | ||
from typing import ClassVar | ||
from typing import cast, ClassVar | ||
from chromadb.telemetry import TelemetryEvent | ||
from chromadb.utils.embedding_functions import get_builtins | ||
|
||
|
||
@dataclass
class ClientStartEvent(TelemetryEvent):
    """Telemetry event whose ``name`` is ``"client_start"``.

    It declares no fields of its own; constructing one only runs the base
    class initialiser.
    """

    name: ClassVar[str] = "client_start"

    def __init__(self) -> None:
        super().__init__()
|
||
|
||
@dataclass
class ServerStartEvent(TelemetryEvent):
    """Telemetry event whose ``name`` is ``"server_start"``; it has no fields."""

    name: ClassVar[str] = "server_start"
class ClientCreateCollectionEvent(TelemetryEvent):
    """Telemetry event recording the creation of a collection.

    The embedding function is stored under the name it was given only when
    that name is among the values returned by ``get_builtins()``; every
    other value is replaced by the literal string ``"custom"``.
    """

    collection_uuid: str
    embedding_function: str

    def __init__(self, collection_uuid: str, embedding_function: str):
        super().__init__()
        self.collection_uuid = collection_uuid

        builtin_names = get_builtins()

        # Names outside the built-in set are collapsed to a single bucket
        # rather than stored as given.
        if embedding_function in builtin_names:
            self.embedding_function = embedding_function
        else:
            self.embedding_function = "custom"
|
||
|
||
@dataclass
class CollectionAddEvent(TelemetryEvent):
    """Telemetry event for ``add`` calls that can be merged with its peers.

    Two events are mergeable when their ``batch_key`` values are equal.
    ``batch()`` returns a new event whose counters (including
    ``batch_size``) are the sums of the two inputs; neither input is
    modified.
    """

    name: ClassVar[str] = "collection_add"
    max_batch_size: ClassVar[int] = 20
    collection_uuid: str
    add_amount: int
    with_documents: int
    with_metadata: int

    def __init__(
        self,
        collection_uuid: str,
        add_amount: int,
        with_documents: int,
        with_metadata: int,
        batch_size: int = 1,
    ):
        super().__init__()
        self.collection_uuid = collection_uuid
        self.add_amount = add_amount
        self.with_documents = with_documents
        self.with_metadata = with_metadata
        # A freshly created event counts as one; merged events carry the
        # summed count passed in by ``batch()``.
        self.batch_size = batch_size

    @property
    def batch_key(self) -> str:
        """Return the collection UUID concatenated with the event name."""
        return self.collection_uuid + self.name

    def batch(self, other: "TelemetryEvent") -> "CollectionAddEvent":
        """Merge ``other`` into a new event with summed counters.

        Raises:
            ValueError: if ``other`` has a different ``batch_key``.
        """
        keys_match = self.batch_key == other.batch_key
        if not keys_match:
            raise ValueError("Cannot batch events")

        # The key check above is the only guard; the cast just informs the
        # type checker and does nothing at runtime.
        peer = cast(CollectionAddEvent, other)
        return CollectionAddEvent(
            collection_uuid=self.collection_uuid,
            add_amount=self.add_amount + peer.add_amount,
            with_documents=self.with_documents + peer.with_documents,
            with_metadata=self.with_metadata + peer.with_metadata,
            batch_size=self.batch_size + peer.batch_size,
        )
|
||
|
||
class CollectionUpdateEvent(TelemetryEvent):
    """Telemetry event carrying integer counters for a collection update.

    The initialiser stores each argument unchanged on the instance after
    running the base class initialiser.
    """

    collection_uuid: str
    update_amount: int
    # NOTE(review): the three ``with_*`` values are presumably counts of
    # updated records that supplied each kind of data -- confirm with caller.
    with_embeddings: int
    with_metadata: int
    with_documents: int

    def __init__(
        self,
        collection_uuid: str,
        update_amount: int,
        with_embeddings: int,
        with_metadata: int,
        with_documents: int,
    ):
        super().__init__()
        self.collection_uuid = collection_uuid
        self.update_amount = update_amount
        self.with_embeddings = with_embeddings
        self.with_metadata = with_metadata
        self.with_documents = with_documents
|
||
|
||
class CollectionQueryEvent(TelemetryEvent):
    """Telemetry event describing the shape of a collection query.

    Holds two integers (``query_amount``, ``n_results``) and five boolean
    flags. The initialiser stores each argument unchanged on the instance
    after running the base class initialiser.
    """

    collection_uuid: str
    query_amount: int
    # Flags for whether the query used a metadata / document filter.
    with_metadata_filter: bool
    with_document_filter: bool
    n_results: int
    # NOTE(review): presumably which result sections the caller asked to
    # have included -- confirm against the call site.
    include_metadatas: bool
    include_documents: bool
    include_distances: bool

    def __init__(
        self,
        collection_uuid: str,
        query_amount: int,
        with_metadata_filter: bool,
        with_document_filter: bool,
        n_results: int,
        include_metadatas: bool,
        include_documents: bool,
        include_distances: bool,
    ):
        super().__init__()
        self.collection_uuid = collection_uuid
        self.query_amount = query_amount
        self.with_metadata_filter = with_metadata_filter
        self.with_document_filter = with_document_filter
        self.n_results = n_results
        self.include_metadatas = include_metadatas
        self.include_documents = include_documents
        self.include_distances = include_distances
|
||
|
||
class CollectionGetEvent(TelemetryEvent):
    """Telemetry event describing a collection ``get`` call.

    The initialiser stores each argument unchanged on the instance after
    running the base class initialiser.
    """

    collection_uuid: str
    ids_count: int
    limit: int
    # NOTE(review): presumably which result sections the caller asked to
    # have included -- confirm against the call site.
    include_metadata: bool
    include_documents: bool

    def __init__(
        self,
        collection_uuid: str,
        ids_count: int,
        limit: int,
        include_metadata: bool,
        include_documents: bool,
    ):
        super().__init__()
        self.collection_uuid = collection_uuid
        self.ids_count = ids_count
        self.limit = limit
        self.include_metadata = include_metadata
        self.include_documents = include_documents
|
||
|
||
@dataclass
class CollectionDeleteEvent(TelemetryEvent):
    """Telemetry event whose ``name`` is ``"collection_delete"``.

    Stores the collection UUID and an integer ``delete_amount`` exactly as
    passed to the initialiser.
    """

    name: ClassVar[str] = "collection_delete"
    collection_uuid: str
    delete_amount: int

    def __init__(self, collection_uuid: str, delete_amount: int):
        super().__init__()
        self.collection_uuid = collection_uuid
        self.delete_amount = delete_amount
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.