Merge pull request #11 from chroma-core/main

update repo
chroma-core · Aug 29, 2023 · 490fef4 · 490fef4
2 parents 7e7e3a3 + 4332adf
commit 490fef4
Show file tree

Hide file tree

Showing 63 changed files with 5,037 additions and 615 deletions.
diff --git a/README.md b/README.md
@@ -96,6 +96,7 @@ Chroma is a rapidly developing project. We welcome PR contributors and ideas for
 - [Join the conversation on Discord](https://discord.gg/MMeYNTmh3x) - `#contributing` channel
 - [Review the 🛣️ Roadmap and contribute your ideas](https://docs.trychroma.com/roadmap)
 - [Grab an issue and open a PR](https://github.com/chroma-core/chroma/issues) - [`Good first issue tag`](https://github.com/chroma-core/chroma/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
+- [Read our contributing guide](https://docs.trychroma.com/contributing)
 
 **Release Cadence**
 We currently release new tagged versions of the `pypi` and `npm` packages on Mondays. Hotfixes go out at any time during the week.

diff --git a/chromadb/__init__.py b/chromadb/__init__.py
@@ -6,12 +6,45 @@
 from chromadb.telemetry import Telemetry
 from chromadb.config import Settings, System
 from chromadb.api import API
+from chromadb.api.models.Collection import Collection
+from chromadb.api.types import (
+    CollectionMetadata,
+    Documents,
+    EmbeddingFunction,
+    Embeddings,
+    IDs,
+    Include,
+    Metadata,
+    Where,
+    QueryResult,
+    GetResult,
+    WhereDocument,
+    UpdateCollectionMetadata,
+)
+
+# Re-export the public types from chromadb.api.types and chromadb.api.models.Collection
+__all__ = [
+    "Collection",
+    "Metadata",
+    "Where",
+    "WhereDocument",
+    "Documents",
+    "IDs",
+    "Embeddings",
+    "EmbeddingFunction",
+    "Include",
+    "CollectionMetadata",
+    "UpdateCollectionMetadata",
+    "QueryResult",
+    "GetResult",
+]
+
 
 logger = logging.getLogger(__name__)
 
 __settings = Settings()
 
-__version__ = "0.4.4"
+__version__ = "0.4.8"
 
 # Workaround to deal with Colab's old sqlite3 version
 try:

diff --git a/chromadb/api/fastapi.py b/chromadb/api/fastapi.py
@@ -1,6 +1,15 @@
+import json
 from typing import Optional, cast
+from typing import Sequence
+from uuid import UUID
+
+import requests
+from overrides import override
+
+import chromadb.errors as errors
+import chromadb.utils.embedding_functions as ef
 from chromadb.api import API
-from chromadb.config import Settings, System
+from chromadb.api.models.Collection import Collection
 from chromadb.api.types import (
     Documents,
     Embeddings,
@@ -14,15 +23,13 @@
     QueryResult,
     CollectionMetadata,
 )
-import chromadb.utils.embedding_functions as ef
-import requests
-import json
-from typing import Sequence
-from chromadb.api.models.Collection import Collection
-import chromadb.errors as errors
-from uuid import UUID
+from chromadb.auth import (
+    ClientAuthProvider,
+)
+from chromadb.auth.providers import RequestsClientAuthProtocolAdapter
+from chromadb.auth.registry import resolve_provider
+from chromadb.config import Settings, System
 from chromadb.telemetry import Telemetry
-from overrides import override
 
 
 class FastAPI(API):
@@ -47,7 +54,27 @@ def __init__(self, system: System):
         )
 
         self._header = system.settings.chroma_server_headers
-        self._session = requests.Session()
+        if (
+            system.settings.chroma_client_auth_provider
+            and system.settings.chroma_client_auth_protocol_adapter
+        ):
+            self._auth_provider = self.require(
+                resolve_provider(
+                    system.settings.chroma_client_auth_provider, ClientAuthProvider
+                )
+            )
+            self._adapter = cast(
+                RequestsClientAuthProtocolAdapter,
+                system.require(
+                    resolve_provider(
+                        system.settings.chroma_client_auth_protocol_adapter,
+                        RequestsClientAuthProtocolAdapter,
+                    )
+                ),
+            )
+            self._session = self._adapter.session
+        else:
+            self._session = requests.Session()
         if self._header is not None:
             self._session.headers.update(self._header)
 

diff --git a/chromadb/api/models/Collection.py b/chromadb/api/models/Collection.py
@@ -112,7 +112,7 @@ def get(
 
         Args:
             ids: The ids of the embeddings to get. Optional.
-            where: A Where type dict used to filter results by. E.g. `{"color" : "red", "price": 4.20}`. Optional.
+            where: A Where type dict used to filter results by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
             limit: The number of documents to return. Optional.
             offset: The offset to start returning results from. Useful for paging results with limit. Optional.
             where_document: A WhereDocument type dict used to filter by the documents. E.g. `{"$contains": "hello"}`. Optional.
@@ -165,7 +165,7 @@ def query(
             query_embeddings: The embeddings to get the closest neighbors of. Optional.
             query_texts: The document texts to get the closest neighbors of. Optional.
             n_results: The number of neighbors to return for each query_embedding or query_texts. Optional.
-            where: A Where type dict used to filter results by. E.g. `{"color" : "red", "price": 4.20}`. Optional.
+            where: A Where type dict used to filter results by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
             where_document: A WhereDocument type dict used to filter by the documents. E.g. `{"$contains": "hello"}`. Optional.
             include: A list of what to include in the results. Can contain `"embeddings"`, `"metadatas"`, `"documents"`, `"distances"`. Ids are always included. Defaults to `["metadatas", "documents", "distances"]`. Optional.
 
@@ -313,17 +313,21 @@ def delete(
 
         Args:
             ids: The ids of the embeddings to delete
-            where: A Where type dict used to filter the delection by. E.g. `{"color" : "red", "price": 4.20}`. Optional.
+            where: A Where type dict used to filter the deletion by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
             where_document: A WhereDocument type dict used to filter the deletion by the document content. E.g. `{"$contains": "hello"}`. Optional.
 
         Returns:
             None
+
+        Raises:
+            ValueError: If you don't provide either ids, where, or where_document
         """
         ids = validate_ids(maybe_cast_one_to_many(ids)) if ids else None
         where = validate_where(where) if where else None
         where_document = (
             validate_where_document(where_document) if where_document else None
         )
+
         self._client._delete(self.id, ids, where, where_document)
 
     def _validate_embedding_set(

diff --git a/chromadb/api/segment.py b/chromadb/api/segment.py
@@ -242,9 +242,11 @@ def _add(
         coll = self._get_collection(collection_id)
         self._manager.hint_use_collection(collection_id, t.Operation.ADD)
 
+        records_to_submit = []
         for r in _records(t.Operation.ADD, ids, embeddings, metadatas, documents):
             self._validate_embedding_record(coll, r)
-            self._producer.submit_embedding(coll["topic"], r)
+            records_to_submit.append(r)
+        self._producer.submit_embeddings(coll["topic"], records_to_submit)
 
         self._telemetry_client.capture(CollectionAddEvent(str(collection_id), len(ids)))
         return True
@@ -261,9 +263,11 @@ def _update(
         coll = self._get_collection(collection_id)
         self._manager.hint_use_collection(collection_id, t.Operation.UPDATE)
 
+        records_to_submit = []
         for r in _records(t.Operation.UPDATE, ids, embeddings, metadatas, documents):
             self._validate_embedding_record(coll, r)
-            self._producer.submit_embedding(coll["topic"], r)
+            records_to_submit.append(r)
+        self._producer.submit_embeddings(coll["topic"], records_to_submit)
 
         return True
 
@@ -279,9 +283,11 @@ def _upsert(
         coll = self._get_collection(collection_id)
         self._manager.hint_use_collection(collection_id, t.Operation.UPSERT)
 
+        records_to_submit = []
         for r in _records(t.Operation.UPSERT, ids, embeddings, metadatas, documents):
             self._validate_embedding_record(coll, r)
-            self._producer.submit_embedding(coll["topic"], r)
+            records_to_submit.append(r)
+        self._producer.submit_embeddings(coll["topic"], records_to_submit)
 
         return True
 
@@ -362,11 +368,27 @@ def _delete(
             else None
         )
 
+        # You must have at least one of non-empty ids, where, or where_document.
+        if (
+            (ids is None or (ids is not None and len(ids) == 0))
+            and (where is None or (where is not None and len(where) == 0))
+            and (
+                where_document is None
+                or (where_document is not None and len(where_document) == 0)
+            )
+        ):
+            raise ValueError(
+                """
+                You must provide either ids, where, or where_document to delete. If
+                you want to delete all data in a collection you can delete the
+                collection itself using the delete_collection method. Or alternatively,
+                you can get() all the relevant ids and then delete them.
+                """
+            )
+
         coll = self._get_collection(collection_id)
         self._manager.hint_use_collection(collection_id, t.Operation.DELETE)
 
-        # TODO: Do we want to warn the user that unrestricted _delete() is 99% of the
-        # time a bad idea?
         if (where or where_document) or not ids:
             metadata_segment = self._manager.get_segment(collection_id, MetadataReader)
             records = metadata_segment.get_metadata(
@@ -376,9 +398,14 @@ def _delete(
         else:
             ids_to_delete = ids
 
+        if len(ids_to_delete) == 0:
+            return []
+
+        records_to_submit = []
         for r in _records(t.Operation.DELETE, ids_to_delete):
             self._validate_embedding_record(coll, r)
-            self._producer.submit_embedding(coll["topic"], r)
+            records_to_submit.append(r)
+        self._producer.submit_embeddings(coll["topic"], records_to_submit)
 
         self._telemetry_client.capture(
             CollectionDeleteEvent(str(collection_id), len(ids_to_delete))

diff --git a/chromadb/api/types.py b/chromadb/api/types.py
@@ -111,14 +111,33 @@ def validate_ids(ids: IDs) -> IDs:
         raise ValueError(f"Expected IDs to be a list, got {ids}")
     if len(ids) == 0:
         raise ValueError(f"Expected IDs to be a non-empty list, got {ids}")
-    for id in ids:
-        if not isinstance(id, str):
-            raise ValueError(f"Expected ID to be a str, got {id}")
-    if len(ids) != len(set(ids)):
-        dups = set([x for x in ids if ids.count(x) > 1])
-        raise errors.DuplicateIDError(
-            f"Expected IDs to be unique, found duplicates for: {dups}"
-        )
+    seen = set()
+    dups = set()
+    for id_ in ids:
+        if not isinstance(id_, str):
+            raise ValueError(f"Expected ID to be a str, got {id_}")
+        if id_ in seen:
+            dups.add(id_)
+        else:
+            seen.add(id_)
+    if dups:
+        n_dups = len(dups)
+        if n_dups < 10:
+            example_string = ", ".join(dups)
+            message = (
+                f"Expected IDs to be unique, found duplicates of: {example_string}"
+            )
+        else:
+            examples = []
+            for idx, dup in enumerate(dups):
+                examples.append(dup)
+                if idx == 10:
+                    break
+            example_string = (
+                f"{', '.join(examples[:5])}, ..., {', '.join(examples[-5:])}"
+            )
+            message = f"Expected IDs to be unique, found {n_dups} duplicated IDs: {example_string}"
+        raise errors.DuplicateIDError(message)
     return ids
 
 
@@ -266,6 +285,10 @@ def validate_where_document(where_document: WhereDocument) -> WhereDocument:
             raise ValueError(
                 f"Expected where document operand value for operator $contains to be a str, got {operand}"
             )
+        elif len(operand) == 0:
+            raise ValueError(
+                "Expected where document operand value for operator $contains to be a non-empty str"
+            )
     return where_document