Skip to content

Commit

Permalink
Merge pull request #11 from chroma-core/main
Browse files Browse the repository at this point in the history
update repo
  • Loading branch information
sunilkumardash9 authored Aug 29, 2023
2 parents 7e7e3a3 + 4332adf commit 490fef4
Show file tree
Hide file tree
Showing 63 changed files with 5,037 additions and 615 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ Chroma is a rapidly developing project. We welcome PR contributors and ideas for
- [Join the conversation on Discord](https://discord.gg/MMeYNTmh3x) - `#contributing` channel
- [Review the 🛣️ Roadmap and contribute your ideas](https://docs.trychroma.com/roadmap)
- [Grab an issue and open a PR](https://github.com/chroma-core/chroma/issues) - [`Good first issue tag`](https://github.com/chroma-core/chroma/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
- [Read our contributing guide](https://docs.trychroma.com/contributing)

**Release Cadence**
We currently release new tagged versions of the `pypi` and `npm` packages on Mondays. Hotfixes go out at any time during the week.
Expand Down
35 changes: 34 additions & 1 deletion chromadb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,45 @@
from chromadb.telemetry import Telemetry
from chromadb.config import Settings, System
from chromadb.api import API
from chromadb.api.models.Collection import Collection
from chromadb.api.types import (
CollectionMetadata,
Documents,
EmbeddingFunction,
Embeddings,
IDs,
Include,
Metadata,
Where,
QueryResult,
GetResult,
WhereDocument,
UpdateCollectionMetadata,
)

# Re-export the public types from chromadb.api.types and chromadb.api.models.Collection
__all__ = [
"Collection",
"Metadata",
"Where",
"WhereDocument",
"Documents",
"IDs",
"Embeddings",
"EmbeddingFunction",
"Include",
"CollectionMetadata",
"UpdateCollectionMetadata",
"QueryResult",
"GetResult",
]


logger = logging.getLogger(__name__)

__settings = Settings()

__version__ = "0.4.4"
__version__ = "0.4.8"

# Workaround to deal with Colab's old sqlite3 version
try:
Expand Down
47 changes: 37 additions & 10 deletions chromadb/api/fastapi.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
import json
from typing import Optional, cast
from typing import Sequence
from uuid import UUID

import requests
from overrides import override

import chromadb.errors as errors
import chromadb.utils.embedding_functions as ef
from chromadb.api import API
from chromadb.config import Settings, System
from chromadb.api.models.Collection import Collection
from chromadb.api.types import (
Documents,
Embeddings,
Expand All @@ -14,15 +23,13 @@
QueryResult,
CollectionMetadata,
)
import chromadb.utils.embedding_functions as ef
import requests
import json
from typing import Sequence
from chromadb.api.models.Collection import Collection
import chromadb.errors as errors
from uuid import UUID
from chromadb.auth import (
ClientAuthProvider,
)
from chromadb.auth.providers import RequestsClientAuthProtocolAdapter
from chromadb.auth.registry import resolve_provider
from chromadb.config import Settings, System
from chromadb.telemetry import Telemetry
from overrides import override


class FastAPI(API):
Expand All @@ -47,7 +54,27 @@ def __init__(self, system: System):
)

self._header = system.settings.chroma_server_headers
self._session = requests.Session()
if (
system.settings.chroma_client_auth_provider
and system.settings.chroma_client_auth_protocol_adapter
):
self._auth_provider = self.require(
resolve_provider(
system.settings.chroma_client_auth_provider, ClientAuthProvider
)
)
self._adapter = cast(
RequestsClientAuthProtocolAdapter,
system.require(
resolve_provider(
system.settings.chroma_client_auth_protocol_adapter,
RequestsClientAuthProtocolAdapter,
)
),
)
self._session = self._adapter.session
else:
self._session = requests.Session()
if self._header is not None:
self._session.headers.update(self._header)

Expand Down
10 changes: 7 additions & 3 deletions chromadb/api/models/Collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def get(
Args:
ids: The ids of the embeddings to get. Optional.
where: A Where type dict used to filter results by. E.g. `{"color" : "red", "price": 4.20}`. Optional.
where: A Where type dict used to filter results by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
limit: The number of documents to return. Optional.
offset: The offset to start returning results from. Useful for paging results with limit. Optional.
where_document: A WhereDocument type dict used to filter by the documents. E.g. `{"$contains": "hello"}`. Optional.
Expand Down Expand Up @@ -165,7 +165,7 @@ def query(
query_embeddings: The embeddings to get the closest neighbors of. Optional.
query_texts: The document texts to get the closest neighbors of. Optional.
n_results: The number of neighbors to return for each query_embedding or query_texts. Optional.
where: A Where type dict used to filter results by. E.g. `{"color" : "red", "price": 4.20}`. Optional.
where: A Where type dict used to filter results by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
where_document: A WhereDocument type dict used to filter by the documents. E.g. `{"$contains": "hello"}`. Optional.
include: A list of what to include in the results. Can contain `"embeddings"`, `"metadatas"`, `"documents"`, `"distances"`. Ids are always included. Defaults to `["metadatas", "documents", "distances"]`. Optional.
Expand Down Expand Up @@ -313,17 +313,21 @@ def delete(
Args:
ids: The ids of the embeddings to delete
where: A Where type dict used to filter the deletion by. E.g. `{"color" : "red", "price": 4.20}`. Optional.
where: A Where type dict used to filter the deletion by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
where_document: A WhereDocument type dict used to filter the deletion by the document content. E.g. `{"$contains": "hello"}`. Optional.
Returns:
None
Raises:
ValueError: If you don't provide either ids, where, or where_document
"""
ids = validate_ids(maybe_cast_one_to_many(ids)) if ids else None
where = validate_where(where) if where else None
where_document = (
validate_where_document(where_document) if where_document else None
)

self._client._delete(self.id, ids, where, where_document)

def _validate_embedding_set(
Expand Down
39 changes: 33 additions & 6 deletions chromadb/api/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,11 @@ def _add(
coll = self._get_collection(collection_id)
self._manager.hint_use_collection(collection_id, t.Operation.ADD)

records_to_submit = []
for r in _records(t.Operation.ADD, ids, embeddings, metadatas, documents):
self._validate_embedding_record(coll, r)
self._producer.submit_embedding(coll["topic"], r)
records_to_submit.append(r)
self._producer.submit_embeddings(coll["topic"], records_to_submit)

self._telemetry_client.capture(CollectionAddEvent(str(collection_id), len(ids)))
return True
Expand All @@ -261,9 +263,11 @@ def _update(
coll = self._get_collection(collection_id)
self._manager.hint_use_collection(collection_id, t.Operation.UPDATE)

records_to_submit = []
for r in _records(t.Operation.UPDATE, ids, embeddings, metadatas, documents):
self._validate_embedding_record(coll, r)
self._producer.submit_embedding(coll["topic"], r)
records_to_submit.append(r)
self._producer.submit_embeddings(coll["topic"], records_to_submit)

return True

Expand All @@ -279,9 +283,11 @@ def _upsert(
coll = self._get_collection(collection_id)
self._manager.hint_use_collection(collection_id, t.Operation.UPSERT)

records_to_submit = []
for r in _records(t.Operation.UPSERT, ids, embeddings, metadatas, documents):
self._validate_embedding_record(coll, r)
self._producer.submit_embedding(coll["topic"], r)
records_to_submit.append(r)
self._producer.submit_embeddings(coll["topic"], records_to_submit)

return True

Expand Down Expand Up @@ -362,11 +368,27 @@ def _delete(
else None
)

# You must have at least one of non-empty ids, where, or where_document.
if (
(ids is None or (ids is not None and len(ids) == 0))
and (where is None or (where is not None and len(where) == 0))
and (
where_document is None
or (where_document is not None and len(where_document) == 0)
)
):
raise ValueError(
"""
You must provide either ids, where, or where_document to delete. If
you want to delete all data in a collection you can delete the
collection itself using the delete_collection method. Or alternatively,
you can get() all the relevant ids and then delete them.
"""
)

coll = self._get_collection(collection_id)
self._manager.hint_use_collection(collection_id, t.Operation.DELETE)

# TODO: Do we want to warn the user that unrestricted _delete() is 99% of the
# time a bad idea?
if (where or where_document) or not ids:
metadata_segment = self._manager.get_segment(collection_id, MetadataReader)
records = metadata_segment.get_metadata(
Expand All @@ -376,9 +398,14 @@ def _delete(
else:
ids_to_delete = ids

if len(ids_to_delete) == 0:
return []

records_to_submit = []
for r in _records(t.Operation.DELETE, ids_to_delete):
self._validate_embedding_record(coll, r)
self._producer.submit_embedding(coll["topic"], r)
records_to_submit.append(r)
self._producer.submit_embeddings(coll["topic"], records_to_submit)

self._telemetry_client.capture(
CollectionDeleteEvent(str(collection_id), len(ids_to_delete))
Expand Down
39 changes: 31 additions & 8 deletions chromadb/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,33 @@ def validate_ids(ids: IDs) -> IDs:
raise ValueError(f"Expected IDs to be a list, got {ids}")
if len(ids) == 0:
raise ValueError(f"Expected IDs to be a non-empty list, got {ids}")
for id in ids:
if not isinstance(id, str):
raise ValueError(f"Expected ID to be a str, got {id}")
if len(ids) != len(set(ids)):
dups = set([x for x in ids if ids.count(x) > 1])
raise errors.DuplicateIDError(
f"Expected IDs to be unique, found duplicates for: {dups}"
)
seen = set()
dups = set()
for id_ in ids:
if not isinstance(id_, str):
raise ValueError(f"Expected ID to be a str, got {id_}")
if id_ in seen:
dups.add(id_)
else:
seen.add(id_)
if dups:
n_dups = len(dups)
if n_dups < 10:
example_string = ", ".join(dups)
message = (
f"Expected IDs to be unique, found duplicates of: {example_string}"
)
else:
examples = []
for idx, dup in enumerate(dups):
examples.append(dup)
if idx == 10:
break
example_string = (
f"{', '.join(examples[:5])}, ..., {', '.join(examples[-5:])}"
)
message = f"Expected IDs to be unique, found {n_dups} duplicated IDs: {example_string}"
raise errors.DuplicateIDError(message)
return ids


Expand Down Expand Up @@ -266,6 +285,10 @@ def validate_where_document(where_document: WhereDocument) -> WhereDocument:
raise ValueError(
f"Expected where document operand value for operator $contains to be a str, got {operand}"
)
elif len(operand) == 0:
raise ValueError(
"Expected where document operand value for operator $contains to be a non-empty str"
)
return where_document


Expand Down
Loading

0 comments on commit 490fef4

Please sign in to comment.