From ead99cd6f1d5857a83ee26671f9ab91764b2137d Mon Sep 17 00:00:00 2001 From: Max Isom Date: Fri, 12 Jul 2024 11:57:19 -0700 Subject: [PATCH 1/2] [BUG] fix persistent HNSW parameter migration --- chromadb/db/mixins/sysdb.py | 8 +++++++- chromadb/test/property/strategies.py | 10 ++++++---- chromadb/test/property/test_cross_version_persist.py | 7 ++++++- chromadb/test/property/test_persist.py | 4 +++- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/chromadb/db/mixins/sysdb.py b/chromadb/db/mixins/sysdb.py index d4b56df5bbd..42375c109ed 100644 --- a/chromadb/db/mixins/sysdb.py +++ b/chromadb/db/mixins/sysdb.py @@ -19,6 +19,7 @@ UniqueConstraintError, ) from chromadb.db.system import SysDB +from chromadb.segment.impl.vector.hnsw_params import PersistentHnswParams from chromadb.telemetry.opentelemetry import ( add_attributes_to_current_span, OpenTelemetryClient, @@ -776,7 +777,12 @@ def _insert_config_from_legacy_params( collections_t = Table("collections") # Get any existing HNSW params from the metadata - hnsw_metadata_params = HnswParams.extract(metadata or {}) + metadata = metadata or {} + if metadata.get("hnsw:batch_size") or metadata.get("hnsw:sync_threshold"): + hnsw_metadata_params = PersistentHnswParams.extract(metadata) + else: + hnsw_metadata_params = HnswParams.extract(metadata) + hnsw_configuration = HNSWConfigurationInternal.from_legacy_params( hnsw_metadata_params # type: ignore[arg-type] ) diff --git a/chromadb/test/property/strategies.py b/chromadb/test/property/strategies.py index d6ca4e4c9ed..136811e5e15 100644 --- a/chromadb/test/property/strategies.py +++ b/chromadb/test/property/strategies.py @@ -281,7 +281,7 @@ def collections( with_hnsw_params: bool = False, has_embeddings: Optional[bool] = None, has_documents: Optional[bool] = None, - with_persistent_hnsw_params: bool = False, + with_persistent_hnsw_params: st.SearchStrategy[bool] = st.just(False), ) -> Collection: """Strategy to generate a Collection object. If add_filterable_data is True, then known_metadata_keys and known_document_keywords will be populated with consistent data.""" @@ -292,16 +292,18 @@ def collections( dimension = draw(st.integers(min_value=2, max_value=2048)) dtype = draw(st.sampled_from(float_types)) - if with_persistent_hnsw_params and not with_hnsw_params: + use_persistent_hnsw_params = draw(with_persistent_hnsw_params) + + if use_persistent_hnsw_params and not with_hnsw_params: raise ValueError( - "with_hnsw_params requires with_persistent_hnsw_params to be true" + "with_persistent_hnsw_params requires with_hnsw_params to be true" ) if with_hnsw_params: if metadata is None: metadata = {} metadata.update(test_hnsw_config) - if with_persistent_hnsw_params: + if use_persistent_hnsw_params: metadata["hnsw:batch_size"] = draw(st.integers(min_value=3, max_value=2000)) metadata["hnsw:sync_threshold"] = draw( st.integers(min_value=3, max_value=2000) diff --git a/chromadb/test/property/test_cross_version_persist.py b/chromadb/test/property/test_cross_version_persist.py index 8d3087777aa..cb3cfe88e9c 100644 --- a/chromadb/test/property/test_cross_version_persist.py +++ b/chromadb/test/property/test_cross_version_persist.py @@ -272,7 +272,12 @@ def persist_generated_data_with_old_version( # Since we can't pickle the embedding function, we always generate record sets with embeddings collection_st: st.SearchStrategy[strategies.Collection] = st.shared( - strategies.collections(with_hnsw_params=True, has_embeddings=True), key="coll" + strategies.collections( + with_hnsw_params=True, + has_embeddings=True, + with_persistent_hnsw_params=st.booleans(), + ), + key="coll", ) diff --git a/chromadb/test/property/test_persist.py b/chromadb/test/property/test_persist.py index f91b0ed78fd..a1c6add1619 100644 --- a/chromadb/test/property/test_persist.py +++ b/chromadb/test/property/test_persist.py @@ -58,7 +58,9 @@ def settings(request: pytest.FixtureRequest) -> Generator[Settings, None, None]: collection_st = st.shared( - strategies.collections(with_hnsw_params=True, with_persistent_hnsw_params=True), + strategies.collections( + with_hnsw_params=True, with_persistent_hnsw_params=st.just(True) + ), key="coll", ) From dbaa7c3640ab48448616237acf77071eceff7510 Mon Sep 17 00:00:00 2001 From: Max Isom Date: Mon, 15 Jul 2024 11:27:57 -0700 Subject: [PATCH 2/2] Clean --- chromadb/db/mixins/sysdb.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/chromadb/db/mixins/sysdb.py b/chromadb/db/mixins/sysdb.py index 42375c109ed..2684441cbb0 100644 --- a/chromadb/db/mixins/sysdb.py +++ b/chromadb/db/mixins/sysdb.py @@ -19,7 +19,6 @@ UniqueConstraintError, ) from chromadb.db.system import SysDB -from chromadb.segment.impl.vector.hnsw_params import PersistentHnswParams from chromadb.telemetry.opentelemetry import ( add_attributes_to_current_span, OpenTelemetryClient, @@ -772,16 +771,12 @@ def _insert_config_from_legacy_params( # This is a legacy case where we don't have configuration stored in the database # This is non-destructive, we don't delete or overwrite any keys in the metadata - from chromadb.segment.impl.vector.hnsw_params import HnswParams + from chromadb.segment.impl.vector.hnsw_params import PersistentHnswParams collections_t = Table("collections") - # Get any existing HNSW params from the metadata - metadata = metadata or {} - if metadata.get("hnsw:batch_size") or metadata.get("hnsw:sync_threshold"): - hnsw_metadata_params = PersistentHnswParams.extract(metadata) - else: - hnsw_metadata_params = HnswParams.extract(metadata) + # Get any existing HNSW params from the metadata (works regardless whether metadata has persistent params) + hnsw_metadata_params = PersistentHnswParams.extract(metadata or {}) hnsw_configuration = HNSWConfigurationInternal.from_legacy_params( hnsw_metadata_params # type: ignore[arg-type]