Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: 1199 enhance similarity search test to make sure index is used #1212

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion evadb/catalog/services/index_catalog_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,11 @@ def delete_entry_by_name(self, name: str):
index_metadata = index_obj.as_dataclass()
# clean up the on disk data
if os.path.exists(index_metadata.save_file_path):
os.remove(index_metadata.save_file_path)
if os.path.isfile(index_metadata.save_file_path):
# For service-hosted vector databases, we should not touch
# their base directory. The only case that needs to be
# handled here is the FAISS index file on local disk.
os.remove(index_metadata.save_file_path)
index_obj.delete(self.session)
except Exception:
logger.exception("Delete index failed for name {}".format(name))
Expand Down
2 changes: 1 addition & 1 deletion evadb/executor/executor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def handle_vector_store_params(
elif vector_store_type == VectorStoreType.QDRANT:
return {"index_db": str(Path(index_path).parent)}
elif vector_store_type == VectorStoreType.CHROMADB:
return {"index_path": index_path}
return {"index_path": str(Path(index_path).parent)}
elif vector_store_type == VectorStoreType.PINECONE:
return {}
else:
Expand Down
4 changes: 2 additions & 2 deletions evadb/third_party/vector_stores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, index_name: str) -> None:

assert (
self._api_key
), "Please set your Pinecone API key in evadb.yml file (third_party, pinecone_api_key) or environment variable (PINECONE_KEY)"
), "Please set your Pinecone API key in evadb.yml file (third_party, pinecone_api_key) or environment variable (PINECONE_KEY). It can be found at Pinecone Dashboard > API Keys > Value"

# Get the environment name.
self._environment = ConfigurationManager().get_value(
Expand All @@ -57,7 +57,7 @@ def __init__(self, index_name: str) -> None:

assert (
self._environment
), "Please set the Pinecone environment key in evadb.yml file (third_party, pinecone_env) or environment variable (PINECONE_ENV)"
), "Please set the Pinecone environment key in evadb.yml file (third_party, pinecone_env) or environment variable (PINECONE_ENV). It can be found Pinecone Dashboard > API Keys > Environment."

if not _pinecone_init_done:
# Initialize pinecone.
Expand Down
167 changes: 103 additions & 64 deletions test/integration_tests/long/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,10 @@ def test_should_do_vector_index_scan(self):
)

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite1")
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite2")
drop_query = "DROP INDEX testFaissIndexScanRewrite1"
execute_query_fetch_all(self.evadb, drop_query)
drop_query = "DROP INDEX testFaissIndexScanRewrite2"
execute_query_fetch_all(self.evadb, drop_query)

def test_should_not_do_vector_index_scan_with_desc_order(self):
# Execution with index scan.
Expand Down Expand Up @@ -347,7 +349,8 @@ def test_should_not_do_vector_index_scan_with_desc_order(self):
self.assertTrue(np.array_equal(actual_open, base_img + 3))

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite")
drop_query = "DROP INDEX testFaissIndexScanRewrite"
execute_query_fetch_all(self.evadb, drop_query)

def test_should_not_do_vector_index_scan_with_predicate(self):
# Execution with index scan.
Expand All @@ -370,86 +373,122 @@ def test_should_not_do_vector_index_scan_with_predicate(self):
self.assertFalse("FaissIndexScan" in batch.frames[0][0])

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexScanRewrite")
drop_query = "DROP INDEX testFaissIndexScanRewrite"
execute_query_fetch_all(self.evadb, drop_query)

def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_faiss(self):
for _ in range(2):
create_index_query = """CREATE INDEX testFaissIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING FAISS;"""
execute_query_fetch_all(self.evadb, create_index_query)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

def test_end_to_end_index_scan_should_work_correctly_on_image_dataset(self):
create_index_query = """CREATE INDEX testFaissIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING FAISS;"""
execute_query_fetch_all(self.evadb, create_index_query)
select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testFaissIndexImageDataset")
# Cleanup
drop_query = "DROP INDEX testFaissIndexImageDataset"
execute_query_fetch_all(self.evadb, drop_query)

@qdrant_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_qdrant(self):
create_index_query = """CREATE INDEX testQdrantIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING QDRANT;"""
execute_query_fetch_all(self.evadb, create_index_query)
select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
for _ in range(2):
create_index_query = """CREATE INDEX testQdrantIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING QDRANT;"""
execute_query_fetch_all(self.evadb, create_index_query)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

"""|__ ProjectPlan
|__ VectorIndexScanPlan
|__ SeqScanPlan
|__ StoragePlan"""
"""|__ ProjectPlan
|__ VectorIndexScanPlan
|__ SeqScanPlan
|__ StoragePlan"""

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

# Cleanup
self.evadb.catalog().drop_index_catalog_entry("testQdrantIndexImageDataset")
# Cleanup
drop_query = "DROP INDEX testQdrantIndexImageDataset"
execute_query_fetch_all(self.evadb, drop_query)

@chromadb_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_chromadb(
self,
):
create_index_query = """CREATE INDEX testChromaDBIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING CHROMADB;"""
execute_query_fetch_all(self.evadb, create_index_query)
for _ in range(2):
create_index_query = """CREATE INDEX testChromaDBIndexImageDataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING CHROMADB;"""
execute_query_fetch_all(self.evadb, create_index_query)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is cool!

self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
# Cleanup
drop_query = "DROP INDEX testChromaDBIndexImageDataset"
execute_query_fetch_all(self.evadb, drop_query)

@pinecone_skip_marker
def test_end_to_end_index_scan_should_work_correctly_on_image_dataset_pinecone(
self,
):
# We need to always drop the index as Pinecone's free tier only supports a single current index.
drop_index_query = "DROP INDEX IF EXISTS testpineconeindeximagedataset;"
execute_query_fetch_all(self.evadb, drop_index_query)
create_index_query = """CREATE INDEX testpineconeindeximagedataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING PINECONE;"""
execute_query_fetch_all(self.evadb, create_index_query)
# Sleep to ensure the pinecone records get updated as Pinecone is eventually consistent.
time.sleep(20)
for _ in range(2):
create_index_query = """CREATE INDEX testpineconeindeximagedataset
ON testSimilarityImageDataset (DummyFeatureExtractor(data))
USING PINECONE;"""
execute_query_fetch_all(self.evadb, create_index_query)
# Sleep to ensure the pinecone records get updated as Pinecone is eventually consistent.
time.sleep(20)

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
explain_batch = execute_query_fetch_all(
self.evadb, f"EXPLAIN {select_query}"
)
self.assertTrue("VectorIndexScan" in explain_batch.frames[0][0])

select_query = """SELECT _row_id FROM testSimilarityImageDataset
ORDER BY Similarity(DummyFeatureExtractor(Open("{}")), DummyFeatureExtractor(data))
LIMIT 1;""".format(
self.img_path
)
res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(
res_batch.frames["testsimilarityimagedataset._row_id"][0], 5
)

res_batch = execute_query_fetch_all(self.evadb, select_query)
self.assertEqual(res_batch.frames["testsimilarityimagedataset._row_id"][0], 5)
drop_index_query = "DROP INDEX testpineconeindeximagedataset;"
execute_query_fetch_all(self.evadb, drop_index_query)
drop_index_query = "DROP INDEX testpineconeindeximagedataset;"
execute_query_fetch_all(self.evadb, drop_index_query)