test: update restful v2 test cases #36448

Merged
1 commit, merged on Sep 24, 2024
9 changes: 6 additions & 3 deletions tests/restful_client_v2/base/testbase.py
@@ -101,7 +101,8 @@ def init_collection(self, collection_name, pk_field="id", metric_type="L2", dim=
batch_size = batch_size
batch = nb // batch_size
remainder = nb % batch_size
data = []

full_data = []
insert_ids = []
for i in range(batch):
nb = batch_size
@@ -116,6 +117,7 @@ def init_collection(self, collection_name, pk_field="id", metric_type="L2", dim=
assert rsp['code'] == 0
if return_insert_id:
insert_ids.extend(rsp['data']['insertIds'])
full_data.extend(data)
# insert remainder data
if remainder:
nb = remainder
@@ -128,10 +130,11 @@ def init_collection(self, collection_name, pk_field="id", metric_type="L2", dim=
assert rsp['code'] == 0
if return_insert_id:
insert_ids.extend(rsp['data']['insertIds'])
full_data.extend(data)
if return_insert_id:
return schema_payload, data, insert_ids
return schema_payload, full_data, insert_ids

return schema_payload, data
return schema_payload, full_data

def wait_collection_load_completed(self, name):
t0 = time.time()
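
The testbase change above makes init_collection return every inserted row rather than only the last generated batch. A minimal sketch of the intended accumulation pattern (gen_rows and client.vector_insert are illustrative stand-ins for the suite's data generator and REST client, not exact names):

def insert_in_batches(client, collection_name, gen_rows, nb, batch_size):
    # Accumulate every generated batch so callers see all inserted rows,
    # not just the final batch (the behavior this diff fixes).
    full_data, insert_ids = [], []
    sizes = [batch_size] * (nb // batch_size)
    if nb % batch_size:
        sizes.append(nb % batch_size)
    for size in sizes:
        data = gen_rows(size)
        rsp = client.vector_insert({"collectionName": collection_name, "data": data})
        assert rsp["code"] == 0
        insert_ids.extend(rsp["data"]["insertIds"])
        full_data.extend(data)
    return full_data, insert_ids
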
165 changes: 134 additions & 31 deletions tests/restful_client_v2/testcases/test_vector_operations.py
@@ -4,8 +4,10 @@
import sys
import json
import time

import utils.utils
from utils import constant
from utils.utils import gen_collection_name
from utils.utils import gen_collection_name, get_sorted_distance
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
@@ -921,12 +923,10 @@ def test_upsert_vector_pk_auto_id(self, nb, dim, insert_round, id_type):
@pytest.mark.L0
class TestSearchVector(TestBase):


@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.skip(reason="behavior change;todo:@zhuwenxing")
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [16])
def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, auto_id,
@@ -1011,14 +1011,7 @@ def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, aut
"filter": "word_count > 100",
"groupingField": "user_id",
"outputFields": ["*"],
"searchParams": {
"metricType": "COSINE",
"params": {
"radius": "0.1",
"range_filter": "0.8"
}
},
"limit": 100,
"limit": 100
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
@@ -1032,10 +1025,10 @@ def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, aut
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.skip(reason="behavior change;todo:@zhuwenxing")
@pytest.mark.parametrize("nq", [1, 2])
@pytest.mark.parametrize("metric_type", ['COSINE', "L2", "IP"])
def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, nq):
is_partition_key, enable_dynamic_schema, nq, metric_type):
"""
Insert a vector with a simple payload
"""
@@ -1056,7 +1049,7 @@ def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, a
]
},
"indexParams": [
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": "COSINE"},
{"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type},
]
}
rsp = self.collection_client.collection_create(payload)
@@ -1100,13 +1093,6 @@ def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, a
"filter": "word_count > 100",
"groupingField": "user_id",
"outputFields": ["*"],
"searchParams": {
"metricType": "COSINE",
"params": {
"radius": "0.1",
"range_filter": "0.8"
}
},
"limit": 100,
}
rsp = self.vector_client.vector_search(payload)
@@ -1227,8 +1213,8 @@ def test_search_vector_with_sparse_float_vector_datatype(self, nb, dim, insert_r
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.skip(reason="behavior change;todo:@zhuwenxing")
def test_search_vector_with_binary_vector_datatype(self, nb, dim, insert_round, auto_id,
@pytest.mark.parametrize("metric_type", ['HAMMING'])
def test_search_vector_with_binary_vector_datatype(self, metric_type, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema):
"""
Insert a vector with a simple payload
@@ -1250,7 +1236,7 @@ def test_search_vector_with_binary_vector_datatype(self, nb, dim, insert_round,
]
},
"indexParams": [
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING",
{"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": metric_type,
"params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}}
]
}
@@ -1301,13 +1287,6 @@ def test_search_vector_with_binary_vector_datatype(self, nb, dim, insert_round,
"data": [gen_vector(datatype="BinaryVector", dim=dim)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"metricType": "HAMMING",
"params": {
"radius": "0.1",
"range_filter": "0.8"
}
},
"limit": 100,
}
rsp = self.vector_client.vector_search(payload)
@@ -1549,6 +1528,130 @@ def test_search_vector_with_complex_int64_varchar_and_filter(self, filter_expr):
if "like" in varchar_expr:
assert name.startswith(prefix)

@pytest.mark.parametrize("consistency_level", ["Strong", "Bounded", "Eventually", "Session"])
def test_search_vector_with_consistency_level(self, consistency_level):
"""
Search a vector with different consistency level
"""
name = gen_collection_name()
self.name = name
nb = 200
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb)
names = []
for item in data:
names.append(item.get("name"))
names.sort()
logger.info(f"names: {names}")
mid = len(names) // 2
prefix = names[mid][0:2]
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"limit": limit,
"offset": 0,
"consistencyLevel": consistency_level
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
assert len(res) == limit

@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
def test_search_vector_with_range_search(self, metric_type):
"""
Search a vector with range search with different metric type
"""
name = gen_collection_name()
self.name = name
nb = 3000
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
training_data = [item[vector_field] for item in data]
distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type)
r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.2*limit))] # recall is not 100% so add 20% to make sure the range is correct
if metric_type == "L2":
r1, r2 = r2, r1
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])
payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"limit": limit,
"offset": 0,
"searchParams": {
"params": {
"radius": r1,
"range_filter": r2,
}
}
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
assert len(res) == limit
for item in res:
distance = item.get("distance")
if metric_type == "L2":
assert r1 > distance > r2
else:
assert r1 < distance < r2

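To make the bound selection above easier to follow, here is a small sketch of how radius and range_filter are chosen per metric (a simplification of the test logic; it assumes distances is a single query's scores sorted ascending, as returned by get_sorted_distance):

def range_bounds(distances, nb, limit, metric_type, slack=0.2):
    # Pick an inner and an outer score from the sorted list, padded by `slack`
    # because recall is not exactly 100%.
    lo = distances[nb // 2]
    hi = distances[nb // 2 + limit + int(slack * limit)]
    if metric_type == "L2":
        # L2 is a true distance (smaller = closer), so radius is the larger
        # bound and hits satisfy range_filter < distance < radius.
        return {"radius": hi, "range_filter": lo}
    # IP and COSINE are similarities (larger = closer), so the bounds flip:
    # hits satisfy radius < distance < range_filter.
    return {"radius": lo, "range_filter": hi}
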
@pytest.mark.parametrize("ignore_growing", [True, False])
def test_search_vector_with_ignore_growing(self, ignore_growing):
"""
Search a vector with ignore_growing set to True and False
"""
name = gen_collection_name()
self.name = name
metric_type = "COSINE"
nb = 1000
dim = 128
limit = 100
schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type)
vector_field = schema_payload.get("vectorField")
# search data
vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()
training_data = [item[vector_field] for item in data]
distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type)
r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.2*limit))] # recall is not 100% so add 20% to make sure the range is correct
if metric_type == "L2":
r1, r2 = r2, r1
output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field])

payload = {
"collectionName": name,
"data": [vector_to_search],
"outputFields": output_fields,
"limit": limit,
"offset": 0,
"searchParams": {
"ignore_growing": ignore_growing

}
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
res = rsp['data']
logger.info(f"res: {len(res)}")
if ignore_growing is True:
assert len(res) == 0
else:
assert len(res) == limit

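A note on why the ignore_growing=True branch expects zero hits (an assumption based on general Milvus segment behavior: freshly inserted rows stay in growing segments until they are flushed or sealed). Illustrative payload only; the collection name and query vector are placeholders:

payload_skip_growing = {
    "collectionName": "example_collection",   # placeholder name
    "data": [[0.0] * 128],                    # one 128-dim query vector
    "limit": 100,
    "searchParams": {"ignore_growing": True}, # skip unflushed (growing) segments
}
# The test never flushes, so all rows still live in growing segments and the
# search above should return an empty result; with False it returns up to `limit` hits.
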


@pytest.mark.L1
class TestSearchVectorNegative(TestBase):
26 changes: 25 additions & 1 deletion tests/restful_client_v2/utils/utils.py
@@ -10,7 +10,7 @@
import requests
from loguru import logger
import datetime

from sklearn.metrics import pairwise_distances
fake = Faker()
rng = np.random.default_rng()

@@ -240,4 +240,28 @@ def get_all_fields_by_data(data, exclude_fields=None):
return list(fields)


def ip_distance(x, y):
return np.dot(x, y)


def cosine_distance(u, v, epsilon=1e-8):
dot_product = np.dot(u, v)
norm_u = np.linalg.norm(u)
norm_v = np.linalg.norm(v)
return dot_product / (max(norm_u * norm_v, epsilon))


def l2_distance(u, v):
return np.sum((u - v) ** 2)


def get_sorted_distance(train_emb, test_emb, metric_type):
milvus_sklearn_metric_map = {
"L2": l2_distance,
"COSINE": cosine_distance,
"IP": ip_distance
}
distance = pairwise_distances(train_emb, Y=test_emb, metric=milvus_sklearn_metric_map[metric_type], n_jobs=-1)
distance = np.array(distance.T, order='C', dtype=np.float16)
distance_sorted = np.sort(distance, axis=1).tolist()
return distance_sorted
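
A minimal usage sketch of the new helper (run from inside the restful_client_v2 test suite so the utils.utils import resolves; array sizes are arbitrary):

import numpy as np
from utils.utils import get_sorted_distance

train = np.random.rand(10, 8).tolist()    # 10 stored vectors of dim 8
queries = np.random.rand(2, 8).tolist()   # 2 query vectors
scores = get_sorted_distance(train, queries, "L2")
# One ascending list of 10 scores per query; note that the "IP" and "COSINE"
# callables return similarities, so larger values mean closer neighbors.
assert len(scores) == 2 and len(scores[0]) == 10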