From c331cb4de6bf8e5ae0ce20ef511565febdaf06df Mon Sep 17 00:00:00 2001 From: Filip Haltmayer Date: Wed, 19 Apr 2023 13:34:22 -0700 Subject: [PATCH] Include MilvusClient Signed-off-by: Filip Haltmayer --- pymilvus/milvus_client/__init__.py | 0 pymilvus/milvus_client/defaults.py | 12 + pymilvus/milvus_client/milvus_client.py | 816 ++++++++++++++++++ pymilvus/milvus_client/milvus_client_tests.py | 148 ++++ 4 files changed, 976 insertions(+) create mode 100644 pymilvus/milvus_client/__init__.py create mode 100644 pymilvus/milvus_client/defaults.py create mode 100644 pymilvus/milvus_client/milvus_client.py create mode 100644 pymilvus/milvus_client/milvus_client_tests.py diff --git a/pymilvus/milvus_client/__init__.py b/pymilvus/milvus_client/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymilvus/milvus_client/defaults.py b/pymilvus/milvus_client/defaults.py new file mode 100644 index 000000000..446e938d6 --- /dev/null +++ b/pymilvus/milvus_client/defaults.py @@ -0,0 +1,12 @@ +"""Default MilvusClient args.""" + +DEFAULT_SEARCH_PARAMS = { + "IVF_FLAT": {"metric_type": "L2", "params": {"nprobe": 10}}, + "IVF_SQ8": {"metric_type": "L2", "params": {"nprobe": 10}}, + "IVF_PQ": {"metric_type": "L2", "params": {"nprobe": 10}}, + "HNSW": {"metric_type": "L2", "params": {"ef": 10}}, + "RHNSW_FLAT": {"metric_type": "L2", "params": {"ef": 10}}, + "RHNSW_SQ": {"metric_type": "L2", "params": {"ef": 10}}, + "RHNSW_PQ": {"metric_type": "L2", "params": {"ef": 10}}, + "AUTOINDEX": {"metric_type": "L2", "params": {}}, +} diff --git a/pymilvus/milvus_client/milvus_client.py b/pymilvus/milvus_client/milvus_client.py new file mode 100644 index 000000000..1853f5f7b --- /dev/null +++ b/pymilvus/milvus_client/milvus_client.py @@ -0,0 +1,816 @@ +"""MilvusClient for dealing with simple workflows.""" +import logging +import threading +from typing import Optional, Union, List, Dict +from uuid import uuid4 + +from tqdm import tqdm +from pymilvus.client.types import LoadState +from pymilvus.exceptions import MilvusException +from pymilvus.milvus_client.defaults import DEFAULT_SEARCH_PARAMS +from pymilvus.orm import utility +from pymilvus.orm.collection import Collection, CollectionSchema, FieldSchema +from pymilvus.orm.connections import connections +from pymilvus.orm.types import DataType, infer_dtype_bydata + +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + + +class MilvusClient: + """The Milvus Client""" + def __init__( + self, + collection_name: str = "ClientCollection", + pk_field: str = None, + vector_field: str = None, + uri: str = None, + shard_num: int = None, + partitions: List[str] = None, + consistency_level: str = "Bounded", + replica_number: int = 1, + index_params: dict = None, + timeout: Optional[int] = None, + drop_old: bool = False, + ): + """A client for the common Milvus use case. + + This client attempts to hide away the complexity of using Pymilvus. In a lot ofcases what + the user wants is a simple wrapper that supports adding data, deleting data, and searching. + This wrapper can autoinfer the schema from a previous collection or newly inserted data, + can update the paritions, can query, and can delete by pk. + + Args: + pk_field (str, optional): Which entry in data is considered the primary key. If None, + an auto-id will be created. Will be overwritten if loading from a previous + collection. Defaults to None. + vector_field (str, optional): Which entry in the data is considered the vector field. + Will get overwritten if loading from previous collection. 
Defaults to None. + uri (str, optional): The connection address to use to connect to the + instance. Defaults to "http://localhost:19530". You can also set this address + as an env variable + shard_num (int, optional): The amount of shards to use for the collection. Unless + dealing with huge scale, recommended to keep at default. Defaults to None and allows + server to set. + partitions (List[str], optional): Which paritions to create for the collection. + Defaults to None. + consistency_level (str, optional): Which consistency level to use for the Client. + The options are "Strong", "Bounded", "Eventually", "Session". Defaults to "Bounded". + replica_number (int, optional): The amount of in memomory replicas to use. + Defaults to 1. + index_params (dict, optional): What index parameteres to use for the Collection. + If none, will use a default one. Defaults to None. + timeout (Optional[int], optional): What timeout to use for function calls. Defaults + to None. + drop_old (bool, optional): If a collection with the same name already exists, drop it. + Defaults to False. + """ + self.uri = uri + self.collection_name = collection_name + self.shard_num = shard_num + self.partitions = partitions + self.consistency_level = consistency_level + self.replica_number = replica_number + self.index_params = index_params + self.timeout = timeout + self.pk_field = pk_field + self.vector_field = vector_field + + # TODO: Figure out thread safety + # self.concurrent_counter = 0 + self.concurrent_lock = threading.RLock() + self.dim = None + self.default_search_params = None + self.collection = None + self.fields = None + + self.alias = self._create_connection() + self.is_self_hosted = bool( + utility.get_server_type(using=self.alias) == "milvus" + ) + if drop_old: + self.delete_collection() + self._init(None) + + def insert_data( + self, + data: List[Dict[str, any]], + timeout: int = None, + batch_size: int = 100, + partition: str = None, + progress_bar: bool = False, + ) -> List[Union[str, int]]: + """Insert data into the collection. + + If the Milvus Client was initiated without an existing Collection, the first dict passed + in will be used to initiate the collection. + + Args: + data (List[Dict[str, any]]): A list of dicts to pass in. If list not provided, will + cast to list. + timeout (int, optional): The timeout to use, will override init timeout. Defaults + to None. + batch_size (int, optional): The batch size to perform inputs with. Defaults to 100. + partition (str, optional): Which partition to insert into. Defaults to None. + progress_bar (bool, optional): Whether to display a progress bar for the input. + Defaults to False. + + Raises: + DataNotMatchException: If the data has misssing fields an exception will be thrown. + MilvusException: General Milvus error on insert. + + Returns: + List[Union[str, int]]: A list of primary keys that were inserted. + """ + # If no data provided, we cannot input anything + if len(data) == 0: + return [] + + if batch_size < 1: + logger.error( + "Invalid batch size provided for insert." 
+ ) + + raise ValueError("Invalid batch size provided for insert.") + + # If the collection hasnt been initialized, initialize it + with self.concurrent_lock: + if self.collection is None: + self._init(data[0]) + + # Dont include the primary key if auto_id is true and they included it in data + ignore_pk = self.pk_field if self.collection.schema.auto_id else None + insert_dict = {} + pks = [] + + for k in data: + for key, value in k.items(): + if key in self.fields: + insert_dict.setdefault(key, []).append(value) + + # Insert the data in batches + for i in tqdm(range(0, len(data), batch_size), disable=not progress_bar): + # Convert dict to list of lists batch for insertion + try: + insert_batch = [insert_dict[key][i : i + batch_size] for key in self.fields if key != ignore_pk] + except KeyError as ex: + logger.error( + "Malformed data, one of the inserts does not contain all fields required." + ) + raise ex + # Insert into the collection. + try: + res = self.collection.insert( + insert_batch, + timeout=timeout or self.timeout, + partition_name=partition, + ) + pks.extend(res.primary_keys) + except MilvusException as ex: + logger.error( + "Failed to insert batch starting at entity: %s/%s", str(i), str(len(data)) + ) + raise ex + return pks + + def upsert_data( + self, + data: List[Dict[str, any]], + timeout: int = None, + batch_size: int = 100, + partition: str = None, + progress_bar: bool = False, + ) -> List[Union[str, int]]: + """WARNING: SLOW AND NOT ATOMIC. Will be updated for 2.3 release. + + Upsert the data into the collection. + + If the Milvus Client was initiated without an existing Collection, the first dict passed + in will be used to initiate the collection. + + Args: + data (List[Dict[str, any]]): A list of dicts to upsert. + timeout (int, optional): The timeout to use, will override init timeout. Defaults + to None. + batch_size (int, optional): The batch size to perform inputs with. Defaults to 100. + partition (str, optional): Which partition to insert into. Defaults to None. + progress_bar (bool, optional): Whether to display a progress bar for the input. + Defaults to False. + Returns: + List[Union[str, int]]: A list of primary keys that were inserted. + """ + # If the collection exists we need to first delete the values + if self.collection is not None: + pks = [x[self.pk_field] for x in data] + self.delete_by_pk(pks, timeout) + + ret = self.insert_data( + data=data, + timeout=timeout, + batch_size=batch_size, + partition=partition, + progress_bar=progress_bar + ) + + return ret + + def search_data( + self, + data: Union[List[list], list], + search_params: dict = None, + filter_expression: str = None, + top_k: int = 10, + partitions: List[str] = None, + timeout: int = None, + ) -> List[dict]: + """Search for a query vector/vectors. + + In order for the search to process, a collection needs to have been either provided + at init or data needs to have been inserted. + + Args: + data (Union[List[list], list]): The vector/vectors to search. + search_params (dict, optional): The search params to use for the search. Will default + to the default set for the client. + filter_expression (str, optional): A filter to use for the search. Defaults to None. + top_k (int, optional): How many results to return per search. Defaults to 10. + partitions (List[str], optional): Which partitions to search within. Defaults to + searching through all. + timeout (int, optional): Timeout to use, overides the client level assigned at init. + Defaults to None. 
+ + Raises: + ValueError: The collection being searched doesnt exist. Need to insert data first. + + Returns: + List[dict]: A list of dicts containing the score and the result data. Embeddings are + not included in the result data. + """ + + # TODO: Figure out thread safety + # with self.concurrent_lock: + # self.concurrent_counter += 1 + + if self.collection is None: + logger.error("Collection does not exist: %s", self.collection_name) + raise ValueError( + "Missing collection. Make sure data inserted or intialized on existing collection." + ) + + if not isinstance(data[0], list): + data = [data] + + return_fields = list(self.fields.keys()) + return_fields.remove(self.vector_field) + + res = self.collection.search( + data, + self.vector_field, + expr=filter_expression, + param=search_params or self.default_search_params, + limit=top_k, + partition_names=partitions, + output_fields=return_fields, + timeout=timeout or self.timeout, + ) + + ret = [] + for hits in res: + for hit in hits: + ret_dict = {x: hit.entity.get(x) for x in return_fields} + ret.append({"score": hit.score, "data": ret_dict}) + + # TODO: Figure out thread safety + # with self.concurrent_lock: + # self.concurrent_counter -= 1 + return ret + + def query_data( + self, + filter_expression: str, + partitions: List[str] = None, + timeout: int = None, + ) -> List[dict]: + """Query for entries in the Collection. + + Args: + filter_expression (str): The filter to use for the query. + partitions (List[str], optional): Which partitions to perform query. Defaults to None. + timeout (int, optional): Timeout to use, overides the client level assigned at init. + Defaults to None. + + Raises: + ValueError: Missing collection. + + Returns: + List[dict]: A list of result dicts, embeddings are not included. + """ + + # TODO: Figure out thread safety + # with self.concurrent_lock: + # self.concurrent_counter += 1 + + if self.collection is None: + logger.error("Collection does not exist: %s", self.collection_name) + raise ValueError( + "Missing collection. Make sure data inserted or intialized on existing collection." + ) + + return_fields = list(self.fields.keys()) + return_fields.remove(self.vector_field) + + res = self.collection.query( + expr=filter_expression, + partition_names=partitions, + output_fields=return_fields, + timeout=timeout or self.timeout, + ) + + # TODO: Figure out thread safety + # with self.concurrent_lock: + # self.concurrent_counter -= 1 + + return res + + def get_embeddings_by_pk( + self, + pks: List[Union[str, int]], + timeout: int = None, + ) -> None: + """Grab the inserted embeddings using the primary key from the Collection. + + Due to current implementations, grabbing a large amount of vectors is slow. + + Args: + filter_expression (str): The filter to use for the query. + timeout (int, optional): Timeout to use, overides the client level assigned at + init. Defaults to None. + + Raises: + ValueError: Missing collection. + + Returns: + List[dict]: A list of result dicts with keys {pk_field, vector_field} + """ + + # TODO: Figure out thread safety + # with self.concurrent_lock: + # self.concurrent_counter += 1 + + if self.collection is None: + logger.error("Collection does not exist: %s", self.collection_name) + raise ValueError( + "Missing collection. Make sure data inserted or intialized on existing collection." 
            )

        # Varchar pks need double quotes around the values
        if self.fields[self.pk_field] == DataType.VARCHAR:
            ids = ['"' + str(entry) + '"' for entry in pks]
            expr = f""""{self.pk_field}" in [{','.join(ids)}]"""
        else:
            ids = [str(entry) for entry in pks]
            expr = f"{self.pk_field} in [{','.join(ids)}]"

        res = self.collection.query(
            expr=expr,
            output_fields=[self.vector_field],
            timeout=timeout or self.timeout,
        )

        # TODO: Figure out thread safety
        # with self.concurrent_lock:
        #     self.concurrent_counter -= 1

        return res

    def delete_by_pk(
        self,
        pks: list,
        timeout: int = None,
    ) -> None:
        """Delete entries in the collection by their pk.

        Delete all the entries that match the given pks. If you are unsure of the pks, you can
        first query the collection to grab the corresponding data, then delete using the pk_field.

        Args:
            pks (list): The list of primary keys for the entries to delete.
            timeout (int, optional): Timeout to use, overrides the client level assigned at init.
                Defaults to None.

        Raises:
            ValueError: Missing collection.
        """
        # TODO: Figure out thread safety
        # with self.concurrent_lock:
        #     self.concurrent_counter += 1

        if self.collection is None:
            logger.error("Collection does not exist: %s", self.collection_name)
            raise ValueError(
                "Missing collection. Make sure data inserted or initialized on existing collection."
            )

        if len(pks) == 0:
            return

        if self.fields[self.pk_field] == DataType.VARCHAR:
            ids = ['"' + str(entry) + '"' for entry in pks]
            expr = f""""{self.pk_field}" in [{','.join(ids)}]"""
        else:
            ids = [str(entry) for entry in pks]
            expr = f"{self.pk_field} in [{','.join(ids)}]"

        self.collection.delete(expr=expr, timeout=timeout or self.timeout)

        # TODO: Figure out thread safety
        # with self.concurrent_lock:
        #     self.concurrent_counter -= 1

    def delete_collection(
        self,
    ) -> None:
        """Delete the collection."""
        # TODO: Figure out thread safety
        with self.concurrent_lock:
            if self.collection is not None:
                self.collection.drop()
                self.collection = None

    def close(
        self,
        drop_collection: bool = True,
    ):
        """Close the client connection, optionally dropping the collection first."""
        if drop_collection:
            self.delete_collection()

        connections.remove_connection(self.alias)

    def add_partitions(self, input_partitions: List[str]):
        """Add partitions to the collection.

        Add a list of partition names to the collection. If the collection is loaded,
        it will first be unloaded, then the partitions will be added, and then reloaded.

        Args:
            input_partitions (List[str]): The list of partition names to be added.

        Raises:
            MilvusException: Unable to add the partition.
        """
        # TODO: Figure out thread safety
        with self.concurrent_lock:
            if self.collection is not None and self.is_self_hosted:
                # Calculate which partitions need to be added
                input_partitions = set(input_partitions)
                current_partitions = {
                    partition.name for partition in self.collection.partitions
                }
                new_partitions = input_partitions.difference(current_partitions)
                # If partitions need to be added, add them
                if len(new_partitions) != 0:
                    # Try to unload the collection; if an exception is raised it was already released
                    reload = False
                    try:
                        self.collection.release(timeout=self.timeout)
                        reload = True
                    except MilvusException:
                        pass

                    try:
                        for part in new_partitions:
                            self.collection.create_partition(part)
                        logger.info(
                            "Successfully added partitions to collection: %s partitions: %s",
                            self.collection_name,
                            ",".join(part for part in list(new_partitions)),
                        )
                    except MilvusException as ex:
                        logger.error(
                            "Failed to add partitions to: %s", self.collection_name
                        )
                        raise ex
                    # If the collection started out loaded, reload it.
+ if reload: + self._load() + else: + logger.debug( + "No parititons to add for collection: %s", self.collection_name + ) + else: + logger.debug( + "Collection either on Zilliz or non existant for collection: %s", + self.collection_name, + ) + + def delete_partitions(self, remove_partitions: List[str]): + """Remove partitions from the collection. + + Remove a list of partition names from the collection. If the collection is loaded + it will first be unloaded, then the partitions will be removed, and then reloaded. + + Args: + remove_partitions (List[str]): The list of partition names to be removed. + + Raises: + MilvusException: Unable to remove the partition. + """ + with self.concurrent_lock: + if self.collection is not None and self.is_self_hosted: + # Calculate which partitions need to be removed + remove_partitions = set(remove_partitions) + current_partitions = { + partition.name for partition in self.collection.partitions + } + removal_partitions = remove_partitions.intersection(current_partitions) + # If partitions need to be added, add them + if len(removal_partitions) != 0: + # Try to unload the collection, if exception raised it is most likely already + # released + reload = False + try: + self.collection.release(timeout=self.timeout) + reload = True + except MilvusException: + pass + try: + for part in removal_partitions: + self.collection.drop_partition(part) + logger.info( + "Successfully deleted partitions from collection: %s partitions: %s", + self.collection_name, + ",".join(part for part in list(removal_partitions)), + ) + except MilvusException as ex: + logger.debug( + "Failed to delete partitions from: %s", self.collection_name + ) + raise ex + # If the collection started out loaded, reload it. + if reload: + self._load() + else: + logger.debug( + "No parititons to delete for collection: %s", + self.collection_name, + ) + + def _create_connection(self) -> str: + """Create the connection to the Milvus server.""" + # TODO: Implement reuse with new uri style + alias = uuid4().hex + try: + connections.connect(alias=alias, uri = self.uri) + logger.debug("Created new connection using: %s", alias) + return alias + except MilvusException as ex: + logger.error("Failed to create new connection using: %s", alias) + raise ex + + def _init(self, input_data: Optional[dict]): + """Create/connect to the colletion""" + # If no input data and collection exists, use that + if input_data is None and utility.has_collection( + self.collection_name, using=self.alias + ): + self.collection = Collection(self.collection_name, using=self.alias) + # Grab the field information from the existing collection + self._extract_fields() + # If data is supplied we can create a new collection + elif input_data is not None: + self._create_collection(input_data) + # Nothin to init from + else: + logger.debug( + "No information to perform init from for collection %s", + self.collection_name, + ) + return + self._create_index() + # Partitions only allowed on Milvus at the moment + if self.is_self_hosted and self.partitions is not None: + self.add_partitions(self.partitions) + self._create_default_search_params() + self._load() + + def _create_collection(self, data: dict) -> None: + """Create the collection by autoinferring the schema.""" + # TODO: Assuming ordered dict for 3.7 + fields = {} + + # Figure out each datatype of the input. 
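        # Note: infer_dtype_bydata maps a native Python value (str, int, float, list of
        # floats, numpy array, ...) to the closest Milvus DataType; values it cannot map
        # come back as UNKNOWN/NONE and are rejected below.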
+ for key, value in data.items(): + # Infer the corresponding datatype of the metadata + dtype = infer_dtype_bydata(value) + + # Datatype isnt compatible + if dtype in (DataType.UNKNOWN, DataType.NONE): + logger.error( + "Failed to parse schema for collection %s, unrecognized dtype for key: %s", + self.collection_name, + key, + ) + raise ValueError(f"Unrecognized datatype for {key}.") + + # Create an entry under the field name + fields[key] = {} + fields[key]["name"] = key + fields[key]["dtype"] = dtype + + # Area for attaching kwargs for certain datatypes + if dtype == DataType.VARCHAR: + fields[key]["max_length"] = 65_535 + + if self.vector_field is None: + logger.error( + "Missing vector_field, cannot infer schema for collection: %s", + self.collection_name, + ) + raise ValueError("Missing vector_field, cannot autoinfer schema.") + + try: + self.dim = len(data[self.vector_field]) + # Attach dim kwarg to vector field + fields[self.vector_field]["dim"] = self.dim + except KeyError as ex: + logger.error( + "Missing vector_field: %s in data for collection: %s", + self.vector_field, + self.collection_name, + ) + raise ex + + if self.pk_field is None: + # Generate a unique auto-id field + self.pk_field = "internal_pk_" + uuid4().hex[:4] + # Create a new field for pk + fields[self.pk_field] = {} + fields[self.pk_field]["name"] = self.pk_field + fields[self.pk_field]["dtype"] = DataType.INT64 + fields[self.pk_field]["auto_id"] = True + fields[self.pk_field]["is_primary"] = True + logger.debug( + "Missing pk_field, creating auto-id pk for collection: %s", + self.collection_name, + ) + else: + # If pk_field given, assume it was iterated + try: + fields[self.pk_field]["auto_id"] = False + fields[self.pk_field]["is_primary"] = True + except KeyError as ex: + logger.error( + "Missing pk_field: %s in data for collection: %s", + self.pk_field, + self.collection_name, + ) + raise ex + try: + # Create the fieldschemas + fieldschemas = [] + # TODO: Assuming ordered dicts for 3.7 + self.fields = {} + for field_dict in fields.values(): + fieldschemas.append(FieldSchema(**field_dict)) + self.fields[field_dict["name"]] = field_dict["dtype"] + # Create the schema for the collection + schema = CollectionSchema(fieldschemas) + # Create the collection + self.collection = Collection( + name=self.collection_name, + schema=schema, + consistency_level=self.consistency_level, + shards_num=self.shard_num, + using=self.alias, + ) + logger.error("Successfully created collection: %s", self.collection_name) + except MilvusException as ex: + logger.error("Failed to create collection: %s", self.collection_name) + raise ex + + def _extract_fields(self) -> None: + """Grab the existing fields from the Collection""" + self.fields = {} + schema = self.collection.schema + for field in schema.fields: + field_dict = field.to_dict() + if field_dict.get("is_primary", None) is not None: + logger.debug("Updating pk_field with one from collection.") + self.pk_field = field_dict["name"] + if field_dict["type"] in (DataType.FLOAT_VECTOR, DataType.BINARY_VECTOR): + logger.debug("Updating vector_field with one from collection.") + self.vector_field = field_dict["name"] + self.fields[field_dict["name"]] = field_dict["type"] + + logger.info( + "Successfully extracted fields from for collection: %s, total fields: %s, " + "pk_field: %s, vector_field: %s", + self.collection_name, + len(self.fields), + self.pk_field, + self.vector_field, + ) + + def _create_index(self) -> None: + """Create a index on the collection""" + if self._get_index() is 
None:
            # If no index params, use a default HNSW based one
            if self.index_params is None:
                # TODO: Once segment normalization we can default to IP
                metric_type = (
                    "L2"
                    if self.fields[self.vector_field] == DataType.FLOAT_VECTOR
                    else "JACCARD"
                )
                # TODO: Once AUTOINDEX type is supported by Milvus we can default to HNSW always
                index_type = "HNSW" if self.is_self_hosted else "AUTOINDEX"
                params = {"M": 8, "efConstruction": 64} if self.is_self_hosted else {}
                self.index_params = {
                    "metric_type": metric_type,
                    "index_type": index_type,
                    "params": params,
                }
            try:
                self.collection.create_index(
                    self.vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                    timeout=self.timeout,
                )
                logger.info(
                    "Successfully created an index on collection: %s",
                    self.collection_name,
                )
            except MilvusException as ex:
                logger.error(
                    "Failed to create an index on collection: %s", self.collection_name
                )
                raise ex
        else:
            logger.debug(
                "Index exists already for collection: %s", self.collection_name
            )

    def _get_index(self):
        """Return the index if an index exists on the vector field, else None."""
        for index in self.collection.indexes:
            if index.field_name == self.vector_field:
                return index
        return None

    def _create_default_search_params(self) -> None:
        """Generate search params based on the current index type."""
        index = self._get_index()
        if index is not None:
            index = index.to_dict()
            index_type = index["index_param"]["index_type"]
            metric_type = index["index_param"]["metric_type"]
            self.default_search_params = DEFAULT_SEARCH_PARAMS[index_type]
            self.default_search_params["metric_type"] = metric_type

    def _load(self):
        """Loads the collection."""
        if self._get_index() is not None:
            # Check if the collection is loaded or in progress of loading
            if (
                utility.load_state(
                    self.collection_name, using=self.alias, timeout=self.timeout
                )
                != LoadState.NotLoad
            ):
                # If the collection is loaded/loading, check the replica count
                if len(self.collection.get_replicas().groups) == self.replica_number:
                    logger.debug("Collection already loaded.")
                    return

                # If the replica count is incorrect, release the collection
                try:
                    self.collection.release(timeout=self.timeout)
                    logger.debug(
                        "Successfully released collection due to incorrect replica: %s",
                        self.collection_name,
                    )
                except MilvusException as ex:
                    logger.error(
                        "Failed to release collection with incorrect num_replicas: %s",
                        self.collection_name,
                    )
                    raise ex
            # Try to load in the collection with correct replica count
            try:
                self.collection.load(replica_number=self.replica_number)
                logger.info(
                    "Successfully loaded collection with correct replica_count: %s",
                    self.collection_name,
                )
            except MilvusException:
                logger.error(
                    "Failed to load collection with num_replicas greater than one: %s, "
                    "attempting num_replicas==1",
                    self.collection_name,
                )
                # If load fails, try to load in with only 1 replica (standalone)
                try:
                    self.collection.load(replica_number=1)
                    logger.info(
                        "Successfully loaded collection with num_replicas==1: %s",
                        self.collection_name,
                    )
                # If both loads fail, raise exception
                except MilvusException as ex:
                    logger.error("Failed to load collection: %s", self.collection_name)
                    raise ex
diff --git a/pymilvus/milvus_client/milvus_client_tests.py b/pymilvus/milvus_client/milvus_client_tests.py
new file mode 100644
index 000000000..2a505d57b
--- /dev/null
+++ b/pymilvus/milvus_client/milvus_client_tests.py
@@ -0,0 +1,148 @@
+"""Test the MilvusClient"""
+import logging
+import sys +from uuid import uuid4 +import numpy as np + +from pymilvus import FieldSchema, DataType, CollectionSchema, connections, utility, Collection +from pymilvus.milvus_client.milvus_client import MilvusClient + +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.DEBUG) +formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +handler.setFormatter(formatter) +logger.addHandler(handler) + + +""" +Tests to Run: + +Construct non existant collection +Construct existant collection + +Insert data existant collection +Insert data nonexistant collection + +Insert non matching data existant collection +Insert non matching data nonexistant collection + +Insert insert data into auto_id with pk field +insert data into auto_id without pk field + +""" + +MILVUS_URI = "http://localhost:19530" +COLLECTION_NAME = "test" + +def valid_data(seed: int): + datas = [] + count = 10 + for cur in range(count): + float_num = seed + (cur / 10) + int_num = (seed * 10) + cur + temp = { + "varchar": str(float_num)[:5], + "float": np.float32(float_num), + "int": int_num, + "float_vector": [float_num] * 3 + } + datas.append(temp) + + return datas + +def create_existing_collection(uri, collection_name): + alias = uuid4().hex + connections.connect(uri=uri, alias=alias) + if utility.has_collection(collection_name=collection_name, using=alias): + utility.drop_collection(collection_name=collection_name, using=alias) + fields = [ + FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim = 3), + FieldSchema(name="int", dtype=DataType.INT64, is_primary = True, auto_id = True), + FieldSchema(name="float", dtype=DataType.FLOAT), + FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length= 65_535) + ] + schema = CollectionSchema(fields) + + ret = { + "col": Collection(collection_name, schema, using=alias), + "fields": ["float_vector", "int", "float", "varchar"], + "primary_field": "int", + "vector_field": "float_vector" + } + + return ret + + +class TestMilvusClient: + @staticmethod + def test_construct_from_existing_collection(): + info = create_existing_collection(MILVUS_URI, COLLECTION_NAME) + client = MilvusClient(collection_name=COLLECTION_NAME, uri=MILVUS_URI) + assert list(client.fields.keys()) == info["fields"] + assert client.pk_field == info["primary_field"] + assert client.vector_field == info["vector_field"] + info["col"].drop() + + @staticmethod + def test_construct_from_nonexistant_collection(): + client = MilvusClient(collection_name=COLLECTION_NAME, uri=MILVUS_URI) + assert client.fields == None + assert client.pk_field == None + assert client.vector_field == None + + @staticmethod + def test_insert_in_existing_collection_valid(): + info = create_existing_collection(MILVUS_URI, COLLECTION_NAME) + client = MilvusClient(collection_name=COLLECTION_NAME, uri=MILVUS_URI) + client.insert_data(valid_data(1)) + info["col"].drop() + + + +if __name__ == "__main__": + # TestMilvusClient.test_construct_from_existing_collection() + # TestMilvusClient.test_construct_from_nonexistant_collection() + TestMilvusClient.test_insert_in_existing_collection_valid() + +# import sys + +# # Test the insert +# outs = s.insert_data(test_data, partition="lol") +# pprint(outs) +# rets = s.search_data([0, 0, 0, 0, 0, 0, 0]) +# pprint(rets) + +# # Test the searches +# rets = s.search_data([0, 0, 0, 0, 0, 0, 0]) +# pprint(rets) + +# rets = s.search_data([0, 0, 0, 0, 0, 0, 0], partitions=["lol"]) +# pprint(rets) 
+ +# rets = s.search_data([0, 0, 0, 0, 0, 0, 0], partitions=["default"]) +# pprint(rets) + +# # Test the query +# rets = s.query_data(s.pk_field + " in [1]") +# print(rets) + +# rets = s.get_embeddings_by_pk([1]) +# print(rets) + +# # pprint(s.collection.partitions) + +# # ret = s.search_data([0, 0, 0, 0, 0, 0, 0]) +# # pprint(ret) + +# # ret = s.query_data("""char in ["bar"]""") +# # pprint(ret) + +# # s.delete_by_pk([1]) + +# # ret = s.query_data("""char in ["bar"]""") +# # pprint(ret)
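
# The commented scratch above exercises individual calls; the helper below is a minimal
# sketch of the full workflow (construct -> insert -> search -> query -> delete -> close).
# It is not wired into the test runner and assumes a Milvus server reachable at MILVUS_URI;
# the field names mirror the ones produced by valid_data() above.
def _example_end_to_end_workflow():
    """Illustrative only: run manually against a live Milvus instance."""
    client = MilvusClient(
        collection_name=COLLECTION_NAME,
        uri=MILVUS_URI,
        vector_field="float_vector",
        drop_old=True,
    )
    # The first insert auto-infers the schema (and an auto-id pk) from the data.
    pks = client.insert_data(valid_data(1))
    # Vector search returns score/data dicts; embeddings are excluded from the results.
    hits = client.search_data([0.5, 0.5, 0.5], top_k=3)
    # Scalar query by the auto-generated primary key field.
    rows = client.query_data(f"{client.pk_field} in [{pks[0]}]")
    client.delete_by_pk(pks)
    client.close(drop_collection=True)
    return hits, rows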