diff --git a/benchmarks/perf-tool/README.md b/benchmarks/perf-tool/README.md index 9c1c18918..f98227e27 100644 --- a/benchmarks/perf-tool/README.md +++ b/benchmarks/perf-tool/README.md @@ -13,18 +13,36 @@ file. ## Install Prerequisites -### Python +### Setup -Python 3.7 or above is required. +K-NN perf requires Python 3.8 or greater to be installed. One of +the easier ways to do this is through Conda, a package and environment +management system for Python. -### Pip +First, follow the +[installation instructions](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) +to install Conda on your system. -Use pip to install the necessary requirements: +Next, create a Python 3.8 environment: +``` +conda create -n knn-perf python=3.8 +``` + +After the environment is created, activate it: +``` +source activate knn-perf +``` +Lastly, clone the k-NN repo and install all required python packages: ``` +git clone https://github.com/opensearch-project/k-NN.git +cd k-NN/benchmarks/perf-tool pip install -r requirements.txt ``` +After all of this completes, you should be ready to run your first performance benchmarks! + + ## Usage ### Quick Start @@ -72,16 +90,17 @@ The output will be the delta between the two metrics. ### Test Parameters -| Parameter Name | Description | Default | -| ----------- | ----------- | ----------- | -| endpoint | Endpoint OpenSearch cluster is running on | localhost | -| test_name | Name of test | No default | -| test_id | String ID of test | No default | -| num_runs | Number of runs to execute steps | 1 | -| show_runs | Whether to output each run in addition to the total summary | false | -| setup | List of steps to run once before metric collection starts | [] | -| steps | List of steps that make up one test run. Metrics will be collected on these steps. | No default | -| cleanup | List of steps to run after each test run | [] | +| Parameter Name | Description | Default | +|----------------|------------------------------------------------------------------------------------|------------| +| endpoint | Endpoint OpenSearch cluster is running on | localhost | +| port | Port on which OpenSearch Cluster is running on | 9200 | +| test_name | Name of test | No default | +| test_id | String ID of test | No default | +| num_runs | Number of runs to execute steps | 1 | +| show_runs | Whether to output each run in addition to the total summary | false | +| setup | List of steps to run once before metric collection starts | [] | +| steps | List of steps that make up one test run. Metrics will be collected on these steps. | No default | +| cleanup | List of steps to run after each test run | [] | ### Steps diff --git a/benchmarks/perf-tool/okpt/io/config/parsers/test.py b/benchmarks/perf-tool/okpt/io/config/parsers/test.py index 34b1752c7..d0ef4c02f 100644 --- a/benchmarks/perf-tool/okpt/io/config/parsers/test.py +++ b/benchmarks/perf-tool/okpt/io/config/parsers/test.py @@ -23,6 +23,7 @@ class TestConfig: test_name: str test_id: str endpoint: str + port: int num_runs: int show_runs: bool setup: List[Step] @@ -48,6 +49,9 @@ def parse(self, file_obj: TextIOWrapper) -> TestConfig: if 'endpoint' in config_obj: implicit_step_config['endpoint'] = config_obj['endpoint'] + if 'port' in config_obj: + implicit_step_config['port'] = config_obj['port'] + # Each step should have its own parse - take the config object and check if its valid setup = [] if 'setup' in config_obj: @@ -62,6 +66,7 @@ def parse(self, file_obj: TextIOWrapper) -> TestConfig: test_config = TestConfig( endpoint=config_obj['endpoint'], + port=config_obj['port'], test_name=config_obj['test_name'], test_id=config_obj['test_id'], num_runs=config_obj['num_runs'], diff --git a/benchmarks/perf-tool/okpt/io/config/schemas/test.yml b/benchmarks/perf-tool/okpt/io/config/schemas/test.yml index 1939a8a31..06b880cc7 100644 --- a/benchmarks/perf-tool/okpt/io/config/schemas/test.yml +++ b/benchmarks/perf-tool/okpt/io/config/schemas/test.yml @@ -9,6 +9,9 @@ endpoint: type: string default: "localhost" +port: + type: integer + default: 9200 test_name: type: string test_id: diff --git a/benchmarks/perf-tool/okpt/test/steps/steps.py b/benchmarks/perf-tool/okpt/test/steps/steps.py index 0de61078f..b04a4af4d 100644 --- a/benchmarks/perf-tool/okpt/test/steps/steps.py +++ b/benchmarks/perf-tool/okpt/test/steps/steps.py @@ -5,7 +5,7 @@ # compatible open source license. """Provides steps for OpenSearch tests. -Some of the OpenSearch operations return a `took` field in the response body, +Some OpenSearch operations return a `took` field in the response body, so the profiling decorators aren't needed for some functions. """ import json @@ -454,8 +454,7 @@ def _action(self): results['took'] = [ float(query_response['took']) for query_response in query_responses ] - port = 9200 if self.endpoint == 'localhost' else 80 - results['memory_kb'] = get_cache_size_in_kb(self.endpoint, port) + results['memory_kb'] = get_cache_size_in_kb(self.endpoint, self.port) if self.calculate_recall: ids = [[int(hit['_id']) @@ -614,7 +613,6 @@ def _action(self): num_of_search_segments = 0; for shard_key in shards.keys(): for segment in shards[shard_key]: - num_of_committed_segments += segment["num_committed_segments"] num_of_search_segments += segment["num_search_segments"] @@ -689,12 +687,13 @@ def delete_model(endpoint, port, model_id): return response.json() -def get_opensearch_client(endpoint: str, port: int): +def get_opensearch_client(endpoint: str, port: int, timeout=60): """ Get an opensearch client from an endpoint and port Args: endpoint: Endpoint OpenSearch is running on port: Port OpenSearch is running on + timeout: timeout for OpenSearch client, default value 60 Returns: OpenSearch client @@ -708,7 +707,7 @@ def get_opensearch_client(endpoint: str, port: int): use_ssl=False, verify_certs=False, connection_class=RequestsHttpConnection, - timeout=60, + timeout=timeout, ) diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/index.json new file mode 100644 index 000000000..b8f591176 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/index.json @@ -0,0 +1,26 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "target_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-spec.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-spec.json new file mode 100644 index 000000000..fecde0392 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-spec.json @@ -0,0 +1,42 @@ +{ + "bool": + { + "should": + [ + { + "range": + { + "age": + { + "gte": 30, + "lte": 70 + } + } + }, + { + "term": + { + "color": "green" + } + }, + { + "term": + { + "color": "blue" + } + }, + { + "term": + { + "color": "yellow" + } + }, + { + "term": + { + "color": "sweet" + } + } + ] + } +} \ No newline at end of file diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml new file mode 100644 index 000000000..61486b3b6 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml @@ -0,0 +1,34 @@ +endpoint: [ENDPOINT] +test_name: "Faiss HNSW Relaxed Filter Test" +test_id: "Faiss HNSW Relaxed Filter Test" +num_runs: 10 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: [INDEX_SPEC_PATH]/relaxed-filter/index.json + - name: ingest_multi_field + index_name: target_index + field_name: target_field + bulk_size: 500 + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + attributes_dataset_name: attributes + attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' } ] + - name: refresh_index + index_name: target_index + - name: query_with_filter + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + neighbors_format: hdf5 + neighbors_path: [DATASET_PATH]/sift-128-euclidean-with-filters.hdf5 + neighbors_dataset: neighbors_filter_5 + filter_spec: [INDEX_SPEC_PATH]/relaxed-filter-spec.json + filter_type: FILTER diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/index.json new file mode 100644 index 000000000..b8f591176 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/index.json @@ -0,0 +1,26 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "target_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json new file mode 100644 index 000000000..9e6356f1c --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json @@ -0,0 +1,44 @@ +{ + "bool": + { + "must": + [ + { + "range": + { + "age": + { + "gte": 30, + "lte": 60 + } + } + }, + { + "term": + { + "taste": "bitter" + } + }, + { + "bool": + { + "should": + [ + { + "term": + { + "color": "blue" + } + }, + { + "term": + { + "color": "green" + } + } + ] + } + } + ] + } +} \ No newline at end of file diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml new file mode 100644 index 000000000..bf02144ac --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml @@ -0,0 +1,37 @@ +endpoint: [ENDPOINT] +test_name: "Faiss HNSW Restrictive Filter Test" +test_id: "Faiss HNSW Restrictive Filter Test" +num_runs: 10 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: [INDEX_SPEC_PATH]/index.json + - name: ingest_multi_field + index_name: target_index + field_name: target_field + bulk_size: 500 + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + attributes_dataset_name: attributes + attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' } ] + - name: refresh_index + index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 + - name: query_with_filter + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + neighbors_format: hdf5 + neighbors_path: [DATASET_PATH]/sift-128-euclidean-with-filters.hdf5 + neighbors_dataset: neighbors_filter_4 + filter_spec: [INDEX_SPEC_PATH]/restrictive-filter-spec.json + filter_type: FILTER diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/index.json new file mode 100644 index 000000000..b8f591176 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/index.json @@ -0,0 +1,26 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "target_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw/test.yml new file mode 100644 index 000000000..f3e976cf3 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/test.yml @@ -0,0 +1,32 @@ +endpoint: localhost +test_name: "Faiss HNSW Test" +test_id: "Faiss HNSW Test" +num_runs: 10 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: /home/ec2-user/[PATH]/index.json + - name: ingest + index_name: target_index + field_name: target_field + bulk_size: 500 + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean.hdf5 + - name: refresh_index + index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 + - name: query + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean.hdf5 + neighbors_format: hdf5 + neighbors_path: [DATASET_PATH]/sift-128-euclidean.hdf5 diff --git a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml index f20fba203..44ed8e66e 100644 --- a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml +++ b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml @@ -1,6 +1,6 @@ endpoint: [ENDPOINT] -test_name: "index-workflow" -test_id: "Index workflow" +test_name: "Lucene HNSW Relaxed Filter Test" +test_id: "Lucene HNSW Relaxed Filter Test" num_runs: 10 show_runs: false steps: diff --git a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml index b1d7b60d7..d7f451a48 100644 --- a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml +++ b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml @@ -1,6 +1,6 @@ endpoint: [ENDPOINT] -test_name: "index-workflow" -test_id: "Index workflow" +test_name: "Lucene HNSW Restrictive Filter Test" +test_id: "Lucene HNSW Restrictive Filter Test" num_runs: 10 show_runs: false steps: @@ -8,17 +8,20 @@ steps: index_name: target_index - name: create_index index_name: target_index - index_spec: [INDEX_SPEC_PATH]/index.json + index_spec: /home/ec2-user/k-NN/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/index.json - name: ingest_multi_field index_name: target_index field_name: target_field bulk_size: 500 dataset_format: hdf5 - dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + dataset_path: /home/ec2-user/k-NN/benchmarks/perf-tool/dataset/sift-128-euclidean-with-attr.hdf5 attributes_dataset_name: attributes attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' } ] - name: refresh_index index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 - name: query_with_filter k: 100 r: 1 @@ -26,9 +29,9 @@ steps: index_name: target_index field_name: target_field dataset_format: hdf5 - dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + dataset_path: /home/ec2-user/k-NN/benchmarks/perf-tool/dataset/sift-128-euclidean-with-attr.hdf5 neighbors_format: hdf5 - neighbors_path: [DATASET_PATH]/sift-128-euclidean-with-filters.hdf5 + neighbors_path: /home/ec2-user/k-NN/benchmarks/perf-tool/dataset/sift-128-euclidean-with-filters.hdf5 neighbors_dataset: neighbors_filter_4 - filter_spec: [INDEX_SPEC_PATH]/restrictive-filter-spec.json + filter_spec: /home/ec2-user/k-NN/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json filter_type: FILTER diff --git a/jni/src/faiss_wrapper.cpp b/jni/src/faiss_wrapper.cpp index 2e626f9c6..a1bbb9635 100644 --- a/jni/src/faiss_wrapper.cpp +++ b/jni/src/faiss_wrapper.cpp @@ -256,7 +256,6 @@ jobjectArray knn_jni::faiss_wrapper::QueryIndex_WithFilter(knn_jni::JNIUtilInter jniUtil->ReleaseIntArrayElements(env, filterIdsJ, filteredIdsArray, JNI_ABORT); } else { try { - std::cout << "Doing query" << std::endl; indexReader->search(1, rawQueryvector, kJ, dis.data(), ids.data()); } catch (...) { jniUtil->ReleaseFloatArrayElements(env, queryVectorJ, rawQueryvector, JNI_ABORT); @@ -286,33 +285,6 @@ jobjectArray knn_jni::faiss_wrapper::QueryIndex_WithFilter(knn_jni::JNIUtilInter return results; } -/** - * Based on the type of the index reader we need to return the SearchParameters. The way we do this by dynamically - * casting the IndexReader. - * @param indexReader - * @param idSelector - * @return SearchParameters - */ -std::unique_ptr buildSearchParams(const faiss::IndexIDMap *indexReader, faiss::IDSelector* idSelector) { - auto hnswReader = dynamic_cast(indexReader->index); - if(hnswReader) { - // we need to make this variable unique_ptr so that the scope can be shared with caller function. - std::unique_ptr hnswParams(new faiss::SearchParametersHNSW); - hnswParams->sel = idSelector; - return hnswParams; - } - - auto ivfReader = dynamic_cast(indexReader->index); - auto ivfFlatReader = dynamic_cast(indexReader->index); - if(ivfReader || ivfFlatReader) { - // we need to make this variable unique_ptr so that the scope can be shared with caller function. - std::unique_ptr ivfParams(new faiss::SearchParametersIVF); - ivfParams->sel = idSelector; - return ivfParams; - } - throw std::runtime_error("Invalid Index Type supported for Filtered Search on Faiss"); -} - void knn_jni::faiss_wrapper::Free(jlong indexPointer) { auto *indexWrapper = reinterpret_cast(indexPointer); delete indexWrapper; @@ -499,3 +471,33 @@ void buildFilterIdsBitMap(const int* filterIds, int filterIdsLength, uint8_t* bi bitsetVector[bitsetArrayIndex] = bitsetVector[bitsetArrayIndex] | (1 << (value & 7)); } } + +/** + * Based on the type of the index reader we need to return the SearchParameters. The way we do this by dynamically + * casting the IndexReader. + * @param indexReader + * @param idSelector + * @return SearchParameters + */ +std::unique_ptr buildSearchParams(const faiss::IndexIDMap *indexReader, faiss::IDSelector* idSelector) { + auto hnswReader = dynamic_cast(indexReader->index); + if(hnswReader) { + // we need to make this variable unique_ptr so that the scope can be shared with caller function. + std::unique_ptr hnswParams(new faiss::SearchParametersHNSW); + // Setting the ef_search value equal to what was provided during index creation. SearchParametersHNSW has a default + // value of ef_search = 16 which will then be used. + hnswParams->efSearch = hnswReader->hnsw.efSearch; + hnswParams->sel = idSelector; + return hnswParams; + } + + auto ivfReader = dynamic_cast(indexReader->index); + auto ivfFlatReader = dynamic_cast(indexReader->index); + if(ivfReader || ivfFlatReader) { + // we need to make this variable unique_ptr so that the scope can be shared with caller function. + std::unique_ptr ivfParams(new faiss::SearchParametersIVF); + ivfParams->sel = idSelector; + return ivfParams; + } + throw std::runtime_error("Invalid Index Type supported for Filtered Search on Faiss"); +}