From c9ffb2309b8670561af1c905a4e54f89173ec8a4 Mon Sep 17 00:00:00 2001
From: vagimeli
Date: Thu, 4 May 2023 16:36:52 -0600
Subject: [PATCH] Revert "Add ML fault tolerance (#3803)"

This reverts commit d2d226780dd7492a83b75690c95da663a1a725f8.
---
 _ml-commons-plugin/api.md              | 98 +++++++++++++-------
 _ml-commons-plugin/cluster-settings.md | 64 +----------------
 2 files changed, 51 insertions(+), 111 deletions(-)

diff --git a/_ml-commons-plugin/api.md b/_ml-commons-plugin/api.md
index 163c726f36..0b11a63fbe 100644
--- a/_ml-commons-plugin/api.md
+++ b/_ml-commons-plugin/api.md
@@ -24,7 +24,7 @@ In order to train tasks through the API, three inputs are required.
- Model hyperparameters: Adjust these parameters to make the model train better.
- Input data: The data that trains the ML model or that the model uses for predictions. You can input data in two ways: query against your index or use a data frame.

-## Training a model
+## Train model

Training can occur both synchronously and asynchronously.

@@ -96,7 +96,7 @@ For asynchronous responses, the API returns the task_id, which can be used to ge
}
```

-## Getting model information
+## Get model information

You can retrieve information on your model using the `model_id`.

@@ -115,12 +115,12 @@ The API returns information on the model, the algorithm used, and the content fo
}
```

-## Registering a model
+## Upload a model

-Use the register operation to register a custom model to a model index. ML Commons splits the model into smaller chunks and saves those chunks in the model's index.
+Use the upload operation to upload a custom model to a model index. ML Commons splits the model into smaller chunks and saves those chunks in the model's index.

```json
-POST /_plugins/_ml/models/_register
+POST /_plugins/_ml/models/_upload
```

### Request fields
@@ -137,10 +137,10 @@ Field | Data type | Description

### Example

-The following example request registers a version `1.0.0` of an NLP sentence transformation model named `all-MiniLM-L6-v2`.
+The following example request uploads version `1.0.0` of an NLP sentence transformation model named `all-MiniLM-L6-v2`.

```json
-POST /_plugins/_ml/models/_register
+POST /_plugins/_ml/models/_upload
{
  "name": "all-MiniLM-L6-v2",
  "version": "1.0.0",
@@ -166,14 +166,14 @@ OpenSearch responds with the `task_id` and task `status`.
}
```

-To see the status of your model registration, enter the `task_id` in the [task API] ...
+To see the status of your model upload, enter the `task_id` into the [task API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api#get-task-information). Use the `model_id` from the task response once the upload is complete. For example:

```json
{
  "model_id" : "WWQI44MBbzI2oUKAvNUt",
  "task_type" : "UPLOAD_MODEL",
  "function_name" : "TEXT_EMBEDDING",
-  "state" : "REGISTERED",
+  "state" : "COMPLETED",
  "worker_node" : "KzONM8c8T4Od-NoUANQNGg",
  "create_time" : 1665961344003,
  "last_update_time" : 1665961373047,
@@ -181,28 +181,28 @@ To see the status of your model registration, enter the `task_id` in the [task A
}
```

-## Deploying a model
+## Load model

-The deploy model operation reads the model's chunks from the model index and then creates an instance of the model to cache into memory. This operation requires the `model_id`.
+The load model operation reads the model's chunks from the model index and then creates an instance of the model to cache in memory. This operation requires the `model_id`.
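
A model can be loaded only after its upload task completes. As a quick check, here is a minimal sketch that polls the [task API](#get-task-information) described later in this document; `<task_id>` is a placeholder for the ID returned by the upload call:

```json
GET /_plugins/_ml/tasks/<task_id>
```

When the task response reports a completed state, load the model: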
```json
-POST /_plugins/_ml/models/<model_id>/_deploy
+POST /_plugins/_ml/models/<model_id>/_load
```

-### Example: Deploying to all available ML nodes
+### Example: Load into all available ML nodes

-In this example request, OpenSearch deploys the model to any available OpenSearch ML node:
+In this example request, OpenSearch loads the model into any available OpenSearch ML node:

```json
-POST /_plugins/_ml/models/WWQI44MBbzI2oUKAvNUt/_deploy
+POST /_plugins/_ml/models/WWQI44MBbzI2oUKAvNUt/_load
```

-### Example: Deploying to a specific node
+### Example: Load into a specific node

-If you want to reserve the memory of other ML nodes within your cluster, you can deploy your model to a specific node(s) by specifying the `node_ids` in the request body:
+If you want to reserve the memory of other ML nodes within your cluster, you can load your model onto specific nodes by specifying the `node_ids` in the request body:

```json
-POST /_plugins/_ml/models/WWQI44MBbzI2oUKAvNUt/_deploy
+POST /_plugins/_ml/models/WWQI44MBbzI2oUKAvNUt/_load
{
    "node_ids": ["4PLK7KJWReyX0oWKnBA8nA"]
}
@@ -213,40 +213,40 @@ POST /_plugins/_ml/models/WWQI44MBbzI2oUKAvNUt/_deploy

```json
{
  "task_id" : "hA8P44MBhyWuIwnfvTKP",
-  "status" : "DEPLOYING"
+  "status" : "CREATED"
}
```

-## Undeploying a model
+## Unload a model

-To undeploy a model from memory, use the undeploy operation:
+To unload a model from memory, use the unload operation.

```json
-POST /_plugins/_ml/models/<model_id>/_undeploy
+POST /_plugins/_ml/models/<model_id>/_unload
```

-### Example: Undeploying model from all ML nodes
+### Example: Unload model from all ML nodes

```json
-POST /_plugins/_ml/models/MGqJhYMBbbh0ushjm8p_/_undeploy
+POST /_plugins/_ml/models/MGqJhYMBbbh0ushjm8p_/_unload
```

-### Response: Undeploying a model from all ML nodes
+### Response: Unload model from all ML nodes

```json
{
    "s5JwjZRqTY6nOT0EvFwVdA": {
      "stats": {
-        "MGqJhYMBbbh0ushjm8p_": "UNDEPLOYED"
+        "MGqJhYMBbbh0ushjm8p_": "unloaded"
      }
  }
}
```

-### Example: Undeploying specific models from specific nodes
+### Example: Unload specific models from specific nodes

```json
-POST /_plugins/_ml/models/_undeploy
+POST /_plugins/_ml/models/_unload
{
  "node_ids": ["sv7-3CbwQW-4PiIsDOfLxQ"],
  "model_ids": ["KDo2ZYQB-v9VEDwdjkZ4"]
@@ -254,32 +254,32 @@ POST /_plugins/_ml/models/_undeploy
```

-### Response: Undeploying specific models from specific nodes
+### Response: Unload specific models from specific nodes

```json
{
  "sv7-3CbwQW-4PiIsDOfLxQ" : {
    "stats" : {
-      "KDo2ZYQB-v9VEDwdjkZ4" : "UNDEPLOYED"
+      "KDo2ZYQB-v9VEDwdjkZ4" : "unloaded"
    }
  }
}
```

-### Response: Undeploying all models from specific nodes
+### Response: Unload all models from specific nodes

```json
{
  "sv7-3CbwQW-4PiIsDOfLxQ" : {
    "stats" : {
-      "KDo2ZYQB-v9VEDwdjkZ4" : "UNDEPLOYED",
-      "-8o8ZYQBvrLMaN0vtwzN" : "UNDEPLOYED"
+      "KDo2ZYQB-v9VEDwdjkZ4" : "unloaded",
+      "-8o8ZYQBvrLMaN0vtwzN" : "unloaded"
    }
  }
}
```

-### Example: Undeploying specific models from all nodes
+### Example: Unload specific models from all nodes

```json
{
@@ -287,19 +287,19 @@ POST /_plugins/_ml/models/_undeploy
}
```

-### Response: Undeploying specific models from all nodes
+### Response: Unload specific models from all nodes

```json
{
  "sv7-3CbwQW-4PiIsDOfLxQ" : {
    "stats" : {
-      "KDo2ZYQB-v9VEDwdjkZ4" : "UNDEPLOYED"
+      "KDo2ZYQB-v9VEDwdjkZ4" : "unloaded"
   }
  }
}
```

-## Searching for a model
+## Search model

Use this command to search models you've already created.
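
The search request body takes standard OpenSearch query DSL. As an illustrative sketch beyond the examples below, you can look up a model by the name used in the upload example (the `name` field is an assumption based on the upload request body; verify it against your model index mapping):

```json
POST /_plugins/_ml/models/_search
{
  "query": {
    "match": {
      "name": "all-MiniLM-L6-v2"
    }
  }
}
```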
@@ -309,7 +309,7 @@ POST /_plugins/_ml/models/_search
{query}
```

-### Example: Querying all models
+### Example: Query all models

```json
POST /_plugins/_ml/models/_search
@@ -321,7 +321,7 @@ POST /_plugins/_ml/models/_search
}
```

-### Example: Querying models with algorithm "FIT_RCF"
+### Example: Query models with algorithm "FIT_RCF"

```json
POST /_plugins/_ml/models/_search
@@ -388,9 +388,9 @@ POST /_plugins/_ml/models/_search
}
```

-## Deleting a model
+## Delete model

Deletes a model based on the `model_id`.

```json
DELETE /_plugins/_ml/models/<model_id>
```
@@ -414,9 +414,9 @@ The API returns the following:
}
```

-## Returning model profile information
+## Profile

-The profile operation returns runtime information on ML tasks and models. The profile operation can help debug issues with models at runtime.
+Returns runtime information on ML tasks and models. This operation can help debug issues with models at runtime.


```json
@@ -444,7 +444,7 @@ task_ids | string | Returns runtime data for a specific task. You can string tog
return_all_tasks | boolean | Determines whether or not a request returns all tasks. When set to `false`, task profiles are left out of the response.
return_all_models | boolean | Determines whether or not a profile request returns all models. When set to `false`, model profiles are left out of the response.

-### Example: Returning all tasks and models on a specific node
+### Example: Return all tasks and models on a specific node

```json
GET /_plugins/_ml/profile
@@ -455,7 +455,7 @@ GET /_plugins/_ml/profile
}
```

-### Response: Returning all tasks and models on a specific node
+### Response: Return all tasks and models on a specific node

```json
{
@@ -473,7 +473,7 @@ GET /_plugins/_ml/profile
  "KzONM8c8T4Od-NoUANQNGg" : { # node id
    "models" : {
      "WWQI44MBbzI2oUKAvNUt" : { # model id
-        "model_state" : "DEPLOYED", # model status
+        "model_state" : "LOADED", # model status
        "predictor" : "org.opensearch.ml.engine.algorithms.text_embedding.TextEmbeddingModel@592814c9",
        "worker_nodes" : [ # routing table
         "KzONM8c8T4Od-NoUANQNGg"
@@ -790,7 +790,7 @@ POST /_plugins/_ml/_train_predict/kmeans
}
```

-## Getting task information
+## Get task information

You can retrieve information about a task using the `task_id`.

@@ -814,7 +814,7 @@ The response includes information about the task.
}
```

-## Searching for a task
+## Search task

Search tasks based on parameters specified in the request body.

@@ -905,7 +905,7 @@ GET /_plugins/_ml/tasks/_search
}
```

-## Deleting a task
+## Delete task

Delete a task based on the `task_id`.

diff --git a/_ml-commons-plugin/cluster-settings.md b/_ml-commons-plugin/cluster-settings.md
index 31f0abc559..660c9e5c18 100644
--- a/_ml-commons-plugin/cluster-settings.md
+++ b/_ml-commons-plugin/cluster-settings.md
@@ -59,7 +59,7 @@ plugins.ml_commons.max_ml_task_per_node: 10

## Set number of ML models per node

-Sets the number of ML models that can be deployed to each ML node. When set to `0`, no ML models can deploy on any node.
+Sets the number of ML models that can be loaded onto each ML node. When set to `0`, no ML models can load on any node.

### Setting

@@ -74,7 +74,7 @@ plugins.ml_commons.max_model_on_node: 10

## Set sync job intervals

-When returning runtime information with the [Profile API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api#profile), ML Commons will run a regular job to sync newly deployed or undeployed models on each node. When set to `0`, ML Commons immediately stops sync-up jobs.
+When returning runtime information with the [profile API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api#profile), ML Commons runs a regular job to sync newly loaded or unloaded models on each node. When set to `0`, ML Commons immediately stops sync-up jobs.
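
Most ML Commons settings, including this one, are dynamic and can be changed on a running cluster through the cluster settings API. The following is a minimal sketch of such an update; the `plugins.ml_commons.sync_up_job_interval_in_seconds` key and the value `10` are assumptions for illustration, so confirm the exact key against the setting snippet below:

```json
PUT /_cluster/settings
{
  "persistent": {
    "plugins.ml_commons.sync_up_job_interval_in_seconds": 10
  }
}
```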

### Setting

@@ -186,63 +186,3 @@ plugins.ml_commons.native_memory_threshold: 90

- Default value: 90
- Value range: [0, 100]
-
-## Allow custom deployment plans
-
-When enabled, this setting grants users the ability to deploy models to specific ML nodes according to that user's permissions.
-
-### Setting
-
-```
-plugins.ml_commons.allow_custom_deployment_plan: false
-```
-
-### Values
-
-- Default value: false
-- Value range: [false, true]
-
-## Enable auto redeploy
-
-This setting automatically redeploys deployed or partially deployed models upon cluster failure. If all ML nodes inside a cluster crash, the model switches to the `DEPLOYED_FAILED` state, and the model must be deployed manually.
-
-### Setting
-
-```
-plugins.ml_commons.model_auto_redeploy.enable: false
-```
-
-### Values
-
-- Default value: false
-- Value range: [false, true]
-
-## Set retires for auto redeploy
-
-This setting sets the limit for the number of times a deployed or partially deployed model will try and redeploy when ML nodes in a cluster fail or new ML nodes join the cluster.
-
-### Setting
-
-```
-plugins.ml_commons.model_auto_redeploy.lifetime_retry_times: 3
-```
-
-### Values
-
-- Default value: 3
-- Value range: [0, 100]
-
-## Set auto redeploy success ratio
-
-This setting sets the ratio of success for the auto-redeployment of a model based on the available ML nodes in a cluster. For example, if ML nodes crash inside a cluster, the auto redeploy protocol adds another node or retires a crashed node. If the ratio is `0.7` and 70% of all ML nodes successfully redeploy the model on auto-redeploy activation, the redeployment is a success. If the model redeploys on fewer than 70% of available ML nodes, the auto-redeploy retries until the redeployment succeeds or OpenSearch reaches [the maximum number of retries](#set-retires-for-auto-redeploy).
-
-### Setting
-
-```
-plugins.ml_commons.model_auto_redeploy_success_ratio: 0.8
-```
-
-### Values
-
-- Default value: 0.8
-- Value range: [0, 1]
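
Once this revert is applied, you can confirm which ML Commons settings the cluster still exposes by dumping the effective settings, including defaults. This is a standard cluster settings call, not specific to ML Commons; settings that have not been set explicitly appear under `defaults` in the response:

```json
GET /_cluster/settings?include_defaults=true
```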