
Commit

add rate limiting for offline batch jobs, set default bulk size to 500 (#3116)

* add rate limiting for offline batch jobs, set default bulk size to 500

Signed-off-by: Xun Zhang <[email protected]>

* update error code to 429 for rate limiting and update logs

Signed-off-by: Xun Zhang <[email protected]>

---------

Signed-off-by: Xun Zhang <[email protected]>
Zhangxunmt authored Oct 16, 2024
1 parent 09ee93f commit 9a4166e
Showing 12 changed files with 231 additions and 49 deletions.
Ingestable.java
@@ -13,7 +13,7 @@ public interface Ingestable {
* @param mlBatchIngestionInput batch ingestion input data
* @return successRate (0 - 100)
*/
-    default double ingest(MLBatchIngestionInput mlBatchIngestionInput) {
+    default double ingest(MLBatchIngestionInput mlBatchIngestionInput, int bulkSize) {
throw new IllegalStateException("Ingest is not implemented");
}
}
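For orientation, here is a minimal sketch of a custom implementation against the new contract. Only the two-argument ingest signature above comes from this commit; the class name and both helpers are illustrative assumptions.

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical Ingestable implementation; only the ingest(input, bulkSize)
// signature is defined by this commit.
public class CustomDataIngestion implements Ingestable {

    @Override
    public double ingest(MLBatchIngestionInput mlBatchIngestionInput, int bulkSize) {
        List<String> lines = readSourceLines(mlBatchIngestionInput); // assumed helper
        if (lines.isEmpty()) {
            return 100; // mirrors the "nothing to ingest" convention used by the built-in sources
        }
        int succeeded = 0;
        List<String> buffer = new ArrayList<>();
        for (String line : lines) {
            buffer.add(line);
            if (buffer.size() == bulkSize) {
                succeeded += flushBulk(buffer); // assumed helper: one _bulk request per bulkSize lines
                buffer.clear();
            }
        }
        if (!buffer.isEmpty()) {
            succeeded += flushBulk(buffer); // flush the final partial batch
        }
        return (100.0 * succeeded) / lines.size(); // successRate in [0, 100]
    }

    private List<String> readSourceLines(MLBatchIngestionInput input) { /* assumed */ return new ArrayList<>(); }

    private int flushBulk(List<String> buffer) { /* assumed: returns docs indexed */ return buffer.size(); }
}
```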
OpenAIDataIngestion.java
@@ -39,7 +39,7 @@ public OpenAIDataIngestion(Client client) {
}

@Override
-    public double ingest(MLBatchIngestionInput mlBatchIngestionInput) {
+    public double ingest(MLBatchIngestionInput mlBatchIngestionInput, int bulkSize) {
List<String> sources = (List<String>) mlBatchIngestionInput.getDataSources().get(SOURCE);
if (Objects.isNull(sources) || sources.isEmpty()) {
return 100;
@@ -48,13 +48,19 @@ public double ingest(MLBatchIngestionInput mlBatchIngestionInput) {
boolean isSoleSource = sources.size() == 1;
List<Double> successRates = Collections.synchronizedList(new ArrayList<>());
for (int sourceIndex = 0; sourceIndex < sources.size(); sourceIndex++) {
-            successRates.add(ingestSingleSource(sources.get(sourceIndex), mlBatchIngestionInput, sourceIndex, isSoleSource));
+            successRates.add(ingestSingleSource(sources.get(sourceIndex), mlBatchIngestionInput, sourceIndex, isSoleSource, bulkSize));
}

return calculateSuccessRate(successRates);
}

-    private double ingestSingleSource(String fileId, MLBatchIngestionInput mlBatchIngestionInput, int sourceIndex, boolean isSoleSource) {
+    private double ingestSingleSource(
+        String fileId,
+        MLBatchIngestionInput mlBatchIngestionInput,
+        int sourceIndex,
+        boolean isSoleSource,
+        int bulkSize
+    ) {
double successRate = 0;
try {
String apiKey = mlBatchIngestionInput.getCredential().get(API_KEY);
@@ -82,8 +88,8 @@ private double ingestSingleSource(String fileId, MLBatchIngestionInput mlBatchIn
linesBuffer.add(line);
lineCount++;

-                    // Process every 100 lines
-                    if (lineCount % 100 == 0) {
+                    // Process every bulkSize lines
+                    if (lineCount % bulkSize == 0) {
// Create a CompletableFuture that will be completed by the bulkResponseListener
CompletableFuture<Void> future = new CompletableFuture<>();
batchIngest(
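Both readers hand each bulk off asynchronously but wait on a CompletableFuture that the bulk-response listener completes, which is what makes bulkSize an effective back-pressure knob. A sketch of that bridge, with the listener wiring assumed (the diff only shows the future being created):

```java
import java.util.List;
import java.util.concurrent.CompletableFuture;
import org.opensearch.action.bulk.BulkResponse;
import org.opensearch.core.action.ActionListener;

// Sketch: bridge the async bulk call back into the synchronous read loop so the
// reader cannot run ahead of the cluster by more than one bulk of bulkSize lines.
private void ingestOneBulk(List<String> linesBuffer) {
    CompletableFuture<Void> future = new CompletableFuture<>();
    ActionListener<BulkResponse> bulkResponseListener = ActionListener.wrap(bulkResponse -> {
        // per-bulk success/failure bookkeeping would go here (assumed)
        future.complete(null);
    }, future::completeExceptionally);
    sendBulk(linesBuffer, bulkResponseListener); // assumed stand-in for the batchIngest(...) call above
    future.join(); // block until this bulk completes before buffering the next bulkSize lines
}

private void sendBulk(List<String> lines, ActionListener<BulkResponse> listener) { /* assumed */ }
```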
S3DataIngestion.java
@@ -53,7 +53,7 @@ public S3DataIngestion(Client client) {
}

@Override
-    public double ingest(MLBatchIngestionInput mlBatchIngestionInput) {
+    public double ingest(MLBatchIngestionInput mlBatchIngestionInput, int bulkSize) {
S3Client s3 = initS3Client(mlBatchIngestionInput);

List<String> s3Uris = (List<String>) mlBatchIngestionInput.getDataSources().get(SOURCE);
@@ -63,7 +63,7 @@ public double ingest(MLBatchIngestionInput mlBatchIngestionInput) {
boolean isSoleSource = s3Uris.size() == 1;
List<Double> successRates = Collections.synchronizedList(new ArrayList<>());
for (int sourceIndex = 0; sourceIndex < s3Uris.size(); sourceIndex++) {
-            successRates.add(ingestSingleSource(s3, s3Uris.get(sourceIndex), mlBatchIngestionInput, sourceIndex, isSoleSource));
+            successRates.add(ingestSingleSource(s3, s3Uris.get(sourceIndex), mlBatchIngestionInput, sourceIndex, isSoleSource, bulkSize));
}

return calculateSuccessRate(successRates);
@@ -74,7 +74,8 @@ public double ingestSingleSource(
String s3Uri,
MLBatchIngestionInput mlBatchIngestionInput,
int sourceIndex,
-        boolean isSoleSource
+        boolean isSoleSource,
+        int bulkSize
) {
String bucketName = getS3BucketName(s3Uri);
String keyName = getS3KeyName(s3Uri);
@@ -99,8 +100,8 @@ public double ingestSingleSource(
linesBuffer.add(line);
lineCount++;

-                    // Process every 100 lines
-                    if (lineCount % 100 == 0) {
+                    // Process every bulkSize lines
+                    if (lineCount % bulkSize == 0) {
// Create a CompletableFuture that will be completed by the bulkResponseListener
CompletableFuture<Void> future = new CompletableFuture<>();
batchIngest(
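The S3 reader also splits each s3:// URI into bucket and key via getS3BucketName and getS3KeyName. Only the call sites appear in this diff; plausible implementations look like this:

```java
// Assumed helpers: split "s3://my-bucket/path/to/file.jsonl" into bucket and key.
private String getS3BucketName(String s3Uri) {
    String path = s3Uri.replaceFirst("^s3://", "");
    int firstSlash = path.indexOf('/');
    return firstSlash == -1 ? path : path.substring(0, firstSlash);
}

private String getS3KeyName(String s3Uri) {
    String path = s3Uri.replaceFirst("^s3://", "");
    int firstSlash = path.indexOf('/');
    return firstSlash == -1 ? "" : path.substring(firstSlash + 1);
}
```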
TransportBatchIngestionAction.java
@@ -10,6 +10,7 @@
import static org.opensearch.ml.common.MLTaskState.COMPLETED;
import static org.opensearch.ml.common.MLTaskState.FAILED;
import static org.opensearch.ml.plugin.MachineLearningPlugin.INGEST_THREAD_POOL;
+import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_BATCH_INGESTION_BULK_SIZE;
import static org.opensearch.ml.task.MLTaskManager.TASK_SEMAPHORE_TIMEOUT;
import static org.opensearch.ml.utils.MLExceptionUtils.OFFLINE_BATCH_INGESTION_DISABLED_ERR_MSG;

@@ -24,7 +25,9 @@
import org.opensearch.action.support.ActionFilters;
import org.opensearch.action.support.HandledTransportAction;
import org.opensearch.client.Client;
+import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.inject.Inject;
+import org.opensearch.common.settings.Settings;
import org.opensearch.core.action.ActionListener;
import org.opensearch.core.rest.RestStatus;
import org.opensearch.ml.common.MLTask;
@@ -60,16 +63,19 @@ public class TransportBatchIngestionAction extends HandledTransportAction<Action
private final Client client;
private ThreadPool threadPool;
private MLFeatureEnabledSetting mlFeatureEnabledSetting;
+    private volatile Integer batchIngestionBulkSize;

@Inject
public TransportBatchIngestionAction(
+        ClusterService clusterService,
TransportService transportService,
ActionFilters actionFilters,
Client client,
MLTaskManager mlTaskManager,
ThreadPool threadPool,
MLModelManager mlModelManager,
-        MLFeatureEnabledSetting mlFeatureEnabledSetting
+        MLFeatureEnabledSetting mlFeatureEnabledSetting,
+        Settings settings
) {
super(MLBatchIngestionAction.NAME, transportService, actionFilters, MLBatchIngestionRequest::new);
this.transportService = transportService;
@@ -78,6 +84,12 @@ public TransportBatchIngestionAction(
this.threadPool = threadPool;
this.mlModelManager = mlModelManager;
this.mlFeatureEnabledSetting = mlFeatureEnabledSetting;

+        batchIngestionBulkSize = ML_COMMONS_BATCH_INGESTION_BULK_SIZE.get(settings);
+        clusterService
+            .getClusterSettings()
+            .addSettingsUpdateConsumer(ML_COMMONS_BATCH_INGESTION_BULK_SIZE, it -> batchIngestionBulkSize = it);

}

@Override
@@ -131,33 +143,45 @@ protected void createMLTaskandExecute(MLBatchIngestionInput mlBatchIngestionInpu
.state(MLTaskState.CREATED)
.build();

-        mlTaskManager.createMLTask(mlTask, ActionListener.wrap(response -> {
-            String taskId = response.getId();
-            try {
-                mlTask.setTaskId(taskId);
-                mlTaskManager.add(mlTask);
-                listener.onResponse(new MLBatchIngestionResponse(taskId, MLTaskType.BATCH_INGEST, MLTaskState.CREATED.name()));
-                String ingestType = (String) mlBatchIngestionInput.getDataSources().get(TYPE);
-                Ingestable ingestable = MLEngineClassLoader.initInstance(ingestType.toLowerCase(), client, Client.class);
-                threadPool.executor(INGEST_THREAD_POOL).execute(() -> {
-                    executeWithErrorHandling(() -> {
-                        double successRate = ingestable.ingest(mlBatchIngestionInput);
-                        handleSuccessRate(successRate, taskId);
-                    }, taskId);
-                });
-            } catch (Exception ex) {
-                log.error("Failed in batch ingestion", ex);
-                mlTaskManager
-                    .updateMLTask(
-                        taskId,
-                        Map.of(STATE_FIELD, FAILED, ERROR_FIELD, MLExceptionUtils.getRootCauseMessage(ex)),
-                        TASK_SEMAPHORE_TIMEOUT,
-                        true
-                    );
-                listener.onFailure(ex);
+        mlModelManager.checkMaxBatchJobTask(mlTask, ActionListener.wrap(exceedLimits -> {
+            if (exceedLimits) {
+                String error =
+                    "Exceeded maximum limit for BATCH_INGEST tasks. To increase the limit, update the plugins.ml_commons.max_batch_ingestion_tasks setting.";
+                log.warn(error + " in task " + mlTask.getTaskId());
+                listener.onFailure(new OpenSearchStatusException(error, RestStatus.TOO_MANY_REQUESTS));
+            } else {
+                mlTaskManager.createMLTask(mlTask, ActionListener.wrap(response -> {
+                    String taskId = response.getId();
+                    try {
+                        mlTask.setTaskId(taskId);
+                        mlTaskManager.add(mlTask);
+                        listener.onResponse(new MLBatchIngestionResponse(taskId, MLTaskType.BATCH_INGEST, MLTaskState.CREATED.name()));
+                        String ingestType = (String) mlBatchIngestionInput.getDataSources().get(TYPE);
+                        Ingestable ingestable = MLEngineClassLoader.initInstance(ingestType.toLowerCase(), client, Client.class);
+                        threadPool.executor(INGEST_THREAD_POOL).execute(() -> {
+                            executeWithErrorHandling(() -> {
+                                double successRate = ingestable.ingest(mlBatchIngestionInput, batchIngestionBulkSize);
+                                handleSuccessRate(successRate, taskId);
+                            }, taskId);
+                        });
+                    } catch (Exception ex) {
+                        log.error("Failed in batch ingestion", ex);
+                        mlTaskManager
+                            .updateMLTask(
+                                taskId,
+                                Map.of(STATE_FIELD, FAILED, ERROR_FIELD, MLExceptionUtils.getRootCauseMessage(ex)),
+                                TASK_SEMAPHORE_TIMEOUT,
+                                true
+                            );
+                        listener.onFailure(ex);
+                    }
+                }, exception -> {
+                    log.error("Failed to create batch ingestion task", exception);
+                    listener.onFailure(exception);
+                }));
+            }
         }, exception -> {
-            log.error("Failed to create batch ingestion task", exception);
+            log.error("Failed to check the maximum BATCH_INGEST Task limits", exception);
             listener.onFailure(exception);
         }));
}
plugin/src/main/java/org/opensearch/ml/model/MLModelManager.java (27 additions & 0 deletions)
@@ -40,6 +40,8 @@
import static org.opensearch.ml.engine.utils.FileUtils.deleteFileQuietly;
import static org.opensearch.ml.plugin.MachineLearningPlugin.DEPLOY_THREAD_POOL;
import static org.opensearch.ml.plugin.MachineLearningPlugin.REGISTER_THREAD_POOL;
+import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MAX_BATCH_INFERENCE_TASKS;
+import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MAX_BATCH_INGESTION_TASKS;
import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MAX_DEPLOY_MODEL_TASKS_PER_NODE;
import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MAX_MODELS_PER_NODE;
import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MAX_REGISTER_MODEL_TASKS_PER_NODE;
@@ -107,6 +109,7 @@
import org.opensearch.ml.common.MLModelGroup;
import org.opensearch.ml.common.MLTask;
import org.opensearch.ml.common.MLTaskState;
+import org.opensearch.ml.common.MLTaskType;
import org.opensearch.ml.common.connector.Connector;
import org.opensearch.ml.common.controller.MLController;
import org.opensearch.ml.common.controller.MLRateLimiter;
@@ -177,6 +180,8 @@ public class MLModelManager {
private volatile Integer maxModelPerNode;
private volatile Integer maxRegisterTasksPerNode;
private volatile Integer maxDeployTasksPerNode;
+    private volatile Integer maxBatchInferenceTasks;
+    private volatile Integer maxBatchIngestionTasks;

public static final ImmutableSet MODEL_DONE_STATES = ImmutableSet
.of(
@@ -232,6 +237,16 @@ public MLModelManager(
clusterService
.getClusterSettings()
.addSettingsUpdateConsumer(ML_COMMONS_MAX_DEPLOY_MODEL_TASKS_PER_NODE, it -> maxDeployTasksPerNode = it);

+        maxBatchInferenceTasks = ML_COMMONS_MAX_BATCH_INFERENCE_TASKS.get(settings);
+        clusterService
+            .getClusterSettings()
+            .addSettingsUpdateConsumer(ML_COMMONS_MAX_BATCH_INFERENCE_TASKS, it -> maxBatchInferenceTasks = it);
+
+        maxBatchIngestionTasks = ML_COMMONS_MAX_BATCH_INGESTION_TASKS.get(settings);
+        clusterService
+            .getClusterSettings()
+            .addSettingsUpdateConsumer(ML_COMMONS_MAX_BATCH_INGESTION_TASKS, it -> maxBatchIngestionTasks = it);
}

public void registerModelMeta(MLRegisterModelMetaInput mlRegisterModelMetaInput, ActionListener<String> listener) {
Expand Down Expand Up @@ -867,6 +882,18 @@ public void checkAndAddRunningTask(MLTask mlTask, Integer runningTaskLimit) {
mlTaskManager.checkLimitAndAddRunningTask(mlTask, runningTaskLimit);
}

+    /**
+     * Check whether the batch job task limit has been exceeded.
+     *
+     * @param mlTask ML task
+     * @param listener ActionListener that receives true if the limit is exceeded, false otherwise
+     */
+    public void checkMaxBatchJobTask(MLTask mlTask, ActionListener<Boolean> listener) {
+        MLTaskType taskType = mlTask.getTaskType();
+        int maxLimit = taskType.equals(MLTaskType.BATCH_PREDICTION) ? maxBatchInferenceTasks : maxBatchIngestionTasks;
+        mlTaskManager.checkMaxBatchJobTask(taskType, maxLimit, listener);
+    }

private void updateModelRegisterStateAsDone(
MLRegisterModelInput registerModelInput,
String taskId,
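checkMaxBatchJobTask delegates the actual counting to MLTaskManager, which is outside this diff. One plausible shape for that check, sketched here with an assumed task index and field names (not the committed code):

```java
import org.opensearch.action.search.SearchRequest;
import org.opensearch.core.action.ActionListener;
import org.opensearch.index.query.BoolQueryBuilder;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.search.builder.SearchSourceBuilder;

// Hypothetical sketch of MLTaskManager.checkMaxBatchJobTask: count still-running
// tasks of the given type and report whether the configured limit is reached.
public void checkMaxBatchJobTask(MLTaskType taskType, int maxLimit, ActionListener<Boolean> listener) {
    SearchRequest searchRequest = new SearchRequest(".plugins-ml-task"); // index name assumed
    BoolQueryBuilder query = QueryBuilders.boolQuery()
        .filter(QueryBuilders.termQuery("task_type", taskType.name())) // field names assumed
        .mustNot(QueryBuilders.termsQuery("state", "COMPLETED", "FAILED", "CANCELLED"));
    searchRequest.source(new SearchSourceBuilder().query(query).size(0).trackTotalHits(true));
    client.search(searchRequest, ActionListener.wrap(
        response -> listener.onResponse(response.getHits().getTotalHits().value >= maxLimit),
        listener::onFailure
    ));
}
```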
MachineLearningPlugin.java
@@ -972,7 +972,10 @@ public List<Setting<?>> getSettings() {
MLCommonsSettings.ML_COMMONS_REMOTE_JOB_STATUS_EXPIRED_REGEX,
MLCommonsSettings.ML_COMMONS_CONTROLLER_ENABLED,
MLCommonsSettings.ML_COMMONS_OFFLINE_BATCH_INGESTION_ENABLED,
-            MLCommonsSettings.ML_COMMONS_OFFLINE_BATCH_INFERENCE_ENABLED
+            MLCommonsSettings.ML_COMMONS_OFFLINE_BATCH_INFERENCE_ENABLED,
+            MLCommonsSettings.ML_COMMONS_MAX_BATCH_INFERENCE_TASKS,
+            MLCommonsSettings.ML_COMMONS_MAX_BATCH_INGESTION_TASKS,
+            MLCommonsSettings.ML_COMMONS_BATCH_INGESTION_BULK_SIZE
);
return settings;
}
MLCommonsSettings.java
@@ -34,6 +34,15 @@ private MLCommonsSettings() {}
Setting.Property.NodeScope,
Setting.Property.Dynamic
);

+    public static final Setting<Integer> ML_COMMONS_MAX_BATCH_INFERENCE_TASKS = Setting
+        .intSetting("plugins.ml_commons.max_batch_inference_tasks", 10, 0, 500, Setting.Property.NodeScope, Setting.Property.Dynamic);
+
+    public static final Setting<Integer> ML_COMMONS_MAX_BATCH_INGESTION_TASKS = Setting
+        .intSetting("plugins.ml_commons.max_batch_ingestion_tasks", 10, 0, 500, Setting.Property.NodeScope, Setting.Property.Dynamic);
+
+    public static final Setting<Integer> ML_COMMONS_BATCH_INGESTION_BULK_SIZE = Setting
+        .intSetting("plugins.ml_commons.batch_ingestion_bulk_size", 500, 100, 100000, Setting.Property.NodeScope, Setting.Property.Dynamic);
public static final Setting<Integer> ML_COMMONS_MAX_DEPLOY_MODEL_TASKS_PER_NODE = Setting
.intSetting("plugins.ml_commons.max_deploy_model_tasks_per_node", 10, 0, 10, Setting.Property.NodeScope, Setting.Property.Dynamic);
public static final Setting<Integer> ML_COMMONS_MAX_ML_TASK_PER_NODE = Setting
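All three new settings are declared Dynamic, so they can be changed on a live cluster without a restart; the bulk size is clamped to 100–100000 and both task limits to 0–500, with defaults of 500 and 10 respectively. A sketch of bumping the bulk size at runtime from Java (assumes a Client and logger in scope; the equivalent REST call is a PUT to _cluster/settings):

```java
import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsRequest;
import org.opensearch.common.settings.Settings;
import org.opensearch.core.action.ActionListener;

// Sketch: raise the dynamic bulk size at runtime; values outside 100-100000
// are rejected by the setting's declared bounds.
ClusterUpdateSettingsRequest request = new ClusterUpdateSettingsRequest();
request.transientSettings(Settings.builder()
    .put("plugins.ml_commons.batch_ingestion_bulk_size", 1000)
    .build());
client.admin().cluster().updateSettings(request, ActionListener.wrap(
    response -> log.info("batch ingestion bulk size updated"),
    e -> log.error("failed to update batch ingestion bulk size", e)
));
```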
MLPredictTaskRunner.java
@@ -253,6 +253,33 @@ protected void executeTask(MLPredictionTaskRequest request, ActionListener<MLTas
.lastUpdateTime(now)
.async(false)
.build();
+        if (actionType.equals(ActionType.BATCH_PREDICT)) {
+            mlModelManager.checkMaxBatchJobTask(mlTask, ActionListener.wrap(exceedLimits -> {
+                if (exceedLimits) {
+                    String error =
+                        "Exceeded maximum limit for BATCH_PREDICTION tasks. To increase the limit, update the plugins.ml_commons.max_batch_inference_tasks setting.";
+                    log.warn(error + " in task " + mlTask.getTaskId());
+                    listener.onFailure(new OpenSearchStatusException(error, RestStatus.TOO_MANY_REQUESTS));
+                } else {
+                    executePredictionByInputDataType(inputDataType, modelId, mlInput, mlTask, functionName, listener);
+                }
+            }, exception -> {
+                log.error("Failed to check the maximum BATCH_PREDICTION Task limits", exception);
+                listener.onFailure(exception);
+            }));
+            return;
+        }
+        executePredictionByInputDataType(inputDataType, modelId, mlInput, mlTask, functionName, listener);
+    }

+    private void executePredictionByInputDataType(
+        MLInputDataType inputDataType,
+        String modelId,
+        MLInput mlInput,
+        MLTask mlTask,
+        FunctionName functionName,
+        ActionListener<MLTaskResponse> listener
+    ) {
switch (inputDataType) {
case SEARCH_QUERY:
ActionListener<MLInputDataset> dataFrameActionListener = ActionListener.wrap(dataSet -> {
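From the caller's side, both batch task types now fail fast with HTTP 429 rather than queueing unbounded work. A sketch of client-side handling; the retry hook and result handler are assumptions, and only the status code and setting name come from this commit:

```java
// Sketch: react to the new TOO_MANY_REQUESTS (429) failure on a batch predict call.
client.execute(MLPredictionTaskAction.INSTANCE, predictionRequest, ActionListener.wrap(
    response -> handleResults(response), // assumed downstream handler
    e -> {
        if (e instanceof OpenSearchStatusException
            && ((OpenSearchStatusException) e).status() == RestStatus.TOO_MANY_REQUESTS) {
            // Too many concurrent BATCH_PREDICTION tasks: back off and retry later,
            // or raise plugins.ml_commons.max_batch_inference_tasks.
            scheduleRetry(predictionRequest); // assumed retry hook
        } else {
            handleFailure(e); // assumed error propagation
        }
    }
));
```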
