[ML] allow for larger models in the inference step for data frame analytics #76116

Merged
@@ -37,9 +37,9 @@
*/
public final class InferenceToXContentCompressor {
private static final int BUFFER_SIZE = 4096;
- // Either 10% of the configured JVM heap, or 1 GB, which ever is smaller
+ // Either 25% of the configured JVM heap, or 1 GB, which ever is smaller
private static final long MAX_INFLATED_BYTES = Math.min(
-     (long)((0.10) * JvmInfo.jvmInfo().getMem().getHeapMax().getBytes()),
+     (long)((0.25) * JvmInfo.jvmInfo().getMem().getHeapMax().getBytes()),
ByteSizeValue.ofGb(1).getBytes());

private InferenceToXContentCompressor() {}
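For a sense of scale, a quick worked example of the new limit (the heap sizes below are illustrative assumptions, not values from the PR):

```java
// MAX_INFLATED_BYTES = min(0.25 * configured heap, 1 GB)
long oneGb = 1L << 30;
long limitFor1GbHeap = Math.min((long) (0.25 * (1L * oneGb)), oneGb); // 256 MB (was ~102 MB at 10%)
long limitFor8GbHeap = Math.min((long) (0.25 * (8L * oneGb)), oneGb); // 1 GB; the fixed cap wins for heaps above 4 GB
```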
@@ -49,6 +49,12 @@ public static <T extends ToXContentObject> BytesReference deflate(T objectToComp
return deflate(reference);
}

+ public static <T> T inflateUnsafe(BytesReference compressedBytes,
+                                   CheckedFunction<XContentParser, T, IOException> parserFunction,
+                                   NamedXContentRegistry xContentRegistry) throws IOException {
+     return inflate(compressedBytes, parserFunction, xContentRegistry, Long.MAX_VALUE);
+ }

public static <T> T inflate(BytesReference compressedBytes,
CheckedFunction<XContentParser, T, IOException> parserFunction,
NamedXContentRegistry xContentRegistry) throws IOException {
@@ -286,6 +286,14 @@ public TrainedModelConfig ensureParsedDefinition(NamedXContentRegistry xContentR
return this;
}

+ public TrainedModelConfig ensureParsedDefinitionUnsafe(NamedXContentRegistry xContentRegistry) throws IOException {
+     if (definition == null) {
+         return null;
+     }
+     definition.ensureParsedDefinitionUnsafe(xContentRegistry);
+     return this;
+ }

@Nullable
public TrainedModelDefinition getModelDefinition() {
if (definition == null) {
@@ -872,6 +880,14 @@ private void ensureParsedDefinition(NamedXContentRegistry xContentRegistry) thro
}
}

+ private void ensureParsedDefinitionUnsafe(NamedXContentRegistry xContentRegistry) throws IOException {
+     if (parsedDefinition == null) {
+         parsedDefinition = InferenceToXContentCompressor.inflateUnsafe(compressedRepresentation,
+             parser -> TrainedModelDefinition.fromXContent(parser, true).build(),
+             xContentRegistry);
+     }
+ }

@Override
public void writeTo(StreamOutput out) throws IOException {
if (out.getVersion().onOrAfter(Version.V_8_0_0)) { // TODO adjust on backport
@@ -339,7 +339,7 @@ public void testGetTrainedModelForInference() throws InterruptedException, IOExc

AtomicReference<InferenceDefinition> definitionHolder = new AtomicReference<>();
blockingCall(
- listener -> trainedModelProvider.getTrainedModelForInference(modelId, listener),
+ listener -> trainedModelProvider.getTrainedModelForInference(modelId, false, listener),
definitionHolder,
exceptionHolder);
assertThat(exceptionHolder.get(), is(nullValue()));
@@ -91,7 +91,7 @@ public void run(String modelId) {
LOGGER.info("[{}] Started inference on test data against model [{}]", config.getId(), modelId);
try {
PlainActionFuture<LocalModel> localModelPlainActionFuture = new PlainActionFuture<>();
- modelLoadingService.getModelForPipeline(modelId, localModelPlainActionFuture);
+ modelLoadingService.getModelForInternalInference(modelId, localModelPlainActionFuture);
InferenceState inferenceState = restoreInferenceState();
dataCountsTracker.setTestDocsCount(inferenceState.processedTestDocsCount);
TestDocsIterator testDocsIterator = new TestDocsIterator(new OriginSettingClient(client, ClientHelper.ML_ORIGIN), config,
@@ -99,7 +99,7 @@ public class ModelLoadingService implements ClusterStateListener {

// The feature requesting the model
public enum Consumer {
- PIPELINE, SEARCH
+ PIPELINE, SEARCH, INTERNAL
}

private static class ModelAndConsumer {
@@ -175,6 +175,16 @@ public void getModelForPipeline(String modelId, ActionListener<LocalModel> model
getModel(modelId, Consumer.PIPELINE, modelActionListener);
}

+ /**
+  * Load the model for internal use. Note, this decompresses the model if the stored estimate doesn't trip circuit breakers.
+  * Consequently, it assumes the model was created by an ML process
+  * @param modelId the model to get
+  * @param modelActionListener the listener to alert when the model has been retrieved
+  */
+ public void getModelForInternalInference(String modelId, ActionListener<LocalModel> modelActionListener) {
+     getModel(modelId, Consumer.INTERNAL, modelActionListener);
+ }

/**
* Load the model for use by at search. Models requested by search are always cached.
*
@@ -272,15 +282,15 @@ private boolean loadModelIfNecessary(String modelIdOrAlias, Consumer consumer, A
return true;
}

- if (Consumer.PIPELINE == consumer && referencedModels.contains(modelId) == false) {
+ if (Consumer.SEARCH != consumer && referencedModels.contains(modelId) == false) {
// The model is requested by a pipeline but not referenced by any ingest pipelines.
// This means it is a simulate call and the model should not be cached
logger.trace(() -> new ParameterizedMessage(
"[{}] (model_alias [{}]) not actively loading, eager loading without cache",
modelId,
modelIdOrAlias
));
- loadWithoutCaching(modelId, modelActionListener);
+ loadWithoutCaching(modelId, consumer, modelActionListener);
} else {
logger.trace(() -> new ParameterizedMessage(
"[{}] (model_alias [{}]) attempting to load and cache",
@@ -298,7 +308,7 @@ private void loadModel(String modelId, Consumer consumer) {
provider.getTrainedModel(modelId, GetTrainedModelsAction.Includes.empty(), ActionListener.wrap(
trainedModelConfig -> {
trainedModelCircuitBreaker.addEstimateBytesAndMaybeBreak(trainedModelConfig.getEstimatedHeapMemory(), modelId);
- provider.getTrainedModelForInference(modelId, ActionListener.wrap(
+ provider.getTrainedModelForInference(modelId, consumer == Consumer.INTERNAL, ActionListener.wrap(
inferenceDefinition -> {
try {
// Since we have used the previously stored estimate to help guard against OOM we need
@@ -327,14 +337,14 @@
));
}

- private void loadWithoutCaching(String modelId, ActionListener<LocalModel> modelActionListener) {
+ private void loadWithoutCaching(String modelId, Consumer consumer, ActionListener<LocalModel> modelActionListener) {
// If we the model is not loaded and we did not kick off a new loading attempt, this means that we may be getting called
// by a simulated pipeline
provider.getTrainedModel(modelId, GetTrainedModelsAction.Includes.empty(), ActionListener.wrap(
trainedModelConfig -> {
// Verify we can pull the model into memory without causing OOM
trainedModelCircuitBreaker.addEstimateBytesAndMaybeBreak(trainedModelConfig.getEstimatedHeapMemory(), modelId);
- provider.getTrainedModelForInference(modelId, ActionListener.wrap(
+ provider.getTrainedModelForInference(modelId, consumer == Consumer.INTERNAL, ActionListener.wrap(
inferenceDefinition -> {
InferenceConfig inferenceConfig = trainedModelConfig.getInferenceConfig() == null ?
inferenceConfigFromTargetType(inferenceDefinition.getTargetType()) :
@@ -392,13 +392,17 @@ private void storeTrainedModelAndDefinition(TrainedModelConfig trainedModelConfi
* do not.
*
* @param modelId The model tp get
+ * @param unsafe when true, the compressed bytes size is not checked and the circuit breaker is solely responsible for
+ *               preventing OOMs
* @param listener The listener
*/
- public void getTrainedModelForInference(final String modelId, final ActionListener<InferenceDefinition> listener) {
+ public void getTrainedModelForInference(final String modelId, boolean unsafe, final ActionListener<InferenceDefinition> listener) {
// TODO Change this when we get more than just langIdent stored
if (MODELS_STORED_AS_RESOURCE.contains(modelId)) {
try {
- TrainedModelConfig config = loadModelFromResource(modelId, false).build().ensureParsedDefinition(xContentRegistry);
+ TrainedModelConfig config = loadModelFromResource(modelId, false)
+     .build()
+     .ensureParsedDefinitionUnsafe(xContentRegistry);
Review thread:

Member (davidkyle): This should respect the value of the unsafe parameter, the same as line 432.

Member Author: @davidkyle I don't think it should. Models in the resource files are provided in the jar distribution, so I don't think we should ever check their stream length on parsing.

Member:
> So, I don't think we should ever check their stream length on parsing.

Because we know how big they are? Why not enforce that with a simple check?

Member Author: It is superfluous to me: the only way to adjust this resource is to modify the resource files directly on disk, and since we control these resource models, we already know and trust their sizes.
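For illustration, a minimal sketch of the reviewer's suggestion, assuming the unsafe flag were threaded through to the resource-model branch as well (the PR instead keeps the unconditional unsafe parse for bundled models):

```java
// Hypothetical variant (not what the PR merges): honor the unsafe flag for resource-backed
// models, mirroring the index-stored path at new line 432 below.
TrainedModelConfig config = unsafe
    ? loadModelFromResource(modelId, false).build().ensureParsedDefinitionUnsafe(xContentRegistry)
    : loadModelFromResource(modelId, false).build().ensureParsedDefinition(xContentRegistry);
```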

assert config.getModelDefinition().getTrainedModel() instanceof LangIdentNeuralNetwork;
assert config.getModelType() == TrainedModelType.LANG_IDENT;
listener.onResponse(
@@ -425,10 +429,9 @@ public void getTrainedModelForInference(final String modelId, final ActionListen
success -> {
try {
BytesReference compressedData = getDefinitionFromDocs(docs, modelId);
- InferenceDefinition inferenceDefinition = InferenceToXContentCompressor.inflate(
-     compressedData,
-     InferenceDefinition::fromXContent,
-     xContentRegistry);
+ InferenceDefinition inferenceDefinition = unsafe ?
+     InferenceToXContentCompressor.inflateUnsafe(compressedData, InferenceDefinition::fromXContent, xContentRegistry) :
+     InferenceToXContentCompressor.inflate(compressedData, InferenceDefinition::fromXContent, xContentRegistry);

listener.onResponse(inferenceDefinition);
} catch (Exception e) {
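Taken together, a rough sketch of the call chain these hunks introduce (assembled from the diffs above; the comments are explanatory, not from the PR):

```java
// Data frame analytics inference step (run(modelId) in the hunk above)
//   -> modelLoadingService.getModelForInternalInference(modelId, listener)          // Consumer.INTERNAL
//     -> provider.getTrainedModelForInference(modelId, /* unsafe = */ true, listener)
//       -> InferenceToXContentCompressor.inflateUnsafe(compressedData,
//              InferenceDefinition::fromXContent, xContentRegistry)
//          // no MAX_INFLATED_BYTES check; the trained-model circuit breaker guards against OOM
```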