[ML] File data viz limiting uploaded doc chunk size (#44768) (#44905)
* [ML] File data viz limiting upload chunk size

* adding comments

* refactor

* fixing incorrect overwrite of array
jgowdyelastic authored Sep 5, 2019
1 parent 6ca564a commit ae5f05d
Showing 1 changed file with 43 additions and 7 deletions.
@@ -10,7 +10,8 @@ import { chunk } from 'lodash';
import moment from 'moment';
import { i18n } from '@kbn/i18n';

-const CHUNK_SIZE = 10000;
+const CHUNK_SIZE = 5000;
+const MAX_CHUNK_CHAR_COUNT = 1000000;
const IMPORT_RETRIES = 5;

export class Importer {
@@ -21,6 +22,7 @@ export class Importer {

    this.data = [];
    this.docArray = [];
+    this.docSizeArray = [];
  }

  async initializeImport(index) {
@@ -58,7 +60,7 @@
      };
    }

-    const chunks = chunk(this.docArray, CHUNK_SIZE);
+    const chunks = createDocumentChunks(this.docArray);

    const ingestPipeline = {
      id: pipelineId,
@@ -86,13 +88,18 @@
    };

    while (resp.success === false && retries > 0) {
-      resp = await ml.fileDatavisualizer.import(aggs);
+      try {
+        resp = await ml.fileDatavisualizer.import(aggs);

-      if (retries < IMPORT_RETRIES) {
-        console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
-      }
+        if (retries < IMPORT_RETRIES) {
+          console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
+        }

-      retries--;
+        retries--;
+      } catch (err) {
+        resp = { success: false, error: err };
+        retries = 0;
+      }
    }

    if (resp.success) {
@@ -152,3 +159,32 @@ function updatePipelineTimezone(ingestPipeline) {
    }
  }
}

+function createDocumentChunks(docArray) {
+  const chunks = [];
+  // chop docArray into 5000 doc chunks
+  const tempChunks = chunk(docArray, CHUNK_SIZE);
+
+  // loop over tempChunks and check that the total character length
+  // for each chunk is within the MAX_CHUNK_CHAR_COUNT.
+  // if the length is too long, split the chunk into smaller chunks
+  // based on how much larger it is than MAX_CHUNK_CHAR_COUNT
+  // note, each document is a different size, so dividing by charCountOfDocs
+  // only produces an average chunk size that should be smaller than the max length
+  for (let i = 0; i < tempChunks.length; i++) {
+    const docs = tempChunks[i];
+    const numberOfDocs = docs.length;
+
+    const charCountOfDocs = JSON.stringify(docs).length;
+    if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) {
+      // calculate new chunk size which should produce a chunk
+      // whose length is on average around MAX_CHUNK_CHAR_COUNT
+      const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs);
+      const smallerChunks = chunk(docs, adjustedChunkSize);
+      chunks.push(...smallerChunks);
+    } else {
+      chunks.push(docs);
+    }
+  }
+  return chunks;
+}
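
To make the splitting arithmetic in createDocumentChunks concrete, here is a minimal standalone sketch (not part of the commit; the document count and character count below are hypothetical numbers chosen for illustration):

// Hypothetical example of the adjustedChunkSize calculation (not from the commit).
const MAX_CHUNK_CHAR_COUNT = 1000000;

// Suppose one 5000-document chunk serializes to about 2.5 million characters.
const numberOfDocs = 5000;
const charCountOfDocs = 2500000;

// Scale the document count down so each smaller chunk averages
// roughly MAX_CHUNK_CHAR_COUNT characters when stringified.
const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs);

console.log(adjustedChunkSize); // 2000

With these numbers, lodash's chunk(docs, 2000) would split the 5000 documents into chunks of 2000, 2000 and 1000. Because individual documents vary in size, this only bounds the average chunk length, as the comment in the new function notes.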
