diff --git a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
index d055a2ee09d6f..a17c563b78dc3 100644
--- a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
+++ b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
@@ -10,7 +10,8 @@ import { chunk } from 'lodash';
 import moment from 'moment';
 import { i18n } from '@kbn/i18n';
 
-const CHUNK_SIZE = 10000;
+const CHUNK_SIZE = 5000;
+const MAX_CHUNK_CHAR_COUNT = 1000000;
 const IMPORT_RETRIES = 5;
 
 export class Importer {
@@ -21,6 +22,7 @@
 
     this.data = [];
     this.docArray = [];
+    this.docSizeArray = [];
   }
 
   async initializeImport(index) {
@@ -58,7 +60,7 @@
       };
     }
 
-    const chunks = chunk(this.docArray, CHUNK_SIZE);
+    const chunks = createDocumentChunks(this.docArray);
 
     const ingestPipeline = {
       id: pipelineId,
@@ -86,13 +88,18 @@
     };
 
     while (resp.success === false && retries > 0) {
-      resp = await ml.fileDatavisualizer.import(aggs);
+      try {
+        resp = await ml.fileDatavisualizer.import(aggs);
 
-      if (retries < IMPORT_RETRIES) {
-        console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
-      }
+        if (retries < IMPORT_RETRIES) {
+          console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
+        }
 
-      retries--;
+        retries--;
+      } catch (err) {
+        resp = { success: false, error: err };
+        retries = 0;
+      }
     }
 
     if (resp.success) {
@@ -152,3 +159,32 @@ function updatePipelineTimezone(ingestPipeline) {
     }
   }
 }
+
+function createDocumentChunks(docArray) {
+  const chunks = [];
+  // chop docArray into 5000 doc chunks
+  const tempChunks = chunk(docArray, CHUNK_SIZE);
+
+  // loop over tempChunks and check that the total character length
+  // for each chunk is within the MAX_CHUNK_CHAR_COUNT.
+  // if the length is too long, split the chunk into smaller chunks
+  // based on how much larger it is than MAX_CHUNK_CHAR_COUNT
+  // note, each document is a different size, so dividing by charCountOfDocs
+  // only produces an average chunk size that should be smaller than the max length
+  for (let i = 0; i < tempChunks.length; i++) {
+    const docs = tempChunks[i];
+    const numberOfDocs = docs.length;
+
+    const charCountOfDocs = JSON.stringify(docs).length;
+    if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) {
+      // calculate new chunk size which should produce a chunk
+      // whose length is on average around MAX_CHUNK_CHAR_COUNT
+      const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs);
+      const smallerChunks = chunk(docs, adjustedChunkSize);
+      chunks.push(...smallerChunks);
+    } else {
+      chunks.push(docs);
+    }
+  }
+  return chunks;
+}
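
A quick worked example of the `adjustedChunkSize` calculation in `createDocumentChunks`; the numbers are hypothetical and chosen only to make the arithmetic obvious:

```js
// Hypothetical numbers: a 5000-doc chunk whose JSON string is ~2,000,000 characters
// is twice the 1,000,000 character budget, so it is re-split into 2500-doc chunks
// that average roughly 1,000,000 characters each.
const MAX_CHUNK_CHAR_COUNT = 1000000;

const numberOfDocs = 5000;       // docs in the oversized chunk
const charCountOfDocs = 2000000; // JSON.stringify(docs).length for that chunk

const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs);
console.log(adjustedChunkSize); // 2500
```

Because documents vary in size, this only bounds the average chunk length, which is why the comment in the diff calls it an average rather than a hard limit.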
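
For reference, the retry loop after this change has the shape sketched below. `sendChunk` is a hypothetical stand-in for `ml.fileDatavisualizer.import`; the point of the change is that a thrown error (as opposed to a `success: false` response) now ends the loop immediately instead of being retried:

```js
const IMPORT_RETRIES = 5;

// Minimal sketch of the new control flow, not the actual Kibana implementation.
async function importWithRetries(sendChunk, aggs) {
  let retries = IMPORT_RETRIES;
  let resp = { success: false };

  while (resp.success === false && retries > 0) {
    try {
      resp = await sendChunk(aggs);

      if (retries < IMPORT_RETRIES) {
        console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
      }

      retries--;
    } catch (err) {
      // an exception (e.g. a network failure) aborts the retries entirely
      resp = { success: false, error: err };
      retries = 0;
    }
  }

  return resp;
}
```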