From 0693d40f3073a756264c2f042e88e5c5d50d8a4e Mon Sep 17 00:00:00 2001
From: James Gowdy
Date: Wed, 4 Sep 2019 13:39:41 +0100
Subject: [PATCH 1/4] [ML] File data viz limiting upload chunk size

---
 .../import_view/importer/importer.js | 34 +++++++++++++++----
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
index d055a2ee09d6f..2b3006154b32e 100644
--- a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
+++ b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
@@ -11,6 +11,7 @@ import moment from 'moment';
 import { i18n } from '@kbn/i18n';
 
 const CHUNK_SIZE = 10000;
+const MAX_CHUNK_CHAR_COUNT = 1000000;
 const IMPORT_RETRIES = 5;
 
 export class Importer {
@@ -21,6 +22,7 @@ export class Importer {
 
     this.data = [];
     this.docArray = [];
+    this.docSizeArray = [];
   }
 
   async initializeImport(index) {
@@ -58,7 +60,22 @@
       };
     }
 
-    const chunks = chunk(this.docArray, CHUNK_SIZE);
+    const tempChunks = chunk(this.docArray, CHUNK_SIZE);
+    let chunks = [];
+
+    for (let i = 0; i < tempChunks.length; i++) {
+      const docs = tempChunks[i];
+      const numberOfDocs = docs.length;
+
+      const charCountOfDocs = JSON.stringify(docs).length;
+      if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) {
+        const adjustedChunkSize = Math.floor(MAX_CHUNK_CHAR_COUNT / charCountOfDocs * numberOfDocs);
+        const smallerChunks = chunk(docs, adjustedChunkSize);
+        chunks.push(...smallerChunks);
+      } else {
+        chunks = tempChunks;
+      }
+    }
 
     const ingestPipeline = {
       id: pipelineId,
@@ -86,13 +103,18 @@
     };
 
     while (resp.success === false && retries > 0) {
-      resp = await ml.fileDatavisualizer.import(aggs);
+      try {
+        resp = await ml.fileDatavisualizer.import(aggs);
 
-      if (retries < IMPORT_RETRIES) {
-        console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
-      }
+        if (retries < IMPORT_RETRIES) {
+          console.log(`Retrying import ${IMPORT_RETRIES - retries}`);
+        }
 
-      retries--;
+        retries--;
+      } catch (err) {
+        resp = { success: false, error: err };
+        retries = 0;
+      }
     }
 
     if (resp.success) {

From bb2a2a5164d2e831b65a832446237b0ef6a8a564 Mon Sep 17 00:00:00 2001
From: James Gowdy
Date: Wed, 4 Sep 2019 17:48:58 +0100
Subject: [PATCH 2/4] adding comments

---
 .../import_view/importer/importer.js | 48 ++++++++++++-------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
index 2b3006154b32e..782a76aafe22a 100644
--- a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
+++ b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
@@ -10,7 +10,7 @@ import { chunk } from 'lodash';
 import moment from 'moment';
 import { i18n } from '@kbn/i18n';
 
-const CHUNK_SIZE = 10000;
+const CHUNK_SIZE = 5000;
 const MAX_CHUNK_CHAR_COUNT = 1000000;
 const IMPORT_RETRIES = 5;
 
@@ -60,22 +60,7 @@ export class Importer {
       };
     }
 
-    const tempChunks = chunk(this.docArray, CHUNK_SIZE);
-    let chunks = [];
-
-    for (let i = 0; i < tempChunks.length; i++) {
-      const docs = tempChunks[i];
-      const numberOfDocs = docs.length;
-
-      const charCountOfDocs = JSON.stringify(docs).length;
-      if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) {
-        const adjustedChunkSize = Math.floor(MAX_CHUNK_CHAR_COUNT / charCountOfDocs * numberOfDocs);
-        const smallerChunks = chunk(docs, adjustedChunkSize);
-        chunks.push(...smallerChunks);
-      } else {
-        chunks = tempChunks;
-      }
-    }
+    const chunks = this.createDocumentChunks();
 
     const ingestPipeline = {
       id: pipelineId,
@@ -144,6 +129,35 @@
 
     return result;
   }
+
+  createDocumentChunks() {
+    let chunks = [];
+    // chop docArray into 5000 doc chunks
+    const tempChunks = chunk(this.docArray, CHUNK_SIZE);
+
+    // loop over tempChunks and check that the total character length
+    // for each chunk is within the MAX_CHUNK_CHAR_COUNT.
+    // if the length is too long, split the chunk into smaller chunks
+    // based on how much larger it is than MAX_CHUNK_CHAR_COUNT
+    // note, each document is a different size, so dividing by charCountOfDocs
+    // only produces an average chunk size that should be smaller than the max length
+    for (let i = 0; i < tempChunks.length; i++) {
+      const docs = tempChunks[i];
+      const numberOfDocs = docs.length;
+
+      const charCountOfDocs = JSON.stringify(docs).length;
+      if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) {
+        // calculate new chunk size which should produce a chunk
+        // who's length is on average around MAX_CHUNK_CHAR_COUNT
+        const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs);
+        const smallerChunks = chunk(docs, adjustedChunkSize);
+        chunks.push(...smallerChunks);
+      } else {
+        chunks = tempChunks;
+      }
+    }
+    return chunks;
+  }
 }
 
 function populateFailures(error, failures, chunkCount) {

From c1bc0c66852948984073cfe352da9e09f7dc67e0 Mon Sep 17 00:00:00 2001
From: James Gowdy
Date: Wed, 4 Sep 2019 17:51:51 +0100
Subject: [PATCH 3/4] refactor

---
 .../import_view/importer/importer.js | 60 +++++++++----------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
index 782a76aafe22a..1e0552e4cc735 100644
--- a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
+++ b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js
@@ -60,7 +60,7 @@ export class Importer {
       };
     }
 
-    const chunks = this.createDocumentChunks();
+    const chunks = createDocumentChunks(this.docArray);
 
     const ingestPipeline = {
       id: pipelineId,
@@ -129,35 +129,6 @@
 
     return result;
   }
-
-  createDocumentChunks() {
-    let chunks = [];
-    // chop docArray into 5000 doc chunks
-    const tempChunks = chunk(this.docArray, CHUNK_SIZE);
-
-    // loop over tempChunks and check that the total character length
-    // for each chunk is within the MAX_CHUNK_CHAR_COUNT.
- // if the length is too long, split the chunk into smaller chunks - // based on how much larger it is than MAX_CHUNK_CHAR_COUNT - // note, each document is a different size, so dividing by charCountOfDocs - // only produces an average chunk size that should be smaller than the max length - for (let i = 0; i < tempChunks.length; i++) { - const docs = tempChunks[i]; - const numberOfDocs = docs.length; - - const charCountOfDocs = JSON.stringify(docs).length; - if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) { - // calculate new chunk size which should produce a chunk - // who's length is on average around MAX_CHUNK_CHAR_COUNT - const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs); - const smallerChunks = chunk(docs, adjustedChunkSize); - chunks.push(...smallerChunks); - } else { - chunks = tempChunks; - } - } - return chunks; - } } function populateFailures(error, failures, chunkCount) { @@ -188,3 +159,32 @@ function updatePipelineTimezone(ingestPipeline) { } } } + +function createDocumentChunks(docArray) { + let chunks = []; + // chop docArray into 5000 doc chunks + const tempChunks = chunk(docArray, CHUNK_SIZE); + + // loop over tempChunks and check that the total character length + // for each chunk is within the MAX_CHUNK_CHAR_COUNT. + // if the length is too long, split the chunk into smaller chunks + // based on how much larger it is than MAX_CHUNK_CHAR_COUNT + // note, each document is a different size, so dividing by charCountOfDocs + // only produces an average chunk size that should be smaller than the max length + for (let i = 0; i < tempChunks.length; i++) { + const docs = tempChunks[i]; + const numberOfDocs = docs.length; + + const charCountOfDocs = JSON.stringify(docs).length; + if (charCountOfDocs > MAX_CHUNK_CHAR_COUNT) { + // calculate new chunk size which should produce a chunk + // who's length is on average around MAX_CHUNK_CHAR_COUNT + const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs); + const smallerChunks = chunk(docs, adjustedChunkSize); + chunks.push(...smallerChunks); + } else { + chunks = tempChunks; + } + } + return chunks; +} From e87416b9649aae2ed0a02bedfde65943e0efd32d Mon Sep 17 00:00:00 2001 From: James Gowdy Date: Thu, 5 Sep 2019 15:25:45 +0100 Subject: [PATCH 4/4] fixing incorrect overwrite of array --- .../file_based/components/import_view/importer/importer.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js index 1e0552e4cc735..a17c563b78dc3 100644 --- a/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js +++ b/x-pack/legacy/plugins/ml/public/datavisualizer/file_based/components/import_view/importer/importer.js @@ -161,7 +161,7 @@ function updatePipelineTimezone(ingestPipeline) { } function createDocumentChunks(docArray) { - let chunks = []; + const chunks = []; // chop docArray into 5000 doc chunks const tempChunks = chunk(docArray, CHUNK_SIZE); @@ -183,7 +183,7 @@ function createDocumentChunks(docArray) { const smallerChunks = chunk(docs, adjustedChunkSize); chunks.push(...smallerChunks); } else { - chunks = tempChunks; + chunks.push(docs); } } return chunks;
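Editor's aside, not part of the patch series above: a rough standalone sketch of the chunk-size arithmetic the final createDocumentChunks uses, with made-up numbers. A hypothetical 5000-document chunk that serialises to 2,500,000 characters averages 500 characters per document, so only about 40% of the documents can stay in each smaller chunk.

// Illustrative only - mirrors the adjustedChunkSize calculation from the patch.
// The figures below are hypothetical, not taken from a real import.
const MAX_CHUNK_CHAR_COUNT = 1000000;

const numberOfDocs = 5000; // documents in the oversized chunk
const charCountOfDocs = 2500000; // JSON.stringify(docs).length for that chunk

// 1000000 / 2500000 = 0.4, so keep roughly 40% of the documents per chunk.
const adjustedChunkSize = Math.floor((MAX_CHUNK_CHAR_COUNT / charCountOfDocs) * numberOfDocs);

console.log(adjustedChunkSize); // 2000 -> each smaller chunk averages ~1,000,000 characters

Because this only targets an average, an individual smaller chunk can still exceed the cap if its larger documents happen to cluster together, which is what the "only produces an average chunk size" comment in the patch acknowledges.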