[ML] Categorization example endpoint tests

elastic · Feb 12, 2020 · bf9b453 · bf9b453
1 parent 842b317
commit bf9b453
Show file tree

Hide file tree

Showing 2 changed files with 291 additions and 0 deletions.
diff --git a/x-pack/test/api_integration/apis/ml/categorization_field_examples.ts b/x-pack/test/api_integration/apis/ml/categorization_field_examples.ts
@@ -0,0 +1,290 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+import expect from '@kbn/expect';
+
+import { FtrProviderContext } from '../../ftr_provider_context';
+
+const COMMON_HEADERS = {
+  'kbn-xsrf': 'some-xsrf-token',
+};
+
+const start = 1554463535770;
+const end = 1574316073914;
+const analyzer = {
+  tokenizer: 'ml_classic',
+  filter: [
+    {
+      type: 'stop',
+      stopwords: [
+        'Monday',
+        'Tuesday',
+        'Wednesday',
+        'Thursday',
+        'Friday',
+        'Saturday',
+        'Sunday',
+        'Mon',
+        'Tue',
+        'Wed',
+        'Thu',
+        'Fri',
+        'Sat',
+        'Sun',
+        'January',
+        'February',
+        'March',
+        'April',
+        'May',
+        'June',
+        'July',
+        'August',
+        'September',
+        'October',
+        'November',
+        'December',
+        'Jan',
+        'Feb',
+        'Mar',
+        'Apr',
+        'May',
+        'Jun',
+        'Jul',
+        'Aug',
+        'Sep',
+        'Oct',
+        'Nov',
+        'Dec',
+        'GMT',
+        'UTC',
+      ],
+    },
+  ],
+};
+const defaultRequestBody = {
+  indexPatternTitle: 'categorization_functional_test',
+  query: { bool: { must: [{ match_all: {} }] } },
+  size: 5,
+  timeField: '@timestamp',
+  start,
+  end,
+  analyzer,
+};
+
+const testDataList = [
+  {
+    title: 'valid with good number of tokens',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field1',
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'valid',
+      sampleSize: 1000,
+      exampleLength: 5,
+      validationChecks: [
+        {
+          id: 0,
+          valid: 'valid',
+          message: '1000 field values analyzed, 95% contain 3 or more tokens.',
+        },
+      ],
+    },
+  },
+  {
+    title: 'invalid, too many tokens.',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field2',
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'invalid',
+      sampleSize: 500,
+      exampleLength: 5,
+      validationChecks: [
+        {
+          id: 1,
+          valid: 'partially_valid',
+          message: 'The median length for the field values analyzed is over 400 characters.',
+        },
+        {
+          id: 4,
+          valid: 'invalid',
+          message:
+            'Tokenization of field value examples has failed due to more than 10000 tokens being found in a sample of 50 values.',
+        },
+      ],
+    },
+  },
+  {
+    title: 'partially valid, more than 75% are null',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field3',
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'partially_valid',
+      sampleSize: 250,
+      exampleLength: 5,
+      validationChecks: [
+        {
+          id: 0,
+          valid: 'valid',
+          message: '250 field values analyzed, 95% contain 3 or more tokens.',
+        },
+        {
+          id: 2,
+          valid: 'partially_valid',
+          message: 'More than 75% of field values are null.',
+        },
+      ],
+    },
+  },
+  {
+    title: 'partially valid, median length is over 400 characters',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field4',
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'partially_valid',
+      sampleSize: 500,
+      exampleLength: 5,
+      validationChecks: [
+        {
+          id: 0,
+          valid: 'valid',
+          message: '500 field values analyzed, 100% contain 3 or more tokens.',
+        },
+        {
+          id: 1,
+          valid: 'partially_valid',
+          message: 'The median length for the field values analyzed is over 400 characters.',
+        },
+      ],
+    },
+  },
+  {
+    title: 'invalid, no values in any doc',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field5',
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'invalid',
+      sampleSize: 0,
+      exampleLength: 0,
+      validationChecks: [
+        {
+          id: 3,
+          valid: 'invalid',
+          message:
+            'No examples for this field could be found. Please ensure the selected date range contains data.',
+        },
+      ],
+    },
+  },
+  {
+    title: 'invalid, mostly made up of stop words, so no matched tokens',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field6',
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'invalid',
+      sampleSize: 1000,
+      exampleLength: 5,
+      validationChecks: [
+        {
+          id: 0,
+          valid: 'invalid',
+          message: '1000 field values analyzed, 0% contain 3 or more tokens.',
+        },
+      ],
+    },
+  },
+  {
+    title: 'valid, mostly made up of stop words, but analyser has no stop words. so it is ok.',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field6',
+      analyzer: {
+        tokenizer: 'ml_classic',
+      },
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'valid',
+      sampleSize: 1000,
+      exampleLength: 5,
+      validationChecks: [
+        {
+          id: 0,
+          valid: 'valid',
+          message: '1000 field values analyzed, 100% contain 3 or more tokens.',
+        },
+      ],
+    },
+  },
+  {
+    title: 'partially valid, half the docs are stop words.',
+    requestBody: {
+      ...defaultRequestBody,
+      field: 'field7',
+    },
+    expected: {
+      responseCode: 200,
+      overallValidStatus: 'partially_valid',
+      sampleSize: 1000,
+      exampleLength: 5,
+      validationChecks: [
+        {
+          id: 0,
+          valid: 'partially_valid',
+          message: '1000 field values analyzed, 50% contain 3 or more tokens.',
+        },
+      ],
+    },
+  },
+];
+
+// eslint-disable-next-line import/no-default-export
+export default ({ getService }: FtrProviderContext) => {
+  const esArchiver = getService('esArchiver');
+  const supertest = getService('supertest');
+
+  describe('Categorization example endpoint - ', function() {
+    this.tags(['james']);
+    before(async () => {
+      await esArchiver.load('ml/categorization');
+    });
+
+    after(async () => {
+      await esArchiver.unload('ml/categorization');
+    });
+
+    for (const testData of testDataList) {
+      it(testData.title, async () => {
+        const { body } = await supertest
+          .post('/api/ml/jobs/categorization_field_examples')
+          .set(COMMON_HEADERS)
+          .send(testData.requestBody)
+          .expect(testData.expected.responseCode);
+
+        expect(body.overallValidStatus).to.eql(testData.expected.overallValidStatus);
+        expect(body.examples.length).to.eql(testData.expected.exampleLength);
+        expect(body.sampleSize).to.eql(testData.expected.sampleSize);
+        expect(body.validationChecks).to.eql(testData.expected.validationChecks);
+      });
+    }
+  });
+};
diff --git a/x-pack/test/api_integration/apis/ml/index.ts b/x-pack/test/api_integration/apis/ml/index.ts
@@ -12,5 +12,6 @@ export default function({ loadTestFile }: FtrProviderContext) {
 
     loadTestFile(require.resolve('./bucket_span_estimator'));
     loadTestFile(require.resolve('./calculate_model_memory_limit'));
+    loadTestFile(require.resolve('./categorization_field_examples'));
   });
 }