keplergl · heshan0131 · Mar 21, 2021 · Jan 12, 2021 · Feb 9, 2021
diff --git a/src/utils/dataset-utils.js b/src/utils/dataset-utils.js
@@ -91,29 +91,165 @@ export function createNewDataEntry({info, data, metadata}, datasets = {}) {
   };
 }
 
+/**
+ * Lowercase field-name prefixes and suffixes that mark a field as a
+ * poor default color choice. A matching field is still kept when its
+ * name also starts or ends with an entry of METRIC_DEFAULT_FIELDS.
+ */
+const EXCLUDED_DEFAULT_FIELDS = [
+  // Serial numbers and identification numbers
+  '_id',
+  'id',
+  'index',
+  'uuid',
+  'guid',
+  'uid',
+  'gid',
+  'serial',
+  // Geographic IDs are unlikely to be interesting to color by
+  'zip',
+  'code',
+  'post',
+  'region',
+  'fips',
+  'cbgs',
+  'h3',
+  's2',
+  // Geographic coordinates (but not z/elevation/altitude,
+  // since those might be metrics)
+  'lat',
+  'lon',
+  'lng',
+  'latitude',
+  'longitude',
+  '_x',
+  '_y'
+];
+
+/**
+ * Lowercase prefixes and suffixes that indicate a field is a metric.
+ *
+ * Order matters: findDefaultColorField ranks a field by the index of
+ * the first entry its name matches, so earlier entries are preferred.
+ */
+const METRIC_DEFAULT_FIELDS = [
+  'metric',
+  'value',
+  'sum',
+  'count',
+  'unique',
+  'mean',
+  'mode',
+  'median',
+  'max',
+  'min',
+  'deviation',
+  'variance',
+  'p99',
+  'p95',
+  'p75',
+  'p50',
+  'p25',
+  'p05',
+  // Abbreviations are less preferred
+  'cnt',
+  'val'
+];
+
 /**
  * Choose a field to use as the default color field of a layer.
  *
- * Right now this implements a very simple heuristic looking
- * for a real-type field that is not lat/lon.
+ * The heuristic is:
+ *
+ * First, keep only real and integer fields, dropping any that belong
+ * to a lat/lng field pair, have a blank name, or start/end with an
+ * excluded word without also starting/ending with a metric word.
+ *
+ * Next, rank the remaining fields. Fields whose name starts or ends
+ * with a metric word come before fields that match none, ordered by
+ * the position of the first metric word they match.
+ *
+ * Among fields that tie on metric word, real fields come before
+ * integer fields. Note the metric word outranks the type: an integer
+ * field matching 'metric' is preferred over a real matching 'value'.
+ *
+ * Any remaining ties are ordered by the fields' `index` property.
+ *
+ * The first field in that ranking is returned.
  *
- * In the future we could consider other things:
- * Consider integer fields
- * look for highest dynamic range (using a sample of the data)
- * Look for particular names to select ("value", "color", etc)
- * Look for particular names to avoid ("" - the Pandas index column)
+ * Returns null when no field qualifies (e.g. because all fields
+ * are strings).
  *
  * @param dataset
  */
 export function findDefaultColorField({fields, fieldPairs = []}) {
-  const defaultField = fields.find(
-    f =>
-      f.type === ALL_FIELD_TYPES.real &&
+  const fieldsWithoutExcluded = fields.filter(field => {
+    if (field.type !== ALL_FIELD_TYPES.real && field.type !== ALL_FIELD_TYPES.integer) {
+      // Only numeric (real or integer) fields are candidates.
+      return false;
+    }
+    if (
+      fieldPairs.find(
+        pair => pair.pair.lat.value === field.name || pair.pair.lng.value === field.name
+      )
+    ) {
       // Do not permit lat, lon fields
-      !fieldPairs.find(pair => pair.pair.lat.value === f.name || pair.pair.lng.value === f.name)
-  );
-  if (!defaultField) {
-    return null;
+      return false;
+    }
+
+    const normalizedFieldName = field.name.toLowerCase();
+    if (normalizedFieldName === '') {
+      // Blank names match neither word list, so exclude them explicitly.
+      return false;
+    }
+    const hasExcluded = EXCLUDED_DEFAULT_FIELDS.find(
+      f => normalizedFieldName.startsWith(f) || normalizedFieldName.endsWith(f)
+    );
+    const hasInclusion = METRIC_DEFAULT_FIELDS.find(
+      f => normalizedFieldName.startsWith(f) || normalizedFieldName.endsWith(f)
+    );
+    return !hasExcluded || hasInclusion;
+  });
+
+  const sortedFields = fieldsWithoutExcluded.sort((left, right) => {
+    const normalizedLeft = left.name.toLowerCase();
+    const normalizedRight = right.name.toLowerCase();
+    const leftHasInclusion = METRIC_DEFAULT_FIELDS.findIndex(
+      f => normalizedLeft.startsWith(f) || normalizedLeft.endsWith(f)
+    );
+    const rightHasInclusion = METRIC_DEFAULT_FIELDS.findIndex(
+      f => normalizedRight.startsWith(f) || normalizedRight.endsWith(f)
+    );
+    if (leftHasInclusion !== rightHasInclusion) {
+      if (leftHasInclusion === -1) {
+        // Fields matching no metric word go after those that match one.
+        return 1;
+      } else if (rightHasInclusion === -1) {
+        // Fields matching a metric word go before those that match none.
+        return -1;
+      }
+      // Both matched: the earlier METRIC_DEFAULT_FIELDS entry wins
+      return leftHasInclusion - rightHasInclusion;
+    }
+
+    // Same metric rank: compare based on type
+    if (left.type !== right.type) {
+      if (left.type === ALL_FIELD_TYPES.real) {
+        return -1;
+      }
+      // left is an integer and right is not
+      // and reals come before integers
+      return 1;
+    }
+
+    // Finally, order by field.index (presumably the dataset column order - TODO confirm)
+    return left.index - right.index;
+  });
+
+  if (sortedFields.length) {
+    // There was a best match
+    return sortedFields[0];
   }
-  return defaultField;
+  // No numeric field survived the filter
+  return null;
 }
diff --git a/test/node/utils/dataset-utils-test.js b/test/node/utils/dataset-utils-test.js
@@ -25,27 +25,57 @@ import {processCsvData} from 'processors/data-processor';
 
 import csvData from 'test/fixtures/test-layer-data';
 
-test('datasetUtils.findDefaultColorField', t => {
-  const dataset = createNewDataEntry({
-    info: {id: 'taro'},
-    data: processCsvData(csvData)
-  }).taro;
-
-  const defaultField = findDefaultColorField(dataset);
-  // Unfortunately lat_1 is not detected as part of a field pair :(
-  t.equals(defaultField.name, 'lat_1', 'default field name is OK');
+const DEFAULT_FIELD_TEST_CASES = [
+  {
+    name: 'excluded lat',
+    csv: csvData,
+    expected: 'trip_distance' // depends on the test-layer-data fixture (not shown here)
+  },
+  {
+    name: 'empty',
+    csv: 'a\na',
+    expected: null // the only column holds a string, so presumably no numeric field exists
+  },
+  {
+    name: 'integer only',
+    csv: 'a,b\na,0\na,1',
+    expected: 'b' // b is the only column with numeric values
+  },
+  {
+    name: 'integer and real',
+    csv: 'a,b,c\na,0,0.5\na,1,0.5',
+    expected: 'c' // c (presumably typed real) outranks b (presumably typed integer)
+  },
+  {
+    name: 'excluded real',
+    csv: 'zipcode,b,c\n0.5,0,0.5\n0.5,1,0.5',
+    expected: 'c' // 'zipcode' starts with the excluded word 'zip'
+  },
+  {
+    name: 'included real',
+    csv: 'zipcode mean,b,c\n0.5,0,0.5\n0.5,1,0.5',
+    expected: 'zipcode mean' // the 'mean' suffix overrides the 'zip' exclusion
+  },
+  {
+    name: 'included real, with inclusion ordering',
+    csv: 'zipcode mean,a metric,b,c\n0.5,0.1,0,0.5\n0.5,0.1,1,0.5',
+    expected: 'a metric' // 'metric' precedes 'mean' in METRIC_DEFAULT_FIELDS
+  }
+];
 
-  t.end();
-});
-
-test('datasetUtils.findDefaultColorField empty', t => {
-  const dataset = createNewDataEntry({
-    info: {id: 'taro'},
-    data: processCsvData('a\na')
-  }).taro;
-
-  const defaultField = findDefaultColorField(dataset);
-  t.notOk(defaultField, 'default field is null');
+test('datasetUtils.findDefaultColorField', t => {
+  for (const tc of DEFAULT_FIELD_TEST_CASES) {
+    const dataset = createNewDataEntry({
+      info: {id: 'taro'}, // the new entry is read back under this id just below
+      data: processCsvData(tc.csv)
+    }).taro;
 
+    const defaultField = findDefaultColorField(dataset);
+    if (!tc.expected) { // a null expectation means no field should be chosen
+      t.notOk(defaultField, `${tc.name}: default field is null`);
+    } else {
+      t.equals(defaultField.name, tc.expected, `${tc.name}: default field name is OK`);
+    }
+  }
   t.end();
 });