diff --git a/src/utils/dataset-utils.js b/src/utils/dataset-utils.js
index 9fddc7c37c..9ab1100819 100644
--- a/src/utils/dataset-utils.js
+++ b/src/utils/dataset-utils.js
@@ -91,29 +91,165 @@ export function createNewDataEntry({info, data, metadata}, datasets = {}) {
   };
 }
 
+/**
+ * Field name prefixes and suffixes which should not be considered
+ * as metrics. Fields will still be included if a 'metric word'
+ * is found on the field name, however.
+ */
+const EXCLUDED_DEFAULT_FIELDS = [
+  // Serial numbers and identification numbers
+  '_id',
+  'id',
+  'index',
+  'uuid',
+  'guid',
+  'uid',
+  'gid',
+  'serial',
+  // Geographic IDs are unlikely to be interesting to color
+  'zip',
+  'code',
+  'post',
+  'region',
+  'fips',
+  'cbgs',
+  'h3',
+  's2',
+  // Geographic coords (but not z/elevation/altitude
+  // since that might be a metric)
+  'lat',
+  'lon',
+  'lng',
+  'latitude',
+  'longitude',
+  '_x',
+  '_y'
+];
+
+/**
+ * Prefixes and suffixes that indicate a field is a metric.
+ *
+ * Note that these are in order of preference, first being
+ * most preferred.
+ */
+const METRIC_DEFAULT_FIELDS = [
+  'metric',
+  'value',
+  'sum',
+  'count',
+  'unique',
+  'mean',
+  'mode',
+  'median',
+  'max',
+  'min',
+  'deviation',
+  'variance',
+  'p99',
+  'p95',
+  'p75',
+  'p50',
+  'p25',
+  'p05',
+  // Abbreviations are less preferred
+  'cnt',
+  'val'
+];
+
+/**
+ * Test whether a lower-cased field name starts or ends with a keyword.
+ *
+ * @param {string} normalizedName lower-cased field name
+ * @param {string} keyword lower-cased prefix/suffix to look for
+ * @returns {boolean} true if the name starts or ends with the keyword
+ */
+function hasPrefixOrSuffix(normalizedName, keyword) {
+  return normalizedName.startsWith(keyword) || normalizedName.endsWith(keyword);
+}
+
 /**
  * Choose a field to use as the default color field of a layer.
  *
- * Right now this implements a very simple heuristic looking
- * for a real-type field that is not lat/lon.
+ * The heuristic is:
+ *
+ * First, only numeric (real or integer) fields are considered.
+ * Fields that belong to a lat/lng field pair and fields with a
+ * blank name are always dropped. Fields whose name starts or ends
+ * with an excluded word are dropped too, unless the name also
+ * starts or ends with a metric word.
+ *
+ * The remaining candidates are then ranked by, in order:
+ *
+ * 1. The position of the matched metric word in the preferred
+ *    names list (fields without a metric word rank last).
+ * 2. Type: real fields come before integer fields.
+ * 3. Position in the dataset's field list.
  *
- * In the future we could consider other things:
- * Consider integer fields
- * look for highest dynamic range (using a sample of the data)
- * Look for particular names to select ("value", "color", etc)
- * Look for particular names to avoid ("" - the Pandas index column)
+ * It's possible no field will be chosen (e.g. because all fields
+ * are strings).
  *
- * @param dataset
+ * @param {object} dataset
+ * @param {Array<object>} dataset.fields candidate fields, each with a name and type
+ * @param {Array<object>} [dataset.fieldPairs] lat/lng field pairs to skip
+ * @returns {object|null} the chosen field, or null when no field qualifies
  */
 export function findDefaultColorField({fields, fieldPairs = []}) {
-  const defaultField = fields.find(
-    f =>
-      f.type === ALL_FIELD_TYPES.real &&
-      // Do not permit lat, lon fields
-      !fieldPairs.find(pair => pair.pair.lat.value === f.name || pair.pair.lng.value === f.name)
-  );
-  if (!defaultField) {
-    return null;
-  }
-  return defaultField;
+  const candidates = [];
+  fields.forEach((field, position) => {
+    if (field.type !== ALL_FIELD_TYPES.real && field.type !== ALL_FIELD_TYPES.integer) {
+      // Only select numeric fields.
+      return;
+    }
+    if (
+      fieldPairs.some(
+        pair => pair.pair.lat.value === field.name || pair.pair.lng.value === field.name
+      )
+    ) {
+      // Do not permit lat, lon fields
+      return;
+    }
+
+    const normalizedName = field.name.toLowerCase();
+    if (normalizedName === '') {
+      // Blank names (e.g. the Pandas index column) are never metrics.
+      return;
+    }
+
+    // Index of the most preferred metric word on the name, or -1.
+    const metricIndex = METRIC_DEFAULT_FIELDS.findIndex(word =>
+      hasPrefixOrSuffix(normalizedName, word)
+    );
+    const isExcluded = EXCLUDED_DEFAULT_FIELDS.some(word =>
+      hasPrefixOrSuffix(normalizedName, word)
+    );
+    if (isExcluded && metricIndex === -1) {
+      // Excluded names are only kept when they also look like a metric.
+      return;
+    }
+
+    candidates.push({
+      field,
+      // Fields without a metric word rank after all fields with one.
+      metricRank: metricIndex === -1 ? METRIC_DEFAULT_FIELDS.length : metricIndex,
+      // Reals come before integers.
+      typeRank: field.type === ALL_FIELD_TYPES.real ? 0 : 1,
+      // Position in the dataset breaks any remaining tie; it is
+      // tracked here because fields may not carry an index property.
+      position
+    });
+  });
+
+  if (!candidates.length) {
+    // No matches
+    return null;
+  }
+
+  candidates.sort(
+    (left, right) =>
+      left.metricRank - right.metricRank ||
+      left.typeRank - right.typeRank ||
+      left.position - right.position
+  );
+  // There was a best match
+  return candidates[0].field;
 }
diff --git a/test/node/utils/dataset-utils-test.js b/test/node/utils/dataset-utils-test.js
index 6c33d6e1c0..fa6625695a 100644
--- a/test/node/utils/dataset-utils-test.js
+++ b/test/node/utils/dataset-utils-test.js
@@ -25,27 +25,68 @@ import {processCsvData} from 'processors/data-processor';
 
 import csvData from 'test/fixtures/test-layer-data';
 
-test('datasetUtils.findDefaultColorField', t => {
-  const dataset = createNewDataEntry({
-    info: {id: 'taro'},
-    data: processCsvData(csvData)
-  }).taro;
-
-  const defaultField = findDefaultColorField(dataset);
-  // Unfortunately lat_1 is not detected as part of a field pair :(
-  t.equals(defaultField.name, 'lat_1', 'default field name is OK');
+const DEFAULT_FIELD_TEST_CASES = [
+  {
+    name: 'excluded lat',
+    csv: csvData,
+    expected: 'trip_distance'
+  },
+  {
+    name: 'empty',
+    csv: 'a\na',
+    expected: null
+  },
+  {
+    name: 'integer only',
+    csv: 'a,b\na,0\na,1',
+    expected: 'b'
+  },
+  {
+    name: 'integer and real',
+    csv: 'a,b,c\na,0,0.5\na,1,0.5',
+    expected: 'c'
+  },
+  {
+    name: 'excluded real',
+    csv: 'zipcode,b,c\n0.5,0,0.5\n0.5,1,0.5',
+    expected: 'c'
+  },
+  {
+    name: 'included real',
+    csv: 'zipcode mean,b,c\n0.5,0,0.5\n0.5,1,0.5',
+    expected: 'zipcode mean'
+  },
+  {
+    name: 'included real, with inclusion ordering',
+    csv: 'zipcode mean,a metric,b,c\n0.5,0.1,0,0.5\n0.5,0.1,1,0.5',
+    expected: 'a metric'
+  },
+  {
+    name: 'included integer is preferred over plain real',
+    csv: 'b count,c\n0,0.5\n1,0.5',
+    expected: 'b count'
+  }
+];
 
-  t.end();
-});
-
-test('datasetUtils.findDefaultColorField empty', t => {
-  const dataset = createNewDataEntry({
-    info: {id: 'taro'},
-    data: processCsvData('a\na')
-  }).taro;
-
-  const defaultField = findDefaultColorField(dataset);
-  t.notOk(defaultField, 'default field is null');
+test('datasetUtils.findDefaultColorField', t => {
+  for (const tc of DEFAULT_FIELD_TEST_CASES) {
+    const dataset = createNewDataEntry({
+      info: {id: 'taro'},
+      data: processCsvData(tc.csv)
+    }).taro;
+    const defaultField = findDefaultColorField(dataset);
+    if (!tc.expected) {
+      t.notOk(defaultField, `${tc.name}: default field is null`);
+    } else {
+      // Guard the property access so that a missing field fails this
+      // case instead of throwing and aborting the remaining cases.
+      t.equals(
+        defaultField && defaultField.name,
+        tc.expected,
+        `${tc.name}: default field name is OK`
+      );
+    }
+  }
 
   t.end();
 });