Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Choose the default field to be integer if no reals are present #1409

Merged
merged 2 commits into from
Mar 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 151 additions & 15 deletions src/utils/dataset-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,29 +91,165 @@ export function createNewDataEntry({info, data, metadata}, datasets = {}) {
};
}

/**
 * Field name prefixes and suffixes which should not be considered
 * as metrics.
 *
 * findDefaultColorField lower-cases each field name and tests it with
 * startsWith/endsWith against every entry below, so entries must be
 * lower case and only match at the very start or very end of a name.
 *
 * A field matching this list is still kept as a candidate when its name
 * also starts or ends with an entry of METRIC_DEFAULT_FIELDS.
 */
const EXCLUDED_DEFAULT_FIELDS = [
// Serial numbers and identification numbers
'_id',
'id',
'index',
'uuid',
'guid',
'uid',
'gid',
'serial',
// Geographic IDs are unlikely to be interesting to color
'zip',
'code',
'post',
'region',
'fips',
'cbgs',
'h3',
's2',
// Geographic coords (but not z/elevation/altitude,
// since that might be a metric)
'lat',
'lon',
'lng',
'latitude',
'longitude',
'_x',
'_y'
];

/**
 * Prefixes and suffixes that indicate a field is a metric.
 *
 * Note that these are in order of preference, first being
 * most preferred: findDefaultColorField uses the position of the first
 * matching entry (via findIndex) as the field's rank, so reordering this
 * list changes which field is picked as the default.
 *
 * Entries are compared against the lower-cased field name with
 * startsWith/endsWith, so they must be lower case.
 */
const METRIC_DEFAULT_FIELDS = [
'metric',
'value',
'sum',
'count',
'unique',
'mean',
'mode',
'median',
'max',
'min',
'deviation',
'variance',
// NOTE(review): presumably percentile labels (p99 = 99th percentile) - confirm
'p99',
'p95',
'p75',
'p50',
'p25',
'p05',
// Abbreviations are less preferred
'cnt',
'val'
];

/**
 * Choose a field to use as the default color field of a layer.
 *
 * The heuristic is:
 *
 * First, keep only candidate fields. A field is dropped when
 *  - its type is neither real nor integer,
 *  - it is the lat or lng member of one of the field pairs,
 *  - its name is blank, or
 *  - its lower-cased name starts or ends with an entry of
 *    EXCLUDED_DEFAULT_FIELDS and does not also start or end with an
 *    entry of METRIC_DEFAULT_FIELDS.
 *
 * Next, pick the best remaining candidate, comparing in this order:
 *  1. Metric name: a field whose name starts or ends with an entry of
 *     METRIC_DEFAULT_FIELDS beats one that does not; between two such
 *     fields the one matching the earlier (more preferred) entry wins.
 *     This is checked before the type, so an integer field with a more
 *     preferred metric name beats a real field with a less preferred one.
 *  2. Type: real fields come before integer fields.
 *  3. Column order: the smaller `index` wins; when that does not
 *     discriminate, the field appearing first in `fields` wins.
 *
 * It's possible no field will be chosen (i.e. because all fields
 * are strings.)
 *
 * @param {Object} dataset
 * @param {Array<Object>} dataset.fields - field descriptors; `name` and `type`
 *   are read, and `index` is used as a tie-breaker
 * @param {Array<Object>} [dataset.fieldPairs] - geographic field pairs; the
 *   field names are read from `pair.lat.value` and `pair.lng.value`
 * @returns {Object|null} the chosen element of `fields`, or null when no
 *   field qualifies
 */
export function findDefaultColorField({fields, fieldPairs = []}) {
  // True when `name` begins or ends with `affix`.
  const hasAffix = (name, affix) => name.startsWith(affix) || name.endsWith(affix);

  // Position of the first metric word matching `name`, or -1 when none does.
  // Lower values are more preferred.
  const metricRank = name => METRIC_DEFAULT_FIELDS.findIndex(word => hasAffix(name, word));

  const candidates = fields.filter(field => {
    if (field.type !== ALL_FIELD_TYPES.real && field.type !== ALL_FIELD_TYPES.integer) {
      // Only select numeric fields.
      return false;
    }

    const isGeoPairMember = fieldPairs.some(
      pair => pair.pair.lat.value === field.name || pair.pair.lng.value === field.name
    );
    if (isGeoPairMember) {
      // Do not permit lat, lon fields
      return false;
    }

    const normalizedFieldName = field.name.toLowerCase();
    if (normalizedFieldName === '') {
      // Special case excluded name when the name is blank.
      return false;
    }

    const isExcluded = EXCLUDED_DEFAULT_FIELDS.some(word => hasAffix(normalizedFieldName, word));
    // A metric word on the name overrides the exclusion list.
    return !isExcluded || metricRank(normalizedFieldName) !== -1;
  });

  // Negative when `left` is a better default than `right`.
  const compareCandidates = (left, right) => {
    const leftRank = metricRank(left.name.toLowerCase());
    const rightRank = metricRank(right.name.toLowerCase());
    if (leftRank !== rightRank) {
      if (leftRank === -1) {
        // Fields without a metric word go after those that have one.
        return 1;
      }
      if (rightRank === -1) {
        // Fields with a metric word go before those that don't.
        return -1;
      }
      // Compare based on order in the inclusion list
      return leftRank - rightRank;
    }

    if (left.type !== right.type) {
      // Only real and integer fields survive the filter; reals come first.
      return left.type === ALL_FIELD_TYPES.real ? -1 : 1;
    }

    // Finally, order based on the order in the dataset's columns.
    // This is NaN when a field carries no numeric `index`.
    return left.index - right.index;
  };

  // Only the best candidate is needed, so scan once instead of sorting.
  // The strict `< 0` keeps the earlier field on ties and on NaN comparisons,
  // which is what a stable sort would have put first.
  let best = null;
  for (const candidate of candidates) {
    if (best === null || compareCandidates(candidate, best) < 0) {
      best = candidate;
    }
  }
  return best;
}
70 changes: 50 additions & 20 deletions test/node/utils/dataset-utils-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,57 @@ import {processCsvData} from 'processors/data-processor';

import csvData from 'test/fixtures/test-layer-data';

/**
 * Table-driven cases for findDefaultColorField.
 *
 * Each case has:
 *  - name: label used in the assertion message
 *  - csv: CSV text passed to processCsvData to build the dataset
 *  - expected: name of the field that should be chosen, or null when no
 *    field should be chosen
 */
const DEFAULT_FIELD_TEST_CASES = [
  {
    // The fixture has a 'lat_1' column that is not detected as part of a
    // field pair; it must still be skipped by the name-based exclusion.
    name: 'excluded lat',
    csv: csvData,
    expected: 'trip_distance'
  },
  {
    name: 'empty',
    csv: 'a\na',
    expected: null
  },
  {
    name: 'integer only',
    csv: 'a,b\na,0\na,1',
    expected: 'b'
  },
  {
    name: 'integer and real',
    csv: 'a,b,c\na,0,0.5\na,1,0.5',
    expected: 'c'
  },
  {
    name: 'excluded real',
    csv: 'zipcode,b,c\n0.5,0,0.5\n0.5,1,0.5',
    expected: 'c'
  },
  {
    name: 'included real',
    csv: 'zipcode mean,b,c\n0.5,0,0.5\n0.5,1,0.5',
    expected: 'zipcode mean'
  },
  {
    name: 'included real, with inclusion ordering',
    csv: 'zipcode mean,a metric,b,c\n0.5,0.1,0,0.5\n0.5,0.1,1,0.5',
    expected: 'a metric'
  }
];

test('datasetUtils.findDefaultColorField', t => {
  for (const tc of DEFAULT_FIELD_TEST_CASES) {
    const dataset = createNewDataEntry({
      info: {id: 'taro'},
      data: processCsvData(tc.csv)
    }).taro;

    const defaultField = findDefaultColorField(dataset);
    if (!tc.expected) {
      t.notOk(defaultField, `${tc.name}: default field is null`);
    } else {
      // Guard the property access so an unexpected null result is reported
      // as a failed assertion for this case instead of a TypeError that
      // aborts the remaining cases.
      t.equals(
        defaultField && defaultField.name,
        tc.expected,
        `${tc.name}: default field name is OK`
      );
    }
  }
  t.end();
});