diff --git a/packages/analytics/src/distance.js b/packages/analytics/src/distance.js new file mode 100644 index 00000000..3f5f0f19 --- /dev/null +++ b/packages/analytics/src/distance.js @@ -0,0 +1,128 @@ +import get from 'lodash.get'; +import { levenshteinDistance } from './algorithms'; + +/** + * Compare two values, each identified by its own id, and send { id: [id1, id2], value } to the feed. + * - for arrays, value = 2 * (number of items of the first array found in the second) / (sum of both lengths) + * - for strings, value = 1 - 2 * levenshteinDistance / (sum of both lengths); for numbers, value = 2 * (1 + min) / (first + second + 2); both are formatted with toFixed(digits) + * + * ```json + * [ + * { + * id_of_a: 1, + * id_of_b: 2, + * a: ['x', 'y'], + * b: ['x', 'z'], + * }, + * { + * id_of_a: 1, + * id_of_b: 3, + * a: ['x', 'y'], + * b: ['y', 'z'], + * }, + * { + * id_of_a: 1, + * id_of_b: 4, + * a: ['x', 'y'], + * b: ['z'], + * }, + * { + * id_of_a: 1, + * id_of_b: 5, + * a: ['x', 'y'], + * b: ['x', 'y', 'z'], + * }, + * { + * id_of_a: 1, + * id_of_b: 6, + * a: ['x', 'y'], + * b: ['x', 'y'], + * }, + * ] + * ``` + * + * Script: + * + * ```ini + * [use] + * plugin = analytics + * + * [distance] + * id = id_of_a + * id = id_of_b + * value = a + * value = b + * ``` + * + * Output: + * + * ```json + * [ + * { id: [ 1, 2 ], value: 0.5 }, + * { id: [ 1, 3 ], value: 0.5 }, + * { id: [ 1, 4 ], value: 0 }, + * { id: [ 1, 5 ], value: 0.8 }, + * { id: [ 1, 6 ], value: 1 } + * ] + * ``` + * + * @name distance + * @param {String|String[]} [id=id] path of a field holding the pair of ids, or two paths (one per id) + * @param {String|String[]} [value=value] path of a field holding the pair of values, or two paths (one per value) + * @param {Number} [digits=4] number of fractional digits kept for string and number scores + * @returns {Object} { id, value }; no result is sent unless both values are arrays, both strings or both numbers + */ +export default function distance(data, feed) { + if (this.isLast()) { + feed.close(); + return; + } + const fractionalDigits = Number(this.getParam('digits', 4)); + const idPath = this.getParam('id', 'id'); + let id1; + let id2; + if (Array.isArray(idPath)) { + [id1, id2] = idPath.map((f) => get(data, f)); + } else { + [id1, id2] = get(data, idPath); + } + const valuePath = this.getParam('value', 'value'); + let value1; + let value2; + if (Array.isArray(valuePath)) { + [value1, value2] = valuePath.map((f) => get(data, f)); + } else { + [value1, value2] = get(data, valuePath); + } + if (Array.isArray(value1) && 
Array.isArray(value2)) { + const measurement = value1.map((val) => (value2.indexOf(val) >= 0 ? 1 : 0)).reduce((a, b) => a + b, 0); + const value = ((measurement * 200) / (value1.length + value2.length)) / 100; + const result = { + id: [id1, id2], + value, + }; + feed.send(result); + return; + } + if (typeof value1 === 'string' && typeof value2 === 'string') { + const measurement = levenshteinDistance(value1, value2); + const value = ((measurement * 200) / (value1.length + value2.length)) / 100; + const result = { + id: [id1, id2], + value: (1 - value).toFixed(fractionalDigits), + }; + feed.send(result); + return; + } + if (typeof value1 === 'number' && typeof value2 === 'number') { + const measurement = Math.min(value1, value2); + const value = (((1 + measurement) * 200) / (value1 + value2 + 2)) / 100; + const result = { + id: [id1, id2], + value: value.toFixed(fractionalDigits), + }; + feed.send(result); + return; + } + feed.end(); +} diff --git a/packages/analytics/src/index.js b/packages/analytics/src/index.js index e31ae7da..54a168ef 100644 --- a/packages/analytics/src/index.js +++ b/packages/analytics/src/index.js @@ -27,6 +27,7 @@ import less from './less'; import drop from './drop'; import filter from './filter'; import multiply from './mulitply'; +import distance from './distance'; export default { count, @@ -58,4 +59,5 @@ export default { drop, filter, multiply, + distance, }; diff --git a/packages/analytics/test/distance.js b/packages/analytics/test/distance.js new file mode 100644 index 00000000..cb213d67 --- /dev/null +++ b/packages/analytics/test/distance.js @@ -0,0 +1,100 @@ +const assert = require('assert'); +const from = require('from'); +const ezs = require('../../core/src'); + +ezs.use(require('../src')); + +describe('distance', () => { + it('of 2 array', (done) => { + const res = []; + from([ + { + id_of_a: 1, id_of_b: 2, a: ['x', 'y'], b: ['x', 'z'], + }, + { + id_of_a: 1, id_of_b: 3, a: ['x', 'y'], b: ['y', 'z'], + }, + { + id_of_a: 1, id_of_b: 
4, a: ['x', 'y'], b: ['z'], + }, + { + id_of_a: 1, id_of_b: 5, a: ['x', 'y'], b: ['x', 'y', 'z'], + }, + { + id_of_a: 1, id_of_b: 6, a: ['x', 'y'], b: ['x', 'y'], + }, + ]) + .pipe(ezs('distance', { id: ['id_of_a', 'id_of_b'], value: ['a', 'b'] })) + .on('data', (chunk) => { + res.push(chunk); + }) + .on('end', () => { + assert.equal(5, res.length); + assert.equal(0.5, res[0].value); + assert.equal(0, res[2].value); + assert.equal(1, res[4].value); + done(); + }); + }); + it('of 2 string', (done) => { + const res = []; + from([ + { + id: [1, 2], value: ['karolin', 'kathrin'], + }, + { + id: [1, 3], value: ['karolin', 'kerstin'], + }, + { + id: [1, 4], value: ['karolin', 'caroline'], + }, + { + id: [1, 5], value: ['karolin', 'kaporal'], + }, + { + id: [1, 6], value: ['karolin', 'karolin'], + }, + ]) + .pipe(ezs('distance')) + .on('data', (chunk) => { + res.push(chunk); + }) + .on('end', () => { + assert.equal(5, res.length); + assert.equal(0.5714, res[0].value); + assert.equal(1, res[4].value); + done(); + }); + }); + it('of 2 number', (done) => { + const res = []; + from([ + { + id: [1, 2], value: [1234, 13444], + }, + { + id: [1, 3], value: [0.3445, 0.456612], + }, + { + id: [1, 4], value: [563.3434, 423.534], + }, + { + id: [1, 5], value: [1, 1], + }, + { + id: [1, 6], value: [0, 0], + }, + ]) + .pipe(ezs('distance')) + .on('data', (chunk) => { + res.push(chunk); + }) + .on('end', () => { + assert.equal(5, res.length); + assert.equal(0.1683, res[0].value); + assert.equal(1.000, res[3].value); + assert.equal(1.000, res[4].value); + done(); + }); + }); +});