Skip to content

Commit

Permalink
feat: implement bivariate aggregate functions (#1593)
Browse files Browse the repository at this point in the history
  • Loading branch information
kswenson authored Nov 7, 2024
1 parent 41c132b commit 413895c
Show file tree
Hide file tree
Showing 9 changed files with 623 additions and 1 deletion.
33 changes: 33 additions & 0 deletions v3/src/models/formula/functions/aggregate-functions.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { MathNode } from "mathjs"
import { checkNumber } from "../../../utilities/math-utils"
import { XYValues } from "../../../utilities/stats-utils"
import { IValueType } from "../../data/attribute-types"
import { CurrentScope, FValue, FValueOrArray } from "../formula-types"
import { UNDEF_RESULT, evaluateNode, getRootScope, isNumber, isValueNonEmpty, isValueTruthy } from "./function-utils"
Expand Down Expand Up @@ -44,6 +45,38 @@ export const aggregateNumericFnWithFilterFactory = (fn: (values: number[]) => FV
}
}

// Builds the raw evaluator shared by all bivariate aggregate functions (correlation, rSquared, etc.).
// These functions all have the same signature -- (xExpression, yExpression, filterExpression?) --
// and differ only in the final math operation, which the client supplies as `fn`.
//
// The returned evaluator:
//  - evaluates each argument against the root scope, wrapping a non-array result in a one-element array
//  - pairs x and y values by index, up to the length of the shorter list
//  - keeps a pair only if checkNumber accepts both values and, when a filter argument is present,
//    the filter value at the same index is truthy
//  - returns fn(pairs), or UNDEF_RESULT when no pair survives
export const aggregateBivariateNumericFnWithFilterFactory = (fn: (xyValues: XYValues) => FValue) => {
  return (args: MathNode[], mathjs: any, currentScope: CurrentScope) => {
    const rootScope = getRootScope(currentScope)
    const [xNode, yNode, filterNode] = args

    const xResult = evaluateNode(xNode, rootScope)
    const xs = Array.isArray(xResult) ? xResult : [xResult]
    const yResult = evaluateNode(yNode, rootScope)
    const ys = Array.isArray(yResult) ? yResult : [yResult]

    // The filter is optional: it is only evaluated when a third argument was passed.
    // Leaving `filterFlags` undefined means every numeric pair is accepted.
    let filterFlags
    if (filterNode) {
      const filterResult = evaluateNode(filterNode, rootScope)
      filterFlags = Array.isArray(filterResult) ? filterResult : [filterResult]
    }

    const pairs: XYValues = []
    const pairCount = Math.min(xs.length, ys.length)
    for (let index = 0; index < pairCount; index++) {
      const [xIsNumeric, x] = checkNumber(xs[index])
      const [yIsNumeric, y] = checkNumber(ys[index])
      if (!xIsNumeric || !yIsNumeric) continue
      // A filter list shorter than the data yields `undefined` here, which is checked for truthiness as-is.
      if (filterFlags && !isValueTruthy(filterFlags[index])) continue
      pairs.push({ x, y })
    }

    if (pairs.length === 0) return UNDEF_RESULT
    return fn(pairs)
  }
}

// Calls the client function with a filtered array of non-empty values.
type ClientFn = (values: number[], args: MathNode[], scope: FormulaMathJsScope) => FValue
export const aggregateFnWithFilterFactory = (fn: ClientFn) => {
Expand Down
47 changes: 47 additions & 0 deletions v3/src/models/formula/functions/bivariate-stats-functions.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { evaluate } from "../test-utils/formula-test-utils"
import { UNDEF_RESULT } from "./function-utils"

// Note that aggregate functions require formula-test-utils since they use the custom MathJS scope API to support
// caching. Therefore, they cannot be simply tested using basic MathJS evaluation, similar to arithmetic functions.

// Most of the tests use attributes from the Mammals dataset, comparing v3 results with v2.

describe("correlation", () => {
  it("returns correct value", () => {
    // No usable (x, y) pairs -- presumably because Order is not a numeric attribute -- gives the undefined result.
    expect(evaluate("correlation(LifeSpan, Order)")).toBe(UNDEF_RESULT)

    // [formula, expected value], compared to 6 decimal places
    const expectedByFormula: Array<[string, number]> = [
      ["correlation(LifeSpan, Speed)", -0.059392],
      ["correlation(Height, Mass)", 0.684623]
    ]
    for (const [formula, expected] of expectedByFormula) {
      expect(evaluate(formula)).toBeCloseTo(expected, 6)
    }
  })
})

describe("linRegrIntercept", () => {
  it("returns correct value", () => {
    // No usable (x, y) pairs -- presumably because Order is not a numeric attribute -- gives the undefined result.
    expect(evaluate("linRegrIntercept(LifeSpan, Order)")).toBe(UNDEF_RESULT)

    // [formula, expected value], compared to 6 decimal places
    const expectedByFormula: Array<[string, number]> = [
      ["linRegrIntercept(LifeSpan, Speed)", 50.722887],
      ["linRegrIntercept(Height, Mass)", -516.767727]
    ]
    for (const [formula, expected] of expectedByFormula) {
      expect(evaluate(formula)).toBeCloseTo(expected, 6)
    }
  })
})

describe("linRegrSESlope", () => {
  it("returns correct value", () => {
    // No usable (x, y) pairs -- presumably because Order is not a numeric attribute -- gives the undefined result.
    expect(evaluate("linRegrSESlope(LifeSpan, Order)")).toBe(UNDEF_RESULT)

    // [formula, expected value], compared to 6 decimal places
    const expectedByFormula: Array<[string, number]> = [
      ["linRegrSESlope(LifeSpan, Speed)", 0.252991],
      ["linRegrSESlope(Height, Mass)", 155.171375]
    ]
    for (const [formula, expected] of expectedByFormula) {
      expect(evaluate(formula)).toBeCloseTo(expected, 6)
    }
  })
})

describe("linRegrSlope", () => {
  it("returns correct value", () => {
    // No usable (x, y) pairs -- presumably because Order is not a numeric attribute -- gives the undefined result.
    expect(evaluate("linRegrSlope(LifeSpan, Order)")).toBe(UNDEF_RESULT)

    // [formula, expected value], compared to 6 decimal places
    const expectedByFormula: Array<[string, number]> = [
      ["linRegrSlope(LifeSpan, Speed)", -0.070601],
      ["linRegrSlope(Height, Mass)", 728.730807]
    ]
    for (const [formula, expected] of expectedByFormula) {
      expect(evaluate(formula)).toBeCloseTo(expected, 6)
    }
  })
})

describe("rSquared", () => {
  it("returns correct value", () => {
    // No usable (x, y) pairs -- presumably because Order is not a numeric attribute -- gives the undefined result.
    expect(evaluate("rSquared(LifeSpan, Order)")).toBe(UNDEF_RESULT)

    // [formula, expected value], compared to 6 decimal places
    const expectedByFormula: Array<[string, number]> = [
      ["rSquared(LifeSpan, Speed)", 0.003527],
      ["rSquared(Height, Mass)", 0.468709]
    ]
    for (const [formula, expected] of expectedByFormula) {
      expect(evaluate(formula)).toBeCloseTo(expected, 6)
    }
  })
})
62 changes: 62 additions & 0 deletions v3/src/models/formula/functions/bivariate-stats-functions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import {
correlation, linRegrIntercept, linRegrStdErrSlopeAndIntercept, linRegrSlope, rSquared
} from "../../../utilities/stats-utils"
import { IFormulaMathjsFunction } from "../formula-types"
import { aggregateBivariateNumericFnWithFilterFactory, cachedAggregateFnFactory } from "./aggregate-functions"

// The client function accepted by aggregateBivariateNumericFnWithFilterFactory.
type BivariateStatFn = Parameters<typeof aggregateBivariateNumericFnWithFilterFactory>[0]

// Every bivariate statistic is registered identically: two required arguments (the x and y
// expressions), flagged as an aggregate, using cachedAggregateFnFactory as its cached-evaluate
// factory, and a raw evaluator built by aggregateBivariateNumericFnWithFilterFactory.
// Only the statistic computed from the (x, y) pairs differs.
const bivariateAggregate = (statFn: BivariateStatFn): IFormulaMathjsFunction => ({
  numOfRequiredArguments: 2,
  isAggregate: true,
  cachedEvaluateFactory: cachedAggregateFnFactory,
  evaluateRaw: aggregateBivariateNumericFnWithFilterFactory(statFn)
})

export const bivariateStatsFunctions: Record<string, IFormulaMathjsFunction> = {

  // correlation(xExpression, yExpression, filterExpression)
  correlation: bivariateAggregate(xyValues => correlation(xyValues)),

  // linRegrIntercept(xExpression, yExpression, filterExpression)
  linRegrIntercept: bivariateAggregate(xyValues => linRegrIntercept(xyValues)),

  // linRegrSEIntercept(xExpression, yExpression, filterExpression)
  linRegrSEIntercept: bivariateAggregate(xyValues => linRegrStdErrSlopeAndIntercept(xyValues).stdErrIntercept),

  // linRegrSESlope(xExpression, yExpression, filterExpression)
  linRegrSESlope: bivariateAggregate(xyValues => linRegrStdErrSlopeAndIntercept(xyValues).stdErrSlope),

  // linRegrSlope(xExpression, yExpression, filterExpression)
  linRegrSlope: bivariateAggregate(xyValues => linRegrSlope(xyValues)),

  // rSquared(xExpression, yExpression, filterExpression)
  rSquared: bivariateAggregate(xyValues => rSquared(xyValues))
}
3 changes: 3 additions & 0 deletions v3/src/models/formula/functions/math.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
} from "../formula-types"
import { aggregateFunctions } from "./aggregate-functions"
import { arithmeticFunctions } from "./arithmetic-functions"
import { bivariateStatsFunctions } from "./bivariate-stats-functions"
import { dateFunctions } from "./date-functions"
import { evaluateNode, getRootScope } from "./function-utils"
import { logicFunctions } from "./logic-functions"
Expand Down Expand Up @@ -85,6 +86,8 @@ export const fnRegistry = {

...univariateStatsFunctions,

...bivariateStatsFunctions,

...semiAggregateFunctions
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ export const univariateStatsFunctions: Record<string, IFormulaMathjsFunction> =
},

// mad(expression, filterExpression)
// median absolute deviation
mad: {
numOfRequiredArguments: 1,
isAggregate: true,
Expand Down
81 changes: 81 additions & 0 deletions v3/src/utilities/nist-noint1.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Observations of the NIST StRD "NoInt1" dataset: 11 points with x running from 60 to 70
// in steps of 1 and y = x + 70 (i.e. 130 to 140).
export const data = Array.from({ length: 11 }, (_, i) => ({ x: 60 + i, y: 130 + i }))

// Certified regression statistics published by NIST for the NoInt1 dataset,
// which uses the no-intercept model y = B1*x + e.
export const certifiedResults = {
  // estimate of B1
  slope: 2.07438016528926,
  // standard deviation of the B1 estimate (listed by NIST as 0.165289256198347E-01)
  sdSlope: 1.65289256198347e-2,
  // residual standard deviation
  sdResiduals: 3.56753034006338,
  rSquared: 0.999365492298663
}

// http://www.itl.nist.gov/div898/strd/lls/data/LINKS/DATA/NoInt1.dat

/*
NIST/ITL StRD
Dataset Name: NoInt1 (NoInt1.dat)
File Format: ASCII
Certified Values (lines 31 to 44)
Data (lines 61 to 71)
Procedure: Linear Least Squares Regression
Reference: Eberhardt, K., NIST.
Data: 1 Response Variable (y)
1 Predictor Variable (x)
11 Observations
Average Level of Difficulty
Generated Data
Model: Linear Class
1 Parameter (B1)
y = B1*x + e
Certified Regression Statistics
Standard Deviation
Parameter Estimate of Estimate
B1 2.07438016528926 0.165289256198347E-01
Residual
Standard Deviation 3.56753034006338
R-Squared 0.999365492298663
Certified Analysis of Variance Table
Source of Degrees of Sums of Mean
Variation Freedom Squares Squares F Statistic
Regression 1 200457.727272727 200457.727272727 15750.2500000000
Residual 10 127.272727272727 12.7272727272727
Data: y x
130 60
131 61
132 62
133 63
134 64
135 65
136 66
137 67
138 68
139 69
140 70
*/
135 changes: 135 additions & 0 deletions v3/src/utilities/nist-norris.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Observations of the NIST StRD "Norris" dataset, kept in the same column order as the
// NIST listing: [y, x] (response first, predictor second).
const norrisRows: Array<[number, number]> = [
  [0.1, 0.2],
  [338.8, 337.4],
  [118.1, 118.2],
  [888.0, 884.6],
  [9.2, 10.1],
  [228.1, 226.5],
  [668.5, 666.3],
  [998.5, 996.3],
  [449.1, 448.6],
  [778.9, 777.0],
  [559.2, 558.2],
  [0.3, 0.4],
  [0.1, 0.6],
  [778.1, 775.5],
  [668.8, 666.9],
  [339.3, 338.0],
  [448.9, 447.5],
  [10.8, 11.6],
  [557.7, 556.0],
  [228.3, 228.1],
  [998.0, 995.8],
  [888.8, 887.6],
  [119.6, 120.2],
  [0.3, 0.3],
  [0.6, 0.3],
  [557.6, 556.8],
  [339.3, 339.1],
  [888.0, 887.2],
  [998.5, 999.0],
  [778.9, 779.0],
  [10.2, 11.1],
  [117.6, 118.3],
  [228.9, 229.2],
  [668.4, 669.1],
  [449.2, 448.9],
  [0.2, 0.5]
]

// The 36 observations as { y, x } objects, in the order listed above.
export const data = norrisRows.map(([y, x]) => ({ y, x }))

// Certified regression statistics published by NIST for the Norris dataset,
// which uses the model y = B0 + B1*x + e.
export const certifiedResults = {
  // number of observations
  count: 36,
  // estimate of B0
  intercept: -0.262323073774029,
  // estimate of B1
  slope: 1.00211681802045,
  // standard deviation of the B0 estimate
  sdIntercept: 0.232818234301152,
  // standard deviation of the B1 estimate (listed by NIST as 0.429796848199937E-03)
  sdSlope: 4.29796848199937e-4,
  // residual standard deviation
  sdResiduals: 0.884796396144373,
  rSquared: 0.999993745883712
}

// http://www.itl.nist.gov/div898/strd/lls/data/LINKS/DATA/Norris.dat

/*
NIST/ITL StRD
Dataset Name: Norris (Norris.dat)
File Format: ASCII
Certified Values (lines 31 to 46)
Data (lines 61 to 96)
Procedure: Linear Least Squares Regression
Reference: Norris, J., NIST.
Calibration of Ozone Monitors.
Data: 1 Response Variable (y)
1 Predictor Variable (x)
36 Observations
Lower Level of Difficulty
Observed Data
Model: Linear Class
2 Parameters (B0,B1)
y = B0 + B1*x + e
Certified Regression Statistics
Standard Deviation
Parameter Estimate of Estimate
B0 -0.262323073774029 0.232818234301152
B1 1.00211681802045 0.429796848199937E-03
Residual
Standard Deviation 0.884796396144373
R-Squared 0.999993745883712
Certified Analysis of Variance Table
Source of Degrees of Sums of Mean
Variation Freedom Squares Squares F Statistic
Regression 1 4255954.13232369 4255954.13232369 5436385.54079785
Residual 34 26.6173985294224 0.782864662630069
Data: y x
0.1 0.2
338.8 337.4
118.1 118.2
888.0 884.6
9.2 10.1
228.1 226.5
668.5 666.3
998.5 996.3
449.1 448.6
778.9 777.0
559.2 558.2
0.3 0.4
0.1 0.6
778.1 775.5
668.8 666.9
339.3 338.0
448.9 447.5
10.8 11.6
557.7 556.0
228.3 228.1
998.0 995.8
888.8 887.6
119.6 120.2
0.3 0.3
0.6 0.3
557.6 556.8
339.3 339.1
888.0 887.2
998.5 999.0
778.9 779.0
10.2 11.1
117.6 118.3
228.9 229.2
668.4 669.1
449.2 448.9
0.2 0.5
*/
Loading

0 comments on commit 413895c

Please sign in to comment.