From 16ab1991948213f88f66027d1a7f92e93e183b11 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 11 May 2024 15:55:27 -0400 Subject: [PATCH] Add `mad` accumulator for `stats1` DSL function (#1561) * Add `mad` accumulator for `stats1` DSL function * regression files * make dev output --- docs/src/data-diving-examples.md | 46 +++++++++---------- docs/src/manpage.md | 12 ++--- docs/src/manpage.txt | 12 ++--- docs/src/reference-verbs.md | 40 ++++++++-------- docs/src/two-pass-algorithms.md | 4 +- man/manpage.txt | 12 ++--- man/mlr.1 | 6 ++- pkg/transformers/utils/stats1_accumulators.go | 46 +++++++++++++++++++ test/cases/cli-help/0001/expout | 2 + test/cases/verb-stats1/0001/cmd | 2 +- test/cases/verb-stats1/0001/expout | 3 ++ 11 files changed, 117 insertions(+), 68 deletions(-) diff --git a/docs/src/data-diving-examples.md b/docs/src/data-diving-examples.md index 39738f193d..100716ec26 100644 --- a/docs/src/data-diving-examples.md +++ b/docs/src/data-diving-examples.md @@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1 stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012
-tiv_2011_tiv_2012_corr  0.9730497632351692
-tiv_2011_tiv_2012_ols_m 0.9835583980337723
-tiv_2011_tiv_2012_ols_b 433854.6428968317
+tiv_2011_tiv_2012_corr  0.9730497632351701
+tiv_2011_tiv_2012_ols_m 0.9835583980337732
+tiv_2011_tiv_2012_ols_b 433854.6428968301
 tiv_2011_tiv_2012_ols_n 36634
-tiv_2011_tiv_2012_r2    0.9468258417320189
+tiv_2011_tiv_2012_r2    0.9468258417320204
 
@@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
 
           u_v_corr              w_x_corr
-0.1334180491027861 -0.011319841199866178
+0.1334180491027861 -0.011319841199852926
 
@@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
 
  color    shape              u_v_corr               w_x_corr
-   red   circle    0.9807984401887236   -0.01856553658708754
-orange   square   0.17685855992752927   -0.07104431573806054
- green   circle   0.05764419437577255    0.01179572988801509
-   red   square   0.05574477124893523 -0.0006801456507510942
-yellow triangle   0.04457273771962798   0.024604310103081825
-yellow   square   0.04379172927296089   -0.04462197201631237
-purple   circle   0.03587354936895086     0.1341133954140899
-  blue   square   0.03241153095761164  -0.053507648119643196
-  blue triangle  0.015356427073158766 -0.0006089997461435399
-orange   circle  0.010518953877704048   -0.16279397329279383
-   red triangle   0.00809782571528034   0.012486621357942596
-purple triangle  0.005155190909099334  -0.045057909256220656
-purple   square -0.025680276963377404    0.05769429647930396
- green   square   -0.0257760734502851  -0.003265173252087127
-orange triangle -0.030456661186085785    -0.1318699981926352
-yellow   circle  -0.06477331572781474    0.07369449819706045
-  blue   circle  -0.10234761901929677  -0.030528539069837757
- green triangle  -0.10901825107358765   -0.04848782060162929
+   red   circle    0.9807984401887242  -0.018565536587084836
+orange   square   0.17685855992752933   -0.07104431573805543
+ green   circle   0.05764419437577257   0.011795729888018455
+   red   square    0.0557447712489348 -0.0006801456507506415
+yellow triangle    0.0445727377196281   0.024604310103079844
+yellow   square    0.0437917292729612  -0.044621972016306265
+purple   circle   0.03587354936895115    0.13411339541407613
+  blue   square   0.03241153095761152   -0.05350764811965621
+  blue triangle  0.015356427073158612 -0.0006089997461408209
+orange   circle  0.010518953877704181    -0.1627939732927932
+   red triangle   0.00809782571528054    0.01248662135795501
+purple triangle  0.005155190909099739   -0.04505790925621933
+purple   square  -0.02568027696337717   0.057694296479293694
+ green   square -0.025776073450284875 -0.0032651732520739014
+orange triangle -0.030456661186085584   -0.13186999819263814
+yellow   circle  -0.06477331572781515     0.0736944981970553
+  blue   circle   -0.1023476190192966  -0.030528539069839333
+ green triangle  -0.10901825107358747   -0.04848782060162855
 
diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 3321f17987..6f4ab1bbd6 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -19,9 +19,7 @@ Quick links: This is simply a copy of what you should see on running `man mlr` at a command prompt, once Miller is installed on your system.
-MILLER(1)                                                            MILLER(1)
-
-
+4mMILLER24m(1)                                                            4mMILLER24m(1)
 
 1mNAME0m
        Miller -- like awk, sed, cut, join, and sort for name-indexed data such
@@ -815,7 +813,7 @@ MILLER(1)                                                            MILLER(1)
                markdown " "    N/A    "\n"
                nidx     " "    N/A    "\n"
                pprint   " "    N/A    "\n"
-               tsv      "  "    N/A    "\n"
+               tsv      "     "    N/A    "\n"
                xtab     "\n"   " "    "\n\n"
 
        --fs {string}            Specify FS for input and output.
@@ -1430,6 +1428,7 @@ MILLER(1)                                                            MILLER(1)
          antimode Find least-frequently-occurring values for fields; first-found wins tie
          sum      Compute sums of specified fields
          mean     Compute averages (sample means) of specified fields
+         mad      Compute mean absolute deviation
          var      Compute sample variance of specified fields
          stddev   Compute sample standard deviation of specified fields
          meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
@@ -1928,6 +1927,7 @@ MILLER(1)                                                            MILLER(1)
          antimode Find least-frequently-occurring values for fields; first-found wins tie
          sum      Compute sums of specified fields
          mean     Compute averages (sample means) of specified fields
+         mad      Compute mean absolute deviation
          var      Compute sample variance of specified fields
          stddev   Compute sample standard deviation of specified fields
          meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
@@ -3730,7 +3730,5 @@ MILLER(1)                                                            MILLER(1)
        MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
        https://miller.readthedocs.io
 
-
-
-                                  2024-05-09                         MILLER(1)
+                                  2024-05-11                         4mMILLER24m(1)
 
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 5ce10b9603..199946d457 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -1,6 +1,4 @@ -MILLER(1) MILLER(1) - - +4mMILLER24m(1) 4mMILLER24m(1) 1mNAME0m Miller -- like awk, sed, cut, join, and sort for name-indexed data such @@ -794,7 +792,7 @@ MILLER(1) MILLER(1) markdown " " N/A "\n" nidx " " N/A "\n" pprint " " N/A "\n" - tsv " " N/A "\n" + tsv " " N/A "\n" xtab "\n" " " "\n\n" --fs {string} Specify FS for input and output. @@ -1409,6 +1407,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1907,6 +1906,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3709,6 +3709,4 @@ MILLER(1) MILLER(1) MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - - - 2024-05-09 MILLER(1) + 2024-05-11 4mMILLER24m(1) diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index d6589c459a..a90f0890ff 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -2093,6 +2093,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute 
deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3266,6 +3267,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3433,14 +3435,14 @@ fields, optionally categorized by one or more fields. data/medium
-x_y_cov    0.000042574820827444476
-x_y_corr   0.0005042001844467462
-y_y_cov    0.08461122467974003
+x_y_cov    0.00004257482082749404
+x_y_corr   0.0005042001844473328
+y_y_cov    0.08461122467974005
 y_y_corr   1
-x2_xy_cov  0.04188382281779374
-x2_xy_corr 0.630174342037994
-x2_y2_cov  -0.00030953725962542085
-x2_y2_corr -0.0034249088761121966
+x2_xy_cov  0.041883822817793716
+x2_xy_corr 0.6301743420379936
+x2_y2_cov  -0.0003095372596253918
+x2_y2_corr -0.003424908876111875
 
@@ -3449,12 +3451,12 @@ x2_y2_corr -0.0034249088761121966
   data/medium
 
-a   x_y_ols_m             x_y_ols_b           x_y_ols_n x_y_r2                  y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m        xy_y2_ols_b         xy_y2_ols_n xy_y2_r2
-pan 0.01702551273681908   0.5004028922897639  2081      0.00028691820445814767  1         0         2081      1      0.8781320866715662 0.11908230147563566 2081        0.41749827377311266
-eks 0.0407804923685586    0.48140207967651016 1965      0.0016461239223448587   1         0         1965      1      0.8978728611690183 0.10734054433612333 1965        0.45563223864254526
-wye -0.03915349075204814  0.5255096523974456  1966      0.0015051268704373607   1         0         1966      1      0.8538317334220835 0.1267454301662969  1966        0.38991721818599295
-zee 0.0027812364960399147 0.5043070448033061  2047      0.000007751652858786137 1         0         2047      1      0.8524439912011013 0.12401684308018937 2047        0.39356598090006495
-hat -0.018620577041095078 0.5179005397264935  1941      0.0003520036646055585   1         0         1941      1      0.8412305086345014 0.13557328318623216 1941        0.3687944261732265
+a   x_y_ols_m             x_y_ols_b          x_y_ols_n x_y_r2                  y_y_ols_m y_y_ols_b                           y_y_ols_n y_y_r2 xy_y2_ols_m        xy_y2_ols_b         xy_y2_ols_n xy_y2_r2
+pan 0.017025512736819345  0.500402892289764  2081      0.00028691820445815624  1         -0.00000000000000002890430283104539 2081      1      0.8781320866715664 0.11908230147563569 2081        0.4174982737731127
+eks 0.04078049236855813   0.4814020796765104 1965      0.0016461239223448218   1         0.00000000000000017862676354313703  1965      1      0.897872861169018  0.1073405443361234  1965        0.4556322386425451
+wye -0.03915349075204785  0.5255096523974457 1966      0.0015051268704373377   1         0.00000000000000004464425401127647  1966      1      0.8538317334220837 0.1267454301662969  1966        0.3899172181859931
+zee 0.0027812364960401333 0.5043070448033061 2047      0.000007751652858787357 1         0.00000000000000004819404567023685  2047      1      0.8524439912011011 0.12401684308018947 2047        0.39356598090006495
+hat -0.018620577041095272 0.5179005397264937 1941      0.00035200366460556604  1         -0.00000000000000003400445761787692 1941      1      0.8412305086345017 0.13557328318623207 1941        0.3687944261732266
 
Here's an example simple line-fit. The `x` and `y` @@ -3540,11 +3542,11 @@ upsec_count_pca_quality 0.9999590846136102 donesec 92.33051350964094 color purple -upsec_count_pca_m -39.03009744795354 -upsec_count_pca_b 979.9883413064914 +upsec_count_pca_m -39.030097447953594 +upsec_count_pca_b 979.9883413064917 upsec_count_pca_n 21 upsec_count_pca_quality 0.9999908956206317 -donesec 25.10852919630297 +donesec 25.108529196302943 ## step @@ -3821,9 +3823,9 @@ distinct_count 5 5 10000 10000 10000 mode pan wye 1 0.3467901443380824 0.7268028627434533 sum 0 0 50005000 4986.019681679581 5062.057444929905 mean - - 5000.5 0.49860196816795804 0.5062057444929905 -stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933 -var - - 8334166.666666667 0.08426974433144456 0.08461122467974003 -skewness - - 0 -0.0006899591185521965 -0.017849760120133784 +stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331 +var - - 8334166.666666667 0.08426974433144457 0.08461122467974005 +skewness - - 0 -0.0006899591185517494 -0.01784976012013298 minlen 3 3 1 15 13 maxlen 3 3 5 22 22 min eks eks 1 0.00004509679127584487 0.00008818962627266114 diff --git a/docs/src/two-pass-algorithms.md b/docs/src/two-pass-algorithms.md index 146f3a81e1..e475aebf3b 100644 --- a/docs/src/two-pass-algorithms.md +++ b/docs/src/two-pass-algorithms.md @@ -598,8 +598,8 @@ hat pan 0.4643355557376876 x_count 10000 x_sum 4986.019681679581 x_mean 0.49860196816795804 -x_var 0.08426974433144456 -x_stddev 0.2902925151144007 +x_var 0.08426974433144457 +x_stddev 0.29029251511440074
diff --git a/man/manpage.txt b/man/manpage.txt
index 5ce10b9603..199946d457 100644
--- a/man/manpage.txt
+++ b/man/manpage.txt
@@ -1,6 +1,4 @@
-MILLER(1)                                                            MILLER(1)
-
-
+4mMILLER24m(1)                                                            4mMILLER24m(1)
 
 1mNAME0m
        Miller -- like awk, sed, cut, join, and sort for name-indexed data such
@@ -794,7 +792,7 @@ MILLER(1)                                                            MILLER(1)
                markdown " "    N/A    "\n"
                nidx     " "    N/A    "\n"
                pprint   " "    N/A    "\n"
-               tsv      "  "    N/A    "\n"
+               tsv      "     "    N/A    "\n"
                xtab     "\n"   " "    "\n\n"
 
        --fs {string}            Specify FS for input and output.
@@ -1409,6 +1407,7 @@ MILLER(1)                                                            MILLER(1)
          antimode Find least-frequently-occurring values for fields; first-found wins tie
          sum      Compute sums of specified fields
          mean     Compute averages (sample means) of specified fields
+         mad      Compute mean absolute deviation
          var      Compute sample variance of specified fields
          stddev   Compute sample standard deviation of specified fields
          meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
@@ -1907,6 +1906,7 @@ MILLER(1)                                                            MILLER(1)
          antimode Find least-frequently-occurring values for fields; first-found wins tie
          sum      Compute sums of specified fields
          mean     Compute averages (sample means) of specified fields
+         mad      Compute mean absolute deviation
          var      Compute sample variance of specified fields
          stddev   Compute sample standard deviation of specified fields
          meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
@@ -3709,6 +3709,4 @@ MILLER(1)                                                            MILLER(1)
        MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
        https://miller.readthedocs.io
 
-
-
-                                  2024-05-09                         MILLER(1)
+                                  2024-05-11                         4mMILLER24m(1)
diff --git a/man/mlr.1 b/man/mlr.1
index d660b99a89..f5b66e015f 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -2,12 +2,12 @@
 .\"     Title: mlr
 .\"    Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\"      Date: 2024-05-09
+.\"      Date: 2024-05-11
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "MILLER" "1" "2024-05-09" "\ \&" "\ \&"
+.TH "MILLER" "1" "2024-05-11" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1778,6 +1778,7 @@ Options:
   antimode Find least-frequently-occurring values for fields; first-found wins tie
   sum      Compute sums of specified fields
   mean     Compute averages (sample means) of specified fields
+  mad      Compute mean absolute deviation
   var      Compute sample variance of specified fields
   stddev   Compute sample standard deviation of specified fields
   meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
@@ -2408,6 +2409,7 @@ Options:
   antimode Find least-frequently-occurring values for fields; first-found wins tie
   sum      Compute sums of specified fields
   mean     Compute averages (sample means) of specified fields
+  mad      Compute mean absolute deviation
   var      Compute sample variance of specified fields
   stddev   Compute sample standard deviation of specified fields
   meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
diff --git a/pkg/transformers/utils/stats1_accumulators.go b/pkg/transformers/utils/stats1_accumulators.go
index 02756a9a34..3e8aadd942 100644
--- a/pkg/transformers/utils/stats1_accumulators.go
+++ b/pkg/transformers/utils/stats1_accumulators.go
@@ -72,6 +72,11 @@ var stats1AccumulatorInfos []stats1AccumulatorInfo = []stats1AccumulatorInfo{
 		"Compute averages (sample means) of specified fields",
 		NewStats1MeanAccumulator,
 	},
+	{
+		"mad",
+		"Compute mean absolute deviation",
+		NewStats1MeanAbsDevAccumulator,
+	},
 
 	{
 		"var",
@@ -504,6 +509,47 @@ func (acc *Stats1MeanAccumulator) Reset() {
 	acc.count = 0
 }
 
+// ----------------------------------------------------------------
+// Stats1MeanAbsDevAccumulator buffers all numeric samples, since Emit needs two passes.
+type Stats1MeanAbsDevAccumulator struct{ samples []*mlrval.Mlrval }
+
+// NewStats1MeanAbsDevAccumulator returns an accumulator with an empty, pre-sized buffer.
+func NewStats1MeanAbsDevAccumulator() IStats1Accumulator {
+	return &Stats1MeanAbsDevAccumulator{samples: make([]*mlrval.Mlrval, 0, 1000)}
+}
+
+// Ingest keeps value (the pointer itself, not a copy) if numeric; otherwise ignores it.
+// NOTE(review): confirm callers never mutate an ingested value afterwards.
+func (acc *Stats1MeanAbsDevAccumulator) Ingest(value *mlrval.Mlrval) {
+	if value.IsNumeric() {
+		acc.samples = append(acc.samples, value)
+	}
+}
+
+// Emit returns the average of |mean - sample| over all samples, or VOID if there are none.
+func (acc *Stats1MeanAbsDevAccumulator) Emit() *mlrval.Mlrval {
+	if len(acc.samples) == 0 {
+		return mlrval.VOID
+	}
+	count := mlrval.FromInt(int64(len(acc.samples)))
+	total := mlrval.FromInt(0) // pass 1: sum the samples to get their mean
+	for _, sample := range acc.samples {
+		total = bifs.BIF_plus_binary(total, sample)
+	}
+	center := bifs.BIF_divide(total, count)
+	deviations := mlrval.FromInt(0) // pass 2: sum the distances from the mean
+	for _, sample := range acc.samples {
+		gap := bifs.BIF_minus_binary(center, sample)
+		deviations = bifs.BIF_plus_binary(deviations, bifs.BIF_abs(gap))
+	}
+	return bifs.BIF_divide(deviations, count)
+}
+
+// Reset drops all retained samples, replacing the buffer with a new empty one.
+func (acc *Stats1MeanAbsDevAccumulator) Reset() {
+	acc.samples = make([]*mlrval.Mlrval, 0, 1000)
+}
+
 // ----------------------------------------------------------------
 type Stats1MinAccumulator struct {
 	min *mlrval.Mlrval
diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout
index 33eed96d5d..6d6cdea85c 100644
--- a/test/cases/cli-help/0001/expout
+++ b/test/cases/cli-help/0001/expout
@@ -555,6 +555,7 @@ Options:
   antimode Find least-frequently-occurring values for fields; first-found wins tie
   sum      Compute sums of specified fields
   mean     Compute averages (sample means) of specified fields
+  mad      Compute mean absolute deviation
   var      Compute sample variance of specified fields
   stddev   Compute sample standard deviation of specified fields
   meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
@@ -1075,6 +1076,7 @@ Options:
   antimode Find least-frequently-occurring values for fields; first-found wins tie
   sum      Compute sums of specified fields
   mean     Compute averages (sample means) of specified fields
+  mad      Compute mean absolute deviation
   var      Compute sample variance of specified fields
   stddev   Compute sample standard deviation of specified fields
   meaneb   Estimate error bars for averages (assuming no sample autocorrelation)
diff --git a/test/cases/verb-stats1/0001/cmd b/test/cases/verb-stats1/0001/cmd
index 1e5931d0fc..93753529ce 100644
--- a/test/cases/verb-stats1/0001/cmd
+++ b/test/cases/verb-stats1/0001/cmd
@@ -1 +1 @@
-mlr --oxtab stats1 -a mean,sum,count,min,max,antimode,mode -f i,x,y test/input/abixy
+mlr --oxtab stats1 -a mean,sum,count,min,max,antimode,mode,mad -f i,x,y test/input/abixy
diff --git a/test/cases/verb-stats1/0001/expout b/test/cases/verb-stats1/0001/expout
index e99cdf2b07..8c52b51663 100644
--- a/test/cases/verb-stats1/0001/expout
+++ b/test/cases/verb-stats1/0001/expout
@@ -5,6 +5,7 @@ i_min      1
 i_max      10
 i_antimode 1
 i_mode     1
+i_mad      2.50000000
 x_mean     0.45362938
 x_sum      4.53629384
 x_count    10
@@ -12,6 +13,7 @@ x_min      0.03144188
 x_max      0.75867996
 x_antimode 0.34679014
 x_mode     0.34679014
+x_mad      0.17005656
 y_mean     0.59445424
 y_sum      5.94454242
 y_count    10
@@ -19,3 +21,4 @@ y_min      0.13418874
 y_max      0.97618139
 y_antimode 0.72680286
 y_mode     0.72680286
+y_mad      0.25930133