diff --git a/docs/src/data-diving-examples.md b/docs/src/data-diving-examples.md index 39738f193d..100716ec26 100644 --- a/docs/src/data-diving-examples.md +++ b/docs/src/data-diving-examples.md @@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1 stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012
-tiv_2011_tiv_2012_corr 0.9730497632351692 -tiv_2011_tiv_2012_ols_m 0.9835583980337723 -tiv_2011_tiv_2012_ols_b 433854.6428968317 +tiv_2011_tiv_2012_corr 0.9730497632351701 +tiv_2011_tiv_2012_ols_m 0.9835583980337732 +tiv_2011_tiv_2012_ols_b 433854.6428968301 tiv_2011_tiv_2012_ols_n 36634 -tiv_2011_tiv_2012_r2 0.9468258417320189 +tiv_2011_tiv_2012_r2 0.9468258417320204
@@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
u_v_corr w_x_corr -0.1334180491027861 -0.011319841199866178 +0.1334180491027861 -0.011319841199852926
@@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
color shape u_v_corr w_x_corr - red circle 0.9807984401887236 -0.01856553658708754 -orange square 0.17685855992752927 -0.07104431573806054 - green circle 0.05764419437577255 0.01179572988801509 - red square 0.05574477124893523 -0.0006801456507510942 -yellow triangle 0.04457273771962798 0.024604310103081825 -yellow square 0.04379172927296089 -0.04462197201631237 -purple circle 0.03587354936895086 0.1341133954140899 - blue square 0.03241153095761164 -0.053507648119643196 - blue triangle 0.015356427073158766 -0.0006089997461435399 -orange circle 0.010518953877704048 -0.16279397329279383 - red triangle 0.00809782571528034 0.012486621357942596 -purple triangle 0.005155190909099334 -0.045057909256220656 -purple square -0.025680276963377404 0.05769429647930396 - green square -0.0257760734502851 -0.003265173252087127 -orange triangle -0.030456661186085785 -0.1318699981926352 -yellow circle -0.06477331572781474 0.07369449819706045 - blue circle -0.10234761901929677 -0.030528539069837757 - green triangle -0.10901825107358765 -0.04848782060162929 + red circle 0.9807984401887242 -0.018565536587084836 +orange square 0.17685855992752933 -0.07104431573805543 + green circle 0.05764419437577257 0.011795729888018455 + red square 0.0557447712489348 -0.0006801456507506415 +yellow triangle 0.0445727377196281 0.024604310103079844 +yellow square 0.0437917292729612 -0.044621972016306265 +purple circle 0.03587354936895115 0.13411339541407613 + blue square 0.03241153095761152 -0.05350764811965621 + blue triangle 0.015356427073158612 -0.0006089997461408209 +orange circle 0.010518953877704181 -0.1627939732927932 + red triangle 0.00809782571528054 0.01248662135795501 +purple triangle 0.005155190909099739 -0.04505790925621933 +purple square -0.02568027696337717 0.057694296479293694 + green square -0.025776073450284875 -0.0032651732520739014 +orange triangle -0.030456661186085584 -0.13186999819263814 +yellow circle -0.06477331572781515 0.0736944981970553 + blue circle -0.1023476190192966 -0.030528539069839333 + green triangle -0.10901825107358747 -0.04848782060162855diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 3321f17987..6f4ab1bbd6 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -19,9 +19,7 @@ Quick links: This is simply a copy of what you should see on running `man mlr` at a command prompt, once Miller is installed on your system.
-MILLER(1) MILLER(1) - - +4mMILLER24m(1) 4mMILLER24m(1) 1mNAME0m Miller -- like awk, sed, cut, join, and sort for name-indexed data such @@ -815,7 +813,7 @@ MILLER(1) MILLER(1) markdown " " N/A "\n" nidx " " N/A "\n" pprint " " N/A "\n" - tsv " " N/A "\n" + tsv " " N/A "\n" xtab "\n" " " "\n\n" --fs {string} Specify FS for input and output. @@ -1430,6 +1428,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1928,6 +1927,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3730,7 +3730,5 @@ MILLER(1) MILLER(1) MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - - - 2024-05-09 MILLER(1) + 2024-05-11 4mMILLER24m(1)diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 5ce10b9603..199946d457 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -1,6 +1,4 @@ -MILLER(1) MILLER(1) - - +4mMILLER24m(1) 4mMILLER24m(1) 1mNAME0m Miller -- like awk, sed, cut, join, and sort for name-indexed data such @@ -794,7 +792,7 @@ MILLER(1) MILLER(1) markdown " " N/A "\n" nidx " " N/A "\n" pprint " " N/A "\n" - tsv " " N/A "\n" + tsv " " N/A "\n" xtab "\n" " " "\n\n" --fs {string} Specify FS for input and output. @@ -1409,6 +1407,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1907,6 +1906,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3709,6 +3709,4 @@ MILLER(1) MILLER(1) MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - - - 2024-05-09 MILLER(1) + 2024-05-11 4mMILLER24m(1) diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index d6589c459a..a90f0890ff 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -2093,6 +2093,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3266,6 +3267,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3433,14 +3435,14 @@ fields, optionally categorized by one or more fields. data/medium
-x_y_cov 0.000042574820827444476 -x_y_corr 0.0005042001844467462 -y_y_cov 0.08461122467974003 +x_y_cov 0.00004257482082749404 +x_y_corr 0.0005042001844473328 +y_y_cov 0.08461122467974005 y_y_corr 1 -x2_xy_cov 0.04188382281779374 -x2_xy_corr 0.630174342037994 -x2_y2_cov -0.00030953725962542085 -x2_y2_corr -0.0034249088761121966 +x2_xy_cov 0.041883822817793716 +x2_xy_corr 0.6301743420379936 +x2_y2_cov -0.0003095372596253918 +x2_y2_corr -0.003424908876111875
@@ -3449,12 +3451,12 @@ x2_y2_corr -0.0034249088761121966 data/medium
-a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 -pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266 -eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526 -wye -0.03915349075204814 0.5255096523974456 1966 0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295 -zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495 -hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265 +a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 +pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127 +eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451 +wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 0.8538317334220837 0.1267454301662969 1966 0.3899172181859931 +zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495 +hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266Here's an example simple line-fit. The `x` and `y` @@ -3540,11 +3542,11 @@ upsec_count_pca_quality 0.9999590846136102 donesec 92.33051350964094 color purple -upsec_count_pca_m -39.03009744795354 -upsec_count_pca_b 979.9883413064914 +upsec_count_pca_m -39.030097447953594 +upsec_count_pca_b 979.9883413064917 upsec_count_pca_n 21 upsec_count_pca_quality 0.9999908956206317 -donesec 25.10852919630297 +donesec 25.108529196302943 ## step @@ -3821,9 +3823,9 @@ distinct_count 5 5 10000 10000 10000 mode pan wye 1 0.3467901443380824 0.7268028627434533 sum 0 0 50005000 4986.019681679581 5062.057444929905 mean - - 5000.5 0.49860196816795804 0.5062057444929905 -stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933 -var - - 8334166.666666667 0.08426974433144456 0.08461122467974003 -skewness - - 0 -0.0006899591185521965 -0.017849760120133784 +stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331 +var - - 8334166.666666667 0.08426974433144457 0.08461122467974005 +skewness - - 0 -0.0006899591185517494 -0.01784976012013298 minlen 3 3 1 15 13 maxlen 3 3 5 22 22 min eks eks 1 0.00004509679127584487 0.00008818962627266114 diff --git a/docs/src/two-pass-algorithms.md b/docs/src/two-pass-algorithms.md index 146f3a81e1..e475aebf3b 100644 --- a/docs/src/two-pass-algorithms.md +++ b/docs/src/two-pass-algorithms.md @@ -598,8 +598,8 @@ hat pan 0.4643355557376876 x_count 10000 x_sum 4986.019681679581 x_mean 0.49860196816795804 -x_var 0.08426974433144456 -x_stddev 0.2902925151144007 +x_var 0.08426974433144457 +x_stddev 0.29029251511440074
diff --git a/man/manpage.txt b/man/manpage.txt index 5ce10b9603..199946d457 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -1,6 +1,4 @@ -MILLER(1) MILLER(1) - - +4mMILLER24m(1) 4mMILLER24m(1) 1mNAME0m Miller -- like awk, sed, cut, join, and sort for name-indexed data such @@ -794,7 +792,7 @@ MILLER(1) MILLER(1) markdown " " N/A "\n" nidx " " N/A "\n" pprint " " N/A "\n" - tsv " " N/A "\n" + tsv " " N/A "\n" xtab "\n" " " "\n\n" --fs {string} Specify FS for input and output. @@ -1409,6 +1407,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1907,6 +1906,7 @@ MILLER(1) MILLER(1) antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -3709,6 +3709,4 @@ MILLER(1) MILLER(1) MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite https://miller.readthedocs.io - - - 2024-05-09 MILLER(1) + 2024-05-11 4mMILLER24m(1) diff --git a/man/mlr.1 b/man/mlr.1 index d660b99a89..f5b66e015f 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2024-05-09 +.\" Date: 2024-05-11 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2024-05-09" "\ \&" "\ \&" +.TH "MILLER" "1" "2024-05-11" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1778,6 +1778,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -2408,6 +2409,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) diff --git a/pkg/transformers/utils/stats1_accumulators.go b/pkg/transformers/utils/stats1_accumulators.go index 02756a9a34..3e8aadd942 100644 --- a/pkg/transformers/utils/stats1_accumulators.go +++ b/pkg/transformers/utils/stats1_accumulators.go @@ -72,6 +72,11 @@ var stats1AccumulatorInfos []stats1AccumulatorInfo = []stats1AccumulatorInfo{ "Compute averages (sample means) of specified fields", NewStats1MeanAccumulator, }, + { + "mad", + "Compute mean absolute deviation", + NewStats1MeanAbsDevAccumulator, + }, { "var", @@ -504,6 +509,47 @@ func (acc *Stats1MeanAccumulator) Reset() { acc.count = 0 } +// ---------------------------------------------------------------- +type Stats1MeanAbsDevAccumulator struct { + samples []*mlrval.Mlrval +} + +func NewStats1MeanAbsDevAccumulator() IStats1Accumulator { + return &Stats1MeanAbsDevAccumulator{ + samples: make([]*mlrval.Mlrval, 0, 1000), + } +} +func (acc *Stats1MeanAbsDevAccumulator) Ingest(value *mlrval.Mlrval) { + if value.IsNumeric() { + acc.samples = append(acc.samples, value) + } +} +func (acc *Stats1MeanAbsDevAccumulator) Emit() *mlrval.Mlrval { + n := len(acc.samples) + if n == 0 { + return mlrval.VOID + } + mn := mlrval.FromInt(int64(n)) + + mean := mlrval.FromInt(0) + for i := 0; i < n; i++ { + mean = bifs.BIF_plus_binary(mean, acc.samples[i]) + } + mean = bifs.BIF_divide(mean, mn) + + meanAbsDev := mlrval.FromInt(0) + for i := 0; i < n; i++ { + diff := bifs.BIF_minus_binary(mean, acc.samples[i]) + meanAbsDev = bifs.BIF_plus_binary(meanAbsDev, bifs.BIF_abs(diff)) + } + meanAbsDev = bifs.BIF_divide(meanAbsDev, mn) + + return meanAbsDev +} +func (acc *Stats1MeanAbsDevAccumulator) Reset() { + acc.samples = make([]*mlrval.Mlrval, 0, 1000) +} + // ---------------------------------------------------------------- type Stats1MinAccumulator struct { min *mlrval.Mlrval diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index 33eed96d5d..6d6cdea85c 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -555,6 +555,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) @@ -1075,6 +1076,7 @@ Options: antimode Find least-frequently-occurring values for fields; first-found wins tie sum Compute sums of specified fields mean Compute averages (sample means) of specified fields + mad Compute mean absolute deviation var Compute sample variance of specified fields stddev Compute sample standard deviation of specified fields meaneb Estimate error bars for averages (assuming no sample autocorrelation) diff --git a/test/cases/verb-stats1/0001/cmd b/test/cases/verb-stats1/0001/cmd index 1e5931d0fc..93753529ce 100644 --- a/test/cases/verb-stats1/0001/cmd +++ b/test/cases/verb-stats1/0001/cmd @@ -1 +1 @@ -mlr --oxtab stats1 -a mean,sum,count,min,max,antimode,mode -f i,x,y test/input/abixy +mlr --oxtab stats1 -a mean,sum,count,min,max,antimode,mode,mad -f i,x,y test/input/abixy diff --git a/test/cases/verb-stats1/0001/expout b/test/cases/verb-stats1/0001/expout index e99cdf2b07..8c52b51663 100644 --- a/test/cases/verb-stats1/0001/expout +++ b/test/cases/verb-stats1/0001/expout @@ -5,6 +5,7 @@ i_min 1 i_max 10 i_antimode 1 i_mode 1 +i_mad 2.50000000 x_mean 0.45362938 x_sum 4.53629384 x_count 10 @@ -12,6 +13,7 @@ x_min 0.03144188 x_max 0.75867996 x_antimode 0.34679014 x_mode 0.34679014 +x_mad 0.17005656 y_mean 0.59445424 y_sum 5.94454242 y_count 10 @@ -19,3 +21,4 @@ y_min 0.13418874 y_max 0.97618139 y_antimode 0.72680286 y_mode 0.72680286 +y_mad 0.25930133