From 9d478894daaeb924eaf140b21ddd81f213610dd8 Mon Sep 17 00:00:00 2001 From: "Documenter.jl" Date: Tue, 3 Sep 2024 17:49:46 +0000 Subject: [PATCH] build based on ad1e8b5 --- .../generated/UserGuide/autovec/index.html | 10 +- .../generated/UserGuide/benchmark/index.html | 80 ++++---- .../generated/UserGuide/sep_unite/index.html | 4 +- .../generated/UserGuide/slice/index.html | 2 +- latest/reference/index.html | 193 ++++++++++-------- latest/search/search_index.json | 2 +- 6 files changed, 161 insertions(+), 130 deletions(-) diff --git a/latest/examples/generated/UserGuide/autovec/index.html b/latest/examples/generated/UserGuide/autovec/index.html index 59d77ab..7636be0 100644 --- a/latest/examples/generated/UserGuide/autovec/index.html +++ b/latest/examples/generated/UserGuide/autovec/index.html @@ -805,7 +805,7 @@

Auto-vectorization

string(TidierData.not_vectorized[]) -
"[:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr]"
+
"[:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr, :cat_other, :cat_replace_missing, :cat_recode]"
 

This "auto-vectorization" makes working with TidierData.jl more R-like and convenient. However, if you ever define your own function and try to use it, TidierData.jl may unintentionally vectorize it for you. To prevent auto-vectorization, you can prefix your function with a ~.

df = DataFrame(a = repeat('a':'e', inner = 2), b = [1,1,1,2,2,2,3,3,3,4], c = 11:20)
@@ -837,7 +837,7 @@ 

Auto-vectorization

Or you can modify the do-not-vectorize list like this:

push!(TidierData.not_vectorized[], :new_mean)
 
-
49-element Vector{Symbol}:
+
52-element Vector{Symbol}:
  :getindex
  :rand
  :esc
@@ -849,14 +849,14 @@ 

Auto-vectorization

:∘ :lag ⋮ - :cat_collapse - :cat_lump_min - :cat_lump_prop :categorical :as_categorical :is_categorical :unique :iqr + :cat_other + :cat_replace_missing + :cat_recode :new_mean

Now new_mean() should behave just like mean() in that it is treated as non-vectorized.

diff --git a/latest/examples/generated/UserGuide/benchmark/index.html b/latest/examples/generated/UserGuide/benchmark/index.html index 9df79ff..bfde6ac 100644 --- a/latest/examples/generated/UserGuide/benchmark/index.html +++ b/latest/examples/generated/UserGuide/benchmark/index.html @@ -887,14 +887,14 @@

filtering@benchmark filter(row -> row.Year > 1939 && row.Votes > 40, movies)

-
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
- Range (min … max):  414.934 μs …  1.865 ms  ┊ GC (min … max): 0.00% … 31.44%
- Time  (median):     422.558 μs              ┊ GC (median):    0.00%
- Time  (mean ± σ):   432.911 μs ± 67.083 μs  ┊ GC (mean ± σ):  1.25% ±  5.15%
+ Range (min … max):  419.333 μs …  3.638 ms  ┊ GC (min … max): 0.00% … 16.92%
+ Time  (median):     426.235 μs              ┊ GC (median):    0.00%
+ Time  (mean ± σ):   438.110 μs ± 74.097 μs  ┊ GC (mean ± σ):  1.22% ±  5.06%
 
-  ▃██▇▆▅▄▄▄▃▂▂▁▁▁                                              ▂
-  ████████████████▇▇▇▆▆▆▆▆▅▅▄▄▄▃▄▁▁▁▁▃▃▁▁▄▇▇▃▅▃▃▃▅▄▆▆▇▆▇▆▅▄▃▄▄ █
-  415 μs        Histogram: log(frequency) by time       573 μs <
+  ▆█▇▆▄▄▄▃▂▂▁▁▁                                                ▂
+  ███████████████▇▇▆▆▆▇▅▅▄▅▄▃▆▆▆▄▄▅▃▅▃▄▃▃▁▄▄▆▇▆▇▆▁▃▄▃▄▅▃▁▃▁▇▆▅ █
+  419 μs        Histogram: log(frequency) by time       619 μs <
 
  Memory estimate: 474.87 KiB, allocs estimate: 270.
 
@@ -936,14 +936,14 @@

one mutate@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :new_col)

-
BenchmarkTools.Trial: 6789 samples with 1 evaluation.
- Range (min … max):  541.700 μs …   9.582 ms  ┊ GC (min … max): 0.00% …  5.69%
- Time  (median):     661.425 μs               ┊ GC (median):    0.00%
- Time  (mean ± σ):   731.742 μs ± 250.650 μs  ┊ GC (mean ± σ):  7.82% ± 12.75%
+
BenchmarkTools.Trial: 6516 samples with 1 evaluation.
+ Range (min … max):  557.359 μs …   7.220 ms  ┊ GC (min … max): 0.00% …  9.33%
+ Time  (median):     686.274 μs               ┊ GC (median):    0.00%
+ Time  (mean ± σ):   763.019 μs ± 243.408 μs  ┊ GC (mean ± σ):  9.17% ± 14.37%
 
-         ▅██▇▇▆▅▄▃▂▁                               ▁▁▂▂▂▂▃▂▂▁▁  ▂
-  ▃▁▃▁▅▅████████████▇▆▅▅▁▅▃▁▁▁▁▁▃▃▁▁▁▁▃▆▆▆▅▄▅▄▃▅▅▇█████████████ █
-  542 μs        Histogram: log(frequency) by time       1.25 ms <
+    ▂▅▆▇████▇▆▅▄▄▂▂                    ▁▂▁▁▁▁▁ ▁▁ ▁     ▁ ▁▁    ▂
+  ▃▇████████████████▇█▇▇▆▅▄▅▃▃▅▅▃▅▅▅▅██████████████▇███████████ █
+  557 μs        Histogram: log(frequency) by time       1.45 ms <
 
  Memory estimate: 8.42 MiB, allocs estimate: 223.
 
@@ -966,14 +966,14 @@

mutate 6 new columns@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :Votes_R1_Product, [:Rating, :Year] => ((r, y) -> r ./ y) => :Rating_Year_Ratio, [:R1, :R2, :R3, :R4, :R5] => ((a, b, c, d, e) -> a + b + c + d + e) => :R1_to_R5_Sum, :Budget => (b -> ifelse.(ismissing.(b), missing, b .> 50000)) => :High_Budget_Flag, [:R6, :R7, :R8] => ((f, g, h) -> (f + g + h) / 3) => :R6_to_R8_Avg, [:Year, :Length] => ((y, l) -> y - l) => :Year_Minus_Length )

- -
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
- Range (min … max):  153.436 μs …   7.502 ms  ┊ GC (min … max): 0.00% … 6.35%
- Time  (median):     220.581 μs               ┊ GC (median):    0.00%
- Time  (mean ± σ):   232.208 μs ± 100.421 μs  ┊ GC (mean ± σ):  4.46% ± 9.85%
+ Range (min … max):  173.423 μs …  4.715 ms  ┊ GC (min … max): 0.00% … 8.59%
+ Time  (median):     221.673 μs              ┊ GC (median):    0.00%
+ Time  (mean ± σ):   237.430 μs ± 95.591 μs  ┊ GC (mean ± σ):  4.87% ± 9.96%
 
-      ▁▄▆▇██▆▄▃▁                                                ▂
-  ▄▃▄▅██████████▇▆▄▄▁▁▁▁▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▆▆▇▇▇█▇▇▇▇▇ █
-  153 μs        Histogram: log(frequency) by time        622 μs <
+      ▅█▄                                                       
+  ▂▂▄████▆▅▄▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▂▁▁▁▁▁▁▁▂▁▁▂▁▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▃
+  173 μs          Histogram: frequency by time          688 μs <
 
  Memory estimate: 2.25 MiB, allocs estimate: 200.
 
diff --git a/latest/examples/generated/UserGuide/sep_unite/index.html b/latest/examples/generated/UserGuide/sep_unite/index.html index d64aaed..2904c4f 100644 --- a/latest/examples/generated/UserGuide/sep_unite/index.html +++ b/latest/examples/generated/UserGuide/sep_unite/index.html @@ -931,8 +931,8 @@

@unite @unite(new_col, (b, c, d), "/") end

-
3×4 DataFrame
-
Rowbcdnew_col
StringStringString?String
111missing1/1
222missing2/2
33333/3/3
+
3×1 DataFrame
+
Rownew_col
String
11/1
22/2
33/3/3

diff --git a/latest/examples/generated/UserGuide/slice/index.html b/latest/examples/generated/UserGuide/slice/index.html index 458ef54..52677d7 100644 --- a/latest/examples/generated/UserGuide/slice/index.html +++ b/latest/examples/generated/UserGuide/slice/index.html @@ -1110,7 +1110,7 @@

Sample 5 random rows in the data end
5×3 DataFrame
-
Rowrow_numab
Int64StringInt64
16c2
21a1
35c2
48d3
57d3
+
Rowrow_numab
Int64StringInt64
16c2
29e3
37d3
43b1
58d3

diff --git a/latest/reference/index.html b/latest/reference/index.html index a35f381..61880a8 100644 --- a/latest/reference/index.html +++ b/latest/reference/index.html @@ -944,7 +944,7 @@

Index
  • TidierData.@right_join
  • TidierData.@select
  • TidierData.@semi_join
  • -
  • TidierData.@separate
  • +
  • TidierData.@separate
  • TidierData.@separate_rows
  • TidierData.@slice
  • TidierData.@slice_head
  • @@ -958,7 +958,7 @@

    Index
  • TidierData.@tally
  • TidierData.@transmute
  • TidierData.@ungroup
  • -
  • TidierData.@unite
  • +
  • TidierData.@unite
  • TidierData.@unnest_longer
  • TidierData.@unnest_wider
  • @@ -979,7 +979,7 @@

    Reference - Exported functions
  • value: true or false
  • -

    source

    +

    source

    # TidierData.acrossMethod.

    across(variable[s], function[s])
    @@ -1047,7 +1047,7 @@ 

    Reference - Exported functions 4 │ d 4 14 1 11 5 15 5 │ e 5 15 1 11 5 15

    -

    source

    +

    source

    # TidierData.as_floatMethod.

    as_float(value)
    @@ -1068,7 +1068,7 @@ 

    Reference - Exported functionsjulia> as_float(missing) missing

    -

    source

    +

    source

    # TidierData.as_integerMethod.

    as_integer(value)
    @@ -1095,7 +1095,7 @@ 

    Reference - Exported functionsjulia> as_integer(missing) missing

    -

    source

    +

    source

    # TidierData.as_stringMethod.

    as_string(value)
    @@ -1116,7 +1116,7 @@ 

    Reference - Exported functionsjulia> as_string(missing) missing

    -

    source

    +

    source

    # TidierData.case_whenMethod.

    case_when(condition => return_value)
    @@ -1192,7 +1192,7 @@ 

    Reference - Exported functions 4 │ 4 3 5 │ 5 3

    -

    source

    +

    source

    # TidierData.descMethod.

    desc(col)
    @@ -1223,7 +1223,7 @@ 

    Reference - Exported functions 9 │ e 10 20 10 │ e 9 19

    -

    source

    +

    source

    # TidierData.ends_withMethod.

    ends_with(suffix)
    @@ -1249,7 +1249,7 @@ 

    Reference - Exported functions 4 4 24 5 5 25

    -

    source

    +

    source

    # TidierData.everythingMethod.

    everything()
    @@ -1275,7 +1275,7 @@ 

    Reference - Exported functions 4 24 4 14 5 25 5 15

    -

    source

    +

    source

    # TidierData.if_elseMethod.

    if_else(condition, yes, no, [miss])
    @@ -1343,7 +1343,7 @@ 

    Reference - Exported functions 4 │ 4 3 5 │ 5 3

    -

    source

    +

    source

    # TidierData.is_floatMethod.

    is_float(column::AbstractVector)
    @@ -1368,7 +1368,7 @@ 

    Reference - Exported functionsjulia> is_float(df.b) false

    -

    source

    +

    source

    # TidierData.is_integerMethod.

    is_integer(column::AbstractVector)
    @@ -1393,7 +1393,7 @@ 

    Reference - Exported functionsjulia> is_integer(df.d) false

    -

    source

    +

    source

    # TidierData.is_numberMethod.

    is_number(column::AbstractVector)
    @@ -1421,7 +1421,7 @@ 

    Reference - Exported functionsjulia> is_number(df.d) false

    -

    source

    +

    source

    # TidierData.is_stringMethod.

    is_string(column::AbstractVector)
    @@ -1446,7 +1446,7 @@ 

    Reference - Exported functionsjulia> is_string(df.c) false

    -

    source

    +

    source

    # TidierData.matchesMethod.

    matches(pattern, [flags])
    @@ -1500,7 +1500,7 @@ 

    Reference - Exported functions 4 4 14 5 5 15

    -

    source

    +

    source

    # TidierData.missing_ifMethod.

    missing_if(x, value)
    @@ -1530,7 +1530,7 @@ 

    Reference - Exported functions 3 │ 3 banana 4 │ missing cherry

    -

    source

    +

    source

    # TidierData.nMethod.

    n()
    @@ -1566,7 +1566,7 @@ 

    Reference - Exported functions 4 │ d 2 5 │ e 2

    -

    source

    +

    source

    # TidierData.ntileMethod.

    ntile(x, n::Integer)
    @@ -1636,7 +1636,7 @@ 

    Reference - Exported functions 7 │ 7 3 8 │ 8 3

    -

    source

    +

    source

    # TidierData.replace_missingMethod.

    replace_missing(x, replacement)
    @@ -1666,7 +1666,7 @@ 

    Reference - Exported functions 3 │ 3 35 4 │ 4 8

    -

    source

    +

    source

    # TidierData.row_numberMethod.

    row_number()
    @@ -1728,7 +1728,7 @@ 

    Reference - Exported functions 4 │ b 5 │ c

    -

    source

    +

    source

    # TidierData.starts_withMethod.

    starts_with(prefix)
    @@ -1754,7 +1754,7 @@ 

    Reference - Exported functions 4 4 14 5 5 15

    -

    source

    +

    source

    # TidierData.whereMethod.

    where(function)
    @@ -1822,7 +1822,7 @@ 

    Reference - Exported functions 4 │ d 11.0 26.0 41.0 5 │ e 14.0 29.0 44.0

    -

    source

    +

    source

    # TidierData.@anti_joinMacro.

    @anti_join(df1, df2, [by])
    @@ -1874,7 +1874,7 @@ 

    Reference - Exported functions─────┼─────────────── 1 │ b 2

    -

    source

    +

    source

    # TidierData.@arrangeMacro.

    @arrange(df, exprs...)
    @@ -1924,7 +1924,7 @@ 

    Reference - Exported functions 9 │ e 10 20 10 │ e 9 19

    -

    source

    +

    source

    # TidierData.@bind_colsMacro.

    @bind_cols(dfs...)
    @@ -1952,7 +1952,7 @@ 

    Reference - Exported functions 2 │ 2 2 5 5 8 8 3 │ 3 3 6 6 9 9

    -

    source

    +

    source

    # TidierData.@bind_rowsMacro.

    @bind_rows(dfs..., id)
    @@ -2020,7 +2020,7 @@ 

    Reference - Exported functions 8 │ 8 missing 8 3 9 │ 9 missing 9 3

    -

    source

    +

    source

    # TidierData.@countMacro.

    @count(df, exprs..., [wt], [sort])
    @@ -2096,7 +2096,7 @@ 

    Reference - Exported functions 3 │ c 7 4 │ a 6

    -

    source

    +

    source

    # TidierData.@distinctMacro.

    @distinct(df, exprs...)
    @@ -2169,7 +2169,7 @@ 

    Reference - Exported functions 9 │ e 4 19 10 │ e 5 20

    -

    source

    +

    source

    # TidierData.@drop_missingMacro.

    @drop_missing(df, [cols...])
    @@ -2229,7 +2229,7 @@ 

    Reference - Exported functions 2 │ 2 missing 3 │ 4 4

    -

    source

    +

    source

    # TidierData.@fill_missingMacro.

    @fill_missing(df, [columns...], direction)

    @@ -2301,7 +2301,7 @@

    Reference - Exported functions 2 │ 6.0 3.0 missing 3.0 b 3 │ 6.0 missing 6.0 missing b -

    source

    +

    source

    # TidierData.@filterMacro.

    @filter(df, exprs...)
    @@ -2346,7 +2346,7 @@ 

    Reference - Exported functions 1 │ a 1 11 2 │ c 3 13

    -

    source

    +

    source

    # TidierData.@full_joinMacro.

    @full_join(df1, df2, [by])
    @@ -2408,7 +2408,7 @@ 

    Reference - Exported functions 2 │ b 2 missing 3 │ c missing 4

    -

    source

    +

    source

    # TidierData.@glimpseMacro.

    -

    source

    +

    source

    # TidierData.@group_byMacro.

    @group_by(df, exprs...)
    @@ -2515,7 +2515,7 @@ 

    Reference - Exported functions 4 │ d 4.0 5 │ e 5.0

    -

    source

    +

    source

    # TidierData.@headMacro.

       @head(df, value)
    @@ -2583,7 +2583,7 @@ 

    Reference - Exported functionssource

    +

    source

    # TidierData.@inner_joinMacro.

    @inner_join(df1, df2, [by])
    @@ -2635,7 +2635,7 @@ 

    Reference - Exported functions─────┼────────────────────── 1 │ a 1 3

    -

    source

    +

    source

    # TidierData.@left_joinMacro.

    @left_join(df1, df2, [by])
    @@ -2692,7 +2692,7 @@ 

    Reference - Exported functions 1 │ a 1 3 2 │ b 2 missing

    -

    source

    +

    source

    # TidierData.@mutateMacro.

    @mutate(df, exprs...)
    @@ -2793,7 +2793,7 @@ 

    Reference - Exported functions 4 │ d 4 14 1 11 5 │ e 5 15 1 11

    -

    source

    +

    source

    # TidierData.@nestMacro.

    @nest(df, new_column = nesting_columns)
    @@ -2921,7 +2921,7 @@ 

    Reference - Exported functions 14 │ e 14 44 29 15 │ e 15 45 30

    -

    source

    +

    source

    # TidierData.@pivot_longerMacro.

    @pivot_longer(df, cols, [names_to], [values_to])

    @@ -2986,7 +2986,7 @@

    Reference - Exported functions 3 │ 1 B 2 4 │ 2 B 4

    -

    source

    +

    source

    # TidierData.@pivot_widerMacro.

    @pivot_wider(df, names_from, values_from[, values_fill])

    @@ -3031,7 +3031,7 @@

    Reference - Exported functions 1 │ 1 1 2 2 │ 2 0 4 -

    source

    +

    source

    # TidierData.@pullMacro.

    @pull(df, column)
    @@ -3061,7 +3061,7 @@ 

    Reference - Exported functions 4 5

    -

    source

    +

    source

    # TidierData.@relocateMacro.

    @relocate(df, columns, before = nothing, after = nothing)
    @@ -3123,7 +3123,7 @@ 

    Reference - Exported functions 4 │ 9 D 4 B 4 D 5 │ 10 E 5 C 5 E

    -

    source

    +

    source

    # TidierData.@renameMacro.

    @rename(df, exprs...)
    @@ -3150,7 +3150,7 @@ 

    Reference - Exported functions 4 │ d 4 14 5 │ e 5 15

    -

    source

    +

    source

    # TidierData.@rename_withMacro.

     @rename_with(df, fn, exprs...)
    @@ -3190,7 +3190,7 @@ 

    Reference - Exported functions 2 │ banana doc2 2 3 │ cherry doc3 3

    -

    source

    +

    source

    # TidierData.@right_joinMacro.

    @right_join(df1, df2, [by])
    @@ -3247,7 +3247,7 @@ 

    Reference - Exported functions 1 │ a 1 3 2 │ c missing 4

    -

    source

    +

    source

    # TidierData.@selectMacro.

    @select(df, exprs...)
    @@ -3423,7 +3423,7 @@ 

    Reference - Exported functions 4 │ 4 14 5 │ 5 15

    -

    source

    +

    source

    # TidierData.@semi_joinMacro.

    @semi_join(df1, df2, [by])
    @@ -3475,17 +3475,18 @@ 

    Reference - Exported functions─────┼─────────────── 1 │ a 1

    -

    source

    -

    # +

    source

    +

    # TidierData.@separateMacro.

    -

    @separate(df, From, Into, Separator)

    +

    @separate(df, from, into, sep, extra = "merge")

    Separate a string column into multiple new columns based on a specified delimiter.

    Arguments

    Examples

    -

    source

    +

    source

    # TidierData.@separate_rowsMacro.

    -
    separate_rows(df, columns..., delimiter)
    +
    separate_rows(df, columns..., sep)
     

    Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.

    Arguments

    • df: A DataFrame
    • columns: A column or multiple columns to be split. Can be a mix of integers and column names.
    • -
    • delimiter: The string or character or regular expression used to split the column values.
    • +
    • sep: The string or character or regular expression used to split the column values.

    Examples

    -

    source

    +

    source

    # TidierData.@sliceMacro.

    @slice(df, exprs...)
    @@ -3652,7 +3673,7 @@ 

    Reference - Exported functions 2 │ b 4 14 3 │ c 7 17

    -

    source

    +

    source

    # TidierData.@slice_headMacro.

    @slice_head(df; n, prop)
    @@ -3691,7 +3712,7 @@ 

    Reference - Exported functions 1 │ missing 0.3 0.2 2 │ 0.2 2.0 0.2

    -

    source

    +

    source

    # TidierData.@slice_maxMacro.

    @slice_max(df, column; with_ties = true, n, prop, missing_rm = true)
    @@ -3753,7 +3774,7 @@ 

    Reference - Exported functions 2 │ 6.0 7.0 6.0 3 │ 1.0 6.0 1.0

    -

    source

    +

    source

    # TidierData.@slice_minMacro.

    @slice_min(df, column; with_ties = true, n, prop, missing_rm = true)
    @@ -3815,7 +3836,7 @@ 

    Reference - Exported functions 2 │ missing 0.3 missing 3 │ 0.2 2.0 0.2

    -

    source

    +

    source

    # TidierData.@slice_sampleMacro.

    @slice_sample(df, [n = 1, prop, replace = false])
    @@ -3889,7 +3910,7 @@ 

    Reference - Exported functions 4 9 19 5 8 18

    -

    source

    +

    source

    # TidierData.@slice_tailMacro.

    @slice_tail(df; n, prop)
    @@ -3928,7 +3949,7 @@ 

    Reference - Exported functions 1 │ 5.0 7.0 5.0 2 │ 6.0 7.0 6.0

    -

    source

    +

    source

    # TidierData.@summariseMacro.

    @summarize(df, exprs...)
    @@ -3992,7 +4013,7 @@ 

    Reference - Exported functions─────┼────────────────────── 1 │ 1 11

    -

    source

    +

    source

    # TidierData.@summarizeMacro.

    @summarize(df, exprs...)
    @@ -4056,7 +4077,7 @@ 

    Reference - Exported functions─────┼────────────────────── 1 │ 1 11

    -

    source

    +

    source

    # TidierData.@summaryMacro.

       @summary(df, cols...)
    @@ -4081,7 +4102,7 @@ 

    Reference - Exported functions @summary(b:d) end;

    -

    source

    +

    source

    # TidierData.@tallyMacro.

    @tally(df, [wt], [sort])
    @@ -4159,7 +4180,7 @@ 

    Reference - Exported functions 3 │ c 7 4 │ a 6

    -

    source

    +

    source

    # TidierData.@transmuteMacro.

    @transmute(df, exprs...)
    @@ -4186,7 +4207,7 @@ 

    Reference - Exported functions 4 │ 18 5 │ 20

    -

    source

    +

    source

    # TidierData.@ungroupMacro.

    @ungroup(df)
    @@ -4230,10 +4251,10 @@ 

    Reference - Exported functions 4 │ d 4 14 5 │ e 5 15

    -

    source

    -

    # +

    source

    +

    # TidierData.@uniteMacro.

    - -

    source

    +

    source

    # TidierData.@unnest_longerMacro.

    -

    source

    +

    source

    # TidierData.@unnest_widerMacro.

    -
    @unnest_wider(df, columns, names_sep=)
    +
    @unnest_wider(df, columns, names_sep)
     

    Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns.

    Arguments

    @@ -4361,7 +4392,7 @@

    Reference - Exported functions 1 │ 1 1 2 5 6 2 │ 2 3 4 7 8

    -

    source

    +

    source

    Reference - Internal functions¤

    diff --git a/latest/search/search_index.json b/latest/search/search_index.json index 9775ca5..36d9602 100644 --- a/latest/search/search_index.json +++ b/latest/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":""},{"location":"#what-is-tidierdatajl","title":"What is TidierData.jl?","text":"

    TidierData.jl is a 100% Julia implementation of the dplyr and tidyr R packages. Powered by the DataFrames.jl package and Julia\u2019s extensive meta-programming capabilities, TidierData.jl is an R user\u2019s love letter to data analysis in Julia.

    TidierData.jl has two goals, which differentiate it from other data analysis meta-packages in Julia:

    Stick as closely to tidyverse syntax as possible.

    Whereas other meta-packages introduce Julia-centric idioms for working with DataFrames, this package\u2019s goal is to reimplement parts of tidyverse in Julia. This means that TidierData.jl uses tidy expressions as opposed to idiomatic Julia expressions. An example of a tidy expression is a = mean(b). In Julia, a and b are variables and are thus \"eagerly\" evaluated. This means that if b is merely referring to a column in a data frame and not an object in the global namespace, then an error will be generated because b was not found. In idiomatic Julia, b would need to be expressed as a symbol, or :b. Even then, a = mean(:b) would generate an error because it's not possible to calculate the mean value of a symbol. To handle this using idiomatic Julia, DataFrames.jl introduces a mini-language that relies heavily on the creation of anonymous functions, with explicit directional pairs syntax using a source => function => destination syntax. While this is quite elegant, it can be verbose. TidierData.jl aims to reduce this complexity by exposing an R-like syntax, which is then converted into valid DataFrames.jl code. The reason that tidy expressions are considered valid by Julia in TidierData.jl is because they are implemented using macros. Macros \"capture\" the expressions they are given, and then they can modify those expressions before evaluating them. For consistency, all top-level dplyr functions are implemented as macros (whether or not a macro is truly needed), and all \"helper\" functions (used inside of those top-level functions) are implemented as functions or pseudo-functions (functions which only exist through modification of the abstract syntax tree).

    Make broadcasting mostly invisible.

    Broadcasting trips up many R users switching to Julia because R users are used to most functions being vectorized. TidierData.jl currently uses a lookup table to decide which functions not to vectorize; all other functions are automatically vectorized. Read the documentation page on \"Autovectorization\" to read about how this works, and how to override the defaults. An example of where this issue commonly causes errors is when centering a variable. To create a new column a that centers the column b, TidierData.jl lets you simply write a = b - mean(b) exactly as you would in R. This works because TidierData.jl knows to not vectorize mean() while also recognizing that - should be vectorized such that this expression is rewritten in DataFrames.jl as :b => (b -> b .- mean(b)) => :a. For any user-defined function that you want to \"mark\" as being non-vectorized, you can prefix it with a ~. For example, a function new_mean(), if it had the same functionality as mean() would normally get vectorized by TidierData.jl unless you write it as ~new_mean().

    "},{"location":"#installation","title":"Installation","text":"

    For the stable version:

    ] add TidierData\n

    The ] character starts the Julia package manager. Press the backspace key to return to the Julia prompt.

    or

    using Pkg\nPkg.add(\"TidierData\")\n

    For the newest version:

    ] add TidierData#main\n

    or

    using Pkg\nPkg.add(url=\"https://github.com/TidierOrg/TidierData.jl\")\n

    "},{"location":"#what-macros-and-functions-does-tidierdatajl-support","title":"What macros and functions does TidierData.jl support?","text":"

    To support R-style programming, TidierData.jl is implemented using macros. This is because macros are able to \"capture\" the code before executing it, which allows the package to support R-like \"tidy expressions\" that would otherwise not be considered valid Julia code.

    TidierData.jl currently supports the following top-level macros:

    Top-level macros:

    • @glimpse() and @head()
    • @select() and @distinct()
    • @rename() and @rename_with()
    • @mutate() and @transmute()
    • @summarize() and @summarise()
    • @filter()
    • @slice(), @slice_sample(), @slice_min(), @slice_max(), @slice_head(), and @slice_tail()
    • @group_by() and @ungroup()
    • @arrange()
    • @relocate()
    • @pull()
    • @count() and @tally()
    • @left_join(), @right_join(), @inner_join(), @full_join(), @anti_join(), and @semi_join()
    • @bind_rows() and @bind_cols()
    • @pivot_wider() and @pivot_longer()
    • @separate(), @separate_rows(), and @unite()
    • @drop_missing() and @fill_missing()
    • @unnest_longer(), @unnest_wider(), and @nest()
    • @clean_names() (as in R's janitor::clean_names() function)
    • @summary() (as in R's summary() function)

    TidierData.jl also supports the following helper functions:

    Helper functions:

    • across()
    • where()
    • desc()
    • if_else() and case_when()
    • n() and row_number()
    • ntile()
    • lag() and lead()
    • everything(), starts_with(), ends_with(), matches(), and contains()
    • as_float(), as_integer(), and as_string()
    • is_number(), is_float(), is_integer(), and is_string()
    • missing_if() and replace_missing()

    See the Reference page for a detailed guide to each of the macros and functions.

    "},{"location":"#example","title":"Example","text":"

    Let's select the first five movies in our dataset whose budget exceeds the mean budget. Unlike in R, where we pass an na.rm = TRUE argument to remove missing values, in Julia we wrap the variable with a skipmissing() to remove the missing values before the mean() is calculated.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n\n@chain movies begin\n    @mutate(Budget = Budget / 1_000_000)\n    @filter(Budget >= mean(skipmissing(Budget)))\n    @select(Title, Budget)\n    @slice(1:5)\nend\n
    5\u00d72 DataFrame\n Row \u2502 Title                       Budget   \n     \u2502 String                      Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 'Til There Was You              23.0\n   2 \u2502 10 Things I Hate About You      16.0\n   3 \u2502 102 Dalmatians                  85.0\n   4 \u2502 13 Going On 30                  37.0\n   5 \u2502 13th Warrior, The               85.0\n

    "},{"location":"#whats-new","title":"What\u2019s new","text":"

    See NEWS.md for the latest updates.

    "},{"location":"#whats-missing","title":"What's missing","text":"

    Is there a tidyverse feature missing that you would like to see in TidierData.jl? Please file a GitHub issue. Because TidierData.jl primarily wraps DataFrames.jl, our decision to integrate a new feature will be guided by how well-supported it is within DataFrames.jl and how likely other users are to benefit from it.

    "},{"location":"reference/","title":"Reference","text":""},{"location":"reference/#index","title":"Index","text":"
    • TidierData.TidierData_set
    • TidierData.across
    • TidierData.as_float
    • TidierData.as_integer
    • TidierData.as_string
    • TidierData.case_when
    • TidierData.desc
    • TidierData.ends_with
    • TidierData.everything
    • TidierData.if_else
    • TidierData.is_float
    • TidierData.is_integer
    • TidierData.is_number
    • TidierData.is_string
    • TidierData.matches
    • TidierData.missing_if
    • TidierData.n
    • TidierData.ntile
    • TidierData.replace_missing
    • TidierData.row_number
    • TidierData.starts_with
    • TidierData.where
    • TidierData.@anti_join
    • TidierData.@arrange
    • TidierData.@bind_cols
    • TidierData.@bind_rows
    • TidierData.@count
    • TidierData.@distinct
    • TidierData.@drop_missing
    • TidierData.@fill_missing
    • TidierData.@filter
    • TidierData.@full_join
    • TidierData.@glimpse
    • TidierData.@group_by
    • TidierData.@head
    • TidierData.@inner_join
    • TidierData.@left_join
    • TidierData.@mutate
    • TidierData.@nest
    • TidierData.@pivot_longer
    • TidierData.@pivot_wider
    • TidierData.@pull
    • TidierData.@relocate
    • TidierData.@rename
    • TidierData.@rename_with
    • TidierData.@right_join
    • TidierData.@select
    • TidierData.@semi_join
    • TidierData.@separate
    • TidierData.@separate_rows
    • TidierData.@slice
    • TidierData.@slice_head
    • TidierData.@slice_max
    • TidierData.@slice_min
    • TidierData.@slice_sample
    • TidierData.@slice_tail
    • TidierData.@summarise
    • TidierData.@summarize
    • TidierData.@summary
    • TidierData.@tally
    • TidierData.@transmute
    • TidierData.@ungroup
    • TidierData.@unite
    • TidierData.@unnest_longer
    • TidierData.@unnest_wider
    "},{"location":"reference/#reference-exported-functions","title":"Reference - Exported functions","text":"

    # TidierData.TidierData_set \u2014 Method.

    TidierData_set(option::AbstractString, value::Bool)\n

    Set package options.

    Here are the supported options and what they do:

    • \"code\": Defaults to false. If set to true, this option displays the DataFrames.jl code generated by the TidierData.jl package. It is useful for debugging whether errors are introduced by TidierData.jl's generated code.

    Arguments

    • option: \"code\"
    • value: true or false

    source

    # TidierData.across \u2014 Method.

    across(variable[s], function[s])\n

    Apply functions to multiple variables. If specifying multiple variables or functions, surround them with parentheses so that they are recognized as a tuple.

    This function should only be called inside of TidierData.jl macros.

    Arguments

    • variable[s]: An unquoted variable, or if multiple, an unquoted tuple of variables.
    • function[s]: A function, or if multiple, a tuple of functions.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @summarize(across(b, minimum))\n       end\n1\u00d71 DataFrame\n Row \u2502 b_minimum \n     \u2502 Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n\njulia> @chain df begin\n         @summarize(across((b,c), (minimum, maximum)))\n       end\n1\u00d74 DataFrame\n Row \u2502 b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11          5         15\n\njulia> @chain df begin\n         @mutate(across((b,c), (minimum, maximum)))\n       end\n5\u00d77 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Char  Int64  Int64  Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   
1 \u2502 a         1     11          1         11          5         15\n   2 \u2502 b         2     12          1         11          5         15\n   3 \u2502 c         3     13          1         11          5         15\n   4 \u2502 d         4     14          1         11          5         15\n   5 \u2502 e         5     15          1         11          5         15\n\njulia> @chain df begin\n         @mutate(across((b, starts_with(\"c\")), (minimum, maximum)))\n       end\n5\u00d77 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Char  Int64  Int64  Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11          1         11          5         15\n   2 \u2502 b         2     12          1         11          5         15\n   3 \u2502 c         3     13          1         11          5         15\n   4 \u2502 d         4     14          1         11          5         15\n   5 \u2502 e         5     15          1         11          5         15\n

    source

    # TidierData.as_float \u2014 Method.

    as_float(value)\n

    Convert a number or string to a Float64 data type.

    This is a useful helper for type conversions. Missing values are propagated.

    Arguments

    • value: An AbstractString, Number, or missing value.

    Examples

    julia> as_float(1)\n1.0\n\njulia> as_float(\"1.5\")\n1.5\n\njulia> as_float(missing)\nmissing\n

    source

    # TidierData.as_integer \u2014 Method.

    as_integer(value)\n

    Convert a number or string to an Int64 data type.

    This is a useful helper for type conversions. Missing values are propagated. Any values after the decimal point are removed.

    Arguments

    • value: An AbstractString, Number, or missing value.

    Examples

    julia> as_integer(1)\n1\n\njulia> as_integer(1.5)\n1\n\njulia> as_integer(\"2\")\n2\n\njulia> as_integer(\"2.5\")\n2\n\njulia> as_integer(missing)\nmissing\n

    source

    # TidierData.as_string \u2014 Method.

    as_string(value)\n

    Convert a number or string to a String data type.

    This is a useful helper for type conversions. Missing values are propagated.

    Arguments

    • value: An AbstractString, Number, or missing value.

    Examples

    julia> as_string(1)\n\"1\"\n\njulia> as_string(1.5)\n\"1.5\"\n\njulia> as_string(missing)\nmissing\n

    source

    # TidierData.case_when \u2014 Method.

    case_when(condition => return_value)\ncase_when(condition_1 => return_value_1, condition_2 => return_value_2, ...)\n

    Return the corresponding return_value for the first condition that evaluates to true.

    The most specific condition should be listed first and the most general condition should be listed last. If none of the conditions evaluate to true, then a missing value is returned.

    Arguments

    • condition: A condition that evaluates to true, false, or missing.
    • return_value: The value to return if the condition is true.

    Examples

    julia> df = DataFrame(a = [1, 2, missing, 4, 5]);\n\njulia> @chain df begin\n         @mutate(b = case_when(a > 4  =>  \"hi\",\n                               a > 2  =>  \"medium\",\n                               a > 0  =>  \"low\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  low\n   2 \u2502       2  low\n   3 \u2502 missing  missing \n   4 \u2502       4  medium\n   5 \u2502       5  hi\n\njulia> @chain df begin\n         @mutate(b = case_when(a > 4  =>  \"hi\",\n                               a > 2  =>  \"medium\",\n                               a > 0  =>  \"low\",\n                               true   =>  \"unknown\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  low\n   2 \u2502       2  low\n   3 \u2502 missing  unknown\n   4 \u2502       4  medium\n   5 \u2502       5  hi\n\njulia> @chain df begin\n         @mutate(b = case_when(a >= 3  =>  3,\n                               true    =>  a))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   Int64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1        1\n   2 \u2502       2        2\n   3 \u2502 missing  missing \n   4 \u2502       4        3\n   5 \u2502       5        3\n\njulia> @chain df begin\n         @mutate(b = case_when(a >= 3        =>  3,\n                               ismissing(a)  =>  0,\n                               true          =>  a))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1      1\n   2 \u2502       2      2\n   3 \u2502 missing      0\n   4 \u2502       4      3\n   5 \u2502       5      3\n

    source

    # TidierData.desc \u2014 Method.

    desc(col)\n

    Orders the rows of a DataFrame column in descending order when used inside of @arrange(). This function should only be called inside of @arrange().

    Arguments

    • col: An unquoted column name.

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);\n\njulia> @chain df begin\n         @arrange(a, desc(b))\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2     12\n   2 \u2502 a         1     11\n   3 \u2502 b         4     14\n   4 \u2502 b         3     13\n   5 \u2502 c         6     16\n   6 \u2502 c         5     15\n   7 \u2502 d         8     18\n   8 \u2502 d         7     17\n   9 \u2502 e        10     20\n  10 \u2502 e         9     19\n

    source

    # TidierData.ends_with \u2014 Method.

    ends_with(suffix)\n

    Select all columns ending with the suffix.

    Arguments

    • suffix: A string.

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(ends_with(\"1\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    b_1   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     21\n   2 \u2502     2     22\n   3 \u2502     3     23\n   4 \u2502     4     24\n   5 \u2502     5     25\n

    source

    # TidierData.everything \u2014 Method.

    everything()\n

    Select all (remaining) columns.

    Arguments

    • None

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(b_1, everything())\n       end\n5\u00d73 DataFrame\n Row \u2502 b_1    a_1    a_2   \n     \u2502 Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    21      1     11\n   2 \u2502    22      2     12\n   3 \u2502    23      3     13\n   4 \u2502    24      4     14\n   5 \u2502    25      5     15\n

    source

    # TidierData.if_else \u2014 Method.

    if_else(condition, yes, no, [miss])\n

    Return the yes value if the condition is true and the no value if the condition is false. If miss is specified, then the provided miss value is returned when the condition contains a missing value. If miss is not specified, then the returned value is an explicit missing value.

    Arguments

    • condition: A condition that evaluates to true, false, or missing.
    • yes: Value to return if the condition is true.
    • no: Value to return if the condition is false.
    • miss: Optional. Value to return if the condition is missing.

    Examples

    julia> df = DataFrame(a = [1, 2, missing, 4, 5]);\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, \"yes\", \"no\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  no\n   2 \u2502       2  no\n   3 \u2502 missing  missing \n   4 \u2502       4  yes\n   5 \u2502       5  yes\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, \"yes\", \"no\", \"unknown\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  no\n   2 \u2502       2  no\n   3 \u2502 missing  unknown\n   4 \u2502       4  yes\n   5 \u2502       5  yes\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, 3, a))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1        1\n   2 \u2502       2        2\n   3 \u2502 missing  missing \n   4 \u2502       4        3\n   5 \u2502       5        3\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, 3, a, 0))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1      1\n   2 \u2502       2      2\n   3 \u2502 missing      0\n   4 \u2502       4      3\n   5 \u2502       5      3\n

    source

    # TidierData.is_float \u2014 Method.

    is_float(column::AbstractVector)\n

    Determine if the given column contains floating-point numbers.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains floating-point numbers, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_float(df.c)\ntrue\n\njulia> is_float(df.b)\nfalse\n

    source

    # TidierData.is_integer \u2014 Method.

    is_integer(column::AbstractVector)\n

    Determine if the given column contains integers.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains integers, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_integer(df.b)\ntrue\n\njulia> is_integer(df.d)\nfalse\n

    source

    # TidierData.is_number \u2014 Method.

    is_number(column::AbstractVector)\n

    Determine if the given column contains numbers.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains numbers, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_number(df.b)\ntrue\n\njulia> is_number(df.c)\ntrue\n\njulia> is_number(df.d)\nfalse\n

    source

    # TidierData.is_string \u2014 Method.

    is_string(column::AbstractVector)\n

    Determine if the given column contains strings.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains strings, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_string(df.d)\ntrue\n\njulia> is_string(df.c)\nfalse\n

    source

    # TidierData.matches \u2014 Method.

    matches(pattern, [flags])\n

    Select all columns matching the pattern.

    Arguments

    • pattern: A string.
    • flags: Optional string containing flags. \"i\" = Do case-insensitive pattern matching. \"m\" = Treat string as multiple lines. \"s\" = Treat string as a single line. \"x\" = Tells the regular expression parser to ignore most whitespace that is neither backslashed nor within a character class. You can use this to break up your regular expression into (slightly) more readable parts.

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(matches(\"^a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    a_2   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df begin \n         @select(matches(\"1$\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    b_1   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     21\n   2 \u2502     2     22\n   3 \u2502     3     23\n   4 \u2502     4     24\n   5 \u2502     5     25\n\njulia> @chain df begin \n         @select(matches(\"A\", \"i\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    a_2   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n

    source

    # TidierData.missing_if \u2014 Method.

    missing_if(x, value)\n

    Replace a specific value with missing in x.

    Arguments

    • x: The input value which can be of any type. If x is already missing or equals value, the function will return missing. Otherwise, it returns x unaltered.
    • value: The specific value to be checked against.

    Examples

    julia> df = DataFrame(\n              a = [1, missing, 3, 4],\n              b = [\"apple\", \"apple\", \"banana\", \"cherry\"]\n            );\n\njulia> @chain df begin\n         @mutate(a = missing_if(a, 4), \n                 b = missing_if(b, \"apple\"))\n       end\n4\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  missing \n   2 \u2502 missing  missing \n   3 \u2502       3  banana\n   4 \u2502 missing  cherry\n

    source

    # TidierData.n \u2014 Method.

    n()\n

    Return the number of rows in the DataFrame or in the group if used in the context of a GroupedDataFrame.

    Arguments

    • None

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);\n\njulia> @chain df begin\n         @summarize(n = n())\n       end\n1\u00d71 DataFrame\n Row \u2502 n     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    10\n\njulia> @chain df begin\n         @group_by(a)\n         @summarize(n = n())\n       end\n5\u00d72 DataFrame\n Row \u2502 a     n     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2\n   2 \u2502 b         2\n   3 \u2502 c         2\n   4 \u2502 d         2\n   5 \u2502 e         2\n

    source

    # TidierData.ntile \u2014 Method.

    ntile(x, n::Integer)\n

    Break the input vector into n equal-sized buckets.

    ntile() is a rough rank that breaks the input vector into n buckets. If length(x) is not an integer multiple of n, the size of the buckets will differ by up to one, with larger buckets coming first.

    Unlike other ranking functions, ntile() ignores ties: it will create evenly sized buckets even if the same value of x ends up in different buckets.

    Arguments

    • x: A vector to rank. By default, the smallest values will get the smallest ranks. Missing values will be given rank missing.
    • n: Number of groups to bucket into.

    Examples

    julia> x = [5,1,3,2,2, missing]\n6-element Vector{Union{Missing, Int64}}:\n 5\n 1\n 3\n 2\n 2\n  missing\n\njulia> ntile(x, 2)\n6-element Vector{Union{Missing, Int64}}:\n 2\n 1\n 2\n 1\n 1\n  missing\n\njulia> ntile(x, 4)\n6-element Vector{Union{Missing, Int64}}:\n 4\n 1\n 3\n 1\n 2\n  missing\n\njulia> ntile(1:8, 3)\n8-element Vector{Int64}:\n 1\n 1\n 1\n 2\n 2\n 2\n 3\n 3\n\njulia> df = DataFrame(a = 1:8);\n\njulia> @chain df begin\n       @mutate(buckets = ntile(a, 3))\n       end\n8\u00d72 DataFrame\n Row \u2502 a      buckets \n     \u2502 Int64  Int64   \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1\n   2 \u2502     2        1\n   3 \u2502     3        1\n   4 \u2502     4        2\n   5 \u2502     5        2\n   6 \u2502     6        2\n   7 \u2502     7        3\n   8 \u2502     8        3\n

    source

    # TidierData.replace_missing \u2014 Method.

    replace_missing(x, replacement)\n

    Replace missing values in x with a specified replacement value.

    Arguments

    • x: The input value which can be of any type. If x is missing, the function will return replacement. Otherwise, it returns x unaltered.
    • replacement: The value to replace missing with in x.

    Examples

    julia> df = DataFrame(\n              a = [1, missing, 3, 4],\n              b = [4, 5, missing, 8]\n            );\n\njulia> @chain df begin\n         @mutate(a = replace_missing(a, 100),\n                 b = replace_missing(b, 35))\n       end\n4\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      4\n   2 \u2502   100      5\n   3 \u2502     3     35\n   4 \u2502     4      8\n

    source

    # TidierData.row_number \u2014 Method.

    row_number()\n

    Return each row's number in a DataFrame or in the group if used in the context of a GroupedDataFrame.

    Arguments

    • None

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2));\n\njulia> @chain df begin\n         @mutate(row_num = row_number())\n       end\n10\u00d72 DataFrame\n Row \u2502 a     row_num \n     \u2502 Char  Int64   \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n   3 \u2502 b           3\n   4 \u2502 b           4\n   5 \u2502 c           5\n   6 \u2502 c           6\n   7 \u2502 d           7\n   8 \u2502 d           8\n   9 \u2502 e           9\n  10 \u2502 e          10\n\njulia> @chain df begin\n         @mutate(row_num = row_number() + 1)\n       end\n10\u00d72 DataFrame\n Row \u2502 a     row_num \n     \u2502 Char  Int64   \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           2\n   2 \u2502 a           3\n   3 \u2502 b           4\n   4 \u2502 b           5\n   5 \u2502 c           6\n   6 \u2502 c           7\n   7 \u2502 d           8\n   8 \u2502 d           9\n   9 \u2502 e          10\n  10 \u2502 e          11\n\njulia> @chain df begin\n         @filter(row_number() <= 5)\n       end\n5\u00d71 DataFrame\n Row \u2502 a    \n     \u2502 Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a\n   2 \u2502 a\n   3 \u2502 b\n   4 \u2502 b\n   5 \u2502 c\n

    source

    # TidierData.starts_with \u2014 Method.

    starts_with(prefix)\n

    Select all columns starting with the prefix.

    Arguments

    • prefix: A string.

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(starts_with(\"a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    a_2   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n

    source

    # TidierData.where \u2014 Method.

    where(function)\n

    Selects columns on which a function returns true for all values of the column.

    This function should only be called inside of TidierData.jl macros.

    Arguments

    • function: A predicate function (one that returns true or false).

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @select(where(is_number))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n\njulia> @chain df begin\n         @mutate(across(where(is_number), minimum))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum \n     \u2502 Char  Int64  Int64  Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11          1         11\n   2 \u2502 b         2     12          1         11\n   3 \u2502 c         3     13          1         11\n   4 \u2502 d         4     14          1         11\n   5 \u2502 e         5     15          1         11\n\njulia> df = DataFrame(a = repeat('a':'e', inner = 3),\n                      b = 1:15,\n                      c = 16:30,\n                      d = 31:45);\n\njulia> @chain df begin\n         @group_by(a)\n         @summarize(across(where(is_number), mean))\n       end\n5\u00d74 DataFrame\n Row \u2502 a     b_mean   c_mean   d_mean  \n     \u2502 Char  Float64  
Float64  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2.0     17.0     32.0\n   2 \u2502 b         5.0     20.0     35.0\n   3 \u2502 c         8.0     23.0     38.0\n   4 \u2502 d        11.0     26.0     41.0\n   5 \u2502 e        14.0     29.0     44.0\n

    source

    # TidierData.@anti_join \u2014 Macro.

    @anti_join(df1, df2, [by])\n

    Perform an anti-join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @anti_join(df1, df2)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, a = a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, \"a\" = \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n

    source

    # TidierData.@arrange \u2014 Macro.

    @arrange(df, exprs...)\n

    Order the rows of a DataFrame by the values of specified columns.

    Arguments

    • df: A DataFrame.
    • exprs...: Variables from the input DataFrame. Use desc() to sort in descending order. Multiple variables can be specified, separated by commas.

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);\n\njulia> @chain df begin\n         @arrange(a)\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b         3     13\n   4 \u2502 b         4     14\n   5 \u2502 c         5     15\n   6 \u2502 c         6     16\n   7 \u2502 d         7     17\n   8 \u2502 d         8     18\n   9 \u2502 e         9     19\n  10 \u2502 e        10     20\n\njulia> @chain df begin\n         @arrange(a, desc(b))\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2     12\n   2 \u2502 a         1     11\n   3 \u2502 b         4     14\n   4 \u2502 b         3     13\n   5 \u2502 c         6     16\n   6 \u2502 c         5     15\n   7 \u2502 d         8     18\n   8 \u2502 d         7     17\n   9 \u2502 e        10     20\n  10 \u2502 e         9     19\n

    source

    # TidierData.@bind_cols \u2014 Macro.

    @bind_cols(dfs...)\n

    Bind many DataFrames into one by column.

    Arguments

    • dfs...: DataFrames to combine.

    Examples

    julia> df1 = DataFrame(a=1:3, b=1:3);\n\njulia> df2 = DataFrame(a=4:6, b=4:6);\n\njulia> df3 = DataFrame(a=7:9, c=7:9);\n\njulia> @chain df1 begin\n         @bind_cols(df2, df3)\n       end\n3\u00d76 DataFrame\n Row \u2502 a      b      a_1    b_1    a_2    c     \n     \u2502 Int64  Int64  Int64  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      4      4      7      7\n   2 \u2502     2      2      5      5      8      8\n   3 \u2502     3      3      6      6      9      9\n

    source

    # TidierData.@bind_rows \u2014 Macro.

    @bind_rows(dfs..., id)\n

    Bind many DataFrames into one by row.

    Columns present in at least one of the provided DataFrames are kept. Columns not present in some DataFrames are filled with missing values where necessary.

    Arguments

    • dfs...: DataFrames to combine.
    • id: string DataFrame identifier. When id is supplied, a new column of numeric identifiers is created to link each row to its original DataFrame.

    Examples

    julia> df1 = DataFrame(a=1:3, b=1:3);\n\njulia> df2 = DataFrame(a=4:6, b=4:6);\n\njulia> df3 = DataFrame(a=7:9, c=7:9);\n\njulia> @chain df1 begin\n         @bind_rows(df2)\n       end\n6\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1\n   2 \u2502     2      2\n   3 \u2502     3      3\n   4 \u2502     4      4\n   5 \u2502     5      5\n   6 \u2502     6      6\n

    When columns are not present in some DataFrames, they are filled with missing values.

    julia> @chain df1 begin\n         @bind_rows(df2, df3)\n       end\n9\u00d73 DataFrame\n Row \u2502 a      b        c       \n     \u2502 Int64  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1  missing \n   2 \u2502     2        2  missing \n   3 \u2502     3        3  missing \n   4 \u2502     4        4  missing \n   5 \u2502     5        5  missing \n   6 \u2502     6        6  missing \n   7 \u2502     7  missing        7\n   8 \u2502     8  missing        8\n   9 \u2502     9  missing        9\n\njulia> @chain df1 begin\n         @bind_rows(df2, df3, id = \"id\")\n       end\n9\u00d74 DataFrame\n Row \u2502 a      b        c        id    \n     \u2502 Int64  Int64?   Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1  missing      1\n   2 \u2502     2        2  missing      1\n   3 \u2502     3        3  missing      1\n   4 \u2502     4        4  missing      2\n   5 \u2502     5        5  missing      2\n   6 \u2502     6        6  missing      2\n   7 \u2502     7  missing        7      3\n   8 \u2502     8  missing        8      3\n   9 \u2502     9  missing        9      3\n

    source

    # TidierData.@count \u2014 Macro.

    @count(df, exprs..., [wt], [sort])\n

    Count the unique values of one or more variables, with an optional weighting.

    @chain df @count(a, b) is roughly equivalent to @chain df @group_by(a, b) @summarize(n = n()). Supply wt to perform weighted counts, switching the summary from n = n() to n = sum(wt). Note that if grouping columns are provided, the result will be an ungrouped data frame, which is slightly different behavior than R's tidyverse.

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • exprs...: Column names, separated by commas.
    • wt: Optional parameter. Used to calculate a sum over the provided wt variable instead of counting the rows.
    • sort: Defaults to false. Whether the result should be sorted from highest to lowest n.

    Examples

    julia> df = DataFrame(a = vcat(repeat([\"a\"], inner = 3),\n                           repeat([\"b\"], inner = 3),\n                           repeat([\"c\"], inner = 1),\n                           missing),\n                      b = 1:8)\n8\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            1\n   2 \u2502 a            2\n   3 \u2502 a            3\n   4 \u2502 b            4\n   5 \u2502 b            5\n   6 \u2502 b            6\n   7 \u2502 c            7\n   8 \u2502 missing      8\n\njulia> @chain df @count()\n1\u00d71 DataFrame\n Row \u2502 n     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     8\n\njulia> @chain df begin\n         @count(a)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            3\n   2 \u2502 b            3\n   3 \u2502 c            1\n   4 \u2502 missing      1\n\njulia> @chain df begin\n         @count(a, wt = b)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            6\n   2 \u2502 b           15\n   3 \u2502 c            7\n   4 \u2502 missing      8\n\njulia> @chain df begin\n         @count(a, wt = b, sort = true)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  
Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           15\n   2 \u2502 missing      8\n   3 \u2502 c            7\n   4 \u2502 a            6       \n

    source

    # TidierData.@distinct \u2014 Macro.

    @distinct(df, exprs...)\n

    Return distinct rows of a DataFrame.

    If no columns or expressions are provided, then unique rows across all columns are returned. Otherwise, unique rows are determined based on the columns or expressions provided, and then all columns are returned.

    Arguments

    • df: A DataFrame.
    • exprs...: One or more unquoted variable names separated by commas. A range of adjacent variables can also be selected by position using the x:y syntax.

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = repeat(1:5, 2), c = 11:20);\n\njulia> @chain df @distinct()\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b         3     13\n   4 \u2502 b         4     14\n   5 \u2502 c         5     15\n   6 \u2502 c         1     16\n   7 \u2502 d         2     17\n   8 \u2502 d         3     18\n   9 \u2502 e         4     19\n  10 \u2502 e         5     20\n\njulia> @chain df @distinct(a)\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         3     13\n   3 \u2502 c         5     15\n   4 \u2502 d         2     17\n   5 \u2502 e         4     19\n\njulia> @chain df begin\n         @distinct(starts_with(\"a\"))\n       end\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         3     13\n   3 \u2502 c         5     15\n   4 \u2502 d         2     17\n   5 \u2502 e         4     19\n\njulia> @chain df begin\n         @distinct(a, b)\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b       
  3     13\n   4 \u2502 b         4     14\n   5 \u2502 c         5     15\n   6 \u2502 c         1     16\n   7 \u2502 d         2     17\n   8 \u2502 d         3     18\n   9 \u2502 e         4     19\n  10 \u2502 e         5     20\n

    source

    # TidierData.@drop_missing \u2014 Macro.

    @drop_missing(df, [cols...])\n

    Drop all rows with missing values.

    When called without arguments, @drop_missing() drops all rows with missing values in any column. If columns are provided as an optional argument, only missing values from named columns are considered when dropping rows.

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • cols...: An optional column, or multiple columns separated by commas or specified using selection helpers.

    Examples

    julia> df = DataFrame(\n              a = [1, 2, missing, 4],\n              b = [1, missing, 3, 4]\n            )\n4\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1        1\n   2 \u2502       2  missing \n   3 \u2502 missing        3\n   4 \u2502       4        4\n\njulia> @chain df @drop_missing()\n2\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1\n   2 \u2502     4      4\n\njulia> @chain df @drop_missing(a)\n3\u00d72 DataFrame\n Row \u2502 a      b       \n     \u2502 Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1\n   2 \u2502     2  missing \n   3 \u2502     4        4\n\njulia> @chain df @drop_missing(a, b)\n2\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1\n   2 \u2502     4      4\n\njulia> @chain df @drop_missing(starts_with(\"a\"))\n3\u00d72 DataFrame\n Row \u2502 a      b       \n     \u2502 Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1\n   2 \u2502     2  missing \n   3 \u2502     4        4\n

    source

    # TidierData.@fill_missing \u2014 Macro.

    @fill_missing(df, [columns...], direction)

    Fill missing values in a DataFrame df using the specified method.

    Arguments

    • df: The DataFrame or GroupedDataFrame in which you want to fill missing values.
    • columns: (Optional) The columns for which missing values need to be filled, separated by commas. If not provided, the operation is applied to all columns.
    • direction: A string containing the method to use for filling missing values. Options include: \"down\" (last observation carried forward) or \"up\" (next observation carried backward).

    Examples

    julia> df = DataFrame(\n          dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n          dt2 = [0.3, 2, missing, 3, missing, 5, 6,missing],\n          dt3 = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n          dt4 = [0.3, missing, missing, 3, missing, 5, 6, missing],\n          dt5 = ['a', 'b', 'a', 'b', 'a', 'a', 'a', 'b']);\n\njulia> @fill_missing(df, dt2, dt4, \"down\")\n8\u00d75 DataFrame\n Row \u2502 dt1        dt2       dt3        dt4       dt5  \n     \u2502 Float64?   Float64?  Float64?   Float64?  Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3  missing         0.3  a\n   2 \u2502       0.2       2.0        0.2       0.3  b\n   3 \u2502 missing         2.0  missing         0.3  a\n   4 \u2502 missing         3.0  missing         3.0  b\n   5 \u2502       1.0       3.0        1.0       3.0  a\n   6 \u2502 missing         5.0  missing         5.0  a\n   7 \u2502       5.0       6.0        5.0       6.0  a\n   8 \u2502       6.0       6.0        6.0       6.0  b\n\njulia> @chain df begin\n         @fill_missing(\"up\")\n       end\n8\u00d75 DataFrame\n Row \u2502 dt1       dt2        dt3       dt4        dt5  \n     \u2502 Float64?  Float64?   Float64?  Float64?   
Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      0.2        0.3       0.2        0.3  a\n   2 \u2502      0.2        2.0       0.2        3.0  b\n   3 \u2502      1.0        3.0       1.0        3.0  a\n   4 \u2502      1.0        3.0       1.0        3.0  b\n   5 \u2502      1.0        5.0       1.0        5.0  a\n   6 \u2502      5.0        5.0       5.0        5.0  a\n   7 \u2502      5.0        6.0       5.0        6.0  a\n   8 \u2502      6.0  missing         6.0  missing    b \n\njulia> @chain df begin\n         @group_by(dt5)\n         @fill_missing(dt1, \"up\")\n       end\nGroupedDataFrame with 2 groups based on key: dt5\nFirst Group (5 rows): dt5 = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)\n Row \u2502 dt1       dt2        dt3        dt4        dt5  \n     \u2502 Float64?  Float64?   Float64?   Float64?   Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      1.0        0.3  missing          0.3  a\n   2 \u2502      1.0  missing    missing    missing    a\n   3 \u2502      1.0  missing          1.0  missing    a\n   4 \u2502      5.0        5.0  missing          5.0  a\n   5 \u2502      5.0        6.0        5.0        6.0  a\n\u22ee\nLast Group (3 rows): dt5 = 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase)\n Row \u2502 dt1       dt2        dt3        dt4        dt5  \n     \u2502 Float64?  Float64?   Float64?   Float64?   
Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      0.2        2.0        0.2  missing    b\n   2 \u2502      6.0        3.0  missing          3.0  b\n   3 \u2502      6.0  missing          6.0  missing    b\n

    source

    # TidierData.@filter \u2014 Macro.

    @filter(df, exprs...)\n

    Subset a DataFrame and return a copy of the DataFrame where the specified conditions are satisfied.

    Arguments

    • df: A DataFrame.
    • exprs...: transformation(s) that produce vectors containing true or false.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @filter(b >= mean(b))\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 c         3     13\n   2 \u2502 d         4     14\n   3 \u2502 e         5     15\n\njulia> @chain df begin\n         @filter(b >= 3 && c >= 14)\n       end\n2\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 d         4     14\n   2 \u2502 e         5     15\n\njulia> @chain df begin\n         @filter(b in (1, 3))\n       end\n2\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 c         3     13\n

    source

    # TidierData.@full_join \u2014 Macro.

    @full_join(df1, df2, [by])\n

    Perform a full join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @full_join(df1, df2)\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, a)\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, a = a)\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, \"a\")\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, \"a\" = \"a\")\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n

    source

    # TidierData.@glimpse \u2014 Macro.

    @glimpse(df, width = 80)\n

    Preview a DataFrame (or GroupedDataFrame).

    The @glimpse macro is used to preview a DataFrame or GroupedDataFrame. Each column is printed on a separate row, along with its data type and first few elements, with the output truncated based on the width.

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • width: The width of the output, measured in the number of characters. Defaults to 80.

    Examples

    julia> df = DataFrame(\n               a = 1:100, \n               b = 1:100, \n               c = repeat([\"a\"], 100)\n               );\n\njulia> @chain df @glimpse\nRows: 100\nColumns: 3\n.a             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.b             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.c             String         a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,\n\njulia> @chain df begin\n       @group_by(a)\n       @glimpse()\n       end\nRows: 100\nColumns: 3\nGroups: a [100]\n.a             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.b             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.c             String         a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,\n

    source

    # TidierData.@group_by \u2014 Macro.

    @group_by(df, exprs...)\n

    Return a GroupedDataFrame where operations are performed by groups specified by unique combinations of values in exprs.

    Arguments

    • df: A DataFrame.
    • exprs...: DataFrame columns to group by or tidy expressions. Can be a single tidy expression or multiple expressions separated by commas.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @group_by(a)\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 a     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1.0\n   2 \u2502 b         2.0\n   3 \u2502 c         3.0\n   4 \u2502 d         4.0\n   5 \u2502 e         5.0  \n\njulia> @chain df begin\n         @group_by(d = uppercase(a))\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 d     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 A         1.0\n   2 \u2502 B         2.0\n   3 \u2502 C         3.0\n   4 \u2502 D         4.0\n   5 \u2502 E         5.0\n\njulia> @chain df begin\n         @group_by(-(b, c)) # same as `a`\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 a     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1.0\n   2 \u2502 b         2.0\n   3 \u2502 c         3.0\n   4 \u2502 d         4.0\n   5 \u2502 e         5.0\n\njulia> @chain df begin\n         @group_by(!(b, c)) # same as `a`\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 a     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1.0\n   2 \u2502 b         2.0\n   3 \u2502 c         3.0\n   4 \u2502 d         4.0\n   5 \u2502 e         5.0\n

    source

    # TidierData.@head \u2014 Macro.

    @head(df, value)\n

    Shows the first n rows of the data frame or of each group in a grouped data frame.

    Arguments

    • df: The data frame.
    • value: number of rows to be returned. Defaults to 6 if left blank.

    Examples

    julia> df = DataFrame(a = vcat(repeat([\"a\"], inner = 4),\n                                  repeat([\"b\"], inner = 4)),\n                             b = 1:8)\n8\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n   3 \u2502 a           3\n   4 \u2502 a           4\n   5 \u2502 b           5\n   6 \u2502 b           6\n   7 \u2502 b           7\n   8 \u2502 b           8\n\njulia> @head(df, 3)\n3\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n   3 \u2502 a           3\n\njulia> @head(df)\n6\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n   3 \u2502 a           3\n   4 \u2502 a           4\n   5 \u2502 b           5\n   6 \u2502 b           6\n\njulia> @chain df begin\n         @group_by a\n         @head 2\n       end\nGroupedDataFrame with 2 groups based on key: a\nFirst Group (2 rows): a = \"a\"\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n\u22ee\nLast Group (2 rows): a = \"b\"\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           5\n   2 \u2502 b           6\n

    source

    # TidierData.@inner_join \u2014 Macro.

    @inner_join(df1, df2, [by])\n

    Perform an inner join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @inner_join(df1, df2)\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, a)\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, a = a)\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, \"a\")\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, \"a\" = \"a\")\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n

    source

    # TidierData.@left_join \u2014 Macro.

    @left_join(df1, df2, [by])\n

    Perform a left join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @left_join(df1, df2)\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing \n\njulia> @left_join(df1, df2, a)\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n\njulia> @left_join(df1, df2, a = a)\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n\njulia> @left_join(df1, df2, \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n\njulia> @left_join(df1, df2, \"a\" = \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n

    source

    # TidierData.@mutate \u2014 Macro.

    @mutate(df, exprs...)\n

    Create new columns as functions of existing columns. The results have the same number of rows as df.

    Arguments

    • df: A DataFrame.
    • exprs...: add new columns or replace values of existing columns using new_variable = values syntax.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @mutate(d = b + c,\n                 b_minus_mean_b = b - mean(b))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      d      b_minus_mean_b \n     \u2502 Char  Int64  Int64  Int64  Float64        \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11     12            -2.0\n   2 \u2502 b         2     12     14            -1.0\n   3 \u2502 c         3     13     16             0.0\n   4 \u2502 d         4     14     18             1.0\n   5 \u2502 e         5     15     20             2.0\n\njulia> @chain df begin\n         @mutate begin\n           d = b + c\n           b_minus_mean_b = b - mean(b)\n         end\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      d      b_minus_mean_b \n     \u2502 Char  Int64  Int64  Int64  Float64        \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11     12            -2.0\n   2 \u2502 b         2     12     14            -1.0\n   3 \u2502 c         3     13     16             0.0\n   4 \u2502 d         4     14     18             1.0\n   5 \u2502 e         5     15     20             2.0\n\njulia> @chain df begin\n         @mutate(d = b in (1,3))\n       end\n5\u00d74 DataFrame\n Row \u2502 a     b      c      d     \n     \u2502 Char  Int64  Int64  Bool  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11   true\n   2 \u2502 b         2     12  false\n   3 \u2502 c         3     13   true\n   4 \u2502 d         4     14  false\n   5 \u2502 e         5     15  false\n\njulia> @chain df begin\n         @mutate(across((b, c), mean))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      b_mean   c_mean  \n     \u2502 Char  Int64  Int64  Float64  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11      3.0     13.0\n   2 \u2502 b         2     12      3.0     13.0\n   3 \u2502 c         3     13      3.0     13.0\n   4 \u2502 d         4     14      3.0     13.0\n   5 \u2502 e         5     15      3.0     13.0\n\njulia> @chain df begin\n         @summarize(across(contains(\"b\"), mean))\n       end\n1\u00d71 DataFrame\n Row \u2502 b_mean  \n     \u2502 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0\n\njulia> @chain df begin\n         @summarize(across(-contains(\"a\"), mean))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_mean   c_mean  \n     \u2502 Float64  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0     13.0\n\njulia> @chain df begin\n         @mutate(across(where(is_number), minimum))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum \n     \u2502 Char  Int64  Int64  Int64      Int64     
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11          1         11\n   2 \u2502 b         2     12          1         11\n   3 \u2502 c         3     13          1         11\n   4 \u2502 d         4     14          1         11\n   5 \u2502 e         5     15          1         11\n

    source

    # TidierData.@nest \u2014 Macro.

    @nest(df, new_column = nesting_columns)\n

    Multiple columns are nested into one or more new columns in a DataFrame.

    Arguments

    • df: A DataFrame
    • new_column: New column name
    • nesting_columns: Columns to be nested into the new_column

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 3),\n                      b = 1:15,\n                      c_1 = 16:30,\n                      c_2 = 31:45);\n\njulia> @nest(df, data = b:c_2)\n5\u00d72 DataFrame\n Row \u2502 a     data          \n     \u2502 Char  DataFrame     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     3\u00d73 DataFrame \n   2 \u2502 b     3\u00d73 DataFrame \n   3 \u2502 c     3\u00d73 DataFrame \n   4 \u2502 d     3\u00d73 DataFrame \n   5 \u2502 e     3\u00d73 DataFrame \n\njulia> @nest(df, data_1 = b, data_2 = starts_with(\"c\"))\n5\u00d73 DataFrame\n Row \u2502 a     data_1         data_2        \n     \u2502 Char  DataFrame      DataFrame     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     3\u00d71 DataFrame  3\u00d72 DataFrame \n   2 \u2502 b     3\u00d71 DataFrame  3\u00d72 DataFrame \n   3 \u2502 c     3\u00d71 DataFrame  3\u00d72 DataFrame \n   4 \u2502 d     3\u00d71 DataFrame  3\u00d72 DataFrame \n   5 \u2502 e     3\u00d71 DataFrame  3\u00d72 DataFrame \n\njulia> @chain df begin\n         @nest(data = b:c_2)\n         @unnest_longer(data)\n       end\n15\u00d72 DataFrame\n Row \u2502 a     data                         \n     \u2502 Char  NamedTup\u2026                    \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     (b = 1, c_1 = 16, c_2 = 31)\n   2 \u2502 a     (b = 2, c_1 = 17, c_2 = 32)\n   3 \u2502 a     (b = 3, c_1 = 18, c_2 = 33)\n   4 
\u2502 b     (b = 4, c_1 = 19, c_2 = 34)\n   5 \u2502 b     (b = 5, c_1 = 20, c_2 = 35)\n   6 \u2502 b     (b = 6, c_1 = 21, c_2 = 36)\n   7 \u2502 c     (b = 7, c_1 = 22, c_2 = 37)\n   8 \u2502 c     (b = 8, c_1 = 23, c_2 = 38)\n   9 \u2502 c     (b = 9, c_1 = 24, c_2 = 39)\n  10 \u2502 d     (b = 10, c_1 = 25, c_2 = 40)\n  11 \u2502 d     (b = 11, c_1 = 26, c_2 = 41)\n  12 \u2502 d     (b = 12, c_1 = 27, c_2 = 42)\n  13 \u2502 e     (b = 13, c_1 = 28, c_2 = 43)\n  14 \u2502 e     (b = 14, c_1 = 29, c_2 = 44)\n  15 \u2502 e     (b = 15, c_1 = 30, c_2 = 45)\n\njulia> @chain df begin\n         @nest(data = b:c_2)\n         @unnest_wider(data)\n       end\n5\u00d74 DataFrame\n Row \u2502 a     b             c_1           c_2          \n     \u2502 Char  Any           Any           Any          \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     [1, 2, 3]     [16, 17, 18]  [31, 32, 33]\n   2 \u2502 b     [4, 5, 6]     [19, 20, 21]  [34, 35, 36]\n   3 \u2502 c     [7, 8, 9]     [22, 23, 24]  [37, 38, 39]\n   4 \u2502 d     [10, 11, 12]  [25, 26, 27]  [40, 41, 42]\n   5 \u2502 e     [13, 14, 15]  [28, 29, 30]  [43, 44, 45]\n\njulia> @chain df begin\n         @nest(data = -a)\n         @unnest_wider(data) # wider first\n         @unnest_longer(-a)  # then longer\n       end\n15\u00d74 DataFrame\n Row \u2502 a     b      c_1    c_2   \n     \u2502 Char  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     16     31\n   2 \u2502 a         2     17     32\n   3 \u2502 a         3     18     
33\n   4 \u2502 b         4     19     34\n   5 \u2502 b         5     20     35\n   6 \u2502 b         6     21     36\n   7 \u2502 c         7     22     37\n   8 \u2502 c         8     23     38\n   9 \u2502 c         9     24     39\n  10 \u2502 d        10     25     40\n  11 \u2502 d        11     26     41\n  12 \u2502 d        12     27     42\n  13 \u2502 e        13     28     43\n  14 \u2502 e        14     29     44\n  15 \u2502 e        15     30     45\n\njulia> @chain df begin\n         @nest(data = -a)\n         @unnest_longer(data) # longer first\n         @unnest_wider(-a)    # then wider\n       end\n15\u00d74 DataFrame\n Row \u2502 a     b      c_2    c_1   \n     \u2502 Char  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     31     16\n   2 \u2502 a         2     32     17\n   3 \u2502 a         3     33     18\n   4 \u2502 b         4     34     19\n   5 \u2502 b         5     35     20\n   6 \u2502 b         6     36     21\n   7 \u2502 c         7     37     22\n   8 \u2502 c         8     38     23\n   9 \u2502 c         9     39     24\n  10 \u2502 d        10     40     25\n  11 \u2502 d        11     41     26\n  12 \u2502 d        12     42     27\n  13 \u2502 e        13     43     28\n  14 \u2502 e        14     44     29\n  15 \u2502 e        15     45     30\n

    source

    # TidierData.@pivot_longer \u2014 Macro.

    @pivot_longer(df, cols, [names_to], [values_to])

    Reshapes the DataFrame to make it longer, increasing the number of rows and reducing the number of columns.

    Arguments

    • df: A DataFrame.
    • cols: Columns to pivot into longer format. Multiple columns can be selected but providing tuples of columns is not yet supported.
    • names_to: Optional, defaults to variable. The name of the newly created column whose values will contain the input DataFrame's column names.
    • values_to: Optional, defaults to value. The name of the newly created column containing the input DataFrame's cell values.

    Examples

    julia> df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4]);\n\njulia> @pivot_longer(df_wide, A:B)\n4\u00d73 DataFrame\n Row \u2502 id     variable  value \n     \u2502 Int64  String    Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A             1\n   2 \u2502     2  A             3\n   3 \u2502     1  B             2\n   4 \u2502     2  B             4\n\njulia> @pivot_longer(df_wide, -id)\n4\u00d73 DataFrame\n Row \u2502 id     variable  value \n     \u2502 Int64  String    Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A             1\n   2 \u2502     2  A             3\n   3 \u2502     1  B             2\n   4 \u2502     2  B             4\n\njulia> @pivot_longer(df_wide, A:B, names_to = \"letter\", values_to = \"number\")\n4\u00d73 DataFrame\n Row \u2502 id     letter  number \n     \u2502 Int64  String  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A            1\n   2 \u2502     2  A            3\n   3 \u2502     1  B            2\n   4 \u2502     2  B            4\n\njulia> @pivot_longer(df_wide, A:B, names_to = letter, values_to = number)\n4\u00d73 DataFrame\n Row \u2502 id     letter  number \n     \u2502 Int64  String  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A            1\n   2 \u2502     2  A            3\n   3 \u2502     1  B            2\n   4 \u2502     2  B            4\n\njulia> @pivot_longer(df_wide, A:B, 
names_to = \"letter\")\n4\u00d73 DataFrame\n Row \u2502 id     letter  value \n     \u2502 Int64  String  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A           1\n   2 \u2502     2  A           3\n   3 \u2502     1  B           2\n   4 \u2502     2  B           4\n

    source

    # TidierData.@pivot_wider \u2014 Macro.

    @pivot_wider(df, names_from, values_from[, values_fill])

    Reshapes the DataFrame to make it wider, increasing the number of columns and reducing the number of rows.

    Arguments

    • df: A DataFrame.
    • names_from: The name of the column to get the name of the output columns from.
    • values_from: The name of the column to get the cell values from.
    • values_fill: The value to replace a missing name/value combination (default is missing)

    Examples

    julia> df_long = DataFrame(id = [1, 1, 2, 2],\n                           variable = [\"A\", \"B\", \"A\", \"B\"],\n                           value = [1, 2, 3, 4]);\n\njulia> df_long_missing = DataFrame(id = [1, 1, 2],\n                           variable = [\"A\", \"B\", \"B\"],\n                           value = [1, 2, 4]);\n\njulia> @pivot_wider(df_long, names_from = variable, values_from = value)\n2\u00d73 DataFrame\n Row \u2502 id     A       B      \n     \u2502 Int64  Int64?  Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1       1       2\n   2 \u2502     2       3       4\n\njulia> @pivot_wider(df_long, names_from = \"variable\", values_from = \"value\")\n2\u00d73 DataFrame\n Row \u2502 id     A       B      \n     \u2502 Int64  Int64?  Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1       1       2\n   2 \u2502     2       3       4\n\njulia> @pivot_wider(df_long_missing, names_from = variable, values_from = value, values_fill = 0)\n2\u00d73 DataFrame\n Row \u2502 id     A      B     \n     \u2502 Int64  Int64  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      2\n   2 \u2502     2      0      4\n

    source

    # TidierData.@pull \u2014 Macro.

    @pull(df, column)\n

    Pull (or extract) a column as a vector.

    Arguments

    • df: A DataFrame.
    • column: A single column, referred to either by its name or number.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df @pull(a)\n5-element Vector{Char}:\n 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)\n 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase)\n 'c': ASCII/Unicode U+0063 (category Ll: Letter, lowercase)\n 'd': ASCII/Unicode U+0064 (category Ll: Letter, lowercase)\n 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)\n\njulia> @chain df @pull(2)\n5-element Vector{Int64}:\n 1\n 2\n 3\n 4\n 5\n

    source

    # TidierData.@relocate \u2014 Macro.

    @relocate(df, columns, before = nothing, after = nothing)\n

    Rearranges the columns of a data frame. This function allows for moving specified columns to a new position within the data frame, either before or after a given target column. The columns, before, and after arguments all accept tidy selection functions. Only one of before or after should be specified. If neither is specified, the selected columns will be moved to the beginning of the data frame.

    Arguments

    • df: The data frame.
    • columns: Column or columns to be moved.
    • before: (Optional) Column or columns before which the specified columns will be moved. If not provided or nothing, this argument is ignored.
    • after: (Optional) Column or columns after which the specified columns will be moved. If not provided or nothing, this argument is ignored.

    Examples

    julia> df = DataFrame(A = 1:5, B = 6:10, C = [\"A\", \"b\", \"C\", \"D\", \"E\"], D = ['A', 'B','A', 'B','C'],\n                      E = 1:5, F = [\"A\", \"b\", \"C\", \"D\", \"E\"]);\n\njulia> @relocate(df, where(is_string), before = where(is_integer))\n5\u00d76 DataFrame\n Row \u2502 C       F       A      B      E      D    \n     \u2502 String  String  Int64  Int64  Int64  Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 A       A           1      6      1  A\n   2 \u2502 b       b           2      7      2  B\n   3 \u2502 C       C           3      8      3  A\n   4 \u2502 D       D           4      9      4  B\n   5 \u2502 E       E           5     10      5  C\n\n\njulia> @relocate(df, B, C, D, after = E)\n5\u00d76 DataFrame\n Row \u2502 A      E      B      C       D     F      \n     \u2502 Int64  Int64  Int64  String  Char  String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      6  A       A     A\n   2 \u2502     2      2      7  b       B     b\n   3 \u2502     3      3      8  C       A     C\n   4 \u2502     4      4      9  D       B     D\n   5 \u2502     5      5     10  E       C     E\n\njulia> @relocate(df, B, C, D, after = starts_with(\"E\"))\n5\u00d76 DataFrame\n Row \u2502 A      E      B      C       D     F      \n     \u2502 Int64  Int64  Int64  String  Char  String 
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      6  A       A     A\n   2 \u2502     2      2      7  b       B     b\n   3 \u2502     3      3      8  C       A     C\n   4 \u2502     4      4      9  D       B     D\n   5 \u2502     5      5     10  E       C     E\n\njulia> @relocate(df, B:C) # bring columns to the front\n5\u00d76 DataFrame\n Row \u2502 B      C       A      D     E      F      \n     \u2502 Int64  String  Int64  Char  Int64  String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     6  A           1  A         1  A\n   2 \u2502     7  b           2  B         2  b\n   3 \u2502     8  C           3  A         3  C\n   4 \u2502     9  D           4  B         4  D\n   5 \u2502    10  E           5  C         5  E\n

    source

    # TidierData.@rename \u2014 Macro.

    @rename(df, exprs...)\n

    Change the names of individual columns in a DataFrame. Users can also use @select() to rename and select columns.

    Arguments

    • df: A DataFrame.
    • exprs...: Use new_name = old_name syntax to rename selected columns.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @rename(d = b, e = c)\n       end\n5\u00d73 DataFrame\n Row \u2502 a     d      e     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         2     12\n   3 \u2502 c         3     13\n   4 \u2502 d         4     14\n   5 \u2502 e         5     15\n

    source

    # TidierData.@rename_with \u2014 Macro.

     @rename_with(df, fn, exprs...)\n

    Renames the chosen column names using a function

    Arguments

    • df: a DataFrame
    • fn: desired function to apply to the column names (such as str_remove_all from TidierStrings)
    • exprs: One or more unquoted variable names separated by commas. Variable names

    can also be used as their positions in the data, like x:y, to select a range of variables. Variable names can also be chosen with starts_with(). Defaults to all columns if empty.

    Examples

    julia> function str_remove_all(column, pattern::String)\n         if ismissing(column)\n             return column\n         end\n         patterns = split(pattern, '|')\n         for p in patterns\n             column = replace(column, strip(p) => \"\")\n         end\n         return column\n       end;\n\njulia> df = DataFrame(\n              term_a = [\"apple\", \"banana\", \"cherry\"],\n              document_a = [\"doc_1\", \"doc2\", \"doc3\"],\n              _n_ = [1, 2, 3]\n            ); \n\njulia> @rename_with(df, str -> str_remove_all(str, \"_a\"), !term_a)\n3\u00d73 DataFrame\n Row \u2502 term_a  document  _n_   \n     \u2502 String  String    Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 apple   doc_1         1\n   2 \u2502 banana  doc2          2\n   3 \u2502 cherry  doc3          3\n

    source

    # TidierData.@right_join \u2014 Macro.

    @right_join(df1, df2, [by])\n

    Perform a right join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @right_join(df1, df2)\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, a)\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, a = a)\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, \"a\" = \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   
Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n

    source

    # TidierData.@select \u2014 Macro.

    @select(df, exprs...)\n

    Select variables in a DataFrame.

    Arguments

    • df: A DataFrame.
    • exprs...: One or more unquoted variable names separated by commas. Variable names can also be used as their positions in the data, like x:y, to select a range of variables.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df @select(a, b, c)\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         2     12\n   3 \u2502 c         3     13\n   4 \u2502 d         4     14\n   5 \u2502 e         5     15\n\njulia> @chain df @select(a:b)\n5\u00d72 DataFrame\n Row \u2502 a     b     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1\n   2 \u2502 b         2\n   3 \u2502 c         3\n   4 \u2502 d         4\n   5 \u2502 e         5\n\njulia> @chain df @select(1:2)\n5\u00d72 DataFrame\n Row \u2502 a     b     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1\n   2 \u2502 b         2\n   3 \u2502 c         3\n   4 \u2502 d         4\n   5 \u2502 e         5\n\njulia> @chain df @select(-(a:b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(!(a:b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(-(a, b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 
\u2502    15\n\njulia> @chain df @select(!(a, b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df begin\n         @select(contains(\"b\"), starts_with(\"c\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df @select(-(1:2))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(!(1:2))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(-c)\n5\u00d72 DataFrame\n Row \u2502 a     b     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1\n   2 \u2502 b         2\n   3 \u2502 c         3\n   4 \u2502 d         4\n   5 \u2502 e         5\n\njulia> @chain df begin\n         @select(-contains(\"a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     
15\n\njulia> @chain df begin\n         @select(!contains(\"a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df begin\n         @select(where(is_number))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n

    source

    # TidierData.@semi_join \u2014 Macro.

    @semi_join(df1, df2, [by])\n

    Perform a semi-join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @semi_join(df1, df2)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, a = a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, \"a\" = \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n

    source

    # TidierData.@separate \u2014 Macro.

    @separate(df, From, Into, Separator)

    Separate a string column into multiple new columns based on a specified delimiter.

    Arguments

    • df: A DataFrame
    • From: Column that will be split
    • Into: New column names, supports [] or ()
    • Separator: the string or character on which to split

    Examples

    julia> df = DataFrame(a = [\"1-1\", \"2-2\", \"3-3-3\"]);\n\njulia> @separate(df, a, [b, c, d], \"-\")\n3\u00d73 DataFrame\n Row \u2502 b          c          d          \n     \u2502 SubStrin\u2026  SubStrin\u2026  SubStrin\u2026? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1          1          missing    \n   2 \u2502 2          2          missing    \n   3 \u2502 3          3          3\n\njulia> @chain df begin\n         @separate(a, (b, c, d), \"-\")\n       end\n3\u00d73 DataFrame\n Row \u2502 b          c          d          \n     \u2502 SubStrin\u2026  SubStrin\u2026  SubStrin\u2026? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1          1          missing    \n   2 \u2502 2          2          missing    \n   3 \u2502 3          3          3\n

    source

    # TidierData.@separate_rows \u2014 Macro.

    @separate_rows(df, columns..., delimiter)\n

    Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.

    Arguments

    • df: A DataFrame
    • columns: A column or multiple columns to be split. Can be a mix of integers and column names.
    • delimiter: The string or character or regular expression used to split the column values.

    Examples

    julia> df = DataFrame(a = 1:3,\n                      b = [\"a\", \"aa;bb;cc\", \"dd;ee\"],\n                      c = [\"1\", \"2;3;4\", \"5;6\"],\n                      d = [\"7\", \"8;9;10\", \"11;12\"])\n3\u00d74 DataFrame\n Row \u2502 a      b         c       d      \n     \u2502 Int64  String    String  String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  a         1       7\n   2 \u2502     2  aa;bb;cc  2;3;4   8;9;10\n   3 \u2502     3  dd;ee     5;6     11;12\n\njulia> @separate_rows(df, 2, 4, \";\" )\n6\u00d74 DataFrame\n Row \u2502 a      b          c       d         \n     \u2502 Int64  SubStrin\u2026  String  SubStrin\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  a          1       7\n   2 \u2502     2  aa         2;3;4   8\n   3 \u2502     2  bb         2;3;4   9\n   4 \u2502     2  cc         2;3;4   10\n   5 \u2502     3  dd         5;6     11\n   6 \u2502     3  ee         5;6     12\n\njulia> @separate_rows(df, b:d, \";\" )\n6\u00d74 DataFrame\n Row \u2502 a      b          c          d         \n     \u2502 Int64  SubStrin\u2026  SubStrin\u2026  SubStrin\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  a          1          7\n   2 \u2502     2  aa         2          8\n   3 \u2502     2  bb         3          9\n   4 \u2502     2  cc         4     
     10\n   5 \u2502     3  dd         5          11\n   6 \u2502     3  ee         6          12\n

    source

    # TidierData.@slice \u2014 Macro.

    @slice(df, exprs...)\n

    Select, remove or duplicate rows by indexing their integer positions.

    Arguments

    • df: A DataFrame.
    • exprs...: integer row values. Use positive values to keep the rows, or negative values to drop. Values provided must be either all positive or all negative, and they must be within the range of DataFrames' row numbers.

    Examples

    julia> df = DataFrame(a = repeat('a':'c', inner = 3), b = 1:9, c = 11:19);\n\njulia> @chain df @slice(1:5)\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 a         3     13\n   4 \u2502 b         4     14\n   5 \u2502 b         5     15\n\njulia> @chain df @slice(-(1:2))\n7\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         3     13\n   2 \u2502 b         4     14\n   3 \u2502 b         5     15\n   4 \u2502 b         6     16\n   5 \u2502 c         7     17\n   6 \u2502 c         8     18\n   7 \u2502 c         9     19\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(1)\n         @ungroup\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         4     14\n   3 \u2502 c         7     17\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(n())\n         @ungroup\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         3     13\n   2 \u2502 b         6     16\n   3 \u2502 c         9     19\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(-n())\n         @ungroup\n       
end\n6\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b         4     14\n   4 \u2502 b         5     15\n   5 \u2502 c         7     17\n   6 \u2502 c         8     18\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(-(2:n()))\n         @ungroup\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         4     14\n   3 \u2502 c         7     17\n

    source

    # TidierData.@slice_head \u2014 Macro.

    @slice_head(df; n, prop)\n

    Retrieve rows from the beginning of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of rows at the beginning of the dataframe to retrieve. Defaults to 1.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_head(n = 3)\n       end \n3\u00d73 DataFrame\n Row \u2502 a          b          c        \n     \u2502 Float64?   Float64?   Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing          0.3       0.2\n   2 \u2502       0.2        2.0       0.2\n   3 \u2502 missing    missing         0.2\n\njulia> @chain df begin\n         @slice_head(prop = 0.25)\n       end \n2\u00d73 DataFrame\n Row \u2502 a          b         c        \n     \u2502 Float64?   Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3       0.2\n   2 \u2502       0.2       2.0       0.2\n

    source

    # TidierData.@slice_max \u2014 Macro.

    @slice_max(df, column; with_ties = true, n, prop, missing_rm = true)\n

    Retrieve rows with the maximum value(s) from the specified column of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • column: The column for which to slice the maximum values.
    • with_ties: Whether or not all ties will be shown, defaults to true. When false it will only show the first row.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
    • missing_rm: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_max(b)\n       end \n2\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n\njulia> @chain df begin\n         @slice_max(b, with_ties = false)\n       end \n1\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n\njulia> @chain df begin\n         @slice_max(b, n = 3)\n       end \n3\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n   3 \u2502      1.0       6.0       1.0\n\njulia> @chain df begin\n         @slice_max(b, prop = 0.5, missing_rm = true)\n       end\n3\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? 
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n   3 \u2502      1.0       6.0       1.0\n

    source

    # TidierData.@slice_min \u2014 Macro.

    @slice_min(df, column; with_ties = true, n, prop, missing_rm = true)\n

    Retrieve rows with the minimum value(s) from the specified column of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • column: The column for which to slice the minimum values.
    • with_ties: Whether or not all ties will be shown, defaults to true and shows all ties. When false it will only show the first row.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
    • missing_rm: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_min(b)\n       end \n2\u00d73 DataFrame\n Row \u2502 a         b         c         \n     \u2502 Float64?  Float64?  Float64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502  missing       0.3        0.2\n   2 \u2502  missing       0.3  missing\n\njulia> @chain df begin\n         @slice_min(b, with_ties = false)\n       end \n1\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502  missing       0.3       0.2\n\njulia> @chain df begin\n         @slice_min(b, n = 3)\n       end\n3\u00d73 DataFrame\n Row \u2502 a          b         c         \n     \u2502 Float64?   Float64?  Float64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3        0.2\n   2 \u2502 missing         0.3  missing   \n   3 \u2502       0.2       2.0        0.2  \n\njulia> @chain df begin\n         @slice_min(b, prop = 0.5, missing_rm = true)\n       end\n3\u00d73 DataFrame\n Row \u2502 a          b         c         \n     \u2502 Float64?   Float64?  Float64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3        0.2\n   2 \u2502 missing         0.3  missing   \n   3 \u2502       0.2       2.0        0.2\n

    source

    # TidierData.@slice_sample \u2014 Macro.

    @slice_sample(df, [n = 1, prop, replace = false])\n

    Randomly sample rows from a DataFrame df or from each group in a GroupedDataFrame. The default is to return 1 row. Either the number of rows (n) or the proportion of rows (prop) should be provided as a keyword argument.

    Arguments

    • df: The source data frame or grouped data frame from which to sample rows.
    • n: The number of rows to sample. Defaults to 1.
    • prop: The proportion of rows to sample.
    • replace: Whether to sample with replacement. Defaults to false.

    Examples

    julia> df = DataFrame(a = 1:10, b = 11:20);\n\njulia> using StableRNGs, Random\n\njulia> rng = StableRNG(1);\n\njulia> Random.seed!(rng, 1);\n\njulia> @chain df begin \n         @slice_sample(n = 5)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     6     16\n   2 \u2502     1     11\n   3 \u2502     5     15\n   4 \u2502     4     14\n   5 \u2502     8     18\n\njulia> @chain df begin \n         @slice_sample(n = 5, replace = true)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     7     17\n   2 \u2502     2     12\n   3 \u2502     1     11\n   4 \u2502     4     14\n   5 \u2502     2     12\n\njulia> @chain df begin \n         @slice_sample(prop = 0.5)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     6     16\n   2 \u2502     7     17\n   3 \u2502     5     15\n   4 \u2502     9     19\n   5 \u2502     2     12\n\njulia> @chain df begin \n         @slice_sample(prop = 0.5, replace = true)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    10     20\n   2 \u2502     4     14\n   3 \u2502     9     19\n   4 \u2502     9     19\n   5 \u2502     8     18\n

    source

    # TidierData.@slice_tail \u2014 Macro.

    @slice_tail(df; n, prop)\n

    Retrieve rows from the end of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of rows at the end of the dataframe to retrieve. Defaults to 1.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_tail(n = 3)\n       end \n3\u00d73 DataFrame\n Row \u2502 a          b         c         \n     \u2502 Float64?   Float64?  Float64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         5.0  missing   \n   2 \u2502       5.0       7.0        5.0\n   3 \u2502       6.0       7.0        6.0\n\njulia> @chain df begin\n         @slice_tail(prop = 0.25)\n       end \n2\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n

    source

    # TidierData.@summarise \u2014 Macro.

    @summarize(df, exprs...)\n@summarise(df, exprs...)\n

    Create a new DataFrame with one row that aggregates all observations from the input DataFrame or GroupedDataFrame.

    Arguments

    • df: A DataFrame.
    • exprs...: a new_variable = function(old_variable) pair. function() should be an aggregate function that returns a single value.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @summarize(mean_b = mean(b),\n                    median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize begin\n           mean_b = mean(b)\n           median_b = median(b)\n         end\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0 \n\njulia> @chain df begin\n         @summarise(mean_b = mean(b), median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize(across((b,c), (minimum, maximum)))\n       end\n1\u00d74 DataFrame\n Row \u2502 b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11          5         15\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n

    source

    # TidierData.@summarize \u2014 Macro.

    @summarize(df, exprs...)\n@summarise(df, exprs...)\n

    Create a new DataFrame with one row that aggregates all observations from the input DataFrame or GroupedDataFrame.

    Arguments

    • df: A DataFrame.
    • exprs...: a new_variable = function(old_variable) pair. function() should be an aggregate function that returns a single value.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @summarize(mean_b = mean(b),\n                    median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize begin\n           mean_b = mean(b)\n           median_b = median(b)\n         end\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0 \n\njulia> @chain df begin\n         @summarise(mean_b = mean(b), median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize(across((b,c), (minimum, maximum)))\n       end\n1\u00d74 DataFrame\n Row \u2502 b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11          5         15\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n

    source

    # TidierData.@summary \u2014 Macro.

       @summary(df, cols...)\n

    For numerical columns, returns a dataframe with the Q1, Q3, min, max, mean, median, and number of missing values.

    Arguments

    • 'df': A DataFrame
    • cols: columns on which summary will be performed. This is an optional argument, without which summary will be performed on all numerical columns

    Examples

    julia> df = DataFrame(a = [1, 2, 3, 4, 5],\n                      b = [missing, 7, 8, 9, 10],\n                      c = [11, missing, 13, 14, missing],\n                      d = [16, 17, 18, 19, 20]);\n\njulia> @summary(df);\n\njulia> @summary(df, (b:d));\n\njulia> @chain df begin\n         @summary(b:d)\n       end;\n

    source

    # TidierData.@tally \u2014 Macro.

    @tally(df, [wt], [sort])\n

    Tally the unique values of one or more variables, with an optional weighting.

    @tally() is a low-level helper macro for @count() that assumes that any grouping has already been performed. @chain df @tally() is roughly equivalent to @chain df @summarize(n = n()). Supply wt to perform weighted counts, switching the summary from n = n() to n = sum(wt).

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • wt: Optional parameter. Used to calculate a sum over the provided wt variable instead of counting the rows.
    • sort: Defaults to false. Whether the result should be sorted from highest to lowest n.

    Examples

    julia> df = DataFrame(a = vcat(repeat([\"a\"], inner = 3),\n                           repeat([\"b\"], inner = 3),\n                           repeat([\"c\"], inner = 1),\n                           missing),\n                      b = 1:8)\n8\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            1\n   2 \u2502 a            2\n   3 \u2502 a            3\n   4 \u2502 b            4\n   5 \u2502 b            5\n   6 \u2502 b            6\n   7 \u2502 c            7\n   8 \u2502 missing      8\n\njulia> @chain df @tally()\n1\u00d71 DataFrame\n Row \u2502 n     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     8\n\njulia> @chain df begin\n         @group_by(a)\n         @tally()\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            3\n   2 \u2502 b            3\n   3 \u2502 c            1\n   4 \u2502 missing      1\n\njulia> @chain df begin\n         @group_by(a)\n         @tally(wt = b)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            6\n   2 \u2502 b           15\n   3 \u2502 c            7\n   4 \u2502 missing      8\n\njulia> @chain df begin\n         @group_by(a)\n         @tally(wt = b, sort = true)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  
Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           15\n   2 \u2502 missing      8\n   3 \u2502 c            7\n   4 \u2502 a            6       \n

    source

    # TidierData.@transmute \u2014 Macro.

    @transmute(df, exprs...)\n

    Create a new DataFrame with only computed columns.

    Arguments

    • df: A DataFrame.
    • exprs...: add new columns or replace values of existing columns using new_variable = values syntax.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @transmute(d = b + c)\n       end\n5\u00d71 DataFrame\n Row \u2502 d     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    12\n   2 \u2502    14\n   3 \u2502    16\n   4 \u2502    18\n   5 \u2502    20\n

    source

    # TidierData.@ungroup \u2014 Macro.

    @ungroup(df)\n

    Return a DataFrame with all groups removed.

    If this is applied to a GroupedDataFrame, then it removes the grouping. If this is applied to a DataFrame (without any groups), then it returns the DataFrame unchanged.

    Arguments

    • df: A GroupedDataFrame or DataFrame.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @group_by(a)\n       end\nGroupedDataFrame with 5 groups based on key: a\nFirst Group (1 row): a = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n\u22ee\nLast Group (1 row): a = 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 e         5     15\n\njulia> @chain df begin\n         @group_by(a)\n         @ungroup\n       end\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         2     12\n   3 \u2502 c         3     13\n   4 \u2502 d         4     14\n   5 \u2502 e         5     15\n

    source

    # TidierData.@unite \u2014 Macro.

      @unite(df, new_cols, from_cols, sep)\n

    Unite multiple columns into one new column using a specific delimiter

    Arguments

    • df: A DataFrame
    • new_col: New column that will receive the combination
    • from_cols: Column names that it will combine, supports [] or ()
    • sep: the string or character that will separate the values in the new column

    Examples

    julia> df = DataFrame( b = [\"1\", \"2\", \"3\"], c = [\"1\", \"2\", \"3\"], d = [missing, missing, \"3\"]);\n\njulia> @unite(df, new_col, (b, c, d), \"-\")\n3\u00d74 DataFrame\n Row \u2502 b       c       d        new_col \n     \u2502 String  String  String?  String  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1       1       missing  1-1\n   2 \u2502 2       2       missing  2-2\n   3 \u2502 3       3       3        3-3-3\n

    source

    # TidierData.@unnest_longer \u2014 Macro.

    @unnest_longer(df, columns, indices_include=false)\n

    Unnest arrays in columns from a DataFrame to create a longer DataFrame with one row for each entry of the array.

    Arguments

    • df: A DataFrame.
    • columns: Columns to unnest. Can be a column symbol or a range of columns if they align for number of values.
    • indices_include: Optional. When set to true, adds an index column for each unnested column, which logs the position of each array entry.
    • keep_empty: Optional. When set to true, rows with empty arrays are kept, not skipped, and unnested as missing.

    Examples

    julia> df = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])\n2\u00d73 DataFrame\n Row \u2502 a      b       c      \n     \u2502 Int64  Array\u2026  Array\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  [1, 2]  [5, 6]\n   2 \u2502     2  [3, 4]  [7, 8]\n\njulia> @unnest_longer(df, 2)\n4\u00d73 DataFrame\n Row \u2502 a      b      c      \n     \u2502 Int64  Int64  Array\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1  [5, 6]\n   2 \u2502     1      2  [5, 6]\n   3 \u2502     2      3  [7, 8]\n   4 \u2502     2      4  [7, 8]\n\njulia> @unnest_longer(df, b:c, indices_include=true)\n4\u00d75 DataFrame\n Row \u2502 a      b      c      b_id   c_id  \n     \u2502 Int64  Int64  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      5      1      1\n   2 \u2502     1      2      6      2      2\n   3 \u2502     2      3      7      1      1\n   4 \u2502     2      4      8      2      2\n\njulia> df2 = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]])\n4\u00d72 DataFrame\n Row \u2502 x      y            \n     \u2502 Int64  Array\u2026       \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  Any[]\n   2 \u2502     2  Any[1, 2, 3]\n   3 \u2502     3  Any[4, 5]\n   4 \u2502     4  Any[]\n\njulia> @unnest_longer(df2, y, keep_empty = true)\n7\u00d72 DataFrame\n Row \u2502 x      
y       \n     \u2502 Int64  Any     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  missing \n   2 \u2502     2  1\n   3 \u2502     2  2\n   4 \u2502     2  3\n   5 \u2502     3  4\n   6 \u2502     3  5\n   7 \u2502     4  missing \n

    source

    # TidierData.@unnest_wider \u2014 Macro.

    @unnest_wider(df, columns, names_sep=)\n

    Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns.

    Arguments

    • df: A DataFrame.
    • columns: Columns to be unnested. These columns should contain arrays, dictionaries, dataframes, or tuples. Dictionary keys will be converted to column names.
    • names_sep: An optional string to specify the separator for creating new column names. If not provided, defaults to no separator.

    Examples

    julia> df = DataFrame(name = [\"Zaki\", \"Farida\"], attributes = [\n               Dict(\"age\" => 25, \"city\" => \"New York\"),\n               Dict(\"age\" => 30, \"city\" => \"Los Angeles\")]);\n\njulia> @unnest_wider(df, attributes)\n2\u00d73 DataFrame\n Row \u2502 name    city         age   \n     \u2502 String  String       Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 Zaki    New York        25\n   2 \u2502 Farida  Los Angeles     30\n\njulia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])\n2\u00d73 DataFrame\n Row \u2502 a      b       c      \n     \u2502 Int64  Array\u2026  Array\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  [1, 2]  [5, 6]\n   2 \u2502     2  [3, 4]  [7, 8]\n\njulia> @unnest_wider(df2, b:c, names_sep = \"_\")\n2\u00d75 DataFrame\n Row \u2502 a      b_1    b_2    c_1    c_2   \n     \u2502 Int64  Int64  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      2      5      6\n   2 \u2502     2      3      4      7      8\n

    source

    "},{"location":"reference/#reference-internal-functions","title":"Reference - Internal functions","text":""},{"location":"examples/generated/Contributors/Howto/","title":"Contribute","text":""},{"location":"examples/generated/Contributors/Howto/#contribute-to-documentation","title":"Contribute to Documentation","text":"

    Contributing with examples can be done by first creating a new file example here

    Info

    • your_new_file.jl at docs/examples/UserGuide/

    Once this is done, you need to add a new entry here at the bottom and at the appropriate level.

    Info

    Your new entry should look like:

    • \"Your title example\" : \"examples/generated/UserGuide/your_new_file.md\"

    "},{"location":"examples/generated/Contributors/Howto/#build-docs-locally","title":"Build docs locally","text":"

    If you want to take a look at the docs locally before doing a PR follow the next steps:

    build docs locally

    Install the following dependencies in your system via pip, i.e.

    • pip install mkdocs pygments python-markdown-math
    • pip install mkdocs-material pymdown-extensions mkdocstrings
    • pip install mknotebooks pytkdocs_tweaks mkdocs_include_exclude_files jinja2 mkdocs-video

    Then simply go to your docs env and activate it, i.e.

    docs> julia

    julia> ]

    (docs) pkg> activate .

    Next, run the scripts:

    Info

    Generate files and build docs by running:

    • genfiles.jl
    • make.jl

    Now go to your terminal in the same path docs> and run:

    mkdocs serve

    This should output http://127.0.0.1:8000, copy/paste this into your browser and you are all set.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/across/","title":"across","text":"

    across() is a helper function that is typically used inside @mutate() or @summarize() to operate on multiple columns and/or multiple functions. Notice that across() accepts two arguments, a set of variables and a set of functions. If providing multiple variables or functions, these should be provided as a tuple \u2013 in other words, wrapped in parentheses and separated by commas. If you want to skip missing values, you can \"fuse\" the summary function (such as mean()) with the skipmissing() function by using the function fusion operator, which you can type out in Julia by typing \\circ and then pressing [Tab] such that it reads mean\u2218skipmissing.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/across/#one-variable-one-function","title":"One variable, one function","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(across(Budget, mean\u2218skipmissing))\nend\n
    1\u00d71 DataFrame RowBudget_mean_skipmissingFloat64113.4125"},{"location":"examples/generated/UserGuide/across/#one-variable-one-anonymous-function","title":"One variable, one anonymous function","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(across(Budget, (x -> mean(skipmissing(x)))))\nend\n
    1\u00d71 DataFrame RowBudget_functionFloat64113.4125

    Note: compound functions are not correctly supported inside of anonymous functions. As of right now, the above function works, but (x -> mean\u2218skipmissing(x)) does not work. This is a known bug and will be fixed in a future update.

    "},{"location":"examples/generated/UserGuide/across/#multiple-variables-multiple-functions","title":"Multiple variables, multiple functions","text":"
    @chain movies begin\n    @mutate(Budget = Budget / 1_000_000)\n    @summarize(across((Rating, Budget), (mean\u2218skipmissing, median\u2218skipmissing)))\nend\n
    1\u00d74 DataFrame RowRating_mean_skipmissingBudget_mean_skipmissingRating_median_skipmissingBudget_median_skipmissingFloat64Float64Float64Float6415.9328513.41256.13.0"},{"location":"examples/generated/UserGuide/across/#multiple-selection-helpers-multiple-functions","title":"Multiple selection helpers, multiple functions","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(across((starts_with(\"Bud\"), ends_with(\"ting\")), (mean\u2218skipmissing, median\u2218skipmissing)))\nend\n
    1\u00d74 DataFrame RowBudget_mean_skipmissingRating_mean_skipmissingBudget_median_skipmissingRating_median_skipmissingFloat64Float64Float64Float64113.41255.932853.06.1

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/arrange/","title":"@arrange","text":"

    Arranging is the way to sort a data frame. @arrange() can take multiple arguments. Arguments refer to columns that are sorted in ascending order by default. If you want to sort in descending order, make sure to wrap the column name in desc() as shown below.

    DataFrames.jl does not currently support the sort() function on grouped data frames. In order to make this work in TidierData.jl, if you apply @arrange() to a GroupedDataFrame, @arrange() will temporarily ungroup the data, perform the sort(), and then re-group by the original grouping variables.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/arrange/#sort-both-variables-in-ascending-order","title":"Sort both variables in ascending order","text":"
    @chain movies begin\n  @arrange(Year, Rating)\n  @select(1:5)\n  @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641Blacksmith Scene18931missing7.02Hadj Cheriff18941missing4.13Glenroy Bros., No. 218941missing4.24Leonard-Cushing Fight18941missing4.45Sioux Ghost Dance18941missing4.4"},{"location":"examples/generated/UserGuide/arrange/#sort-in-a-mix-of-ascending-and-descending-order","title":"Sort in a mix of ascending and descending order","text":"

    To sort in descending order, make sure to wrap the variable inside of desc().

    @chain movies begin\n  @arrange(Year, desc(Rating))\n  @select(1:5)\n  @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641Blacksmith Scene18931missing7.02Luis Martinetti, Contortionist18941missing6.13Caicedo (with Pole)18941missing5.84Glenroy Brothers (Comic Boxing)18941missing5.45Buffalo Dance18941missing5.0

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/autovec/","title":"Auto-vectorization","text":"

    TidierData.jl uses a lookup table to decide which functions not to vectorize. For example, mean() is listed as a function that should never be vectorized. Also, any function used inside of across() is also not automatically vectorized. Any function that is not included in this list and is used in a context other than across() is automatically vectorized.

    Which functions are not vectorized? The set of non-vectorized functions is contained in the array TidierData.not_vectorized[]. Let's take a look at this array. We will wrap it in a string() to make the output easier to read.

    using TidierData\n\nstring(TidierData.not_vectorized[])\n
    \"[:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :\u2218, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr]\"\n

    This \"auto-vectorization\" makes working with TidierData.jl more R-like and convenient. However, if you ever define your own function and try to use it, TidierData.jl may unintentionally vectorize it for you. To prevent auto-vectorization, you can prefix your function with a ~.

    df = DataFrame(a = repeat('a':'e', inner = 2), b = [1,1,1,2,2,2,3,3,3,4], c = 11:20)\n
    10\u00d73 DataFrame RowabcCharInt64Int641a1112a1123b1134b2145c2156c2167d3178d3189e31910e420

    For example, let's define a function new_mean() that calculates a mean.

    new_mean(exprs...) = mean(exprs...)\n
    new_mean (generic function with 1 method)\n

    If we try to use new_mean() inside of @mutate(), it will give us the wrong result. This is because new_mean() is vectorized, which results in the mean being calculated element-wise, which is almost never what we actually want.

    @chain df begin\n    @mutate(d = c - new_mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a1110.02a1120.03b1130.04b2140.05c2150.06c2160.07d3170.08d3180.09e3190.010e4200.0

    To prevent new_mean() from being vectorized, we need to prefix it with a ~ like this:

    @chain df begin\n    @mutate(d = c - ~new_mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    Or you can modify the do-not-vectorize list like this:

    push!(TidierData.not_vectorized[], :new_mean)\n
    49-element Vector{Symbol}:\n :getindex\n :rand\n :esc\n :Ref\n :Set\n :Cols\n :collect\n :(:)\n :\u2218\n :lag\n \u22ee\n :cat_collapse\n :cat_lump_min\n :cat_lump_prop\n :categorical\n :as_categorical\n :is_categorical\n :unique\n :iqr\n :new_mean\n

    Now new_mean() should behave just like mean() in that it is treated as non-vectorized.

    @chain df begin\n    @mutate(d = c - new_mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    This gives us the correct answer. Notice that adding a ~ is not needed with mean() because mean() is already included on our look-up table of functions not requiring vectorization.

    @chain df begin\n    @mutate(d = c - mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    If you're not sure if a function is vectorized and want to prevent it from being vectorized, you can always prefix it with a ~ to prevent vectorization. Even though mean() is not vectorized anyway, prefixing it with a ~ will not cause any harm.

    @chain df begin\n    @mutate(d = c - ~mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    If for some crazy reason, you did want to vectorize mean(), you are always allowed to vectorize it, and TidierData.jl won't un-vectorize it.

    @chain df begin\n    @mutate(d = c - mean.(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a1110.02a1120.03b1130.04b2140.05c2150.06c2160.07d3170.08d3180.09e3190.010e4200.0

    Note: ~ also works with operators, so if you want to not vectorize an operator, you can prefix it with ~, for example, a ~* b will perform a matrix multiplication rather than element-wise multiplication.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/benchmark/","title":"Benchmark","text":"

    The goal of this benchmarking is to gauge how Tidier.jl performs in comparison to DataFrames.jl. Ultimately, from this benchmarking, we can check that Tidier.jl is comparable in speed to DataFrames.jl.

    "},{"location":"examples/generated/UserGuide/benchmark/#why-function-wrap","title":"Why function wrap?","text":"

    Wrapping code in a function allows it to compile just once, which more closely reflects the reality of production workflows. For a more robust explanation, please see @kdpsingh's comment here: https://github.com/TidierOrg/TidierData.jl/issues/24#issuecomment-1682718061

    using TidierData\nusing RDatasets\nusing BenchmarkTools\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/benchmark/#filtering","title":"filtering","text":"
    function filtering_tidier()\n@chain movies begin\n    @filter(Year > 1939 && Votes > 40)\nend\nend\n\n@benchmark filtering_tidier()\n\n@benchmark filter(row -> row.Year > 1939 && row.Votes > 40, movies)\n
    BenchmarkTools.Trial: 532 samples with 1 evaluation.\n Range (min \u2026 max):  9.001 ms \u2026  18.990 ms  \u250a GC (min \u2026 max): 0.00% \u2026 5.50%\n Time  (median):     9.281 ms               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   9.411 ms \u00b1 554.583 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  1.14% \u00b1 2.52%\n\n         \u2583\u2584\u2588\u2584\u2585\u2586\u2585\u2582                                              \n  \u2582\u2581\u2582\u2582\u2584\u2583\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2586\u2585\u2583\u2582\u2581\u2581\u2581\u2581\u2582\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2583\u2581\u2583\u2582\u2583\u2583\u2583\u2584\u2584\u2583\u2583\u2583\u2583\u2582\u2583\u2582\u2583\u2583\u2582\u2582\u2582\u2581\u2581\u2581\u2581\u2582 \u2583\n  9 ms            Histogram: frequency by time        10.4 ms <\n\n Memory estimate: 7.76 MiB, allocs estimate: 287668.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#group_by-summarize","title":"group_by summarize","text":"
    function groupbysummarize_tidier()\n@chain movies begin\n    @group_by(MPAA)\n    @summarise(n=n())\nend\nend\n\n@benchmark groupbysummarize_tidier()\n\n@benchmark combine(groupby(movies, :MPAA), nrow => :n)\n
    BenchmarkTools.Trial: 10000 samples with 1 evaluation.\n Range (min \u2026 max):  414.934 \u03bcs \u2026  1.865 ms  \u250a GC (min \u2026 max): 0.00% \u2026 31.44%\n Time  (median):     422.558 \u03bcs              \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   432.911 \u03bcs \u00b1 67.083 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  1.25% \u00b1  5.15%\n\n  \u2583\u2588\u2588\u2587\u2586\u2585\u2584\u2584\u2584\u2583\u2582\u2582\u2581\u2581\u2581                                              \u2582\n  \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2587\u2587\u2586\u2586\u2586\u2586\u2586\u2585\u2585\u2584\u2584\u2584\u2583\u2584\u2581\u2581\u2581\u2581\u2583\u2583\u2581\u2581\u2584\u2587\u2587\u2583\u2585\u2583\u2583\u2583\u2585\u2584\u2586\u2586\u2587\u2586\u2587\u2586\u2585\u2584\u2583\u2584\u2584 \u2588\n  415 \u03bcs        Histogram: log(frequency) by time       573 \u03bcs <\n\n Memory estimate: 474.87 KiB, allocs estimate: 270.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#one-mutate","title":"one mutate","text":"
    function mutate_1_tidier()\n@chain movies begin\n    @mutate(new_col = Votes * R1)\nend\nend\n\n@benchmark mutate_1_tidier()\n\n@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :new_col)\n
    BenchmarkTools.Trial: 6789 samples with 1 evaluation.\n Range (min \u2026 max):  541.700 \u03bcs \u2026   9.582 ms  \u250a GC (min \u2026 max): 0.00% \u2026  5.69%\n Time  (median):     661.425 \u03bcs               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   731.742 \u03bcs \u00b1 250.650 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  7.82% \u00b1 12.75%\n\n         \u2585\u2588\u2588\u2587\u2587\u2586\u2585\u2584\u2583\u2582\u2581                               \u2581\u2581\u2582\u2582\u2582\u2582\u2583\u2582\u2582\u2581\u2581  \u2582\n  \u2583\u2581\u2583\u2581\u2585\u2585\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2586\u2585\u2585\u2581\u2585\u2583\u2581\u2581\u2581\u2581\u2581\u2583\u2583\u2581\u2581\u2581\u2581\u2583\u2586\u2586\u2586\u2585\u2584\u2585\u2584\u2583\u2585\u2585\u2587\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 \u2588\n  542 \u03bcs        Histogram: log(frequency) by time       1.25 ms <\n\n Memory estimate: 8.42 MiB, allocs estimate: 223.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#mutate-6-new-columns","title":"mutate 6 new columns","text":"
    function mutate6_tidier()\n    @chain movies begin\n        @mutate(\n        Votes_R1_Product = Votes .* R1,\n        Rating_Year_Ratio = Rating ./ Year,\n        R1_to_R5_Sum = R1 + R2 + R3 + R4 + R5,\n        High_Budget_Flag = if_else(ismissing(Budget), \"NA\", Budget .> 50000),\n        R6_to_R8_Avg = (R6 + R7 + R8) / 3,\n        year_Minus_Length = Year - Length)\n    end\nend\n\n@benchmark mutate6_tidier()\n\n@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :Votes_R1_Product, [:Rating, :Year] => ((r, y) -> r ./ y) => :Rating_Year_Ratio, [:R1, :R2, :R3, :R4, :R5] => ((a, b, c, d, e) -> a + b + c + d + e) => :R1_to_R5_Sum, :Budget => (b -> ifelse.(ismissing.(b), missing, b .> 50000)) => :High_Budget_Flag, [:R6, :R7, :R8] => ((f, g, h) -> (f + g + h) / 3) => :R6_to_R8_Avg, [:Year, :Length] => ((y, l) -> y - l) => :Year_Minus_Length )\n
    BenchmarkTools.Trial: 3937 samples with 1 evaluation.\n Range (min \u2026 max):  1.062 ms \u2026   9.694 ms  \u250a GC (min \u2026 max): 0.00% \u2026  6.74%\n Time  (median):     1.174 ms               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   1.264 ms \u00b1 326.052 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  6.29% \u00b1 11.05%\n\n      \u2581\u2585\u2587\u2588\u2584\u2582                                                   \n  \u2582\u2582\u2583\u2585\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2585\u2584\u2583\u2583\u2582\u2582\u2582\u2582\u2582\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2582\u2581\u2581\u2581\u2581\u2581\u2582\u2582\u2582\u2581\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2583\u2583\u2583\u2583\u2583\u2583\u2583\u2583\u2583\u2582\u2583\u2582\u2582\u2582 \u2583\n  1.06 ms         Histogram: frequency by time        1.91 ms <\n\n Memory estimate: 10.56 MiB, allocs estimate: 581.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#groupby-then-2-mutates","title":"groupby then 2 mutates","text":"
    function groupby1_2mutate_tidier()\n@chain movies begin\n    @group_by(MPAA)\n    @mutate(ace = R1 -> R1/2 * 4)\n    @mutate(Bace = Votes^R1)\nend\nend\n\n@benchmark groupby1_2mutate_tidier()\n\n@benchmark transform( transform( groupby(movies, :MPAA), :R1 => (x -> x/2 * 4) => :ace, ungroup = false), [:Votes, :R1] => ((a, b) -> b .^ a) => :Bace, ungroup = false)\n
    BenchmarkTools.Trial: 671 samples with 1 evaluation.\n Range (min \u2026 max):  6.845 ms \u2026  13.608 ms  \u250a GC (min \u2026 max): 0.00% \u2026 7.58%\n Time  (median):     7.277 ms               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   7.442 ms \u00b1 603.643 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  3.11% \u00b1 4.29%\n\n    \u2581 \u2586\u2587\u2582\u2588\u2585 \u2581\u2581                                                 \n  \u2582\u2586\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2586\u2587\u2586\u2588\u2585\u2584\u2585\u2586\u2584\u2585\u2584\u2585\u2583\u2582\u2584\u2583\u2586\u2583\u2586\u2586\u2586\u2585\u2585\u2585\u2584\u2586\u2584\u2584\u2583\u2582\u2583\u2583\u2581\u2582\u2584\u2582\u2582\u2584\u2583\u2582\u2582\u2583\u2581\u2582\u2581 \u2583\n  6.85 ms         Histogram: frequency by time         8.5 ms <\n\n Memory estimate: 26.17 MiB, allocs estimate: 2449.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#select-5-columns","title":"select 5 columns","text":"
    function select5_tidier()\n    @chain movies begin\n        @select(R1:R5)\n    end\nend\n\n@benchmark select5_tidier()\n\n@benchmark select(movies, :R1, :R2, :R3, :R4, :R5)\n
    BenchmarkTools.Trial: 10000 samples with 1 evaluation.\n Range (min \u2026 max):  153.436 \u03bcs \u2026   7.502 ms  \u250a GC (min \u2026 max): 0.00% \u2026 6.35%\n Time  (median):     220.581 \u03bcs               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   232.208 \u03bcs \u00b1 100.421 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  4.46% \u00b1 9.85%\n\n      \u2581\u2584\u2586\u2587\u2588\u2588\u2586\u2584\u2583\u2581                                                \u2582\n  \u2584\u2583\u2584\u2585\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2586\u2584\u2584\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2583\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2583\u2585\u2586\u2586\u2587\u2587\u2587\u2588\u2587\u2587\u2587\u2587\u2587 \u2588\n  153 \u03bcs        Histogram: log(frequency) by time        622 \u03bcs <\n\n Memory estimate: 2.25 MiB, allocs estimate: 200.\n

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/binding/","title":"Binding","text":"

    Whereas joins are useful for combining data frames based on matching keys, another way to combine data frames is to bind them together, which can be done either by rows or by columns. TidierData.jl implements these actions using @bind_rows() and @bind_cols(), respectively.

    Let's generate three data frames to combine.

    using TidierData\n\ndf1 = DataFrame(a=1:3, b=1:3);\n\ndf2 = DataFrame(a=4:6, b=4:6);\n\ndf3 = DataFrame(a=7:9, c=7:9);\n

    "},{"location":"examples/generated/UserGuide/binding/#bind_rows","title":"@bind_rows()","text":"
    @bind_rows(df1, df2)\n
    6\u00d72 DataFrame RowabInt64Int64111222333444555666

    @bind_rows() keeps columns that are present in at least one of the provided data frames. Any missing columns will be filled with missing values.

    @bind_rows(df1, df3)\n
    6\u00d73 DataFrame RowabcInt64Int64?Int64?111missing222missing333missing47missing758missing869missing9

    There is an optional id argument to add an identifier for combined data frames. Note that both @bind_rows and @bind_cols accept multiple (i.e., more than 2) data frames, as in the example below.

    @bind_rows(df1, df2, df3, id = \"id\")\n
    9\u00d74 DataFrame RowabcidInt64Int64?Int64?Int64111missing1222missing1333missing1444missing2555missing2666missing277missing7388missing8399missing93

    "},{"location":"examples/generated/UserGuide/binding/#bind_cols","title":"@bind_cols()","text":"

    @bind_cols works similarly to R's tidyverse although the .name_repair argument is not supported.

    @bind_cols(df1, df2)\n
    3\u00d74 DataFrame Rowaba_1b_1Int64Int64Int64Int64111442225533366

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/column_names/","title":"Column names","text":"

    When referring to column names, TidierData.jl is a bit unusual for a Julia package in that it does not use symbols. This is because TidierData.jl uses tidy expressions, which in R lingo equates to a style of programming referred to as \"non-standard evaluation.\" If you are creating a new column a containing a value that is the mean of column b, you would simply write a = mean(b).

    However, there may be times when you wish to create or refer to a column containing a space in it. Let's start by creating some column names containing a space in their name.

    using TidierData\n\ndf = DataFrame(var\"my name\" = [\"Ada\", \"Twist\"],\n               var\"my age\" = [40, 50])\n
    2\u00d72 DataFrame Rowmy namemy ageStringInt641Ada402Twist50

    To create a column name containing a space, we used the var\"column name\" notation. Because DataFrame() is a regular Julia function, this is the standard way to refer to a variable containing a space, which is why we need to use this here.

    This notation also works inside of TidierData.jl.

    "},{"location":"examples/generated/UserGuide/column_names/#varcolumn-name-notation","title":"var\"column name\" notation","text":"

    If we want to figure out the age for the people in our dataset a decade from today, we could use this same var\"column name\" notation inside of @mutate.

    @chain df begin\n  @mutate(var\"age in 10 years\" = var\"my age\" + 10)\nend\n
    2\u00d73 DataFrame Rowmy namemy ageage in 10 yearsStringInt64Int641Ada40502Twist5060

    However, typing out the var\"column name\" can become cumbersome. TidierData.jl also supports another shorthand notation to refer to column names containing spaces or other special characters: backticks.

    "},{"location":"examples/generated/UserGuide/column_names/#backtick-notation","title":"Backtick notation","text":"

    This same code could be written more concisely like this:

    @chain df begin\n  @mutate(`age in 10 years` = `my age` + 10)\nend\n
    2\u00d73 DataFrame Rowmy namemy ageage in 10 yearsStringInt64Int641Ada40502Twist5060

    Backticks are an R convention. While they are not specific to tidyverse, they are a convenient way to refer to column names that otherwise would not parse correctly as a single entity. Backticks are supported in all TidierData.jl functions where column names may be referenced.

    "},{"location":"examples/generated/UserGuide/column_names/#cleaning-up-column-names","title":"Cleaning up column names","text":"

    Another option is to clean up the column names so that you do not have spaces to begin with. In R, this is usually accomplished using the janitor package. In Julia, the Cleaner.jl package provides this functionality, which we have wrapped inside of TidierData.jl.

    @chain df begin\n  @clean_names\nend\n
    2\u00d72 DataFrame Rowmy_namemy_ageStringInt641Ada402Twist50

    Although the default value for the case argument is \"snake_case\", you can also set this to \"camelCase\".

    @chain df begin\n  @clean_names(case = \"camelCase\")\nend\n
    2\u00d72 DataFrame RowmyNamemyAgeStringInt641Ada402Twist50

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/conditionals/","title":"Conditionals","text":"

    Conditional functions are a useful tool to update or create new columns conditional on the values of a column of data. When continuous variables are converted to categories, this is sometimes referred to as \"recoding\" a column.

    TidierData.jl provides two functions to recode data: if_else() and case_when().

    "},{"location":"examples/generated/UserGuide/conditionals/#if_else","title":"if_else()","text":"

    Why do we need another if_else() function if base Julia already comes with an ifelse() function? Similar to R, the base Julia implementation of ifelse() does not include a way to designate what value to return if the enclosed vector contains a missing value. Additionally, the base Julia implementation of ifelse() produces an error if presented with a missing value in the condition. The TidierData.jl if_else() can handle missing values and includes an optional 4th argument that is used to designate what to return in the event of a missing value for the condition. Let's take a look at some examples.

    using TidierData\n\ndf = DataFrame(a = [1, 2, missing, 4, 5])\n
    5\u00d71 DataFrame RowaInt64?11223missing4455

    Here, we have created a DataFrame containing a single column a with 5 values, for which the 3rd value is missing.

    Now, let's create a new column b that contains a \"yes\" if a is greater than or equal to 3, and a \"no\" otherwise. Notice that when we do this, the missing value remains missing.

    @chain df begin\n  @mutate(b = if_else(a >= 3, \"yes\", \"no\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String?11no22no3missingmissing44yes55yes

    What if we wanted to fill in the missing value with \"unknown\"? All we need to do is provide an optional 4th argument containing the value to return in the event of a missing condition. When we run this version, missing values in a are converted to \"unknown\" in b.

    @chain df begin\n  @mutate(b = if_else(a >= 3, \"yes\", \"no\", \"unknown\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String11no22no3missingunknown44yes55yes

    Although both of these examples showed how to return a single value (like \"yes\" and \"no\"), you can also return a vector of values, which is useful for updating only a subset of the values of a column. For example, if we wanted to create a column b that contains a 3 when a is greater than or equal to 3 but otherwise remains unchanged, we could provide a 3 for the yes condition and a vector (column) a in the no condition. If we do not provide the optional 4th argument, missing values remain missing.

    @chain df begin\n  @mutate(b = if_else(a >= 3, 3, a))\nend\n
    5\u00d72 DataFrame RowabInt64?Int64?1112223missingmissing443553

    "},{"location":"examples/generated/UserGuide/conditionals/#case_when","title":"case_when()","text":"

    Although if_else() is convenient when evaluating a single condition, it can be cumbersome when evaluating multiple conditions because subsequent conditions need to be nested within the no condition for the preceding argument. For situations where multiple conditions need to be evaluated, case_when() is more convenient.

    Let's first consider a similar example from above and recreate it using case_when(). The following code creates a column b that assigns a value of 3 if a >= 3 and otherwise leaves the value unchanged.

    @chain df begin\n  @mutate(b = case_when(a >= 3  =>  3,\n                        true    =>  a))\nend\n
    5\u00d72 DataFrame RowabInt64?Int64?1112223missingmissing443553

    What is going on here? case_when() uses a condition => return_value syntax, which are encoded as pairs in Julia. You can provide a single pair, or multiple pairs separated by commas. Because the pairs operator (=>) might be confused with a greater than or equal to sign (>=), we have padded two spaces on either side of the => to make sure that the pair remains visually distinct. We do not use a ~ operator in case_when() (as is used in R) because the ~ operator is used to denote de-vectorized functions in TidierData.jl.

    There are 2 other things to note above. First, the true condition evaluates to true for all remaining values of a. The only reason that b contains a missing value here is that the true condition was met, leading to the value of a (in this case, missing) being assigned to b. Second, we were able to return a single value (3) in the first condition, and a vector (column) of data (a) in the second condition.

    What if we wanted to fill in the missing values with something else? In this case, we would need to create an explicit condition that checks for missing values and assigns a return value to that condition.

    @chain df begin\n  @mutate(b = case_when(a >= 3        =>  3,\n                        ismissing(a)  =>  0,\n                        true          =>  a))\nend\n
    5\u00d72 DataFrame RowabInt64?Int641112223missing0443553

    Do our conditions have to be mutually exclusive? No. The return value for the first matching condition is assigned to b because the conditions are evaluated sequentially from first to last.

    @chain df begin\n  @mutate(b = case_when(a > 4  =>  \"hi\",\n                        a > 2  =>  \"medium\",\n                        a > 0  =>  \"low\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String?11low22low3missingmissing44medium55hi

    Again, if we want to fill in remaining values (which in this case are the missing ones), we can map the final condition true to the value of \"unknown\". Because the ordering of the conditions matters, the true condition should always be listed last if it is included.

    @chain df begin\n  @mutate(b = case_when(a > 4  =>  \"hi\",\n                        a > 2  =>  \"medium\",\n                        a > 0  =>  \"low\",\n                        true   =>  \"unknown\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String11low22low3missingunknown44medium55hi

    "},{"location":"examples/generated/UserGuide/conditionals/#do-these-functions-work-outside-of-tidierdatajl","title":"Do these functions work outside of TidierData.jl?","text":"

    Yes, both if_else() and case_when() work outside of TidierData.jl. However, you'll need to remember that if working with vectors, both the functions and conditions will need to be vectorized, and in the case of case_when(), the => will need to be written as .=>. The reason this is not needed when using these functions inside of TidierData.jl is because they are auto-vectorized.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/dataset_movies/","title":"Movies dataset","text":"

    To get started, we will load the movies dataset from the RDatasets.jl package.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    To work with this dataset, we will use the @chain macro. This macro initiates a pipe, and every function or macro provided to it between the begin and end blocks modifies the dataframe mentioned at the beginning of the pipe. You don't have to necessarily spread a chain over multiple lines of code, but when working with data frames it's often easiest to do so. Before going further, take a look at the Chain.jl GitHub page to see all the cool things that are possible with this, including mid-chain side effects using @aside and mid-chain assignment of variables.

    Let's take a look at the first 5 rows of the movies dataset using @slice().

    @chain movies begin\n    @slice(1:5)\nend\n
    5\u00d724 DataFrame RowTitleYearLengthBudgetRatingVotesR1R2R3R4R5R6R7R8R9R10MPAAActionAnimationComedyDramaDocumentaryRomanceShortStringInt32Int32Int32?Float64Int32Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Cat\u2026Int32Int32Int32Int32Int32Int32Int321$1971121missing6.43484.54.54.54.514.524.524.514.54.54.500110002$1000 a Touchdown193971missing6.0200.014.54.524.514.514.514.54.54.514.500100003$21 a Day Once a Month19417missing8.250.00.00.00.00.024.50.044.524.524.501000014$40,000199670missing8.2614.50.00.00.00.00.00.00.034.545.500100005$50,000 Climax Show, The197571missing3.41724.54.50.014.514.54.50.00.00.024.50000000

    Let's use @glimpse() to preview the dataset.

    @glimpse(movies)\n
    Rows: 58788\nColumns: 24\n.Title         String         $, $1000 a Touchdown, $21 a Day Once a Month, $40,\n.Year          Int32          1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19\n.Length        Int32          121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10\n.Budget        Union{Missing, Int32}missing, missing, missing, missing, missing,\n.Rating        Float64        6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0,\n.Votes         Int32          348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44\n.R1            Float64        4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5\n.R2            Float64        4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,\n.R3            Float64        4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5,\n.R4            Float64        4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4.\n.R5            Float64        14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0,\n.R6            Float64        24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,\n.R7            Float64        24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5,\n.R8            Float64        14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4\n.R9            Float64        4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4.\n.R10           Float64        4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24.\n.MPAA          CategoricalArrays.CategoricalValue{String, UInt8}, , , , , , R, ,\n.Action        Int32          0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,\n.Animation     Int32          0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n.Comedy        Int32          1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,\n.Drama         Int32          1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,\n.Documentary   Int32          0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n.Romance       Int32          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n.Short         Int32          0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,\n

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/distinct/","title":"@distinct","text":"

    The @distinct() macro in TidierData.jl is useful to select distinct rows. Like its R counterpart, it can be used with or without arguments. When arguments are provided, it behaves slightly differently than the R version. Whereas the R function only returns the provided columns, the TidierData.jl version returns all columns, where the first match is returned for the non-selected columns.

    using TidierData\n\ndf = DataFrame(a = 1:10, b = repeat('a':'e', inner = 2))\n
    10\u00d72 DataFrame RowabInt64Char11a22a33b44b55c66c77d88d99e1010e

    "},{"location":"examples/generated/UserGuide/distinct/#select-distinct-values-overall","title":"Select distinct values overall","text":"

    Since there are no duplicate rows, this will return all rows.

    @chain df begin\n    @distinct()\nend\n
    10\u00d72 DataFrame RowabInt64Char11a22a33b44b55c66c77d88d99e1010e

    "},{"location":"examples/generated/UserGuide/distinct/#select-distinct-values-based-on-column-b","title":"Select distinct values based on column b","text":"

    Notice that the first matching row for column a is returned for every distinct value of column b. This is slightly different behavior than R's tidyverse, which would have returned only column b.

    @chain df begin\n  @distinct(b)\nend\n
    5\u00d72 DataFrame RowabInt64Char11a23b35c47d59e

    In TidierData.jl, @distinct() works with grouped data frames. If grouped, @distinct() will ignore the grouping when determining distinct values but will return the data frame in grouped form based on the original groupings.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/fill_missing/","title":"Fill missing","text":"

    The @fill_missing macro is a reimplementation of fill(). To mirror the syntax in R, the methods available are \"up\" (fill from bottom up) and \"down\" (fill from top down).

    using TidierData\n\ndf = DataFrame(\n    a = [missing, 2, 3, missing, 5],\n    b = [missing, 1, missing, 4, 5],\n    c = ['a', 'b', missing, 'd', 'e'],\n    group = ['A', 'A', 'B', 'B', 'A']\n);\n

    "},{"location":"examples/generated/UserGuide/fill_missing/#fill-all-columns","title":"Fill all columns","text":"

    Fill missing values for the whole DataFrame using the \"down\" method (top to bottom)

    @chain df begin\n    @fill_missing(\"down\")\nend\n\n@fill_missing(df, \"down\")\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64?Char?Char1missingmissingaA221bA331bB434dB555eA

    "},{"location":"examples/generated/UserGuide/fill_missing/#fill-specifc-columns","title":"Fill specifc columns","text":"

    This fills missing values in columns a and c going from bottom to top.

    @chain df begin\n    @fill_missing(a, c, \"up\")\nend\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64?Char?Char12missingaA221bA33missingdB454dB555eA

    "},{"location":"examples/generated/UserGuide/fill_missing/#fill-with-grouped-dataframes","title":"Fill with Grouped DataFrames","text":"

    When grouping by the group column, this fills missing values in column a within each group, going from top to bottom within that group.

    @chain df begin\n    @group_by(group)\n    @fill_missing(a, \"down\")\nend\n

    GroupedDataFrame with 2 groups based on key: group

    First Group (3 rows): group = 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase) RowabcgroupInt64?Int64?Char?Char1missingmissingaA221bA355eA

    &vellip;

    Last Group (2 rows): group = 'B': ASCII/Unicode U+0042 (category Lu: Letter, uppercase) RowabcgroupInt64?Int64?Char?Char13missingmissingB234dB

    "},{"location":"examples/generated/UserGuide/fill_missing/#replace_missing","title":"replace_missing()","text":"

    The replace_missing function facilitates the replacement of missing values with a specified replacement.

    @chain df begin\n    @mutate(b = replace_missing(b, 2))\nend\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64Char?Char1missing2aA221bA332missingB4missing4dB555eA

    "},{"location":"examples/generated/UserGuide/fill_missing/#missing_if","title":"missing_if()","text":"

    The missing_if function is used to introduce missing values under specific conditions.

    @chain df begin\n    @mutate(b = missing_if(b, 5))\nend\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64?Char?Char1missingmissingaA221bA33missingmissingB4missing4dB55missingeA

    Neither missing_if nor replace_missing is type-specific.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/filter/","title":"@filter","text":"

    Filtering is a mechanism to indicate which rows you want to keep in a dataset based on criteria. This is also referred to as subsetting. Filtering rows is normally a bit tricky in DataFrames.jl because comparison operators like >= actually need to be vectorized as .>=, which can catch new Julia users by surprise. @filter() mimics R's tidyverse behavior by auto-vectorizing the code and then only selecting those rows that evaluate to true. Similar to dplyr, rows that evaluate to missing are skipped.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/filter/#lets-take-a-look-at-the-movies-whose-budget-was-more-than-average-we-will-select-only-the-first-5-rows-for-the-sake-of-brevity","title":"Let\u2019s take a look at the movies whose budget was more than average. We will select only the first 5 rows for the sake of brevity.","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @filter(Budget >= mean(skipmissing(Budget)))\n  @select(Title, Budget)\n  @slice(1:5)\nend\n
    5\u00d72 DataFrame RowTitleBudgetStringFloat64?1'Til There Was You23.0210 Things I Hate About You16.03102 Dalmatians85.0413 Going On 3037.0513th Warrior, The85.0"},{"location":"examples/generated/UserGuide/filter/#lets-search-for-movies-that-have-at-least-200-votes-and-a-rating-of-greater-than-or-equal-to-8-there-are-3-ways-you-can-specify-an-and-condition-inside-of-tidierdatajl","title":"Let's search for movies that have at least 200 votes and a rating of greater than or equal to 8. There are 3 ways you can specify an \"and\" condition inside of TidierData.jl.","text":""},{"location":"examples/generated/UserGuide/filter/#the-first-option-is-to-use-the-short-circuiting-operator-as-shown-below-this-is-the-preferred-approach-because-the-second-expression-is-only-evaluated-per-element-if-the-first-one-is-true","title":"The first option is to use the short-circuiting && operator as shown below. This is the preferred approach because the second expression is only evaluated (per element) if the first one is true.","text":"
    @chain movies begin\n  @filter(Votes >= 200 && Rating >= 8)\n  @select(Title, Votes, Rating)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleVotesRatingStringInt32Float64112 Angry Men292788.7212 stulev2528.932001: A Space Odyssey649828.3421 Grams218578.0539 Steps, The79318.0"},{"location":"examples/generated/UserGuide/filter/#the-second-option-is-to-use-the-bitwise-operator-note-that-there-is-a-key-difference-in-syntax-between-and-because-the-operator-takes-a-higher-operator-precedence-than-you-have-to-wrap-the-comparison-expressions-inside-of-parentheses-to-ensure-that-the-overall-expression-is-evaluated-correctly","title":"The second option is to use the bitwise & operator. Note that there is a key difference in syntax between & and &&. Because the & operator takes a higher operator precedence than >=, you have to wrap the comparison expressions inside of parentheses to ensure that the overall expression is evaluated correctly.","text":"
    @chain movies begin\n  @filter((Votes >= 200) & (Rating >= 8))\n  @select(Title, Votes, Rating)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleVotesRatingStringInt32Float64112 Angry Men292788.7212 stulev2528.932001: A Space Odyssey649828.3421 Grams218578.0539 Steps, The79318.0"},{"location":"examples/generated/UserGuide/filter/#the-third-option-for-and-conditions-only-is-to-separate-the-expressions-with-commas-this-is-similar-to-the-behavior-of-filter-in-tidyverse","title":"The third option for \"and\" conditions only is to separate the expressions with commas. This is similar to the behavior of filter() in tidyverse.","text":"
    @chain movies begin\n  @filter(Votes >= 200, Rating >= 8)\n  @select(Title, Votes, Rating)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleVotesRatingStringInt32Float64112 Angry Men292788.7212 stulev2528.932001: A Space Odyssey649828.3421 Grams218578.0539 Steps, The79318.0"},{"location":"examples/generated/UserGuide/filter/#now-lets-see-how-to-use-filter-with-in-heres-an-example-with-a-tuple","title":"Now let's see how to use @filter() with in. Here's an example with a tuple.","text":"
    @chain movies begin\n  @filter(Title in (\"101 Dalmatians\",\n                    \"102 Dalmatians\"))\n  @select(1:5)\nend\n
    2\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641101 Dalmatians1996103missing5.52102 Dalmatians2000100850000004.7"},{"location":"examples/generated/UserGuide/filter/#we-can-also-use-filter-with-in-using-a-vector-denoted-by-a","title":"We can also use @filter() with in using a vector, denoted by a [].","text":"
    @chain movies begin\n  @filter(Title in [\"101 Dalmatians\",\n                    \"102 Dalmatians\"])\n  @select(1:5)\nend\n
    2\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641101 Dalmatians1996103missing5.52102 Dalmatians2000100850000004.7"},{"location":"examples/generated/UserGuide/filter/#finally-we-can-combine-filter-with-row_number-to-retrieve-the-first-5-rows-which-can-be-used-to-mimic-the-functionality-provided-by-slice","title":"Finally, we can combine @filter with row_number() to retrieve the first 5 rows, which can be used to mimic the functionality provided by @slice.","text":"
    @chain movies begin\n  @filter(row_number() <= 5)\n  @select(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/group_by/","title":"@group_by","text":"

    Grouping and ungrouping behavior is one of the nicest parts of using R's tidyverse. Once a data frame is grouped, all verbs applied to that data frame respect the grouping, including but not limited to @mutate(), @summarize(), @slice() and @filter, which allows for really powerful abstractions. For example, with @group_by() followed by @filter(), you can limit the rows of a dataset to the maximum or minimum values for each group.

    Exactly as in R's tidyverse, once a data frame is grouped, it remains grouped until either @summarize() is called (which \"peels off\" one layer of grouping) or @ungroup() is called, which removes all layers of grouping. Also as in R's tidyverse, @group_by() sorts the groups in ascending order. Unlike in R, there is never any question about whether a data frame is currently grouped because GroupedDataFrames print out in a very different form than DataFrames, making them easy to tell apart.

    When using @chain, note that you can write either @ungroup or @ungroup(). Both are considered valid.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/group_by/#combining-group_by-with-mutate","title":"Combining @group_by() with @mutate()","text":"
    @chain movies begin\n    @group_by(Year)\n    @mutate(Mean_Yearly_Rating = mean(skipmissing(Rating)))\n    @select(Year, Rating, Mean_Yearly_Rating)\n    @ungroup\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame RowYearRatingMean_Yearly_RatingInt32Float64Float64119716.45.66517219396.06.35041319418.26.34107419968.25.74712519753.45.62908"},{"location":"examples/generated/UserGuide/group_by/#combining-group_by-with-summarize","title":"Combining @group_by() with @summarize()","text":"
    @chain movies begin\n    @group_by(Year)\n    @summarize(Mean_Yearly_Rating = mean(skipmissing(Rating)),\n        Median_Yearly_Rating = median(skipmissing(Rating)))\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame RowYearMean_Yearly_RatingMedian_Yearly_RatingInt32Float64Float64119715.665175.8219396.350416.4319416.341076.4419965.747125.9519755.629085.7"},{"location":"examples/generated/UserGuide/group_by/#grouping-by-multiple-columns","title":"Grouping by multiple columns","text":"
    @chain movies begin\n  @group_by(Year, Comedy)\n  @summarize(Mean_Yearly_Rating = mean(skipmissing(Rating)),\n      Median_Yearly_Rating = median(skipmissing(Rating)))\n  @ungroup # Need to ungroup to peel off grouping by Year\n  @arrange(desc(Year), Comedy)\n  @slice(1:5)\nend\n
    5\u00d74 DataFrame RowYearComedyMean_Yearly_RatingMedian_Yearly_RatingInt32Int32Float64Float641200506.627886.752200516.300816.13200406.765216.94200416.428986.65200306.404096.6"},{"location":"examples/generated/UserGuide/group_by/#combining-group_by-with-filter","title":"Combining @group_by() with @filter()","text":"
    @chain movies begin\n    @group_by(Year)\n    @filter(Rating == minimum(Rating))\n    @ungroup\n    @select(Year, Rating)\n    @arrange(desc(Year))\n    @slice(1:10)\nend\n
    10\u00d72 DataFrame RowYearRatingInt32Float64120051.8220041.0320041.0420041.0520041.0620041.0720041.0820041.0920031.01020031.0

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/interpolation/","title":"Interpolation","text":"

    The !! (\"bang bang\") operator can be used to interpolate values of variables from the parent environment into your code. This operator is borrowed from the R rlang package. At some point, we may switch to using native Julia interpolation, but for a variety of reasons that introduce some complexity with native interpolation, we plan to continue to support !! interpolation.

    To interpolate multiple variables, the rlang R package uses the !!! \"triple bang\" operator. However, in TidierData.jl, the !! \"bang bang\" operator can be used to interpolate either single or multiple values as shown in the examples below.

    Note: You can only interpolate values from variables in the parent environment. If you would like to interpolate column names, you have two options: you can either use across() or you can use @aside with @pull() to create variables in the parent environment containing the values of those columns which can then be accessed using interpolation.

    myvar = :b and myvar = Cols(:a, :b) both refer to columns with those names. On the other hand, myvar = \"b\", myvar = (\"a\", \"b\") and myvar = [\"a\", \"b\"] will interpolate the values. If you intend to interpolate column names, the preferred way is to use Cols() as in the examples below.

    using TidierData\n\ndf = DataFrame(a = string.(repeat('a':'e', inner = 2)),\n               b = [1,1,1,2,2,2,3,3,3,4],\n               c = 11:20)\n
    10\u00d73 DataFrame RowabcStringInt64Int641a1112a1123b1134b2145c2156c2167d3178d3189e31910e420

    "},{"location":"examples/generated/UserGuide/interpolation/#select-the-column-because-myvar-contains-a-symbol","title":"Select the column (because myvar contains a symbol)","text":"
    myvar = :b\n\n@chain df begin\n  @select(!!myvar)\nend\n
    10\u00d71 DataFrame RowbInt64112131425262738393104"},{"location":"examples/generated/UserGuide/interpolation/#select-multiple-variables","title":"Select multiple variables","text":"

    You can also use a vector as in [:a, :b], but Cols() is preferred because it lets you mix and match numbers.

    myvars = Cols(:a, :b)\n\n@chain df begin\n  @select(!!myvars)\nend\n
    10\u00d72 DataFrame RowabStringInt641a12a13b14b25c26c27d38d39e310e4

    This is the same as this...

    myvars = Cols(:a, 2)\n\n@chain df begin\n  @select(!!myvars)\nend\n
    10\u00d72 DataFrame RowabStringInt641a12a13b14b25c26c27d38d39e310e4

    "},{"location":"examples/generated/UserGuide/interpolation/#filter-rows-containing-the-value-of-myvar_string","title":"Filter rows containing the value of myvar_string","text":"
    myvar_string = \"b\"\n\n@chain df begin\n  @filter(a == !!myvar_string)\nend\n
    2\u00d73 DataFrame RowabcStringInt64Int641b1132b214"},{"location":"examples/generated/UserGuide/interpolation/#filtering-rows-works-similarly-using-in","title":"Filtering rows works similarly using in.","text":"

    Note that for in to work here, we have to wrap it in [] because otherwise, the string will be converted into a collection of characters, which are a different data type.

    myvar_string = \"b\"\n\n@chain df begin\n  @filter(a in [!!myvar_string])\nend\n
    2\u00d73 DataFrame RowabcStringInt64Int641b1132b214

    "},{"location":"examples/generated/UserGuide/interpolation/#you-can-also-use-this-for-a-vector-or-tuple-of-strings","title":"You can also use this for a vector (or tuple) of strings.","text":"
    myvars_string = [\"a\", \"b\"]\n\n@chain df begin\n  @filter(a in !!myvars_string)\nend\n
    4\u00d73 DataFrame RowabcStringInt64Int641a1112a1123b1134b214"},{"location":"examples/generated/UserGuide/interpolation/#mutate-one-variable","title":"Mutate one variable","text":"

    Remember: You cannot interpolate column names into @mutate() expressions. However, you can create a temporary variable containing the values of the column in question or you can use @mutate() with across().

    "},{"location":"examples/generated/UserGuide/interpolation/#option-1-create-a-temporary-variable-containing-the-values-of-the-column","title":"Option 1: Create a temporary variable containing the values of the column.","text":"
    myvar = :b\n\n@chain df begin\n  @aside(myvar_values = @pull(_, !!myvar))\n  @mutate(d = !!myvar_values + 1)\nend\n
    10\u00d74 DataFrame RowabcdStringInt64Int64Int641a11122a11223b11324b21435c21536c21637d31748d31849e319410e4205"},{"location":"examples/generated/UserGuide/interpolation/#option-2-use-mutate-with-across","title":"Option 2: Use @mutate() with across()","text":"

    Note: when using across(), anonymous functions are not vectorized. This is intentional to allow users to specify their function exactly as desired.

    @chain df begin\n  @mutate(across(!!myvar, x -> x .+ 1))\n  @rename(d = b_function)\nend\n
    10\u00d74 DataFrame RowabcdStringInt64Int64Int641a11122a11223b11324b21435c21536c21637d31748d31849e319410e4205

    "},{"location":"examples/generated/UserGuide/interpolation/#summarize-across-one-variable","title":"Summarize across one variable","text":"
    myvar = :b\n\n@chain df begin\n  @summarize(across(!!myvar, mean))\nend\n
    1\u00d71 DataFrame Rowb_meanFloat6412.2"},{"location":"examples/generated/UserGuide/interpolation/#summarize-across-multiple-variables","title":"Summarize across multiple variables","text":"
    myvars = Cols(:b, :c)\n\n@chain df begin\n  @summarize(across(!!myvars, (mean, minimum, maximum)))\nend\n
    1\u00d76 DataFrame Rowb_meanc_meanb_minimumc_minimumb_maximumc_maximumFloat64Float64Int64Int64Int64Int6412.215.5111420"},{"location":"examples/generated/UserGuide/interpolation/#group-by-one-interpolated-variable","title":"Group by one interpolated variable","text":"
    myvar = :a\n\n@chain df begin\n  @group_by(!!myvar)\n  @summarize(c = mean(c))\nend\n
    5\u00d72 DataFrame RowacStringFloat641a11.52b13.53c15.54d17.55e19.5"},{"location":"examples/generated/UserGuide/interpolation/#group-by-multiple-interpolated-variables","title":"Group by multiple interpolated variables","text":"

    Once again, you can mix and match column selectors within Cols()

    myvars = Cols(:a, 2)\n\n@chain df begin\n  @group_by(!!myvars)\n  @summarize(c = mean(c))\nend\n

    GroupedDataFrame with 5 groups based on key: a

    First Group (1 row): a = \"a\" RowabcStringInt64Float641a111.5

    &vellip;

    Last Group (2 rows): a = \"e\" RowabcStringInt64Float641e319.02e420.0

    Notice that df remains grouped by a because the @summarize() peeled off one layer of grouping.

    "},{"location":"examples/generated/UserGuide/interpolation/#global-constants","title":"Global constants","text":"

    You can also use !! interpolation to access global variables like pi.

    df = DataFrame(radius = 1:5)\n\n@chain df begin\n  @mutate(area = !!pi * radius^2)\nend\n
    5\u00d72 DataFrame RowradiusareaInt64Float64113.141592212.56643328.27434450.26555578.5398

    As of v0.14.0, global constants defined within the Base or Core modules (like missing, pi, and Real) can be directly referenced without any !!.

    @chain df begin\n  @mutate(area = pi * radius^2)\nend\n
    5\u00d72 DataFrame RowradiusareaInt64Float64113.141592212.56643328.27434450.26555578.5398

    "},{"location":"examples/generated/UserGuide/interpolation/#alternative-interpolation-syntax","title":"Alternative interpolation syntax","text":"

    Since we know that pi is defined in the Main module, we can also access it using Main.pi.

    @chain df begin\n  @mutate(area = Main.pi * radius^2)\nend\n
    5\u00d72 DataFrame RowradiusareaInt64Float64113.141592212.56643328.27434450.26555578.5398

    The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use !!variable or [Module_name_here].variable syntax to refer to this variable.

    Note: You can use !! interpolation anywhere, including inside of functions and loops.

    df = DataFrame(a = string.(repeat('a':'e', inner = 2)),\n               b = [1,1,1,2,2,2,3,3,3,4],\n               c = 11:20)\n\nfor col in [:b, :c]\n  @chain df begin\n    @summarize(across(!!col, mean))\n    println\n  end\nend\n
    1\u00d71 DataFrame\n Row \u2502 b_mean\n     \u2502 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     2.2\n1\u00d71 DataFrame\n Row \u2502 c_mean\n     \u2502 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    15.5\n

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/joins/","title":"Joins","text":"

    One really nice thing about the R tidyverse implementation of joins is that they support natural joins. If you don't specify which columns to join on, these column names are inferred from the overlapping columns. While you can override this behavior by specifying which columns to join on, it's convenient that this is not strictly required. We have adopted a similar approach to joins in TidierData.jl.

    Here, we will only show examples of natural joins. For additional ways to join, take a look at the examples in the Reference.

    using TidierData\n

    Let's generate two data frames to join on. Here's the first one.

    df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n

    And here's the second one.

    df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n

    All the joins work similarly to R's tidyverse although the new join_by syntax for non-equijoins is not (yet) supported.

    "},{"location":"examples/generated/UserGuide/joins/#left-join","title":"Left join","text":"
    @left_join(df1, df2)\n
    2\u00d73 DataFrame RowabcStringInt64Int64?1a132b2missing"},{"location":"examples/generated/UserGuide/joins/#right-join","title":"Right join","text":"
    @right_join(df1, df2)\n
    2\u00d73 DataFrame RowabcStringInt64?Int641a132cmissing4"},{"location":"examples/generated/UserGuide/joins/#inner-join","title":"Inner join","text":"
    @inner_join(df1, df2)\n
    1\u00d73 DataFrame RowabcStringInt64Int641a13"},{"location":"examples/generated/UserGuide/joins/#full-join","title":"Full join","text":"
    @full_join(df1, df2)\n
    3\u00d73 DataFrame RowabcStringInt64?Int64?1a132b2missing3cmissing4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/mutate_transmute/","title":"@mutate","text":"

    The primary purpose of @mutate() is to either create a new column or to update an existing column without changing the number of rows in the dataset. If you only plan to select the mutated columns, then you can use @transmute() instead of @mutate(). However, in TidierData.jl, @select() can also be used to create and select new columns (unlike R's tidyverse), which means that @transmute() is a redundant function in that it has the same functionality as @select(). @transmute is included in TidierData.jl for convenience but is not strictly required.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-to-add-a-new-column","title":"Using @mutate() to add a new column","text":"

    Let's create a new column that contains the budget for each movie expressed in millions of dollars, and then select a handful of columns and rows for the sake of brevity. Notice that the underscores in 1_000_000 are strictly optional and included only for the sake of readability. Underscores within numbers are ignored by Julia, such that 1_000_000 is read by Julia exactly the same as 1000000.

    @chain movies begin\n  @filter(!ismissing(Budget))\n  @mutate(Budget_Millions = Budget/1_000_000)\n  @select(Title, Budget, Budget_Millions)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleBudgetBudget_MillionsStringInt32?Float641'G' Men4500000.452'Manos' the Hands of Fate190000.0193'Til There Was You2300000023.04.com for Murder50000005.0510 Things I Hate About You1600000016.0

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-to-update-an-existing-column","title":"Using @mutate() to update an existing column","text":"

    Here we will repeat the same exercise, except that we will overwrite the existing Budget column.

    @chain movies begin\n    @filter(!ismissing(Budget))\n    @mutate(Budget = Budget/1_000_000)\n    @select(Title, Budget)\n    @slice(1:5)\nend\n
    5\u00d72 DataFrame RowTitleBudgetStringFloat641'G' Men0.452'Manos' the Hands of Fate0.0193'Til There Was You23.04.com for Murder5.0510 Things I Hate About You16.0

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-with-in","title":"Using @mutate() with in","text":"

    Here's an example of using @mutate with in.

    @chain movies begin\n  @filter(!ismissing(Budget))\n  @mutate(Nineties = Year in 1990:1999)\n  @select(Title, Year, Nineties)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleYearNinetiesStringInt32Bool1'G' Men1935false2'Manos' the Hands of Fate1966false3'Til There Was You1997true4.com for Murder2002false510 Things I Hate About You1999true

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-with-n-and-row_number","title":"Using @mutate with n() and row_number()","text":"

    Here's an example of using @mutate with both n() and row_number(). Within the context of @mutate(), n() and row_number() are turned into temporary columns, which means that they can be used inside of expressions.

    @chain movies begin\n  @mutate(Row_Num = row_number(),\n          Total_Rows = n())\n  @filter(!ismissing(Budget))\n  @select(Title, Year, Row_Num, Total_Rows)\n  @slice(1:5)\nend\n
    5\u00d74 DataFrame RowTitleYearRow_NumTotal_RowsStringInt32Int64Int641'G' Men193522587882'Manos' the Hands of Fate196635587883'Til There Was You199748587884.com for Murder20029158788510 Things I Hate About You199911258788

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-transmute-to-update-and-select-columns","title":"Using @transmute to update and select columns.","text":"

    If we knew we wanted to select only the Title and Budget columns, we could have also used @transmute(), which (again) is just an alias for @select().

    @chain movies begin\n    @filter(!ismissing(Budget))\n    @transmute(Title = Title, Budget = Budget/1_000_000)\n    @slice(1:5)\nend\n
    5\u00d72 DataFrame RowTitleBudgetStringFloat641'G' Men0.452'Manos' the Hands of Fate0.0193'Til There Was You23.04.com for Murder5.0510 Things I Hate About You16.0

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/nesting/","title":"Nesting","text":""},{"location":"examples/generated/UserGuide/nesting/#nest","title":"@nest","text":"

    Nest columns into a dataframe nested into a new column

    using TidierData\n\ndf4 = DataFrame(x = [\"a\", \"b\", \"a\", \"b\", \"C\", \"a\"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7)\n\nnested_df = @nest(df4, n2 = starts_with(\"a\"), n3 = y:yz)\n
    3\u00d73 DataFrame Rowxn3n2StringDataFrameDataFrame1a3\u00d72 DataFrame3\u00d72 DataFrame2b2\u00d72 DataFrame2\u00d72 DataFrame3C1\u00d72 DataFrame1\u00d72 DataFrame

    To return to the original dataframe, you can unnest wider and then longer.

    @chain nested_df begin\n    @unnest_wider(n3:n2)\n    @unnest_longer(y:ab)\nend\n
    6\u00d75 DataFrame RowxyyzaabStringInt64Int64Int64Int641a1137122a3159103a6181274b2148115b4161096C517118

    Or you can unnest longer and then wider.

    @chain nested_df begin\n  @unnest_longer(n3:n2)\n  @unnest_wider(n3:n2)\nend\n
    6\u00d75 DataFrame RowxyzyaabStringInt64Int64Int64Int641a1317122a1539103a1861274b1428115b1641096C175118

    "},{"location":"examples/generated/UserGuide/nesting/#unnest_longer","title":"@unnest_longer","text":"

    @unnest_longer adds one row per entry of an array or dataframe, lengthening dataframe by flattening the column or columns.

    df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]);\n\n@chain df begin\n    @unnest_longer(y)\nend\n
    5\u00d72 DataFrame RowxyInt64Any121222323434535

    If there are rows with empty arrays, keep_empty will prevent these rows from being dropped. indices_include will add a new column for each flattened column that logs the position of each entry in the array.

    @chain df begin\n    @unnest_longer(y, keep_empty = true, indices_include = true)\nend\n
    7\u00d73 DataFrame Rowxyy_idInt64AnyInt6411missing12211322242335341635274missing1

    "},{"location":"examples/generated/UserGuide/nesting/#unnest_wider","title":"@unnest_wider","text":"

    @unnest_wider will widen a column or column(s) of Dicts, Arrays, Tuples or Dataframes into multiple columns.

    df2 = DataFrame(\n           name = [\"Zaki\", \"Farida\"],\n           attributes = [\n               Dict(\"age\" => 25, \"city\" => \"New York\"),\n               Dict(\"age\" => 30, \"city\" => \"Los Angeles\")]);\n\n@chain df2 begin\n    @unnest_wider(attributes)\nend\n
    2\u00d73 DataFrame RownamecityageStringStringInt641ZakiNew York252FaridaLos Angeles30

    "},{"location":"examples/generated/UserGuide/nesting/#unnesting-nested-dataframes-with-different-lengths-which-contains-arrays","title":"Unnesting nested Dataframes with different lengths which contains arrays","text":"
    df3 = DataFrame(\n    x = 1:3,\n    y = Any[\n        DataFrame(),\n        DataFrame(a = [\"A\"], b = [14]),\n        DataFrame(a = [\"A\", \"B\", \"C\"], b = [13, 12, 11], c = [4, 4, 4])\n    ]\n)\n
    3\u00d72 DataFrame RowxyInt64Any110\u00d70 DataFrame221\u00d72 DataFrame333\u00d73 DataFrame

    df3 contains dataframes with different widths that also contain arrays. Chaining together @unnest_wider and @unnest_longer will unnest the columns to tuples first and then they will be fully unnested after.

    @chain df3 begin\n    @unnest_wider(y)\n    @unnest_longer(a:c, keep_empty = true)\nend\n
    5\u00d74 DataFrame RowxabcInt64AnyInt64?Int64?11missingmissingmissing22A14missing33A13443B12453C114

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/piping/","title":"Piping","text":"

    The easiest way to use TidierData.jl for complex data transformation operations is to connect them together using pipes. Julia comes with the built-in |> pipe operator, but TidierData.jl also includes and re-exports the @chain macro from the Chain.jl package. On this page, we will show you how to use both approaches.

    First, let's load a dataset.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/piping/#julias-built-in-pipe","title":"Julia's built-in |> pipe","text":"

    If we wanted to figure out the number of rows in the movies data frame, one way to do this is to apply the nrow() function to movies. The most straightforward way is to write it like this:

    nrow(movies)\n
    58788\n

    Another perfectly valid way to write this expression is by piping movies into nrow using the |> pipe operator.

    movies |> nrow\n
    58788\n

    Why might we want to do this? Well, whereas the first expression would naturally be read as \"Calculate the number of rows of movies,\" the second expression reads as \"Start with movies, then calculate the number of rows.\" For a simple expression, these are easy enough to reason about. However, as we start to pipe more and more functions in a single expression, the piped version becomes much easier to reason about.

    One quick note about Julia's built-in pipe: writing movies |> nrow() would not be considered valid. This is because Julia's built-in pipe always expects a function and not a function call. Writing nrow by itself is naming the function, whereas writing nrow() is calling the function. This quickly becomes an issue once we want to supply arguments to the function we are calling.

    Consider another approach to calculating the number of rows:

    size(movies, 1)\n
    58788\n

    In this case, the size() function returns a tuple of (rows, columns), and if you supply an optional second argument specifying the index of the tuple, it returns only that dimension. In this case, we called size() with a second argument of 1, indicating that we only wanted the function to return the number of rows.

    How would we write this using Julia's built-in pipe?

    movies |>\n  x -> size(x, 1)\n
    58788\n

    You might have wanted to write movies |> size(1), but because size(1) would represent a function call, we have to wrap the function call within an anonymous function, which is easily accomplished using the x -> func(x, arg1, arg2) syntax, where func() refers to any function and arg1 and arg2 refer to any additional arguments that are needed.

    Another way we could have accomplished this is to calculate size, which returns a tuple of (rows, columns), and then to use an anonymous function to grab the first value. Since we are calculating size without any arguments, we can simply write size within the pipe. However, to grab the first value using the x[1] syntax, we have to define an anonymous function. Putting it all together, we get this approach to piping:

    movies |>\n  size |>\n  x -> x[1]\n
    58788\n

    "},{"location":"examples/generated/UserGuide/piping/#using-the-chain-macro","title":"Using the @chain macro","text":"

    The @chain macro comes from the Chain.jl package and is included and re-exported by TidierData.jl. Let's do this same series of exercises using @chain.

    Let's calculate the number of rows using @chain.

    @chain movies nrow\n
    58788\n

    One of the reasons we prefer the use of @chain in TidierData.jl is that it is so concise. There is no need for any operator. Another interesting thing is that @chain doesn't care whether you use a function name or a function call. Both approaches work. As a result, writing nrow() instead of nrow is equally valid using @chain.

    @chain movies nrow()\n
    58788\n

    There are two options for writing out multi-row chains. The preferred approach is as follows, where the starting item is listed, followed by a begin-end block.

    @chain movies begin\n  nrow\nend\n
    58788\n

    @chain also comes with a built-in placeholder, which is _. To calculate the size and extract the first value, we can use this approach:

    @chain movies begin\n  size\n  _[1]\nend\n
    58788\n

    You don't have to list the data frame before the begin-end block. This is equally valid:

    @chain begin\n  movies\n  size\n  _[1]\nend\n
    58788\n

    The only time this approach is preferred is when instead of simply naming the data frame, you are using a function to read in the data frame from a file or database. Because this function call may include the path of the file, which could be quite long, it's easier to write this on its own line within the begin-end block.

    While the documentation for TidierData.jl follows the convention of placing piped functions on separate lines of code using begin-end blocks, this is purely convention for ease of readability. You could rewrite the code above without the begin-end block as follows:

    @chain movies size _[1]\n
    58788\n

    For simple transformations, this approach is both concise and readable.

    "},{"location":"examples/generated/UserGuide/piping/#using-chain-with-tidierdatajl","title":"Using @chain with TidierData.jl","text":"

    Returning to our convention of multi-line pipes, let's grab the first five movies that were released since 2000 and had a rating of at least 9 out of 10. Here is one way that we could write this:

    @chain movies begin\n    @filter(Year >= 2000 && Rating >= 9)\n    @slice(1:5)\n    @select(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    Note: we generally prefer using && in Julia because it is a \"short-cut\" operator. If the first condition evaluates to false, then the second condition is not even evaluated, which makes it faster (because it takes a short-cut).

    In the case of @filter, multiple conditions can be written out as separate expressions.

    @chain movies begin\n  @filter(Year >= 2000, Rating >= 9)\n  @slice(1:5)\n  @select(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    Another way to write this expression is to take advantage of the fact that Julia macros can be called without parentheses. In this case, we will add back the && for the sake of readability.

    @chain movies begin\n  @filter Year >= 2000 && Rating >= 9\n  @slice 1:5\n  @select 1:5\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    Lastly, TidierData.jl also supports multi-line expressions within each of the macros that accept multiple expressions. So you could also write this as follows:

    @chain movies begin\n  @filter begin\n    Year >= 2000\n    Rating >= 9\n  end\n  @slice 1:5\n  @select 1:5\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    What's nice about this approach is that if you want to remove some criteria, you can easily comment out the relevant parts. For example, if you're willing to consider older movies, just comment out the Year >= 2000.

    @chain movies begin\n  @filter begin\n    # Year >= 2000\n    Rating >= 9\n  end\n  @slice 1:5\n  @select 1:5\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641+1 -119877missing9.42100 Years at the Movies19949missing9.2313 Lakes2004135missing9.042wks, 1yr2002104missing9.45500 Years Later2005106missing9.3

    "},{"location":"examples/generated/UserGuide/piping/#which-approach-to-use","title":"Which approach to use?","text":"

    The purpose of this page was to show you that both Julia's native pipes and the @chain macro are perfectly valid and capable. We prefer the use of @chain because it is a bit more flexible and concise, with a syntax that makes it easy to comment out individual operations. We have adopted a similar begin-end block functionality within TidierData.jl itself, so that you can spread arguments out over multiple lines if you prefer. In the end, the choice is up to you!

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/pivots/","title":"Pivoting","text":"

    Pivoting a dataset is needed when information sitting inside of cell values needs to be converted into column names (to make the dataset wider) or vice versa (to make the dataset longer). Either action can be referred to as \"reshaping\" a dataset, and various frameworks refer to the actions as unstacking/stacking or spreading/gathering. In R's tidyverse, these actions are referred to as pivoting, where the two accompanying actions are @pivot_wider() and @pivot_longer().

    "},{"location":"examples/generated/UserGuide/pivots/#pivot_wider","title":"@pivot_wider()","text":"

    Pivoting a dataset to make it wider is needed when information sitting inside of cell values needs to be converted into column names. The wider format is sometimes required for the purposes of calculating correlations or running statistical tests.

    Let's start with a \"long\" DataFrame and make it wide. Why would we want to make it wide? Well, if we wanted to calculate a correlation between A and B for rows with corresponding id numbers, we may need to first make sure that A and B are represented in adjacent columns.

    using TidierData\n\ndf_long = DataFrame(id = [1, 1, 2, 2],\n                    variable = [\"A\", \"B\", \"A\", \"B\"],\n                    value = [1, 2, 3, 4])\n
    4\u00d73 DataFrame RowidvariablevalueInt64StringInt6411A121B232A342B4

    To make this dataset wider, we can do the following:

    @pivot_wider(df_long, names_from = variable, values_from = value)\n
    2\u00d73 DataFrame RowidABInt64Int64?Int64?11122234

    In @pivot_wider(), both the names_from and values_from arguments are required. @pivot_wider() also supports string values for the names_from and values_from arguments.

    @pivot_wider(df_long, names_from = \"variable\", values_from = \"value\")\n
    2\u00d73 DataFrame RowidABInt64Int64?Int64?11122234

    "},{"location":"examples/generated/UserGuide/pivots/#pivot_longer","title":"@pivot_longer()","text":"

    For calculating summary statistics (e.g., mean) by groups, or for plotting purposes, DataFrames often need to be converted to their longer form. For this, we can use @pivot_longer. First, let's start with a \"wide\" DataFrame.

    df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4])\n
    2\u00d73 DataFrame RowidABInt64Int64Int6411122234

    Now, let's transform this wide dataset into the longer form. Unlike @pivot_wider(), where providing the names_from and values_from arguments is required, the only item that's required in @pivot_longer() is a set of columns to pivot. The names_to and values_to arguments are optional, and if not provided, they will default to \"variable\" and \"value\", respectively.

    We can recreate the original long dataset by doing the following. Multiple columns must be provided using selection syntax or a selection helper. Tuples containing multiple columns are not yet supported.

    @pivot_longer(df_wide, A:B)\n
    4\u00d73 DataFrame RowidvariablevalueInt64StringInt6411A122A331B242B4

    Here is another way of providing the same result using a different type of selection syntax.

    @pivot_longer(df_wide, -id)\n
    4\u00d73 DataFrame RowidvariablevalueInt64StringInt6411A122A331B242B4

    The selected columns can also be included as an array

    @pivot_longer(df_wide, [id, B])\n
    4\u00d73 DataFrame RowAvariablevalueInt64StringInt6411id123id231B243B4

    or excluded

    @pivot_longer(df_wide, -[id, B])\n
    2\u00d74 DataFrame RowidBvariablevalueInt64Int64StringInt64112A1224A3

    If all columns should be included, they can be specified by either everything(), :, or by leaving the argument blank

    @pivot_longer(df_wide, everything())\n
    6\u00d72 DataFrame RowvariablevalueStringInt641id12id23A14A35B26B4

    In this example, we set the names_to and values_to arguments. Either argument can be left out and will revert to the default value. The names_to and values_to arguments can be provided as strings or as bare unquoted variable names.

    Here is an example with names_to and values_to containing strings:

    @pivot_longer(df_wide, A:B, names_to = \"letter\", values_to = \"number\")\n
    4\u00d73 DataFrame RowidletternumberInt64StringInt6411A122A331B242B4

    And here is an example with names_to and values_to containing bare unquoted variables:

    @pivot_longer(df_wide, A:B, names_to = letter, values_to = number)\n
    4\u00d73 DataFrame RowidletternumberInt64StringInt6411A122A331B242B4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/rename/","title":"@rename","text":"

    Renaming columns follows the same syntax as in R's tidyverse, where the \"tidy expression\" is new_name = old_name. While the main function to rename columns is @rename(), you can also use @select() if you additionally plan to select only the renamed columns.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/rename/#rename-using-rename","title":"Rename using @rename()","text":"

    If you only want to rename the columns without selecting them, then this is where @rename() comes in handy. For the sake of brevity, we are selecting the first 5 columns and rows after performing the @rename().

    @chain movies begin\n    @rename(title = Title, Minutes = Length)\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowtitleYearMinutesBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4

    "},{"location":"examples/generated/UserGuide/rename/#rename-using-select","title":"Rename using @select()","text":"

    If you plan to only select those columns that you would like to rename, then you can use @select() to both rename and select the columns of interest.

    @chain movies begin\n  @select(title = Title, Minutes = Length)\n  @slice(1:5)\nend\n
    5\u00d72 DataFrame RowtitleMinutesStringInt321$1212$1000 a Touchdown713$21 a Day Once a Month74$40,000705$50,000 Climax Show, The71

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/select/","title":"@select","text":"

    The @select() macro in TidierData.jl supports many of the nuances of the R tidyverse implementation, including indexing columns individually by name or number, indexing by ranges of columns using the : operator between column names or numbers, and negative selection using negated column names or numbers. Selection helpers such as starts_with(), ends_with(), matches(), and contains() are also supported.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-individually-by-name","title":"Select the first 5 columns individually by name","text":"
    @chain movies begin\n    @select(Title, Year, Length, Budget, Rating)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-individually-by-number","title":"Select the first 5 columns individually by number","text":"
    @chain movies begin\n    @select(1, 2, 3, 4, 5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-by-name-using-a-range","title":"Select the first 5 columns by name (using a range)","text":"
    @chain movies begin\n    @select(Title:Rating)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-by-number-using-a-range","title":"Select the first 5 columns by number (using a range)","text":"
    @chain movies begin\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-all-but-the-first-5-columns-by-name","title":"Select all but the first 5 columns by name","text":"

    Here we will limit the results to the first 5 remaining columns and the first 5 rows for the sake of brevity.

    @chain movies begin\n    @select(-(Title:Rating))\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowVotesR1R2R3R4Int32Float64Float64Float64Float6413484.54.54.54.52200.014.54.524.5350.00.00.00.04614.50.00.00.051724.54.50.014.5

    We can also use ! for inverted selection instead of -.

    @chain movies begin\n  @select(!(Title:Rating))\n  @select(1:5)\n  @slice(1:5)\nend\n
    5\u00d75 DataFrame RowVotesR1R2R3R4Int32Float64Float64Float64Float6413484.54.54.54.52200.014.54.524.5350.00.00.00.04614.50.00.00.051724.54.50.014.5

    "},{"location":"examples/generated/UserGuide/select/#select-all-but-the-first-5-columns-by-number","title":"Select all but the first 5 columns by number","text":"

    We will again limit the results to the first 5 remaining columns and the first 5 rows for the sake of brevity.

    @chain movies begin\n    @select(-(1:5))\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowVotesR1R2R3R4Int32Float64Float64Float64Float6413484.54.54.54.52200.014.54.524.5350.00.00.00.04614.50.00.00.051724.54.50.014.5

    "},{"location":"examples/generated/UserGuide/select/#mix-and-match-selection","title":"Mix and match selection","text":"

    Just like in R's tidyverse, you can separate multiple selections with commas and mix and match different ways of selecting columns.

    @chain movies begin\n    @select(1, Budget:Rating)\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleBudgetRatingStringInt32?Float641$missing6.42$1000 a Touchdownmissing6.03$21 a Day Once a Monthmissing8.24$40,000missing8.25$50,000 Climax Show, Themissing3.4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/sep_unite/","title":"Separating","text":"

    Following the tidyverse syntax, the @separate() macro in TidierData.jl separates a single column into multiple columns. This is particularly useful for splitting a column containing delimited values into individual columns.

    using TidierData\n\ndf = DataFrame(a = [\"1-1\", \"2-2\", \"3-3-3\"]);\n

    "},{"location":"examples/generated/UserGuide/sep_unite/#separate","title":"@separate","text":"

    Separate the \"a\" column into \"b\", \"c\", and \"d\" columns based on the dash delimiter

    @chain df begin\n    @separate(a, (b, c, d), \"-\")\nend\n
    3\u00d73 DataFrame RowbcdSubStrin\u2026SubStrin\u2026SubStrin\u2026?111missing222missing3333

    The into columns can also be designated as follows:

    new_names = [\"x$(i)\" for i in 1:3]; # or new_names = [\"b\", \"c\", \"d\"], or new_names = [:b, :c, :d]\n\n@separate(df, a, !!new_names, \"-\")\n
    3\u00d73 DataFrame Rowx1x2x3SubStrin\u2026SubStrin\u2026SubStrin\u2026?111missing222missing3333

    "},{"location":"examples/generated/UserGuide/sep_unite/#unite","title":"@unite","text":"

    The @unite macro brings together multiple columns into one, separating the values by a user-specified delimiter. Here, the @unite macro combines the \"b\", \"c\", and \"d\" columns into a single new \"new_col\" column using the \"/\" delimiter.

    df = DataFrame(\n       b = [\"1\", \"2\", \"3\"],\n       c = [\"1\", \"2\", \"3\"],\n       d = [missing, missing, \"3\"]);\n\n@chain df begin\n    @unite(new_col, (b, c, d), \"/\")\nend\n
    3\u00d74 DataFrame Rowbcdnew_colStringStringString?String111missing1/1222missing2/233333/3/3

    "},{"location":"examples/generated/UserGuide/sep_unite/#separate_rows","title":"@separate_rows","text":"

    Separate rows into multiple rows based on a chosen delimiter.

    df = DataFrame(\n       a = 1:3,\n       b = [\"a\", \"aa;bb;cc\", \"dd;ee\"],\n       c = [\"1\", \"2;3;4\", \"5;6\"],\n       d = [\"7\", \"8;9;10\", \"11;12\"],\n       e = [\"11\", \"22;33;44\", \"55;66\"]);\n\n@separate_rows(df, b:e, \";\")\n
    6\u00d75 DataFrame RowabcdeInt64SubStrin\u2026SubStrin\u2026SubStrin\u2026SubStrin\u202611a171122aa282232bb393342cc4104453dd5115563ee61266

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/slice/","title":"@slice","text":"

    Slicing rows is similar to filtering rows, except that slicing is performed based on row numbers rather than filter criteria. In TidierData.jl, slicing works similarly to R's tidyverse in that both positive (which rows to keep) and negative (which rows to remove) slicing is supported. For @slice(), any valid UnitRange of integers is considered valid; this is not the case for @select() or across().

    Remember: Just like every other TidierData.jl top-level macro, @slice() respects group. This means that in a grouped data frame, @slice(1:2) will select the first 2 rows from each group.

    using TidierData\n\ndf = DataFrame(row_num = 1:10,\n               a = string.(repeat('a':'e', inner = 2)),\n               b = [1,1,1,2,2,2,3,3,3,4])\n
    10\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b144b255c266c277d388d399e31010e4

    "},{"location":"examples/generated/UserGuide/slice/#slicing-using-a-range-of-numbers","title":"Slicing using a range of numbers","text":"

    This is an easy way of retrieving 5 consecutive rows.

    @chain df begin\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b144b255c2

    "},{"location":"examples/generated/UserGuide/slice/#slicing-using-a-more-complex-unitrange-of-numbers","title":"Slicing using a more complex UnitRange of numbers","text":"

    How would we obtain every other row from 1 to 7 (counting up by 2)? Note that range() is similar to seq() in R.

    @chain df begin\n  @slice(range(start = 1, step = 2, stop = 7))\nend\n
    4\u00d73 DataFrame Rowrow_numabInt64StringInt6411a123b135c247d3

    This same code can also be written using Julia's shorthand syntax for unit ranges.

    @chain df begin\n  @slice(1:2:7)\nend\n
    4\u00d73 DataFrame Rowrow_numabInt64StringInt6411a123b135c247d3

    "},{"location":"examples/generated/UserGuide/slice/#separate-multiple-row-selections-with-commas","title":"Separate multiple row selections with commas","text":"

    If you have multiple different row selections, you can separate them with commas.

    @chain df begin\n    @slice(1:5, 10)\nend\n
    6\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b144b255c2610e4

    "},{"location":"examples/generated/UserGuide/slice/#use-n-as-short-hand-to-indicate-the-number-of-rows","title":"Use n() as short-hand to indicate the number of rows","text":"

    Select the last 2 rows.

    @chain df begin\n  @slice(n()-1, n())\nend\n
    2\u00d73 DataFrame Rowrow_numabInt64StringInt6419e3210e4

    You can even use n() inside of UnitRanges, just like in R. Notice that the order of operations is slightly different in Julia as compared to R, so you don't have to wrap the n()-1 expression inside of parentheses.

    @chain df begin\n  @slice(n()-1:n())\nend\n
    2\u00d73 DataFrame Rowrow_numabInt64StringInt6419e3210e4

    "},{"location":"examples/generated/UserGuide/slice/#inverted-selection-using-negative-numbers","title":"Inverted selection using negative numbers","text":"

    This line selects all rows except the first 5 rows.

    @chain df begin\n    @slice(-(1:5))\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6416c227d338d349e3510e4

    "},{"location":"examples/generated/UserGuide/slice/#sample-5-random-rows-in-the-data-frame","title":"Sample 5 random rows in the data frame","text":"
    @chain df begin\n  @slice_sample(n = 5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6416c221a135c248d357d3"},{"location":"examples/generated/UserGuide/slice/#slice-the-min","title":"Slice the min","text":"

    This line selects all rows with the minimum value of the desired column.

    @chain df begin\n  @slice_min(b)\nend\n
    3\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b1

    This line will only show the first row.

    @chain df begin\n  @slice_min(b, with_ties = false)\nend\n
    1\u00d73 DataFrame Rowrow_numabInt64StringInt6411a1

    "},{"location":"examples/generated/UserGuide/slice/#slice-the-max","title":"Slice the max","text":"

    The optional prop argument will slice a proportion of the full dataframe.

    @chain df begin\n  @slice_max(b, prop = 0.5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt64110e427d338d349e354b2

    "},{"location":"examples/generated/UserGuide/slice/#slice-the-tail","title":"Slice the tail","text":"
    @chain df begin\n  @slice_tail(prop = 0.5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6416c227d338d349e3510e4"},{"location":"examples/generated/UserGuide/slice/#slice-the-head","title":"Slice the head","text":"
    @chain df begin\n  @slice_head(n = 3)\nend\n
    3\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b1

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/summarize/","title":"@summarize","text":"

    Summarizing a dataset involves aggregating multiple rows down to (usually) a single row of data. This can be performed across the entire dataset, or if the dataset is grouped, then for each group in the dataset. This is implemented similarly to R's tidyverse using @summarize(). Out of admiration for Hadley Wickham, and to be consistent with the R tidyverse, both @summarize() and @summarise() are supported.

    Note that summarization is different from other verbs in TidierData.jl in 2 respects:

    1. No auto-vectorization is performed when using @summarize()
    2. One layer of grouping is removed after each @summarize() function.

    If you require further changes to grouping beyond the defaults, you can either @ungroup() or call @group_by() to regroup by a different set of variables.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/summarize/#using-summarize-with-n-to-count-the-number-of-movies-in-the-dataset","title":"Using @summarize() with n() to count the number of movies in the dataset.","text":"

    Within the context of @summarize() only, n() is converted to DataFrames.jl's nrow() function.

    @chain movies begin\n    @summarize(n = n())\nend\n
    1\u00d71 DataFrame RownInt64158788

    "},{"location":"examples/generated/UserGuide/summarize/#using-summarize-to-calculate-average-budget-of-movies-in-the-dataset","title":"Using @summarize() to calculate average budget of movies in the dataset.","text":"

    The median budget in this dataset is 3 million, and the mean budget is 13 million! Making movies must be way more lucrative than making Julia packages.

    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(median_budget = median(skipmissing(Budget)),\n             mean_budget = mean(skipmissing(Budget)))\nend\n
    1\u00d72 DataFrame Rowmedian_budgetmean_budgetFloat64Float6413.013.4125

    "},{"location":"examples/generated/UserGuide/summarize/#combining-group_by-with-summarise","title":"Combining @group_by() with @summarise()","text":"

    How many movies came out in each of the last 5 years?

    @chain movies begin\n  @group_by(Year)\n  @summarise(n = n())\n  @arrange(desc(Year))\n  @slice(1:5)\nend\n
    5\u00d72 DataFrame RowYearnInt32Int6412005349220041945320032158420022168520012121

    Notice that there was no need to explicitly @ungroup() the dataset after summarizing here. The @summarise() function removed one layer of grouping. Since this dataset was only grouped by one variable (Year), it was no longer grouped after the @summarise was performed.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/summary/","title":"@summary","text":"

    The @summary() macro in TidierData.jl provides a concise way to compute summary statistics on data. Similar to its R counterpart, it will provide the mean, median, Q1, Q3, minimum, maximum, and number of missing values in a numerical column or columns.

    "},{"location":"examples/generated/UserGuide/summary/#summary-for-the-whole-dataframe","title":"Summary for the whole dataframe","text":"
    using TidierData\n\ndf = DataFrame( A = [1, 2, 3, 4, 5], B = [missing, 7, 8, 9, 10], C = [11, missing, 13, 14, missing], D = [16, 17, 18, 19, 20]);\n\n@chain df begin\n    @summary()\nend\n\n@summary(df)\n
    4\u00d79 DataFrame RowColumnMinQ1MedianMeanQ3MaxCountMissing_CountStringInt64Float64Float64Float64Float64Int64Int64Int641A12.03.03.04.05502B77.758.58.59.2510413C1112.013.012.666713.514324D1617.018.018.019.02050"},{"location":"examples/generated/UserGuide/summary/#you-can-specify-columns-for-which-you-want-to-compute-the-summary-this-is-useful-if-the-dataframe-has-a-large-number-of-columns-and-youre-interested-in-only-a-subset-of-them","title":"You can specify columns for which you want to compute the summary. This is useful if the DataFrame has a large number of columns and you're interested in only a subset of them.","text":"
    @chain df begin\n    @summary(B)\nend\n\n@summary(df, B)\n
    1\u00d79 DataFrame RowColumnMinQ1MedianMeanQ3MaxCountMissing_CountStringInt64Float64Float64Float64Float64Int64Int64Int641B77.758.58.59.251041"},{"location":"examples/generated/UserGuide/summary/#or-for-a-range-of-columns","title":"or for a range of columns","text":"
    @chain df begin\n    @select(B:D)\n    @summary() # you can also write this @summary(2:4)\nend\n
    3\u00d79 DataFrame RowColumnMinQ1MedianMeanQ3MaxCountMissing_CountStringInt64Float64Float64Float64Float64Int64Int64Int641B77.758.58.59.2510412C1112.013.012.666713.514323D1617.018.018.019.02050

    This page was generated using Literate.jl.

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":""},{"location":"#what-is-tidierdatajl","title":"What is TidierData.jl?","text":"

    TidierData.jl is a 100% Julia implementation of the dplyr and tidyr R packages. Powered by the DataFrames.jl package and Julia\u2019s extensive meta-programming capabilities, TidierData.jl is an R user\u2019s love letter to data analysis in Julia.

    TidierData.jl has two goals, which differentiate it from other data analysis meta-packages in Julia:

    Stick as closely to tidyverse syntax as possible.

    Whereas other meta-packages introduce Julia-centric idioms for working with DataFrames, this package\u2019s goal is to reimplement parts of tidyverse in Julia. This means that TidierData.jl uses tidy expressions as opposed to idiomatic Julia expressions. An example of a tidy expression is a = mean(b). In Julia, a and b are variables and are thus \"eagerly\" evaluated. This means that if b is merely referring to a column in a data frame and not an object in the global namespace, then an error will be generated because b was not found. In idiomatic Julia, b would need to be expressed as a symbol, or :b. Even then, a = mean(:b) would generate an error because it's not possible to calculate the mean value of a symbol. To handle this using idiomatic Julia, DataFrames.jl introduces a mini-language that relies heavily on the creation of anonymous functions, with explicit directional pairs syntax using a source => function => destination syntax. While this is quite elegant, it can be verbose. TidierData.jl aims to reduce this complexity by exposing an R-like syntax, which is then converted into valid DataFrames.jl code. The reason that tidy expressions are considered valid by Julia in TidierData.jl is because they are implemented using macros. Macros \"capture\" the expressions they are given, and then they can modify those expressions before evaluating them. For consistency, all top-level dplyr functions are implemented as macros (whether or not a macro is truly needed), and all \"helper\" functions (used inside of those top-level functions) are implemented as functions or pseudo-functions (functions which only exist through modification of the abstract syntax tree).

    Make broadcasting mostly invisible.

    Broadcasting trips up many R users switching to Julia because R users are used to most functions being vectorized. TidierData.jl currently uses a lookup table to decide which functions not to vectorize; all other functions are automatically vectorized. Read the documentation page on \"Autovectorization\" to read about how this works, and how to override the defaults. An example of where this issue commonly causes errors is when centering a variable. To create a new column a that centers the column b, TidierData.jl lets you simply write a = b - mean(b) exactly as you would in R. This works because TidierData.jl knows to not vectorize mean() while also recognizing that - should be vectorized such that this expression is rewritten in DataFrames.jl as :b => (b -> b .- mean(b)) => :a. For any user-defined function that you want to \"mark\" as being non-vectorized, you can prefix it with a ~. For example, a function new_mean(), if it had the same functionality as mean() would normally get vectorized by TidierData.jl unless you write it as ~new_mean().

    "},{"location":"#installation","title":"Installation","text":"

    For the stable version:

    ] add TidierData\n

    The ] character starts the Julia package manager. Press the backspace key to return to the Julia prompt.

    or

    using Pkg\nPkg.add(\"TidierData\")\n

    For the newest version:

    ] add TidierData#main\n

    or

    using Pkg\nPkg.add(url=\"https://github.com/TidierOrg/TidierData.jl\")\n

    "},{"location":"#what-macros-and-functions-does-tidierdatajl-support","title":"What macros and functions does TidierData.jl support?","text":"

    To support R-style programming, TidierData.jl is implemented using macros. This is because macros are able to \"capture\" the code before executing it, which allows the package to support R-like \"tidy expressions\" that would otherwise not be considered valid Julia code.

    TidierData.jl currently supports the following top-level macros:

    Top-level macros:

    • @glimpse() and @head()
    • @select() and @distinct()
    • @rename() and @rename_with()
    • @mutate() and @transmute()
    • @summarize() and @summarise()
    • @filter()
    • @slice(), @slice_sample(), @slice_min(), @slice_max(), @slice_head(), and @slice_tail()
    • @group_by() and @ungroup()
    • @arrange()
    • @relocate()
    • @pull()
    • @count() and @tally()
    • @left_join(), @right_join(), @inner_join(), @full_join(), @anti_join(), and @semi_join()
    • @bind_rows() and @bind_cols()
    • @pivot_wider() and @pivot_longer()
    • @separate(), @separate_rows(), and @unite()
    • @drop_missing() and @fill_missing()
    • @unnest_longer(), @unnest_wider(), and @nest()
    • @clean_names() (as in R's janitor::clean_names() function)
    • @summary() (as in R's summary() function)

    TidierData.jl also supports the following helper functions:

    Helper functions:

    • across()
    • where()
    • desc()
    • if_else() and case_when()
    • n() and row_number()
    • ntile()
    • lag() and lead()
    • everything(), starts_with(), ends_with(), matches(), and contains()
    • as_float(), as_integer(), and as_string()
    • is_number(), is_float(), is_integer(), and is_string()
    • missing_if() and replace_missing()

    See the Reference page for a detailed guide to each of the macros and functions.

    "},{"location":"#example","title":"Example","text":"

    Let's select the first five movies in our dataset whose budget exceeds the mean budget. Unlike in R, where we pass an na.rm = TRUE argument to remove missing values, in Julia we wrap the variable with a skipmissing() to remove the missing values before the mean() is calculated.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n\n@chain movies begin\n    @mutate(Budget = Budget / 1_000_000)\n    @filter(Budget >= mean(skipmissing(Budget)))\n    @select(Title, Budget)\n    @slice(1:5)\nend\n
    5\u00d72 DataFrame\n Row \u2502 Title                       Budget   \n     \u2502 String                      Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 'Til There Was You              23.0\n   2 \u2502 10 Things I Hate About You      16.0\n   3 \u2502 102 Dalmatians                  85.0\n   4 \u2502 13 Going On 30                  37.0\n   5 \u2502 13th Warrior, The               85.0\n

    "},{"location":"#whats-new","title":"What\u2019s new","text":"

    See NEWS.md for the latest updates.

    "},{"location":"#whats-missing","title":"What's missing","text":"

    Is there a tidyverse feature missing that you would like to see in TidierData.jl? Please file a GitHub issue. Because TidierData.jl primarily wraps DataFrames.jl, our decision to integrate a new feature will be guided by how well-supported it is within DataFrames.jl and how likely other users are to benefit from it.

    "},{"location":"reference/","title":"Reference","text":""},{"location":"reference/#index","title":"Index","text":"
    • TidierData.TidierData_set
    • TidierData.across
    • TidierData.as_float
    • TidierData.as_integer
    • TidierData.as_string
    • TidierData.case_when
    • TidierData.desc
    • TidierData.ends_with
    • TidierData.everything
    • TidierData.if_else
    • TidierData.is_float
    • TidierData.is_integer
    • TidierData.is_number
    • TidierData.is_string
    • TidierData.matches
    • TidierData.missing_if
    • TidierData.n
    • TidierData.ntile
    • TidierData.replace_missing
    • TidierData.row_number
    • TidierData.starts_with
    • TidierData.where
    • TidierData.@anti_join
    • TidierData.@arrange
    • TidierData.@bind_cols
    • TidierData.@bind_rows
    • TidierData.@count
    • TidierData.@distinct
    • TidierData.@drop_missing
    • TidierData.@fill_missing
    • TidierData.@filter
    • TidierData.@full_join
    • TidierData.@glimpse
    • TidierData.@group_by
    • TidierData.@head
    • TidierData.@inner_join
    • TidierData.@left_join
    • TidierData.@mutate
    • TidierData.@nest
    • TidierData.@pivot_longer
    • TidierData.@pivot_wider
    • TidierData.@pull
    • TidierData.@relocate
    • TidierData.@rename
    • TidierData.@rename_with
    • TidierData.@right_join
    • TidierData.@select
    • TidierData.@semi_join
    • TidierData.@separate
    • TidierData.@separate_rows
    • TidierData.@slice
    • TidierData.@slice_head
    • TidierData.@slice_max
    • TidierData.@slice_min
    • TidierData.@slice_sample
    • TidierData.@slice_tail
    • TidierData.@summarise
    • TidierData.@summarize
    • TidierData.@summary
    • TidierData.@tally
    • TidierData.@transmute
    • TidierData.@ungroup
    • TidierData.@unite
    • TidierData.@unnest_longer
    • TidierData.@unnest_wider
    "},{"location":"reference/#reference-exported-functions","title":"Reference - Exported functions","text":"

    # TidierData.TidierData_set \u2014 Method.

    TidierData_set(option::AbstractString, value::Bool)\n

    Set package options.

    Here are the supported options and what they do:

    • \"code\": Defaults to false. If set to true, this option displays the DataFrames.jl code generated by the TidierData.jl package. It is useful for debugging whether errors are introduced by TidierData.jl's generated code.

    Arguments

    • option: \"code\"
    • value: true or false

    source

    # TidierData.across \u2014 Method.

    across(variable[s], function[s])\n

    Apply functions to multiple variables. If specifying multiple variables or functions, surround them with parentheses so that they are recognized as a tuple.

    This function should only be called inside of TidierData.jl macros.

    Arguments

    • variable[s]: An unquoted variable, or if multiple, an unquoted tuple of variables.
    • function[s]: A function, or if multiple, a tuple of functions.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @summarize(across(b, minimum))\n       end\n1\u00d71 DataFrame\n Row \u2502 b_minimum \n     \u2502 Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n\njulia> @chain df begin\n         @summarize(across((b,c), (minimum, maximum)))\n       end\n1\u00d74 DataFrame\n Row \u2502 b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11          5         15\n\njulia> @chain df begin\n         @mutate(across((b,c), (minimum, maximum)))\n       end\n5\u00d77 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Char  Int64  Int64  Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   
1 \u2502 a         1     11          1         11          5         15\n   2 \u2502 b         2     12          1         11          5         15\n   3 \u2502 c         3     13          1         11          5         15\n   4 \u2502 d         4     14          1         11          5         15\n   5 \u2502 e         5     15          1         11          5         15\n\njulia> @chain df begin\n         @mutate(across((b, starts_with(\"c\")), (minimum, maximum)))\n       end\n5\u00d77 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Char  Int64  Int64  Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11          1         11          5         15\n   2 \u2502 b         2     12          1         11          5         15\n   3 \u2502 c         3     13          1         11          5         15\n   4 \u2502 d         4     14          1         11          5         15\n   5 \u2502 e         5     15          1         11          5         15\n

    source

    # TidierData.as_float \u2014 Method.

    as_float(value)\n

    Convert a number or string to a Float64 data type.

    This is a useful helper for type conversions. Missing values are propagated.

    Arguments

    • value: An AbstractString, Number, or missing value.

    Examples

    julia> as_float(1)\n1.0\n\njulia> as_float(\"1.5\")\n1.5\n\njulia> as_float(missing)\nmissing\n

    source

    # TidierData.as_integer \u2014 Method.

    as_integer(value)\n

    Convert a number or string to an Int64 data type.

    This is a useful helper for type conversions. Missing values are propagated. Any values after the decimal point are removed.

    Arguments

    • value: An AbstractString, Number, or missing value.

    Examples

    julia> as_integer(1)\n1\n\njulia> as_integer(1.5)\n1\n\njulia> as_integer(\"2\")\n2\n\njulia> as_integer(\"2.5\")\n2\n\njulia> as_integer(missing)\nmissing\n

    source

    # TidierData.as_string \u2014 Method.

    as_string(value)\n

    Convert a number or string to a String data type.

    This is a useful helper for type conversions. Missing values are propagated.

    Arguments

    • value: An AbstractString, Number, or missing value.

    Examples

    julia> as_string(1)\n\"1\"\n\njulia> as_string(1.5)\n\"1.5\"\n\njulia> as_string(missing)\nmissing\n

    source

    # TidierData.case_when \u2014 Method.

    case_when(condition => return_value)\ncase_when(condition_1 => return_value_1, condition_2 => return_value_2, ...)\n

    Return the corresponding return_value for the first condition that evaluates to true.

    The most specific condition should be listed first and the most general condition should be listed last. If none of the conditions evaluate to true, then a missing value is returned.

    Arguments

    • condition: A condition that evaluates to true, false, or missing.
    • return_value: The value to return if the condition is true.

    Examples

    julia> df = DataFrame(a = [1, 2, missing, 4, 5]);\n\njulia> @chain df begin\n         @mutate(b = case_when(a > 4  =>  \"hi\",\n                               a > 2  =>  \"medium\",\n                               a > 0  =>  \"low\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  low\n   2 \u2502       2  low\n   3 \u2502 missing  missing \n   4 \u2502       4  medium\n   5 \u2502       5  hi\n\njulia> @chain df begin\n         @mutate(b = case_when(a > 4  =>  \"hi\",\n                               a > 2  =>  \"medium\",\n                               a > 0  =>  \"low\",\n                               true   =>  \"unknown\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  low\n   2 \u2502       2  low\n   3 \u2502 missing  unknown\n   4 \u2502       4  medium\n   5 \u2502       5  hi\n\njulia> @chain df begin\n         @mutate(b = case_when(a >= 3  =>  3,\n                               true    =>  a))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   Int64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1        1\n   2 \u2502       2        2\n   3 \u2502 missing  missing \n   4 \u2502       4        3\n   5 \u2502       5        3\n\njulia> @chain df begin\n         @mutate(b = case_when(a >= 3        =>  3,\n                               ismissing(a)  =>  0,\n                               true          =>  a))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1      1\n   2 \u2502       2      2\n   3 \u2502 missing      0\n   4 \u2502       4      3\n   5 \u2502       5      3\n

    source

    # TidierData.desc \u2014 Method.

    desc(col)\n

    Orders the rows of a DataFrame column in descending order when used inside of @arrange(). This function should only be called inside of @arrange().

    Arguments

    • col: An unquoted column name.

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);\n\njulia> @chain df begin\n         @arrange(a, desc(b))\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2     12\n   2 \u2502 a         1     11\n   3 \u2502 b         4     14\n   4 \u2502 b         3     13\n   5 \u2502 c         6     16\n   6 \u2502 c         5     15\n   7 \u2502 d         8     18\n   8 \u2502 d         7     17\n   9 \u2502 e        10     20\n  10 \u2502 e         9     19\n

    source

    # TidierData.ends_with \u2014 Method.

    ends_with(suffix)\n

    Select all columns ending with the suffix.

    Arguments

    • suffix: A string.

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(ends_with(\"1\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    b_1   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     21\n   2 \u2502     2     22\n   3 \u2502     3     23\n   4 \u2502     4     24\n   5 \u2502     5     25\n

    source

    # TidierData.everything \u2014 Method.

    everything()\n

    Select all (remaining) columns.

    Arguments

    • None

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(b_1, everything())\n       end\n5\u00d73 DataFrame\n Row \u2502 b_1    a_1    a_2   \n     \u2502 Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    21      1     11\n   2 \u2502    22      2     12\n   3 \u2502    23      3     13\n   4 \u2502    24      4     14\n   5 \u2502    25      5     15\n

    source

    # TidierData.if_else \u2014 Method.

    if_else(condition, yes, no, [miss])\n

    Return the yes value if the condition is true and the no value if the condition is false. If miss is specified, then the provided miss value is returned when the condition contains a missing value. If miss is not specified, then the returned value is an explicit missing value.

    Arguments

    • condition: A condition that evaluates to true, false, or missing.
    • yes: Value to return if the condition is true.
    • no: Value to return if the condition is false.
    • miss: Optional. Value to return if the condition is missing.

    Examples

    julia> df = DataFrame(a = [1, 2, missing, 4, 5]);\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, \"yes\", \"no\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  no\n   2 \u2502       2  no\n   3 \u2502 missing  missing \n   4 \u2502       4  yes\n   5 \u2502       5  yes\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, \"yes\", \"no\", \"unknown\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  no\n   2 \u2502       2  no\n   3 \u2502 missing  unknown\n   4 \u2502       4  yes\n   5 \u2502       5  yes\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, 3, a))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1        1\n   2 \u2502       2        2\n   3 \u2502 missing  missing \n   4 \u2502       4        3\n   5 \u2502       5        3\n\njulia> @chain df begin\n         @mutate(b = if_else(a >= 3, 3, a, 0))\n       end\n5\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1      1\n   2 \u2502       2      2\n   3 \u2502 missing      0\n   4 \u2502       4      3\n   5 \u2502       5      3\n

    source

    # TidierData.is_float \u2014 Method.

    is_float(column::AbstractVector)\n

    Determine if the given column contains floating-point numbers.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains floating-point numbers, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_float(df.c)\ntrue\n\njulia> is_float(df.b)\nfalse\n

    source

    # TidierData.is_integer \u2014 Method.

    is_integer(column::AbstractVector)\n

    Determine if the given column contains integers.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains integers, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_integer(df.b)\ntrue\n\njulia> is_integer(df.d)\nfalse\n

    source

    # TidierData.is_number \u2014 Method.

    is_number(column::AbstractVector)\n

    Determine if the given column contains numbers.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains numbers, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_number(df.b)\ntrue\n\njulia> is_number(df.c)\ntrue\n\njulia> is_number(df.d)\nfalse\n

    source

    # TidierData.is_string \u2014 Method.

    is_string(column::AbstractVector)\n

    Determine if the given column contains strings.

    Arguments

    • column::AbstractVector: The column whose data type needs to be checked.

    Returns

    • Bool: true if the column contains strings, false otherwise.

    Examples

    julia> df = DataFrame(b = [missing, 2, 3],\n                      c = [missing, 2.2, 34],\n                      d = [missing, missing, \"A\"]);\n\njulia> is_string(df.d)\ntrue\n\njulia> is_string(df.c)\nfalse\n

    source

    # TidierData.matches \u2014 Method.

    matches(pattern, [flags])\n

    Select all columns matching the pattern.

    Arguments

    • pattern: A string.
    • flags: Optional string containing flags. \"i\" = Do case-insensitive pattern matching. \"m\" = Treat string as multiple lines. \"s\" = Treat string as a single line. \"x\" = Tells the regular expression parser to ignore most whitespace that is neither backslashed nor within a character class. You can use this to break up your regular expression into (slightly) more readable parts.

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(matches(\"^a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    a_2   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df begin \n         @select(matches(\"1$\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    b_1   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     21\n   2 \u2502     2     22\n   3 \u2502     3     23\n   4 \u2502     4     24\n   5 \u2502     5     25\n\njulia> @chain df begin \n         @select(matches(\"A\", \"i\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    a_2   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n

    source

    # TidierData.missing_if \u2014 Method.

    missing_if(x, value)\n

    Replace a specific value with missing in x.

    Arguments

    • x: The input value which can be of any type. If x is already missing or equals value, the function will return missing. Otherwise, it returns x unaltered.
    • value: The specific value to be checked against.

    Examples

    julia> df = DataFrame(\n              a = [1, missing, 3, 4],\n              b = [\"apple\", \"apple\", \"banana\", \"cherry\"]\n            );\n\njulia> @chain df begin\n         @mutate(a = missing_if(a, 4), \n                 b = missing_if(b, \"apple\"))\n       end\n4\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1  missing \n   2 \u2502 missing  missing \n   3 \u2502       3  banana\n   4 \u2502 missing  cherry\n

    source

    # TidierData.n \u2014 Method.

    n()\n

    Return the number of rows in the DataFrame or in the group if used in the context of a GroupedDataFrame.

    Arguments

    • None

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);\n\njulia> @chain df begin\n         @summarize(n = n())\n       end\n1\u00d71 DataFrame\n Row \u2502 n     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    10\n\njulia> @chain df begin\n         @group_by(a)\n         @summarize(n = n())\n       end\n5\u00d72 DataFrame\n Row \u2502 a     n     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2\n   2 \u2502 b         2\n   3 \u2502 c         2\n   4 \u2502 d         2\n   5 \u2502 e         2\n

    source

    # TidierData.ntile \u2014 Method.

    ntile(x, n::Integer)\n

    Break the input vector into n equal-sized buckets.

    ntile() is a rough rank that breaks the input vector into n buckets. If length(x) is not an integer multiple of n, the size of the buckets will differ by up to one, with larger buckets coming first.

    Unlike other ranking functions, ntile() ignores ties: it will create evenly sized buckets even if the same value of x ends up in different buckets.

    Arguments

    • x: A vector to rank. By default, the smallest values will get the smallest ranks. Missing values will be given rank missing.
    • n: Number of groups to bucket into.

    Examples

    julia> x = [5,1,3,2,2, missing]\n6-element Vector{Union{Missing, Int64}}:\n 5\n 1\n 3\n 2\n 2\n  missing\n\njulia> ntile(x, 2)\n6-element Vector{Union{Missing, Int64}}:\n 2\n 1\n 2\n 1\n 1\n  missing\n\njulia> ntile(x, 4)\n6-element Vector{Union{Missing, Int64}}:\n 4\n 1\n 3\n 1\n 2\n  missing\n\njulia> ntile(1:8, 3)\n8-element Vector{Int64}:\n 1\n 1\n 1\n 2\n 2\n 2\n 3\n 3\n\njulia> df = DataFrame(a = 1:8);\n\njulia> @chain df begin\n       @mutate(buckets = ntile(a, 3))\n       end\n8\u00d72 DataFrame\n Row \u2502 a      buckets \n     \u2502 Int64  Int64   \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1\n   2 \u2502     2        1\n   3 \u2502     3        1\n   4 \u2502     4        2\n   5 \u2502     5        2\n   6 \u2502     6        2\n   7 \u2502     7        3\n   8 \u2502     8        3\n

    source

    # TidierData.replace_missing \u2014 Method.

    replace_missing(x, replacement)\n

    Replace missing values in x with a specified replacement value.

    Arguments

    • x: The input value which can be of any type. If x is missing, the function will return replacement. Otherwise, it returns x unaltered.
    • replacement: The value to replace missing with in x.

    Examples

    julia> df = DataFrame(\n              a = [1, missing, 3, 4],\n              b = [4, 5, missing, 8]\n            );\n\njulia> @chain df begin\n         @mutate(a = replace_missing(a, 100),\n                 b = replace_missing(b, 35))\n       end\n4\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      4\n   2 \u2502   100      5\n   3 \u2502     3     35\n   4 \u2502     4      8\n

    source

    # TidierData.row_number \u2014 Method.

    row_number()\n

    Return each row's number in a DataFrame or in the group if used in the context of a GroupedDataFrame.

    Arguments

    • None

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2));\n\njulia> @chain df begin\n         @mutate(row_num = row_number())\n       end\n10\u00d72 DataFrame\n Row \u2502 a     row_num \n     \u2502 Char  Int64   \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n   3 \u2502 b           3\n   4 \u2502 b           4\n   5 \u2502 c           5\n   6 \u2502 c           6\n   7 \u2502 d           7\n   8 \u2502 d           8\n   9 \u2502 e           9\n  10 \u2502 e          10\n\njulia> @chain df begin\n         @mutate(row_num = row_number() + 1)\n       end\n10\u00d72 DataFrame\n Row \u2502 a     row_num \n     \u2502 Char  Int64   \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           2\n   2 \u2502 a           3\n   3 \u2502 b           4\n   4 \u2502 b           5\n   5 \u2502 c           6\n   6 \u2502 c           7\n   7 \u2502 d           8\n   8 \u2502 d           9\n   9 \u2502 e          10\n  10 \u2502 e          11\n\njulia> @chain df begin\n         @filter(row_number() <= 5)\n       end\n5\u00d71 DataFrame\n Row \u2502 a    \n     \u2502 Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a\n   2 \u2502 a\n   3 \u2502 b\n   4 \u2502 b\n   5 \u2502 c\n

    source

    # TidierData.starts_with \u2014 Method.

    starts_with(prefix)\n

    Select all columns starting with the prefix.

    Arguments

    • prefix: A string.

    Examples

    julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);\n\njulia> @chain df begin \n         @select(starts_with(\"a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 a_1    a_2   \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n

    source

    # TidierData.where \u2014 Method.

    where(function)\n

    Selects columns on which a function returns true for all values of the column.

    This function should only be called inside of TidierData.jl macros.

    Arguments

    • function: A predicate function (one that returns true or false).

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @select(where(is_number))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n\njulia> @chain df begin\n         @mutate(across(where(is_number), minimum))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum \n     \u2502 Char  Int64  Int64  Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11          1         11\n   2 \u2502 b         2     12          1         11\n   3 \u2502 c         3     13          1         11\n   4 \u2502 d         4     14          1         11\n   5 \u2502 e         5     15          1         11\n\njulia> df = DataFrame(a = repeat('a':'e', inner = 3),\n                      b = 1:15,\n                      c = 16:30,\n                      d = 31:45);\n\njulia> @chain df begin\n         @group_by(a)\n         @summarize(across(where(is_number), mean))\n       end\n5\u00d74 DataFrame\n Row \u2502 a     b_mean   c_mean   d_mean  \n     \u2502 Char  Float64  
Float64  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2.0     17.0     32.0\n   2 \u2502 b         5.0     20.0     35.0\n   3 \u2502 c         8.0     23.0     38.0\n   4 \u2502 d        11.0     26.0     41.0\n   5 \u2502 e        14.0     29.0     44.0\n

    source

    # TidierData.@anti_join \u2014 Macro.

    @anti_join(df1, df2, [by])\n

    Perform an anti-join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @anti_join(df1, df2)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, a = a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n\njulia> @anti_join(df1, df2, \"a\" = \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           2\n

    source

    # TidierData.@arrange \u2014 Macro.

    @arrange(df, exprs...)\n

    Order the rows of a DataFrame by the values of specified columns.

    Arguments

    • df: A DataFrame.
    • exprs...: Variables from the input DataFrame. Use desc() to sort in descending order. Multiple variables can be specified, separated by commas.

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);\n\njulia> @chain df begin\n         @arrange(a)\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b         3     13\n   4 \u2502 b         4     14\n   5 \u2502 c         5     15\n   6 \u2502 c         6     16\n   7 \u2502 d         7     17\n   8 \u2502 d         8     18\n   9 \u2502 e         9     19\n  10 \u2502 e        10     20\n\njulia> @chain df begin\n         @arrange(a, desc(b))\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         2     12\n   2 \u2502 a         1     11\n   3 \u2502 b         4     14\n   4 \u2502 b         3     13\n   5 \u2502 c         6     16\n   6 \u2502 c         5     15\n   7 \u2502 d         8     18\n   8 \u2502 d         7     17\n   9 \u2502 e        10     20\n  10 \u2502 e         9     19\n

    source

    # TidierData.@bind_cols \u2014 Macro.

    @bind_cols(dfs...)\n

    Bind many DataFrames into one by column.

    Arguments

    • dfs...: DataFrames to combine.

    Examples

    julia> df1 = DataFrame(a=1:3, b=1:3);\n\njulia> df2 = DataFrame(a=4:6, b=4:6);\n\njulia> df3 = DataFrame(a=7:9, c=7:9);\n\njulia> @chain df1 begin\n         @bind_cols(df2, df3)\n       end\n3\u00d76 DataFrame\n Row \u2502 a      b      a_1    b_1    a_2    c     \n     \u2502 Int64  Int64  Int64  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      4      4      7      7\n   2 \u2502     2      2      5      5      8      8\n   3 \u2502     3      3      6      6      9      9\n

    source

    # TidierData.@bind_rows \u2014 Macro.

    @bind_rows(dfs..., [id])\n

    Bind many DataFrames into one by row.

    Columns present in at least one of the provided DataFrames are kept. Columns not present in some DataFrames are filled with missing values where necessary.

    Arguments

    • dfs...: DataFrames to combine.
    • id: Optional string giving the name of the identifier column. When id is supplied, a new column of numeric identifiers is created to link each row to its original DataFrame.

    Examples

    julia> df1 = DataFrame(a=1:3, b=1:3);\n\njulia> df2 = DataFrame(a=4:6, b=4:6);\n\njulia> df3 = DataFrame(a=7:9, c=7:9);\n\njulia> @chain df1 begin\n         @bind_rows(df2)\n       end\n6\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1\n   2 \u2502     2      2\n   3 \u2502     3      3\n   4 \u2502     4      4\n   5 \u2502     5      5\n   6 \u2502     6      6\n

    When columns are not present in some DataFrames, they are filled with missing values.

    julia> @chain df1 begin\n         @bind_rows(df2, df3)\n       end\n9\u00d73 DataFrame\n Row \u2502 a      b        c       \n     \u2502 Int64  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1  missing \n   2 \u2502     2        2  missing \n   3 \u2502     3        3  missing \n   4 \u2502     4        4  missing \n   5 \u2502     5        5  missing \n   6 \u2502     6        6  missing \n   7 \u2502     7  missing        7\n   8 \u2502     8  missing        8\n   9 \u2502     9  missing        9\n\njulia> @chain df1 begin\n         @bind_rows(df2, df3, id = \"id\")\n       end\n9\u00d74 DataFrame\n Row \u2502 a      b        c        id    \n     \u2502 Int64  Int64?   Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1  missing      1\n   2 \u2502     2        2  missing      1\n   3 \u2502     3        3  missing      1\n   4 \u2502     4        4  missing      2\n   5 \u2502     5        5  missing      2\n   6 \u2502     6        6  missing      2\n   7 \u2502     7  missing        7      3\n   8 \u2502     8  missing        8      3\n   9 \u2502     9  missing        9      3\n

    source

    # TidierData.@count \u2014 Macro.

    @count(df, exprs..., [wt], [sort])\n

    Count the unique values of one or more variables, with an optional weighting.

    @chain df @count(a, b) is roughly equivalent to @chain df @group_by(a, b) @summarize(n = n()). Supply wt to perform weighted counts, switching the summary from n = n() to n = sum(wt). Note that if grouping columns are provided, the result will be an ungrouped data frame, which is slightly different behavior than R's tidyverse.

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • exprs...: Column names, separated by commas.
    • wt: Optional parameter. Used to calculate a sum over the provided wt variable instead of counting the rows.
    • sort: Defaults to false. Whether the result should be sorted from highest to lowest n.

    Examples

    julia> df = DataFrame(a = vcat(repeat([\"a\"], inner = 3),\n                           repeat([\"b\"], inner = 3),\n                           repeat([\"c\"], inner = 1),\n                           missing),\n                      b = 1:8)\n8\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            1\n   2 \u2502 a            2\n   3 \u2502 a            3\n   4 \u2502 b            4\n   5 \u2502 b            5\n   6 \u2502 b            6\n   7 \u2502 c            7\n   8 \u2502 missing      8\n\njulia> @chain df @count()\n1\u00d71 DataFrame\n Row \u2502 n     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     8\n\njulia> @chain df begin\n         @count(a)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            3\n   2 \u2502 b            3\n   3 \u2502 c            1\n   4 \u2502 missing      1\n\njulia> @chain df begin\n         @count(a, wt = b)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            6\n   2 \u2502 b           15\n   3 \u2502 c            7\n   4 \u2502 missing      8\n\njulia> @chain df begin\n         @count(a, wt = b, sort = true)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  
Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           15\n   2 \u2502 missing      8\n   3 \u2502 c            7\n   4 \u2502 a            6       \n

    source

    # TidierData.@distinct \u2014 Macro.

    @distinct(df, exprs...)\n

    Return distinct rows of a DataFrame.

    If no columns or expressions are provided, then unique rows across all columns are returned. Otherwise, unique rows are determined based on the columns or expressions provided, and then all columns are returned.

    Arguments

    • df: A DataFrame.
    • exprs...: One or more unquoted variable names separated by commas. Variable names can also be treated as positions in the data, so expressions like x:y can be used to select a range of variables.

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = repeat(1:5, 2), c = 11:20);\n\njulia> @chain df @distinct()\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b         3     13\n   4 \u2502 b         4     14\n   5 \u2502 c         5     15\n   6 \u2502 c         1     16\n   7 \u2502 d         2     17\n   8 \u2502 d         3     18\n   9 \u2502 e         4     19\n  10 \u2502 e         5     20\n\njulia> @chain df @distinct(a)\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         3     13\n   3 \u2502 c         5     15\n   4 \u2502 d         2     17\n   5 \u2502 e         4     19\n\njulia> @chain df begin\n         @distinct(starts_with(\"a\"))\n       end\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         3     13\n   3 \u2502 c         5     15\n   4 \u2502 d         2     17\n   5 \u2502 e         4     19\n\njulia> @chain df begin\n         @distinct(a, b)\n       end\n10\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b       
  3     13\n   4 \u2502 b         4     14\n   5 \u2502 c         5     15\n   6 \u2502 c         1     16\n   7 \u2502 d         2     17\n   8 \u2502 d         3     18\n   9 \u2502 e         4     19\n  10 \u2502 e         5     20\n

    source

    # TidierData.@drop_missing \u2014 Macro.

    @drop_missing(df, [cols...])\n

    Drop all rows with missing values.

    When called without arguments, @drop_missing() drops all rows with missing values in any column. If columns are provided as an optional argument, only missing values from named columns are considered when dropping rows.

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • cols...: An optional column, or multiple columns separated by commas or specified using selection helpers.

    Examples

    julia> df = DataFrame(\n              a = [1, 2, missing, 4],\n              b = [1, missing, 3, 4]\n            )\n4\u00d72 DataFrame\n Row \u2502 a        b       \n     \u2502 Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502       1        1\n   2 \u2502       2  missing \n   3 \u2502 missing        3\n   4 \u2502       4        4\n\njulia> @chain df @drop_missing()\n2\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1\n   2 \u2502     4      4\n\njulia> @chain df @drop_missing(a)\n3\u00d72 DataFrame\n Row \u2502 a      b       \n     \u2502 Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1\n   2 \u2502     2  missing \n   3 \u2502     4        4\n\njulia> @chain df @drop_missing(a, b)\n2\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1\n   2 \u2502     4      4\n\njulia> @chain df @drop_missing(starts_with(\"a\"))\n3\u00d72 DataFrame\n Row \u2502 a      b       \n     \u2502 Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1        1\n   2 \u2502     2  missing \n   3 \u2502     4        4\n

    source

    # TidierData.@fill_missing \u2014 Macro.

    @fill_missing(df, [columns...], direction)

    Fill missing values in a DataFrame df using the specified method.

    Arguments

    • df: The DataFrame or GroupedDataFrame in which you want to fill missing values.
    • columns: (Optional) The columns for which missing values need to be filled, separated by commas. If not provided, the operation is applied to all columns.
    • direction: A string containing the method to use for filling missing values. Options include: \"down\" (last observation carried forward) or \"up\" (next observation carried backward).

    Examples

    julia> df = DataFrame(\n          dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n          dt2 = [0.3, 2, missing, 3, missing, 5, 6,missing],\n          dt3 = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n          dt4 = [0.3, missing, missing, 3, missing, 5, 6, missing],\n          dt5 = ['a', 'b', 'a', 'b', 'a', 'a', 'a', 'b']);\n\njulia> @fill_missing(df, dt2, dt4, \"down\")\n8\u00d75 DataFrame\n Row \u2502 dt1        dt2       dt3        dt4       dt5  \n     \u2502 Float64?   Float64?  Float64?   Float64?  Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3  missing         0.3  a\n   2 \u2502       0.2       2.0        0.2       0.3  b\n   3 \u2502 missing         2.0  missing         0.3  a\n   4 \u2502 missing         3.0  missing         3.0  b\n   5 \u2502       1.0       3.0        1.0       3.0  a\n   6 \u2502 missing         5.0  missing         5.0  a\n   7 \u2502       5.0       6.0        5.0       6.0  a\n   8 \u2502       6.0       6.0        6.0       6.0  b\n\njulia> @chain df begin\n         @fill_missing(\"up\")\n       end\n8\u00d75 DataFrame\n Row \u2502 dt1       dt2        dt3       dt4        dt5  \n     \u2502 Float64?  Float64?   Float64?  Float64?   
Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      0.2        0.3       0.2        0.3  a\n   2 \u2502      0.2        2.0       0.2        3.0  b\n   3 \u2502      1.0        3.0       1.0        3.0  a\n   4 \u2502      1.0        3.0       1.0        3.0  b\n   5 \u2502      1.0        5.0       1.0        5.0  a\n   6 \u2502      5.0        5.0       5.0        5.0  a\n   7 \u2502      5.0        6.0       5.0        6.0  a\n   8 \u2502      6.0  missing         6.0  missing    b \n\njulia> @chain df begin\n         @group_by(dt5)\n         @fill_missing(dt1, \"up\")\n       end\nGroupedDataFrame with 2 groups based on key: dt5\nFirst Group (5 rows): dt5 = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)\n Row \u2502 dt1       dt2        dt3        dt4        dt5  \n     \u2502 Float64?  Float64?   Float64?   Float64?   Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      1.0        0.3  missing          0.3  a\n   2 \u2502      1.0  missing    missing    missing    a\n   3 \u2502      1.0  missing          1.0  missing    a\n   4 \u2502      5.0        5.0  missing          5.0  a\n   5 \u2502      5.0        6.0        5.0        6.0  a\n\u22ee\nLast Group (3 rows): dt5 = 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase)\n Row \u2502 dt1       dt2        dt3        dt4        dt5  \n     \u2502 Float64?  Float64?   Float64?   Float64?   
Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      0.2        2.0        0.2  missing    b\n   2 \u2502      6.0        3.0  missing          3.0  b\n   3 \u2502      6.0  missing          6.0  missing    b\n

    source

    # TidierData.@filter \u2014 Macro.

    @filter(df, exprs...)\n

    Subset a DataFrame and return a copy of the DataFrame containing only the rows where the specified conditions are satisfied.

    Arguments

    • df: A DataFrame.
    • exprs...: transformation(s) that produce vectors containing true or false.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @filter(b >= mean(b))\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 c         3     13\n   2 \u2502 d         4     14\n   3 \u2502 e         5     15\n\njulia> @chain df begin\n         @filter(b >= 3 && c >= 14)\n       end\n2\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 d         4     14\n   2 \u2502 e         5     15\n\njulia> @chain df begin\n         @filter(b in (1, 3))\n       end\n2\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 c         3     13\n

    source

    # TidierData.@full_join \u2014 Macro.

    @full_join(df1, df2, [by])\n

    Perform a full join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @full_join(df1, df2)\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, a)\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, a = a)\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, \"a\")\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n\njulia> @full_join(df1, df2, \"a\" = \"a\")\n3\u00d73 DataFrame\n Row \u2502 a       b        c       \n     \u2502 String  Int64?   Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1        3\n   2 \u2502 b             2  missing \n   3 \u2502 c       missing        4\n

    source

    # TidierData.@glimpse \u2014 Macro.

    @glimpse(df, width = 80)\n

    Preview a DataFrame (or GroupedDataFrame).

    The @glimpse macro is used to preview a DataFrame or GroupedDataFrame. Each column is printed on a separate row, along with its data type and first few elements, with the output truncated based on the width.

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • width: The width of the output, measured in the number of characters. Defaults to 80.

    Examples

    julia> df = DataFrame(\n               a = 1:100, \n               b = 1:100, \n               c = repeat([\"a\"], 100)\n               );\n\njulia> @chain df @glimpse\nRows: 100\nColumns: 3\n.a             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.b             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.c             String         a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,\n\njulia> @chain df begin\n       @group_by(a)\n       @glimpse()\n       end\nRows: 100\nColumns: 3\nGroups: a [100]\n.a             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.b             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,\n.c             String         a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,\n

    source

    # TidierData.@group_by \u2014 Macro.

    @group_by(df, exprs...)\n

    Return a GroupedDataFrame where operations are performed by groups specified by unique sets of cols.

    Arguments

    • df: A DataFrame.
    • exprs...: DataFrame columns to group by or tidy expressions. Can be a single tidy expression or multiple expressions separated by commas.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @group_by(a)\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 a     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1.0\n   2 \u2502 b         2.0\n   3 \u2502 c         3.0\n   4 \u2502 d         4.0\n   5 \u2502 e         5.0  \n\njulia> @chain df begin\n         @group_by(d = uppercase(a))\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 d     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 A         1.0\n   2 \u2502 B         2.0\n   3 \u2502 C         3.0\n   4 \u2502 D         4.0\n   5 \u2502 E         5.0\n\njulia> @chain df begin\n         @group_by(-(b, c)) # same as `a`\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 a     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1.0\n   2 \u2502 b         2.0\n   3 \u2502 c         3.0\n   4 \u2502 d         4.0\n   5 \u2502 e         5.0\n\njulia> @chain df begin\n         @group_by(!(b, c)) # same as `a`\n         @summarize(b = mean(b))\n       end\n5\u00d72 DataFrame\n Row \u2502 a     b       \n     \u2502 Char  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1.0\n   2 \u2502 b         2.0\n   3 \u2502 c         3.0\n   4 \u2502 d         4.0\n   5 \u2502 e         5.0\n

    source

    # TidierData.@head \u2014 Macro.

       @head(df, value)\n

    Shows the first n rows of the data frame or of each group in a grouped data frame.

    Arguments

    • df: The data frame.
    • value: number of rows to be returned. Defaults to 6 if left blank.

    Examples

    julia> df = DataFrame(a = vcat(repeat([\"a\"], inner = 4),\n                                  repeat([\"b\"], inner = 4)),\n                             b = 1:8)\n8\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n   3 \u2502 a           3\n   4 \u2502 a           4\n   5 \u2502 b           5\n   6 \u2502 b           6\n   7 \u2502 b           7\n   8 \u2502 b           8\n\njulia> @head(df, 3)\n3\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            1\n   2 \u2502 a            2\n   3 \u2502 a            3\n\njulia> @head(df)\n6\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n   3 \u2502 a           3\n   4 \u2502 a           4\n   5 \u2502 b           5\n   6 \u2502 b           6\n\njulia> @chain df begin\n         @group_by a\n         @head 2\n       end\nGroupedDataFrame with 2 groups based on key: a\nFirst Group (2 rows): a = \"a\"\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n   2 \u2502 a           2\n\u22ee\nLast Group (2 rows): a = \"b\"\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           5\n   2 \u2502 b           6\n

    source

    # TidierData.@inner_join \u2014 Macro.

    @inner_join(df1, df2, [by])\n

    Perform an inner join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @inner_join(df1, df2)\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, a)\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, a = a)\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, \"a\")\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n\njulia> @inner_join(df1, df2, \"a\" = \"a\")\n1\u00d73 DataFrame\n Row \u2502 a       b      c     \n     \u2502 String  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1      3\n

    source

    # TidierData.@left_join \u2014 Macro.

    @left_join(df1, df2, [by])\n

    Perform a left join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @left_join(df1, df2)\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing \n\njulia> @left_join(df1, df2, a)\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n\njulia> @left_join(df1, df2, a = a)\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n\njulia> @left_join(df1, df2, \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n\njulia> @left_join(df1, df2, \"a\" = \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b      c       \n     \u2502 String  Int64  Int64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1        3\n   2 \u2502 b           2  missing\n

    source

    # TidierData.@mutate \u2014 Macro.

    @mutate(df, exprs...)\n

    Create new columns as functions of existing columns. The results have the same number of rows as df.

    Arguments

    • df: A DataFrame.
    • exprs...: add new columns or replace values of existing columns using new_variable = values syntax.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @mutate(d = b + c,\n                 b_minus_mean_b = b - mean(b))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      d      b_minus_mean_b \n     \u2502 Char  Int64  Int64  Int64  Float64        \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11     12            -2.0\n   2 \u2502 b         2     12     14            -1.0\n   3 \u2502 c         3     13     16             0.0\n   4 \u2502 d         4     14     18             1.0\n   5 \u2502 e         5     15     20             2.0\n\njulia> @chain df begin\n         @mutate begin\n           d = b + c\n           b_minus_mean_b = b - mean(b)\n         end\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      d      b_minus_mean_b \n     \u2502 Char  Int64  Int64  Int64  Float64        \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11     12            -2.0\n   2 \u2502 b         2     12     14            -1.0\n   3 \u2502 c         3     13     16             0.0\n   4 \u2502 d         4     14     18             1.0\n   5 \u2502 e         5     15     20             2.0\n\njulia> @chain df begin\n         @mutate(d = b in (1,3))\n       end\n5\u00d74 DataFrame\n Row \u2502 a     b      c      d     \n     \u2502 Char  Int64  Int64  Bool  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11   true\n   2 \u2502 b         2     12  false\n   3 \u2502 c         3     13   true\n   4 \u2502 d         4     14  false\n   5 \u2502 e         5     15  false\n\njulia> @chain df begin\n         @mutate(across((b, c), mean))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      b_mean   c_mean  \n     \u2502 Char  Int64  Int64  Float64  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11      3.0     13.0\n   2 \u2502 b         2     12      3.0     13.0\n   3 \u2502 c         3     13      3.0     13.0\n   4 \u2502 d         4     14      3.0     13.0\n   5 \u2502 e         5     15      3.0     13.0\n\njulia> @chain df begin\n         @summarize(across(contains(\"b\"), mean))\n       end\n1\u00d71 DataFrame\n Row \u2502 b_mean  \n     \u2502 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0\n\njulia> @chain df begin\n         @summarize(across(-contains(\"a\"), mean))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_mean   c_mean  \n     \u2502 Float64  Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0     13.0\n\njulia> @chain df begin\n         @mutate(across(where(is_number), minimum))\n       end\n5\u00d75 DataFrame\n Row \u2502 a     b      c      b_minimum  c_minimum \n     \u2502 Char  Int64  Int64  Int64      Int64     
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11          1         11\n   2 \u2502 b         2     12          1         11\n   3 \u2502 c         3     13          1         11\n   4 \u2502 d         4     14          1         11\n   5 \u2502 e         5     15          1         11\n

    source

    # TidierData.@nest \u2014 Macro.

    @nest(df, new_column = nesting_columns)\n

    Multiple columns are nested into one or more new columns in a DataFrame.

    Arguments

    • df: A DataFrame
    • new_column: New column name
    • nesting_columns: Columns to be nested into the new_column

    Examples

    julia> df = DataFrame(a = repeat('a':'e', inner = 3),\n                      b = 1:15,\n                      c_1 = 16:30,\n                      c_2 = 31:45);\n\njulia> @nest(df, data = b:c_2)\n5\u00d72 DataFrame\n Row \u2502 a     data          \n     \u2502 Char  DataFrame     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     3\u00d73 DataFrame \n   2 \u2502 b     3\u00d73 DataFrame \n   3 \u2502 c     3\u00d73 DataFrame \n   4 \u2502 d     3\u00d73 DataFrame \n   5 \u2502 e     3\u00d73 DataFrame \n\njulia> @nest(df, data_1 = b, data_2 = starts_with(\"c\"))\n5\u00d73 DataFrame\n Row \u2502 a     data_1         data_2        \n     \u2502 Char  DataFrame      DataFrame     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     3\u00d71 DataFrame  3\u00d72 DataFrame \n   2 \u2502 b     3\u00d71 DataFrame  3\u00d72 DataFrame \n   3 \u2502 c     3\u00d71 DataFrame  3\u00d72 DataFrame \n   4 \u2502 d     3\u00d71 DataFrame  3\u00d72 DataFrame \n   5 \u2502 e     3\u00d71 DataFrame  3\u00d72 DataFrame \n\njulia> @chain df begin\n         @nest(data = b:c_2)\n         @unnest_longer(data)\n       end\n15\u00d72 DataFrame\n Row \u2502 a     data                         \n     \u2502 Char  NamedTup\u2026                    \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     (b = 1, c_1 = 16, c_2 = 31)\n   2 \u2502 a     (b = 2, c_1 = 17, c_2 = 32)\n   3 \u2502 a     (b = 3, c_1 = 18, c_2 = 33)\n   4 
\u2502 b     (b = 4, c_1 = 19, c_2 = 34)\n   5 \u2502 b     (b = 5, c_1 = 20, c_2 = 35)\n   6 \u2502 b     (b = 6, c_1 = 21, c_2 = 36)\n   7 \u2502 c     (b = 7, c_1 = 22, c_2 = 37)\n   8 \u2502 c     (b = 8, c_1 = 23, c_2 = 38)\n   9 \u2502 c     (b = 9, c_1 = 24, c_2 = 39)\n  10 \u2502 d     (b = 10, c_1 = 25, c_2 = 40)\n  11 \u2502 d     (b = 11, c_1 = 26, c_2 = 41)\n  12 \u2502 d     (b = 12, c_1 = 27, c_2 = 42)\n  13 \u2502 e     (b = 13, c_1 = 28, c_2 = 43)\n  14 \u2502 e     (b = 14, c_1 = 29, c_2 = 44)\n  15 \u2502 e     (b = 15, c_1 = 30, c_2 = 45)\n\njulia> @chain df begin\n         @nest(data = b:c_2)\n         @unnest_wider(data)\n       end\n5\u00d74 DataFrame\n Row \u2502 a     b             c_1           c_2          \n     \u2502 Char  Any           Any           Any          \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a     [1, 2, 3]     [16, 17, 18]  [31, 32, 33]\n   2 \u2502 b     [4, 5, 6]     [19, 20, 21]  [34, 35, 36]\n   3 \u2502 c     [7, 8, 9]     [22, 23, 24]  [37, 38, 39]\n   4 \u2502 d     [10, 11, 12]  [25, 26, 27]  [40, 41, 42]\n   5 \u2502 e     [13, 14, 15]  [28, 29, 30]  [43, 44, 45]\n\njulia> @chain df begin\n         @nest(data = -a)\n         @unnest_wider(data) # wider first\n         @unnest_longer(-a)  # then longer\n       end\n15\u00d74 DataFrame\n Row \u2502 a     b      c_1    c_2   \n     \u2502 Char  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     16     31\n   2 \u2502 a         2     17     32\n   3 \u2502 a         3     18     
33\n   4 \u2502 b         4     19     34\n   5 \u2502 b         5     20     35\n   6 \u2502 b         6     21     36\n   7 \u2502 c         7     22     37\n   8 \u2502 c         8     23     38\n   9 \u2502 c         9     24     39\n  10 \u2502 d        10     25     40\n  11 \u2502 d        11     26     41\n  12 \u2502 d        12     27     42\n  13 \u2502 e        13     28     43\n  14 \u2502 e        14     29     44\n  15 \u2502 e        15     30     45\n\njulia> @chain df begin\n         @nest(data = -a)\n         @unnest_longer(data) # longer first\n         @unnest_wider(-a)    # then wider\n       end\n15\u00d74 DataFrame\n Row \u2502 a     b      c_2    c_1   \n     \u2502 Char  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     31     16\n   2 \u2502 a         2     32     17\n   3 \u2502 a         3     33     18\n   4 \u2502 b         4     34     19\n   5 \u2502 b         5     35     20\n   6 \u2502 b         6     36     21\n   7 \u2502 c         7     37     22\n   8 \u2502 c         8     38     23\n   9 \u2502 c         9     39     24\n  10 \u2502 d        10     40     25\n  11 \u2502 d        11     41     26\n  12 \u2502 d        12     42     27\n  13 \u2502 e        13     43     28\n  14 \u2502 e        14     44     29\n  15 \u2502 e        15     45     30\n

    source

    # TidierData.@pivot_longer \u2014 Macro.

    @pivot_longer(df, cols, [names_to], [values_to])

    Reshapes the DataFrame to make it longer, increasing the number of rows and reducing the number of columns.

    Arguments

    • df: A DataFrame.
    • cols: Columns to pivot into longer format. Multiple columns can be selected but providing tuples of columns is not yet supported.
    • names_to: Optional, defaults to variable. The name of the newly created column whose values will contain the input DataFrame's column names.
    • values_to: Optional, defaults to value. The name of the newly created column containing the input DataFrame's cell values.

    Examples

    julia> df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4]);\n\njulia> @pivot_longer(df_wide, A:B)\n4\u00d73 DataFrame\n Row \u2502 id     variable  value \n     \u2502 Int64  String    Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A             1\n   2 \u2502     2  A             3\n   3 \u2502     1  B             2\n   4 \u2502     2  B             4\n\njulia> @pivot_longer(df_wide, -id)\n4\u00d73 DataFrame\n Row \u2502 id     variable  value \n     \u2502 Int64  String    Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A             1\n   2 \u2502     2  A             3\n   3 \u2502     1  B             2\n   4 \u2502     2  B             4\n\njulia> @pivot_longer(df_wide, A:B, names_to = \"letter\", values_to = \"number\")\n4\u00d73 DataFrame\n Row \u2502 id     letter  number \n     \u2502 Int64  String  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A            1\n   2 \u2502     2  A            3\n   3 \u2502     1  B            2\n   4 \u2502     2  B            4\n\njulia> @pivot_longer(df_wide, A:B, names_to = letter, values_to = number)\n4\u00d73 DataFrame\n Row \u2502 id     letter  number \n     \u2502 Int64  String  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A            1\n   2 \u2502     2  A            3\n   3 \u2502     1  B            2\n   4 \u2502     2  B            4\n\njulia> @pivot_longer(df_wide, A:B, 
names_to = \"letter\")\n4\u00d73 DataFrame\n Row \u2502 id     letter  value \n     \u2502 Int64  String  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  A           1\n   2 \u2502     2  A           3\n   3 \u2502     1  B           2\n   4 \u2502     2  B           4\n

    source

    # TidierData.@pivot_wider \u2014 Macro.

    @pivot_wider(df, names_from, values_from[, values_fill])

    Reshapes the DataFrame to make it wider, increasing the number of columns and reducing the number of rows.

    Arguments

    • df: A DataFrame.
    • names_from: The name of the column to get the name of the output columns from.
    • values_from: The name of the column to get the cell values from.
    • values_fill: The value to replace a missing name/value combination (default is missing)

    Examples

    julia> df_long = DataFrame(id = [1, 1, 2, 2],\n                           variable = [\"A\", \"B\", \"A\", \"B\"],\n                           value = [1, 2, 3, 4]);\n\njulia> df_long_missing = DataFrame(id = [1, 1, 2],\n                           variable = [\"A\", \"B\", \"B\"],\n                           value = [1, 2, 4]);\n\njulia> @pivot_wider(df_long, names_from = variable, values_from = value)\n2\u00d73 DataFrame\n Row \u2502 id     A       B      \n     \u2502 Int64  Int64?  Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1       1       2\n   2 \u2502     2       3       4\n\njulia> @pivot_wider(df_long, names_from = \"variable\", values_from = \"value\")\n2\u00d73 DataFrame\n Row \u2502 id     A       B      \n     \u2502 Int64  Int64?  Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1       1       2\n   2 \u2502     2       3       4\n\njulia> @pivot_wider(df_long_missing, names_from = variable, values_from = value, values_fill = 0)\n2\u00d73 DataFrame\n Row \u2502 id     A      B     \n     \u2502 Int64  Int64  Int64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      2\n   2 \u2502     2      0      4\n

    source

    # TidierData.@pull \u2014 Macro.

    @pull(df, column)\n

    Pull (or extract) a column as a vector.

    Arguments

    • df: A DataFrame.
    • column: A single column, referred to either by its name or number.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df @pull(a)\n5-element Vector{Char}:\n 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)\n 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase)\n 'c': ASCII/Unicode U+0063 (category Ll: Letter, lowercase)\n 'd': ASCII/Unicode U+0064 (category Ll: Letter, lowercase)\n 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)\n\njulia> @chain df @pull(2)\n5-element Vector{Int64}:\n 1\n 2\n 3\n 4\n 5\n

    source

    # TidierData.@relocate \u2014 Macro.

    @relocate(df, columns, before = nothing, after = nothing)\n

    Rearranges the columns of a data frame. This function allows for moving specified columns to a new position within the data frame, either before or after a given target column. The columns, before, and after arguments all accept tidy selection functions. Only one of before or after should be specified. If neither is specified, the selected columns will be moved to the beginning of the data frame.

    Arguments

    • df: The data frame.
    • columns: Column or columns to be moved.
    • before: (Optional) Column or columns before which the specified columns will be moved. If not provided or nothing, this argument is ignored.
    • after: (Optional) Column or columns after which the specified columns will be moved. If not provided or nothing, this argument is ignored.

    Examples

    julia> df = DataFrame(A = 1:5, B = 6:10, C = [\"A\", \"b\", \"C\", \"D\", \"E\"], D = ['A', 'B','A', 'B','C'],\n                      E = 1:5, F = [\"A\", \"b\", \"C\", \"D\", \"E\"]);\n\njulia> @relocate(df, where(is_string), before = where(is_integer))\n5\u00d76 DataFrame\n Row \u2502 C       F       A      B      E      D    \n     \u2502 String  String  Int64  Int64  Int64  Char \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 A       A           1      6      1  A\n   2 \u2502 b       b           2      7      2  B\n   3 \u2502 C       C           3      8      3  A\n   4 \u2502 D       D           4      9      4  B\n   5 \u2502 E       E           5     10      5  C\n\n\njulia> @relocate(df, B, C, D, after = E)\n5\u00d76 DataFrame\n Row \u2502 A      E      B      C       D     F      \n     \u2502 Int64  Int64  Int64  String  Char  String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      6  A       A     A\n   2 \u2502     2      2      7  b       B     b\n   3 \u2502     3      3      8  C       A     C\n   4 \u2502     4      4      9  D       B     D\n   5 \u2502     5      5     10  E       C     E\n\njulia> @relocate(df, B, C, D, after = starts_with(\"E\"))\n5\u00d76 DataFrame\n Row \u2502 A      E      B      C       D     F      \n     \u2502 Int64  Int64  Int64  String  Char  String 
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      6  A       A     A\n   2 \u2502     2      2      7  b       B     b\n   3 \u2502     3      3      8  C       A     C\n   4 \u2502     4      4      9  D       B     D\n   5 \u2502     5      5     10  E       C     E\n\njulia> @relocate(df, B:C) # bring columns to the front\n5\u00d76 DataFrame\n Row \u2502 B      C       A      D     E      F      \n     \u2502 Int64  String  Int64  Char  Int64  String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     6  A           1  A         1  A\n   2 \u2502     7  b           2  B         2  b\n   3 \u2502     8  C           3  A         3  C\n   4 \u2502     9  D           4  B         4  D\n   5 \u2502    10  E           5  C         5  E\n

    source

    # TidierData.@rename \u2014 Macro.

    @rename(df, exprs...)\n

    Change the names of individual columns in a DataFrame. Users can also use @select() to rename and select columns.

    Arguments

    • df: A DataFrame.
    • exprs...: Use new_name = old_name syntax to rename selected columns.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @rename(d = b, e = c)\n       end\n5\u00d73 DataFrame\n Row \u2502 a     d      e     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         2     12\n   3 \u2502 c         3     13\n   4 \u2502 d         4     14\n   5 \u2502 e         5     15\n

    source

    # TidierData.@rename_with \u2014 Macro.

     @rename_with(df, fn, exprs...)\n

    Renames the chosen column names using a function

    Arguments

    • df: a DataFrame
    • fn: the function to apply to the column names (such as str_remove_all from TidierStrings)
    • exprs: One or more unquoted variable names separated by commas. Variable names can also be used as their positions in the data, like x:y, to select a range of variables. Variable names can also be chosen with starts_with(). Defaults to all columns if empty.

    Examples

    julia> function str_remove_all(column, pattern::String)\n         if ismissing(column)\n             return column\n         end\n         patterns = split(pattern, '|')\n         for p in patterns\n             column = replace(column, strip(p) => \"\")\n         end\n         return column\n       end;\n\njulia> df = DataFrame(\n              term_a = [\"apple\", \"banana\", \"cherry\"],\n              document_a = [\"doc_1\", \"doc2\", \"doc3\"],\n              _n_ = [1, 2, 3]\n            ); \n\njulia> @rename_with(df, str -> str_remove_all(str, \"_a\"), !term_a)\n3\u00d73 DataFrame\n Row \u2502 term_a  document  _n_   \n     \u2502 String  String    Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 apple   doc_1         1\n   2 \u2502 banana  doc2          2\n   3 \u2502 cherry  doc3          3\n

    source

    # TidierData.@right_join \u2014 Macro.

    @right_join(df1, df2, [by])\n

    Perform a right join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @right_join(df1, df2)\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, a)\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, a = a)\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n\njulia> @right_join(df1, df2, \"a\" = \"a\")\n2\u00d73 DataFrame\n Row \u2502 a       b        c     \n     \u2502 String  Int64?   
Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a             1      3\n   2 \u2502 c       missing      4\n

    source

    # TidierData.@select \u2014 Macro.

    @select(df, exprs...)\n

    Select variables in a DataFrame.

    Arguments

    • df: A DataFrame.
    • exprs...: One or more unquoted variable names separated by commas. Variable names can also be used as their positions in the data, like x:y, to select a range of variables.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df @select(a, b, c)\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         2     12\n   3 \u2502 c         3     13\n   4 \u2502 d         4     14\n   5 \u2502 e         5     15\n\njulia> @chain df @select(a:b)\n5\u00d72 DataFrame\n Row \u2502 a     b     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1\n   2 \u2502 b         2\n   3 \u2502 c         3\n   4 \u2502 d         4\n   5 \u2502 e         5\n\njulia> @chain df @select(1:2)\n5\u00d72 DataFrame\n Row \u2502 a     b     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1\n   2 \u2502 b         2\n   3 \u2502 c         3\n   4 \u2502 d         4\n   5 \u2502 e         5\n\njulia> @chain df @select(-(a:b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(!(a:b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(-(a, b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 
\u2502    15\n\njulia> @chain df @select(!(a, b))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df begin\n         @select(contains(\"b\"), starts_with(\"c\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df @select(-(1:2))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(!(1:2))\n5\u00d71 DataFrame\n Row \u2502 c     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    11\n   2 \u2502    12\n   3 \u2502    13\n   4 \u2502    14\n   5 \u2502    15\n\njulia> @chain df @select(-c)\n5\u00d72 DataFrame\n Row \u2502 a     b     \n     \u2502 Char  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1\n   2 \u2502 b         2\n   3 \u2502 c         3\n   4 \u2502 d         4\n   5 \u2502 e         5\n\njulia> @chain df begin\n         @select(-contains(\"a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     
15\n\njulia> @chain df begin\n         @select(!contains(\"a\"))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n\njulia> @chain df begin\n         @select(where(is_number))\n       end\n5\u00d72 DataFrame\n Row \u2502 b      c     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1     11\n   2 \u2502     2     12\n   3 \u2502     3     13\n   4 \u2502     4     14\n   5 \u2502     5     15\n

    source

    # TidierData.@semi_join \u2014 Macro.

    @semi_join(df1, df2, [by])\n

    Perform a semi-join on df1 and df2 with an optional by.

    Arguments

    • df1: A DataFrame.
    • df2: A DataFrame.
    • by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

    Examples

    julia> df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n\njulia> df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n\njulia> @semi_join(df1, df2)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, a = a)\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n\njulia> @semi_join(df1, df2, \"a\" = \"a\")\n1\u00d72 DataFrame\n Row \u2502 a       b     \n     \u2502 String  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a           1\n

    source

    # TidierData.@separate \u2014 Macro.

    @separate(df, from, into, sep, extra = \"merge\")

    Separate a string column into multiple new columns based on a specified delimiter.

    Arguments

    • df: A DataFrame
    • from: Column that will be split
    • into: New column names, supports [] or ()
    • sep: the string or character on which to split
    • extra: \"merge\", \"warn\" and \"drop\". If not enough columns are provided, extra determines whether additional entries will be merged into the final one or dropped. \"warn\" generates a warning message for dropped values.

    Examples

    julia> df = DataFrame(a = [\"1-1\", \"2-2\", \"3-3-3\"]);\n\njulia> @separate(df, a, [b, c, d], \"-\")\n3\u00d73 DataFrame\n Row \u2502 b          c          d          \n     \u2502 SubStrin\u2026  SubStrin\u2026  SubStrin\u2026? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1          1          missing    \n   2 \u2502 2          2          missing    \n   3 \u2502 3          3          3\n\njulia> @chain df begin\n         @separate(a, (b, c, d), \"-\")\n       end\n3\u00d73 DataFrame\n Row \u2502 b          c          d          \n     \u2502 SubStrin\u2026  SubStrin\u2026  SubStrin\u2026? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1          1          missing    \n   2 \u2502 2          2          missing    \n   3 \u2502 3          3          3\n\njulia> @separate(df, a, (b, c), \"-\")\n3\u00d72 DataFrame\n Row \u2502 b          c      \n     \u2502 SubStrin\u2026  String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1          1\n   2 \u2502 2          2\n   3 \u2502 3          3-3\n\njulia> @chain df begin\n         @separate(a, (b, c), \"-\", extra = \"drop\")\n       end\n3\u00d72 DataFrame\n Row \u2502 b          c         \n     \u2502 SubStrin\u2026  SubStrin\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1          1\n   2 \u2502 2          2\n   3 \u2502 3          3\n

    source

    # TidierData.@separate_rows \u2014 Macro.

    @separate_rows(df, columns..., sep)\n

    Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.

    Arguments

    • df: A DataFrame
    • columns: A column or multiple columns to be split. Can be a mix of integers and column names.
    • sep: The string or character or regular expression used to split the column values.

    Examples

    julia> df = DataFrame(a = 1:3,\n                      b = [\"a\", \"aa;bb;cc\", \"dd;ee\"],\n                      c = [\"1\", \"2;3;4\", \"5;6\"],\n                      d = [\"7\", \"8;9;10\", \"11;12\"])\n3\u00d74 DataFrame\n Row \u2502 a      b         c       d      \n     \u2502 Int64  String    String  String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  a         1       7\n   2 \u2502     2  aa;bb;cc  2;3;4   8;9;10\n   3 \u2502     3  dd;ee     5;6     11;12\n\njulia> @separate_rows(df, 2, 4, \";\")\n6\u00d74 DataFrame\n Row \u2502 a      b          c       d         \n     \u2502 Int64  SubStrin\u2026  String  SubStrin\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  a          1       7\n   2 \u2502     2  aa         2;3;4   8\n   3 \u2502     2  bb         2;3;4   9\n   4 \u2502     2  cc         2;3;4   10\n   5 \u2502     3  dd         5;6     11\n   6 \u2502     3  ee         5;6     12\n\njulia> @separate_rows(df, b:d, \";\")\n6\u00d74 DataFrame\n Row \u2502 a      b          c          d         \n     \u2502 Int64  SubStrin\u2026  SubStrin\u2026  SubStrin\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  a          1          7\n   2 \u2502     2  aa         2          8\n   3 \u2502     2  bb         3          9\n   4 \u2502     2  cc         4       
   10\n   5 \u2502     3  dd         5          11\n   6 \u2502     3  ee         6          12\n

    source

    # TidierData.@slice \u2014 Macro.

    @slice(df, exprs...)\n

    Select, remove or duplicate rows by indexing their integer positions.

    Arguments

    • df: A DataFrame.
    • exprs...: integer row values. Use positive values to keep the rows, or negative values to drop. Values provided must be either all positive or all negative, and they must be within the range of DataFrames' row numbers.

    Examples

    julia> df = DataFrame(a = repeat('a':'c', inner = 3), b = 1:9, c = 11:19);\n\njulia> @chain df @slice(1:5)\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 a         3     13\n   4 \u2502 b         4     14\n   5 \u2502 b         5     15\n\njulia> @chain df @slice(-(1:2))\n7\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         3     13\n   2 \u2502 b         4     14\n   3 \u2502 b         5     15\n   4 \u2502 b         6     16\n   5 \u2502 c         7     17\n   6 \u2502 c         8     18\n   7 \u2502 c         9     19\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(1)\n         @ungroup\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         4     14\n   3 \u2502 c         7     17\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(n())\n         @ungroup\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         3     13\n   2 \u2502 b         6     16\n   3 \u2502 c         9     19\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(-n())\n         @ungroup\n       
end\n6\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 a         2     12\n   3 \u2502 b         4     14\n   4 \u2502 b         5     15\n   5 \u2502 c         7     17\n   6 \u2502 c         8     18\n\njulia> @chain df begin\n         @group_by(a)\n         @slice(-(2:n()))\n         @ungroup\n       end\n3\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         4     14\n   3 \u2502 c         7     17\n

    source

    # TidierData.@slice_head \u2014 Macro.

    @slice_head(df; n, prop)\n

    Retrieve rows from the beginning of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of rows at the beginning of the dataframe to retrieve. Defaults to 1.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_head(n = 3)\n       end \n3\u00d73 DataFrame\n Row \u2502 a          b          c        \n     \u2502 Float64?   Float64?   Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing          0.3       0.2\n   2 \u2502       0.2        2.0       0.2\n   3 \u2502 missing    missing         0.2\n\njulia> @chain df begin\n         @slice_head(prop = 0.25)\n       end \n2\u00d73 DataFrame\n Row \u2502 a          b         c        \n     \u2502 Float64?   Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3       0.2\n   2 \u2502       0.2       2.0       0.2\n

    source

    # TidierData.@slice_max \u2014 Macro.

    @slice_max(df, column; with_ties = true, n, prop, missing_rm = true)\n

    Retrieve rows with the maximum value(s) from the specified column of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • column: The column for which to slice the maximum values.
    • with_ties: Whether or not all ties will be shown, defaults to true. When false it will only show the first row.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
    • missing_rm: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_max(b)\n       end \n2\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n\njulia> @chain df begin\n         @slice_max(b, with_ties = false)\n       end \n1\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n\njulia> @chain df begin\n         @slice_max(b, n = 3)\n       end \n3\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n   3 \u2502      1.0       6.0       1.0\n\njulia> @chain df begin\n         @slice_max(b, prop = 0.5, missing_rm = true)\n       end\n3\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? 
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n   3 \u2502      1.0       6.0       1.0\n

    source

    # TidierData.@slice_min \u2014 Macro.

    @slice_min(df, column; with_ties = true, n, prop, missing_rm = true)\n

    Retrieve rows with the minimum value(s) from the specified column of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • column: The column for which to slice the minimum values.
    • with_ties: Whether or not all ties will be shown, defaults to true and shows all ties. When false it will only show the first row.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
    • missing_rm: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_min(b)\n       end \n2\u00d73 DataFrame\n Row \u2502 a         b         c         \n     \u2502 Float64?  Float64?  Float64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502  missing       0.3        0.2\n   2 \u2502  missing       0.3  missing\n\njulia> @chain df begin\n         @slice_min(b, with_ties = false)\n       end \n1\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502  missing       0.3       0.2\n\njulia> @chain df begin\n         @slice_min(b, n = 3)\n       end\n3\u00d73 DataFrame\n Row \u2502 a          b         c         \n     \u2502 Float64?   Float64?  Float64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3        0.2\n   2 \u2502 missing         0.3  missing   \n   3 \u2502       0.2       2.0        0.2  \n\njulia> @chain df begin\n         @slice_min(b, prop = 0.5, missing_rm = true)\n       end\n3\u00d73 DataFrame\n Row \u2502 a          b         c         \n     \u2502 Float64?   Float64?  Float64?  
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         0.3        0.2\n   2 \u2502 missing         0.3  missing   \n   3 \u2502       0.2       2.0        0.2\n

    source

    # TidierData.@slice_sample \u2014 Macro.

    @slice_sample(df, [n = 1, prop, replace = false])\n

    Randomly sample rows from a DataFrame df or from each group in a GroupedDataFrame. The default is to return 1 row. Either the number of rows (n) or the proportion of rows (prop) should be provided as a keyword argument.

    Arguments

    • df: The source data frame or grouped data frame from which to sample rows.
    • n: The number of rows to sample. Defaults to 1.
    • prop: The proportion of rows to sample.
    • replace: Whether to sample with replacement. Defaults to false.

    Examples

    julia> df = DataFrame(a = 1:10, b = 11:20);\n\njulia> using StableRNGs, Random\n\njulia> rng = StableRNG(1);\n\njulia> Random.seed!(rng, 1);\n\njulia> @chain df begin \n         @slice_sample(n = 5)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     6     16\n   2 \u2502     1     11\n   3 \u2502     5     15\n   4 \u2502     4     14\n   5 \u2502     8     18\n\njulia> @chain df begin \n         @slice_sample(n = 5, replace = true)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     7     17\n   2 \u2502     2     12\n   3 \u2502     1     11\n   4 \u2502     4     14\n   5 \u2502     2     12\n\njulia> @chain df begin \n         @slice_sample(prop = 0.5)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     6     16\n   2 \u2502     7     17\n   3 \u2502     5     15\n   4 \u2502     9     19\n   5 \u2502     2     12\n\njulia> @chain df begin \n         @slice_sample(prop = 0.5, replace = true)\n       end\n5\u00d72 DataFrame\n Row \u2502 a      b     \n     \u2502 Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    10     20\n   2 \u2502     4     14\n   3 \u2502     9     19\n   4 \u2502     9     19\n   5 \u2502     8     18\n

    source

    # TidierData.@slice_tail \u2014 Macro.

    @slice_tail(df; n, prop)\n

    Retrieve rows from the end of a DataFrame or GroupedDataFrame.

    Arguments

    • df: The source data frame or grouped data frame from which to slice rows.
    • prop: The proportion of rows to slice.
    • n: An optional integer argument to specify the number of rows at the end of the dataframe to retrieve. Defaults to 1.

    Examples

    julia> df = DataFrame(\n           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],\n           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],\n           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);\n\njulia> @chain df begin\n         @slice_tail(n = 3)\n       end \n3\u00d73 DataFrame\n Row \u2502 a          b         c         \n     \u2502 Float64?   Float64?  Float64?  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 missing         5.0  missing   \n   2 \u2502       5.0       7.0        5.0\n   3 \u2502       6.0       7.0        6.0\n\njulia> @chain df begin\n         @slice_tail(prop = 0.25)\n       end \n2\u00d73 DataFrame\n Row \u2502 a         b         c        \n     \u2502 Float64?  Float64?  Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502      5.0       7.0       5.0\n   2 \u2502      6.0       7.0       6.0\n

    source

    # TidierData.@summarise \u2014 Macro.

    @summarize(df, exprs...)\n@summarise(df, exprs...)\n

    Create a new DataFrame with one row that aggregates all observations from the input DataFrame or GroupedDataFrame.

    Arguments

    • df: A DataFrame.
    • exprs...: a new_variable = function(old_variable) pair. function() should be an aggregate function that returns a single value.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @summarize(mean_b = mean(b),\n                    median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize begin\n           mean_b = mean(b)\n           median_b = median(b)\n         end\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0 \n\njulia> @chain df begin\n         @summarise(mean_b = mean(b), median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize(across((b,c), (minimum, maximum)))\n       end\n1\u00d74 DataFrame\n Row \u2502 b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11          5         15\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n

    source

    # TidierData.@summarize \u2014 Macro.

    @summarize(df, exprs...)\n@summarise(df, exprs...)\n

    Create a new DataFrame with one row that aggregates all observations from the input DataFrame or GroupedDataFrame.

    Arguments

    • df: A DataFrame.
    • exprs...: a new_variable = function(old_variable) pair. function() should be an aggregate function that returns a single value.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @summarize(mean_b = mean(b),\n                    median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize begin\n           mean_b = mean(b)\n           median_b = median(b)\n         end\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0 \n\njulia> @chain df begin\n         @summarise(mean_b = mean(b), median_b = median(b))\n       end\n1\u00d72 DataFrame\n Row \u2502 mean_b   median_b \n     \u2502 Float64  Float64  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     3.0       3.0\n\njulia> @chain df begin\n         @summarize(across((b,c), (minimum, maximum)))\n       end\n1\u00d74 DataFrame\n Row \u2502 b_minimum  c_minimum  b_maximum  c_maximum \n     \u2502 Int64      Int64      Int64      Int64     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11          5         15\n\njulia> @chain df begin\n         @summarize(across(where(is_number), minimum))\n       end\n1\u00d72 DataFrame\n Row \u2502 b_minimum  c_minimum \n     \u2502 Int64      Int64     
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502         1         11\n

    source

    # TidierData.@summary \u2014 Macro.

       @summary(df, cols...)\n

    For numerical columns, returns a DataFrame with the Q1, Q3, min, max, mean, median, and number of missing values.

    Arguments

    • 'df': A DataFrame
    • cols: columns on which summary will be performed. This is an optional argument, without which summary will be performed on all numerical columns

    Examples

    julia> df = DataFrame(a = [1, 2, 3, 4, 5],\n                      b = [missing, 7, 8, 9, 10],\n                      c = [11, missing, 13, 14, missing],\n                      d = [16, 17, 18, 19, 20]);\n\njulia> @summary(df);\n\njulia> @summary(df, (b:d));\n\njulia> @chain df begin\n         @summary(b:d)\n       end;\n

    source

    # TidierData.@tally \u2014 Macro.

    @tally(df, [wt], [sort])\n

    Tally the unique values of one or more variables, with an optional weighting.

    @tally() is a low-level helper macro for @count() that assumes that any grouping has already been performed. @chain df @tally() is roughly equivalent to @chain df @summarize(n = n()). Supply wt to perform weighted counts, switching the summary from n = n() to n = sum(wt).

    Arguments

    • df: A DataFrame or GroupedDataFrame.
    • wt: Optional parameter. Used to calculate a sum over the provided wt variable instead of counting the rows.
    • sort: Defaults to false. Whether the result should be sorted from highest to lowest n.

    Examples

    julia> df = DataFrame(a = vcat(repeat([\"a\"], inner = 3),\n                           repeat([\"b\"], inner = 3),\n                           repeat([\"c\"], inner = 1),\n                           missing),\n                      b = 1:8)\n8\u00d72 DataFrame\n Row \u2502 a        b     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            1\n   2 \u2502 a            2\n   3 \u2502 a            3\n   4 \u2502 b            4\n   5 \u2502 b            5\n   6 \u2502 b            6\n   7 \u2502 c            7\n   8 \u2502 missing      8\n\njulia> @chain df @tally()\n1\u00d71 DataFrame\n Row \u2502 n     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     8\n\njulia> @chain df begin\n         @group_by(a)\n         @tally()\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            3\n   2 \u2502 b            3\n   3 \u2502 c            1\n   4 \u2502 missing      1\n\njulia> @chain df begin\n         @group_by(a)\n         @tally(wt = b)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a            6\n   2 \u2502 b           15\n   3 \u2502 c            7\n   4 \u2502 missing      8\n\njulia> @chain df begin\n         @group_by(a)\n         @tally(wt = b, sort = true)\n       end\n4\u00d72 DataFrame\n Row \u2502 a        n     \n     \u2502 String?  
Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 b           15\n   2 \u2502 missing      8\n   3 \u2502 c            7\n   4 \u2502 a            6       \n

    source

    # TidierData.@transmute \u2014 Macro.

    @transmute(df, exprs...)\n

    Create a new DataFrame with only computed columns.

    Arguments

    • df: A DataFrame.
    • exprs...: add new columns or replace values of existing columns using new_variable = values syntax.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @transmute(d = b + c)\n       end\n5\u00d71 DataFrame\n Row \u2502 d     \n     \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    12\n   2 \u2502    14\n   3 \u2502    16\n   4 \u2502    18\n   5 \u2502    20\n

    source

    # TidierData.@ungroup \u2014 Macro.

    @ungroup(df)\n

    Return a DataFrame with all groups removed.

    If this is applied to a GroupedDataFrame, then it removes the grouping. If this is applied to a DataFrame (without any groups), then it returns the DataFrame unchanged.

    Arguments

    • df: A GroupedDataFrame or DataFrame.

    Examples

    julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);\n\njulia> @chain df begin\n         @group_by(a)\n       end\nGroupedDataFrame with 5 groups based on key: a\nFirst Group (1 row): a = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n\u22ee\nLast Group (1 row): a = 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 e         5     15\n\njulia> @chain df begin\n         @group_by(a)\n         @ungroup\n       end\n5\u00d73 DataFrame\n Row \u2502 a     b      c     \n     \u2502 Char  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 a         1     11\n   2 \u2502 b         2     12\n   3 \u2502 c         3     13\n   4 \u2502 d         4     14\n   5 \u2502 e         5     15\n

    source

    # TidierData.@unite \u2014 Macro.

      @unite(df, new_cols, from_cols, sep, remove = true)\n

    Unite multiple columns into one new column using a specific delimiter

    Arguments

    • df: A DataFrame
    • new_col: New column that will receive the combination
    • from_cols: Column names that it will combine, supports [] or ()
    • sep: the string or character that will separate the values in the new column
    • remove: defaults to true, removes input columns from data frame

    Examples

    julia> df = DataFrame( b = [\"1\", \"2\", \"3\"], c = [\"1\", \"2\", \"3\"], d = [missing, missing, \"3\"]);\n\njulia> @unite(df, new_col, (b, c, d), \"-\")\n3\u00d71 DataFrame\n Row \u2502 new_col \n     \u2502 String  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1-1\n   2 \u2502 2-2\n   3 \u2502 3-3-3\n\njulia> @unite(df, new_col, (b, c, d), \"-\", remove = false)\n3\u00d74 DataFrame\n Row \u2502 b       c       d        new_col \n     \u2502 String  String  String?  String  \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 1       1       missing  1-1\n   2 \u2502 2       2       missing  2-2\n   3 \u2502 3       3       3        3-3-3\n

    source

    # TidierData.@unnest_longer \u2014 Macro.

    @unnest_longer(df, columns, indices_include=false)\n

    Unnest arrays in columns from a DataFrame to create a longer DataFrame with one row for each entry of the array.

    Arguments

    • df: A DataFrame.
    • columns: Columns to unnest. Can be a column symbol or a range of columns if they align for number of values.
    • indices_include: Optional. When set to true, adds an index column for each unnested column, which logs the position of each array entry.
    • keep_empty: Optional. When set to true, rows with empty arrays are kept, not skipped, and unnested as missing.

    Examples

    julia> df = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])\n2\u00d73 DataFrame\n Row \u2502 a      b       c      \n     \u2502 Int64  Array\u2026  Array\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  [1, 2]  [5, 6]\n   2 \u2502     2  [3, 4]  [7, 8]\n\njulia> @unnest_longer(df, 2)\n4\u00d73 DataFrame\n Row \u2502 a      b      c      \n     \u2502 Int64  Int64  Array\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1  [5, 6]\n   2 \u2502     1      2  [5, 6]\n   3 \u2502     2      3  [7, 8]\n   4 \u2502     2      4  [7, 8]\n\njulia> @unnest_longer(df, b:c, indices_include = true)\n4\u00d75 DataFrame\n Row \u2502 a      b      c      b_id   c_id  \n     \u2502 Int64  Int64  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      5      1      1\n   2 \u2502     1      2      6      2      2\n   3 \u2502     2      3      7      1      1\n   4 \u2502     2      4      8      2      2\n\njulia> df2 = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]])\n4\u00d72 DataFrame\n Row \u2502 x      y            \n     \u2502 Int64  Array\u2026       \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  Any[]\n   2 \u2502     2  Any[1, 2, 3]\n   3 \u2502     3  Any[4, 5]\n   4 \u2502     4  Any[]\n\njulia> @unnest_longer(df2, y, keep_empty = true)\n7\u00d72 DataFrame\n Row \u2502 x    
  y       \n     \u2502 Int64  Any     \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  missing \n   2 \u2502     2  1\n   3 \u2502     2  2\n   4 \u2502     2  3\n   5 \u2502     3  4\n   6 \u2502     3  5\n   7 \u2502     4  missing \n

    source

    # TidierData.@unnest_wider \u2014 Macro.

    @unnest_wider(df, columns, names_sep)\n

    Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns.

    Arguments

    • df: A DataFrame.
    • columns: Columns to be unnested. These columns should contain arrays, dictionaries, dataframes, or tuples. Dictionary keys will be converted to column names.
    • names_sep: An optional string to specify the separator for creating new column names. If not provided, defaults to no separator.

    Examples

    julia> df = DataFrame(name = [\"Zaki\", \"Farida\"], attributes = [\n               Dict(\"age\" => 25, \"city\" => \"New York\"),\n               Dict(\"age\" => 30, \"city\" => \"Los Angeles\")]);\n\njulia> @unnest_wider(df, attributes)\n2\u00d73 DataFrame\n Row \u2502 name    city         age   \n     \u2502 String  String       Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502 Zaki    New York        25\n   2 \u2502 Farida  Los Angeles     30\n\njulia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])\n2\u00d73 DataFrame\n Row \u2502 a      b       c      \n     \u2502 Int64  Array\u2026  Array\u2026 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1  [1, 2]  [5, 6]\n   2 \u2502     2  [3, 4]  [7, 8]\n\njulia> @unnest_wider(df2, b:c, names_sep = \"_\")\n2\u00d75 DataFrame\n Row \u2502 a      b_1    b_2    c_1    c_2   \n     \u2502 Int64  Int64  Int64  Int64  Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     1      1      2      5      6\n   2 \u2502     2      3      4      7      8\n

    source

    "},{"location":"reference/#reference-internal-functions","title":"Reference - Internal functions","text":""},{"location":"examples/generated/Contributors/Howto/","title":"Contribute","text":""},{"location":"examples/generated/Contributors/Howto/#contribute-to-documentation","title":"Contribute to Documentation","text":"

    Contributing with examples can be done by first creating a new file example here

    Info

    • your_new_file.jl at docs/examples/UserGuide/

    Once this is done, you need to add a new entry here at the bottom and at the appropriate level.

    Info

    Your new entry should look like:

    • \"Your title example\" : \"examples/generated/UserGuide/your_new_file.md\"

    "},{"location":"examples/generated/Contributors/Howto/#build-docs-locally","title":"Build docs locally","text":"

    If you want to take a look at the docs locally before doing a PR follow the next steps:

    build docs locally

    Install the following dependencies in your system via pip, i.e.

    • pip install mkdocs pygments python-markdown-math
    • pip install mkdocs-material pymdown-extensions mkdocstrings
    • pip install mknotebooks pytkdocs_tweaks mkdocs_include_exclude_files jinja2 mkdocs-video

    Then simply go to your docs env and activate it, i.e.

    docs> julia

    julia> ]

    (docs) pkg> activate .

    Next, run the scripts:

    Info

    Generate files and build docs by running:

    • genfiles.jl
    • make.jl

    Now go to your terminal in the same path docs> and run:

    mkdocs serve

    This should output http://127.0.0.1:8000, copy/paste this into your browser and you are all set.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/across/","title":"across","text":"

    across() is a helper function that is typically used inside @mutate() or @summarize to operate on multiple columns and/or multiple functions. Notice that across() accepts two arguments, a set of variables and a set of functions. If providing multiple variables or functions, these should be provided as a tuple \u2013 in other words, wrapped in parentheses and separated by commas. If you want to skip missing values, you can \"fuse\" the summary function (such as mean()) with the skipmissing() function by using the function fusion operator, which you can type out in Julia by typing \\circ and then pressing [Tab] such that it reads mean\u2218skipmissing.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/across/#one-variable-one-function","title":"One variable, one function","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(across(Budget, mean\u2218skipmissing))\nend\n
    1\u00d71 DataFrame RowBudget_mean_skipmissingFloat64113.4125"},{"location":"examples/generated/UserGuide/across/#one-variable-one-anonymous-function","title":"One variable, one anonymous function","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(across(Budget, (x -> mean(skipmissing(x)))))\nend\n
    1\u00d71 DataFrame RowBudget_functionFloat64113.4125

    Note: compound functions are not correctly supported inside of anonymous functions. As of right now, the above function works, but (x -> mean\u2218skipmissing(x)) does not work. This is a known bug and will be fixed in a future update.

    "},{"location":"examples/generated/UserGuide/across/#multiple-variables-multiple-functions","title":"Multiple variables, multiple functions","text":"
    @chain movies begin\n    @mutate(Budget = Budget / 1_000_000)\n    @summarize(across((Rating, Budget), (mean\u2218skipmissing, median\u2218skipmissing)))\nend\n
    1\u00d74 DataFrame RowRating_mean_skipmissingBudget_mean_skipmissingRating_median_skipmissingBudget_median_skipmissingFloat64Float64Float64Float6415.9328513.41256.13.0"},{"location":"examples/generated/UserGuide/across/#multiple-selection-helpers-multiple-functions","title":"Multiple selection helpers, multiple functions","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(across((starts_with(\"Bud\"), ends_with(\"ting\")), (mean\u2218skipmissing, median\u2218skipmissing)))\nend\n
    1\u00d74 DataFrame RowBudget_mean_skipmissingRating_mean_skipmissingBudget_median_skipmissingRating_median_skipmissingFloat64Float64Float64Float64113.41255.932853.06.1

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/arrange/","title":"@arrange","text":"

    Arranging is the way to sort a data frame. @arrange() can take multiple arguments. Arguments refer to columns that are sorted in ascending order by default. If you want to sort in descending order, make sure to wrap the column name in desc() as shown below.

    DataFrames.jl does not currently support the sort() function on grouped data frames. In order to make this work in TidierData.jl, if you apply @arrange() to a GroupedDataFrame, @arrange() will temporarily ungroup the data, perform the sort(), and then re-group by the original grouping variables.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/arrange/#sort-both-variables-in-ascending-order","title":"Sort both variables in ascending order","text":"
    @chain movies begin\n  @arrange(Year, Rating)\n  @select(1:5)\n  @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641Blacksmith Scene18931missing7.02Hadj Cheriff18941missing4.13Glenroy Bros., No. 218941missing4.24Leonard-Cushing Fight18941missing4.45Sioux Ghost Dance18941missing4.4"},{"location":"examples/generated/UserGuide/arrange/#sort-in-a-mix-of-ascending-and-descending-order","title":"Sort in a mix of ascending and descending order","text":"

    To sort in descending order, make sure to wrap the variable inside of desc().

    @chain movies begin\n  @arrange(Year, desc(Rating))\n  @select(1:5)\n  @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641Blacksmith Scene18931missing7.02Luis Martinetti, Contortionist18941missing6.13Caicedo (with Pole)18941missing5.84Glenroy Brothers (Comic Boxing)18941missing5.45Buffalo Dance18941missing5.0

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/autovec/","title":"Auto-vectorization","text":"

    TidierData.jl uses a lookup table to decide which functions not to vectorize. For example, mean() is listed as a function that should never be vectorized. In addition, any function used inside of across() is not automatically vectorized. Any function that is not included in this list and is used in a context other than across() is automatically vectorized.

    Which functions are not vectorized? The set of non-vectorized functions is contained in the array TidierData.not_vectorized[]. Let's take a look at this array. We will wrap it in a string() to make the output easier to read.

    using TidierData\n\nstring(TidierData.not_vectorized[])\n
    \"[:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :\u2218, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr, :cat_other, :cat_replace_missing, :cat_recode]\"\n

    This \"auto-vectorization\" makes working with TidierData.jl more R-like and convenient. However, if you ever define your own function and try to use it, TidierData.jl may unintentionally vectorize it for you. To prevent auto-vectorization, you can prefix your function with a ~.

    df = DataFrame(a = repeat('a':'e', inner = 2), b = [1,1,1,2,2,2,3,3,3,4], c = 11:20)\n
    10\u00d73 DataFrame RowabcCharInt64Int641a1112a1123b1134b2145c2156c2167d3178d3189e31910e420

    For example, let's define a function new_mean() that calculates a mean.

    new_mean(exprs...) = mean(exprs...)\n
    new_mean (generic function with 1 method)\n

    If we try to use new_mean() inside of @mutate(), it will give us the wrong result. This is because new_mean() is vectorized, which results in the mean being calculated element-wise, which is almost never what we actually want.

    @chain df begin\n    @mutate(d = c - new_mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a1110.02a1120.03b1130.04b2140.05c2150.06c2160.07d3170.08d3180.09e3190.010e4200.0

    To prevent new_mean() from being vectorized, we need to prefix it with a ~ like this:

    @chain df begin\n    @mutate(d = c - ~new_mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    Or you can modify the do-not-vectorize list like this:

    push!(TidierData.not_vectorized[], :new_mean)\n
    52-element Vector{Symbol}:\n :getindex\n :rand\n :esc\n :Ref\n :Set\n :Cols\n :collect\n :(:)\n :\u2218\n :lag\n \u22ee\n :categorical\n :as_categorical\n :is_categorical\n :unique\n :iqr\n :cat_other\n :cat_replace_missing\n :cat_recode\n :new_mean\n

    Now new_mean() should behave just like mean() in that it is treated as non-vectorized.

    @chain df begin\n    @mutate(d = c - new_mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    This gives us the correct answer. Notice that adding a ~ is not needed with mean() because mean() is already included on our look-up table of functions not requiring vectorization.

    @chain df begin\n    @mutate(d = c - mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    If you're not sure if a function is vectorized and want to prevent it from being vectorized, you can always prefix it with a ~ to prevent vectorization. Even though mean() is not vectorized anyway, prefixing it with a ~ will not cause any harm.

    @chain df begin\n    @mutate(d = c - ~mean(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a111-4.52a112-3.53b113-2.54b214-1.55c215-0.56c2160.57d3171.58d3182.59e3193.510e4204.5

    If for some crazy reason, you did want to vectorize mean(), you are always allowed to vectorize it, and TidierData.jl won't un-vectorize it.

    @chain df begin\n    @mutate(d = c - mean.(c))\nend\n
    10\u00d74 DataFrame RowabcdCharInt64Int64Float641a1110.02a1120.03b1130.04b2140.05c2150.06c2160.07d3170.08d3180.09e3190.010e4200.0

    Note: ~ also works with operators, so if you want to not vectorize an operator, you can prefix it with ~, for example, a ~* b will perform a matrix multiplication rather than element-wise multiplication.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/benchmark/","title":"Benchmark","text":"

    The goal of this benchmarking is to gauge how Tidier.jl performs in comparison to DataFrames.jl. Ultimately, from this benchmarking, we can check that Tidier.jl is comparable in speed to DataFrames.jl.

    "},{"location":"examples/generated/UserGuide/benchmark/#why-function-wrap","title":"Why function wrap?","text":"

    Wrapping code in a function allows it to compile just once, which more closely reflects the reality of production workflows. For a more robust explanation, please see @kdpsingh's comment here: https://github.com/TidierOrg/TidierData.jl/issues/24#issuecomment-1682718061

    using TidierData\nusing RDatasets\nusing BenchmarkTools\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/benchmark/#filtering","title":"filtering","text":"
    function filtering_tidier()\n@chain movies begin\n    @filter(Year > 1939 && Votes > 40)\nend\nend\n\n@benchmark filtering_tidier()\n\n@benchmark filter(row -> row.Year > 1939 && row.Votes > 40, movies)\n
    BenchmarkTools.Trial: 493 samples with 1 evaluation.\n Range (min \u2026 max):   9.672 ms \u2026  19.010 ms  \u250a GC (min \u2026 max): 0.00% \u2026 4.76%\n Time  (median):      9.973 ms               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   10.144 ms \u00b1 714.436 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  1.19% \u00b1 2.63%\n\n       \u2584\u2586\u2587\u2588\u2587\u2586\u2584\u2583\u2581\u2582\u2582                                              \n  \u2582\u2583\u2582\u2585\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2585\u2586\u2583\u2584\u2581\u2583\u2583\u2581\u2581\u2581\u2581\u2581\u2581\u2582\u2583\u2583\u2582\u2583\u2584\u2583\u2585\u2583\u2583\u2584\u2584\u2583\u2584\u2583\u2583\u2584\u2583\u2582\u2581\u2583\u2582\u2581\u2583\u2581\u2581\u2581\u2581\u2581\u2581\u2582 \u2583\n  9.67 ms         Histogram: frequency by time         11.3 ms <\n\n Memory estimate: 7.76 MiB, allocs estimate: 287668.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#group_by-summarize","title":"group_by summarize","text":"
    function groupbysummarize_tidier()\n@chain movies begin\n    @group_by(MPAA)\n    @summarise(n=n())\nend\nend\n\n@benchmark groupbysummarize_tidier()\n\n@benchmark combine(groupby(movies, :MPAA), nrow => :n)\n
    BenchmarkTools.Trial: 10000 samples with 1 evaluation.\n Range (min \u2026 max):  419.333 \u03bcs \u2026  3.638 ms  \u250a GC (min \u2026 max): 0.00% \u2026 16.92%\n Time  (median):     426.235 \u03bcs              \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   438.110 \u03bcs \u00b1 74.097 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  1.22% \u00b1  5.06%\n\n  \u2586\u2588\u2587\u2586\u2584\u2584\u2584\u2583\u2582\u2582\u2581\u2581\u2581                                                \u2582\n  \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2587\u2586\u2586\u2586\u2587\u2585\u2585\u2584\u2585\u2584\u2583\u2586\u2586\u2586\u2584\u2584\u2585\u2583\u2585\u2583\u2584\u2583\u2583\u2581\u2584\u2584\u2586\u2587\u2586\u2587\u2586\u2581\u2583\u2584\u2583\u2584\u2585\u2583\u2581\u2583\u2581\u2587\u2586\u2585 \u2588\n  419 \u03bcs        Histogram: log(frequency) by time       619 \u03bcs <\n\n Memory estimate: 474.87 KiB, allocs estimate: 270.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#one-mutate","title":"one mutate","text":"
    function mutate_1_tidier()\n@chain movies begin\n    @mutate(new_col = Votes * R1)\nend\nend\n\n@benchmark mutate_1_tidier()\n\n@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :new_col)\n
    BenchmarkTools.Trial: 6516 samples with 1 evaluation.\n Range (min \u2026 max):  557.359 \u03bcs \u2026   7.220 ms  \u250a GC (min \u2026 max): 0.00% \u2026  9.33%\n Time  (median):     686.274 \u03bcs               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   763.019 \u03bcs \u00b1 243.408 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  9.17% \u00b1 14.37%\n\n    \u2582\u2585\u2586\u2587\u2588\u2588\u2588\u2588\u2587\u2586\u2585\u2584\u2584\u2582\u2582                    \u2581\u2582\u2581\u2581\u2581\u2581\u2581 \u2581\u2581 \u2581     \u2581 \u2581\u2581    \u2582\n  \u2583\u2587\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2588\u2587\u2587\u2586\u2585\u2584\u2585\u2583\u2583\u2585\u2585\u2583\u2585\u2585\u2585\u2585\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2587\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 \u2588\n  557 \u03bcs        Histogram: log(frequency) by time       1.45 ms <\n\n Memory estimate: 8.42 MiB, allocs estimate: 223.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#mutate-6-new-columns","title":"mutate 6 new columns","text":"
    function mutate6_tidier()\n    @chain movies begin\n        @mutate(\n        Votes_R1_Product = Votes .* R1,\n        Rating_Year_Ratio = Rating ./ Year,\n        R1_to_R5_Sum = R1 + R2 + R3 + R4 + R5,\n        High_Budget_Flag = if_else(ismissing(Budget), \"NA\", Budget .> 50000),\n        R6_to_R8_Avg = (R6 + R7 + R8) / 3,\n        year_Minus_Length = Year - Length)\n    end\nend\n\n@benchmark mutate6_tidier()\n\n@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :Votes_R1_Product, [:Rating, :Year] => ((r, y) -> r ./ y) => :Rating_Year_Ratio, [:R1, :R2, :R3, :R4, :R5] => ((a, b, c, d, e) -> a + b + c + d + e) => :R1_to_R5_Sum, :Budget => (b -> ifelse.(ismissing.(b), missing, b .> 50000)) => :High_Budget_Flag, [:R6, :R7, :R8] => ((f, g, h) -> (f + g + h) / 3) => :R6_to_R8_Avg, [:Year, :Length] => ((y, l) -> y - l) => :Year_Minus_Length )\n
    BenchmarkTools.Trial: 4012 samples with 1 evaluation.\n Range (min \u2026 max):  1.022 ms \u2026   6.252 ms  \u250a GC (min \u2026 max): 0.00% \u2026 10.10%\n Time  (median):     1.143 ms               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   1.241 ms \u00b1 282.196 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  7.04% \u00b1 11.85%\n\n       \u2583\u2588\u2588\u2585\u2582                                                   \n  \u2582\u2582\u2583\u2584\u2587\u2588\u2588\u2588\u2588\u2588\u2587\u2585\u2583\u2583\u2583\u2582\u2582\u2582\u2582\u2581\u2582\u2581\u2582\u2582\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2582\u2582\u2581\u2581\u2581\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2583\u2583\u2583\u2584\u2583\u2583\u2583\u2583\u2583\u2583 \u2583\n  1.02 ms         Histogram: frequency by time        1.92 ms <\n\n Memory estimate: 10.56 MiB, allocs estimate: 581.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#groupby-then-2-mutates","title":"groupby then 2 mutates","text":"
    function groupby1_2mutate_tidier()\n@chain movies begin\n    @group_by(MPAA)\n    @mutate(ace = R1 -> R1/2 * 4)\n    @mutate(Bace = Votes^R1)\nend\nend\n\n@benchmark groupby1_2mutate_tidier()\n\n@benchmark transform( transform( groupby(movies, :MPAA), :R1 => (x -> x/2 * 4) => :ace, ungroup = false), [:Votes, :R1] => ((a, b) -> b .^ a) => :Bace, ungroup = false)\n
    BenchmarkTools.Trial: 683 samples with 1 evaluation.\n Range (min \u2026 max):  6.629 ms \u2026  12.749 ms  \u250a GC (min \u2026 max): 0.00% \u2026 7.02%\n Time  (median):     7.068 ms               \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   7.318 ms \u00b1 541.471 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  2.98% \u00b1 4.19%\n\n      \u2581 \u2584\u2588\u2585\u2584\u2582\u2583                                                 \n  \u2582\u2583\u2584\u2587\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2586\u2584\u2584\u2583\u2584\u2582\u2583\u2583\u2584\u2583\u2584\u2585\u2585\u2586\u2585\u2586\u2586\u2585\u2586\u2587\u2586\u2584\u2586\u2585\u2584\u2583\u2583\u2583\u2583\u2582\u2582\u2582\u2582\u2583\u2582\u2583\u2582\u2582\u2581\u2582\u2583\u2582\u2582\u2581\u2581\u2582\u2583 \u2583\n  6.63 ms         Histogram: frequency by time        8.81 ms <\n\n Memory estimate: 26.17 MiB, allocs estimate: 2449.\n
    "},{"location":"examples/generated/UserGuide/benchmark/#select-5-columns","title":"select 5 columns","text":"
    function select5_tidier()\n    @chain movies begin\n        @select(R1:R5)\n    end\nend\n\n@benchmark select5_tidier()\n\n@benchmark select(movies, :R1, :R2, :R3, :R4, :R5)\n
    BenchmarkTools.Trial: 10000 samples with 1 evaluation.\n Range (min \u2026 max):  173.423 \u03bcs \u2026  4.715 ms  \u250a GC (min \u2026 max): 0.00% \u2026 8.59%\n Time  (median):     221.673 \u03bcs              \u250a GC (median):    0.00%\n Time  (mean \u00b1 \u03c3):   237.430 \u03bcs \u00b1 95.591 \u03bcs  \u250a GC (mean \u00b1 \u03c3):  4.87% \u00b1 9.96%\n\n      \u2585\u2588\u2584                                                       \n  \u2582\u2582\u2584\u2588\u2588\u2588\u2588\u2586\u2585\u2584\u2583\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2581\u2581\u2582\u2581\u2582\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2582\u2581\u2581\u2582\u2581\u2582\u2582\u2582\u2582\u2581\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582\u2582 \u2583\n  173 \u03bcs          Histogram: frequency by time          688 \u03bcs <\n\n Memory estimate: 2.25 MiB, allocs estimate: 200.\n

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/binding/","title":"Binding","text":"

    Whereas joins are useful for combining data frames based on matching keys, another way to combine data frames is to bind them together, which can be done either by rows or by columns. TidierData.jl implements these actions using @bind_rows() and @bind_cols(), respectively.

    Let's generate three data frames to combine.

    using TidierData\n\ndf1 = DataFrame(a=1:3, b=1:3);\n\ndf2 = DataFrame(a=4:6, b=4:6);\n\ndf3 = DataFrame(a=7:9, c=7:9);\n

    "},{"location":"examples/generated/UserGuide/binding/#bind_rows","title":"@bind_rows()","text":"
    @bind_rows(df1, df2)\n
    6\u00d72 DataFrame RowabInt64Int64111222333444555666

    @bind_rows() keeps columns that are present in at least one of the provided data frames. Any missing columns will be filled with missing values.

    @bind_rows(df1, df3)\n
    6\u00d73 DataFrame RowabcInt64Int64?Int64?111missing222missing333missing47missing758missing869missing9

    There is an optional id argument to add an identifier for combined data frames. Note that both @bind_rows and @bind_cols accept multiple (i.e., more than 2) data frames, as in the example below.

    @bind_rows(df1, df2, df3, id = \"id\")\n
    9\u00d74 DataFrame RowabcidInt64Int64?Int64?Int64111missing1222missing1333missing1444missing2555missing2666missing277missing7388missing8399missing93

    "},{"location":"examples/generated/UserGuide/binding/#bind_cols","title":"@bind_cols()","text":"

    @bind_cols works similarly to R's tidyverse although the .name_repair argument is not supported.

    @bind_cols(df1, df2)\n
    3\u00d74 DataFrame Rowaba_1b_1Int64Int64Int64Int64111442225533366

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/column_names/","title":"Column names","text":"

    When referring to column names, TidierData.jl is a bit unusual for a Julia package in that it does not use symbols. This is because TidierData.jl uses tidy expressions, which in R lingo equates to a style of programming referred to as \"non-standard evaluation.\" If you are creating a new column a containing a value that is the mean of column b, you would simply write a = mean(b).

    However, there may be times when you wish to create or refer to a column containing a space in it. Let's start by creating some column names containing a space in their name.

    using TidierData\n\ndf = DataFrame(var\"my name\" = [\"Ada\", \"Twist\"],\n               var\"my age\" = [40, 50])\n
    2\u00d72 DataFrame Rowmy namemy ageStringInt641Ada402Twist50

    To create a column name containing a space, we used the var\"column name\" notation. Because DataFrame() is a regular Julia function, this is the standard way to refer to a variable containing a space, which is why we need to use this here.

    This notation also works inside of TidierData.jl.

    "},{"location":"examples/generated/UserGuide/column_names/#varcolumn-name-notation","title":"var\"column name\" notation","text":"

    If we want to figure out the age for the people in our dataset a decade from today, we could use this same var\"column name\" notation inside of @mutate.

    @chain df begin\n  @mutate(var\"age in 10 years\" = var\"my age\" + 10)\nend\n
    2\u00d73 DataFrame Rowmy namemy ageage in 10 yearsStringInt64Int641Ada40502Twist5060

    However, typing out the var\"column name\" can become cumbersome. TidierData.jl also supports another shorthand notation to refer to column names containing spaces or other special characters: backticks.

    "},{"location":"examples/generated/UserGuide/column_names/#backtick-notation","title":"Backtick notation","text":"

    This same code could be written more concisely like this:

    @chain df begin\n  @mutate(`age in 10 years` = `my age` + 10)\nend\n
    2\u00d73 DataFrame Rowmy namemy ageage in 10 yearsStringInt64Int641Ada40502Twist5060

    Backticks are an R convention. While they are not specific to tidyverse, they are a convenient way to refer to column names that otherwise would not parse correctly as a single entity. Backticks are supported in all TidierData.jl functions where column names may be referenced.

    "},{"location":"examples/generated/UserGuide/column_names/#cleaning-up-column-names","title":"Cleaning up column names","text":"

    Another option is to clean up the column names so that you do not have spaces to begin with. In R, this is usually accomplished using the janitor package. In Julia, the Cleaner.jl package provides this functionality, which we have wrapped inside of TidierData.jl.

    @chain df begin\n  @clean_names\nend\n
    2\u00d72 DataFrame Rowmy_namemy_ageStringInt641Ada402Twist50

    Although the default value for the case argument is \"snake_case\", you can also set this to \"camelCase\".

    @chain df begin\n  @clean_names(case = \"camelCase\")\nend\n
    2\u00d72 DataFrame RowmyNamemyAgeStringInt641Ada402Twist50

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/conditionals/","title":"Conditionals","text":"

    Conditional functions are a useful tool to update or create new columns conditional on the values of a column of data. When continuous variables are converted to categories, this is sometimes referred to as \"recoding\" a column.

    TidierData.jl provides two functions to recode data: if_else() and case_when().

    "},{"location":"examples/generated/UserGuide/conditionals/#if_else","title":"if_else()","text":"

    Why do we need another if_else() function if base Julia already comes with an ifelse() function? Similar to R, the base Julia implementation of ifelse() does not include a way to designate what value to return if the enclosed vector contains a missing value. Additionally, the base Julia implementation of ifelse() produces an error if presented with a missing value in the condition. The TidierData.jl if_else() can handle missing values and includes an optional 4th argument that is used to designate what to return in the event of a missing value for the condition. Let's take a look at some examples.

    using TidierData\n\ndf = DataFrame(a = [1, 2, missing, 4, 5])\n
    5\u00d71 DataFrame RowaInt64?11223missing4455

    Here, we have created a DataFrame containing a single column a with 5 values, for which the 3rd value is missing.

    Now, let's create a new column b that contains a \"yes\" if a is greater than or equal to 3, and a \"no\" otherwise. Notice that when we do this, the missing value remains missing.

    @chain df begin\n  @mutate(b = if_else(a >= 3, \"yes\", \"no\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String?11no22no3missingmissing44yes55yes

    What if we wanted to fill in the missing value with \"unknown\"? All we need to do is provide an optional 4th argument containing the value to return in the event of a missing condition. When we run this version, missing values in a are converted to \"unknown\" in b.

    @chain df begin\n  @mutate(b = if_else(a >= 3, \"yes\", \"no\", \"unknown\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String11no22no3missingunknown44yes55yes

    Although both of these examples showed how to return a single value (like \"yes\" and \"no\"), you can also return a vector of values, which is useful for updating only a subset of the values of a column. For example, if we wanted to create a column b that contains a 3 when a is greater than or equal to 3 but otherwise remains unchanged, we could provide a 3 for the yes condition and a vector (column) a in the no condition. If we do not provide the optional 4th argument, missing values remain missing.

    @chain df begin\n  @mutate(b = if_else(a >= 3, 3, a))\nend\n
    5\u00d72 DataFrame RowabInt64?Int64?1112223missingmissing443553

    "},{"location":"examples/generated/UserGuide/conditionals/#case_when","title":"case_when()","text":"

    Although if_else() is convenient when evaluating a single condition, it can be cumbersome when evaluating multiple conditions because subsequent conditions need to be nested within the no condition for the preceding argument. For situations where multiple conditions need to be evaluated, case_when() is more convenient.

    Let's first consider a similar example from above and recreate it using case_when(). The following code creates a column b that assigns a value of 3 if a >= 3 and otherwise leaves the value unchanged.

    @chain df begin\n  @mutate(b = case_when(a >= 3  =>  3,\n                        true    =>  a))\nend\n
    5\u00d72 DataFrame RowabInt64?Int64?1112223missingmissing443553

    What is going on here? case_when() uses a condition => return_value syntax, which are encoded as pairs in Julia. You can provide a single pair, or multiple pairs separated by commas. Because the pairs operator (=>) might be confused with a greater than or equal to sign (>=), we have padded two spaces on either side of the => to make sure that the pair remains visually distinct. We do not use a ~ operator in case_when() (as is used in R) because the ~ operator is used to denote de-vectorized functions in TidierData.jl.

    There are 2 other things to note above. First, the true condition evaluates to true for all remaining values of a. The only reason that b contains a missing value here is that the true condition was met, leading to the value of a (in this case, missing) being assigned to b. Second, we were able to return a single value (3) in the first condition, and a vector (column) of data (a) in the second condition.

    What if we wanted to fill in the missing values with something else? In this case, we would need to create an explicit condition that checks for missing values and assigns a return value to that condition.

    @chain df begin\n  @mutate(b = case_when(a >= 3        =>  3,\n                        ismissing(a)  =>  0,\n                        true          =>  a))\nend\n
    5\u00d72 DataFrame RowabInt64?Int641112223missing0443553

    Do our conditions have to be mutually exclusive? No. The return value for the first matching condition is assigned to b because the conditions are evaluated sequentially from first to last.

    @chain df begin\n  @mutate(b = case_when(a > 4  =>  \"hi\",\n                        a > 2  =>  \"medium\",\n                        a > 0  =>  \"low\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String?11low22low3missingmissing44medium55hi

    Again, if we want to fill in remaining values (which in this case are the missing ones), we can map the final condition true to the value of \"unknown\". Because the ordering of the conditions matters, the true condition should always be listed last if it is included.

    @chain df begin\n  @mutate(b = case_when(a > 4  =>  \"hi\",\n                        a > 2  =>  \"medium\",\n                        a > 0  =>  \"low\",\n                        true   =>  \"unknown\"))\nend\n
    5\u00d72 DataFrame RowabInt64?String11low22low3missingunknown44medium55hi

    "},{"location":"examples/generated/UserGuide/conditionals/#do-these-functions-work-outside-of-tidierdatajl","title":"Do these functions work outside of TidierData.jl?","text":"

    Yes, both if_else() and case_when() work outside of TidierData.jl. However, you'll need to remember that if working with vectors, both the functions and conditions will need to be vectorized, and in the case of case_when(), the => will need to be written as .=>. The reason this is not needed when using these functions inside of TidierData.jl is because they are auto-vectorized.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/dataset_movies/","title":"Movies dataset","text":"

    To get started, we will load the movies dataset from the RDatasets.jl package.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    To work with this dataset, we will use the @chain macro. This macro initiates a pipe, and every function or macro provided to it between the begin and end blocks modifies the dataframe mentioned at the beginning of the pipe. You don't have to necessarily spread a chain over multiple lines of code, but when working with data frames it's often easiest to do so. Before going further, take a look at the Chain.jl GitHub page to see all the cool things that are possible with this, including mid-chain side effects using @aside and mid-chain assignment of variables.

    Let's take a look at the first 5 rows of the movies dataset using @slice().

    @chain movies begin\n    @slice(1:5)\nend\n
    5\u00d724 DataFrame RowTitleYearLengthBudgetRatingVotesR1R2R3R4R5R6R7R8R9R10MPAAActionAnimationComedyDramaDocumentaryRomanceShortStringInt32Int32Int32?Float64Int32Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Cat\u2026Int32Int32Int32Int32Int32Int32Int321$1971121missing6.43484.54.54.54.514.524.524.514.54.54.500110002$1000 a Touchdown193971missing6.0200.014.54.524.514.514.514.54.54.514.500100003$21 a Day Once a Month19417missing8.250.00.00.00.00.024.50.044.524.524.501000014$40,000199670missing8.2614.50.00.00.00.00.00.00.034.545.500100005$50,000 Climax Show, The197571missing3.41724.54.50.014.514.54.50.00.00.024.50000000

    Let's use @glimpse() to preview the dataset.

    @glimpse(movies)\n
    Rows: 58788\nColumns: 24\n.Title         String         $, $1000 a Touchdown, $21 a Day Once a Month, $40,\n.Year          Int32          1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19\n.Length        Int32          121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10\n.Budget        Union{Missing, Int32}missing, missing, missing, missing, missing,\n.Rating        Float64        6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0,\n.Votes         Int32          348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44\n.R1            Float64        4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5\n.R2            Float64        4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,\n.R3            Float64        4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5,\n.R4            Float64        4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4.\n.R5            Float64        14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0,\n.R6            Float64        24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,\n.R7            Float64        24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5,\n.R8            Float64        14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4\n.R9            Float64        4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4.\n.R10           Float64        4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24.\n.MPAA          CategoricalArrays.CategoricalValue{String, UInt8}, , , , , , R, ,\n.Action        Int32          0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,\n.Animation     Int32          0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n.Comedy        Int32          1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,\n.Drama         Int32          1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,\n.Documentary   Int32          0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n.Romance       Int32          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n.Short         Int32          0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,\n

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/distinct/","title":"@distinct","text":"

    The @distinct() macro in TidierData.jl is useful to select distinct rows. Like its R counterpart, it can be used with or without arguments. When arguments are provided, it behaves slightly differently than the R version. Whereas the R function only returns the provided columns, the TidierData.jl version returns all columns, where the first match is returned for the non-selected columns.

    using TidierData\n\ndf = DataFrame(a = 1:10, b = repeat('a':'e', inner = 2))\n
    10\u00d72 DataFrame RowabInt64Char11a22a33b44b55c66c77d88d99e1010e

    "},{"location":"examples/generated/UserGuide/distinct/#select-distinct-values-overall","title":"Select distinct values overall","text":"

    Since there are no duplicate rows, this will return all rows.

    @chain df begin\n    @distinct()\nend\n
    10\u00d72 DataFrame RowabInt64Char11a22a33b44b55c66c77d88d99e1010e

    "},{"location":"examples/generated/UserGuide/distinct/#select-distinct-values-based-on-column-b","title":"Select distinct values based on column b","text":"

    Notice that the first matching row for column a is returned for every distinct value of column b. This is slightly different behavior than R's tidyverse, which would have returned only column b.

    @chain df begin\n  @distinct(b)\nend\n
    5\u00d72 DataFrame RowabInt64Char11a23b35c47d59e

    In TidierData.jl, @distinct() works with grouped data frames. If grouped, @distinct() will ignore the grouping when determining distinct values but will return the data frame in grouped form based on the original groupings.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/fill_missing/","title":"Fill missing","text":"

    The @fill_missing macro is a reimplementation of fill(). To mirror the syntax in R, the methods available are \"up\" (fill from bottom up) and \"down\" (fill from top down).

    using TidierData\n\ndf = DataFrame(\n    a = [missing, 2, 3, missing, 5],\n    b = [missing, 1, missing, 4, 5],\n    c = ['a', 'b', missing, 'd', 'e'],\n    group = ['A', 'A', 'B', 'B', 'A']\n);\n

    "},{"location":"examples/generated/UserGuide/fill_missing/#fill-all-columns","title":"Fill all columns","text":"

    Fill missing values for the whole DataFrame using the \"down\" method (top to bottom)

    @chain df begin\n    @fill_missing(\"down\")\nend\n\n@fill_missing(df, \"down\")\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64?Char?Char1missingmissingaA221bA331bB434dB555eA

    "},{"location":"examples/generated/UserGuide/fill_missing/#fill-specifc-columns","title":"Fill specific columns","text":"

    This fills missing values in columns a and c going from bottom to top.

    @chain df begin\n    @fill_missing(a, c, \"up\")\nend\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64?Char?Char12missingaA221bA33missingdB454dB555eA

    "},{"location":"examples/generated/UserGuide/fill_missing/#fill-with-grouped-dataframes","title":"Fill with Grouped DataFrames","text":"

    When grouping by the group column, this fills missing values in column a within each group, going from top to bottom within that group.

    @chain df begin\n    @group_by(group)\n    @fill_missing(a, \"down\")\nend\n

    GroupedDataFrame with 2 groups based on key: group

    First Group (3 rows): group = 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase) RowabcgroupInt64?Int64?Char?Char1missingmissingaA221bA355eA

    &vellip;

    Last Group (2 rows): group = 'B': ASCII/Unicode U+0042 (category Lu: Letter, uppercase) RowabcgroupInt64?Int64?Char?Char13missingmissingB234dB

    "},{"location":"examples/generated/UserGuide/fill_missing/#replace_missing","title":"replace_missing()","text":"

    The replace_missing function facilitates the replacement of missing values with a specified replacement.

    @chain df begin\n    @mutate(b = replace_missing(b, 2))\nend\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64Char?Char1missing2aA221bA332missingB4missing4dB555eA

    "},{"location":"examples/generated/UserGuide/fill_missing/#missing_if","title":"missing_if()","text":"

    The missing_if function is used to introduce missing values under specific conditions.

    @chain df begin\n    @mutate(b = missing_if(b, 5))\nend\n
    5\u00d74 DataFrame RowabcgroupInt64?Int64?Char?Char1missingmissingaA221bA33missingmissingB4missing4dB55missingeA

    Both missing_if and replace_missing are not type-specific.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/filter/","title":"@filter","text":"

    Filtering is a mechanism to indicate which rows you want to keep in a dataset based on criteria. This is also referred to as subsetting. Filtering rows is normally a bit tricky in DataFrames.jl because comparison operators like >= actually need to be vectorized as .>=, which can catch new Julia users by surprise. @filter() mimics R's tidyverse behavior by auto-vectorizing the code and then only selecting those rows that evaluate to true. Similar to dplyr, rows that evaluate to missing are skipped.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/filter/#lets-take-a-look-at-the-movies-whose-budget-was-more-than-average-we-will-select-only-the-first-5-rows-for-the-sake-of-brevity","title":"Let\u2019s take a look at the movies whose budget was more than average. We will select only the first 5 rows for the sake of brevity.","text":"
    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @filter(Budget >= mean(skipmissing(Budget)))\n  @select(Title, Budget)\n  @slice(1:5)\nend\n
    5\u00d72 DataFrame RowTitleBudgetStringFloat64?1'Til There Was You23.0210 Things I Hate About You16.03102 Dalmatians85.0413 Going On 3037.0513th Warrior, The85.0"},{"location":"examples/generated/UserGuide/filter/#lets-search-for-movies-that-have-at-least-200-votes-and-a-rating-of-greater-than-or-equal-to-8-there-are-3-ways-you-can-specify-an-and-condition-inside-of-tidierdatajl","title":"Let's search for movies that have at least 200 votes and a rating of greater than or equal to 8. There are 3 ways you can specify an \"and\" condition inside of TidierData.jl.","text":""},{"location":"examples/generated/UserGuide/filter/#the-first-option-is-to-use-the-short-circuiting-operator-as-shown-below-this-is-the-preferred-approach-because-the-second-expression-is-only-evaluated-per-element-if-the-first-one-is-true","title":"The first option is to use the short-circuiting && operator as shown below. This is the preferred approach because the second expression is only evaluated (per element) if the first one is true.","text":"
    @chain movies begin\n  @filter(Votes >= 200 && Rating >= 8)\n  @select(Title, Votes, Rating)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleVotesRatingStringInt32Float64112 Angry Men292788.7212 stulev2528.932001: A Space Odyssey649828.3421 Grams218578.0539 Steps, The79318.0"},{"location":"examples/generated/UserGuide/filter/#the-second-option-is-to-use-the-bitwise-operator-note-that-there-is-a-key-difference-in-syntax-between-and-because-the-operator-takes-a-higher-operator-precedence-than-you-have-to-wrap-the-comparison-expressions-inside-of-parentheses-to-ensure-that-the-overall-expression-is-evaluated-correctly","title":"The second option is to use the bitwise & operator. Note that there is a key difference in syntax between & and &&. Because the & operator takes a higher operator precedence than >=, you have to wrap the comparison expressions inside of parentheses to ensure that the overall expression is evaluated correctly.","text":"
    @chain movies begin\n  @filter((Votes >= 200) & (Rating >= 8))\n  @select(Title, Votes, Rating)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleVotesRatingStringInt32Float64112 Angry Men292788.7212 stulev2528.932001: A Space Odyssey649828.3421 Grams218578.0539 Steps, The79318.0"},{"location":"examples/generated/UserGuide/filter/#the-third-option-for-and-conditions-only-is-to-separate-the-expressions-with-commas-this-is-similar-to-the-behavior-of-filter-in-tidyverse","title":"The third option for \"and\" conditions only is to separate the expressions with commas. This is similar to the behavior of filter() in tidyverse.","text":"
    @chain movies begin\n  @filter(Votes >= 200, Rating >= 8)\n  @select(Title, Votes, Rating)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleVotesRatingStringInt32Float64112 Angry Men292788.7212 stulev2528.932001: A Space Odyssey649828.3421 Grams218578.0539 Steps, The79318.0"},{"location":"examples/generated/UserGuide/filter/#now-lets-see-how-to-use-filter-with-in-heres-an-example-with-a-tuple","title":"Now let's see how to use @filter() with in. Here's an example with a tuple.","text":"
    @chain movies begin\n  @filter(Title in (\"101 Dalmatians\",\n                    \"102 Dalmatians\"))\n  @select(1:5)\nend\n
    2\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641101 Dalmatians1996103missing5.52102 Dalmatians2000100850000004.7"},{"location":"examples/generated/UserGuide/filter/#we-can-also-use-filter-with-in-using-a-vector-denoted-by-a","title":"We can also use @filter() with in using a vector, denoted by a [].","text":"
    @chain movies begin\n  @filter(Title in [\"101 Dalmatians\",\n                    \"102 Dalmatians\"])\n  @select(1:5)\nend\n
    2\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641101 Dalmatians1996103missing5.52102 Dalmatians2000100850000004.7"},{"location":"examples/generated/UserGuide/filter/#finally-we-can-combine-filter-with-row_number-to-retrieve-the-first-5-rows-which-can-be-used-to-mimic-the-functionality-provided-by-slice","title":"Finally, we can combine @filter with row_number() to retrieve the first 5 rows, which can be used to mimic the functionality provided by @slice.","text":"
    @chain movies begin\n  @filter(row_number() <= 5)\n  @select(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/group_by/","title":"@group_by","text":"

    Grouping and ungrouping behavior is one of the nicest parts of using R's tidyverse. Once a data frame is grouped, all verbs applied to that data frame respect the grouping, including but not limited to @mutate(), @summarize(), @slice() and @filter, which allows for really powerful abstractions. For example, with @group_by() followed by @filter(), you can limit the rows of a dataset to the maximum or minimum values for each group.

    Exactly as in R's tidyverse, once a data frame is grouped, it remains grouped until either @summarize() is called (which \"peels off\" one layer of grouping) or @ungroup() is called, which removes all layers of grouping. Also as in R's tidyverse, @group_by() sorts the groups in ascending order. Unlike in R, there is never any question about whether a data frame is currently grouped because GroupedDataFrames print out in a very different form than DataFrames, making them easy to tell apart.

    When using @chain, note that you can write either @ungroup or @ungroup(). Both are considered valid.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/group_by/#combining-group_by-with-mutate","title":"Combining @group_by() with @mutate()","text":"
    @chain movies begin\n    @group_by(Year)\n    @mutate(Mean_Yearly_Rating = mean(skipmissing(Rating)))\n    @select(Year, Rating, Mean_Yearly_Rating)\n    @ungroup\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame RowYearRatingMean_Yearly_RatingInt32Float64Float64119716.45.66517219396.06.35041319418.26.34107419968.25.74712519753.45.62908"},{"location":"examples/generated/UserGuide/group_by/#combining-group_by-with-summarize","title":"Combining @group_by() with @summarize()","text":"
    @chain movies begin\n    @group_by(Year)\n    @summarize(Mean_Yearly_Rating = mean(skipmissing(Rating)),\n        Median_Yearly_Rating = median(skipmissing(Rating)))\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame RowYearMean_Yearly_RatingMedian_Yearly_RatingInt32Float64Float64119715.665175.8219396.350416.4319416.341076.4419965.747125.9519755.629085.7"},{"location":"examples/generated/UserGuide/group_by/#grouping-by-multiple-columns","title":"Grouping by multiple columns","text":"
    @chain movies begin\n  @group_by(Year, Comedy)\n  @summarize(Mean_Yearly_Rating = mean(skipmissing(Rating)),\n      Median_Yearly_Rating = median(skipmissing(Rating)))\n  @ungroup # Need to ungroup to peel off grouping by Year\n  @arrange(desc(Year), Comedy)\n  @slice(1:5)\nend\n
    5\u00d74 DataFrame RowYearComedyMean_Yearly_RatingMedian_Yearly_RatingInt32Int32Float64Float641200506.627886.752200516.300816.13200406.765216.94200416.428986.65200306.404096.6"},{"location":"examples/generated/UserGuide/group_by/#combining-group_by-with-filter","title":"Combining @group_by() with @filter()","text":"
    @chain movies begin\n    @group_by(Year)\n    @filter(Rating == minimum(Rating))\n    @ungroup\n    @select(Year, Rating)\n    @arrange(desc(Year))\n    @slice(1:10)\nend\n
    10\u00d72 DataFrame RowYearRatingInt32Float64120051.8220041.0320041.0420041.0520041.0620041.0720041.0820041.0920031.01020031.0

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/interpolation/","title":"Interpolation","text":"

    The !! (\"bang bang\") operator can be used to interpolate values of variables from the parent environment into your code. This operator is borrowed from the R rlang package. At some point, we may switch to using native Julia interpolation, but for a variety of reasons that introduce some complexity with native interpolation, we plan to continue to support !! interpolation.

    To interpolate multiple variables, the rlang R package uses the !!! \"triple bang\" operator. However, in TidierData.jl, the !! \"bang bang\" operator can be used to interpolate either single or multiple values as shown in the examples below.

    Note: You can only interpolate values from variables in the parent environment. If you would like to interpolate column names, you have two options: you can either use across() or you can use @aside with @pull() to create variables in the parent environment containing the values of those columns which can then be accessed using interpolation.

    myvar = :b and myvar = Cols(:a, :b) both refer to *columns* with those names. On the other hand, myvar = \"b\", myvar = (\"a\", \"b\") and myvar = [\"a\", \"b\"] will interpolate the *values*. If you intend to interpolate column names, the preferred way is to use Cols() as in the examples below.

    using TidierData\n\ndf = DataFrame(a = string.(repeat('a':'e', inner = 2)),\n               b = [1,1,1,2,2,2,3,3,3,4],\n               c = 11:20)\n
    10\u00d73 DataFrame RowabcStringInt64Int641a1112a1123b1134b2145c2156c2167d3178d3189e31910e420

    "},{"location":"examples/generated/UserGuide/interpolation/#select-the-column-because-myvar-contains-a-symbol","title":"Select the column (because myvar contains a symbol)","text":"
    myvar = :b\n\n@chain df begin\n  @select(!!myvar)\nend\n
    10\u00d71 DataFrame RowbInt64112131425262738393104"},{"location":"examples/generated/UserGuide/interpolation/#select-multiple-variables","title":"Select multiple variables","text":"

    You can also use a vector as in [:a, :b], but Cols() is preferred because it lets you mix and match numbers.

    myvars = Cols(:a, :b)\n\n@chain df begin\n  @select(!!myvars)\nend\n
    10\u00d72 DataFrame RowabStringInt641a12a13b14b25c26c27d38d39e310e4

    This is the same as this...

    myvars = Cols(:a, 2)\n\n@chain df begin\n  @select(!!myvars)\nend\n
    10\u00d72 DataFrame RowabStringInt641a12a13b14b25c26c27d38d39e310e4

    "},{"location":"examples/generated/UserGuide/interpolation/#filter-rows-containing-the-value-of-myvar_string","title":"Filter rows containing the value of myvar_string","text":"
    myvar_string = \"b\"\n\n@chain df begin\n  @filter(a == !!myvar_string)\nend\n
    2\u00d73 DataFrame RowabcStringInt64Int641b1132b214"},{"location":"examples/generated/UserGuide/interpolation/#filtering-rows-works-similarly-using-in","title":"Filtering rows works similarly using in.","text":"

    Note that for in to work here, we have to wrap it in [] because otherwise, the string will be converted into a collection of characters, which are a different data type.

    myvar_string = \"b\"\n\n@chain df begin\n  @filter(a in [!!myvar_string])\nend\n
    2\u00d73 DataFrame RowabcStringInt64Int641b1132b214

    "},{"location":"examples/generated/UserGuide/interpolation/#you-can-also-use-this-for-a-vector-or-tuple-of-strings","title":"You can also use this for a vector (or tuple) of strings.","text":"
    myvars_string = [\"a\", \"b\"]\n\n@chain df begin\n  @filter(a in !!myvars_string)\nend\n
    4\u00d73 DataFrame RowabcStringInt64Int641a1112a1123b1134b214"},{"location":"examples/generated/UserGuide/interpolation/#mutate-one-variable","title":"Mutate one variable","text":"

    Remember: You cannot interpolate column names into @mutate() expressions. However, you can create a temporary variable containing the values of the column in question or you can use @mutate() with across().

    "},{"location":"examples/generated/UserGuide/interpolation/#option-1-create-a-temporary-variable-containing-the-values-of-the-column","title":"Option 1: Create a temporary variable containing the values of the column.","text":"
    myvar = :b\n\n@chain df begin\n  @aside(myvar_values = @pull(_, !!myvar))\n  @mutate(d = !!myvar_values + 1)\nend\n
    10\u00d74 DataFrame RowabcdStringInt64Int64Int641a11122a11223b11324b21435c21536c21637d31748d31849e319410e4205"},{"location":"examples/generated/UserGuide/interpolation/#option-2-use-mutate-with-across","title":"Option 2: Use @mutate() with across()","text":"

    Note: when using across(), anonymous functions are not vectorized. This is intentional to allow users to specify their function exactly as desired.

    @chain df begin\n  @mutate(across(!!myvar, x -> x .+ 1))\n  @rename(d = b_function)\nend\n
    10\u00d74 DataFrame RowabcdStringInt64Int64Int641a11122a11223b11324b21435c21536c21637d31748d31849e319410e4205

    "},{"location":"examples/generated/UserGuide/interpolation/#summarize-across-one-variable","title":"Summarize across one variable","text":"
    myvar = :b\n\n@chain df begin\n  @summarize(across(!!myvar, mean))\nend\n
    1\u00d71 DataFrame Rowb_meanFloat6412.2"},{"location":"examples/generated/UserGuide/interpolation/#summarize-across-multiple-variables","title":"Summarize across multiple variables","text":"
    myvars = Cols(:b, :c)\n\n@chain df begin\n  @summarize(across(!!myvars, (mean, minimum, maximum)))\nend\n
    1\u00d76 DataFrame Rowb_meanc_meanb_minimumc_minimumb_maximumc_maximumFloat64Float64Int64Int64Int64Int6412.215.5111420"},{"location":"examples/generated/UserGuide/interpolation/#group-by-one-interpolated-variable","title":"Group by one interpolated variable","text":"
    myvar = :a\n\n@chain df begin\n  @group_by(!!myvar)\n  @summarize(c = mean(c))\nend\n
    5\u00d72 DataFrame RowacStringFloat641a11.52b13.53c15.54d17.55e19.5"},{"location":"examples/generated/UserGuide/interpolation/#group-by-multiple-interpolated-variables","title":"Group by multiple interpolated variables","text":"

    Once again, you can mix and match column selectors within Cols()

    myvars = Cols(:a, 2)\n\n@chain df begin\n  @group_by(!!myvars)\n  @summarize(c = mean(c))\nend\n

    GroupedDataFrame with 5 groups based on key: a

    First Group (1 row): a = \"a\" RowabcStringInt64Float641a111.5

    &vellip;

    Last Group (2 rows): a = \"e\" RowabcStringInt64Float641e319.02e420.0

    Notice that df remains grouped by a because the @summarize() peeled off one layer of grouping.

    "},{"location":"examples/generated/UserGuide/interpolation/#global-constants","title":"Global constants","text":"

    You can also use !! interpolation to access global variables like pi.

    df = DataFrame(radius = 1:5)\n\n@chain df begin\n  @mutate(area = !!pi * radius^2)\nend\n
    5\u00d72 DataFrame RowradiusareaInt64Float64113.141592212.56643328.27434450.26555578.5398

    As of v0.14.0, global constants defined within the Base or Core modules (like missing, pi, and Real) can be directly referenced without any !!.

    @chain df begin\n  @mutate(area = pi * radius^2)\nend\n
    5\u00d72 DataFrame RowradiusareaInt64Float64113.141592212.56643328.27434450.26555578.5398

    "},{"location":"examples/generated/UserGuide/interpolation/#alternative-interpolation-syntax","title":"Alternative interpolation syntax","text":"

    Since we know that pi is defined in the Main module, we can also access it using Main.pi.

    @chain df begin\n  @mutate(area = Main.pi * radius^2)\nend\n
    5\u00d72 DataFrame RowradiusareaInt64Float64113.141592212.56643328.27434450.26555578.5398

    The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use !!variable or [Module_name_here].variable syntax to refer to this variable.

    Note: You can use !! interpolation anywhere, including inside of functions and loops.

    df = DataFrame(a = string.(repeat('a':'e', inner = 2)),\n               b = [1,1,1,2,2,2,3,3,3,4],\n               c = 11:20)\n\nfor col in [:b, :c]\n  @chain df begin\n    @summarize(across(!!col, mean))\n    println\n  end\nend\n
    1\u00d71 DataFrame\n Row \u2502 b_mean\n     \u2502 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502     2.2\n1\u00d71 DataFrame\n Row \u2502 c_mean\n     \u2502 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   1 \u2502    15.5\n

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/joins/","title":"Joins","text":"

    One really nice thing about the R tidyverse implementation of joins is that they support natural joins. If you don't specify which columns to join on, these column names are inferred from the overlapping columns. While you can override this behavior by specifying which columns to join on, it's convenient that this is not strictly required. We have adopted a similar approach to joins in TidierData.jl.

    Here, we will only show examples of natural joins. For additional ways to join, take a look at the examples in the Reference.

    using TidierData\n

    Let's generate two data frames to join on. Here's the first one.

    df1 = DataFrame(a = [\"a\", \"b\"], b = 1:2);\n

    And here's the second one.

    df2 = DataFrame(a = [\"a\", \"c\"], c = 3:4);\n

    All the joins work similarly to R's tidyverse although the new join_by syntax for non-equijoins is not (yet) supported.

    "},{"location":"examples/generated/UserGuide/joins/#left-join","title":"Left join","text":"
    @left_join(df1, df2)\n
    2\u00d73 DataFrame RowabcStringInt64Int64?1a132b2missing"},{"location":"examples/generated/UserGuide/joins/#right-join","title":"Right join","text":"
    @right_join(df1, df2)\n
    2\u00d73 DataFrame RowabcStringInt64?Int641a132cmissing4"},{"location":"examples/generated/UserGuide/joins/#inner-join","title":"Inner join","text":"
    @inner_join(df1, df2)\n
    1\u00d73 DataFrame RowabcStringInt64Int641a13"},{"location":"examples/generated/UserGuide/joins/#full-join","title":"Full join","text":"
    @full_join(df1, df2)\n
    3\u00d73 DataFrame RowabcStringInt64?Int64?1a132b2missing3cmissing4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/mutate_transmute/","title":"@mutate","text":"

    The primary purpose of @mutate() is to either create a new column or to update an existing column without changing the number of rows in the dataset. If you only plan to select the mutated columns, then you can use @transmute() instead of @mutate(). However, in TidierData.jl, @select() can also be used to create and select new columns (unlike R's tidyverse), which means that @transmute() is a redundant function in that it has the same functionality as @select(). @transmute is included in TidierData.jl for convenience but is not strictly required.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-to-add-a-new-column","title":"Using @mutate() to add a new column","text":"

    Let's create a new column that contains the budget for each movie expressed in millions of dollars, and then select a handful of columns and rows for the sake of brevity. Notice that the underscores in 1_000_000 are strictly optional and included only for the sake of readability. Underscores within numbers are ignored by Julia, such that 1_000_000 is read by Julia exactly the same as 1000000.

    @chain movies begin\n  @filter(!ismissing(Budget))\n  @mutate(Budget_Millions = Budget/1_000_000)\n  @select(Title, Budget, Budget_Millions)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleBudgetBudget_MillionsStringInt32?Float641'G' Men4500000.452'Manos' the Hands of Fate190000.0193'Til There Was You2300000023.04.com for Murder50000005.0510 Things I Hate About You1600000016.0

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-to-update-an-existing-column","title":"Using @mutate() to update an existing column","text":"

    Here we will repeat the same exercise, except that we will overwrite the existing Budget column.

    @chain movies begin\n    @filter(!ismissing(Budget))\n    @mutate(Budget = Budget/1_000_000)\n    @select(Title, Budget)\n    @slice(1:5)\nend\n
    5\u00d72 DataFrame RowTitleBudgetStringFloat641'G' Men0.452'Manos' the Hands of Fate0.0193'Til There Was You23.04.com for Murder5.0510 Things I Hate About You16.0

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-with-in","title":"Using @mutate() with in","text":"

    Here's an example of using @mutate with in.

    @chain movies begin\n  @filter(!ismissing(Budget))\n  @mutate(Nineties = Year in 1990:1999)\n  @select(Title, Year, Nineties)\n  @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleYearNinetiesStringInt32Bool1'G' Men1935false2'Manos' the Hands of Fate1966false3'Til There Was You1997true4.com for Murder2002false510 Things I Hate About You1999true

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-mutate-with-n-and-row_number","title":"Using @mutate with n() and row_number()","text":"

    Here's an example of using @mutate with both n() and row_number(). Within the context of @mutate(), n() and row_number() are turned into temporary columns, which means that they can be used inside of expressions.

    @chain movies begin\n  @mutate(Row_Num = row_number(),\n          Total_Rows = n())\n  @filter(!ismissing(Budget))\n  @select(Title, Year, Row_Num, Total_Rows)\n  @slice(1:5)\nend\n
    5\u00d74 DataFrame RowTitleYearRow_NumTotal_RowsStringInt32Int64Int641'G' Men193522587882'Manos' the Hands of Fate196635587883'Til There Was You199748587884.com for Murder20029158788510 Things I Hate About You199911258788

    "},{"location":"examples/generated/UserGuide/mutate_transmute/#using-transmute-to-update-and-select-columns","title":"Using @transmute to update and select columns.","text":"

    If we knew we wanted to select only the Title and Budget columns, we could have also used @transmute(), which (again) is just an alias for @select().

    @chain movies begin\n    @filter(!ismissing(Budget))\n    @transmute(Title = Title, Budget = Budget/1_000_000)\n    @slice(1:5)\nend\n
    5\u00d72 DataFrame RowTitleBudgetStringFloat641'G' Men0.452'Manos' the Hands of Fate0.0193'Til There Was You23.04.com for Murder5.0510 Things I Hate About You16.0

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/nesting/","title":"Nesting","text":""},{"location":"examples/generated/UserGuide/nesting/#nest","title":"@nest","text":"

    Nest columns into a dataframe nested into a new column

    using TidierData\n\ndf4 = DataFrame(x = [\"a\", \"b\", \"a\", \"b\", \"C\", \"a\"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7)\n\nnested_df = @nest(df4, n2 = starts_with(\"a\"), n3 = y:yz)\n
    3\u00d73 DataFrame Rowxn3n2StringDataFrameDataFrame1a3\u00d72 DataFrame3\u00d72 DataFrame2b2\u00d72 DataFrame2\u00d72 DataFrame3C1\u00d72 DataFrame1\u00d72 DataFrame

    To return to the original dataframe, you can unnest wider and then longer.

    @chain nested_df begin\n    @unnest_wider(n3:n2)\n    @unnest_longer(y:ab)\nend\n
    6\u00d75 DataFrame RowxyyzaabStringInt64Int64Int64Int641a1137122a3159103a6181274b2148115b4161096C517118

    Or you can unnest longer and then wider.

    @chain nested_df begin\n  @unnest_longer(n3:n2)\n  @unnest_wider(n3:n2)\nend\n
    6\u00d75 DataFrame RowxyzyaabStringInt64Int64Int64Int641a1317122a1539103a1861274b1428115b1641096C175118

    "},{"location":"examples/generated/UserGuide/nesting/#unnest_longer","title":"@unnest_longer","text":"

    @unnest_longer adds one row per entry of an array or dataframe, lengthening dataframe by flattening the column or columns.

    df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]);\n\n@chain df begin\n    @unnest_longer(y)\nend\n
    5\u00d72 DataFrame RowxyInt64Any121222323434535

    If there are rows with empty arrays, keep_empty will prevent these rows from being dropped. indices_include will add a new column for each flattened column that logs the position of each entry in the array.

    @chain df begin\n    @unnest_longer(y, keep_empty = true, indices_include = true)\nend\n
    7\u00d73 DataFrame Rowxyy_idInt64AnyInt6411missing12211322242335341635274missing1

    "},{"location":"examples/generated/UserGuide/nesting/#unnest_wider","title":"@unnest_wider","text":"

    @unnest_wider will widen a column or column(s) of Dicts, Arrays, Tuples or Dataframes into multiple columns.

    df2 = DataFrame(\n           name = [\"Zaki\", \"Farida\"],\n           attributes = [\n               Dict(\"age\" => 25, \"city\" => \"New York\"),\n               Dict(\"age\" => 30, \"city\" => \"Los Angeles\")]);\n\n@chain df2 begin\n    @unnest_wider(attributes)\nend\n
    2\u00d73 DataFrame RownamecityageStringStringInt641ZakiNew York252FaridaLos Angeles30

    "},{"location":"examples/generated/UserGuide/nesting/#unnesting-nested-dataframes-with-different-lengths-which-contains-arrays","title":"Unnesting nested Dataframes with different lengths which contains arrays","text":"
    df3 = DataFrame(\n    x = 1:3,\n    y = Any[\n        DataFrame(),\n        DataFrame(a = [\"A\"], b = [14]),\n        DataFrame(a = [\"A\", \"B\", \"C\"], b = [13, 12, 11], c = [4, 4, 4])\n    ]\n)\n
    3\u00d72 DataFrame RowxyInt64Any110\u00d70 DataFrame221\u00d72 DataFrame333\u00d73 DataFrame

    df3 contains dataframes with different widths that also contain arrays. Chaining together @unnest_wider and @unnest_longer will unnest the columns to tuples first and then they will be fully unnested after.

    @chain df3 begin\n    @unnest_wider(y)\n    @unnest_longer(a:c, keep_empty = true)\nend\n
    5\u00d74 DataFrame RowxabcInt64AnyInt64?Int64?11missingmissingmissing22A14missing33A13443B12453C114

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/piping/","title":"Piping","text":"

    The easiest way to use TidierData.jl for complex data transformation operations is to connect them together using pipes. Julia comes with the built-in |> pipe operator, but TidierData.jl also includes and re-exports the @chain macro from the Chain.jl package. On this page, we will show you how to use both approaches.

    First, let's load a dataset.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/piping/#julias-built-in-pipe","title":"Julia's built-in |> pipe","text":"

    If we wanted to figure out the number of rows in the movies data frame, one way to do this is to apply the nrow() function to movies. The most straightforward way is to write it like this:

    nrow(movies)\n
    58788\n

    Another perfectly valid way to write this expression is by piping movies into nrow using the |> pipe operator.

    movies |> nrow\n
    58788\n

    Why might we want to do this? Well, whereas the first expression would naturally be read as \"Calculate the number of rows of movies,\" the second expression reads as \"Start with movies, then calculate the number of rows.\" For a simple expression, these are easy enough to reason about. However, as we start to pipe more and more functions in a single expression, the piped version becomes much easier to reason about.

    One quick note about Julia's built-in pipe: writing movies |> nrow() would not be considered valid. This is because Julia's built-in pipe always expects a function and not a function call. Writing nrow by itself is naming the function, whereas writing nrow() is calling the function. This quickly becomes an issue once we want to supply arguments to the function we are calling.

    Consider another approach to calculating the number of rows:

    size(movies, 1)\n
    58788\n

    In this case, the size() function returns a tuple of (rows, columns), and if you supply an optional second argument specifying the index of the tuple, it returns only that dimension. In this case, we called size() with a second argument of 1, indicating that we only wanted the function to return the number of rows.

    How would we write this using Julia's built-in pipe?

    movies |>\n  x -> size(x, 1)\n
    58788\n

    You might have wanted to write movies |> size(1), but because size(1) would represent a function call, we have to wrap the function call within an anonymous function, which is easily accomplished using the x -> func(x, arg1, arg2) syntax, where func() refers to any function and arg1 and arg2 refer to any additional arguments that are needed.

    Another way we could have accomplished this is to calculate size, which returns a tuple of (rows, columns), and then to use an anonymous function to grab the first value. Since we are calculating size without any arguments, we can simply write size within the pipe. However, to grab the first value using the x[1] syntax, we have to define an anonymous function. Putting it all together, we get this approach to piping:

    movies |>\n  size |>\n  x -> x[1]\n
    58788\n

    "},{"location":"examples/generated/UserGuide/piping/#using-the-chain-macro","title":"Using the @chain macro","text":"

    The @chain macro comes from the Chain.jl package and is included and re-exported by TidierData.jl. Let's do this same series of exercises using @chain.

    Let's calculate the number of rows using @chain.

    @chain movies nrow\n
    58788\n

    One of the reasons we prefer the use of @chain in TidierData.jl is that it is so concise. There is no need for any operator. Another interesting thing is that @chain doesn't care whether you use a function name or a function call. Both approaches work. As a result, writing nrow() instead of nrow is equally valid using @chain.

    @chain movies nrow()\n
    58788\n

    There are two options for writing out multi-row chains. The preferred approach is as follows, where the starting item is listed, followed by a begin-end block.

    @chain movies begin\n  nrow\nend\n
    58788\n

    @chain also comes with a built-in placeholder, which is _. To calculate the size and extract the first value, we can use this approach:

    @chain movies begin\n  size\n  _[1]\nend\n
    58788\n

    You don't have to list the data frame before the begin-end block. This is equally valid:

    @chain begin\n  movies\n  size\n  _[1]\nend\n
    58788\n

    The only time this approach is preferred is when instead of simply naming the data frame, you are using a function to read in the data frame from a file or database. Because this function call may include the path of the file, which could be quite long, it's easier to write this on its own line within the begin-end block.

    While the documentation for TidierData.jl follows the convention of placing piped functions on separate lines of code using begin-end blocks, this is purely convention for ease of readability. You could rewrite the code above without the begin-end block as follows:

    @chain movies size _[1]\n
    58788\n

    For simple transformations, this approach is both concise and readable.

    "},{"location":"examples/generated/UserGuide/piping/#using-chain-with-tidierdatajl","title":"Using @chain with TidierData.jl","text":"

    Returning to our convention of multi-line pipes, let's grab the first five movies that were released since 2000 and had a rating of at least 9 out of 10. Here is one way that we could write this:

    @chain movies begin\n    @filter(Year >= 2000 && Rating >= 9)\n    @slice(1:5)\n    @select(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    Note: we generally prefer using && in Julia because it is a \"short-cut\" operator. If the first condition evaluates to false, then the second condition is not even evaluated, which makes it faster (because it takes a short-cut).

    In the case of @filter, multiple conditions can be written out as separate expressions.

    @chain movies begin\n  @filter(Year >= 2000, Rating >= 9)\n  @slice(1:5)\n  @select(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    Another way to write this expression is to take advantage of the fact that Julia macros can be called without parentheses. In this case, we will add back the && for the sake of readability.

    @chain movies begin\n  @filter Year >= 2000 && Rating >= 9\n  @slice 1:5\n  @select 1:5\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    Lastly, TidierData.jl also supports multi-line expressions within each of the macros that accept multiple expressions. So you could also write this as follows:

    @chain movies begin\n  @filter begin\n    Year >= 2000\n    Rating >= 9\n  end\n  @slice 1:5\n  @select 1:5\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float64113 Lakes2004135missing9.022wks, 1yr2002104missing9.43500 Years Later2005106missing9.349020051440009.15Able's House Is Green, The20031368009.4

    What's nice about this approach is that if you want to remove some criteria, you can easily comment out the relevant parts. For example, if you're willing to consider older movies, just comment out the Year >= 2000.

    @chain movies begin\n  @filter begin\n    # Year >= 2000\n    Rating >= 9\n  end\n  @slice 1:5\n  @select 1:5\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641+1 -119877missing9.42100 Years at the Movies19949missing9.2313 Lakes2004135missing9.042wks, 1yr2002104missing9.45500 Years Later2005106missing9.3

    "},{"location":"examples/generated/UserGuide/piping/#which-approach-to-use","title":"Which approach to use?","text":"

    The purpose of this page was to show you that both Julia's native pipes and the @chain macro are perfectly valid and capable. We prefer the use of @chain because it is a bit more flexible and concise, with a syntax that makes it easy to comment out individual operations. We have adopted a similar begin-end block functionality within TidierData.jl itself, so that you can spread arguments out over multiple lines if you prefer. In the end, the choice is up to you!

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/pivots/","title":"Pivoting","text":"

    Pivoting a dataset is needed when information sitting inside of cell values needs to be converted into column names (to make the dataset wider) or vice versa (to make the dataset longer). Either action can be referred to as \"reshaping\" a dataset, and various frameworks refer to the actions as unstacking/stacking or spreading/gathering. In R's tidyverse, these actions are referred to as pivoting, where the two accompanying actions are @pivot_wider() and @pivot_longer().

    "},{"location":"examples/generated/UserGuide/pivots/#pivot_wider","title":"@pivot_wider()","text":"

    Pivoting a dataset to make it wider is needed when information sitting inside of cell values needs to be converted into column names. The wider format is sometimes required for the purposes of calculating correlations or running statistical tests.

    Let's start with a \"long\" DataFrame and make it wide. Why would we want to make it wide? Well, if we wanted to calculate a correlation between A and B for rows with corresponding id numbers, we may need to first make sure that A and B are represented in adjacent columns.

    using TidierData\n\ndf_long = DataFrame(id = [1, 1, 2, 2],\n                    variable = [\"A\", \"B\", \"A\", \"B\"],\n                    value = [1, 2, 3, 4])\n
    4\u00d73 DataFrame RowidvariablevalueInt64StringInt6411A121B232A342B4

    To make this dataset wider, we can do the following:

    @pivot_wider(df_long, names_from = variable, values_from = value)\n
    2\u00d73 DataFrame RowidABInt64Int64?Int64?11122234

    In @pivot_wider(), both the names_from and values_from arguments are required. @pivot_wider() also supports string values for the names_from and values_from arguments.

    @pivot_wider(df_long, names_from = \"variable\", values_from = \"value\")\n
    2\u00d73 DataFrame RowidABInt64Int64?Int64?11122234

    "},{"location":"examples/generated/UserGuide/pivots/#pivot_longer","title":"@pivot_longer()","text":"

    For calculating summary statistics (e.g., mean) by groups, or for plotting purposes, DataFrames often need to be converted to their longer form. For this, we can use @pivot_longer. First, let's start with a \"wide\" DataFrame.

    df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4])\n
    2\u00d73 DataFrame RowidABInt64Int64Int6411122234

    Now, let's transform this wide dataset into the longer form. Unlike @pivot_wider(), where providing the names_from and values_from arguments is required, the only item that's required in @pivot_longer() is a set of columns to pivot. The names_to and values_to arguments are optional, and if not provided, they will default to \"variable\" and \"value\", respectively.

    We can recreate the original long dataset by doing the following. Multiple columns must be provided using selection syntax or a selection helper. Tuples containing multiple columns are not yet supported.

    @pivot_longer(df_wide, A:B)\n
    4\u00d73 DataFrame RowidvariablevalueInt64StringInt6411A122A331B242B4

    Here is another way of providing the same result using a different type of selection syntax.

    @pivot_longer(df_wide, -id)\n
    4\u00d73 DataFrame RowidvariablevalueInt64StringInt6411A122A331B242B4

    The selected columns can also be included as an array

    @pivot_longer(df_wide, [id, B])\n
    4\u00d73 DataFrame RowAvariablevalueInt64StringInt6411id123id231B243B4

    or excluded

    @pivot_longer(df_wide, -[id, B])\n
    2\u00d74 DataFrame RowidBvariablevalueInt64Int64StringInt64112A1224A3

    If all columns should be included, they can be specified by either everything(), :, or by leaving the argument blank

    @pivot_longer(df_wide, everything())\n
    6\u00d72 DataFrame RowvariablevalueStringInt641id12id23A14A35B26B4

    In this example, we set the names_to and values_to arguments. Either argument can be left out and will revert to the default value. The names_to and values_to arguments can be provided as strings or as bare unquoted variable names.

    Here is an example with names_to and values_to containing strings:

    @pivot_longer(df_wide, A:B, names_to = \"letter\", values_to = \"number\")\n
    4\u00d73 DataFrame RowidletternumberInt64StringInt6411A122A331B242B4

    And here is an example with names_to and values_to containing bare unquoted variables:

    @pivot_longer(df_wide, A:B, names_to = letter, values_to = number)\n
    4\u00d73 DataFrame RowidletternumberInt64StringInt6411A122A331B242B4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/rename/","title":"@rename","text":"

    Renaming columns follows the same syntax as in R's tidyverse, where the \"tidy expression\" is new_name = old_name. While the main function to rename columns is @rename(), you can also use @select() if you additionally plan to select only the renamed columns.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/rename/#rename-using-rename","title":"Rename using @rename()","text":"

    If you only want to rename the columns without selecting them, then this is where @rename() comes in handy. For the sake of brevity, we are selecting the first 5 columns and rows after performing the @rename().

    @chain movies begin\n    @rename(title = Title, Minutes = Length)\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowtitleYearMinutesBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4

    "},{"location":"examples/generated/UserGuide/rename/#rename-using-select","title":"Rename using @select()","text":"

    If you plan to only select those columns that you would like to rename, then you can use @select() to both rename and select the columns of interest.

    @chain movies begin\n  @select(title = Title, Minutes = Length)\n  @slice(1:5)\nend\n
    5\u00d72 DataFrame RowtitleMinutesStringInt321$1212$1000 a Touchdown713$21 a Day Once a Month74$40,000705$50,000 Climax Show, The71

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/select/","title":"@select","text":"

    The @select() macro in TidierData.jl supports many of the nuances of the R tidyverse implementation, including indexing columns individually by name or number, indexing by ranges of columns using the : operator between column names or numbers, and negative selection using negated column names or numbers. Selection helpers such as starts_with(), ends_with(), matches(), and contains() are also supported.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-individually-by-name","title":"Select the first 5 columns individually by name","text":"
    @chain movies begin\n    @select(Title, Year, Length, Budget, Rating)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-individually-by-number","title":"Select the first 5 columns individually by number","text":"
    @chain movies begin\n    @select(1, 2, 3, 4, 5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-by-name-using-a-range","title":"Select the first 5 columns by name (using a range)","text":"
    @chain movies begin\n    @select(Title:Rating)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-the-first-5-columns-by-number-using-a-range","title":"Select the first 5 columns by number (using a range)","text":"
    @chain movies begin\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowTitleYearLengthBudgetRatingStringInt32Int32Int32?Float641$1971121missing6.42$1000 a Touchdown193971missing6.03$21 a Day Once a Month19417missing8.24$40,000199670missing8.25$50,000 Climax Show, The197571missing3.4"},{"location":"examples/generated/UserGuide/select/#select-all-but-the-first-5-columns-by-name","title":"Select all but the first 5 columns by name","text":"

    Here we will limit the results to the first 5 remaining columns and the first 5 rows for the sake of brevity.

    @chain movies begin\n    @select(-(Title:Rating))\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowVotesR1R2R3R4Int32Float64Float64Float64Float6413484.54.54.54.52200.014.54.524.5350.00.00.00.04614.50.00.00.051724.54.50.014.5

    We can also use ! for inverted selection instead of -.

    @chain movies begin\n  @select(!(Title:Rating))\n  @select(1:5)\n  @slice(1:5)\nend\n
    5\u00d75 DataFrame RowVotesR1R2R3R4Int32Float64Float64Float64Float6413484.54.54.54.52200.014.54.524.5350.00.00.00.04614.50.00.00.051724.54.50.014.5

    "},{"location":"examples/generated/UserGuide/select/#select-all-but-the-first-5-columns-by-number","title":"Select all but the first 5 columns by number","text":"

    We will again limit the results to the first 5 remaining columns and the first 5 rows for the sake of brevity.

    @chain movies begin\n    @select(-(1:5))\n    @select(1:5)\n    @slice(1:5)\nend\n
    5\u00d75 DataFrame RowVotesR1R2R3R4Int32Float64Float64Float64Float6413484.54.54.54.52200.014.54.524.5350.00.00.00.04614.50.00.00.051724.54.50.014.5

    "},{"location":"examples/generated/UserGuide/select/#mix-and-match-selection","title":"Mix and match selection","text":"

    Just like in R's tidyverse, you can separate multiple selections with commas and mix and match different ways of selecting columns.

    @chain movies begin\n    @select(1, Budget:Rating)\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame RowTitleBudgetRatingStringInt32?Float641$missing6.42$1000 a Touchdownmissing6.03$21 a Day Once a Monthmissing8.24$40,000missing8.25$50,000 Climax Show, Themissing3.4

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/sep_unite/","title":"Separating","text":"

    Following the tidyverse syntax, the @separate() macro in TidierData.jl separates a single column into multiple columns. This is particularly useful for splitting a column containing delimited values into individual columns.

    using TidierData\n\ndf = DataFrame(a = [\"1-1\", \"2-2\", \"3-3-3\"]);\n

    "},{"location":"examples/generated/UserGuide/sep_unite/#separate","title":"@separate","text":"

    Separate the \"a\" column into \"b\", \"c\", and \"d\" columns based on the dash delimiter

    @chain df begin\n    @separate(a, (b, c, d), \"-\")\nend\n
    3\u00d73 DataFrame RowbcdSubStrin\u2026SubStrin\u2026SubStrin\u2026?111missing222missing3333

    The into columns can also be designated as follows:

    new_names = [\"x$(i)\" for i in 1:3]; # or new_names = [\"b\", \"c\", \"d\"], or new_names = [:b, :c, :d]\n\n@separate(df, a, !!new_names, \"-\")\n
    3\u00d73 DataFrame Rowx1x2x3SubStrin\u2026SubStrin\u2026SubStrin\u2026?111missing222missing3333

    "},{"location":"examples/generated/UserGuide/sep_unite/#unite","title":"@unite","text":"

    The @unite macro brings together multiple columns into one, separating the values by a user-specified delimiter. Here, the @unite macro combines the \"b\", \"c\", and \"d\" columns into a single new \"new_col\" column using the \"/\" delimiter.

    df = DataFrame(\n       b = [\"1\", \"2\", \"3\"],\n       c = [\"1\", \"2\", \"3\"],\n       d = [missing, missing, \"3\"]);\n\n@chain df begin\n    @unite(new_col, (b, c, d), \"/\")\nend\n
    3\u00d71 DataFrame Rownew_colString11/122/233/3/3

    "},{"location":"examples/generated/UserGuide/sep_unite/#separate_rows","title":"@separate_rows","text":"

    Separate rows into multiple rows based on a chosen delimiter.

    df = DataFrame(\n       a = 1:3,\n       b = [\"a\", \"aa;bb;cc\", \"dd;ee\"],\n       c = [\"1\", \"2;3;4\", \"5;6\"],\n       d = [\"7\", \"8;9;10\", \"11;12\"],\n       e = [\"11\", \"22;33;44\", \"55;66\"]);\n\n@separate_rows(df, b:e, \";\")\n
    6\u00d75 DataFrame RowabcdeInt64SubStrin\u2026SubStrin\u2026SubStrin\u2026SubStrin\u202611a171122aa282232bb393342cc4104453dd5115563ee61266

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/slice/","title":"@slice","text":"

    Slicing rows is similar to filtering rows, except that slicing is performed based on row numbers rather than filter criteria. In TidierData.jl, slicing works similarly to R's tidyverse in that both positive (which rows to keep) and negative (which rows to remove) slicing is supported. For @slice(), any valid UnitRange of integers is considered valid; this is not the case for @select() or across().

    Remember: Just like every other TidierData.jl top-level macro, @slice() respects group. This means that in a grouped data frame, @slice(1:2) will select the first 2 rows from each group.

    using TidierData\n\ndf = DataFrame(row_num = 1:10,\n               a = string.(repeat('a':'e', inner = 2)),\n               b = [1,1,1,2,2,2,3,3,3,4])\n
    10\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b144b255c266c277d388d399e31010e4

    "},{"location":"examples/generated/UserGuide/slice/#slicing-using-a-range-of-numbers","title":"Slicing using a range of numbers","text":"

    This is an easy way of retrieving 5 consecutive rows.

    @chain df begin\n    @slice(1:5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b144b255c2

    "},{"location":"examples/generated/UserGuide/slice/#slicing-using-a-more-complex-unitrange-of-numbers","title":"Slicing using a more complex UnitRange of numbers","text":"

    How would we obtain every other row from 1 to 7 (counting up by 2)? Note that range() is similar to seq() in R.

    @chain df begin\n  @slice(range(start = 1, step = 2, stop = 7))\nend\n
    4\u00d73 DataFrame Rowrow_numabInt64StringInt6411a123b135c247d3

    This same code can also be written using Julia's shorthand syntax for unit ranges.

    @chain df begin\n  @slice(1:2:7)\nend\n
    4\u00d73 DataFrame Rowrow_numabInt64StringInt6411a123b135c247d3

    "},{"location":"examples/generated/UserGuide/slice/#separate-multiple-row-selections-with-commas","title":"Separate multiple row selections with commas","text":"

    If you have multiple different row selections, you can separate them with commas.

    @chain df begin\n    @slice(1:5, 10)\nend\n
    6\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b144b255c2610e4

    "},{"location":"examples/generated/UserGuide/slice/#use-n-as-short-hand-to-indicate-the-number-of-rows","title":"Use n() as short-hand to indicate the number of rows","text":"

    Select the last 2 rows.

    @chain df begin\n  @slice(n()-1, n())\nend\n
    2\u00d73 DataFrame Rowrow_numabInt64StringInt6419e3210e4

    You can even use n() inside of UnitRanges, just like in R. Notice that the order of operations is slightly different in Julia as compared to R, so you don't have to wrap the n()-1 expression inside of parentheses.

    @chain df begin\n  @slice(n()-1:n())\nend\n
    2\u00d73 DataFrame Rowrow_numabInt64StringInt6419e3210e4

    "},{"location":"examples/generated/UserGuide/slice/#inverted-selection-using-negative-numbers","title":"Inverted selection using negative numbers","text":"

    This line selects all rows except the first 5 rows.

    @chain df begin\n    @slice(-(1:5))\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6416c227d338d349e3510e4

    "},{"location":"examples/generated/UserGuide/slice/#sample-5-random-rows-in-the-data-frame","title":"Sample 5 random rows in the data frame","text":"
    @chain df begin\n  @slice_sample(n = 5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6416c229e337d343b158d3"},{"location":"examples/generated/UserGuide/slice/#slice-the-min","title":"Slice the min","text":"

    This line selects all rows with the minimum value of the desired column

    @chain df begin\n  @slice_min(b)\nend\n
    3\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b1

    This line will only show the first row.

    @chain df begin\n  @slice_min(b, with_ties = false)\nend\n
    1\u00d73 DataFrame Rowrow_numabInt64StringInt6411a1

    "},{"location":"examples/generated/UserGuide/slice/#slice-the-max","title":"Slice the max","text":"

    The optional prop argument will slice a proportion of the full dataframe.

    @chain df begin\n  @slice_max(b, prop = 0.5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt64110e427d338d349e354b2

    "},{"location":"examples/generated/UserGuide/slice/#slice-the-tail","title":"Slice the tail","text":"
    @chain df begin\n  @slice_tail(prop = 0.5)\nend\n
    5\u00d73 DataFrame Rowrow_numabInt64StringInt6416c227d338d349e3510e4"},{"location":"examples/generated/UserGuide/slice/#slice-the-head","title":"Slice the head","text":"
    @chain df begin\n  @slice_head(n = 3)\nend\n
    3\u00d73 DataFrame Rowrow_numabInt64StringInt6411a122a133b1

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/summarize/","title":"@summarize","text":"

    Summarizing a dataset involves aggregating multiple rows down to (usually) a single row of data. This can be performed across the entire dataset, or if the dataset is grouped, then for each group in the dataset. This is implemented similarly to R's tidyverse using @summarize(). Out of admiration for Hadley Wickham, and to be consistent with the R tidyverse, both @summarize() and @summarise() are supported.

    Note that summarization is different from other verbs in TidierData.jl in 2 respects:

    1. No auto-vectorization is performed when using @summarize()
    2. One layer of grouping is removed after each @summarize() function.

    If you require further changes to grouping beyond the defaults, you can either @ungroup() or call @group_by() to regroup by a different set of variables.

    using TidierData\nusing RDatasets\n\nmovies = dataset(\"ggplot2\", \"movies\");\n

    "},{"location":"examples/generated/UserGuide/summarize/#using-summarize-with-n-to-count-the-number-of-movies-in-the-dataset","title":"Using @summarize() with n() to count the number of movies in the dataset.","text":"

    Within the context of @summarize() only, n() is converted to DataFrames.jl's nrow() function.

    @chain movies begin\n    @summarize(n = n())\nend\n
    1\u00d71 DataFrame RownInt64158788

    "},{"location":"examples/generated/UserGuide/summarize/#using-summarize-to-calculate-average-budget-of-movies-in-the-dataset","title":"Using @summarize() to calculate average budget of movies in the dataset.","text":"

    The median budget in this dataset is 3 million, and the mean budget is 13 million! Making movies must be way more lucrative than making Julia packages.

    @chain movies begin\n  @mutate(Budget = Budget / 1_000_000)\n  @summarize(median_budget = median(skipmissing(Budget)),\n             mean_budget = mean(skipmissing(Budget)))\nend\n
    1\u00d72 DataFrame Rowmedian_budgetmean_budgetFloat64Float6413.013.4125

    "},{"location":"examples/generated/UserGuide/summarize/#combining-group_by-with-summarise","title":"Combining @group_by() with @summarise()","text":"

    How many movies came out in each of the last 5 years?

    @chain movies begin\n  @group_by(Year)\n  @summarise(n = n())\n  @arrange(desc(Year))\n  @slice(1:5)\nend\n
    5\u00d72 DataFrame RowYearnInt32Int6412005349220041945320032158420022168520012121

    Notice that there was no need to explicitly @ungroup() the dataset after summarizing here. The @summarise() function removed one layer of grouping. Since this dataset was only grouped by one variable (Year), it was no longer grouped after the @summarise was performed.

    This page was generated using Literate.jl.

    "},{"location":"examples/generated/UserGuide/summary/","title":"@summary","text":"

    The @summary() macro in TidierData.jl provides a concise way to compute summary statistics on data. Similar to its R counterpart, it will provide the mean, median, Q1, Q3, minimum, maximum, and number of missing values in a numerical column or columns.

    "},{"location":"examples/generated/UserGuide/summary/#summary-for-the-whole-dataframe","title":"Summary for the whole dataframe","text":"
    using TidierData\n\ndf = DataFrame( A = [1, 2, 3, 4, 5], B = [missing, 7, 8, 9, 10], C = [11, missing, 13, 14, missing], D = [16, 17, 18, 19, 20]);\n\n@chain df begin\n    @summary()\nend\n\n@summary(df)\n
    4\u00d79 DataFrame RowColumnMinQ1MedianMeanQ3MaxCountMissing_CountStringInt64Float64Float64Float64Float64Int64Int64Int641A12.03.03.04.05502B77.758.58.59.2510413C1112.013.012.666713.514324D1617.018.018.019.02050"},{"location":"examples/generated/UserGuide/summary/#you-can-specify-columns-for-which-you-want-to-compute-the-summary-this-is-useful-if-the-dataframe-has-a-large-number-of-columns-and-youre-interested-in-only-a-subset-of-them","title":"You can specify columns for which you want to compute the summary. This is useful if the DataFrame has a large number of columns and you're interested in only a subset of them.","text":"
    @chain df begin\n    @summary(B)\nend\n\n@summary(df, B)\n
    1\u00d79 DataFrame RowColumnMinQ1MedianMeanQ3MaxCountMissing_CountStringInt64Float64Float64Float64Float64Int64Int64Int641B77.758.58.59.251041"},{"location":"examples/generated/UserGuide/summary/#or-for-a-range-of-columns","title":"or for a range of columns","text":"
    @chain df begin\n    @select(B:D)\n    @summary() # you can also write this @summary(2:4)\nend\n
    3\u00d79 DataFrame RowColumnMinQ1MedianMeanQ3MaxCountMissing_CountStringInt64Float64Float64Float64Float64Int64Int64Int641B77.758.58.59.2510412C1112.013.012.666713.514323D1617.018.018.019.02050

    This page was generated using Literate.jl.

    "}]} \ No newline at end of file