diff --git a/NEWS.md b/NEWS.md index f35813b4..4c1d78e5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # TidierData.jl updates +## v0.10.0 - 2023-08-15 +- Refactor macros to make them much faster and more memory-efficient. +- `@group_by` no longer automatically sorts by group, which makes it much faster. The regrouping done by `@summarize`, `@arrange`, `@distinct`, and `@drop_na` likewise no longer sorts. This is a slight change in behavior from `dplyr`, but the speed trade-off is worth it. + ## v0.9.2 - 2023-08-06 - Remove `TidierData_not_vectorized[]` from exports - Add `TidierCats.jl` functions to `not_vectorized[]` list diff --git a/Project.toml b/Project.toml index ebb8cc48..39e4c311 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.9.2" +version = "0.10.0" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/src/TidierData.jl b/src/TidierData.jl index 8a22e6f2..cdd3643a 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -69,45 +69,45 @@ macro select(df, exprs...) 
tidy_exprs = parse_tidy.(tidy_exprs) df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = deepcopy($(esc(df))) + else + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number; ungroup = false) - else - _ - end - end - select($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end + + local df_output = select(df_copy, $(tidy_exprs...); ungroup = false) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) end else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end - end - select($(tidy_exprs...)) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + + local df_output = select(df_copy, $(tidy_exprs...)) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end end + + df_output end if code[] @info MacroTools.prettify(df_expr) @@ -127,45 +127,45 @@ macro transmute(df, exprs...) 
tidy_exprs = parse_tidy.(tidy_exprs) df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = deepcopy($(esc(df))) + else + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number; ungroup = false) - else - _ - end - end - select($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end + + local df_output = select(df_copy, $(tidy_exprs...); ungroup = false) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) end else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end - end - select($(tidy_exprs...)) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + + local df_output = select(df_copy, $(tidy_exprs...)) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end end + + df_output end if code[] @info MacroTools.prettify(df_expr) @@ -185,45 +185,45 @@ macro rename(df, exprs...) 
tidy_exprs = parse_tidy.(tidy_exprs) df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = deepcopy($(esc(df))) + else + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number; ungroup = false) - else - _ - end - end - rename($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end + + local df_output = rename(df_copy, $(tidy_exprs...); ungroup = false) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) end else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end - end - rename($(tidy_exprs...)) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + + local df_output = rename(df_copy, $(tidy_exprs...)) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end end + + df_output end if code[] @info MacroTools.prettify(df_expr) @@ -243,45 +243,45 @@ macro mutate(df, exprs...) 
tidy_exprs = parse_tidy.(tidy_exprs) df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = deepcopy($(esc(df))) + else + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number; ungroup = false) - else - _ - end - end - transform($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end + + local df_output = transform(df_copy, $(tidy_exprs...); ungroup = false) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) end else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end - end - transform($(tidy_exprs...)) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + + local df_output = transform(df_copy, $(tidy_exprs...)) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end end + + df_output end if code[] @info MacroTools.prettify(df_expr) @@ -301,58 +301,51 @@ macro summarize(df, exprs...) 
tidy_exprs = parse_tidy.(tidy_exprs; autovec=false) df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = deepcopy($(esc(df))) + else + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + if $(esc(df)) isa GroupedDataFrame local col_names = groupcols($(esc(df))) - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number; ungroup = false) - else - _ - end + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end + + if length(col_names) == 1 + local df_output = combine(df_copy, $(tidy_exprs...); ungroup = true) + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end - @chain _ begin - if length(col_names) == 1 - @chain _ begin - combine(_, $(tidy_exprs...); ungroup = true) - select(_, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) - end - else - @chain _ begin - combine(_, $(tidy_exprs...); ungroup = true) - select(_, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) - groupby(_, col_names[1:end-1]; sort = true) - end - end + else + local df_output = combine(df_copy, $(tidy_exprs...); ungroup = true) + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end + df_output = groupby(df_output, col_names[1:end-1]; sort = false) end else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number; ungroup = false) - else - _ - end - end - 
combine($(tidy_exprs...)) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) # ungrouped branch: no `ungroup` keyword, matching the ungrouped branches of the other macros + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) # ungrouped branch: no `ungroup` keyword + end + local df_output = combine(df_copy, $(tidy_exprs...)) + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end end + + df_output end if code[] @info MacroTools.prettify(df_expr) @@ -379,45 +372,45 @@ macro filter(df, exprs...) tidy_exprs = parse_tidy.(tidy_exprs; subset=true) df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = deepcopy($(esc(df))) + else + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number; ungroup = false) - else - _ - end - end - subset($(tidy_exprs...); skipmissing = true, ungroup = false) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end + + local df_output = subset(df_copy, $(tidy_exprs...); skipmissing = true, ungroup = false) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) end else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end - end - 
subset($(tidy_exprs...); skipmissing = true) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + + local df_output = subset(df_copy, $(tidy_exprs...); skipmissing = true) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end end + + df_output end if code[] @info MacroTools.prettify(df_expr) @@ -439,25 +432,38 @@ macro group_by(df, exprs...) grouping_exprs = parse_group_by.(exprs) df_expr = quote - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end + local any_expressions = any(typeof.($tidy_exprs) .!= QuoteNode) # true when at least one grouping argument is not a QuoteNode and so must be passed to transform! below + + if $any_found_n || $any_found_row_number || any_expressions + if $(esc(df)) isa GroupedDataFrame + local df_copy = deepcopy($(esc(df))) # NOTE(review): df_copy is still a GroupedDataFrame when it reaches groupby() below - confirm regrouping grouped input works + else + local df_copy = copy($(esc(df))) end - transform($(tidy_exprs...)) - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) - groupby(Cols($(grouping_exprs...)); sort = true) + else + local df_copy = $(esc(df)) # not a copy + end + + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + + if any_expressions + transform!(df_copy, $(tidy_exprs...)) + end + + if $any_found_n || $any_found_row_number + select!(df_copy, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end + + # `sort = true` was removed for speed reasons: + # sorting can cause a large number of allocations when the grouping variable is a string + local df_output = groupby(df_copy, Cols($(grouping_exprs...)); sort = false) + + df_output end if code[] @info MacroTools.prettify(df_expr) @@ -531,7 +537,7 @@ macro arrange(df, 
exprs...) @chain $(esc(df)) begin DataFrame # remove grouping sort([$(arrange_exprs...)]) # Must use [] instead of Cols() here - groupby(col_names; sort = true) # regroup + groupby(col_names; sort = false) # regroup end else sort($(esc(df)), [$(arrange_exprs...)]) # Must use [] instead of Cols() here @@ -557,57 +563,47 @@ macro distinct(df, exprs...) df_expr = quote if $(esc(df)) isa GroupedDataFrame local col_names = groupcols($(esc(df))) - @chain $(esc(df)) begin - DataFrame # remove grouping because `unique()` does not work on GroupDataFrames - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end - end - @chain _ begin - if length([$tidy_exprs...]) == 0 - unique(_) - else - unique(_, Cols($(tidy_exprs...))) - end - end - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) - groupby(col_names; sort = true) # regroup + + # `@distinct()` uses a different pattern from the other macros + # because if the original DataFrame is grouped, it must be ungrouped + # and then regrouped, so there's no need to make a deepcopy. + # This is because `unique()` does not work on GroupedDataFrames. 
+ local df_copy = DataFrame($(esc(df))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + if length([$tidy_exprs...]) == 0 + unique!(df_copy) + else + unique!(df_copy, Cols($(tidy_exprs...))) + end + + if $any_found_n || $any_found_row_number + select!(df_copy, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end + groupby(df_copy, col_names; sort = false) # regroup and value to return else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :TidierData_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :TidierData_row_number) - else - _ - end - end - @chain _ begin - if length([$tidy_exprs...]) == 0 - unique(_) - else - unique(_, Cols($(tidy_exprs...))) - end - end - select(Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + local df_copy = copy($(esc(df))) + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + if length([$tidy_exprs...]) == 0 + unique!(df_copy) + else + unique!(df_copy, Cols($(tidy_exprs...))) end + + if $any_found_n || $any_found_row_number + select!(df_copy, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + end + + df_copy # value to return end end if code[] @@ -644,26 +640,22 @@ macro drop_na(df, exprs...) df_expr = quote if $(esc(df)) isa GroupedDataFrame local col_names = groupcols($(esc(df))) - @chain $(esc(df)) begin - DataFrame # remove grouping because `dropmissing()` does not work on GroupDataFrames - @chain _ begin - if $num_exprs == 0 - dropmissing(_) - else - dropmissing(_, Cols($(tidy_exprs...))) - end - end - groupby(col_names; sort = true) # regroup + + # A copy is only needed for grouped dataframes because the copy + # has to be regrouped because `dropmissing()` does not support + # grouped data frames. 
+ local df_copy = DataFrame($(esc(df))) + if $num_exprs == 0 + dropmissing!(df_copy) + else + dropmissing!(df_copy, Cols($(tidy_exprs...))) end + groupby(df_copy, col_names; sort = false) # regroup else - @chain $(esc(df)) begin - @chain _ begin - if $num_exprs == 0 - dropmissing(_) - else - dropmissing(_, Cols($(tidy_exprs...))) - end - end + if $num_exprs == 0 + dropmissing($(esc(df))) + else + dropmissing($(esc(df)), Cols($(tidy_exprs...))) end end end