diff --git a/docs/src/index.md b/docs/src/index.md index 5833f63..37fb148 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -25,6 +25,7 @@ In addition, DataFramesMeta provides * `@byrow` for applying functions to each row of a data frame (only supported inside other macros). * `@passmissing` for propagating missing values inside row-wise DataFramesMeta.jl transformations. * `@astable` to create multiple columns within a single transformation. +* `@when` to non-destructively work with a subset of observations (Similar to Stata's `if`) * `@chain`, from [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) for piping the above macros together, similar to [magrittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html)'s `%>%` in R. * `@label!` and `@note!` for attaching metadata to columns. diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index 52bd069..553b5fa 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -24,7 +24,7 @@ export @with, @rtransform, @rselect, @rtransform!, @rselect!, @distinct, @rdistinct, @distinct!, @rdistinct!, @eachrow, @eachrow!, - @byrow, @passmissing, @astable, @kwarg, + @byrow, @passmissing, @astable, @kwarg, @when, @label!, @note!, printlabels, printnotes, @groupby, @based_on, @where # deprecated diff --git a/src/macros.jl b/src/macros.jl index 855714e..d3baae3 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -547,11 +547,33 @@ getsinglecolumn(df, s) = throw(ArgumentError("Only indexing with Symbols, string "is currently allowed with $DOLLAR")) function with_helper(d, body) + # Get rid of the leading @byrow, @passmissing etc. + # but otherwise leave body untouched + body, outer_flags = extract_macro_flags(body) + if outer_flags[ASTABLE_SYM][] + throw(ArgumentError("@astable macro-flag cannot be used inside of @with")) + end + # If we have a begin...end somewhere, we might + # have a @when. + # Remove the @when statements, recording that they + # exist. 
To do this we also have to de-construct + # body into a vector of expressions. + es, when = get_when_statements(MacroTools.rmlines(MacroTools.block(body)).args) + newbody = Expr(:block, es...) # Make body an expression to force the # complicated method of fun_to_vec # in the case of QuoteNode - t = fun_to_vec(Expr(:block, body); no_dest=true) - :($exec($d, $t)) + t = fun_to_vec(newbody; no_dest=true, outer_flags = outer_flags) + if !isnothing(when) + w = fun_to_vec(when; no_dest = true, gensym_names=false, outer_flags = outer_flags) + z = gensym() + quote + $z = $subset($d, $w; view = true, skipmissing = true) + $exec($z, $t) + end + else + :($exec($d, $t)) + end end """ @@ -663,6 +685,83 @@ macro with(d, body) esc(with_helper(d, body)) end +""" + @when(args...) + +Perform operations on a subset of `df`, but still +return a data frame with the same number of rows as `df`. `@when` can be used +with the `@transform` macros, `@select` macros, and `@with`. + +`@when` is not a "real" macro. It is only functional inside DataFramesMeta.jl macros. +A motivating example: + +``` +@transform df begin + @when :a .== 1 + :y = :y .- mean(:y) +end +``` + +The above block generates the column `:y` which is de-meaned with respect to observations where +`:a == 1`. If `:y` already exists in `df`, then new values over-write old values only +when `:a == 1`. If `:y` does not already exist in `df`, then new values are written +when `:a == 1`, and remaining values are filled with `missing`. + +Only one `@when` statement is allowed per transformation macro and it must be the +first argument in the transformation. + +`@when` inherits `@byrow` and `@passmissing` from the transformation. As an example: + +``` +@transform df @byrow begin + @when :a == 1 + ... +end +``` + +In the above, the condition inside `@when` operates row-wise. However, `@byrow` and `@passmissing` can +also be passed independently, such as `@byrow @when :a == 1`.
+ +Like `@subset`, `@when` drops rows where `missing` values are returned. Unlike `@subset`, +there is currently no way to control this behavior. + +## Details + +`@when` operates by calling `subset` with the `view = true` keyword argument, +followed by a `transform!` (or `select!`) call. See `?transform!` for more details. Roughly, +the expression + +``` +@transform df begin + @when :a .== 1 + :y = 5 +end +``` + +translates to + +``` +df1 = @subset(copy(df), :a .== 1; view = true) +df2 = @transform! df1 :y = 5 +parent(df2) +``` + +Unlike the other macro-flags, such as `@passmissing` and `@byrow`, `@when` cannot be +used at the top-level. +``` +@transform df @byrow @when(:a == 1) begin + :x = 1 + :y = 2 +end +``` +is not supported. + +""" +macro when(args...) + throw(ArgumentError("@when only works inside DataFramesMeta macros.")) +end + + ASTABLE_RHS_ORDERBY_DOCS = """ In operations, it is also allowed to use `AsTable(cols)` to work with multiple columns at once, where the columns are grouped together in a @@ -1456,15 +1555,53 @@ end ## transform & @transform ## ############################################################################## +copy_gd(x::GroupedDataFrame) = transform(x; ungroup = false) +copy_gd(x::AbstractDataFrame) = copy(x) +function generic_transform_select_helper(x, args...; wrap_byrow::Bool = false, modify::Bool = false, selectfun::Bool = false) + if selectfun + secondstagefun = select! + if modify + transformfun = select! + else + transformfun = select + end + else + secondstagefun = transform! + if modify + transformfun = transform!
+ else + transformfun = transform + end + end + x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = wrap_byrow) + exprs, when = get_when_statements(exprs) + allow_multicol = selectfun && !wrap_byrow && !modify # before this refactor only @select passed allow_multicol = true; keep that behavior + if !isnothing(when) + w = fun_to_vec(when; no_dest = true, gensym_names=false, outer_flags=outer_flags) + t = (fun_to_vec(ex; gensym_names=false, outer_flags=outer_flags, allow_multicol=allow_multicol) for ex in exprs) + z = gensym() + if modify + quote + $z = $subset($x, $w; view = true, skipmissing = true) + $parent($secondstagefun($z, $(t...); $(kw...))) + end + else + quote + $z = $subset($copy_gd($x), $w; view = true, skipmissing = true) + $parent($secondstagefun($z, $(t...); $(kw...))) + end + end + else + t = (fun_to_vec(ex; gensym_names=false, outer_flags=outer_flags, allow_multicol=allow_multicol) for ex in exprs) + quote + $transformfun($x, $(t...); $(kw...)) + end + end +end function transform_helper(x, args...) - x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = false) - - t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) - quote - $transform($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = false, modify = false) end """ @@ -1593,12 +1730,7 @@ macro transform(x, args...) end function rtransform_helper(x, args...) - x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = true) - - t = (fun_to_vec(ex; gensym_names=false, outer_flags=outer_flags) for ex in exprs) - quote - $transform($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = true, modify = false) end """ @@ -1646,12 +1778,7 @@ end function transform!_helper(x, args...)
- x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = false) - - t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) - quote - $transform!($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = false, modify = true) end """ @@ -1760,12 +1887,7 @@ macro transform!(x, args...) end function rtransform!_helper(x, args...) - x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = true) - - t = (fun_to_vec(ex; gensym_names=false, outer_flags=outer_flags) for ex in exprs) - quote - $transform!($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = true, modify = true) end """ @@ -1784,12 +1906,7 @@ end ############################################################################## function select_helper(x, args...) - x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = false) - - t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags, allow_multicol = true) for ex in exprs) - quote - $select($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = false, modify = false, selectfun = true) end """ @@ -1929,12 +2046,7 @@ macro select(x, args...) end function rselect_helper(x, args...) - x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = true) - - t = (fun_to_vec(ex; gensym_names=false, outer_flags=outer_flags) for ex in exprs) - quote - $select($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = true, modify = false, selectfun = true) end """ @@ -1982,12 +2094,7 @@ end ############################################################################## function select!_helper(x, args...) 
- x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = false) - - t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) - quote - $select!($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = false, modify = true, selectfun = true) end """ @@ -2118,12 +2225,7 @@ macro select!(x, args...) end function rselect!_helper(x, args...) - x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = true) - - t = (fun_to_vec(ex; gensym_names=false, outer_flags=outer_flags) for ex in exprs) - quote - $select!($x, $(t...); $(kw...)) - end + generic_transform_select_helper(x, args...; wrap_byrow = true, modify = true, selectfun = true) end """ diff --git a/src/parsing.jl b/src/parsing.jl index a431ec6..f82bcc4 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -179,6 +179,7 @@ is_macro_head(ex::Expr, name) = ex.head == :macrocall && ex.args[1] == Symbol(na const BYROW_SYM = Symbol("@byrow") const PASSMISSING_SYM = Symbol("@passmissing") const ASTABLE_SYM = Symbol("@astable") +const WHEN_SYM = Symbol("@when") const DEFAULT_FLAGS = (;BYROW_SYM => Ref(false), PASSMISSING_SYM => Ref(false), ASTABLE_SYM => Ref(false)) extract_macro_flags(ex, exprflags = deepcopy(DEFAULT_FLAGS)) = (ex, exprflags) @@ -191,6 +192,9 @@ function extract_macro_flags(ex::Expr, exprflags = deepcopy(DEFAULT_FLAGS)) throw(ArgumentError("Redundant flag $macroname used.")) end exprflag[] = true + if length(ex.args) > 3 + throw(ArgumentError("Too many arguments passed to $macroname")) + end return extract_macro_flags(MacroTools.unblock(ex.args[3]), exprflags) else return (ex, exprflags) @@ -199,6 +203,56 @@ function extract_macro_flags(ex::Expr, exprflags = deepcopy(DEFAULT_FLAGS)) return (ex, exprflags) end +""" + omit_nested_when(ex::Expr, when = Ref(false)) + +For a statement of the form `@passmissing @when x` return `@passmissing x` and +a flag signifying a `@when` statement was present. 
+""" +function omit_nested_when(ex::Expr, when = Ref(false)) + if ex.head == :macrocall && ex.args[1] in keys(DEFAULT_FLAGS) || is_macro_head(ex, "@when") + macroname = ex.args[1] + if length(ex.args) > 3 + throw(ArgumentError("Too many arguments passed to $macroname")) + end + if macroname == Symbol("@when") + when[] = true + return omit_nested_when(MacroTools.unblock(ex.args[3]), when) + else + new_expr, when = omit_nested_when(MacroTools.unblock(ex.args[3]), when) + ex.args[3] = new_expr + end + end + return ex, when +end +omit_nested_when(ex, when = Ref(false)) = ex, when + +function get_when_statements(exprs) + new_exprs = [] + when_statement = nothing + seen_non_when = false + seen_when = false + for expr in exprs + e, when = omit_nested_when(expr) + if when[] + if seen_when + throw(ArgumentError("Only one @when statement allowed at a time")) + end + if seen_non_when + throw(ArgumentError("All @when statements must come first")) + end + seen_when = true + when_statement = e + else + seen_non_when = true + push!(new_exprs, expr) + end + end + + new_exprs, when_statement +end + + """ check_macro_flags_consistency(exprflags) diff --git a/test/runtests.jl b/test/runtests.jl index 4c12faf..4beaab8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,6 +16,7 @@ my_tests = ["dataframes.jl", "astable.jl", "astable_flag.jl", "passmissing.jl", + "when.jl", "multicol.jl"] println("Running tests:") diff --git a/test/when.jl b/test/when.jl new file mode 100644 index 0000000..dac0ff4 --- /dev/null +++ b/test/when.jl @@ -0,0 +1,486 @@ +module TestWhen + +using Test +using DataFrames +using DataFramesMeta +using Statistics + +const ≅ = isequal + +@testset "@transform when" begin + df = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(a = [1, 2], z = [60, 500], c = [missing, 5]) + df2 = @transform df begin + @when :a .> 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @transform(df, @when(:a .> 1), :c = 5, :z = 500) + @test df2 ≅ res + + df2 = @transform df 
@byrow begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @transform df @byrow @passmissing begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @transform df begin + @byrow @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @transform df begin + @when @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + + df2 = @transform df begin + @when @byrow :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + + dfa = copy(df) + dfa.a = [missing, 2] + df2 = @transform dfa begin + @when @passmissing @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ DataFrame(a = [missing, 2], z = [60, 500], c = [missing, 5]) +end + +@testset "@rtransform when" begin + df = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(a = [1, 2], z = [60, 500], c = [missing, 5]) + df2 = @rtransform df begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @rtransform(df, @when(:a > 1), :c = 5, :z = 500) + @test df2 ≅ res + + df2 = @rtransform df @passmissing begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @rtransform df begin + @when :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + + dfa = copy(df) + dfa.a = [missing, 2] + df2 = @rtransform dfa begin + @when @passmissing :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ DataFrame(a = [missing, 2], z = [60, 500], c = [missing, 5]) +end + +@testset "@transform! when" begin + df_orig = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(a = [1, 2], z = [60, 500], c = [missing, 5]) + df = copy(df_orig) + df2 = @transform! df begin + @when :a .> 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @transform!(df, @when(:a .> 1), :c = 5, :z = 500) + @test df2 ≅ res + + df = copy(df_orig) + df2 = @transform! df @byrow begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @transform!
df @byrow @passmissing begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @transform! df begin + @byrow @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @transform! df begin + @when @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @transform! df begin + @when @byrow :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + dfa = copy(df_orig) + dfa.a = [missing, 2] + df = copy(dfa) + df2 = @transform! df begin + @when @passmissing @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ DataFrame(a = [missing, 2], z = [60, 500], c = [missing, 5]) + @test df2 === df +end + + +@testset "@rtransform! when" begin + df_orig = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(a = [1, 2], z = [60, 500], c = [missing, 5]) + df = copy(df_orig) + df2 = @rtransform! df begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @rtransform!(df, @when(:a > 1), :c = 5, :z = 500) + @test df2 ≅ res + + df = copy(df_orig) + df2 = @rtransform! df begin + @when :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + dfa = copy(df_orig) + dfa.a = [missing, 2] + df = copy(dfa) + df2 = @rtransform! 
df begin + @when @passmissing :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ DataFrame(a = [missing, 2], z = [60, 500], c = [missing, 5]) + @test df2 === df +end + +@testset "@select when" begin + df = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(c = [missing, 5], z = [60, 500]) + df2 = @select df begin + @when :a .> 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @select(df, @when(:a .> 1), :c = 5, :z = 500) + @test df2 ≅ res + + df2 = @select df @byrow begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @select df @byrow @passmissing begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @select df begin + @byrow @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @select df begin + @when @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + + df2 = @select df begin + @when @byrow :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + + dfa = copy(df) + dfa.a = [missing, 2] + df2 = @select dfa begin + @when @passmissing @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res +end + +@testset "@rselect when" begin + df = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(c = [missing, 5], z = [60, 500]) + df2 = @rselect df begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @rselect(df, @when(:a > 1), :c = 5, :z = 500) + @test df2 ≅ res + + df2 = @rselect df @passmissing begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + + df2 = @rselect df begin + @when :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + + dfa = copy(df) + dfa.a = [missing, 2] + df2 = @rselect dfa begin + @when @passmissing :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res +end + +@testset "@select! when" begin + df_orig = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(c = [missing, 5], z = [60, 500]) + df = copy(df_orig) + df2 = @select!
df begin + @when :a .> 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @select!(df, @when(:a .> 1), :c = 5, :z = 500) + @test df2 ≅ res + + df = copy(df_orig) + df2 = @select! df @byrow begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @select! df @byrow @passmissing begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @select! df begin + @byrow @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @select! df begin + @when @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @select! df begin + @when @byrow :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + dfa = copy(df_orig) + dfa.a = [missing, 2] + df = copy(dfa) + df2 = @select! df begin + @when @passmissing @byrow :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df +end + + +@testset "@rselect! when" begin + df_orig = DataFrame(a = [1, 2], z = [60, 70]) + res = DataFrame(c = [missing, 5], z = [60, 500]) + df = copy(df_orig) + df2 = @rselect! df begin + @when :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + df = copy(df_orig) + df2 = @rselect!(df, @when(:a > 1), :c = 5, :z = 500) + @test df2 ≅ res + + df = copy(df_orig) + df2 = @rselect! df begin + @when :a > 1 ? true : missing + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df + + dfa = copy(df_orig) + dfa.a = [missing, 2] + df = copy(dfa) + df2 = @rselect! 
df begin + @when @passmissing :a > 1 + :c = 5 + :z = 500 + end + @test df2 ≅ res + @test df2 === df +end + +df = DataFrame(a = [1, missing, 3, 4], z = [50, 60, 70, 80]) +@testset "@when errors" begin + @test_throws LoadError @eval @transform df begin + @when :a .> 1 + @when :a .> 2 + :c = 5 + end + + @test_throws LoadError @eval @transform df @when(:a .== 1) begin + :c = 1 + :b = 2 + end + + @test_throws LoadError @eval @transform df @byrow @when(:a == 1) begin + :c = 1 + :b = 2 + end + + @test_throws LoadError @eval @transform df @when(:a == 1) @byrow begin + :c = 1 + :b = 2 + end + + @test_throws LoadError @eval @transform df @astable begin + @when :x == 1 + :z = 1 + end + + + @test_throws LoadError @eval @with df @when(:a == 1) begin + first(:z) + end +end + +@testset "@with when" begin + df = DataFrame(a = [missing, 2], z = [60, 70]) + + t = @with df begin + @when :a .> 1 + :z + end + @test t === view(df.z, 2:2) + + t = @with df @byrow begin + @when :a > 1 + first(:z) + end + @test t == [70] + + t = @with df begin + @when @byrow :a > 1 + first(:z) + end + @test t == 70 + + t = @with df begin + @when @byrow @passmissing :a > 1 && true + first(:z) + end + @test t == 70 +end + +end # module \ No newline at end of file