From 5d10db983fb23eecd82a8899602cdddbe15966cf Mon Sep 17 00:00:00 2001 From: zero323 Date: Thu, 27 Apr 2017 04:03:51 +0200 Subject: [PATCH 01/13] Add null-safe equality operator (%<=>%) --- R/pkg/NAMESPACE | 3 +- R/pkg/R/column.R | 44 +++++++++++++++++++++++ R/pkg/R/generics.R | 4 +++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 11 ++++++ 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index db8e06db18edc..f6438204d36eb 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -182,7 +182,8 @@ exportMethods("arrange", exportClasses("Column") -exportMethods("%in%", +exportMethods("%<=>%", + "%in%", "abs", "acos", "add_months", diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 539d91b0f8797..1093a96397c19 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -302,3 +302,47 @@ setMethod("otherwise", jc <- callJMethod(x@jc, "otherwise", value) column(jc) }) + +#' \%<=>\% +#' +#' Equality test that is safe for null values. +#' +#' Can be used, unlike standard equality operator, to perform null-safe joins. +#' Equivalent to Scala +#' \href{https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Column@%3C=%3E(other:Any):org.apache.spark.sql.Column}{\code{<=>}} and +#' \href{https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Column@eqNullSafe(other:Any):org.apache.spark.sql.Column}{\code{eqNullSafe}}. 
+#' +#' @param x a Column +#' @param value a value to compare +#' @rdname eq_null_safe +#' @name %<=>% +#' @aliases %<=>%,Column-method +#' @export +#' @examples +#' \dontrun{ +#' df1 <- createDataFrame(data.frame( +#' x = c(1, NA, 3, NA), y = c(2, 6, 3, NA) +#' )) +#' +#' head(select(df1, df1$x == df1$y, df1$x %<=>% df1$y)) +#' ## (x = y) (x <=> y) +#' ##1 FALSE FALSE +#' ##2 NA FALSE +#' ##3 TRUE TRUE +#' ##4 NA TRUE +#' +#' df2 <- createDataFrame(data.frame(y = c(3, NA))) +#' count(join(df1, df2, df1$y == df2$y)) +#' ## [1] 1 +#' +#' count(join(df1, df2, df1$y %<=>% df2$y)) +#' ## [1] 2 +#' } +#' @note \%<=>\% since 2.3.0 +setMethod("%<=>%", + signature(x = "Column", value = "ANY"), + function(x, value) { + value <- if (inherits(value, "Column")) { value@jc } else { value } + jc <- callJMethod(x@jc, "eqNullSafe", value) + column(jc) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e510ff9a2d80f..3933b02959e5f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -856,6 +856,10 @@ setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") }) #' @export setGeneric("over", function(x, window) { standardGeneric("over") }) +#' @rdname eq_null_safe +#' @export +setGeneric("%<=>%", function(x, value) { standardGeneric("%<=>%") }) + ###################### WindowSpec Methods ########################## #' @rdname partitionBy diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 1828cddffd27c..f6927c225a633 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1323,6 +1323,7 @@ test_that("column operators", { c3 <- (c + c2 - c2) * c2 %% c2 c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3) c5 <- c2 ^ c3 ^ c4 + c6 <- c2 %<=>% c3 }) test_that("column functions", { @@ -1975,6 +1976,16 @@ test_that("filter() on a DataFrame", { # Test stats::filter is working #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint + + # test suites for 
%<=>% + dfNa <- read.json(jsonPathNa) + expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1) + expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 -1) + expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3) + expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3) + # match NA from two columns + expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2) + expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2) }) test_that("join(), crossJoin() and merge() on a DataFrame", { From 13138e3cd5bc4f41513c13d29fbedda45ec389a4 Mon Sep 17 00:00:00 2001 From: zero323 Date: Thu, 27 Apr 2017 14:39:33 +0200 Subject: [PATCH 02/13] Fix %<=>% details --- R/pkg/R/column.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 1093a96397c19..e6e1d281ca02c 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -308,9 +308,7 @@ setMethod("otherwise", #' Equality test that is safe for null values. #' #' Can be used, unlike standard equality operator, to perform null-safe joins. -#' Equivalent to Scala -#' \href{https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Column@%3C=%3E(other:Any):org.apache.spark.sql.Column}{\code{<=>}} and -#' \href{https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Column@eqNullSafe(other:Any):org.apache.spark.sql.Column}{\code{eqNullSafe}}. +#' Equivalent to Scala \code{Column.<=>} and \code{Column.eqNullSafe}. #' #' @param x a Column #' @param value a value to compare From 50253447810bceeabd3608018dbe83429a4f80f4 Mon Sep 17 00:00:00 2001 From: zero323 Date: Thu, 27 Apr 2017 04:19:49 +0200 Subject: [PATCH 03/13] Implement not function and ! 
operator --- R/pkg/NAMESPACE | 1 + R/pkg/R/column.R | 23 +++++++++++-- R/pkg/R/functions.R | 39 +++++++++++++++++++++++ R/pkg/R/generics.R | 4 +++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 9 ++++++ 5 files changed, 74 insertions(+), 2 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index f6438204d36eb..e8de34d9371a0 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -292,6 +292,7 @@ exportMethods("%<=>%", "nanvl", "negate", "next_day", + "not", "ntile", "otherwise", "over", diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index e6e1d281ca02c..f3cbb309f040d 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -67,8 +67,7 @@ operators <- list( "+" = "plus", "-" = "minus", "*" = "multiply", "/" = "divide", "%%" = "mod", "==" = "equalTo", ">" = "gt", "<" = "lt", "!=" = "notEqual", "<=" = "leq", ">=" = "geq", # we can not override `&&` and `||`, so use `&` and `|` instead - "&" = "and", "|" = "or", #, "!" = "unary_$bang" - "^" = "pow" + "&" = "and", "|" = "or", "^" = "pow" ) column_functions1 <- c("asc", "desc", "isNaN", "isNull", "isNotNull") column_functions2 <- c("like", "rlike", "getField", "getItem", "contains") @@ -344,3 +343,23 @@ setMethod("%<=>%", jc <- callJMethod(x@jc, "eqNullSafe", value) column(jc) }) + +#' ! +#' +#' @rdname not +#' @aliases !,Column-method +#' @export +#' @examples +#' \dontrun{ +#' df <- createDataFrame(data.frame(x = c(-1, 0, 1))) +#' +#' head(select(df, !column("x") > 0)) +#' ## (NOT (x > 0.0)) +#' ##1 TRUE +#' ##2 TRUE +#' ##3 FALSE +#' } +#' @note ! since 2.3.0 +setMethod("!", + signature(x = "Column"), + function(x) not(x)) \ No newline at end of file diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f4a34fbabe4d7..27ad7a72fd976 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3859,3 +3859,42 @@ setMethod("posexplode_outer", jc <- callJStatic("org.apache.spark.sql.functions", "posexplode_outer", x@jc) column(jc) }) + +#' not +#' +#' Inversion of boolean expression. 
+#' +#' \code{not} and \code{!} cannot be applied directly to a numerical column. +#' To achieve R-like truthiness, the column has to be cast to \code{BooleanType}. +#' +#' @param x Column to compute on +#' @rdname not +#' @name not +#' @aliases not,Column-method +#' @export +#' @examples \dontrun{ +#' df <- createDataFrame(data.frame( +#' is_true = c(TRUE, FALSE, NA), +#' flag = c(1, 0, 1) +#' )) +#' +#' head(select(df, not(df$is_true))) +#' ## (NOT is_true) +#' ##1 FALSE +#' ##2 TRUE +#' ##3 NA +#' +#' # Explicit cast is required when working with numeric column +#' head(select(df, not(cast(df$flag, "boolean")))) +#' ## (NOT CAST(flag AS BOOLEAN)) +#' ## 1 FALSE +#' ## 2 TRUE +#' ## 3 FALSE +#' } +#' @note not since 2.3.0 +setMethod("not", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "not", x@jc) + column(jc) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 3933b02959e5f..d4e4958dc078c 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1158,6 +1158,10 @@ setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") }) #' @export setGeneric("negate", function(x) { standardGeneric("negate") }) +#' @rdname not +#' @export +setGeneric("not", function(x) { standardGeneric("not") }) + #' @rdname next_day #' @export setGeneric("next_day", function(y, x) { standardGeneric("next_day") }) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index f6927c225a633..d1660bf6c1e1e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1324,6 +1324,7 @@ test_that("column operators", { c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3) c5 <- c2 ^ c3 ^ c4 c6 <- c2 %<=>% c3 + c7 <- !c6 }) test_that("column functions", { @@ -1349,6 +1350,7 @@ test_that("column functions", { c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3) c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy") c21 <- 
posexplode_outer(c) + explode_outer(c) + c22 <- not(c) # Test if base::is.nan() is exposed expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE)) @@ -1489,6 +1491,13 @@ test_that("column functions", { lapply( list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)), as.environment)) + + df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA))) + expect_equal( + collect(select(df, alias(SparkR::not(df$is_true), "is_false"))), + data.frame(is_false = c(FALSE, TRUE, NA)) + ) + }) test_that("column binary mathfunctions", { From fb476202f163e004679aae449155e013675eb1ae Mon Sep 17 00:00:00 2001 From: zero323 Date: Thu, 27 Apr 2017 17:05:38 +0200 Subject: [PATCH 04/13] Fix style --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index d1660bf6c1e1e..324667544121f 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1989,7 +1989,7 @@ test_that("filter() on a DataFrame", { # test suites for %<=>% dfNa <- read.json(jsonPathNa) expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1) - expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 -1) + expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1) expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3) expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3) # match NA from two columns From 38075bfcab37314fd2d6da94784ead5451dfc8b3 Mon Sep 17 00:00:00 2001 From: zero323 Date: Thu, 27 Apr 2017 20:20:54 +0200 Subject: [PATCH 05/13] Add not to masked names --- R/pkg/inst/tests/testthat/test_context.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index c847113491113..c64fe6edcd49e 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -21,10 +21,10 @@ 
test_that("Check masked functions", { # Check that we are not masking any new function from base, stats, testthat unexpectedly # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it # hard for users to use base R functions. Please check when in doubt. - namesOfMaskedCompletely <- c("cov", "filter", "sample") + namesOfMaskedCompletely <- c("cov", "filter", "sample", "not") namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", - "summary", "transform", "drop", "window", "as.data.frame", "union") + "summary", "transform", "drop", "window", "as.data.frame", "union", "not") if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) } From 8b18e5b16767a67891da781fe87ac6d033592453 Mon Sep 17 00:00:00 2001 From: zero323 Date: Thu, 27 Apr 2017 21:16:05 +0200 Subject: [PATCH 06/13] Add missing line at EOF --- R/pkg/R/column.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index f3cbb309f040d..aa9ccc0d3416c 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -362,4 +362,4 @@ setMethod("%<=>%", #' @note ! 
since 2.3.0 setMethod("!", signature(x = "Column"), - function(x) not(x)) \ No newline at end of file + function(x) not(x)) From 412f4f0af614103958fc9cb097f117828daf7c95 Mon Sep 17 00:00:00 2001 From: zero323 Date: Fri, 28 Apr 2017 20:23:22 +0200 Subject: [PATCH 07/13] Remove Spark:: prefix from not --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 324667544121f..209e6cfbff6be 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1494,7 +1494,7 @@ test_that("column functions", { df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA))) expect_equal( - collect(select(df, alias(SparkR::not(df$is_true), "is_false"))), + collect(select(df, alias(not(df$is_true), "is_false"))), data.frame(is_false = c(FALSE, TRUE, NA)) ) From 1c4cd9a2ef162041d6588e3c1f50e8b612300c88 Mon Sep 17 00:00:00 2001 From: zero323 Date: Fri, 28 Apr 2017 20:25:18 +0200 Subject: [PATCH 08/13] Reorder filter suite --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 209e6cfbff6be..08296354ca7ed 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1983,9 +1983,6 @@ test_that("filter() on a DataFrame", { filtered6 <- where(df, df$age %in% c(19, 30)) expect_equal(count(filtered6), 2) - # Test stats::filter is working - #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint - # test suites for %<=>% dfNa <- read.json(jsonPathNa) expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1) @@ -1995,6 +1992,9 @@ test_that("filter() on a DataFrame", { # match NA from two columns expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2) expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 
dfNa$height))), 5 - 2) + + # Test stats::filter is working + #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint }) test_that("join(), crossJoin() and merge() on a DataFrame", { From af781a7df8f66cd182e7d85843fc559d2d4583fc Mon Sep 17 00:00:00 2001 From: zero323 Date: Fri, 28 Apr 2017 20:30:02 +0200 Subject: [PATCH 09/13] Remove example output(s) from not --- R/pkg/R/functions.R | 8 -------- 1 file changed, 8 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 27ad7a72fd976..f9687d680e7a2 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3879,17 +3879,9 @@ setMethod("posexplode_outer", #' )) #' #' head(select(df, not(df$is_true))) -#' ## (NOT is_true) -#' ##1 FALSE -#' ##2 TRUE -#' ##3 NA #' #' # Explicit cast is required when working with numeric column #' head(select(df, not(cast(df$flag, "boolean")))) -#' ## (NOT CAST(flag AS BOOLEAN)) -#' ## 1 FALSE -#' ## 2 TRUE -#' ## 3 FALSE #' } #' @note not since 2.3.0 setMethod("not", From a2c24b59c5f8b0bb187d70c2d7267b2a18908e3b Mon Sep 17 00:00:00 2001 From: zero323 Date: Fri, 28 Apr 2017 20:33:50 +0200 Subject: [PATCH 10/13] Remove example output and reformat ! 
and %<=>% --- R/pkg/R/column.R | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index aa9ccc0d3416c..6a7375e5dc281 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -322,18 +322,11 @@ setMethod("otherwise", #' )) #' #' head(select(df1, df1$x == df1$y, df1$x %<=>% df1$y)) -#' ## (x = y) (x <=> y) -#' ##1 FALSE FALSE -#' ##2 NA FALSE -#' ##3 TRUE TRUE -#' ##4 NA TRUE #' #' df2 <- createDataFrame(data.frame(y = c(3, NA))) #' count(join(df1, df2, df1$y == df2$y)) -#' ## [1] 1 #' #' count(join(df1, df2, df1$y %<=>% df2$y)) -#' ## [1] 2 #' } #' @note \%<=>\% since 2.3.0 setMethod("%<=>%", @@ -354,12 +347,6 @@ setMethod("%<=>%", #' df <- createDataFrame(data.frame(x = c(-1, 0, 1))) #' #' head(select(df, !column("x") > 0)) -#' ## (NOT (x > 0.0)) -#' ##1 TRUE -#' ##2 TRUE -#' ##3 FALSE #' } #' @note ! since 2.3.0 -setMethod("!", - signature(x = "Column"), - function(x) not(x)) +setMethod("!", signature(x = "Column"), function(x) not(x)) From f7b673a9f4870ab79c9d96ad3b751cbd86db4a71 Mon Sep 17 00:00:00 2001 From: zero323 Date: Fri, 28 Apr 2017 21:48:17 +0200 Subject: [PATCH 11/13] Adjust name / rdname / aliases --- R/pkg/R/column.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 6a7375e5dc281..24b26cc76d6a1 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -339,7 +339,10 @@ setMethod("%<=>%", #' ! #' -#' @rdname not +#' Inversion of boolean expression. +#' +#' @rdname column +#' @name ! #' @aliases !,Column-method #' @export #' @examples @@ -349,4 +352,5 @@ setMethod("%<=>%", #' head(select(df, !column("x") > 0)) #' } #' @note ! since 2.3.0 +#' @seealso \link{not} setMethod("!", signature(x = "Column"), function(x) not(x)) From 875c62fd129826717595954d7b7e7ce50943196a Mon Sep 17 00:00:00 2001 From: zero323 Date: Sun, 30 Apr 2017 03:49:29 +0200 Subject: [PATCH 12/13] Use not as @name and @rdname for ! 
--- R/pkg/R/column.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 24b26cc76d6a1..3e48eaaf02893 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -341,8 +341,8 @@ setMethod("%<=>%", #' #' Inversion of boolean expression. #' -#' @rdname column -#' @name ! +#' @rdname not +#' @name not #' @aliases !,Column-method #' @export #' @examples From 8ab9e0c6457214d0cf2df698e76edd8aaaf8eb2e Mon Sep 17 00:00:00 2001 From: zero323 Date: Sun, 30 Apr 2017 08:02:59 +0200 Subject: [PATCH 13/13] Remove note --- R/pkg/R/column.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 3e48eaaf02893..147ee4b6887b9 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -352,5 +352,4 @@ setMethod("%<=>%", #' head(select(df, !column("x") > 0)) #' } #' @note ! since 2.3.0 -#' @seealso \link{not} setMethod("!", signature(x = "Column"), function(x) not(x))