Skip to content

Commit

Permalink
[SPARK-20490][SPARKR] Add R wrappers for eqNullSafe and ! / not
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

- Add null-safe equality operator `%<=>%` (sames as `o.a.s.sql.Column.eqNullSafe`, `o.a.s.sql.Column.<=>`)
- Add boolean negation operator `!` and function `not `.

## How was this patch tested?

Existing unit tests, additional unit tests, `check-cran.sh`.

Author: zero323 <[email protected]>

Closes #17783 from zero323/SPARK-20490.
  • Loading branch information
zero323 authored and Felix Cheung committed May 1, 2017
1 parent 6613046 commit 80e9cf1
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 5 deletions.
4 changes: 3 additions & 1 deletion R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ exportMethods("arrange",

exportClasses("Column")

exportMethods("%in%",
exportMethods("%<=>%",
"%in%",
"abs",
"acos",
"add_months",
Expand Down Expand Up @@ -291,6 +292,7 @@ exportMethods("%in%",
"nanvl",
"negate",
"next_day",
"not",
"ntile",
"otherwise",
"over",
Expand Down
55 changes: 53 additions & 2 deletions R/pkg/R/column.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ operators <- list(
"+" = "plus", "-" = "minus", "*" = "multiply", "/" = "divide", "%%" = "mod",
"==" = "equalTo", ">" = "gt", "<" = "lt", "!=" = "notEqual", "<=" = "leq", ">=" = "geq",
# we can not override `&&` and `||`, so use `&` and `|` instead
"&" = "and", "|" = "or", #, "!" = "unary_$bang"
"^" = "pow"
"&" = "and", "|" = "or", "^" = "pow"
)
column_functions1 <- c("asc", "desc", "isNaN", "isNull", "isNotNull")
column_functions2 <- c("like", "rlike", "getField", "getItem", "contains")
Expand Down Expand Up @@ -302,3 +301,55 @@ setMethod("otherwise",
jc <- callJMethod(x@jc, "otherwise", value)
column(jc)
})

#' \%<=>\%
#'
#' Equality test that is safe for null values.
#'
#' Can be used, unlike standard equality operator, to perform null-safe joins.
#' Equivalent to Scala \code{Column.<=>} and \code{Column.eqNullSafe}.
#'
#' @param x a Column
#' @param value a value to compare
#' @rdname eq_null_safe
#' @name %<=>%
#' @aliases %<=>%,Column-method
#' @export
#' @examples
#' \dontrun{
#' df1 <- createDataFrame(data.frame(
#' x = c(1, NA, 3, NA), y = c(2, 6, 3, NA)
#' ))
#'
#' head(select(df1, df1$x == df1$y, df1$x %<=>% df1$y))
#'
#' df2 <- createDataFrame(data.frame(y = c(3, NA)))
#' count(join(df1, df2, df1$y == df2$y))
#'
#' count(join(df1, df2, df1$y %<=>% df2$y))
#' }
#' @note \%<=>\% since 2.3.0
setMethod("%<=>%",
signature(x = "Column", value = "ANY"),
function(x, value) {
value <- if (class(value) == "Column") { value@jc } else { value }
jc <- callJMethod(x@jc, "eqNullSafe", value)
column(jc)
})

#' !
#'
#' Inversion of boolean expression.
#'
#' @rdname not
#' @name not
#' @aliases !,Column-method
#' @export
#' @examples
#' \dontrun{
#' df <- createDataFrame(data.frame(x = c(-1, 0, 1)))
#'
#' head(select(df, !column("x") > 0))
#' }
#' @note ! since 2.3.0
setMethod("!", signature(x = "Column"), function(x) not(x))
31 changes: 31 additions & 0 deletions R/pkg/R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3859,3 +3859,34 @@ setMethod("posexplode_outer",
jc <- callJStatic("org.apache.spark.sql.functions", "posexplode_outer", x@jc)
column(jc)
})

#' not
#'
#' Inversion of boolean expression.
#'
#' \code{not} and \code{!} cannot be applied directly to numerical column.
#' To achieve R-like truthiness column has to be casted to \code{BooleanType}.
#'
#' @param x Column to compute on
#' @rdname not
#' @name not
#' @aliases not,Column-method
#' @export
#' @examples \dontrun{
#' df <- createDataFrame(data.frame(
#' is_true = c(TRUE, FALSE, NA),
#' flag = c(1, 0, 1)
#' ))
#'
#' head(select(df, not(df$is_true)))
#'
#' # Explicit cast is required when working with numeric column
#' head(select(df, not(cast(df$flag, "boolean"))))
#' }
#' @note not since 2.3.0
setMethod("not",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "not", x@jc)
column(jc)
})
8 changes: 8 additions & 0 deletions R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,10 @@ setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") })
#' @export
setGeneric("over", function(x, window) { standardGeneric("over") })

#' @rdname eq_null_safe
#' @export
setGeneric("%<=>%", function(x, value) { standardGeneric("%<=>%") })

###################### WindowSpec Methods ##########################

#' @rdname partitionBy
Expand Down Expand Up @@ -1154,6 +1158,10 @@ setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") })
#' @export
setGeneric("negate", function(x) { standardGeneric("negate") })

#' @rdname not
#' @export
setGeneric("not", function(x) { standardGeneric("not") })

#' @rdname next_day
#' @export
setGeneric("next_day", function(y, x) { standardGeneric("next_day") })
Expand Down
4 changes: 2 additions & 2 deletions R/pkg/inst/tests/testthat/test_context.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ test_that("Check masked functions", {
# Check that we are not masking any new function from base, stats, testthat unexpectedly
# NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
# hard for users to use base R functions. Please check when in doubt.
namesOfMaskedCompletely <- c("cov", "filter", "sample")
namesOfMaskedCompletely <- c("cov", "filter", "sample", "not")
namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
"colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
"summary", "transform", "drop", "window", "as.data.frame", "union")
"summary", "transform", "drop", "window", "as.data.frame", "union", "not")
if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
}
Expand Down
20 changes: 20 additions & 0 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1323,6 +1323,8 @@ test_that("column operators", {
c3 <- (c + c2 - c2) * c2 %% c2
c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
c5 <- c2 ^ c3 ^ c4
c6 <- c2 %<=>% c3
c7 <- !c6
})

test_that("column functions", {
Expand All @@ -1348,6 +1350,7 @@ test_that("column functions", {
c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3)
c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
c21 <- posexplode_outer(c) + explode_outer(c)
c22 <- not(c)

# Test if base::is.nan() is exposed
expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
Expand Down Expand Up @@ -1488,6 +1491,13 @@ test_that("column functions", {
lapply(
list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)),
as.environment))

df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA)))
expect_equal(
collect(select(df, alias(not(df$is_true), "is_false"))),
data.frame(is_false = c(FALSE, TRUE, NA))
)

})

test_that("column binary mathfunctions", {
Expand Down Expand Up @@ -1973,6 +1983,16 @@ test_that("filter() on a DataFrame", {
filtered6 <- where(df, df$age %in% c(19, 30))
expect_equal(count(filtered6), 2)

# test suites for %<=>%
dfNa <- read.json(jsonPathNa)
expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1)
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1)
expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3)
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3)
# match NA from two columns
expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2)
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2)

# Test stats::filter is working
#expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint
})
Expand Down

0 comments on commit 80e9cf1

Please sign in to comment.