Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-20490][SPARKR] Add R wrappers for eqNullSafe and ! / not #17783

Closed
wants to merge 13 commits into from
4 changes: 3 additions & 1 deletion R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ exportMethods("arrange",

exportClasses("Column")

exportMethods("%in%",
exportMethods("%<=>%",
"%in%",
"abs",
"acos",
"add_months",
Expand Down Expand Up @@ -291,6 +292,7 @@ exportMethods("%in%",
"nanvl",
"negate",
"next_day",
"not",
"ntile",
"otherwise",
"over",
Expand Down
55 changes: 53 additions & 2 deletions R/pkg/R/column.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ operators <- list(
"+" = "plus", "-" = "minus", "*" = "multiply", "/" = "divide", "%%" = "mod",
"==" = "equalTo", ">" = "gt", "<" = "lt", "!=" = "notEqual", "<=" = "leq", ">=" = "geq",
# we can not override `&&` and `||`, so use `&` and `|` instead
"&" = "and", "|" = "or", #, "!" = "unary_$bang"
"^" = "pow"
"&" = "and", "|" = "or", "^" = "pow"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what happens with #, ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this PR implements ! I think it is just confusing, don't you? Not to mention that createOperator didn't work with !. I can restore it if you believe it serves some purpose.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, let me back track... it looks like "!" = "unary_$bang" has always been commented out. the #, does that
I agree we don't need to leave that comment here

)
column_functions1 <- c("asc", "desc", "isNaN", "isNull", "isNotNull")
column_functions2 <- c("like", "rlike", "getField", "getItem", "contains")
Expand Down Expand Up @@ -302,3 +301,55 @@ setMethod("otherwise",
jc <- callJMethod(x@jc, "otherwise", value)
column(jc)
})

#' \%<=>\%
#'
#' Equality test that is safe for null values.
#'
#' Can be used, unlike standard equality operator, to perform null-safe joins.
#' Equivalent to Scala \code{Column.<=>} and \code{Column.eqNullSafe}.
#'
#' @param x a Column
#' @param value a value to compare
#' @rdname eq_null_safe
#' @name %<=>%
#' @aliases %<=>%,Column-method
#' @export
#' @examples
#' \dontrun{
#' df1 <- createDataFrame(data.frame(
#' x = c(1, NA, 3, NA), y = c(2, 6, 3, NA)
#' ))
#'
#' head(select(df1, df1$x == df1$y, df1$x %<=>% df1$y))
#'
#' df2 <- createDataFrame(data.frame(y = c(3, NA)))
#' count(join(df1, df2, df1$y == df2$y))
#'
#' count(join(df1, df2, df1$y %<=>% df2$y))
#' }
#' @note \%<=>\% since 2.3.0
setMethod("%<=>%",
signature(x = "Column", value = "ANY"),
function(x, value) {
value <- if (class(value) == "Column") { value@jc } else { value }
jc <- callJMethod(x@jc, "eqNullSafe", value)
column(jc)
})

#' !
#'
#' Inversion of boolean expression.
#'
#' @rdname not
#' @name not
#' @aliases !,Column-method
#' @export
#' @examples
#' \dontrun{
#' df <- createDataFrame(data.frame(x = c(-1, 0, 1)))
#'
#' head(select(df, !column("x") > 0))
#' }
#' @note ! since 2.3.0
setMethod("!", signature(x = "Column"), function(x) not(x))
31 changes: 31 additions & 0 deletions R/pkg/R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3859,3 +3859,34 @@ setMethod("posexplode_outer",
jc <- callJStatic("org.apache.spark.sql.functions", "posexplode_outer", x@jc)
column(jc)
})

#' not
#'
#' Inversion of boolean expression.
#'
#' \code{not} and \code{!} cannot be applied directly to numerical column.
#' To achieve R-like truthiness column has to be casted to \code{BooleanType}.
#'
#' @param x Column to compute on
#' @rdname not
#' @name not
#' @aliases not,Column-method
#' @export
#' @examples \dontrun{
#' df <- createDataFrame(data.frame(
#' is_true = c(TRUE, FALSE, NA),
#' flag = c(1, 0, 1)
#' ))
#'
#' head(select(df, not(df$is_true)))
#'
#' # Explicit cast is required when working with numeric column
#' head(select(df, not(cast(df$flag, "boolean"))))
#' }
#' @note not since 2.3.0
setMethod("not",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "not", x@jc)
column(jc)
})
8 changes: 8 additions & 0 deletions R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,10 @@ setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") })
#' @export
setGeneric("over", function(x, window) { standardGeneric("over") })

#' @rdname eq_null_safe
#' @export
setGeneric("%<=>%", function(x, value) { standardGeneric("%<=>%") })

###################### WindowSpec Methods ##########################

#' @rdname partitionBy
Expand Down Expand Up @@ -1154,6 +1158,10 @@ setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") })
#' @export
setGeneric("negate", function(x) { standardGeneric("negate") })

#' @rdname not
#' @export
setGeneric("not", function(x) { standardGeneric("not") })

#' @rdname next_day
#' @export
setGeneric("next_day", function(y, x) { standardGeneric("next_day") })
Expand Down
4 changes: 2 additions & 2 deletions R/pkg/inst/tests/testthat/test_context.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ test_that("Check masked functions", {
# Check that we are not masking any new function from base, stats, testthat unexpectedly
# NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
# hard for users to use base R functions. Please check when in doubt.
namesOfMaskedCompletely <- c("cov", "filter", "sample")
namesOfMaskedCompletely <- c("cov", "filter", "sample", "not")
namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
"colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
"summary", "transform", "drop", "window", "as.data.frame", "union")
"summary", "transform", "drop", "window", "as.data.frame", "union", "not")
if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
}
Expand Down
20 changes: 20 additions & 0 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1323,6 +1323,8 @@ test_that("column operators", {
c3 <- (c + c2 - c2) * c2 %% c2
c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
c5 <- c2 ^ c3 ^ c4
c6 <- c2 %<=>% c3
c7 <- !c6
})

test_that("column functions", {
Expand All @@ -1348,6 +1350,7 @@ test_that("column functions", {
c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3)
c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
c21 <- posexplode_outer(c) + explode_outer(c)
c22 <- not(c)

# Test if base::is.nan() is exposed
expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
Expand Down Expand Up @@ -1488,6 +1491,13 @@ test_that("column functions", {
lapply(
list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)),
as.environment))

df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA)))
expect_equal(
collect(select(df, alias(not(df$is_true), "is_false"))),
data.frame(is_false = c(FALSE, TRUE, NA))
)

})

test_that("column binary mathfunctions", {
Expand Down Expand Up @@ -1973,6 +1983,16 @@ test_that("filter() on a DataFrame", {
filtered6 <- where(df, df$age %in% c(19, 30))
expect_equal(count(filtered6), 2)

# test suites for %<=>%
dfNa <- read.json(jsonPathNa)
expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1)
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1)
expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3)
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3)
# match NA from two columns
expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2)
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2)

# Test stats::filter is working
#expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint
})
Expand Down