diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 3fb85f5490c34..e5f76570616d8 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -139,7 +139,6 @@ call_binding_agg <- function(fun_name, ...) { agg_funcs[[fun_name]](...) } -#' @importFrom stats runif create_binding_cache <- function() { # Called in .onLoad() .cache$docs <- list() @@ -164,17 +163,6 @@ create_binding_cache <- function() { register_bindings_type() register_bindings_augmented() - # HACK because random() doesn't work (ARROW-17974) - register_scalar_function( - "_random_along", - function(context, x) { - Array$create(runif(length(x))) - }, - in_type = schema(x = boolean()), - out_type = float64(), - auto_convert = FALSE - ) - # We only create the cache for nse_funcs and not agg_funcs .cache$functions <- c(as.list(nse_funcs), arrow_funcs) } diff --git a/r/R/dplyr-slice.R b/r/R/dplyr-slice.R index 102986cdbf108..ba7ec5fc44aa7 100644 --- a/r/R/dplyr-slice.R +++ b/r/R/dplyr-slice.R @@ -86,6 +86,7 @@ slice_max.arrow_dplyr_query <- function(.data, order_by, ..., n, prop, with_ties } slice_max.Dataset <- slice_max.ArrowTabular <- slice_max.RecordBatchReader <- slice_max.arrow_dplyr_query +#' @importFrom stats runif slice_sample.arrow_dplyr_query <- function(.data, ..., n, @@ -116,10 +117,21 @@ slice_sample.arrow_dplyr_query <- function(.data, if (prop < 1) { .data <- as_adq(.data) # TODO(ARROW-17974): use Expression$create("random") instead of UDF hack - # HACK: use our UDF to generate random. It needs an input column because + # HACK: use a UDF to generate random. It needs an input column because # nullary functions don't work, and that column has to be typed. We've # chosen boolean() type because it's compact and can always be created: # pick any column and do is.na, that will be boolean. + if (is.null(.cache$functions[["_random_along"]])) { + register_scalar_function( + "_random_along", + function(context, x) { + Array$create(runif(length(x))) + }, + in_type = schema(x = boolean()), + out_type = float64(), + auto_convert = FALSE + ) + } # TODO: get an actual FieldRef because the first col could be derived ref <- Expression$create("is_null", .data$selected_columns[[1]]) expr <- Expression$create("_random_along", ref) < prop diff --git a/r/tests/testthat/test-dplyr-slice.R b/r/tests/testthat/test-dplyr-slice.R index 5b577e0388cab..a1c71e2222973 100644 --- a/r/tests/testthat/test-dplyr-slice.R +++ b/r/tests/testthat/test-dplyr-slice.R @@ -99,8 +99,10 @@ test_that("slice_sample, ungrouped", { "weight_by" ) + # Let's not take any chances on random failures + skip_on_cran() # Because this is random (and we only have 10 rows), try several times - for (i in 1:10) { + for (i in 1:50) { sampled_prop <- tab %>% slice_sample(prop = .2) %>% collect() %>%