From e97656558973beed940bace7308a6017367788e2 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Sun, 7 Aug 2022 06:43:02 -0400 Subject: [PATCH] Apply #13773, drop badges, add placeholder write_arrow --- r/NAMESPACE | 1 + r/R/arrow-package.R | 8 ++++++++ r/R/compute.R | 9 ++++++++- r/R/table.R | 9 ++++++++- r/README.md | 8 ++------ r/man/register_scalar_function.Rd | 2 +- r/man/write_arrow.Rd | 15 +++++++++++++++ r/tests/testthat/test-compute.R | 24 ++++++++++++++++++++++-- 8 files changed, 65 insertions(+), 11 deletions(-) create mode 100644 r/man/write_arrow.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 17f404caa140d..61a3b7048f88b 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -374,6 +374,7 @@ export(utf8) export(value_counts) export(vctrs_extension_array) export(vctrs_extension_type) +export(write_arrow) export(write_csv_arrow) export(write_dataset) export(write_feather) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index f3e0b817d5f42..943d2d73e19bc 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -138,3 +138,11 @@ option_use_threads <- function() { option_compress_metadata <- function() { !is_false(getOption("arrow.compress_metadata")) } + +# Restore this to suppress sparklyr warning + +#' Use `write_ipc_stream` or `write_to_raw` instead. +#' @param ... Ignored arguments +#' @export +#' @keywords internal +write_arrow <- function(...) stop("write_arrow has been removed") diff --git a/r/R/compute.R b/r/R/compute.R index 0985e73a5f2d3..636c9146ca37b 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -344,7 +344,7 @@ cast_options <- function(safe = TRUE, ...) { #' @return `NULL`, invisibly #' @export #' -#' @examplesIf arrow_with_dataset() +#' @examplesIf arrow_with_dataset() && identical(Sys.getenv("NOT_CRAN"), "true") #' library(dplyr, warn.conflicts = FALSE) #' #' some_model <- lm(mpg ~ disp + cyl, data = mtcars) @@ -385,6 +385,13 @@ register_scalar_function <- function(name, fun, in_type, out_type, update_cache = TRUE ) + # User-defined functions require some special handling + # in the query engine which currently require an opt-in using + # the R_ARROW_COLLECT_WITH_UDF environment variable while this + # behaviour is stabilized. + # TODO(ARROW-17178) remove the need for this! + Sys.setenv(R_ARROW_COLLECT_WITH_UDF = "true") + invisible(NULL) } diff --git a/r/R/table.R b/r/R/table.R index 5579c676d5157..d7e276415c5cd 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -331,5 +331,12 @@ as_arrow_table.arrow_dplyr_query <- function(x, ...) { # See query-engine.R for ExecPlan/Nodes plan <- ExecPlan$create() final_node <- plan$Build(x) - plan$Run(final_node, as_table = TRUE) + + run_with_event_loop <- identical( + Sys.getenv("R_ARROW_COLLECT_WITH_UDF", ""), + "true" + ) + + result <- plan$Run(final_node, as_table = run_with_event_loop) + as_arrow_table(result) } diff --git a/r/README.md b/r/README.md index 1509ae7793f6a..c939faf26dee8 100644 --- a/r/README.md +++ b/r/README.md @@ -1,9 +1,5 @@ # arrow -[![cran](https://www.r-pkg.org/badges/version-last-release/arrow)](https://cran.r-project.org/package=arrow) -[![CI](https://github.com/apache/arrow/workflows/R/badge.svg?event=push)](https://github.com/apache/arrow/actions?query=workflow%3AR+branch%3Amaster+event%3Apush) -[![conda-forge](https://img.shields.io/conda/vn/conda-forge/r-arrow.svg)](https://anaconda.org/conda-forge/r-arrow) - **[Apache Arrow](https://arrow.apache.org/) is a cross-language development platform for in-memory data.** It specifies a standardized language-independent columnar memory format for flat and hierarchical @@ -138,7 +134,7 @@ returns an R `data.frame`. To return an Arrow `Table`, set argument - `read_json_arrow()`: read a JSON data file For writing data to single files, the `arrow` package provides the -functions `write_parquet()`, `write_feather()`, and `write_csv_arrow()`. +functions `write_parquet()`, `write_feather()`, and `write_csv_arrow()`. These can be used with R `data.frame` and Arrow `Table` objects. For example, let’s write the Star Wars characters data that’s included @@ -270,7 +266,7 @@ sw %>% ``` Additionally, equality joins (e.g. `left_join()`, `inner_join()`) are supported -for joining multiple tables. +for joining multiple tables. ```r jedi <- data.frame( diff --git a/r/man/register_scalar_function.Rd b/r/man/register_scalar_function.Rd index 4da8f54f645b0..324dd5fad1f58 100644 --- a/r/man/register_scalar_function.Rd +++ b/r/man/register_scalar_function.Rd @@ -48,7 +48,7 @@ stateless and return output with the same shape (i.e., the same number of rows) as the input. } \examples{ -\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (arrow_with_dataset() && identical(Sys.getenv("NOT_CRAN"), "true")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(dplyr, warn.conflicts = FALSE) some_model <- lm(mpg ~ disp + cyl, data = mtcars) diff --git a/r/man/write_arrow.Rd b/r/man/write_arrow.Rd new file mode 100644 index 0000000000000..a86588e1d4862 --- /dev/null +++ b/r/man/write_arrow.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/arrow-package.R +\name{write_arrow} +\alias{write_arrow} +\title{Use \code{write_ipc_stream} or \code{write_to_raw} instead.} +\usage{ +write_arrow(...) +} +\arguments{ +\item{...}{Ignored arguments} +} +\description{ +Use \code{write_ipc_stream} or \code{write_to_raw} instead. +} +\keyword{internal} diff --git a/r/tests/testthat/test-compute.R b/r/tests/testthat/test-compute.R index 9e487169f4b15..bbc727de8fc3f 100644 --- a/r/tests/testthat/test-compute.R +++ b/r/tests/testthat/test-compute.R @@ -81,6 +81,9 @@ test_that("arrow_scalar_function() works with auto_convert = TRUE", { test_that("register_scalar_function() adds a compute function to the registry", { skip_if_not(CanRunWithCapturedR()) + # TODO(ARROW-17178): User-defined function-friendly ExecPlan execution has + # occasional valgrind errors + skip_on_linux_devel() register_scalar_function( "times_32", @@ -109,6 +112,8 @@ test_that("register_scalar_function() adds a compute function to the registry", dplyr::collect(), tibble::tibble(a = 1L, b = 32.0) ) + + Sys.unsetenv("R_ARROW_COLLECT_WITH_UDF") }) test_that("arrow_scalar_function() with bad return type errors", { @@ -143,6 +148,9 @@ test_that("arrow_scalar_function() with bad return type errors", { call_function("times_32_bad_return_type_scalar", Array$create(1L)), "Expected return Array or Scalar with type 'double'" ) + + # TODO(ARROW-17178) remove the need for this! + Sys.unsetenv("R_ARROW_COLLECT_WITH_UDF") }) test_that("register_user_defined_function() can register multiple kernels", { @@ -203,12 +211,18 @@ test_that("register_user_defined_function() errors for unsupported specification ), "Kernels for user-defined function must accept the same number of arguments" ) + + # TODO(ARROW-17178) remove the need for this! + Sys.unsetenv("R_ARROW_COLLECT_WITH_UDF") }) test_that("user-defined functions work during multi-threaded execution", { skip_if_not(CanRunWithCapturedR()) skip_if_not_available("dataset") - # Snappy has a UBSan issue: https://github.com/google/snappy/pull/148 + # Skip on linux devel because: + # TODO(ARROW-17283): Snappy has a UBSan issue that is fixed in the dev version + # TODO(ARROW-17178): User-defined function-friendly ExecPlan execution has + # occasional valgrind errors skip_on_linux_devel() n_rows <- 10000 @@ -255,6 +269,9 @@ test_that("user-defined functions work during multi-threaded execution", { dplyr::collect() expect_identical(result2$fun_result, example_df$value * 32) + + # TODO(ARROW-17178) remove the need for this! + Sys.unsetenv("R_ARROW_COLLECT_WITH_UDF") }) test_that("user-defined error when called from an unsupported context", { @@ -271,7 +288,7 @@ test_that("user-defined error when called from an unsupported context", { on.exit(unregister_binding("times_32", update_cache = TRUE)) stream_plan_with_udf <- function() { - record_batch(a = 1:1000) %>% + record_batch(a = 1:1000) %>% dplyr::mutate(b = times_32(a)) %>% as_record_batch_reader() %>% as_arrow_table() @@ -304,4 +321,7 @@ test_that("user-defined error when called from an unsupported context", { "Call to R \\(.*?\\) from a non-R thread from an unsupported context" ) } + + # TODO(ARROW-17178) remove the need for this! + Sys.unsetenv("R_ARROW_COLLECT_WITH_UDF") })