ARROW-17088: [R] Use .arrow as extension of IPC files of datasets #13690

Merged
merged 17 commits on Aug 2, 2022
Changes from 13 commits
6 changes: 4 additions & 2 deletions r/R/dataset-write.R
@@ -34,8 +34,9 @@
#' use the current `group_by()` columns.
#' @param basename_template string template for the names of files to be written.
#' Must contain `"{i}"`, which will be replaced with an autoincremented
-#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"`
-#' will yield `"part-0.feather", ...`.
+#' integer to generate basenames of datafiles. For example, `"part-{i}.arrow"`
+#' will yield `"part-0.arrow", ...`.
+#' If not specified, it defaults to `"part-{i}.<default extension>"`.
#' @param hive_style logical: write partition segments as Hive-style
#' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`.
#' @param existing_data_behavior The behavior to use when there is already data
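The updated documentation implies usage like the following sketch (the output directory is hypothetical; per the docs above, `group_by()` columns become the partitioning):

library(arrow)
library(dplyr)

# With no basename_template, files fall back to the default
# "part-{i}.<default extension>" naming, i.e. "part-{i}.arrow" for IPC.
mtcars %>%
  group_by(cyl) %>%
  write_dataset("mtcars_ds", format = "arrow")

list.files("mtcars_ds", recursive = TRUE)
# e.g. "cyl=4/part-0.arrow" "cyl=6/part-0.arrow" "cyl=8/part-0.arrow"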
@@ -133,6 +134,7 @@ write_dataset <- function(dataset,
max_rows_per_group = bitwShiftL(1, 20),
...) {
format <- match.arg(format)
+format <- ifelse(as.character(format) %in% c("feather", "ipc"), "arrow", as.character(format))
if (inherits(dataset, "arrow_dplyr_query")) {
# partitioning vars need to be in the `select` schema
dataset <- ensure_group_vars(dataset)
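For illustration, a minimal standalone sketch of the aliasing the added line performs (`normalize_format` is a hypothetical helper; inside `write_dataset()` the normalized value feeds the rest of the writer):

# "feather" and "ipc" are treated as aliases of "arrow", so all three
# format choices produce files with the ".arrow" extension by default.
normalize_format <- function(format) {
  format <- as.character(format)
  if (format %in% c("feather", "ipc")) "arrow" else format
}

normalize_format("feather")  # "arrow"
normalize_format("ipc")      # "arrow"
normalize_format("parquet")  # "parquet"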
5 changes: 3 additions & 2 deletions r/man/write_dataset.Rd

Some generated files are not rendered by default.

40 changes: 36 additions & 4 deletions r/tests/testthat/test-dataset-write.R
@@ -63,7 +63,7 @@ test_that("Writing a dataset: CSV->IPC", {

# Check whether "int" is present in the files or just in the dirs
first <- read_feather(
dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1],
dir(dst_dir, pattern = ".arrow$", recursive = TRUE, full.names = TRUE)[1],
as_data_frame = FALSE
)
# It shouldn't be there
@@ -139,6 +139,36 @@ test_that("Writing a dataset: Parquet->Parquet (default)", {
)
})

test_that("Writing a dataset: `basename_template` default behavier", {
+ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+
+dst_dir <- make_temp_dir()
+write_dataset(ds, dst_dir, format = "parquet", max_rows_per_file = 5L)
+expect_identical(
+dir(dst_dir, full.names = FALSE, recursive = TRUE),
+paste0("part-", 0:3, ".parquet")
+)
+
+dst_dir <- make_temp_dir()
+expect_error(
+write_dataset(ds, dst_dir, format = "parquet", basename_template = "part-i.parquet"),
"basename_template did not contain '\\{i\\}'"
)

feather_dir <- make_temp_dir()
write_dataset(ds, feather_dir, format = "feather", partitioning = "int")
expect_identical(
dir(feather_dir, full.names = FALSE, recursive = TRUE),
sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
)
ipc_dir <- make_temp_dir()
write_dataset(ds, ipc_dir, format = "ipc", partitioning = "int")
expect_identical(
dir(ipc_dir, full.names = FALSE, recursive = TRUE),
sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
)
})
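As the error expectation above shows, the template must contain a literal `{i}`. A minimal sketch of that validation and substitution (`expand_basename_template` is a hypothetical helper; the package performs this check internally):

expand_basename_template <- function(template, i) {
  # Reject templates without the literal "{i}" placeholder
  if (!grepl("{i}", template, fixed = TRUE)) {
    stop("basename_template did not contain '{i}'")
  }
  sub("{i}", as.character(i), template, fixed = TRUE)
}

expand_basename_template("part-{i}.arrow", 0)  # "part-0.arrow"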

test_that("Writing a dataset: existing data behavior", {
# This test does not work on Windows because unlink does not immediately
# delete the data.
@@ -458,8 +488,10 @@ test_that("Writing a dataset: CSV format options", {
test_that("Dataset writing: unsupported features/input validation", {
skip_if_not_available("parquet")
expect_error(write_dataset(4), "You must supply a")
-expect_error(write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
-"Field names must be unique")
+expect_error(
+write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
+"Field names must be unique"
+)

ds <- open_dataset(hive_dir)
expect_error(
@@ -470,7 +502,7 @@
write_dataset(ds, tempfile(), basename_template = "something_without_i")
)
expect_error(
-write_dataset(ds, tempfile(), basename_template = NULL)
write_dataset(ds, tempfile(), basename_template = "NULL")
)
})
