ARROW-17088: [R] Use .arrow as extension of IPC files of datasets (#13690)

Lead-authored-by: mopcup <[email protected]>
Co-authored-by: mopcup <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
mopcup authored Aug 2, 2022
1 parent a9dcaff commit 8cac69c
Showing 3 changed files with 48 additions and 7 deletions.
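
In effect, writing a dataset in IPC format now produces files named with the `.arrow` extension instead of `.feather`. A minimal sketch of the new behavior (the temp directory and example data are illustrative, not from this patch):

library(arrow)

tmp <- tempfile()
# "feather" and "ipc" are now normalized to "arrow" internally, so the
# default basename_template becomes "part-{i}.arrow"
write_dataset(mtcars, tmp, format = "feather")
list.files(tmp)
#> [1] "part-0.arrow"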
8 changes: 6 additions & 2 deletions r/R/dataset-write.R
@@ -34,8 +34,9 @@
 #' use the current `group_by()` columns.
 #' @param basename_template string template for the names of files to be written.
 #' Must contain `"{i}"`, which will be replaced with an autoincremented
-#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"`
-#' will yield `"part-0.feather", ...`.
+#' integer to generate basenames of datafiles. For example, `"part-{i}.arrow"`
+#' will yield `"part-0.arrow", ...`.
+#' If not specified, it defaults to `"part-{i}.<default extension>"`.
 #' @param hive_style logical: write partition segments as Hive-style
 #' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`.
 #' @param existing_data_behavior The behavior to use when there is already data
@@ -133,6 +134,9 @@ write_dataset <- function(dataset,
                           max_rows_per_group = bitwShiftL(1, 20),
                           ...) {
   format <- match.arg(format)
+  if (format %in% c("feather", "ipc")) {
+    format <- "arrow"
+  }
   if (inherits(dataset, "arrow_dplyr_query")) {
     # partitioning vars need to be in the `select` schema
     dataset <- ensure_group_vars(dataset)
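
The documented default for `basename_template` follows the (normalized) format's extension. A hedged sketch of what that means in practice (directory names are illustrative):

library(arrow)

pq_dir <- tempfile()
write_dataset(mtcars, pq_dir, format = "parquet")
list.files(pq_dir)  # "part-0.parquet"

ipc_dir <- tempfile()
write_dataset(mtcars, ipc_dir, format = "ipc")  # alias for "arrow"
list.files(ipc_dir)  # "part-0.arrow"

# An explicit template overrides the default but must contain "{i}"
custom_dir <- tempfile()
write_dataset(mtcars, custom_dir, format = "ipc", basename_template = "chunk-{i}.data")
list.files(custom_dir)  # "chunk-0.data"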
5 changes: 3 additions & 2 deletions r/man/write_dataset.Rd

Some generated files are not rendered by default.

42 changes: 39 additions & 3 deletions r/tests/testthat/test-dataset-write.R
@@ -63,7 +63,7 @@ test_that("Writing a dataset: CSV->IPC", {

   # Check whether "int" is present in the files or just in the dirs
   first <- read_feather(
-    dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1],
+    dir(dst_dir, pattern = ".arrow$", recursive = TRUE, full.names = TRUE)[1],
     as_data_frame = FALSE
   )
   # It shouldn't be there
@@ -139,6 +139,40 @@ test_that("Writing a dataset: Parquet->Parquet (default)", {
   )
 })
 
+test_that("Writing a dataset: `basename_template` default behavior", {
+  ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+
+  dst_dir <- make_temp_dir()
+  write_dataset(ds, dst_dir, format = "parquet", max_rows_per_file = 5L)
+  expect_identical(
+    dir(dst_dir, full.names = FALSE, recursive = TRUE),
+    paste0("part-", 0:3, ".parquet")
+  )
+  dst_dir <- make_temp_dir()
+  write_dataset(ds, dst_dir, format = "parquet", basename_template = "{i}.data", max_rows_per_file = 5L)
+  expect_identical(
+    dir(dst_dir, full.names = FALSE, recursive = TRUE),
+    paste0(0:3, ".data")
+  )
+  dst_dir <- make_temp_dir()
+  expect_error(
+    write_dataset(ds, dst_dir, format = "parquet", basename_template = "part-i.parquet"),
+    "basename_template did not contain '\\{i\\}'"
+  )
+  feather_dir <- make_temp_dir()
+  write_dataset(ds, feather_dir, format = "feather", partitioning = "int")
+  expect_identical(
+    dir(feather_dir, full.names = FALSE, recursive = TRUE),
+    sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
+  )
+  ipc_dir <- make_temp_dir()
+  write_dataset(ds, ipc_dir, format = "ipc", partitioning = "int")
+  expect_identical(
+    dir(ipc_dir, full.names = FALSE, recursive = TRUE),
+    sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
+  )
+})
+
 test_that("Writing a dataset: existing data behavior", {
   # This test does not work on Windows because unlink does not immediately
   # delete the data.
@@ -458,8 +492,10 @@ test_that("Writing a dataset: CSV format options", {
 test_that("Dataset writing: unsupported features/input validation", {
   skip_if_not_available("parquet")
   expect_error(write_dataset(4), "You must supply a")
-  expect_error(write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
-    "Field names must be unique")
+  expect_error(
+    write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
+    "Field names must be unique"
+  )
 
   ds <- open_dataset(hive_dir)
   expect_error(
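For completeness, files written under the new extension round-trip through `open_dataset()`, which accepts the same `feather`/`ipc` aliases on the read side (a sketch under that assumption):

library(arrow)

d <- tempfile()
write_dataset(mtcars, d, format = "ipc", partitioning = "cyl")
# Hive-style dirs containing .arrow files, e.g. cyl=4/part-0.arrow
ds <- open_dataset(d, format = "feather")
nrow(ds)  # 32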
