-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
ARROW-15260: [R] open_dataset - add file_name as column #12826
Changes from 25 commits
bbc0224
b43947d
a379e14
095762b
814093d
40c6f44
01b3001
138bcd1
a3e6c66
c5d42c2
5d09a61
ca96473
5a7532e
23e476e
92411e4
20f1c57
765d834
51ae21c
b44e1e2
b734085
7d9d356
3966f9c
671edf5
6ac5e8a
4c16978
620df47
2521a43
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
# Register dplyr bindings for "augmented" (metadata-derived) columns.
#
# Currently this registers a single binding, `add_filename()`, which
# translates to an Expression field reference on the special "__filename"
# field (added in ARROW-15260; presumably populated by the dataset
# scanner — confirm against the C++ side).
register_bindings_augmented <- function() {
  filename_binding <- function() {
    Expression$field_ref("__filename")
  }
  register_binding("add_filename", filename_binding)
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -134,6 +134,10 @@ read_compressed_error <- function(e) { | |
stop(e) | ||
} | ||
|
||
# This function was refactored in ARROW-15260 to only raise an error if | ||
# the appropriate string was found and so errors must be raised manually after | ||
# calling this if matching error not found | ||
# TODO: Refactor as part of ARROW-17355 to prevent potential missed errors | ||
handle_parquet_io_error <- function(e, format, call) { | ||
msg <- conditionMessage(e) | ||
if (grepl("Parquet magic bytes not found in footer", msg) && length(format) > 1 && is_character(format)) { | ||
|
@@ -143,8 +147,8 @@ handle_parquet_io_error <- function(e, format, call) { | |
msg, | ||
i = "Did you mean to specify a 'format' other than the default (parquet)?" | ||
) | ||
abort(msg, call = call) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why these changes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh I see, so that you could chain together multiple of these. I suspect we could have a better developer experience, this change makes it easy to accidentally swallow all other errors. Could you make a followup jira for this, and leave a big comment above these? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have created ARROW-17355 and added a comment now. |
||
} | ||
abort(msg, call = call) | ||
} | ||
|
||
as_writable_table <- function(x) { | ||
|
@@ -205,6 +209,10 @@ repeat_value_as_array <- function(object, n) { | |
return(Scalar$create(object)$as_array(n)) | ||
} | ||
|
||
# This function was refactored in ARROW-15260 to only raise an error if | ||
# the appropriate string was found and so errors must be raised manually after | ||
# calling this if matching error not found | ||
# TODO: Refactor as part of ARROW-17355 to prevent potential missed errors | ||
handle_csv_read_error <- function(e, schema, call) { | ||
msg <- conditionMessage(e) | ||
|
||
|
@@ -217,8 +225,27 @@ handle_csv_read_error <- function(e, schema, call) { | |
"header being read in as data." | ||
) | ||
) | ||
abort(msg, call = call) | ||
} | ||
} | ||
|
||
# This function only raises an error if | ||
# the appropriate string was found and so errors must be raised manually after | ||
# calling this if matching error not found | ||
# TODO: Refactor as part of ARROW-17355 to prevent potential missed errors | ||
handle_augmented_field_misuse <- function(e, call) { | ||
msg <- conditionMessage(e) | ||
if (grepl("No match for FieldRef.Name(__filename)", msg, fixed = TRUE)) { | ||
msg <- c( | ||
msg, | ||
i = paste( | ||
"Augmented fields such as 'filename' must", | ||
"only be used with with Dataset objects which have", | ||
"not been aggregated or joined." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wordsmithing here, how about something like "'filename' can only be used with Dataset objects, and it can only be added before doing an aggregation or a join"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a lot clearer, thank you! I've updated the error to incorporate both that, and the two different ways of referring to the |
||
) | ||
) | ||
abort(msg, call = call) | ||
} | ||
abort(msg, call = call) | ||
} | ||
|
||
is_compressed <- function(compression) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1131,7 +1131,6 @@ test_that("dataset to C-interface to arrow_dplyr_query with proj/filter", { | |
delete_arrow_array_stream(stream_ptr) | ||
}) | ||
|
||
|
||
test_that("Filter parquet dataset with is.na ARROW-15312", { | ||
ds_path <- make_temp_dir() | ||
|
||
|
@@ -1349,3 +1348,93 @@ test_that("FileSystemFactoryOptions input validation", { | |
fixed = TRUE | ||
) | ||
}) | ||
|
||
test_that("can add in augmented fields", { | ||
nealrichardson marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ds <- open_dataset(hive_dir) | ||
|
||
observed <- ds %>% | ||
mutate(file_name = add_filename()) %>% | ||
collect() | ||
|
||
expect_named( | ||
observed, | ||
c("int", "dbl", "lgl", "chr", "fct", "ts", "group", "other", "file_name") | ||
) | ||
|
||
expect_equal( | ||
sort(unique(observed$file_name)), | ||
list.files(hive_dir, full.names = TRUE, recursive = TRUE) | ||
) | ||
|
||
error_regex <- paste( | ||
"Augmented fields such as 'filename' must", | ||
"only be used with with Dataset objects which have", | ||
"not been aggregated or joined." | ||
) | ||
|
||
# errors appropriately with ArrowTabular objects | ||
expect_error( | ||
arrow_table(mtcars) %>% | ||
mutate(file = add_filename()) %>% | ||
collect(), | ||
regexp = error_regex | ||
) | ||
|
||
# errors appropriately with aggregation | ||
expect_error( | ||
ds %>% | ||
summarise(max_int = max(int)) %>% | ||
mutate(file_name = add_filename()) %>% | ||
collect(), | ||
regexp = error_regex | ||
) | ||
|
||
# joins to tables | ||
another_table <- select(example_data, int, dbl2) | ||
expect_error( | ||
ds %>% | ||
left_join(another_table, by = "int") %>% | ||
mutate(file = add_filename()) %>% | ||
collect(), | ||
regexp = error_regex | ||
) | ||
|
||
# and on joins to datasets | ||
another_dataset <- write_dataset(another_table, "another_dataset") | ||
expect_error( | ||
ds %>% | ||
left_join(open_dataset("another_dataset"), by = "int") %>% | ||
mutate(file = add_filename()) %>% | ||
collect(), | ||
regexp = error_regex | ||
) | ||
|
||
# this hits the implicit_schema path by joining afterwards | ||
join_after <- ds %>% | ||
mutate(file = add_filename()) %>% | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indentation here and on the next example are off |
||
left_join(open_dataset("another_dataset"), by = "int") %>% | ||
collect() | ||
|
||
expect_named( | ||
join_after, | ||
c("int", "dbl", "lgl", "chr", "fct", "ts", "group", "other", "file", "dbl2") | ||
) | ||
|
||
expect_equal( | ||
sort(unique(join_after$file)), | ||
list.files(hive_dir, full.names = TRUE, recursive = TRUE) | ||
) | ||
|
||
# another test on the explicit_schema path | ||
summarise_after <- ds %>% | ||
mutate(file = add_filename()) %>% | ||
group_by(file) %>% | ||
summarise(max_int = max(int)) %>% | ||
collect() | ||
|
||
expect_equal( | ||
sort(summarise_after$file), | ||
list.files(hive_dir, full.names = TRUE, recursive = TRUE) | ||
) | ||
|
||
}) | ||
thisisnic marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this worked? Great! Do you want to add a comment here explaining what's going on, or encapsulate this augmented column business into a well named function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, was very pleased that it was this simple in the end! Comment added.