From eb48ffc4f950e33c3c75cbc7eaf446d5bfbceb41 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Tue, 2 Apr 2024 14:00:49 -0500 Subject: [PATCH 01/33] WIP: Use bigrquerystorage for downloads --- DESCRIPTION | 3 +- NEWS.md | 5 +++ R/bq-download.R | 42 ++++++++++++++++++++++++- R/dbi-result.R | 1 + R/dplyr.R | 37 ++++++++++++++++------ R/utils.R | 4 +++ man/bq_table_download.Rd | 15 ++++++++- man/src_bigquery.Rd | 2 +- tests/testthat/_snaps/dbi-connection.md | 2 +- tests/testthat/test-bq-download.R | 13 ++++---- tests/testthat/test-bq-parse.R | 4 +-- tests/testthat/test-bq-table.R | 16 +++++----- 12 files changed, 113 insertions(+), 31 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 340a2216..0b5e4b6a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,8 @@ Suggests: sodium, testthat (>= 3.1.5), wk (>= 0.3.2), - withr + withr, + bigrquerystorage LinkingTo: cli, cpp11, diff --git a/NEWS.md b/NEWS.md index 3e526903..28c7c4ec 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # bigrquery (development version) +* If the bigrquerystorage package is installed, `bq_table_download()` (and + hence basically every function that downloads data from BigQuery) will + use it. This will considerably improve the speed of downloading large + datasets. + # bigrquery 1.5.1 * Forward compatibility with upcoming dbplyr release (#601). diff --git a/R/bq-download.R b/R/bq-download.R index 126d1dcf..e1259fc7 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -1,5 +1,6 @@ #' Download table data #' +#' This function provides two ways to download #' This retrieves rows in chunks of `page_size`. It is most suitable for results #' of smaller queries (<100 MB, say). For larger queries, it is better to #' export the results to a CSV file stored on google cloud and use the @@ -43,19 +44,27 @@ #' @param start_index Starting row index (zero-based). #' @param max_connections Number of maximum simultaneous connections to #' BigQuery servers. +#' @param api Which API to use? The `"json"` API works where ever bigrquery +#' does, but is slow and can require fiddling with the `page_size` parameter. +#' The `"arrow"` API is faster and more reliable, but only works if you +#' have also installed the bigrquerystorage package. +#' +#' Because the `"arrow"` API is so much faster, it will be used automatically +#' if the bigrquerystorage package is installed. #' @inheritParams api-job #' @param bigint The R type that BigQuery's 64-bit integer types should be #' mapped to. The default is `"integer"`, which returns R's `integer` type, #' but results in `NA` for values above/below +/- 2147483647. `"integer64"` #' returns a [bit64::integer64], which allows the full range of 64 bit #' integers. +#' @param billing Identifier of project to bill. #' @param max_results `r lifecycle::badge("deprecated")` Deprecated. Please use #' `n_max` instead. 
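#'
#' As a sketch (assuming the bigrquerystorage package is installed and that
#' you can bill `bq_test_project()`), either API can also be requested
#' explicitly:
#'
#' ```
#' tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases")
#' # JSON API: no extra dependencies; paging controlled by `page_size`
#' bq_table_download(tb, n_max = 100, page_size = 50, api = "json")
#' # Arrow API: needs bigrquerystorage; bill a project you control
#' bq_table_download(tb, n_max = 100, api = "arrow", billing = bq_test_project())
#' ```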
#' @section Google BigQuery API documentation: #' * [list](https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/list) #' @export #' @examplesIf bq_testable() -#' df <- bq_table_download("publicdata.samples.natality", n_max = 35000) +#' df <- bq_table_download("publicdata.samples.natality", n_max = 35000, billing = bq_test_project()) bq_table_download <- function(x, n_max = Inf, @@ -64,6 +73,8 @@ bq_table_download <- max_connections = 6L, quiet = NA, bigint = c("integer", "integer64", "numeric", "character"), + api = c("json", "arrow"), + billing = x$project, max_results = deprecated()) { x <- as_bq_table(x) check_number_whole(n_max, min = 0, allow_infinite = TRUE) @@ -71,6 +82,13 @@ bq_table_download <- check_number_whole(max_connections, min = 1) quiet <- check_quiet(quiet) bigint <- arg_match(bigint) + + if (missing(api)) { + api <- if (has_bigrquerystorage()) "arrow" else "json" + } else { + api <- arg_match(api) + } + if (lifecycle::is_present(max_results)) { lifecycle::deprecate_warn( "1.4.0", "bq_table_download(max_results)", "bq_table_download(n_max)" @@ -78,6 +96,28 @@ bq_table_download <- n_max <- max_results } + if (api == "arrow") { + check_installed("bigrquerystorage", "required to download using arrow API") + if (!missing(page_size)) { + cli::cli_warn('{.arg page_size} is ignored when {.code api == "arrow"}') + } + if (!missing(start_index)) { + cli::cli_warn('{.arg start_index} is ignored when {.code api == "arrow"}') + } + if (!missing(max_connections)) { + cli::cli_warn('{.arg max_connections} is ignored when {.code api == "arrow"}') + } + + return(bigrquerystorage::bqs_table_download( + x = toString(x), + parent = billing, + n_max = n_max, + quiet = quiet, + bigint = bigint, + as_tibble = TRUE + )) + } + params <- set_row_params( nrow = bq_table_nrow(x), n_max = n_max, diff --git a/R/dbi-result.R b/R/dbi-result.R index e5248933..7ba63b4d 100644 --- a/R/dbi-result.R +++ b/R/dbi-result.R @@ -105,6 +105,7 @@ setMethod( n <- res@cursor$left() } + # TODO: figure out what how to ignore pagination here data <- bq_table_download(res@bq_table, n_max = n, start_index = res@cursor$cur(), diff --git a/R/dplyr.R b/R/dplyr.R index 6044a441..aae8411e 100644 --- a/R/dplyr.R +++ b/R/dplyr.R @@ -20,14 +20,18 @@ #' # set up for billing #' con <- DBI::dbConnect(bigquery(), project = bq_test_project()) #' -#' shakespeare <- con %>% tbl("publicdata.samples.shakespeare") +#' shakespeare <- con %>% tbl(I("publicdata.samples.shakespeare")) #' shakespeare #' shakespeare %>% #' group_by(word) %>% #' summarise(n = sum(word_count, na.rm = TRUE)) %>% #' arrange(desc(n)) #' } -src_bigquery <- function(project, dataset, billing = project, max_pages = 10) { +src_bigquery <- function(project, + dataset, + billing = project, + api = c("json", "arrow"), + max_pages = 10) { check_installed("dbplyr") con <- DBI::dbConnect( @@ -127,6 +131,7 @@ collect.tbl_BigQueryConnection <- function(x, ..., check_bool(warn_incomplete) con <- dbplyr::remote_con(x) + billing <- con@billing if (op_can_download(x)) { lq <- x$lazy_query @@ -136,7 +141,6 @@ collect.tbl_BigQueryConnection <- function(x, ..., } else { sql <- dbplyr::db_sql_render(con, x) - billing <- con@billing if (is.null(con@dataset)) { tb <- bq_project_query(billing, sql, quiet = con@quiet, ...) 
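      # No default dataset on the connection: the query above is run (and
      # billed) at the project level; the branch below instead scopes the
      # query to the connection's dataset via bq_dataset_query().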
} else { @@ -147,13 +151,26 @@ collect.tbl_BigQueryConnection <- function(x, ..., quiet <- if (n < 100) TRUE else con@quiet bigint <- con@bigint %||% "integer" - out <- bq_table_download(tb, - n_max = n, - page_size = page_size, - quiet = quiet, - max_connections = max_connections, - bigint = bigint - ) + + if (has_bigrquerystorage()) { + out <- bq_table_download(tb, + n_max = n, + quiet = quiet, + bigint = bigint, + billing = billing, + api = "arrow" + ) + } else { + out <- bq_table_download(tb, + n_max = n, + page_size = page_size, + quiet = quiet, + max_connections = max_connections, + bigint = bigint, + api = "json" + ) + } + dplyr::grouped_df(out, intersect(dbplyr::op_grps(x), names(out))) } diff --git a/R/utils.R b/R/utils.R index 98e872fd..1e84f9ff 100644 --- a/R/utils.R +++ b/R/utils.R @@ -71,3 +71,7 @@ as_query <- function(x, error_arg = caller_arg(x), error_call = caller_env()) { check_string(x, arg = error_arg, call = error_call) x } + +has_bigrquerystorage <- function() { + is_installed("bigrquerystorage") +} diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index 90970863..928627ef 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -12,6 +12,8 @@ bq_table_download( max_connections = 6L, quiet = NA, bigint = c("integer", "integer64", "numeric", "character"), + api = c("json", "arrow"), + billing = x$project, max_results = deprecated() ) } @@ -44,6 +46,16 @@ but results in \code{NA} for values above/below +/- 2147483647. \code{"integer64 returns a \link[bit64:bit64-package]{bit64::integer64}, which allows the full range of 64 bit integers.} +\item{api}{Which API to use? The \code{"json"} API works where ever bigrquery +does, but is slow and can require fiddling with the \code{page_size} parameter. +The \code{"arrow"} API is faster and more reliable, but only works if you +have also installed the bigrquerystorage package. + +Because the \code{"arrow"} API is so much faster, it will be used automatically +if the bigrquerystorage package is installed.} + +\item{billing}{Identifier of project to bill.} + \item{max_results}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Deprecated. Please use \code{n_max} instead.} } @@ -54,6 +66,7 @@ a tibble. If you need a \code{data.frame}, coerce the results with \code{\link[=as.data.frame]{as.data.frame()}}. } \description{ +This function provides two ways to download This retrieves rows in chunks of \code{page_size}. It is most suitable for results of smaller queries (<100 MB, say). For larger queries, it is better to export the results to a CSV file stored on google cloud and use the @@ -94,6 +107,6 @@ nested/repeated values, are not well supported in R. 
\examples{ \dontshow{if (bq_testable()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -df <- bq_table_download("publicdata.samples.natality", n_max = 35000) +df <- bq_table_download("publicdata.samples.natality", n_max = 35000, billing = bq_test_project()) \dontshow{\}) # examplesIf} } diff --git a/man/src_bigquery.Rd b/man/src_bigquery.Rd index 39a2b1bf..36ef88b6 100644 --- a/man/src_bigquery.Rd +++ b/man/src_bigquery.Rd @@ -31,7 +31,7 @@ library(dplyr) # set up for billing con <- DBI::dbConnect(bigquery(), project = bq_test_project()) -shakespeare <- con \%>\% tbl("publicdata.samples.shakespeare") +shakespeare <- con \%>\% tbl(I("publicdata.samples.shakespeare")) shakespeare shakespeare \%>\% group_by(word) \%>\% diff --git a/tests/testthat/_snaps/dbi-connection.md b/tests/testthat/_snaps/dbi-connection.md index 724fa1cb..67c560a7 100644 --- a/tests/testthat/_snaps/dbi-connection.md +++ b/tests/testthat/_snaps/dbi-connection.md @@ -48,7 +48,7 @@ Code DBI::dbReadTable(con, "natality", n_max = 10) Condition - Error in `as_bq_table()`: + Error in `bigrquery::as_bq_table()`: ! `name` ("natality") must have 2 or 3 components if the connection doesn't have a dataset. # can create bq_table from connection + name diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index e8f64576..04ad531d 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -3,8 +3,8 @@ test_that("same results regardless of page size", { tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df3 <- bq_table_download(tb, n_max = 30, page_size = 10) - df1 <- bq_table_download(tb, n_max = 30, page_size = 30) + df3 <- bq_table_download(tb, n_max = 30, page_size = 10, api = "json") + df1 <- bq_table_download(tb, n_max = 30, page_size = 30, api = "json") expect_equal(nrow(df1), 30) expect_equal(df1, df3) }) @@ -13,7 +13,7 @@ test_that("can retrieve fraction of page size", { skip_if_no_auth() tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df <- bq_table_download(tb, n_max = 15, page_size = 10) + df <- bq_table_download(tb, n_max = 15, page_size = 10, api = "json") expect_equal(nrow(df), 15) }) @@ -21,7 +21,7 @@ test_that("can retrieve zero rows", { skip_if_no_auth() tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df <- bq_table_download(tb, n_max = 0) + df <- bq_table_download(tb, n_max = 0, api = "json") expect_equal(nrow(df), 0) expect_named(df, c("phase", "phase_emoji", "peak_datetime")) }) @@ -34,7 +34,7 @@ test_that("can specify large integers in page params", { withr::local_options(list(scipen = -4)) tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df <- bq_table_download(tb, n_max = 100, page_size = 20) + df <- bq_table_download(tb, n_max = 100, page_size = 20, api = "json") expect_equal(nrow(df), 100) }) @@ -49,7 +49,8 @@ test_that("errors when table is known to be incomplete", { tb, n_max = 35000, page_size = 35000, - bigint = "integer64" + bigint = "integer64", + api = "json" ), transform = function(x) { gsub("[0-9,]+ rows were received", "{n} rows were received", x, perl = TRUE) diff --git a/tests/testthat/test-bq-parse.R b/tests/testthat/test-bq-parse.R index 93328768..70ac23b4 100644 --- a/tests/testthat/test-bq-parse.R +++ b/tests/testthat/test-bq-parse.R @@ -128,11 +128,11 @@ test_that("can parse nested structures", { test_that("can parse empty arrays", { tb <- bq_project_query(bq_test_project(), "SELECT ARRAY[] as x") - df <- bq_table_download(tb) + df <- 
bq_table_download(tb, api = "json") expect_equal(df$x, list(integer(length = 0))) tb <- bq_project_query(bq_test_project(), "SELECT ARRAY>[] as x") - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_equal(df$x, list(tibble::tibble(a = integer(length = 0), b = character()))) }) diff --git a/tests/testthat/test-bq-table.R b/tests/testthat/test-bq-table.R index 2400ee6e..76b30396 100644 --- a/tests/testthat/test-bq-table.R +++ b/tests/testthat/test-bq-table.R @@ -38,7 +38,7 @@ test_that("can round trip to non-default location", { bq_df <- bq_table(dallas, "df") bq_table_upload(bq_df, df1) - df2 <- bq_table_download(bq_df) + df2 <- bq_table_download(bq_df, api = "json") df2 <- df2[order(df2$x), names(df1)] # BQ doesn't guarantee order rownames(df2) <- NULL @@ -54,7 +54,7 @@ test_that("can roundtrip via save + load", { defer(gs_object_delete(gs)) bq_table_load(tb2, gs) - df <- bq_table_download(tb2) + df <- bq_table_download(tb2, api = "json") expect_equal(dim(df), c(32, 11)) }) @@ -79,7 +79,7 @@ test_that("can round trip atomic vectors", { bq_df <- bq_test_table() bq_table_upload(bq_df, df1) - df2 <- bq_table_download(bq_df, bigint = "integer") + df2 <- bq_table_download(bq_df, bigint = "integer", api = "json") df2 <- df2[order(df2[[1]]), names(df1)] # BQ doesn't gaurantee order rownames(df2) <- NULL @@ -94,7 +94,7 @@ test_that("can round-trip POSIXt to either TIMESTAMP or DATETIME", { bq_fields(list(bq_field("datetime", "TIMESTAMP"))) ) bq_table_upload(tb1, df) - df1 <- bq_table_download(tb1) + df1 <- bq_table_download(tb1, api = "json") expect_equal(df1, df) tb2 <- bq_table_create( @@ -102,7 +102,7 @@ test_that("can round-trip POSIXt to either TIMESTAMP or DATETIME", { bq_fields(list(bq_field("datetime", "DATETIME"))) ) bq_table_upload(tb2, df) - df2 <- bq_table_download(tb2) + df2 <- bq_table_download(tb2, api = "json") expect_equal(df2, df) }) @@ -117,7 +117,7 @@ test_that("can round trip data frame with list-cols", { ) bq_table_upload(tb, df1) - df2 <- bq_table_download(tb, bigint = "integer") + df2 <- bq_table_download(tb, bigint = "integer", api = "json") # restore column order df2 <- df2[names(df1)] df2$struct[[1]] <- df2$struct[[1]][c("x", "y", "z")] @@ -164,7 +164,7 @@ test_that("can round-trip GEOGRAPHY", { tb1 <- bq_table_create(bq_test_table(), as_bq_fields(df)) bq_table_upload(tb1, df) - df1 <- bq_table_download(tb1) + df1 <- bq_table_download(tb1, api = "json") expect_equal(df1, df) }) @@ -173,6 +173,6 @@ test_that("can round-trip BYTES", { tb1 <- bq_table_create(bq_test_table(), as_bq_fields(df)) bq_table_upload(tb1, df) - df1 <- bq_table_download(tb1) + df1 <- bq_table_download(tb1, api = "json") expect_equal(df1, df) }) From 1b95bf3a0c0af8f7e218c542701f8e07152f44ee Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Wed, 10 Apr 2024 13:46:40 -0500 Subject: [PATCH 02/33] Pass bq-download tests --- tests/testthat/_snaps/bq-download.md | 3 ++- tests/testthat/test-bq-download.R | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/testthat/_snaps/bq-download.md b/tests/testthat/_snaps/bq-download.md index bc9eed7a..b9496982 100644 --- a/tests/testthat/_snaps/bq-download.md +++ b/tests/testthat/_snaps/bq-download.md @@ -1,7 +1,8 @@ # errors when table is known to be incomplete Code - bq_table_download(tb, n_max = 35000, page_size = 35000, bigint = "integer64") + bq_table_download(tb, n_max = 35000, page_size = 35000, bigint = "integer64", + api = "json") Message Downloading first chunk of data. 
Condition diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index 04ad531d..b604081b 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -174,7 +174,7 @@ test_that("can convert date time types", { " tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") base <- ISOdatetime(2000, 1, 2, 3, 4, 5.67, tz = "UTC") @@ -198,7 +198,7 @@ test_that("can parse fractional seconds", { test_that("correctly parse logical values" ,{ query <- "SELECT TRUE as x" tb <- bq_project_query(bq_test_project(), query) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_true(df$x) }) @@ -209,18 +209,18 @@ test_that("the return type of integer columns is set by the bigint argument", { qry <- bq_project_query(bq_test_project(), sql) expect_warning( - out_int <- bq_table_download(qry, bigint = "integer")$x, + out_int <- bq_table_download(qry, bigint = "integer", api = "json")$x, "integer overflow" ) expect_identical(out_int, suppressWarnings(as.integer(x))) - out_int64 <- bq_table_download(qry, bigint = "integer64")$x + out_int64 <- bq_table_download(qry, bigint = "integer64", api = "json")$x expect_identical(out_int64, bit64::as.integer64(x)) - out_dbl <- bq_table_download(qry, bigint = "numeric")$x + out_dbl <- bq_table_download(qry, bigint = "numeric", api = "json")$x expect_identical(out_dbl, as.double(x)) - out_chr <- bq_table_download(qry, bigint = "character")$x + out_chr <- bq_table_download(qry, bigint = "character", api = "json")$x expect_identical(out_chr, x) }) @@ -228,7 +228,7 @@ test_that("can convert geography type", { skip_if_not_installed("wk") sql <- "SELECT ST_GEOGFROMTEXT('POINT (30 10)') as geography" tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_identical(df$geography, wk::wkt("POINT(30 10)")) }) @@ -236,7 +236,7 @@ test_that("can convert geography type", { test_that("can convert bytes type", { sql <- "SELECT ST_ASBINARY(ST_GEOGFROMTEXT('POINT (30 10)')) as bytes" tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_identical( df$bytes, From d90ac6ffbd04ba25179374c01877ae1e69c6b03d Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Wed, 10 Apr 2024 13:49:33 -0500 Subject: [PATCH 03/33] Fix doc buglet; re-document --- R/bq-perform.R | 2 +- man/api-perform.Rd | 2 +- man/api-table.Rd | 2 +- man/src_bigquery.Rd | 8 +++++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/bq-perform.R b/R/bq-perform.R index 8085b5dc..ec005098 100644 --- a/R/bq-perform.R +++ b/R/bq-perform.R @@ -192,7 +192,7 @@ export_json <- function(values) { #' Google Cloud. #' #' For Google Cloud Storage URIs: Each URI can contain one -#' `'*'`` wildcard character and it must come after the 'bucket' name. +#' `'*'` wildcard character and it must come after the 'bucket' name. #' Size limits related to load jobs apply to external data sources. #' #' For Google Cloud Bigtable URIs: Exactly one URI can be specified and diff --git a/man/api-perform.Rd b/man/api-perform.Rd index ea6f34df..3ee02a7b 100644 --- a/man/api-perform.Rd +++ b/man/api-perform.Rd @@ -129,7 +129,7 @@ to the table. Google Cloud. For Google Cloud Storage URIs: Each URI can contain one -`'*'`` wildcard character and it must come after the 'bucket' name. 
+\code{'*'} wildcard character and it must come after the 'bucket' name. Size limits related to load jobs apply to external data sources. For Google Cloud Bigtable URIs: Exactly one URI can be specified and diff --git a/man/api-table.Rd b/man/api-table.Rd index 6509d18d..70c8031a 100644 --- a/man/api-table.Rd +++ b/man/api-table.Rd @@ -66,7 +66,7 @@ number of files.} Google Cloud. For Google Cloud Storage URIs: Each URI can contain one -`'*'`` wildcard character and it must come after the 'bucket' name. +\code{'*'} wildcard character and it must come after the 'bucket' name. Size limits related to load jobs apply to external data sources. For Google Cloud Bigtable URIs: Exactly one URI can be specified and diff --git a/man/src_bigquery.Rd b/man/src_bigquery.Rd index 36ef88b6..2cc6c52d 100644 --- a/man/src_bigquery.Rd +++ b/man/src_bigquery.Rd @@ -4,7 +4,13 @@ \alias{src_bigquery} \title{A BigQuery data source for dplyr.} \usage{ -src_bigquery(project, dataset, billing = project, max_pages = 10) +src_bigquery( + project, + dataset, + billing = project, + api = c("json", "arrow"), + max_pages = 10 +) } \arguments{ \item{project}{project id or name} From 6f6783e823c2191e0d324ced629c74c4ce2032db Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Wed, 10 Apr 2024 13:54:27 -0500 Subject: [PATCH 04/33] Final api = 'json' args --- R/dbi-connection.R | 2 +- R/dbi-result.R | 5 +++-- tests/testthat/_snaps/dbi-connection.md | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/dbi-connection.R b/R/dbi-connection.R index a790ba24..18cb4b35 100644 --- a/R/dbi-connection.R +++ b/R/dbi-connection.R @@ -318,7 +318,7 @@ setMethod("dbCreateTable", "BigQueryConnection", dbCreateTable_bq) dbReadTable_bq <- function(conn, name, ...) { tb <- as_bq_table(conn, name) - bq_table_download(tb, ...) + bq_table_download(tb, ..., api = "json") } #' @rdname DBI diff --git a/R/dbi-result.R b/R/dbi-result.R index 7ba63b4d..f9554ef8 100644 --- a/R/dbi-result.R +++ b/R/dbi-result.R @@ -105,13 +105,14 @@ setMethod( n <- res@cursor$left() } - # TODO: figure out what how to ignore pagination here + # TODO: figure out how to ignore pagination here data <- bq_table_download(res@bq_table, n_max = n, start_index = res@cursor$cur(), page_size = res@page_size, bigint = res@bigint, - quiet = res@quiet + quiet = res@quiet, + api = "json" ) res@cursor$adv(nrow(data)) diff --git a/tests/testthat/_snaps/dbi-connection.md b/tests/testthat/_snaps/dbi-connection.md index 67c560a7..724fa1cb 100644 --- a/tests/testthat/_snaps/dbi-connection.md +++ b/tests/testthat/_snaps/dbi-connection.md @@ -48,7 +48,7 @@ Code DBI::dbReadTable(con, "natality", n_max = 10) Condition - Error in `bigrquery::as_bq_table()`: + Error in `as_bq_table()`: ! `name` ("natality") must have 2 or 3 components if the connection doesn't have a dataset. # can create bq_table from connection + name From 36ed1799a076b12e78021889ee4e5b5c24046a31 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:03:24 -0500 Subject: [PATCH 05/33] When possible, Use arrow in dbFetch() --- R/dbi-result.R | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/R/dbi-result.R b/R/dbi-result.R index f9554ef8..dd1fee7a 100644 --- a/R/dbi-result.R +++ b/R/dbi-result.R @@ -100,20 +100,28 @@ setMethod( "dbFetch", "BigQueryResult", function(res, n = -1, ...) 
{ check_number_whole(n, min = -1, allow_infinite = TRUE) - - if (n == -1 || n == Inf) { - n <- res@cursor$left() + if (n == -1) n <- Inf + + if (has_bigrquerystorage() && n == Inf && res@cursor$cur() == 0) { + # If possible, download complete dataset using arrow + data <- bq_table_download(res@bq_table, + bigint = res@bigint, + quiet = res@quiet, + n_max = res@cursor$left(), + api = "arrow" + ) + } else { + # Otherwise, fall back to slower JSON API + data <- bq_table_download(res@bq_table, + n_max = n, + start_index = res@cursor$cur(), + page_size = res@page_size, + bigint = res@bigint, + quiet = res@quiet, + api = "json" + ) } - - # TODO: figure out how to ignore pagination here - data <- bq_table_download(res@bq_table, - n_max = n, - start_index = res@cursor$cur(), - page_size = res@page_size, - bigint = res@bigint, - quiet = res@quiet, - api = "json" - ) + res@cursor$adv(nrow(data)) data From 793a251346ac0338884ff9d1469fcf05ea719b3f Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:25:46 -0500 Subject: [PATCH 06/33] Add API argument to collect() --- R/bq-download.R | 15 +++++--- R/dplyr.R | 27 +++++++++++--- _pkgdown.yml | 1 + man/collect.tbl_BigQueryConnection.Rd | 54 +++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 11 deletions(-) create mode 100644 man/collect.tbl_BigQueryConnection.Rd diff --git a/R/bq-download.R b/R/bq-download.R index e1259fc7..cb33f6d9 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -82,12 +82,7 @@ bq_table_download <- check_number_whole(max_connections, min = 1) quiet <- check_quiet(quiet) bigint <- arg_match(bigint) - - if (missing(api)) { - api <- if (has_bigrquerystorage()) "arrow" else "json" - } else { - api <- arg_match(api) - } + api <- check_api(api) if (lifecycle::is_present(max_results)) { lifecycle::deprecate_warn( @@ -242,6 +237,14 @@ bq_table_download <- parse_postprocess(table_data, bigint = bigint) } +check_api <- function(api = c("json", "arrow"), error_call = caller_env()) { + if (identical(api, c("json", "arrow"))) { + if (has_bigrquerystorage()) "arrow" else "json" + } else { + arg_match(api, error_call = error_call) + } +} + # This function is a modified version of # https://github.com/r-dbi/RPostgres/blob/master/R/PqResult.R parse_postprocess <- function(df, bigint) { diff --git a/R/dplyr.R b/R/dplyr.R index aae8411e..c75875d1 100644 --- a/R/dplyr.R +++ b/R/dplyr.R @@ -120,15 +120,32 @@ db_copy_to.BigQueryConnection <- function(con, # Efficient downloads ----------------------------------------------- # registered onLoad + +#' Collect a BigQuery table +#' +#' This collect method is specialised for BigQuery tables, generating the +#' SQL from your dplyr commands, then calling [bq_project_query()] +#' or [bq_dataset_query()] to run the query, then [bq_download_table()] +#' to download the results. Thus the arguments are a combination of the +#' arguments to [dplyr::collect()], `bq_project_query()`/`bq_dataset_query()`, +#' and `bq_download_table()`. +#' +#' @inheritParams dplyr::collect +#' @inheritParams bq_table_download +#' @param n Maximum number of results to retrieve. +#' The default, `Inf`, will retrieve all rows. +#' @param ... 
Other arguments passed on to +#' `bq_project_query()`/`bq_project_query()` collect.tbl_BigQueryConnection <- function(x, ..., - page_size = NULL, - max_connections = 6L, n = Inf, - warn_incomplete = TRUE) { + api = c("json", "arrow"), + page_size = NULL, + max_connections = 6L + ) { + api <- check_api(api) check_number_whole(n, min = 0, allow_infinite = TRUE) check_number_whole(max_connections, min = 1) - check_bool(warn_incomplete) con <- dbplyr::remote_con(x) billing <- con@billing @@ -152,7 +169,7 @@ collect.tbl_BigQueryConnection <- function(x, ..., quiet <- if (n < 100) TRUE else con@quiet bigint <- con@bigint %||% "integer" - if (has_bigrquerystorage()) { + if (api == "arrow") { out <- bq_table_download(tb, n_max = n, quiet = quiet, diff --git a/_pkgdown.yml b/_pkgdown.yml index beadd04a..d4bda334 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -16,6 +16,7 @@ reference: contents: - src_bigquery - bigquery + - collect.tbl_BigQueryConnection - title: Low-level API contents: diff --git a/man/collect.tbl_BigQueryConnection.Rd b/man/collect.tbl_BigQueryConnection.Rd new file mode 100644 index 00000000..4402deb1 --- /dev/null +++ b/man/collect.tbl_BigQueryConnection.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr.R +\name{collect.tbl_BigQueryConnection} +\alias{collect.tbl_BigQueryConnection} +\title{Collect a BigQuery table} +\usage{ +collect.tbl_BigQueryConnection( + x, + ..., + n = Inf, + api = c("json", "arrow"), + page_size = NULL, + max_connections = 6L +) +} +\arguments{ +\item{x}{A data frame, data frame extension (e.g. a tibble), or a lazy +data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more +details.} + +\item{...}{Other arguments passed on to +\code{bq_project_query()}/\code{bq_project_query()}} + +\item{n}{Maximum number of results to retrieve. +The default, \code{Inf}, will retrieve all rows.} + +\item{api}{Which API to use? The \code{"json"} API works where ever bigrquery +does, but is slow and can require fiddling with the \code{page_size} parameter. +The \code{"arrow"} API is faster and more reliable, but only works if you +have also installed the bigrquerystorage package. + +Because the \code{"arrow"} API is so much faster, it will be used automatically +if the bigrquerystorage package is installed.} + +\item{page_size}{The number of rows requested per chunk. It is recommended to +leave this unspecified until you have evidence that the \code{page_size} +selected automatically by \code{bq_table_download()} is problematic. + +When \code{page_size = NULL} bigrquery determines a conservative, natural chunk +size empirically. If you specify the \code{page_size}, it is important that each +chunk fits on one page, i.e. that the requested row limit is low enough to +prevent the API from paginating based on response size.} + +\item{max_connections}{Number of maximum simultaneous connections to +BigQuery servers.} +} +\description{ +This collect method is specialised for BigQuery tables, generating the +SQL from your dplyr commands, then calling \code{\link[=bq_project_query]{bq_project_query()}} +or \code{\link[=bq_dataset_query]{bq_dataset_query()}} to run the query, then \code{\link[=bq_download_table]{bq_download_table()}} +to download the results. Thus the arguments are a combination of the +arguments to \code{\link[dplyr:compute]{dplyr::collect()}}, \code{bq_project_query()}/\code{bq_dataset_query()}, +and \code{bq_download_table()}. 
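For example (a sketch assuming the \code{bq_test_project()} test helper and a
readable public dataset):

\preformatted{
library(dplyr)
con <- DBI::dbConnect(bigquery(), project = bq_test_project())
shakespeare <- tbl(con, I("publicdata.samples.shakespeare"))
collect(shakespeare, n = 100, api = "json")
}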
+} From c68a49deac6eb8020a5bd477ad335fbc4d52246f Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:32:49 -0500 Subject: [PATCH 07/33] Clarify bug workaround --- R/dbi-result.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/dbi-result.R b/R/dbi-result.R index dd1fee7a..f4543717 100644 --- a/R/dbi-result.R +++ b/R/dbi-result.R @@ -103,11 +103,14 @@ setMethod( if (n == -1) n <- Inf if (has_bigrquerystorage() && n == Inf && res@cursor$cur() == 0) { + # https://github.com/meztez/bigrquerystorage/issues/48 + n <- res@cursor$left() + # If possible, download complete dataset using arrow data <- bq_table_download(res@bq_table, + n_max = n, bigint = res@bigint, quiet = res@quiet, - n_max = res@cursor$left(), api = "arrow" ) } else { From 9d1f36d4be96b70906abf87edfb88428a88ec59b Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:34:03 -0500 Subject: [PATCH 08/33] R CMD check fixes --- R/dplyr.R | 5 ++--- man/collect.tbl_BigQueryConnection.Rd | 4 ++-- man/src_bigquery.Rd | 8 +------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/R/dplyr.R b/R/dplyr.R index c75875d1..47a7b38e 100644 --- a/R/dplyr.R +++ b/R/dplyr.R @@ -30,7 +30,6 @@ src_bigquery <- function(project, dataset, billing = project, - api = c("json", "arrow"), max_pages = 10) { check_installed("dbplyr") @@ -125,10 +124,10 @@ db_copy_to.BigQueryConnection <- function(con, #' #' This collect method is specialised for BigQuery tables, generating the #' SQL from your dplyr commands, then calling [bq_project_query()] -#' or [bq_dataset_query()] to run the query, then [bq_download_table()] +#' or [bq_dataset_query()] to run the query, then [bq_table_download()] #' to download the results. Thus the arguments are a combination of the #' arguments to [dplyr::collect()], `bq_project_query()`/`bq_dataset_query()`, -#' and `bq_download_table()`. +#' and `bq_table_download()`. #' #' @inheritParams dplyr::collect #' @inheritParams bq_table_download diff --git a/man/collect.tbl_BigQueryConnection.Rd b/man/collect.tbl_BigQueryConnection.Rd index 4402deb1..4f68161a 100644 --- a/man/collect.tbl_BigQueryConnection.Rd +++ b/man/collect.tbl_BigQueryConnection.Rd @@ -47,8 +47,8 @@ BigQuery servers.} \description{ This collect method is specialised for BigQuery tables, generating the SQL from your dplyr commands, then calling \code{\link[=bq_project_query]{bq_project_query()}} -or \code{\link[=bq_dataset_query]{bq_dataset_query()}} to run the query, then \code{\link[=bq_download_table]{bq_download_table()}} +or \code{\link[=bq_dataset_query]{bq_dataset_query()}} to run the query, then \code{\link[=bq_table_download]{bq_table_download()}} to download the results. Thus the arguments are a combination of the arguments to \code{\link[dplyr:compute]{dplyr::collect()}}, \code{bq_project_query()}/\code{bq_dataset_query()}, -and \code{bq_download_table()}. +and \code{bq_table_download()}. 
} diff --git a/man/src_bigquery.Rd b/man/src_bigquery.Rd index 2cc6c52d..36ef88b6 100644 --- a/man/src_bigquery.Rd +++ b/man/src_bigquery.Rd @@ -4,13 +4,7 @@ \alias{src_bigquery} \title{A BigQuery data source for dplyr.} \usage{ -src_bigquery( - project, - dataset, - billing = project, - api = c("json", "arrow"), - max_pages = 10 -) +src_bigquery(project, dataset, billing = project, max_pages = 10) } \arguments{ \item{project}{project id or name} From c28f1801ef4591544974a144e3cb1f3b16c97e5b Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:34:34 -0500 Subject: [PATCH 09/33] use_tidy_description() --- DESCRIPTION | 4 +- tests/testthat/_snaps/dbi-connection.new.md | 77 +++++++++++++++++++++ tests/testthat/_snaps/dbi-result.new.md | 23 ++++++ 3 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/_snaps/dbi-connection.new.md create mode 100644 tests/testthat/_snaps/dbi-result.new.md diff --git a/DESCRIPTION b/DESCRIPTION index 0b5e4b6a..d43737bc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,6 +30,7 @@ Imports: rlang (>= 1.1.0), tibble Suggests: + bigrquerystorage, blob, covr, dbplyr (>= 2.4.0), @@ -38,9 +39,8 @@ Suggests: readr, sodium, testthat (>= 3.1.5), - wk (>= 0.3.2), withr, - bigrquerystorage + wk (>= 0.3.2) LinkingTo: cli, cpp11, diff --git a/tests/testthat/_snaps/dbi-connection.new.md b/tests/testthat/_snaps/dbi-connection.new.md new file mode 100644 index 00000000..67c560a7 --- /dev/null +++ b/tests/testthat/_snaps/dbi-connection.new.md @@ -0,0 +1,77 @@ +# useful print with and without dataset + + Code + # With dataset + con1 + Output + + Dataset: p.x + Billing: b + Code + # Without dataset + con2 + Output + + Billing: p + +# dbQuoteIdentifier validates inputs + + Code + DBI::dbQuoteIdentifier(con, c("x", NA)) + Condition + Error in `DBI::dbQuoteIdentifier()`: + ! `x` must not contain missing values. + +# dbWriteTable errors on unsupported arguments + + Code + DBI::dbWriteTable(con, "x", df, field.types = list()) + Condition + Error in `DBI::dbWriteTable()`: + ! `field.types` not supported by bigrquery. + Code + DBI::dbWriteTable(con, "x", df, temporary = TRUE) + Condition + Error in `DBI::dbWriteTable()`: + ! `temporary = FALSE` not supported by bigrquery. + +# dataset is optional + + Code + DBI::dbListTables(con) + Condition + Error in `DBI::dbListTables()`: + ! Can't list tables without a connection `dataset`. + +--- + + Code + DBI::dbReadTable(con, "natality", n_max = 10) + Condition + Error in `bigrquery::as_bq_table()`: + ! `name` ("natality") must have 2 or 3 components if the connection doesn't have a dataset. + +# can create bq_table from connection + name + + Code + as_bq_table(con1, "x") + Condition + Error in `as_bq_table()`: + ! `name` ("x") must have 2 or 3 components if the connection doesn't have a dataset. + +--- + + Code + as_bq_table(con1, "a.b.c.d") + Condition + Error in `as_bq_table()`: + ! `name` ("a.b.c.d") must have 1-3 components. + +# as_bq_table checks its input types + + Code + as_bq_table(con1, letters) + Condition + Error in `as_bq_table()`: + ! `name` must be a string or a dbplyr_table_ident. + diff --git a/tests/testthat/_snaps/dbi-result.new.md b/tests/testthat/_snaps/dbi-result.new.md new file mode 100644 index 00000000..4ada7d56 --- /dev/null +++ b/tests/testthat/_snaps/dbi-result.new.md @@ -0,0 +1,23 @@ +# can retrieve query in pieces and that quiet is respected + + Code + DBI::dbFetch(res, NA) + Condition + Error in `DBI::dbFetch()`: + ! 
is.numeric(n) is not TRUE + +--- + + Code + df <- DBI::dbFetch(res, 10) + +# can get metadata + + Code + res + Output + + Query: SELECT cyl, mpg FROM mtcars + Has completed: FALSE + Rows fetched: 0 + From 4a18fa241c5b29e125042cecdd0d428c38d28a28 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:39:30 -0500 Subject: [PATCH 10/33] Start tests for arrow api --- tests/testthat/test-bq-download.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index b604081b..1921ebcd 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -59,6 +59,20 @@ test_that("errors when table is known to be incomplete", { ) }) +# api = "arrow" ---------------------------------------------------------------- + +test_that("check_api respects inputs", { + expect_equal(check_api("arrow"), "arrow") + expect_equal(check_api("json"), "json") +}) + +test_that("uses arrow api if bigrquerystorage installed", { + expect_equal(check_api(), "arrow") + + local_mocked_bindings(is_installed = function(...) FALSE) + expect_equal(check_api(), "json") +}) + # helpers around row and chunk params ------------------------------------------ test_that("set_row_params() works ", { From 5590b6d2600f5846d539bbe566e0db880ec8ac2a Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:40:57 -0500 Subject: [PATCH 11/33] Polish news --- NEWS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 28c7c4ec..fc5d66d5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,9 @@ # bigrquery (development version) * If the bigrquerystorage package is installed, `bq_table_download()` (and - hence basically every function that downloads data from BigQuery) will - use it. This will considerably improve the speed of downloading large - datasets. + hence `collect()`, `dbGetQuery()` and `dbFetch()` will use it. This will + drastically improve the speed of downloading large datasets. A big thanks + to @meztez for creating the bigrquerystorage package! # bigrquery 1.5.1 From 5c257186cf0b53a403065d7f52713dda6a38f1a5 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 11 Apr 2024 08:49:46 -0500 Subject: [PATCH 12/33] Polish docs --- R/bq-download.R | 51 ++++++++++++------------- man/bq_table_download.Rd | 55 ++++++++++++--------------- man/collect.tbl_BigQueryConnection.Rd | 10 ++--- 3 files changed, 54 insertions(+), 62 deletions(-) diff --git a/R/bq-download.R b/R/bq-download.R index cb33f6d9..6d067470 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -1,31 +1,28 @@ #' Download table data #' -#' This function provides two ways to download -#' This retrieves rows in chunks of `page_size`. It is most suitable for results -#' of smaller queries (<100 MB, say). For larger queries, it is better to -#' export the results to a CSV file stored on google cloud and use the -#' bq command line tool to download locally. +#' @description +#' This function provides two ways to download data from BigQuery, transfering +#' data using either JSON or arrow, depending on the `api` argument. JSON is +#' much slower but requires no additional dependencies, and is what bigrquery +#' used prior to version 1.6.0. The arrow method is much much faster, but +#' requires the bigrquerystorage, which in turn requires the arrow package. +#' These dependencies are fairly heavy, and can be tricky to compile on Linux, +#' but in our opinion the massive speedup is worth the effort. 
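#'
#' As a rough sketch (assuming bigrquerystorage can be installed from its
#' GitHub repository, meztez/bigrquerystorage), opting in is a one-off step:
#'
#' ```
#' # once: remotes::install_github("meztez/bigrquerystorage")
#' # afterwards bq_table_download() picks api = "arrow" automatically:
#' df <- bq_table_download(x, n_max = Inf)
#' ```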
+#' +#' ## JSON API +#' +#' The JSON API retrieves rows in chunks of `page_size`. It is most suitable +#' for results of smaller queries (<100 MB, say). Unfortunately due to +#' limitations in the BigQuery API, you may need to vary this parameter +#' depending on the complexity of the underlying data. #' -#' @section Complex data: -#' bigrquery will retrieve nested and repeated columns in to list-columns +#' The JSON API will convert nested and repeated columns in to list-columns #' as follows: #' #' * Repeated values (arrays) will become a list-column of vectors. #' * Records will become list-columns of named lists. #' * Repeated records will become list-columns of data frames. #' -#' @section Larger datasets: -#' In my timings, this code takes around 1 minute per 100 MB of data. -#' If you need to download considerably more than this, I recommend: -#' -#' * Export a `.csv` file to Cloud Storage using [bq_table_save()]. -#' * Use the `gsutil` command line utility to download it. -#' * Read the csv file into R with `readr::read_csv()` or `data.table::fread()`. -#' -#' Unfortunately you can not export nested or repeated formats into CSV, and -#' the formats that BigQuery supports (arvn and ndjson) that allow for -#' nested/repeated values, are not well supported in R. -#' #' @return Because data retrieval may generate list-columns and the `data.frame` #' print method can have problems with list-columns, this method returns #' a tibble. If you need a `data.frame`, coerce the results with @@ -33,17 +30,17 @@ #' @param x A [bq_table] #' @param n_max Maximum number of results to retrieve. Use `Inf` to retrieve all #' rows. -#' @param page_size The number of rows requested per chunk. It is recommended to -#' leave this unspecified until you have evidence that the `page_size` -#' selected automatically by `bq_table_download()` is problematic. +#' @param page_size (JSON only) The number of rows requested per chunk. It is +#' recommended to leave this unspecified until you have evidence that the +#' `page_size` selected automatically by `bq_table_download()` is problematic. #' #' When `page_size = NULL` bigrquery determines a conservative, natural chunk #' size empirically. If you specify the `page_size`, it is important that each #' chunk fits on one page, i.e. that the requested row limit is low enough to #' prevent the API from paginating based on response size. -#' @param start_index Starting row index (zero-based). -#' @param max_connections Number of maximum simultaneous connections to -#' BigQuery servers. +#' @param start_index (JSON only) Starting row index (zero-based). +#' @param max_connections (JSON only) Number of maximum simultaneous +#' connections to BigQuery servers. #' @param api Which API to use? The `"json"` API works where ever bigrquery #' does, but is slow and can require fiddling with the `page_size` parameter. #' The `"arrow"` API is faster and more reliable, but only works if you @@ -57,7 +54,9 @@ #' but results in `NA` for values above/below +/- 2147483647. `"integer64"` #' returns a [bit64::integer64], which allows the full range of 64 bit #' integers. -#' @param billing Identifier of project to bill. +#' @param billing (Arrow only) Project to bill; defaults to the project of `x`, +#' and typically only needs to be specified if you're working with public +#' datasets. #' @param max_results `r lifecycle::badge("deprecated")` Deprecated. Please use #' `n_max` instead. 
#' @section Google BigQuery API documentation: diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index 928627ef..0d6eb2af 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -23,19 +23,19 @@ bq_table_download( \item{n_max}{Maximum number of results to retrieve. Use \code{Inf} to retrieve all rows.} -\item{page_size}{The number of rows requested per chunk. It is recommended to -leave this unspecified until you have evidence that the \code{page_size} -selected automatically by \code{bq_table_download()} is problematic. +\item{page_size}{(JSON only) The number of rows requested per chunk. It is +recommended to leave this unspecified until you have evidence that the +\code{page_size} selected automatically by \code{bq_table_download()} is problematic. When \code{page_size = NULL} bigrquery determines a conservative, natural chunk size empirically. If you specify the \code{page_size}, it is important that each chunk fits on one page, i.e. that the requested row limit is low enough to prevent the API from paginating based on response size.} -\item{start_index}{Starting row index (zero-based).} +\item{start_index}{(JSON only) Starting row index (zero-based).} -\item{max_connections}{Number of maximum simultaneous connections to -BigQuery servers.} +\item{max_connections}{(JSON only) Number of maximum simultaneous +connections to BigQuery servers.} \item{quiet}{If \code{FALSE}, displays progress bar; if \code{TRUE} is silent; if \code{NA} picks based on whether or not you're in an interactive context.} @@ -54,7 +54,9 @@ have also installed the bigrquerystorage package. Because the \code{"arrow"} API is so much faster, it will be used automatically if the bigrquerystorage package is installed.} -\item{billing}{Identifier of project to bill.} +\item{billing}{(Arrow only) Project to bill; defaults to the project of \code{x}, +and typically only needs to be specified if you're working with public +datasets.} \item{max_results}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Deprecated. Please use \code{n_max} instead.} @@ -66,15 +68,21 @@ a tibble. If you need a \code{data.frame}, coerce the results with \code{\link[=as.data.frame]{as.data.frame()}}. } \description{ -This function provides two ways to download -This retrieves rows in chunks of \code{page_size}. It is most suitable for results -of smaller queries (<100 MB, say). For larger queries, it is better to -export the results to a CSV file stored on google cloud and use the -bq command line tool to download locally. -} -\section{Complex data}{ - -bigrquery will retrieve nested and repeated columns in to list-columns +This function provides two ways to download data from BigQuery, transfering +data using either JSON or arrow, depending on the \code{api} argument. JSON is +much slower but requires no additional dependencies, and is what bigrquery +used prior to version 1.6.0. The arrow method is much much faster, but +requires the bigrquerystorage, which in turn requires the arrow package. +These dependencies are fairly heavy, and can be tricky to compile on Linux, +but in our opinion the massive speedup is worth the effort. +\subsection{JSON API}{ + +The JSON API retrieves rows in chunks of \code{page_size}. It is most suitable +for results of smaller queries (<100 MB, say). 
Unfortunately due to +limitations in the BigQuery API, you may need to vary this parameter +depending on the complexity of the underlying data. + +The JSON API will convert nested and repeated columns in to list-columns as follows: \itemize{ \item Repeated values (arrays) will become a list-column of vectors. @@ -82,22 +90,7 @@ as follows: \item Repeated records will become list-columns of data frames. } } - -\section{Larger datasets}{ - -In my timings, this code takes around 1 minute per 100 MB of data. -If you need to download considerably more than this, I recommend: -\itemize{ -\item Export a \code{.csv} file to Cloud Storage using \code{\link[=bq_table_save]{bq_table_save()}}. -\item Use the \code{gsutil} command line utility to download it. -\item Read the csv file into R with \code{readr::read_csv()} or \code{data.table::fread()}. -} - -Unfortunately you can not export nested or repeated formats into CSV, and -the formats that BigQuery supports (arvn and ndjson) that allow for -nested/repeated values, are not well supported in R. } - \section{Google BigQuery API documentation}{ \itemize{ diff --git a/man/collect.tbl_BigQueryConnection.Rd b/man/collect.tbl_BigQueryConnection.Rd index 4f68161a..ac401ae2 100644 --- a/man/collect.tbl_BigQueryConnection.Rd +++ b/man/collect.tbl_BigQueryConnection.Rd @@ -32,17 +32,17 @@ have also installed the bigrquerystorage package. Because the \code{"arrow"} API is so much faster, it will be used automatically if the bigrquerystorage package is installed.} -\item{page_size}{The number of rows requested per chunk. It is recommended to -leave this unspecified until you have evidence that the \code{page_size} -selected automatically by \code{bq_table_download()} is problematic. +\item{page_size}{(JSON only) The number of rows requested per chunk. It is +recommended to leave this unspecified until you have evidence that the +\code{page_size} selected automatically by \code{bq_table_download()} is problematic. When \code{page_size = NULL} bigrquery determines a conservative, natural chunk size empirically. If you specify the \code{page_size}, it is important that each chunk fits on one page, i.e. 
that the requested row limit is low enough to prevent the API from paginating based on response size.} -\item{max_connections}{Number of maximum simultaneous connections to -BigQuery servers.} +\item{max_connections}{(JSON only) Number of maximum simultaneous +connections to BigQuery servers.} } \description{ This collect method is specialised for BigQuery tables, generating the From b8f44a8b50aa67825cc740afee0a4bd6bb56048b Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 18 Apr 2024 15:13:01 -0500 Subject: [PATCH 13/33] Use bigrquerystorage without DBI methods --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d43737bc..2ca64657 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,6 +15,7 @@ BugReports: https://github.com/r-dbi/bigrquery/issues Depends: R (>= 3.6) Imports: + bigrquerystorage (>= 1.1.0), bit64, brio, cli, @@ -30,7 +31,6 @@ Imports: rlang (>= 1.1.0), tibble Suggests: - bigrquerystorage, blob, covr, dbplyr (>= 2.4.0), @@ -82,3 +82,5 @@ Collate: 'import-standalone-types-check.R' 'utils.R' 'zzz.R' +Remotes: + meztez/bigrquerystorage#52 From 7265975f1666882ea7275ff59043ae0c0f0b9f44 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 18 Apr 2024 15:13:47 -0500 Subject: [PATCH 14/33] Wrap DB references in I() --- tests/testthat/test-dplyr.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-dplyr.R b/tests/testthat/test-dplyr.R index b9ffd8ff..2bf86187 100644 --- a/tests/testthat/test-dplyr.R +++ b/tests/testthat/test-dplyr.R @@ -156,8 +156,8 @@ test_that("all BigQuery tbls share the same src", { billing = bq_test_project() ) - tbl1 <- dplyr::tbl(con1, "basedata.mtcars", vars = "x") - tbl2 <- dplyr::tbl(con2, "publicdata.samples.natality", vars = "x") + tbl1 <- dplyr::tbl(con1, I("basedata.mtcars"), vars = "x") + tbl2 <- dplyr::tbl(con2, I("publicdata.samples.natality"), vars = "x") expect_true(dplyr::same_src(tbl1, tbl2)) expect_false(dplyr::same_src(tbl1, mtcars)) }) From 9b52bf39daf27d58818c3f8cc782896a7ef4a490 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 18 Apr 2024 15:15:26 -0500 Subject: [PATCH 15/33] Don't commit snapshots --- tests/testthat/_snaps/dbi-connection.new.md | 77 --------------------- tests/testthat/_snaps/dbi-result.new.md | 23 ------ 2 files changed, 100 deletions(-) delete mode 100644 tests/testthat/_snaps/dbi-connection.new.md delete mode 100644 tests/testthat/_snaps/dbi-result.new.md diff --git a/tests/testthat/_snaps/dbi-connection.new.md b/tests/testthat/_snaps/dbi-connection.new.md deleted file mode 100644 index 67c560a7..00000000 --- a/tests/testthat/_snaps/dbi-connection.new.md +++ /dev/null @@ -1,77 +0,0 @@ -# useful print with and without dataset - - Code - # With dataset - con1 - Output - - Dataset: p.x - Billing: b - Code - # Without dataset - con2 - Output - - Billing: p - -# dbQuoteIdentifier validates inputs - - Code - DBI::dbQuoteIdentifier(con, c("x", NA)) - Condition - Error in `DBI::dbQuoteIdentifier()`: - ! `x` must not contain missing values. - -# dbWriteTable errors on unsupported arguments - - Code - DBI::dbWriteTable(con, "x", df, field.types = list()) - Condition - Error in `DBI::dbWriteTable()`: - ! `field.types` not supported by bigrquery. - Code - DBI::dbWriteTable(con, "x", df, temporary = TRUE) - Condition - Error in `DBI::dbWriteTable()`: - ! `temporary = FALSE` not supported by bigrquery. - -# dataset is optional - - Code - DBI::dbListTables(con) - Condition - Error in `DBI::dbListTables()`: - ! 
Can't list tables without a connection `dataset`. - ---- - - Code - DBI::dbReadTable(con, "natality", n_max = 10) - Condition - Error in `bigrquery::as_bq_table()`: - ! `name` ("natality") must have 2 or 3 components if the connection doesn't have a dataset. - -# can create bq_table from connection + name - - Code - as_bq_table(con1, "x") - Condition - Error in `as_bq_table()`: - ! `name` ("x") must have 2 or 3 components if the connection doesn't have a dataset. - ---- - - Code - as_bq_table(con1, "a.b.c.d") - Condition - Error in `as_bq_table()`: - ! `name` ("a.b.c.d") must have 1-3 components. - -# as_bq_table checks its input types - - Code - as_bq_table(con1, letters) - Condition - Error in `as_bq_table()`: - ! `name` must be a string or a dbplyr_table_ident. - diff --git a/tests/testthat/_snaps/dbi-result.new.md b/tests/testthat/_snaps/dbi-result.new.md deleted file mode 100644 index 4ada7d56..00000000 --- a/tests/testthat/_snaps/dbi-result.new.md +++ /dev/null @@ -1,23 +0,0 @@ -# can retrieve query in pieces and that quiet is respected - - Code - DBI::dbFetch(res, NA) - Condition - Error in `DBI::dbFetch()`: - ! is.numeric(n) is not TRUE - ---- - - Code - df <- DBI::dbFetch(res, 10) - -# can get metadata - - Code - res - Output - - Query: SELECT cyl, mpg FROM mtcars - Has completed: FALSE - Rows fetched: 0 - From fea1b2f78a4227cfcb3ceed79b18dd7ea1c172ae Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 18 Apr 2024 16:20:10 -0500 Subject: [PATCH 16/33] Restore bigrquerystorage to correct place --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2ca64657..57f5f6f2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,6 @@ BugReports: https://github.com/r-dbi/bigrquery/issues Depends: R (>= 3.6) Imports: - bigrquerystorage (>= 1.1.0), bit64, brio, cli, @@ -31,6 +30,7 @@ Imports: rlang (>= 1.1.0), tibble Suggests: + bigrquerystorage (>= 1.1.0), blob, covr, dbplyr (>= 2.4.0), From bb4ff9be1b44c060ea56028ba09d0e1a7e6d0d2b Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 18 Apr 2024 16:20:44 -0500 Subject: [PATCH 17/33] Implement `bq_perform_query_schema()` --- NAMESPACE | 1 + R/bq-perform.R | 73 ++++++++++++++++++++++++++------ man/api-perform.Rd | 9 ++++ tests/testthat/test-bq-perform.R | 12 +++++- 4 files changed, 81 insertions(+), 14 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 5d04491b..dff217d8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -88,6 +88,7 @@ export(bq_perform_extract) export(bq_perform_load) export(bq_perform_query) export(bq_perform_query_dry_run) +export(bq_perform_query_schema) export(bq_perform_upload) export(bq_project_datasets) export(bq_project_jobs) diff --git a/R/bq-perform.R b/R/bq-perform.R index ec005098..b9f1301b 100644 --- a/R/bq-perform.R +++ b/R/bq-perform.R @@ -340,21 +340,14 @@ bq_perform_query_dry_run <- function(query, billing, parameters = NULL, use_legacy_sql = FALSE) { - check_string(query) - check_string(billing) - check_bool(use_legacy_sql) - query <- list( - query = unbox(query), - useLegacySql = unbox(use_legacy_sql) + query <- bq_perform_query_data( + query = query, + billing = billing, + default_dataset = default_dataset, + parameters = parameters, + use_legacy_sql = use_legacy_sql ) - if (!is.null(parameters)) { - parameters <- as_bq_params(parameters) - query$queryParameters <- as_json(parameters) - } - if (!is.null(default_dataset)) { - query$defaultDataset <- datasetReference(default_dataset) - } url <- bq_path(billing, jobs = "") body <- 
list(configuration = list(query = query, dryRun = unbox(TRUE))) @@ -368,6 +361,60 @@ bq_perform_query_dry_run <- function(query, billing, structure(bytes, class = "bq_bytes") } +#' @export +#' @rdname api-perform +bq_perform_query_schema <- function(query, billing, + ..., + default_dataset = NULL, + parameters = NULL) { + + query <- bq_perform_query_data( + query = query, + billing = billing, + default_dataset = default_dataset, + parameters = parameters, + use_legacy_sql = FALSE + ) + + url <- bq_path(billing, jobs = "") + body <- list(configuration = list(query = query, dryRun = unbox(TRUE))) + + res <- bq_post( + url, + body = bq_body(body, ...), + query = list(fields = "statistics") + ) + # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableSchema + res$statistics$query$schema$fields +} + +bq_perform_query_data <- function(query, billing, + ..., + default_dataset = NULL, + parameters = NULL, + use_legacy_sql = FALSE, + call = caller_env()) { + check_string(query, error_call = call) + check_string(billing, error_call = call) + check_bool(use_legacy_sql, error_call = call) + + query <- list( + query = unbox(query), + useLegacySql = unbox(use_legacy_sql) + ) + if (!is.null(parameters)) { + parameters <- as_bq_params(parameters) + query$queryParameters <- as_json(parameters) + } + if (!is.null(default_dataset)) { + query$defaultDataset <- datasetReference(default_dataset) + } + + query +} + + + #' @export #' @rdname api-perform bq_perform_copy <- function(src, dest, diff --git a/man/api-perform.Rd b/man/api-perform.Rd index 3ee02a7b..821f101a 100644 --- a/man/api-perform.Rd +++ b/man/api-perform.Rd @@ -7,6 +7,7 @@ \alias{bq_perform_load} \alias{bq_perform_query} \alias{bq_perform_query_dry_run} +\alias{bq_perform_query_schema} \alias{bq_perform_copy} \title{BigQuery jobs: perform a job} \usage{ @@ -64,6 +65,14 @@ bq_perform_query_dry_run( use_legacy_sql = FALSE ) +bq_perform_query_schema( + query, + billing, + ..., + default_dataset = NULL, + parameters = NULL +) + bq_perform_copy( src, dest, diff --git a/tests/testthat/test-bq-perform.R b/tests/testthat/test-bq-perform.R index 3b804321..9ad0a06d 100644 --- a/tests/testthat/test-bq-perform.R +++ b/tests/testthat/test-bq-perform.R @@ -95,10 +95,20 @@ test_that("can supply array parameters", { expect_setequal(df$values, c("a", "b")) }) -test_that("can estimate cost", { +test_that("can estimate cost and get schema", { cost <- bq_perform_query_dry_run( "SELECT count(*) FROM bigquery-public-data.moon_phases.moon_phases", billing = bq_test_project() ) expect_equal(cost, structure(0, class = "bq_bytes")) + + schema <- bq_perform_query_schema( + "SELECT * FROM bigquery-public-data.moon_phases.moon_phases", + billing = bq_test_project() + ) + names <- vapply(schema, function(x) x$name, character(1)) + expect_equal(names, c("phase", "phase_emoji", "peak_datetime")) + + types <- vapply(schema, function(x) x$type, character(1)) + expect_equal(types, c("STRING", "STRING", "DATETIME")) }) From 17c0d3a02337552d0c0e7bc2bc99017146c6b621 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 18 Apr 2024 16:37:54 -0500 Subject: [PATCH 18/33] Use `bq_perform_query_schema()` to get vars --- NEWS.md | 5 +++++ R/dplyr.R | 12 ++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index fc5d66d5..914549b9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # bigrquery (development version) +* `tbl()` uses a more efficient method to determine variable names. 
+ +* New `bq_perform_query_schema()` to determine the schema of a query + without executing it. + * If the bigrquerystorage package is installed, `bq_table_download()` (and hence `collect()`, `dbGetQuery()` and `dbFetch()` will use it. This will drastically improve the speed of downloading large datasets. A big thanks diff --git a/R/dplyr.R b/R/dplyr.R index 47a7b38e..de9cbcaa 100644 --- a/R/dplyr.R +++ b/R/dplyr.R @@ -48,10 +48,18 @@ src_bigquery <- function(project, tbl.BigQueryConnection <- function(src, from, ...) { src <- dbplyr::src_dbi(src, auto_disconnect = FALSE) + sql <- dbplyr::sql_query_fields(src$con, from) + dataset <- if (!is.null(src$con@dataset)) as_bq_dataset(src$con) + schema <- bq_perform_query_schema(sql, + billing = src$con@billing, + default_dataset = dataset + ) + vars <- map_chr(schema, "[[", "name") + if (utils::packageVersion("dbplyr") >= "2.4.0.9000") { - tbl <- dplyr::tbl(src, from = from) + tbl <- dplyr::tbl(src, from = from, vars = vars) } else { - tbl <- dplyr::tbl(src, from = from, check_from = FALSE) + tbl <- dplyr::tbl(src, from = from, vars = vars, check_from = FALSE) } # This is ugly, but I don't see a better way of doing this From f81a82b7b8fb893a133323e7fe463a7b21092773 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 19 Apr 2024 08:03:28 -0500 Subject: [PATCH 19/33] Add some basic type tests --- tests/testthat/test-bq-download.R | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index 1921ebcd..2d98443b 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -73,6 +73,54 @@ test_that("uses arrow api if bigrquerystorage installed", { expect_equal(check_api(), "json") }) +test_that("can convert date time types", { + sql <- "SELECT + '\U0001f603' as unicode, + datetime, + TRUE as logicaltrue, + FALSE as logicalfalse, + CAST ('Hi' as BYTES) as bytes, + CAST (datetime as DATE) as date, + CAST (datetime as TIME) as time, + CAST (datetime as TIMESTAMP) as timestamp, + ST_GEOGFROMTEXT('POINT (30 10)') as geography + FROM (SELECT DATETIME '2000-01-02 03:04:05.67' as datetime) + " + + tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) + df <- bq_table_download(tb, api = "arrow") + + base <- ISOdatetime(2000, 1, 2, 3, 4, 5.67, tz = "UTC") + expect_identical(df$unicode, "\U0001f603", ignore_encoding = FALSE) + + expect_equal(df$logicaltrue, TRUE) + expect_equal(df$logicalfalse, FALSE) + + expect_equal(unclass(df$bytes), list(as.raw(c(0x48, 0x69)))) + + expect_equal(df$date, as.Date(base)) + expect_equal(df$timestamp, base) + # expect_equal(df$datetime, base) + expect_equal(df$time, hms::hms(hours = 3, minutes = 4, seconds = 5.67)) + + # expect_identical(df$geography, wk::wkt("POINT(30 10)")) +}) + +test_that("the return type of integer columns is set by the bigint argument", { + x <- c("-2147483648", "-2147483647", "-1", "0", "1", "2147483647", "2147483648") + sql <- paste0("SELECT * FROM UNNEST ([", paste0(x, collapse = ","), "]) AS x"); + qry <- bq_project_query(bq_test_project(), sql) + + out_int64 <- bq_table_download(qry, bigint = "integer64", api = "arrow")$x + expect_identical(out_int64, bit64::as.integer64(x)) + + out_dbl <- bq_table_download(qry, bigint = "numeric", api = "arrow")$x + expect_identical(out_dbl, as.double(x)) + + out_chr <- bq_table_download(qry, bigint = "character", api = "arrow")$x + expect_identical(out_chr, x) +}) + # helpers around row and chunk params 
------------------------------------------ test_that("set_row_params() works ", { From 6043c82cfadd361a7bf971897c9680ff76763a4b Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 19 Apr 2024 08:07:58 -0500 Subject: [PATCH 20/33] Polish docs some more --- R/bq-download.R | 20 ++++++++++++++------ man/bq_table_download.Rd | 21 +++++++++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/R/bq-download.R b/R/bq-download.R index 6d067470..effd39ef 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -2,12 +2,20 @@ #' #' @description #' This function provides two ways to download data from BigQuery, transfering -#' data using either JSON or arrow, depending on the `api` argument. JSON is -#' much slower but requires no additional dependencies, and is what bigrquery -#' used prior to version 1.6.0. The arrow method is much much faster, but -#' requires the bigrquerystorage, which in turn requires the arrow package. -#' These dependencies are fairly heavy, and can be tricky to compile on Linux, -#' but in our opinion the massive speedup is worth the effort. +#' data using either JSON or arrow, depending on the `api` argument. +#' `api = "json"` is much slower but requires no additional dependencies, +#' and is what bigrquery always used prior to v1.6.0. `api = "arrow"` is +#' much much faster, but requires the bigrquerystorage package. +#' +#' ## Arrow API +#' +#' The arrow API is much faster, but has heavier dependencies: bigrquerystorage +#' requires the arrow package, which can be tricky to compile on Linux (but in +#' general you can get a binary from +#' [Posit Public Package Manager](https://posit.co/products/cloud/public-package-manager/). +#' +#' Currently the only know limitation of `api = "arrow"` is that geographic +#' data is returned as a string; you'll need to parse yourself using `wkt::wkt()`. #' #' ## JSON API #' diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index 0d6eb2af..aeac37e0 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -69,12 +69,21 @@ a tibble. If you need a \code{data.frame}, coerce the results with } \description{ This function provides two ways to download data from BigQuery, transfering -data using either JSON or arrow, depending on the \code{api} argument. JSON is -much slower but requires no additional dependencies, and is what bigrquery -used prior to version 1.6.0. The arrow method is much much faster, but -requires the bigrquerystorage, which in turn requires the arrow package. -These dependencies are fairly heavy, and can be tricky to compile on Linux, -but in our opinion the massive speedup is worth the effort. +data using either JSON or arrow, depending on the \code{api} argument. +\code{api = "json"} is much slower but requires no additional dependencies, +and is what bigrquery always used prior to v1.6.0. \code{api = "arrow"} is +much much faster, but requires the bigrquerystorage package. +\subsection{Arrow API}{ + +The arrow API is much faster, but has heavier dependencies: bigrquerystorage +requires the arrow package, which can be tricky to compile on Linux (but in +general you can get a binary from +\href{https://posit.co/products/cloud/public-package-manager/}{Posit Public Package Manager}. + +Currently the only know limitation of \code{api = "arrow"} is that geographic +data is returned as a string; you'll need to parse yourself using \code{wkt::wkt()}. +} + \subsection{JSON API}{ The JSON API retrieves rows in chunks of \code{page_size}. 
It is most suitable From 55694bbc5ce9167102486856016180a738225e7e Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 19 Apr 2024 08:22:07 -0500 Subject: [PATCH 21/33] Polishing tests --- R/bq-download.R | 11 +++++++---- man/bq_table_download.Rd | 8 ++++++-- tests/testthat/test-bq-download.R | 22 ++++++++++++++++++++-- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/R/bq-download.R b/R/bq-download.R index effd39ef..a0428618 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -10,12 +10,15 @@ #' ## Arrow API #' #' The arrow API is much faster, but has heavier dependencies: bigrquerystorage -#' requires the arrow package, which can be tricky to compile on Linux (but in -#' general you can get a binary from +#' requires the arrow package, which can be tricky to compile on Linux (but you +#' usually should be able to get a binary from #' [Posit Public Package Manager](https://posit.co/products/cloud/public-package-manager/). #' -#' Currently the only know limitation of `api = "arrow"` is that geographic -#' data is returned as a string; you'll need to parse yourself using `wkt::wkt()`. +#' There are two known limitations of `api = "arrow"`: +#' +#' * Geographic data is returned as a string; you'll need to parse yourself +#' using `wkt::wkt()`. +#' * When querying public data, you'll now need to provide a `billing` project. #' #' ## JSON API #' diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index aeac37e0..201ced71 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -80,8 +80,12 @@ requires the arrow package, which can be tricky to compile on Linux (but in general you can get a binary from \href{https://posit.co/products/cloud/public-package-manager/}{Posit Public Package Manager}. -Currently the only know limitation of \code{api = "arrow"} is that geographic -data is returned as a string; you'll need to parse yourself using \code{wkt::wkt()}. +There are two known limitations of \code{api = "arrow"}: +\itemize{ +\item Geographic data is returned as a string; you'll need to parse yourself +using \code{wkt::wkt()}. +\item When querying public data, you'll now need to provide a \code{billing} project. 
+} } \subsection{JSON API}{ diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index 2d98443b..7b58fcff 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -73,7 +73,7 @@ test_that("uses arrow api if bigrquerystorage installed", { expect_equal(check_api(), "json") }) -test_that("can convert date time types", { +test_that("arrow api can convert non-nested types", { sql <- "SELECT '\U0001f603' as unicode, datetime, @@ -106,7 +106,25 @@ test_that("can convert date time types", { # expect_identical(df$geography, wk::wkt("POINT(30 10)")) }) -test_that("the return type of integer columns is set by the bigint argument", { +test_that("arrow api can convert nested types", { + skip("https://github.com/meztez/bigrquerystorage/issues/54") + sql <- "SELECT + STRUCT(1.0 AS a, 'abc' AS b) as s, + [1.0, 2.0, 3.0] as a, + [STRUCT(1.0 as a, 'a' as b), STRUCT(2.0, 'b'), STRUCT(3, 'c')] as aos, + STRUCT([1.0, 2.0, 3.0] as a, ['a', 'b'] as b) as soa + " + + tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) + df <- bq_table_download(tb, api = "arrow") + + expect_equal(df$s, list(list(a = 1, b = "abc"))) + expect_equal(df$a, list(c(1, 2, 3))) + expect_equal(df$aos, list(tibble(a = c(1, 2, 3), b = c("a", "b", "c")))) + expect_equal(df$soa, list(list(a = c(1, 2, 3), b = c("a", "b")))) +}) + +test_that("arrow api respects bigint", { x <- c("-2147483648", "-2147483647", "-1", "0", "1", "2147483647", "2147483648") sql <- paste0("SELECT * FROM UNNEST ([", paste0(x, collapse = ","), "]) AS x"); qry <- bq_project_query(bq_test_project(), sql) From f777d65bc50fe74ef12ccd9c782b26d8cd08b921 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 19 Apr 2024 08:27:53 -0500 Subject: [PATCH 22/33] Test & improve argument warnings --- R/bq-download.R | 15 ++++++++++++--- tests/testthat/_snaps/bq-download.md | 13 +++++++++++++ tests/testthat/test-bq-download.R | 12 ++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/R/bq-download.R b/R/bq-download.R index a0428618..4d6face5 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -104,13 +104,22 @@ bq_table_download <- if (api == "arrow") { check_installed("bigrquerystorage", "required to download using arrow API") if (!missing(page_size)) { - cli::cli_warn('{.arg page_size} is ignored when {.code api == "arrow"}') + cli::cli_warn( + '{.arg page_size} is ignored when {.code api == "arrow"}', + call = environment() + ) } if (!missing(start_index)) { - cli::cli_warn('{.arg start_index} is ignored when {.code api == "arrow"}') + cli::cli_warn( + '{.arg start_index} is ignored when {.code api == "arrow"}', + call = environment() + ) } if (!missing(max_connections)) { - cli::cli_warn('{.arg max_connections} is ignored when {.code api == "arrow"}') + cli::cli_warn( + '{.arg max_connections} is ignored when {.code api == "arrow"}', + call = environment() + ) } return(bigrquerystorage::bqs_table_download( diff --git a/tests/testthat/_snaps/bq-download.md b/tests/testthat/_snaps/bq-download.md index b9496982..5d5d49a0 100644 --- a/tests/testthat/_snaps/bq-download.md +++ b/tests/testthat/_snaps/bq-download.md @@ -11,3 +11,16 @@ x 35,000 rows were requested, but only {n} rows were received. i Leave `page_size` unspecified or use an even smaller value. +# warns if supplying unnused arguments + + Code + . 
<- bq_table_download(tb, api = "arrow", page_size = 1, start_index = 1, + max_connections = 1) + Condition + Warning in `bq_table_download()`: + `page_size` is ignored when `api == "arrow"` + Warning in `bq_table_download()`: + `start_index` is ignored when `api == "arrow"` + Warning in `bq_table_download()`: + `max_connections` is ignored when `api == "arrow"` + diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index 7b58fcff..4520c6a1 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -73,6 +73,18 @@ test_that("uses arrow api if bigrquerystorage installed", { expect_equal(check_api(), "json") }) +test_that("warns if supplying unnused arguments", { + tb <- bq_project_query(bq_test_project(), "SELECT 1.0", quiet = TRUE) + expect_snapshot( + . <- bq_table_download(tb, + api = "arrow", + page_size = 1, + start_index = 1, + max_connections = 1 + ) + ) +}) + test_that("arrow api can convert non-nested types", { sql <- "SELECT '\U0001f603' as unicode, From 26104ba7486d32d5985dad426c288990bd3a78d6 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 19 Apr 2024 08:29:07 -0500 Subject: [PATCH 23/33] Drop R3.6 check since we're losing it soon anyway --- .github/workflows/R-CMD-check.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index ee65ccb5..d9fced24 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -25,8 +25,6 @@ jobs: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - # Use 3.6 to trigger usage of RTools35 - - {os: windows-latest, r: '3.6'} # use 4.1 to check with rtools40's older compiler - {os: windows-latest, r: '4.1'} From 50b69c16968ad972b393eb3e32ed199d27c6ec91 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 19 Apr 2024 08:31:56 -0500 Subject: [PATCH 24/33] Re-document --- man/bq_table_download.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index 201ced71..b0d8eaaf 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -76,8 +76,8 @@ much much faster, but requires the bigrquerystorage package. \subsection{Arrow API}{ The arrow API is much faster, but has heavier dependencies: bigrquerystorage -requires the arrow package, which can be tricky to compile on Linux (but in -general you can get a binary from +requires the arrow package, which can be tricky to compile on Linux (but you +usually should be able to get a binary from \href{https://posit.co/products/cloud/public-package-manager/}{Posit Public Package Manager}. 
There are two known limitations of \code{api = "arrow"}: From 7f59af7e0aed21806636d910e4e583744c50ff82 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 19 Sep 2024 17:40:34 -0500 Subject: [PATCH 25/33] Can now use CRAN nanoparquet --- DESCRIPTION | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a44b1746..f828baf0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,7 @@ Imports: prettyunits, rlang (>= 1.1.0), tibble, - nanoparquet (> 0.3.1) + nanoparquet (>= 0.3.1) Suggests: blob, covr, @@ -41,8 +41,6 @@ Suggests: testthat (>= 3.1.5), wk (>= 0.3.2), withr -Remotes: - r-lib/nanoparquet LinkingTo: cli, cpp11, From 27c3dc7439f3ed4c5a9825a42ea09c6f0f54ef51 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 07:32:08 -0500 Subject: [PATCH 26/33] Re-add accidentally dropped dep --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index f828baf0..06c17f43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,6 +31,7 @@ Imports: tibble, nanoparquet (>= 0.3.1) Suggests: + bigrquerystorage (>= 1.1.0), blob, covr, dbplyr (>= 2.4.0), From 120d5a5012bc13c71985de5b974eed338abece44 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 07:34:38 -0500 Subject: [PATCH 27/33] Can use CRAN bigrquerystorage --- DESCRIPTION | 2 -- 1 file changed, 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 06c17f43..a061c365 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -83,5 +83,3 @@ Collate: 'import-standalone-types-check.R' 'utils.R' 'zzz.R' -Remotes: - meztez/bigrquerystorage#52 From 3858c5e6e793b4ba10709b3d59e16fb79aa5ece5 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 07:35:56 -0500 Subject: [PATCH 28/33] Fix merge issue in NEWS --- NEWS.md | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index d9a0888e..8ecfd063 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,21 +1,14 @@ # bigrquery (development version) -* The `bq_perform_upload()` function now allows users to choose the transmission format (JSON or PARQUET) for data sent to BigQuery (@apalacio9502, #608). -* bigrquery now requires R 4.0, in line with our version support principles. - -* The `bq_perform_upload()` function now allows users to choose the transmission format (JSON or PARQUET) for data sent to BigQuery (@apalacio9502, #608). -* bigrquery now requires R 4.0, in line with our version support principles. - -* `tbl()` uses a more efficient method to determine variable names. - -* New `bq_perform_query_schema()` to determine the schema of a query - without executing it. - * If the bigrquerystorage package is installed, `bq_table_download()` (and hence `collect()`, `dbGetQuery()` and `dbFetch()` will use it. This will drastically improve the speed of downloading large datasets. A big thanks to @meztez for creating the bigrquerystorage package! +* The `bq_perform_upload()` function now allows users to choose the transmission format (JSON or PARQUET) for data sent to BigQuery (@apalacio9502, #608). + +* bigrquery now requires R 4.0, in line with our version support principles. + # bigrquery 1.5.1 * Forward compatibility with upcoming dbplyr release (#601). 
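In practice the download path described in that NEWS entry needs no extra code: once bigrquerystorage is installed, `bq_table_download()` selects `api = "arrow"` on its own. A minimal sketch of calling it explicitly (the public table and the `bq_test_project()` billing project are placeholders for your own values):

    library(bigrquery)
    # api = "arrow" is chosen automatically when bigrquerystorage is installed;
    # spelling it out here only makes the choice explicit.
    df <- bq_table_download(
      "bigquery-public-data.utility_us.country_code_iso",
      n_max = 1000,
      billing = bq_test_project(),  # the arrow API needs a billing project for public data
      api = "arrow"
    )
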
From 9c513736cb42c1fc86a6234d9fcc8e2e9dafdf19 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 07:41:47 -0500 Subject: [PATCH 29/33] Polish docs --- R/bq-download.R | 30 +++++++++++++++--------------- man/bq_table_download.Rd | 8 ++++---- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/R/bq-download.R b/R/bq-download.R index 4d6face5..0cc48c3d 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -2,30 +2,30 @@ #' #' @description #' This function provides two ways to download data from BigQuery, transfering -#' data using either JSON or arrow, depending on the `api` argument. -#' `api = "json"` is much slower but requires no additional dependencies, -#' and is what bigrquery always used prior to v1.6.0. `api = "arrow"` is -#' much much faster, but requires the bigrquerystorage package. +#' data using either JSON or arrow, depending on the `api` argument. If +#' bigrquerystorage is installed, `api = "arrow"` will be used (because it's +#' so much faster, but see the limitions below), otherwise you can select +#' deliberately by using `api = "json"` or `api = "arrow"`. #' #' ## Arrow API #' #' The arrow API is much faster, but has heavier dependencies: bigrquerystorage #' requires the arrow package, which can be tricky to compile on Linux (but you -#' usually should be able to get a binary from +#' usually should be able to get a binary from #' [Posit Public Package Manager](https://posit.co/products/cloud/public-package-manager/). -#' +#' #' There are two known limitations of `api = "arrow"`: #' -#' * Geographic data is returned as a string; you'll need to parse yourself +#' * Geographic data is returned as a string; you'll need to parse yourself #' using `wkt::wkt()`. #' * When querying public data, you'll now need to provide a `billing` project. -#' +#' #' ## JSON API -#' -#' The JSON API retrieves rows in chunks of `page_size`. It is most suitable +#' +#' The JSON API retrieves rows in chunks of `page_size`. It is most suitable #' for results of smaller queries (<100 MB, say). Unfortunately due to -#' limitations in the BigQuery API, you may need to vary this parameter -#' depending on the complexity of the underlying data. +#' limitations in the BigQuery API, you may need to vary this parameter +#' depending on the complexity of the underlying data. #' #' The JSON API will convert nested and repeated columns in to list-columns #' as follows: @@ -41,8 +41,8 @@ #' @param x A [bq_table] #' @param n_max Maximum number of results to retrieve. Use `Inf` to retrieve all #' rows. -#' @param page_size (JSON only) The number of rows requested per chunk. It is -#' recommended to leave this unspecified until you have evidence that the +#' @param page_size (JSON only) The number of rows requested per chunk. It is +#' recommended to leave this unspecified until you have evidence that the #' `page_size` selected automatically by `bq_table_download()` is problematic. #' #' When `page_size = NULL` bigrquery determines a conservative, natural chunk @@ -50,7 +50,7 @@ #' chunk fits on one page, i.e. that the requested row limit is low enough to #' prevent the API from paginating based on response size. #' @param start_index (JSON only) Starting row index (zero-based). -#' @param max_connections (JSON only) Number of maximum simultaneous +#' @param max_connections (JSON only) Number of maximum simultaneous #' connections to BigQuery servers. #' @param api Which API to use? 
The `"json"` API works where ever bigrquery #' does, but is slow and can require fiddling with the `page_size` parameter. diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index b0d8eaaf..d4972994 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -69,10 +69,10 @@ a tibble. If you need a \code{data.frame}, coerce the results with } \description{ This function provides two ways to download data from BigQuery, transfering -data using either JSON or arrow, depending on the \code{api} argument. -\code{api = "json"} is much slower but requires no additional dependencies, -and is what bigrquery always used prior to v1.6.0. \code{api = "arrow"} is -much much faster, but requires the bigrquerystorage package. +data using either JSON or arrow, depending on the \code{api} argument. If +bigrquerystorage is installed, \code{api = "arrow"} will be used (because it's +so much faster, but see the limitions below), otherwise you can select +deliberately by using \code{api = "json"} or \code{api = "arrow"}. \subsection{Arrow API}{ The arrow API is much faster, but has heavier dependencies: bigrquerystorage From 9e6eceee7bd9fb13ebf1b939d039b578769a4c21 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 10:34:01 -0500 Subject: [PATCH 30/33] Switch back to dev version --- DESCRIPTION | 4 +++- R/bq-download.R | 7 ++----- man/bq_table_download.Rd | 8 ++------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a061c365..34bfa4ba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,7 @@ Imports: tibble, nanoparquet (>= 0.3.1) Suggests: - bigrquerystorage (>= 1.1.0), + bigrquerystorage (>= 1.1.0.9000), blob, covr, dbplyr (>= 2.4.0), @@ -83,3 +83,5 @@ Collate: 'import-standalone-types-check.R' 'utils.R' 'zzz.R' +Remotes: + meztez/bigrquerystorage diff --git a/R/bq-download.R b/R/bq-download.R index 0cc48c3d..14d12a66 100644 --- a/R/bq-download.R +++ b/R/bq-download.R @@ -14,11 +14,8 @@ #' usually should be able to get a binary from #' [Posit Public Package Manager](https://posit.co/products/cloud/public-package-manager/). #' -#' There are two known limitations of `api = "arrow"`: -#' -#' * Geographic data is returned as a string; you'll need to parse yourself -#' using `wkt::wkt()`. -#' * When querying public data, you'll now need to provide a `billing` project. +#' There's one known limitation of `api = "arrow"`: when querying public data, +#' you'll now need to provide a `billing` project. #' #' ## JSON API #' diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index d4972994..939f8780 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -80,12 +80,8 @@ requires the arrow package, which can be tricky to compile on Linux (but you usually should be able to get a binary from \href{https://posit.co/products/cloud/public-package-manager/}{Posit Public Package Manager}. -There are two known limitations of \code{api = "arrow"}: -\itemize{ -\item Geographic data is returned as a string; you'll need to parse yourself -using \code{wkt::wkt()}. -\item When querying public data, you'll now need to provide a \code{billing} project. -} +There's one known limitation of \code{api = "arrow"}: when querying public data, +you'll now need to provide a \code{billing} project. 
} \subsection{JSON API}{ From 97e87abc8b02655859b66278b6816cf2e5a716ae Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 10:40:17 -0500 Subject: [PATCH 31/33] Restore more types supported by dev bigrquerystorage --- tests/testthat/test-bq-download.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index 4520c6a1..815d09b9 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -76,7 +76,7 @@ test_that("uses arrow api if bigrquerystorage installed", { test_that("warns if supplying unnused arguments", { tb <- bq_project_query(bq_test_project(), "SELECT 1.0", quiet = TRUE) expect_snapshot( - . <- bq_table_download(tb, + . <- bq_table_download(tb, api = "arrow", page_size = 1, start_index = 1, @@ -108,14 +108,14 @@ test_that("arrow api can convert non-nested types", { expect_equal(df$logicaltrue, TRUE) expect_equal(df$logicalfalse, FALSE) - expect_equal(unclass(df$bytes), list(as.raw(c(0x48, 0x69)))) + expect_equal(df$bytes, blob::as.blob(as.raw(c(0x48, 0x69)))) expect_equal(df$date, as.Date(base)) expect_equal(df$timestamp, base) - # expect_equal(df$datetime, base) + expect_equal(df$datetime, base) expect_equal(df$time, hms::hms(hours = 3, minutes = 4, seconds = 5.67)) - # expect_identical(df$geography, wk::wkt("POINT(30 10)")) + expect_identical(df$geography, wk::wkt("POINT(30 10)")) }) test_that("arrow api can convert nested types", { From 1446d9fd11334d562653e872d680dec57fd92b51 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 11:58:11 -0500 Subject: [PATCH 32/33] Improve tests --- R/bq-perform.R | 11 ++++------- tests/testthat/test-bq-download.R | 10 +++++----- tests/testthat/test-dplyr.R | 17 ++++++++++++++--- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/R/bq-perform.R b/R/bq-perform.R index 92b486e2..203d97a6 100644 --- a/R/bq-perform.R +++ b/R/bq-perform.R @@ -361,9 +361,8 @@ bq_perform_query_dry_run <- function(query, billing, query <- bq_perform_query_data( query = query, - billing = billing, default_dataset = default_dataset, - parameters = parameters, + parameters = parameters, use_legacy_sql = use_legacy_sql ) @@ -388,9 +387,8 @@ bq_perform_query_schema <- function(query, billing, query <- bq_perform_query_data( query = query, - billing = billing, default_dataset = default_dataset, - parameters = parameters, + parameters = parameters, use_legacy_sql = FALSE ) @@ -406,14 +404,13 @@ bq_perform_query_schema <- function(query, billing, res$statistics$query$schema$fields } -bq_perform_query_data <- function(query, billing, +bq_perform_query_data <- function(query, ..., default_dataset = NULL, parameters = NULL, - use_legacy_sql = FALSE, + use_legacy_sql = FALSE, call = caller_env()) { check_string(query, error_call = call) - check_string(billing, error_call = call) check_bool(use_legacy_sql, error_call = call) query <- list( diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index 815d09b9..9d07c660 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -100,7 +100,7 @@ test_that("arrow api can convert non-nested types", { " tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb, api = "arrow") + df <- bq_table_download(tb, api = "arrow", quiet = TRUE) base <- ISOdatetime(2000, 1, 2, 3, 4, 5.67, tz = "UTC") expect_identical(df$unicode, "\U0001f603", ignore_encoding = FALSE) @@ -128,7 +128,7 @@ 
test_that("arrow api can convert nested types", { " tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb, api = "arrow") + df <- bq_table_download(tb, api = "arrow", quiet = TRUE) expect_equal(df$s, list(list(a = 1, b = "abc"))) expect_equal(df$a, list(c(1, 2, 3))) @@ -141,13 +141,13 @@ test_that("arrow api respects bigint", { sql <- paste0("SELECT * FROM UNNEST ([", paste0(x, collapse = ","), "]) AS x"); qry <- bq_project_query(bq_test_project(), sql) - out_int64 <- bq_table_download(qry, bigint = "integer64", api = "arrow")$x + out_int64 <- bq_table_download(qry, bigint = "integer64", api = "arrow", quiet = TRUE)$x expect_identical(out_int64, bit64::as.integer64(x)) - out_dbl <- bq_table_download(qry, bigint = "numeric", api = "arrow")$x + out_dbl <- bq_table_download(qry, bigint = "numeric", api = "arrow", quiet = TRUE)$x expect_identical(out_dbl, as.double(x)) - out_chr <- bq_table_download(qry, bigint = "character", api = "arrow")$x + out_chr <- bq_table_download(qry, bigint = "character", api = "arrow", quiet = TRUE)$x expect_identical(out_chr, x) }) diff --git a/tests/testthat/test-dplyr.R b/tests/testthat/test-dplyr.R index 8b0c5774..01bfeeed 100644 --- a/tests/testthat/test-dplyr.R +++ b/tests/testthat/test-dplyr.R @@ -21,14 +21,25 @@ test_that("can work with literal SQL", { }) test_that("can work with nested table identifier", { - con_us <- DBI::dbConnect( + con1 <- DBI::dbConnect( bigquery(), project = "bigquery-public-data", billing = bq_test_project() ) + # As far as I can tell from the BigQuery API there's no way to provide + # a default project; you can either provide a default dataset + project or + # nothing + table_name <- I("bigquery-public-data.utility_us.country_code_iso") + expect_no_error(dplyr::collect(head(dplyr::tbl(con1, table_name)))) - expect_s3_class(dplyr::collect(head(dplyr::tbl(con_us, I("utility_us.country_code_iso")))), "tbl_df") - expect_error(dplyr::collect(head(dplyr::tbl(con_us, "utility_us.country_code_iso"))), "tbl_df") + + con2 <- DBI::dbConnect( + bigquery(), + project = "bigquery-public-data", + dataset = "utility_us", + billing = bq_test_project(), + ) + expect_no_error(dplyr::collect(head(dplyr::tbl(con2, "country_code_iso")))) }) test_that("can copy_to", { From fcfb00cf2480075cd1ddb3131eea6f7006e1f16d Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 20 Sep 2024 15:48:49 -0500 Subject: [PATCH 33/33] Use correct function name --- tests/testthat/test-bq-download.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index 9d07c660..b7ee579a 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -108,7 +108,7 @@ test_that("arrow api can convert non-nested types", { expect_equal(df$logicaltrue, TRUE) expect_equal(df$logicalfalse, FALSE) - expect_equal(df$bytes, blob::as.blob(as.raw(c(0x48, 0x69)))) + expect_equal(df$bytes, blob::as_blob(as.raw(c(0x48, 0x69)))) expect_equal(df$date, as.Date(base)) expect_equal(df$timestamp, base)