diff --git a/DESCRIPTION b/DESCRIPTION index 64d05146..34bfa4ba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,9 +12,9 @@ Description: Easily talk to Google's 'BigQuery' database from R. License: MIT + file LICENSE URL: https://bigrquery.r-dbi.org, https://github.com/r-dbi/bigrquery BugReports: https://github.com/r-dbi/bigrquery/issues -Depends: +Depends: R (>= 4.0) -Imports: +Imports: bit64, brio, cli, @@ -29,8 +29,9 @@ Imports: prettyunits, rlang (>= 1.1.0), tibble, - nanoparquet (> 0.3.1) + nanoparquet (>= 0.3.1) Suggests: + bigrquerystorage (>= 1.1.0.9000), blob, covr, dbplyr (>= 2.4.0), @@ -41,9 +42,7 @@ Suggests: testthat (>= 3.1.5), wk (>= 0.3.2), withr -Remotes: - r-lib/nanoparquet -LinkingTo: +LinkingTo: cli, cpp11, rapidjsonr @@ -54,7 +53,7 @@ Config/testthat/start-first: bq-table, dplyr Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 -Collate: +Collate: 'bigrquery-package.R' 'bq-auth.R' 'bq-dataset.R' @@ -84,3 +83,5 @@ Collate: 'import-standalone-types-check.R' 'utils.R' 'zzz.R' +Remotes: + meztez/bigrquerystorage diff --git a/NAMESPACE b/NAMESPACE index 15cf7aae..8dfaf332 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -88,6 +88,7 @@ export(bq_perform_extract) export(bq_perform_load) export(bq_perform_query) export(bq_perform_query_dry_run) +export(bq_perform_query_schema) export(bq_perform_upload) export(bq_project_datasets) export(bq_project_jobs) diff --git a/NEWS.md b/NEWS.md index 1792daed..8ecfd063 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,12 @@ # bigrquery (development version) +* If the bigrquerystorage package is installed, `bq_table_download()` (and + hence `collect()`, `dbGetQuery()` and `dbFetch()` will use it. This will + drastically improve the speed of downloading large datasets. A big thanks + to @meztez for creating the bigrquerystorage package! + * The `bq_perform_upload()` function now allows users to choose the transmission format (JSON or PARQUET) for data sent to BigQuery (@apalacio9502, #608). + * bigrquery now requires R 4.0, in line with our version support principles. # bigrquery 1.5.1 @@ -14,26 +20,26 @@ * bigrquery is now MIT licensed (#453). * Deprecated functions (i.e. those not starting with `bq_`) have been - removed (#551). These have been superseded for a long time and were formally + removed (#551). These have been superseded for a long time and were formally deprecated in bigrquery 1.3.0 (2020). * `bq_table_download()` now returns unknown fields as character vectors. This means that BIGNUMERIC (#435) and JSON (#544) data is downloaded into R for you to process as you wish. - + It now parses dates using the clock package. This leads to a considerable performance improvement (#430) and ensures that dates prior to 1970-01-01 are parsed correctly (#285). ## Significant DBI improvements -* bigquery datasets and tables will now appear in the connection pane when +* bigquery datasets and tables will now appear in the connection pane when using `dbConnect` (@meztez, #431). * `dbAppendTable()` (#539), `dbCreateTable()` (#483), and `dbExecute` (#502) are now supported. -* `dbGetQuery()`/`dbSendQuery()` gains support for parameterised queries via +* `dbGetQuery()`/`dbSendQuery()` gains support for parameterised queries via the `params` argument (@byapparov, #444). * `dbReadTable()`, `dbWriteTable()`, `dbExistsTable()`, `dbRemoveTable()`, @@ -46,13 +52,13 @@ * Joins now work correctly across bigrquery connections (#433). 
-* `grepl(pattern, x)` is now correctly translated to +* `grepl(pattern, x)` is now correctly translated to `REGEXP_CONTAINS(x, pattern)` (#416). * `median()` gets a translation that works in `summarise()` and a clear error if you use it in `mutate()` (#419). -* `tbl()` now works with views (#519), including the views found in the +* `tbl()` now works with views (#519), including the views found in the `INFORMATION_SCHEMA` schema (#468). * `tbl(con, sql("..."))` now works robustly once more (#540), fixing the @@ -64,10 +70,10 @@ ## Minor improvements and bug fixes * Google API URLs have been aligned with the Google Cloud Discovery docs. This - enables support for Private and Restricted Google APIs configurations + enables support for Private and Restricted Google APIs configurations (@husseyd, #541) -* Functions generally try to do a better job of telling you when you've +* Functions generally try to do a better job of telling you when you've supplied the wrong type of input. Additionally, if you supply `SQL()` to a query, you no longer get a weird warning (#498). @@ -79,10 +85,10 @@ * `dbGetRowCount()` and `dbHasComplete()` now return correct values when you try to fetch more rows than actually exist (#501). -* New `dbQuoteLiteral()` method for logicals reverts breaking change introduced +* New `dbQuoteLiteral()` method for logicals reverts breaking change introduced by DBI 1.1.2 (@meztez, #478). -* `dbWriteTable()` now correct uses the `billing` value set in the +* `dbWriteTable()` now correct uses the `billing` value set in the connection (#486). # bigrquery 1.4.2 @@ -108,7 +114,7 @@ * bigrquery is now compatible with dbplyr 2.2.0 (@mgirlich, #495). -* brio is new in Imports, replacing the use of the Suggested package readr, +* brio is new in Imports, replacing the use of the Suggested package readr, in `bq_table_download()` (@AdeelK93, #462). # bigrquery 1.4.0 @@ -133,7 +139,7 @@ # bigrquery 1.3.2 * BigQuery `BYTES` and `GEOGRAPHY` column types are now supported via - the [blob](https://blob.tidyverse.org/) and + the [blob](https://blob.tidyverse.org/) and [wk](https://paleolimbot.github.io/wk/) packages, respectively (@paleolimbot, #354, #388). @@ -159,7 +165,7 @@ * When `bq_perform_*()` fails, you now see all errors, not just the first (#355). -* `bq_perform_query()` can now execute parameterised query with parameters +* `bq_perform_query()` can now execute parameterised query with parameters of `ARRAY` type (@byapparov, #303). Vectors of length > 1 will be automatically converted to `ARRAY` type, or use `bq_param_array()` to be explicit. @@ -172,14 +178,14 @@ error for DDL queries, and it returns the number of affected rows for DML queries (#375). -* `dbSendQuery()` (and hence `dbGetQuery()`) and `collect()` passes on `...` - to `bq_perform_query()`. `collect()` gains `page_size` and `max_connection` +* `dbSendQuery()` (and hence `dbGetQuery()`) and `collect()` passes on `...` + to `bq_perform_query()`. `collect()` gains `page_size` and `max_connection` arguments that are passed on to `bq_table_download()` (#374). * `copy_to()` now works with BigQuery (although it doesn't support temporary tables so application is somewhat limited) (#337). - -* `str_detect()` now correctly translated to `REGEXP_CONTAINS` + +* `str_detect()` now correctly translated to `REGEXP_CONTAINS` (@jimmyg3g, #369). * Error messages include hints for common problems (@deflaux, #353). 
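For illustration, the parameterised-query support described in the entries above might be used roughly like this (a sketch only; the connection details and the `corpus_words` table are placeholders):

library(DBI)
library(bigrquery)

con <- dbConnect(
  bigquery(),
  project = "my-project",   # placeholder project
  dataset = "my_dataset",   # placeholder dataset
  billing = "my-project"
)

# Parameters are referenced as @name in the query; vectors of length > 1 are
# sent as ARRAY parameters, or use bq_param_array() to be explicit.
dbGetQuery(
  con,
  "SELECT word, word_count
     FROM corpus_words
    WHERE corpus IN UNNEST(@corpora) AND word = @word",
  params = list(
    corpora = bq_param_array(c("hamlet", "macbeth")),
    word = "henry"
  )
)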
@@ -192,14 +198,14 @@ bigrquery's auth functionality now comes from the [gargle package](https://gargl * Application Default Credentials * Service account tokens from the metadata server available to VMs running on GCE - + Where to learn more: - + * Help for [`bq_auth()`](https://bigrquery.r-dbi.org/reference/bq_auth.html) *all that most users need* * *details for more advanced users* - [How gargle gets tokens](https://gargle.r-lib.org/articles/how-gargle-gets-tokens.html) - [Non-interactive auth](https://gargle.r-lib.org/articles/non-interactive-auth.html) - - [How to get your own API credentials](https://gargle.r-lib.org/articles/get-api-credentials.html) + - [How to get your own API credentials](https://gargle.r-lib.org/articles/get-api-credentials.html) ### Changes that a user will notice @@ -225,7 +231,7 @@ gargle and rlang are newly Imported. * `bq_field()` can now pass `description` parameter which will be applied in `bq_table_create()` call (@byapparov, #272). - + * `bq_table_patch()` - allows to patch table (@byapparov, #253) with new schema. @@ -233,14 +239,14 @@ gargle and rlang are newly Imported. ## Improved type support -* `bq_table_download()` and the `DBI::dbConnect` method now has a `bigint` - argument which governs how BigQuery integer columns are imported into R. As - before, the default is `bigint = "integer"`. You can set - `bigint = "integer64"` to import BigQuery integer columns as - `bit64::integer64` columns in R which allows for values outside the range of +* `bq_table_download()` and the `DBI::dbConnect` method now has a `bigint` + argument which governs how BigQuery integer columns are imported into R. As + before, the default is `bigint = "integer"`. You can set + `bigint = "integer64"` to import BigQuery integer columns as + `bit64::integer64` columns in R which allows for values outside the range of `integer` (`-2147483647` to `2147483647`) (@rasmusab, #94). -* `bq_table_download()` now treats NUMERIC columns the same was as FLOAT +* `bq_table_download()` now treats NUMERIC columns the same was as FLOAT columns (@paulsendavidjay, #282). * `bq_table_upload()` works with POSIXct/POSIXct variables (#251) @@ -258,7 +264,7 @@ gargle and rlang are newly Imported. * `bq_job()` tracks location so bigrquery now works painlessly with non-US/EU locations (#274). -* `bq_perform_upload()` will only autodetect a schema if the table does +* `bq_perform_upload()` will only autodetect a schema if the table does not already exist. * `bq_table_download()` correctly computes page ranges if both `max_results` @@ -273,23 +279,23 @@ gargle and rlang are newly Imported. The system for downloading data from BigQuery into R has been rewritten from the ground up to give considerable improvements in performance and flexibility. * The two steps, downloading and parsing, now happen in sequence, rather than - interleaved. This means that you'll now see two progress bars: one for - downloading JSON from BigQuery and one for parsing that JSON into a data + interleaved. This means that you'll now see two progress bars: one for + downloading JSON from BigQuery and one for parsing that JSON into a data frame. - -* Downloads now occur in parallel, using up to 6 simultaneous connections by + +* Downloads now occur in parallel, using up to 6 simultaneous connections by default. -* The parsing code has been rewritten in C++. As well as considerably improving - performance, this also adds support for nested (record/struct) and repeated - (array) columns (#145). 
These columns will yield list-columns in the +* The parsing code has been rewritten in C++. As well as considerably improving + performance, this also adds support for nested (record/struct) and repeated + (array) columns (#145). These columns will yield list-columns in the following forms: - + * Repeated values become list-columns containing vectors. * Nested values become list-columns containing named lists. * Repeated nested values become list-columns containing data frames. -* Results are now returned as tibbles, not data frames, because the base print +* Results are now returned as tibbles, not data frames, because the base print method does not handle list columns well. I can now download the first million rows of `publicdata.samples.natality` in about a minute. This data frame is about 170 MB in BigQuery and 140 MB in R; a minute to download this much data seems reasonable to me. The bottleneck for loading BigQuery data is now parsing BigQuery's json format. I don't see any obvious way to make this faster as I'm already using the fastest C++ json parser, [RapidJson](http://rapidjson.org). If this is still too slow for you (i.e. you're downloading GBs of data), see `?bq_table_download` for an alternative approach. @@ -301,18 +307,18 @@ I can now download the first million rows of `publicdata.samples.natality` in ab * `dplyr::compute()` now works (@realAkhmed, #52). * `tbl()` now accepts fully (or partially) qualified table names, like - "publicdata.samples.shakespeare" or "samples.shakespeare". This makes it + "publicdata.samples.shakespeare" or "samples.shakespeare". This makes it possible to join tables across datasets (#219). ### DBI -* `dbConnect()` now defaults to standard SQL, rather than legacy SQL. Use +* `dbConnect()` now defaults to standard SQL, rather than legacy SQL. Use `use_legacy_sql = TRUE` if you need the previous behaviour (#147). -* `dbConnect()` now allows `dataset` to be omitted; this is natural when you +* `dbConnect()` now allows `dataset` to be omitted; this is natural when you want to use tables from multiple datasets. - -* `dbWriteTable()` and `dbReadTable()` now accept fully (or partially) + +* `dbWriteTable()` and `dbReadTable()` now accept fully (or partially) qualified table names. * `dbi_driver()` is deprecated; please use `bigquery()` instead. @@ -322,26 +328,26 @@ I can now download the first million rows of `publicdata.samples.natality` in ab The low-level API has been completely overhauled to make it easier to use. The primary motivation was to make bigrquery development more enjoyable for me, but it should also be helpful to you when you need to go outside of the features provided by higher-level DBI and dplyr interfaces. The old API has been soft-deprecated - it will continue to work, but no further development will occur (including bug fixes). It will be formally deprecated in the next version, and then removed in the version after that. * __Consistent naming scheme__: - All API functions now have the form `bq_object_verb()`, e.g. + All API functions now have the form `bq_object_verb()`, e.g. `bq_table_create()`, or `bq_dataset_delete()`. * __S3 classes__: `bq_table()`, `bq_dataset()`, `bq_job()`, `bq_field()` and `bq_fields()` - constructor functions create S3 objects corresponding to important BigQuery - objects (#150). These are paired with `as_` coercion functions and used throughout + constructor functions create S3 objects corresponding to important BigQuery + objects (#150). 
These are paired with `as_` coercion functions and used throughout the new API. * __Easier local testing__: - New `bq_test_project()` and `bq_test_dataset()` make it easier to run - bigrquery tests locally. To run the tests yourself, you need to create a + New `bq_test_project()` and `bq_test_dataset()` make it easier to run + bigrquery tests locally. To run the tests yourself, you need to create a BigQuery project, and then follow the instructions in `?bq_test_project`. -* __More efficient data transfer__: - The new API makes extensive use of the `fields` query parameter, ensuring +* __More efficient data transfer__: + The new API makes extensive use of the `fields` query parameter, ensuring that functions only download data that they actually use (#153). -* __Tighter GCS connection__: - New `bq_table_load()` loads data from a Google Cloud Storage URI, pairing +* __Tighter GCS connection__: + New `bq_table_load()` loads data from a Google Cloud Storage URI, pairing with `bq_table_save()` which saves data to a GCS URI (#155). ## Bug fixes and minor improvements @@ -355,12 +361,12 @@ The low-level API has been completely overhauled to make it easier to use. The p (@edgararuiz). * If you have the development version of dbplyr installed, `print()`ing - a BigQuery table will not perform an unneeded query, but will instead + a BigQuery table will not perform an unneeded query, but will instead download directly from the table (#226). ### Low-level -* Request error messages now contain the "reason", which can contain +* Request error messages now contain the "reason", which can contain useful information for debugging (#209). * `bq_dataset_query()` and `bq_project_query()` can now supply query parameters @@ -385,53 +391,53 @@ The low-level API has been completely overhauled to make it easier to use. The p * The DBI driver gets a new name: `bigquery()`. -* New `insert_extract_job()` make it possible to extract data and save in +* New `insert_extract_job()` make it possible to extract data and save in google storage (@realAkhmed, #119). * New `insert_table()` allows you to insert empty tables into a dataset. -* All POST requests (inserts, updates, copies and `query_exec`) now - take `...`. This allows you to add arbitrary additional data to the - request body making it possible to use parts of the BigQuery API +* All POST requests (inserts, updates, copies and `query_exec`) now + take `...`. This allows you to add arbitrary additional data to the + request body making it possible to use parts of the BigQuery API that are otherwise not exposed (#149). `snake_case` argument names are - automatically converted to `camelCase` so you can stick consistently + automatically converted to `camelCase` so you can stick consistently to snake case in your R code. -* Full support for DATE, TIME, and DATETIME types (#128). +* Full support for DATE, TIME, and DATETIME types (#128). ## Big fixes and minor improvements * All bigrquery requests now have a custom user agent that specifies the versions of bigrquery and httr that are used (#151). -* `dbConnect()` gains new `use_legacy_sql`, `page_size`, and `quiet` arguments - that are passed onto `query_exec()`. These allow you to control query options +* `dbConnect()` gains new `use_legacy_sql`, `page_size`, and `quiet` arguments + that are passed onto `query_exec()`. These allow you to control query options at the connection level. * `insert_upload_job()` now sends data in newline-delimited JSON instead of csv (#97). 
This should be considerably faster and avoids character
-  encoding issues (#45). `POSIXlt` columns are now also correctly
+  encoding issues (#45). `POSIXlt` columns are now also correctly
   coerced to TIMESTAMPS (#98).
* `insert_query_job()` and `query_exec()` gain new arguments:
  * `quiet = TRUE` will suppress the progress bars if needed.
-  * `use_legacy_sql = FALSE` option allows you to opt-out of the
+  * `use_legacy_sql = FALSE` option allows you to opt-out of the
    legacy SQL system (#124, @backlin)
* `list_tables()` (#108) and `list_datasets()` (#141) are now paginated. By default they retrieve 50 items per page, and will iterate until they get everything.
-* `list_tabledata()` and `query_exec()` now give a nicer progress bar,
+* `list_tabledata()` and `query_exec()` now give a nicer progress bar,
  including estimated time remaining (#100).
-* `query_exec()` should be considerably faster because profiling revealed that
-  ~40% of the time taken by was a single line inside a function that helps
+* `query_exec()` should be considerably faster because profiling revealed that
+  ~40% of the time taken by was a single line inside a function that helps
  parse BigQuery's json into an R data frame. I replaced the slow R code with a faster C function.
-* `set_oauth2.0_cred()` allows user to supply their own Google OAuth
+* `set_oauth2.0_cred()` allows user to supply their own Google OAuth
  application when setting credentials (#130, @jarodmeng)
* `wait_for()` uses now reports the query total bytes billed, which is
@@ -449,12 +455,12 @@ The low-level API has been completely overhauled to make it easier to use. The p
* Provide full DBI compliant interface (@krlmlr).
* Backend now translates `iflese()` to `IF` (@realAkhmed, #53).
-
+
# Version 0.2.0.
* Compatible with latest httr.
-* Computation of the SQL data type that corresponds to a given R object
+* Computation of the SQL data type that corresponds to a given R object
  is now more robust against unknown classes. (#95, @krlmlr)
* A data frame with full schema information is returned for zero-row results.
@@ -469,8 +475,8 @@ The low-level API has been completely overhauled to make it easier to use. The p
* New `format_dataset()` and `format_table()`. (#81, @krlmlr)
-* New `list_tabledata_iter()` that allows fetching a table in chunks of
+* New `list_tabledata_iter()` that allows fetching a table in chunks of
  varying size. (#77, #87, @krlmlr)
-* Add support for API keys via the `BIGRQUERY_API_KEY` environment variable.
+* Add support for API keys via the `BIGRQUERY_API_KEY` environment variable.
  (#49)
diff --git a/R/bq-download.R b/R/bq-download.R
index 126d1dcf..14d12a66 100644
--- a/R/bq-download.R
+++ b/R/bq-download.R
@@ -1,30 +1,36 @@
 #' Download table data
 #'
-#' This retrieves rows in chunks of `page_size`. It is most suitable for results
-#' of smaller queries (<100 MB, say). For larger queries, it is better to
-#' export the results to a CSV file stored on google cloud and use the
-#' bq command line tool to download locally.
+#' @description
+#' This function provides two ways to download data from BigQuery, transferring
+#' data using either JSON or arrow, depending on the `api` argument. If
+#' bigrquerystorage is installed, `api = "arrow"` will be used (because it's
+#' so much faster, but see the limitations below), otherwise you can select
+#' deliberately by using `api = "json"` or `api = "arrow"`.
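For illustration, the transport choice described above might look like this in practice (a sketch; `"my-billing-project"` is a placeholder, and the arrow path assumes bigrquerystorage is installed):

tb <- bq_project_query(
  "my-billing-project",
  "SELECT * FROM `bigquery-public-data.moon_phases.moon_phases`"
)

# Default: arrow when bigrquerystorage is available, JSON otherwise
df_auto <- bq_table_download(tb, n_max = 100)

# Or pick the transport explicitly
df_json  <- bq_table_download(tb, n_max = 100, api = "json", page_size = 50)
df_arrow <- bq_table_download(
  "bigquery-public-data.moon_phases.moon_phases",
  n_max = 100,
  api = "arrow",
  billing = "my-billing-project"  # required when reading public data over arrow
)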
#'
-#' @section Complex data:
-#' bigrquery will retrieve nested and repeated columns in to list-columns
+#' ## Arrow API
+#'
+#' The arrow API is much faster, but has heavier dependencies: bigrquerystorage
+#' requires the arrow package, which can be tricky to compile on Linux (but you
+#' usually should be able to get a binary from
+#' [Posit Public Package Manager](https://posit.co/products/cloud/public-package-manager/)).
+#'
+#' There's one known limitation of `api = "arrow"`: when querying public data,
+#' you'll now need to provide a `billing` project.
+#'
+#' ## JSON API
+#'
+#' The JSON API retrieves rows in chunks of `page_size`. It is most suitable
+#' for results of smaller queries (<100 MB, say). Unfortunately, due to
+#' limitations in the BigQuery API, you may need to vary this parameter
+#' depending on the complexity of the underlying data.
+#'
+#' The JSON API will convert nested and repeated columns into list-columns
 #' as follows:
 #'
 #' * Repeated values (arrays) will become a list-column of vectors.
 #' * Records will become list-columns of named lists.
 #' * Repeated records will become list-columns of data frames.
 #'
-#' @section Larger datasets:
-#' In my timings, this code takes around 1 minute per 100 MB of data.
-#' If you need to download considerably more than this, I recommend:
-#'
-#' * Export a `.csv` file to Cloud Storage using [bq_table_save()].
-#' * Use the `gsutil` command line utility to download it.
-#' * Read the csv file into R with `readr::read_csv()` or `data.table::fread()`.
-#'
-#' Unfortunately you can not export nested or repeated formats into CSV, and
-#' the formats that BigQuery supports (arvn and ndjson) that allow for
-#' nested/repeated values, are not well supported in R.
-#'
 #' @return Because data retrieval may generate list-columns and the `data.frame`
 #' print method can have problems with list-columns, this method returns
 #' a tibble. If you need a `data.frame`, coerce the results with
@@ -32,30 +38,40 @@
 #' @param x A [bq_table]
 #' @param n_max Maximum number of results to retrieve. Use `Inf` to retrieve all
 #' rows.
-#' @param page_size The number of rows requested per chunk. It is recommended to
-#' leave this unspecified until you have evidence that the `page_size`
-#' selected automatically by `bq_table_download()` is problematic.
+#' @param page_size (JSON only) The number of rows requested per chunk. It is
+#' recommended to leave this unspecified until you have evidence that the
+#' `page_size` selected automatically by `bq_table_download()` is problematic.
 #'
 #' When `page_size = NULL` bigrquery determines a conservative, natural chunk
 #' size empirically. If you specify the `page_size`, it is important that each
 #' chunk fits on one page, i.e. that the requested row limit is low enough to
 #' prevent the API from paginating based on response size.
-#' @param start_index Starting row index (zero-based).
-#' @param max_connections Number of maximum simultaneous connections to
-#' BigQuery servers.
+#' @param start_index (JSON only) Starting row index (zero-based).
+#' @param max_connections (JSON only) Maximum number of simultaneous
+#' connections to BigQuery servers.
+#' @param api Which API to use? The `"json"` API works wherever bigrquery
+#' does, but is slow and can require fiddling with the `page_size` parameter.
+#' The `"arrow"` API is faster and more reliable, but only works if you
+#' have also installed the bigrquerystorage package.
+#' +#' Because the `"arrow"` API is so much faster, it will be used automatically +#' if the bigrquerystorage package is installed. #' @inheritParams api-job #' @param bigint The R type that BigQuery's 64-bit integer types should be #' mapped to. The default is `"integer"`, which returns R's `integer` type, #' but results in `NA` for values above/below +/- 2147483647. `"integer64"` #' returns a [bit64::integer64], which allows the full range of 64 bit #' integers. +#' @param billing (Arrow only) Project to bill; defaults to the project of `x`, +#' and typically only needs to be specified if you're working with public +#' datasets. #' @param max_results `r lifecycle::badge("deprecated")` Deprecated. Please use #' `n_max` instead. #' @section Google BigQuery API documentation: #' * [list](https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/list) #' @export #' @examplesIf bq_testable() -#' df <- bq_table_download("publicdata.samples.natality", n_max = 35000) +#' df <- bq_table_download("publicdata.samples.natality", n_max = 35000, billing = bq_test_project()) bq_table_download <- function(x, n_max = Inf, @@ -64,6 +80,8 @@ bq_table_download <- max_connections = 6L, quiet = NA, bigint = c("integer", "integer64", "numeric", "character"), + api = c("json", "arrow"), + billing = x$project, max_results = deprecated()) { x <- as_bq_table(x) check_number_whole(n_max, min = 0, allow_infinite = TRUE) @@ -71,6 +89,8 @@ bq_table_download <- check_number_whole(max_connections, min = 1) quiet <- check_quiet(quiet) bigint <- arg_match(bigint) + api <- check_api(api) + if (lifecycle::is_present(max_results)) { lifecycle::deprecate_warn( "1.4.0", "bq_table_download(max_results)", "bq_table_download(n_max)" @@ -78,6 +98,37 @@ bq_table_download <- n_max <- max_results } + if (api == "arrow") { + check_installed("bigrquerystorage", "required to download using arrow API") + if (!missing(page_size)) { + cli::cli_warn( + '{.arg page_size} is ignored when {.code api == "arrow"}', + call = environment() + ) + } + if (!missing(start_index)) { + cli::cli_warn( + '{.arg start_index} is ignored when {.code api == "arrow"}', + call = environment() + ) + } + if (!missing(max_connections)) { + cli::cli_warn( + '{.arg max_connections} is ignored when {.code api == "arrow"}', + call = environment() + ) + } + + return(bigrquerystorage::bqs_table_download( + x = toString(x), + parent = billing, + n_max = n_max, + quiet = quiet, + bigint = bigint, + as_tibble = TRUE + )) + } + params <- set_row_params( nrow = bq_table_nrow(x), n_max = n_max, @@ -202,6 +253,14 @@ bq_table_download <- parse_postprocess(table_data, bigint = bigint) } +check_api <- function(api = c("json", "arrow"), error_call = caller_env()) { + if (identical(api, c("json", "arrow"))) { + if (has_bigrquerystorage()) "arrow" else "json" + } else { + arg_match(api, error_call = error_call) + } +} + # This function is a modified version of # https://github.com/r-dbi/RPostgres/blob/master/R/PqResult.R parse_postprocess <- function(df, bigint) { diff --git a/R/bq-perform.R b/R/bq-perform.R index cd7a1e41..203d97a6 100644 --- a/R/bq-perform.R +++ b/R/bq-perform.R @@ -210,7 +210,7 @@ export_json <- function(values) { #' Google Cloud. #' #' For Google Cloud Storage URIs: Each URI can contain one -#' `'*'`` wildcard character and it must come after the 'bucket' name. +#' `'*'` wildcard character and it must come after the 'bucket' name. #' Size limits related to load jobs apply to external data sources. 
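As a concrete illustration of the wildcard rule above, an extract job might shard a large table across several files in a bucket you own (a hedged sketch; the project, dataset, table, and bucket names are all placeholders):

# Export, sharding the output across files via the single '*' after the bucket
bq_table_save(
  bq_table("my-project", "my_dataset", "big_table"),
  "gs://my-bucket/big_table-*.csv",
  destination_format = "CSV"
)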
#' #' For Google Cloud Bigtable URIs: Exactly one URI can be specified and @@ -358,21 +358,13 @@ bq_perform_query_dry_run <- function(query, billing, parameters = NULL, use_legacy_sql = FALSE) { - check_string(query) - check_string(billing) - check_bool(use_legacy_sql) - query <- list( - query = unbox(query), - useLegacySql = unbox(use_legacy_sql) + query <- bq_perform_query_data( + query = query, + default_dataset = default_dataset, + parameters = parameters, + use_legacy_sql = use_legacy_sql ) - if (!is.null(parameters)) { - parameters <- as_bq_params(parameters) - query$queryParameters <- as_json(parameters) - } - if (!is.null(default_dataset)) { - query$defaultDataset <- datasetReference(default_dataset) - } url <- bq_path(billing, jobs = "") body <- list(configuration = list(query = query, dryRun = unbox(TRUE))) @@ -386,6 +378,58 @@ bq_perform_query_dry_run <- function(query, billing, structure(bytes, class = "bq_bytes") } +#' @export +#' @rdname api-perform +bq_perform_query_schema <- function(query, billing, + ..., + default_dataset = NULL, + parameters = NULL) { + + query <- bq_perform_query_data( + query = query, + default_dataset = default_dataset, + parameters = parameters, + use_legacy_sql = FALSE + ) + + url <- bq_path(billing, jobs = "") + body <- list(configuration = list(query = query, dryRun = unbox(TRUE))) + + res <- bq_post( + url, + body = bq_body(body, ...), + query = list(fields = "statistics") + ) + # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableSchema + res$statistics$query$schema$fields +} + +bq_perform_query_data <- function(query, + ..., + default_dataset = NULL, + parameters = NULL, + use_legacy_sql = FALSE, + call = caller_env()) { + check_string(query, error_call = call) + check_bool(use_legacy_sql, error_call = call) + + query <- list( + query = unbox(query), + useLegacySql = unbox(use_legacy_sql) + ) + if (!is.null(parameters)) { + parameters <- as_bq_params(parameters) + query$queryParameters <- as_json(parameters) + } + if (!is.null(default_dataset)) { + query$defaultDataset <- datasetReference(default_dataset) + } + + query +} + + + #' @export #' @rdname api-perform bq_perform_copy <- function(src, dest, diff --git a/R/dbi-connection.R b/R/dbi-connection.R index a790ba24..18cb4b35 100644 --- a/R/dbi-connection.R +++ b/R/dbi-connection.R @@ -318,7 +318,7 @@ setMethod("dbCreateTable", "BigQueryConnection", dbCreateTable_bq) dbReadTable_bq <- function(conn, name, ...) { tb <- as_bq_table(conn, name) - bq_table_download(tb, ...) + bq_table_download(tb, ..., api = "json") } #' @rdname DBI diff --git a/R/dbi-result.R b/R/dbi-result.R index e5248933..f4543717 100644 --- a/R/dbi-result.R +++ b/R/dbi-result.R @@ -100,18 +100,31 @@ setMethod( "dbFetch", "BigQueryResult", function(res, n = -1, ...) 
{ check_number_whole(n, min = -1, allow_infinite = TRUE) + if (n == -1) n <- Inf - if (n == -1 || n == Inf) { + if (has_bigrquerystorage() && n == Inf && res@cursor$cur() == 0) { + # https://github.com/meztez/bigrquerystorage/issues/48 n <- res@cursor$left() + + # If possible, download complete dataset using arrow + data <- bq_table_download(res@bq_table, + n_max = n, + bigint = res@bigint, + quiet = res@quiet, + api = "arrow" + ) + } else { + # Otherwise, fall back to slower JSON API + data <- bq_table_download(res@bq_table, + n_max = n, + start_index = res@cursor$cur(), + page_size = res@page_size, + bigint = res@bigint, + quiet = res@quiet, + api = "json" + ) } - - data <- bq_table_download(res@bq_table, - n_max = n, - start_index = res@cursor$cur(), - page_size = res@page_size, - bigint = res@bigint, - quiet = res@quiet - ) + res@cursor$adv(nrow(data)) data diff --git a/R/dplyr.R b/R/dplyr.R index 9e8c2ff0..de9cbcaa 100644 --- a/R/dplyr.R +++ b/R/dplyr.R @@ -27,7 +27,10 @@ #' summarise(n = sum(word_count, na.rm = TRUE)) %>% #' arrange(desc(n)) #' } -src_bigquery <- function(project, dataset, billing = project, max_pages = 10) { +src_bigquery <- function(project, + dataset, + billing = project, + max_pages = 10) { check_installed("dbplyr") con <- DBI::dbConnect( @@ -45,10 +48,18 @@ src_bigquery <- function(project, dataset, billing = project, max_pages = 10) { tbl.BigQueryConnection <- function(src, from, ...) { src <- dbplyr::src_dbi(src, auto_disconnect = FALSE) + sql <- dbplyr::sql_query_fields(src$con, from) + dataset <- if (!is.null(src$con@dataset)) as_bq_dataset(src$con) + schema <- bq_perform_query_schema(sql, + billing = src$con@billing, + default_dataset = dataset + ) + vars <- map_chr(schema, "[[", "name") + if (utils::packageVersion("dbplyr") >= "2.4.0.9000") { - tbl <- dplyr::tbl(src, from = from) + tbl <- dplyr::tbl(src, from = from, vars = vars) } else { - tbl <- dplyr::tbl(src, from = from, check_from = FALSE) + tbl <- dplyr::tbl(src, from = from, vars = vars, check_from = FALSE) } # This is ugly, but I don't see a better way of doing this @@ -116,17 +127,35 @@ db_copy_to.BigQueryConnection <- function(con, # Efficient downloads ----------------------------------------------- # registered onLoad + +#' Collect a BigQuery table +#' +#' This collect method is specialised for BigQuery tables, generating the +#' SQL from your dplyr commands, then calling [bq_project_query()] +#' or [bq_dataset_query()] to run the query, then [bq_table_download()] +#' to download the results. Thus the arguments are a combination of the +#' arguments to [dplyr::collect()], `bq_project_query()`/`bq_dataset_query()`, +#' and `bq_table_download()`. +#' +#' @inheritParams dplyr::collect +#' @inheritParams bq_table_download +#' @param n Maximum number of results to retrieve. +#' The default, `Inf`, will retrieve all rows. +#' @param ... 
Other arguments passed on to +#' `bq_project_query()`/`bq_project_query()` collect.tbl_BigQueryConnection <- function(x, ..., - page_size = NULL, - max_connections = 6L, n = Inf, - warn_incomplete = TRUE) { + api = c("json", "arrow"), + page_size = NULL, + max_connections = 6L + ) { + api <- check_api(api) check_number_whole(n, min = 0, allow_infinite = TRUE) check_number_whole(max_connections, min = 1) - check_bool(warn_incomplete) con <- dbplyr::remote_con(x) + billing <- con@billing if (op_can_download(x)) { lq <- x$lazy_query @@ -136,7 +165,6 @@ collect.tbl_BigQueryConnection <- function(x, ..., } else { sql <- dbplyr::db_sql_render(con, x) - billing <- con@billing if (is.null(con@dataset)) { tb <- bq_project_query(billing, sql, quiet = con@quiet, ...) } else { @@ -147,13 +175,26 @@ collect.tbl_BigQueryConnection <- function(x, ..., quiet <- if (n < 100) TRUE else con@quiet bigint <- con@bigint %||% "integer" - out <- bq_table_download(tb, - n_max = n, - page_size = page_size, - quiet = quiet, - max_connections = max_connections, - bigint = bigint - ) + + if (api == "arrow") { + out <- bq_table_download(tb, + n_max = n, + quiet = quiet, + bigint = bigint, + billing = billing, + api = "arrow" + ) + } else { + out <- bq_table_download(tb, + n_max = n, + page_size = page_size, + quiet = quiet, + max_connections = max_connections, + bigint = bigint, + api = "json" + ) + } + dplyr::grouped_df(out, intersect(dbplyr::op_grps(x), names(out))) } diff --git a/R/utils.R b/R/utils.R index 98e872fd..1e84f9ff 100644 --- a/R/utils.R +++ b/R/utils.R @@ -71,3 +71,7 @@ as_query <- function(x, error_arg = caller_arg(x), error_call = caller_env()) { check_string(x, arg = error_arg, call = error_call) x } + +has_bigrquerystorage <- function() { + is_installed("bigrquerystorage") +} diff --git a/_pkgdown.yml b/_pkgdown.yml index beadd04a..d4bda334 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -16,6 +16,7 @@ reference: contents: - src_bigquery - bigquery + - collect.tbl_BigQueryConnection - title: Low-level API contents: diff --git a/man/api-perform.Rd b/man/api-perform.Rd index 081e2686..736883a1 100644 --- a/man/api-perform.Rd +++ b/man/api-perform.Rd @@ -7,6 +7,7 @@ \alias{bq_perform_load} \alias{bq_perform_query} \alias{bq_perform_query_dry_run} +\alias{bq_perform_query_schema} \alias{bq_perform_copy} \title{BigQuery jobs: perform a job} \usage{ @@ -65,6 +66,14 @@ bq_perform_query_dry_run( use_legacy_sql = FALSE ) +bq_perform_query_schema( + query, + billing, + ..., + default_dataset = NULL, + parameters = NULL +) + bq_perform_copy( src, dest, @@ -148,7 +157,7 @@ to the table. Google Cloud. For Google Cloud Storage URIs: Each URI can contain one -`'*'`` wildcard character and it must come after the 'bucket' name. +\code{'*'} wildcard character and it must come after the 'bucket' name. Size limits related to load jobs apply to external data sources. For Google Cloud Bigtable URIs: Exactly one URI can be specified and diff --git a/man/api-table.Rd b/man/api-table.Rd index 6509d18d..70c8031a 100644 --- a/man/api-table.Rd +++ b/man/api-table.Rd @@ -66,7 +66,7 @@ number of files.} Google Cloud. For Google Cloud Storage URIs: Each URI can contain one -`'*'`` wildcard character and it must come after the 'bucket' name. +\code{'*'} wildcard character and it must come after the 'bucket' name. Size limits related to load jobs apply to external data sources. 
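Stepping back to the new `bq_perform_query_schema()` documented in api-perform.Rd above, a usage sketch (the billing project is a placeholder; the expected names and types match the test added later in this patch):

fields <- bq_perform_query_schema(
  "SELECT * FROM bigquery-public-data.moon_phases.moon_phases",
  billing = "my-billing-project"
)

# Dry run only: nothing is executed, we just get the field metadata back
vapply(fields, function(f) f$name, character(1))
#> "phase" "phase_emoji" "peak_datetime"
vapply(fields, function(f) f$type, character(1))
#> "STRING" "STRING" "DATETIME"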
For Google Cloud Bigtable URIs: Exactly one URI can be specified and diff --git a/man/bq_table_download.Rd b/man/bq_table_download.Rd index 90970863..939f8780 100644 --- a/man/bq_table_download.Rd +++ b/man/bq_table_download.Rd @@ -12,6 +12,8 @@ bq_table_download( max_connections = 6L, quiet = NA, bigint = c("integer", "integer64", "numeric", "character"), + api = c("json", "arrow"), + billing = x$project, max_results = deprecated() ) } @@ -21,19 +23,19 @@ bq_table_download( \item{n_max}{Maximum number of results to retrieve. Use \code{Inf} to retrieve all rows.} -\item{page_size}{The number of rows requested per chunk. It is recommended to -leave this unspecified until you have evidence that the \code{page_size} -selected automatically by \code{bq_table_download()} is problematic. +\item{page_size}{(JSON only) The number of rows requested per chunk. It is +recommended to leave this unspecified until you have evidence that the +\code{page_size} selected automatically by \code{bq_table_download()} is problematic. When \code{page_size = NULL} bigrquery determines a conservative, natural chunk size empirically. If you specify the \code{page_size}, it is important that each chunk fits on one page, i.e. that the requested row limit is low enough to prevent the API from paginating based on response size.} -\item{start_index}{Starting row index (zero-based).} +\item{start_index}{(JSON only) Starting row index (zero-based).} -\item{max_connections}{Number of maximum simultaneous connections to -BigQuery servers.} +\item{max_connections}{(JSON only) Number of maximum simultaneous +connections to BigQuery servers.} \item{quiet}{If \code{FALSE}, displays progress bar; if \code{TRUE} is silent; if \code{NA} picks based on whether or not you're in an interactive context.} @@ -44,6 +46,18 @@ but results in \code{NA} for values above/below +/- 2147483647. \code{"integer64 returns a \link[bit64:bit64-package]{bit64::integer64}, which allows the full range of 64 bit integers.} +\item{api}{Which API to use? The \code{"json"} API works where ever bigrquery +does, but is slow and can require fiddling with the \code{page_size} parameter. +The \code{"arrow"} API is faster and more reliable, but only works if you +have also installed the bigrquerystorage package. + +Because the \code{"arrow"} API is so much faster, it will be used automatically +if the bigrquerystorage package is installed.} + +\item{billing}{(Arrow only) Project to bill; defaults to the project of \code{x}, +and typically only needs to be specified if you're working with public +datasets.} + \item{max_results}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Deprecated. Please use \code{n_max} instead.} } @@ -54,14 +68,30 @@ a tibble. If you need a \code{data.frame}, coerce the results with \code{\link[=as.data.frame]{as.data.frame()}}. } \description{ -This retrieves rows in chunks of \code{page_size}. It is most suitable for results -of smaller queries (<100 MB, say). For larger queries, it is better to -export the results to a CSV file stored on google cloud and use the -bq command line tool to download locally. +This function provides two ways to download data from BigQuery, transfering +data using either JSON or arrow, depending on the \code{api} argument. 
If +bigrquerystorage is installed, \code{api = "arrow"} will be used (because it's +so much faster, but see the limitions below), otherwise you can select +deliberately by using \code{api = "json"} or \code{api = "arrow"}. +\subsection{Arrow API}{ + +The arrow API is much faster, but has heavier dependencies: bigrquerystorage +requires the arrow package, which can be tricky to compile on Linux (but you +usually should be able to get a binary from +\href{https://posit.co/products/cloud/public-package-manager/}{Posit Public Package Manager}. + +There's one known limitation of \code{api = "arrow"}: when querying public data, +you'll now need to provide a \code{billing} project. } -\section{Complex data}{ -bigrquery will retrieve nested and repeated columns in to list-columns +\subsection{JSON API}{ + +The JSON API retrieves rows in chunks of \code{page_size}. It is most suitable +for results of smaller queries (<100 MB, say). Unfortunately due to +limitations in the BigQuery API, you may need to vary this parameter +depending on the complexity of the underlying data. + +The JSON API will convert nested and repeated columns in to list-columns as follows: \itemize{ \item Repeated values (arrays) will become a list-column of vectors. @@ -69,22 +99,7 @@ as follows: \item Repeated records will become list-columns of data frames. } } - -\section{Larger datasets}{ - -In my timings, this code takes around 1 minute per 100 MB of data. -If you need to download considerably more than this, I recommend: -\itemize{ -\item Export a \code{.csv} file to Cloud Storage using \code{\link[=bq_table_save]{bq_table_save()}}. -\item Use the \code{gsutil} command line utility to download it. -\item Read the csv file into R with \code{readr::read_csv()} or \code{data.table::fread()}. } - -Unfortunately you can not export nested or repeated formats into CSV, and -the formats that BigQuery supports (arvn and ndjson) that allow for -nested/repeated values, are not well supported in R. -} - \section{Google BigQuery API documentation}{ \itemize{ @@ -94,6 +109,6 @@ nested/repeated values, are not well supported in R. \examples{ \dontshow{if (bq_testable()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -df <- bq_table_download("publicdata.samples.natality", n_max = 35000) +df <- bq_table_download("publicdata.samples.natality", n_max = 35000, billing = bq_test_project()) \dontshow{\}) # examplesIf} } diff --git a/man/collect.tbl_BigQueryConnection.Rd b/man/collect.tbl_BigQueryConnection.Rd new file mode 100644 index 00000000..ac401ae2 --- /dev/null +++ b/man/collect.tbl_BigQueryConnection.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr.R +\name{collect.tbl_BigQueryConnection} +\alias{collect.tbl_BigQueryConnection} +\title{Collect a BigQuery table} +\usage{ +collect.tbl_BigQueryConnection( + x, + ..., + n = Inf, + api = c("json", "arrow"), + page_size = NULL, + max_connections = 6L +) +} +\arguments{ +\item{x}{A data frame, data frame extension (e.g. a tibble), or a lazy +data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for more +details.} + +\item{...}{Other arguments passed on to +\code{bq_project_query()}/\code{bq_project_query()}} + +\item{n}{Maximum number of results to retrieve. +The default, \code{Inf}, will retrieve all rows.} + +\item{api}{Which API to use? The \code{"json"} API works where ever bigrquery +does, but is slow and can require fiddling with the \code{page_size} parameter. 
+The \code{"arrow"} API is faster and more reliable, but only works if you +have also installed the bigrquerystorage package. + +Because the \code{"arrow"} API is so much faster, it will be used automatically +if the bigrquerystorage package is installed.} + +\item{page_size}{(JSON only) The number of rows requested per chunk. It is +recommended to leave this unspecified until you have evidence that the +\code{page_size} selected automatically by \code{bq_table_download()} is problematic. + +When \code{page_size = NULL} bigrquery determines a conservative, natural chunk +size empirically. If you specify the \code{page_size}, it is important that each +chunk fits on one page, i.e. that the requested row limit is low enough to +prevent the API from paginating based on response size.} + +\item{max_connections}{(JSON only) Number of maximum simultaneous +connections to BigQuery servers.} +} +\description{ +This collect method is specialised for BigQuery tables, generating the +SQL from your dplyr commands, then calling \code{\link[=bq_project_query]{bq_project_query()}} +or \code{\link[=bq_dataset_query]{bq_dataset_query()}} to run the query, then \code{\link[=bq_table_download]{bq_table_download()}} +to download the results. Thus the arguments are a combination of the +arguments to \code{\link[dplyr:compute]{dplyr::collect()}}, \code{bq_project_query()}/\code{bq_dataset_query()}, +and \code{bq_table_download()}. +} diff --git a/tests/testthat/_snaps/bq-download.md b/tests/testthat/_snaps/bq-download.md index bc9eed7a..5d5d49a0 100644 --- a/tests/testthat/_snaps/bq-download.md +++ b/tests/testthat/_snaps/bq-download.md @@ -1,7 +1,8 @@ # errors when table is known to be incomplete Code - bq_table_download(tb, n_max = 35000, page_size = 35000, bigint = "integer64") + bq_table_download(tb, n_max = 35000, page_size = 35000, bigint = "integer64", + api = "json") Message Downloading first chunk of data. Condition @@ -10,3 +11,16 @@ x 35,000 rows were requested, but only {n} rows were received. i Leave `page_size` unspecified or use an even smaller value. +# warns if supplying unnused arguments + + Code + . 
<- bq_table_download(tb, api = "arrow", page_size = 1, start_index = 1, + max_connections = 1) + Condition + Warning in `bq_table_download()`: + `page_size` is ignored when `api == "arrow"` + Warning in `bq_table_download()`: + `start_index` is ignored when `api == "arrow"` + Warning in `bq_table_download()`: + `max_connections` is ignored when `api == "arrow"` + diff --git a/tests/testthat/test-bq-download.R b/tests/testthat/test-bq-download.R index e8f64576..b7ee579a 100644 --- a/tests/testthat/test-bq-download.R +++ b/tests/testthat/test-bq-download.R @@ -3,8 +3,8 @@ test_that("same results regardless of page size", { tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df3 <- bq_table_download(tb, n_max = 30, page_size = 10) - df1 <- bq_table_download(tb, n_max = 30, page_size = 30) + df3 <- bq_table_download(tb, n_max = 30, page_size = 10, api = "json") + df1 <- bq_table_download(tb, n_max = 30, page_size = 30, api = "json") expect_equal(nrow(df1), 30) expect_equal(df1, df3) }) @@ -13,7 +13,7 @@ test_that("can retrieve fraction of page size", { skip_if_no_auth() tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df <- bq_table_download(tb, n_max = 15, page_size = 10) + df <- bq_table_download(tb, n_max = 15, page_size = 10, api = "json") expect_equal(nrow(df), 15) }) @@ -21,7 +21,7 @@ test_that("can retrieve zero rows", { skip_if_no_auth() tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df <- bq_table_download(tb, n_max = 0) + df <- bq_table_download(tb, n_max = 0, api = "json") expect_equal(nrow(df), 0) expect_named(df, c("phase", "phase_emoji", "peak_datetime")) }) @@ -34,7 +34,7 @@ test_that("can specify large integers in page params", { withr::local_options(list(scipen = -4)) tb <- as_bq_table("bigquery-public-data.moon_phases.moon_phases") - df <- bq_table_download(tb, n_max = 100, page_size = 20) + df <- bq_table_download(tb, n_max = 100, page_size = 20, api = "json") expect_equal(nrow(df), 100) }) @@ -49,7 +49,8 @@ test_that("errors when table is known to be incomplete", { tb, n_max = 35000, page_size = 35000, - bigint = "integer64" + bigint = "integer64", + api = "json" ), transform = function(x) { gsub("[0-9,]+ rows were received", "{n} rows were received", x, perl = TRUE) @@ -58,6 +59,98 @@ test_that("errors when table is known to be incomplete", { ) }) +# api = "arrow" ---------------------------------------------------------------- + +test_that("check_api respects inputs", { + expect_equal(check_api("arrow"), "arrow") + expect_equal(check_api("json"), "json") +}) + +test_that("uses arrow api if bigrquerystorage installed", { + expect_equal(check_api(), "arrow") + + local_mocked_bindings(is_installed = function(...) FALSE) + expect_equal(check_api(), "json") +}) + +test_that("warns if supplying unnused arguments", { + tb <- bq_project_query(bq_test_project(), "SELECT 1.0", quiet = TRUE) + expect_snapshot( + . 
<- bq_table_download(tb, + api = "arrow", + page_size = 1, + start_index = 1, + max_connections = 1 + ) + ) +}) + +test_that("arrow api can convert non-nested types", { + sql <- "SELECT + '\U0001f603' as unicode, + datetime, + TRUE as logicaltrue, + FALSE as logicalfalse, + CAST ('Hi' as BYTES) as bytes, + CAST (datetime as DATE) as date, + CAST (datetime as TIME) as time, + CAST (datetime as TIMESTAMP) as timestamp, + ST_GEOGFROMTEXT('POINT (30 10)') as geography + FROM (SELECT DATETIME '2000-01-02 03:04:05.67' as datetime) + " + + tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) + df <- bq_table_download(tb, api = "arrow", quiet = TRUE) + + base <- ISOdatetime(2000, 1, 2, 3, 4, 5.67, tz = "UTC") + expect_identical(df$unicode, "\U0001f603", ignore_encoding = FALSE) + + expect_equal(df$logicaltrue, TRUE) + expect_equal(df$logicalfalse, FALSE) + + expect_equal(df$bytes, blob::as_blob(as.raw(c(0x48, 0x69)))) + + expect_equal(df$date, as.Date(base)) + expect_equal(df$timestamp, base) + expect_equal(df$datetime, base) + expect_equal(df$time, hms::hms(hours = 3, minutes = 4, seconds = 5.67)) + + expect_identical(df$geography, wk::wkt("POINT(30 10)")) +}) + +test_that("arrow api can convert nested types", { + skip("https://github.com/meztez/bigrquerystorage/issues/54") + sql <- "SELECT + STRUCT(1.0 AS a, 'abc' AS b) as s, + [1.0, 2.0, 3.0] as a, + [STRUCT(1.0 as a, 'a' as b), STRUCT(2.0, 'b'), STRUCT(3, 'c')] as aos, + STRUCT([1.0, 2.0, 3.0] as a, ['a', 'b'] as b) as soa + " + + tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) + df <- bq_table_download(tb, api = "arrow", quiet = TRUE) + + expect_equal(df$s, list(list(a = 1, b = "abc"))) + expect_equal(df$a, list(c(1, 2, 3))) + expect_equal(df$aos, list(tibble(a = c(1, 2, 3), b = c("a", "b", "c")))) + expect_equal(df$soa, list(list(a = c(1, 2, 3), b = c("a", "b")))) +}) + +test_that("arrow api respects bigint", { + x <- c("-2147483648", "-2147483647", "-1", "0", "1", "2147483647", "2147483648") + sql <- paste0("SELECT * FROM UNNEST ([", paste0(x, collapse = ","), "]) AS x"); + qry <- bq_project_query(bq_test_project(), sql) + + out_int64 <- bq_table_download(qry, bigint = "integer64", api = "arrow", quiet = TRUE)$x + expect_identical(out_int64, bit64::as.integer64(x)) + + out_dbl <- bq_table_download(qry, bigint = "numeric", api = "arrow", quiet = TRUE)$x + expect_identical(out_dbl, as.double(x)) + + out_chr <- bq_table_download(qry, bigint = "character", api = "arrow", quiet = TRUE)$x + expect_identical(out_chr, x) +}) + # helpers around row and chunk params ------------------------------------------ test_that("set_row_params() works ", { @@ -173,7 +266,7 @@ test_that("can convert date time types", { " tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") base <- ISOdatetime(2000, 1, 2, 3, 4, 5.67, tz = "UTC") @@ -197,7 +290,7 @@ test_that("can parse fractional seconds", { test_that("correctly parse logical values" ,{ query <- "SELECT TRUE as x" tb <- bq_project_query(bq_test_project(), query) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_true(df$x) }) @@ -208,18 +301,18 @@ test_that("the return type of integer columns is set by the bigint argument", { qry <- bq_project_query(bq_test_project(), sql) expect_warning( - out_int <- bq_table_download(qry, bigint = "integer")$x, + out_int <- bq_table_download(qry, bigint = "integer", api = "json")$x, "integer overflow" ) expect_identical(out_int, 
suppressWarnings(as.integer(x))) - out_int64 <- bq_table_download(qry, bigint = "integer64")$x + out_int64 <- bq_table_download(qry, bigint = "integer64", api = "json")$x expect_identical(out_int64, bit64::as.integer64(x)) - out_dbl <- bq_table_download(qry, bigint = "numeric")$x + out_dbl <- bq_table_download(qry, bigint = "numeric", api = "json")$x expect_identical(out_dbl, as.double(x)) - out_chr <- bq_table_download(qry, bigint = "character")$x + out_chr <- bq_table_download(qry, bigint = "character", api = "json")$x expect_identical(out_chr, x) }) @@ -227,7 +320,7 @@ test_that("can convert geography type", { skip_if_not_installed("wk") sql <- "SELECT ST_GEOGFROMTEXT('POINT (30 10)') as geography" tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_identical(df$geography, wk::wkt("POINT(30 10)")) }) @@ -235,7 +328,7 @@ test_that("can convert geography type", { test_that("can convert bytes type", { sql <- "SELECT ST_ASBINARY(ST_GEOGFROMTEXT('POINT (30 10)')) as bytes" tb <- bq_project_query(bq_test_project(), sql, quiet = TRUE) - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_identical( df$bytes, diff --git a/tests/testthat/test-bq-parse.R b/tests/testthat/test-bq-parse.R index 93328768..70ac23b4 100644 --- a/tests/testthat/test-bq-parse.R +++ b/tests/testthat/test-bq-parse.R @@ -128,11 +128,11 @@ test_that("can parse nested structures", { test_that("can parse empty arrays", { tb <- bq_project_query(bq_test_project(), "SELECT ARRAY[] as x") - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_equal(df$x, list(integer(length = 0))) tb <- bq_project_query(bq_test_project(), "SELECT ARRAY>[] as x") - df <- bq_table_download(tb) + df <- bq_table_download(tb, api = "json") expect_equal(df$x, list(tibble::tibble(a = integer(length = 0), b = character()))) }) diff --git a/tests/testthat/test-bq-perform.R b/tests/testthat/test-bq-perform.R index 3b804321..9ad0a06d 100644 --- a/tests/testthat/test-bq-perform.R +++ b/tests/testthat/test-bq-perform.R @@ -95,10 +95,20 @@ test_that("can supply array parameters", { expect_setequal(df$values, c("a", "b")) }) -test_that("can estimate cost", { +test_that("can estimate cost and get schema", { cost <- bq_perform_query_dry_run( "SELECT count(*) FROM bigquery-public-data.moon_phases.moon_phases", billing = bq_test_project() ) expect_equal(cost, structure(0, class = "bq_bytes")) + + schema <- bq_perform_query_schema( + "SELECT * FROM bigquery-public-data.moon_phases.moon_phases", + billing = bq_test_project() + ) + names <- vapply(schema, function(x) x$name, character(1)) + expect_equal(names, c("phase", "phase_emoji", "peak_datetime")) + + types <- vapply(schema, function(x) x$type, character(1)) + expect_equal(types, c("STRING", "STRING", "DATETIME")) }) diff --git a/tests/testthat/test-bq-table.R b/tests/testthat/test-bq-table.R index 2400ee6e..76b30396 100644 --- a/tests/testthat/test-bq-table.R +++ b/tests/testthat/test-bq-table.R @@ -38,7 +38,7 @@ test_that("can round trip to non-default location", { bq_df <- bq_table(dallas, "df") bq_table_upload(bq_df, df1) - df2 <- bq_table_download(bq_df) + df2 <- bq_table_download(bq_df, api = "json") df2 <- df2[order(df2$x), names(df1)] # BQ doesn't guarantee order rownames(df2) <- NULL @@ -54,7 +54,7 @@ test_that("can roundtrip via save + load", { defer(gs_object_delete(gs)) bq_table_load(tb2, gs) - df <- bq_table_download(tb2) + df <- bq_table_download(tb2, api = 
"json") expect_equal(dim(df), c(32, 11)) }) @@ -79,7 +79,7 @@ test_that("can round trip atomic vectors", { bq_df <- bq_test_table() bq_table_upload(bq_df, df1) - df2 <- bq_table_download(bq_df, bigint = "integer") + df2 <- bq_table_download(bq_df, bigint = "integer", api = "json") df2 <- df2[order(df2[[1]]), names(df1)] # BQ doesn't gaurantee order rownames(df2) <- NULL @@ -94,7 +94,7 @@ test_that("can round-trip POSIXt to either TIMESTAMP or DATETIME", { bq_fields(list(bq_field("datetime", "TIMESTAMP"))) ) bq_table_upload(tb1, df) - df1 <- bq_table_download(tb1) + df1 <- bq_table_download(tb1, api = "json") expect_equal(df1, df) tb2 <- bq_table_create( @@ -102,7 +102,7 @@ test_that("can round-trip POSIXt to either TIMESTAMP or DATETIME", { bq_fields(list(bq_field("datetime", "DATETIME"))) ) bq_table_upload(tb2, df) - df2 <- bq_table_download(tb2) + df2 <- bq_table_download(tb2, api = "json") expect_equal(df2, df) }) @@ -117,7 +117,7 @@ test_that("can round trip data frame with list-cols", { ) bq_table_upload(tb, df1) - df2 <- bq_table_download(tb, bigint = "integer") + df2 <- bq_table_download(tb, bigint = "integer", api = "json") # restore column order df2 <- df2[names(df1)] df2$struct[[1]] <- df2$struct[[1]][c("x", "y", "z")] @@ -164,7 +164,7 @@ test_that("can round-trip GEOGRAPHY", { tb1 <- bq_table_create(bq_test_table(), as_bq_fields(df)) bq_table_upload(tb1, df) - df1 <- bq_table_download(tb1) + df1 <- bq_table_download(tb1, api = "json") expect_equal(df1, df) }) @@ -173,6 +173,6 @@ test_that("can round-trip BYTES", { tb1 <- bq_table_create(bq_test_table(), as_bq_fields(df)) bq_table_upload(tb1, df) - df1 <- bq_table_download(tb1) + df1 <- bq_table_download(tb1, api = "json") expect_equal(df1, df) }) diff --git a/tests/testthat/test-dplyr.R b/tests/testthat/test-dplyr.R index c78bcbaf..01bfeeed 100644 --- a/tests/testthat/test-dplyr.R +++ b/tests/testthat/test-dplyr.R @@ -21,14 +21,25 @@ test_that("can work with literal SQL", { }) test_that("can work with nested table identifier", { - con_us <- DBI::dbConnect( + con1 <- DBI::dbConnect( bigquery(), project = "bigquery-public-data", billing = bq_test_project() ) + # As far as I can tell from the BigQuery API there's no way to provide + # a default project; you can either provide a default dataset + project or + # nothing + table_name <- I("bigquery-public-data.utility_us.country_code_iso") + expect_no_error(dplyr::collect(head(dplyr::tbl(con1, table_name)))) - expect_s3_class(dplyr::collect(head(dplyr::tbl(con_us, I("utility_us.country_code_iso")))), "tbl_df") - expect_error(dplyr::collect(head(dplyr::tbl(con_us, "utility_us.country_code_iso"))), "tbl_df") + + con2 <- DBI::dbConnect( + bigquery(), + project = "bigquery-public-data", + dataset = "utility_us", + billing = bq_test_project(), + ) + expect_no_error(dplyr::collect(head(dplyr::tbl(con2, "country_code_iso")))) }) test_that("can copy_to", { @@ -167,8 +178,8 @@ test_that("all BigQuery tbls share the same src", { billing = bq_test_project() ) - tbl1 <- dplyr::tbl(con1, "basedata.mtcars", vars = "x") - tbl2 <- dplyr::tbl(con2, "publicdata.samples.natality", vars = "x") + tbl1 <- dplyr::tbl(con1, I("basedata.mtcars"), vars = "x") + tbl2 <- dplyr::tbl(con2, I("publicdata.samples.natality"), vars = "x") expect_true(dplyr::same_src(tbl1, tbl2)) expect_false(dplyr::same_src(tbl1, mtcars)) })