From ec1eda1b14834346fdb6f2d1f5709063fdb06df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Felipe=20Quintero=20Moreano?= Date: Fri, 26 Apr 2024 10:11:55 -0500 Subject: [PATCH] feat: Adds support for scanning parquet from GCP (#1056) Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- NEWS.md | 3 + R/extendr-wrappers.R | 2 +- R/io_csv.R | 5 +- R/io_parquet.R | 28 +- man/IO_read_parquet.Rd | 27 + man/IO_scan_parquet.Rd | 27 + src/Makevars.win | 2 +- src/rust/Cargo.lock | 667 +++++++++++++++++++++++- src/rust/Cargo.toml | 5 + src/rust/src/rdataframe/read_parquet.rs | 10 +- src/rust/src/rdatatype.rs | 14 + tests/testthat/test-csv-read.R | 2 +- 12 files changed, 780 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index de7fd44b9..cbc4c75ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,9 @@ ### New features - `$cut()` and `$qcut()` to bin continuous values into discrete categories (#1057). +- `pl$scan_parquet()` and `pl$read_parquet()` gain an argument `storage_options` + to scan/read data via cloud storage providers (GCP, AWS, Azure). Note that this + support is experimental (#1056, @andyquinterom). ### Bug fixes diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 3dcf18657..51ef5af59 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -102,7 +102,7 @@ import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, new_from_ndjson <- function(path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) .Call(wrap__new_from_ndjson, path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) -new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, use_statistics, low_memory, hive_partitioning) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, use_statistics, low_memory, hive_partitioning) +new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning) test_rpolarserr <- function() .Call(wrap__test_rpolarserr) diff --git a/R/io_csv.R b/R/io_csv.R index 4002eb6fc..fc70b3397 100644 --- a/R/io_csv.R +++ b/R/io_csv.R @@ -216,8 +216,9 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) { # try download file if valid url if (!is.null(con)) { close(con) - if (is.null(cache_temp_file[[actual_url]])) - cache_temp_file[[actual_url]] <- tempfile() + if (is.null(cache_temp_file[[actual_url]])) { + cache_temp_file[[actual_url]] = tempfile() + } if (isFALSE(reuse_downloaded) || isFALSE(file.exists(cache_temp_file[[actual_url]]))) { download.file(url = actual_url, destfile = cache_temp_file[[actual_url]]) message(paste("tmp file placed in \n", cache_temp_file[[actual_url]])) diff --git a/R/io_parquet.R b/R/io_parquet.R index 2c611a1ad..677b2e40d 100644 --- a/R/io_parquet.R +++ b/R/io_parquet.R @@ -14,7 +14,28 @@ #' and use them to prune reads. #' @param use_statistics Use statistics in the parquet file to determine if pages #' can be skipped from reading. +#' @param storage_options Experimental. List of options necessary to scan +#' parquet files from different cloud storage providers (GCP, AWS, Azure). +#' See the 'Details' section. #' @rdname IO_scan_parquet +#' @details +#' ## Connecting to cloud providers +#' +#' Polars supports scanning parquet files from different cloud providers. +#' The cloud providers currently supported are AWS, GCP, and Azure. +#' The supported keys to pass to the `storage_options` argument can be found +#' here: +#' +#' - [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html) +#' - [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html) +#' - [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html) +#' +#' ### Implementation details +#' +#' - Currently it is impossible to scan public parquet files from GCP without +#' a valid service account. Be sure to always include a service account in the +#' `storage_options` argument. +#' #' @examplesIf requireNamespace("arrow", quietly = TRUE) && arrow::arrow_with_dataset() && arrow::arrow_with_parquet() #' temp_dir = tempfile() #' # Write a hive-style partitioned parquet dataset @@ -46,6 +67,7 @@ pl_scan_parquet = function( hive_partitioning = TRUE, rechunk = FALSE, low_memory = FALSE, + storage_options = NULL, use_statistics = TRUE, cache = TRUE) { new_from_parquet( @@ -58,7 +80,8 @@ pl_scan_parquet = function( row_index = row_index_offset, low_memory = low_memory, use_statistics = use_statistics, - hive_partitioning = hive_partitioning + hive_partitioning = hive_partitioning, + storage_options = storage_options ) |> unwrap("in pl$scan_parquet():") } @@ -66,7 +89,7 @@ pl_scan_parquet = function( #' Read a parquet file #' @rdname IO_read_parquet #' @inherit pl_read_csv return -#' @inheritParams pl_scan_parquet +#' @inherit pl_scan_parquet params details #' @examplesIf requireNamespace("arrow", quietly = TRUE) && arrow::arrow_with_dataset() && arrow::arrow_with_parquet() #' temp_dir = tempfile() #' # Write a hive-style partitioned parquet dataset @@ -98,6 +121,7 @@ pl_read_parquet = function( hive_partitioning = TRUE, rechunk = TRUE, low_memory = FALSE, + storage_options = NULL, use_statistics = TRUE, cache = TRUE) { .args = as.list(environment()) diff --git a/man/IO_read_parquet.Rd b/man/IO_read_parquet.Rd index 5b38d0ec1..991d85e46 100644 --- a/man/IO_read_parquet.Rd +++ b/man/IO_read_parquet.Rd @@ -14,6 +14,7 @@ pl_read_parquet( hive_partitioning = TRUE, rechunk = TRUE, low_memory = FALSE, + storage_options = NULL, use_statistics = TRUE, cache = TRUE ) @@ -44,6 +45,10 @@ the final DataFrame into contiguous memory chunks.} \item{low_memory}{Reduce memory usage (will yield a lower performance).} +\item{storage_options}{Experimental. List of options necessary to scan +parquet files from different cloud storage providers (GCP, AWS, Azure). +See the 'Details' section.} + \item{use_statistics}{Use statistics in the parquet file to determine if pages can be skipped from reading.} @@ -55,6 +60,28 @@ can be skipped from reading.} \description{ Read a parquet file } +\details{ +\subsection{Connecting to cloud providers}{ + +Polars supports scanning parquet files from different cloud providers. +The cloud providers currently supported are AWS, GCP, and Azure. +The supported keys to pass to the \code{storage_options} argument can be found +here: +\itemize{ +\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws} +\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} +\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} +} +\subsection{Implementation details}{ +\itemize{ +\item Currently it is impossible to scan public parquet files from GCP without +a valid service account. Be sure to always include a service account in the +\code{storage_options} argument. +} +} + +} +} \examples{ \dontshow{if (requireNamespace("arrow", quietly = TRUE) && arrow::arrow_with_dataset() && arrow::arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} temp_dir = tempfile() diff --git a/man/IO_scan_parquet.Rd b/man/IO_scan_parquet.Rd index 5c6234635..0cfcd0ccb 100644 --- a/man/IO_scan_parquet.Rd +++ b/man/IO_scan_parquet.Rd @@ -14,6 +14,7 @@ pl_scan_parquet( hive_partitioning = TRUE, rechunk = FALSE, low_memory = FALSE, + storage_options = NULL, use_statistics = TRUE, cache = TRUE ) @@ -44,6 +45,10 @@ the final DataFrame into contiguous memory chunks.} \item{low_memory}{Reduce memory usage (will yield a lower performance).} +\item{storage_options}{Experimental. List of options necessary to scan +parquet files from different cloud storage providers (GCP, AWS, Azure). +See the 'Details' section.} + \item{use_statistics}{Use statistics in the parquet file to determine if pages can be skipped from reading.} @@ -55,6 +60,28 @@ can be skipped from reading.} \description{ Scan a parquet file } +\details{ +\subsection{Connecting to cloud providers}{ + +Polars supports scanning parquet files from different cloud providers. +The cloud providers currently supported are AWS, GCP, and Azure. +The supported keys to pass to the \code{storage_options} argument can be found +here: +\itemize{ +\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws} +\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} +\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} +} +\subsection{Implementation details}{ +\itemize{ +\item Currently it is impossible to scan public parquet files from GCP without +a valid service account. Be sure to always include a service account in the +\code{storage_options} argument. +} +} + +} +} \examples{ \dontshow{if (requireNamespace("arrow", quietly = TRUE) && arrow::arrow_with_dataset() && arrow::arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} temp_dir = tempfile() diff --git a/src/Makevars.win b/src/Makevars.win index e2e875b4c..592715af1 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -6,7 +6,7 @@ LIBNAME = libr_polars.a TARGET_DIR = $(CURDIR)/rust/target LIBDIR = $(TARGET_DIR)/$(TARGET)/$(LIBR_POLARS_PROFILE) STATLIB = $(LIBDIR)/$(LIBNAME) -PKG_LIBS = -L$(LIBDIR) -lr_polars -lws2_32 -ladvapi32 -luserenv -lbcrypt -lole32 -lntdll -lpsapi -liphlpapi -lpdh -lpowrprof -loleaut32 -lnetapi32 -lsecur32 -lsynchronization -t +PKG_LIBS = -L$(LIBDIR) -lr_polars -lws2_32 -lncrypt -lcrypt32 -ladvapi32 -luserenv -lbcrypt -lole32 -lntdll -lpsapi -liphlpapi -lpdh -lpowrprof -loleaut32 -lnetapi32 -lsecur32 -lsynchronization -t # Rtools42 doesn't have the linker in the location that cargo expects, so we # need to overwrite it via configuration. diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 5659eeab0..c79022866 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -185,6 +185,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" + [[package]] name = "bincode" version = "1.3.3" @@ -209,6 +215,15 @@ dependencies = [ "serde", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "brotli" version = "3.4.0" @@ -333,6 +348,16 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.6" @@ -428,6 +453,32 @@ dependencies = [ "winapi", ] +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "dyn-clone" version = "1.0.16" @@ -440,6 +491,15 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" +[[package]] +name = "encoding_rs" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +dependencies = [ + "cfg-if", +] + [[package]] name = "enum_dispatch" version = "0.3.12" @@ -558,6 +618,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futures" version = "0.3.30" @@ -660,6 +729,16 @@ dependencies = [ "windows 0.48.0", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.12" @@ -685,6 +764,25 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "h2" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "halfbrown" version = "0.2.4" @@ -742,6 +840,84 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http", + "hyper", + "rustls", + "tokio", + "tokio-rustls", +] + [[package]] name = "iana-time-zone" version = "0.1.60" @@ -765,6 +941,16 @@ dependencies = [ "cc", ] +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "indenter" version = "0.3.3" @@ -801,6 +987,21 @@ dependencies = [ "windows 0.48.0", ] +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.10" @@ -1057,6 +1258,16 @@ dependencies = [ "rawpointer", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.7.1" @@ -1081,6 +1292,12 @@ dependencies = [ "libmimalloc-sys", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "miniz_oxide" version = "0.7.2" @@ -1221,12 +1438,49 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8718f8b65fdf67a45108d1548347d4af7d71fb81ce727bbf9e3b2535e079db3" +dependencies = [ + "async-trait", + "base64 0.21.7", + "bytes", + "chrono", + "futures", + "humantime", + "hyper", + "itertools", + "md-5", + "parking_lot", + "percent-encoding", + "quick-xml", + "rand", + "reqwest", + "ring", + "rustls-pemfile 2.1.2", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "overload" version = "0.1.1" @@ -1474,6 +1728,7 @@ version = "0.39.2" source = "git+https://github.com/pola-rs/polars.git?rev=4c57688d204fad3d0d5e4586ecd0405ead7baeb2#4c57688d204fad3d0d5e4586ecd0405ead7baeb2" dependencies = [ "avro-schema", + "object_store", "polars-arrow-format", "regex", "simdutf8", @@ -1499,6 +1754,7 @@ dependencies = [ "memchr", "memmap2", "num-traits", + "object_store", "once_cell", "percent-encoding", "polars-arrow", @@ -1510,6 +1766,7 @@ dependencies = [ "polars-utils", "rayon", "regex", + "reqwest", "ryu", "serde", "serde_json", @@ -1518,6 +1775,7 @@ dependencies = [ "smartstring", "tokio", "tokio-util", + "url", "zstd", ] @@ -1548,6 +1806,7 @@ source = "git+https://github.com/pola-rs/polars.git?rev=4c57688d204fad3d0d5e4586 dependencies = [ "ahash", "bitflags 2.4.2", + "futures", "glob", "once_cell", "polars-arrow", @@ -1561,6 +1820,7 @@ dependencies = [ "polars-utils", "rayon", "smartstring", + "tokio", "version_check", ] @@ -1572,7 +1832,7 @@ dependencies = [ "ahash", "aho-corasick", "argminmax", - "base64", + "base64 0.21.7", "bytemuck", "chrono", "chrono-tz", @@ -1607,7 +1867,7 @@ source = "git+https://github.com/pola-rs/polars.git?rev=4c57688d204fad3d0d5e4586 dependencies = [ "ahash", "async-stream", - "base64", + "base64 0.21.7", "brotli", "ethnum", "flate2", @@ -1633,6 +1893,7 @@ dependencies = [ "crossbeam-channel", "crossbeam-queue", "enum_dispatch", + "futures", "hashbrown 0.14.3", "num-traits", "polars-arrow", @@ -1645,6 +1906,7 @@ dependencies = [ "polars-utils", "rayon", "smartstring", + "tokio", "uuid", "version_check", ] @@ -1658,6 +1920,7 @@ dependencies = [ "bytemuck", "chrono", "chrono-tz", + "futures", "hashbrown 0.14.3", "once_cell", "percent-encoding", @@ -1770,6 +2033,16 @@ dependencies = [ "cc", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.35" @@ -1970,6 +2243,64 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "reqwest" +version = "0.11.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" +dependencies = [ + "base64 0.21.7", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pemfile 1.0.4", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-rustls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "winreg", +] + +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rle-decode-fast" version = "1.0.3" @@ -1995,6 +2326,65 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustls" +version = "0.21.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile 1.0.4", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64 0.21.7", +] + +[[package]] +name = "rustls-pemfile" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +dependencies = [ + "base64 0.22.0", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" + +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.14" @@ -2007,6 +2397,24 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "scoped-tls" version = "1.0.1" @@ -2019,6 +2427,39 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "security-framework" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "seq-macro" version = "0.3.5" @@ -2057,6 +2498,18 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2123,6 +2576,28 @@ dependencies = [ "version_check", ] +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "snap" version = "1.1.1" @@ -2247,6 +2722,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + [[package]] name = "sysinfo" version = "0.30.5" @@ -2261,6 +2742,27 @@ dependencies = [ "windows 0.52.0", ] +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "target-features" version = "0.1.5" @@ -2309,6 +2811,21 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.36.0" @@ -2322,9 +2839,31 @@ dependencies = [ "num_cpus", "pin-project-lite", "socket2", + "tokio-macros", "windows-sys 0.48.0", ] +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.50", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.10" @@ -2336,8 +2875,15 @@ dependencies = [ "futures-sink", "pin-project-lite", "tokio", + "tracing", ] +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + [[package]] name = "tracing" version = "0.1.40" @@ -2399,12 +2945,39 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-reverse" version = "1.0.8" @@ -2426,6 +2999,23 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "uuid" version = "1.7.0" @@ -2459,6 +3049,25 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2490,6 +3099,18 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877b9c3f61ceea0e56331985743b13f3d25c406a7098d45180fb5f09bc19ed97" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.91" @@ -2519,6 +3140,29 @@ version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96565907687f7aceb35bc5fc03770a8a0471d82e479f25832f54a0e3f4b28446" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2535,6 +3179,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -2701,6 +3354,16 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "xxhash-rust" version = "0.8.10" diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 5c3bebcc0..eb06211e8 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -67,7 +67,10 @@ features = [ "arg_where", "asof_join", "avro", + "aws", + "azure", "binary_encoding", + "cloud", "concat_str", "cov", "cross_join", @@ -97,6 +100,8 @@ features = [ "ewma_by", "find_many", "fmt", + "gcp", + "http", "interpolate", "ipc", "is_between", diff --git a/src/rust/src/rdataframe/read_parquet.rs b/src/rust/src/rdataframe/read_parquet.rs index 392048b74..c134e5a75 100644 --- a/src/rust/src/rdataframe/read_parquet.rs +++ b/src/rust/src/rdataframe/read_parquet.rs @@ -1,4 +1,5 @@ use crate::lazy::dataframe::RPolarsLazyFrame; +use crate::rdatatype::robj_to_cloud_options; use crate::robj_to; use crate::rpolarserr::{polars_to_rpolars_err, RResult}; @@ -6,6 +7,7 @@ use extendr_api::Rinternals; use extendr_api::{extendr, extendr_module, Robj}; use polars::io::RowIndex; use polars::prelude::{self as pl}; + #[allow(clippy::too_many_arguments)] #[extendr] pub fn new_from_parquet( @@ -16,12 +18,14 @@ pub fn new_from_parquet( rechunk: Robj, row_name: Robj, row_index: Robj, - //storage_options: Robj, // not supported yet, add provide features e.g. aws + storage_options: Robj, use_statistics: Robj, low_memory: Robj, hive_partitioning: Robj, //retries: Robj // not supported yet, with CloudOptions ) -> RResult { + let path = robj_to!(String, path)?; + let cloud_options = robj_to_cloud_options(&path, &storage_options)?; let offset = robj_to!(Option, u32, row_index)?.unwrap_or(0); let opt_row_index = robj_to!(Option, String, row_name)?.map(|name| RowIndex { name, offset }); let args = pl::ScanArgsParquet { @@ -31,7 +35,7 @@ pub fn new_from_parquet( rechunk: robj_to!(bool, rechunk)?, row_index: opt_row_index, low_memory: robj_to!(bool, low_memory)?, - cloud_options: None, + cloud_options, use_statistics: robj_to!(bool, use_statistics)?, hive_options: polars::io::HiveOptions { enabled: robj_to!(bool, hive_partitioning)?, @@ -39,7 +43,7 @@ pub fn new_from_parquet( }, }; - pl::LazyFrame::scan_parquet(robj_to!(String, path)?, args) + pl::LazyFrame::scan_parquet(path, args) .map_err(polars_to_rpolars_err) .map(RPolarsLazyFrame) } diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index 375c63fa5..e44309a79 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -361,6 +361,20 @@ pub fn robj_to_interpolation_method(robj: Robj) -> RResult RResult> { + if robj.is_null() { + return Ok(None); + } + if let (Some(names), Some(values)) = (robj.as_str_iter(), robj.names()) { + Ok(Some(pl::cloud::CloudOptions::from_untyped_config( + url, + values.zip(names), + )?)) + } else { + Ok(None) + } +} + pub fn robj_to_rank_method(robj: Robj) -> RResult { use pl::RankMethod as RM; match robj_to_rchoice(robj)?.to_lowercase().as_str() { diff --git a/tests/testthat/test-csv-read.R b/tests/testthat/test-csv-read.R index bc2fa5cc8..0f4923ce4 100644 --- a/tests/testthat/test-csv-read.R +++ b/tests/testthat/test-csv-read.R @@ -184,7 +184,7 @@ test_that("bad paths", { test_that("cache url tempfile", { skip_if_offline() - url <- "https://vincentarelbundock.github.io/Rdatasets/csv/AER/BenderlyZwick.csv" + url = "https://vincentarelbundock.github.io/Rdatasets/csv/AER/BenderlyZwick.csv" local_mocked_bindings( download.file = function(...) invisible(NULL), )