feat: Adds support for scanning parquet from GCP (#1056)
Co-authored-by: Etienne Bacher <[email protected]>
andyquinterom and etiennebacher authored Apr 26, 2024
1 parent 2988ec2 commit ec1eda1
Showing 12 changed files with 780 additions and 12 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
@@ -5,6 +5,9 @@
### New features

- `$cut()` and `$qcut()` to bin continuous values into discrete categories (#1057).
- `pl$scan_parquet()` and `pl$read_parquet()` gain an argument `storage_options`
to scan/read data via cloud storage providers (GCP, AWS, Azure). Note that this
support is experimental (#1056, @andyquinterom).

### Bug fixes

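A minimal usage sketch of the new argument (not part of the commit): the bucket, object path, and credential values below are placeholders, and the key names assume the object_store `AmazonS3ConfigKey` spellings linked in the package documentation further down.

```r
library(polars)

# Hypothetical S3 location and credentials; storage_options is passed here as a
# named list of key/value strings understood by object_store.
lf = pl$scan_parquet(
  "s3://my-bucket/data/file.parquet",
  storage_options = list(
    aws_access_key_id = Sys.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
    aws_region = "us-east-1"
  )
)
df = lf$collect()
```
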
2 changes: 1 addition & 1 deletion R/extendr-wrappers.R
@@ -102,7 +102,7 @@ import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index,

new_from_ndjson <- function(path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) .Call(wrap__new_from_ndjson, path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors)

-new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, use_statistics, low_memory, hive_partitioning) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, use_statistics, low_memory, hive_partitioning)
+new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning)

test_rpolarserr <- function() .Call(wrap__test_rpolarserr)

5 changes: 3 additions & 2 deletions R/io_csv.R
@@ -216,8 +216,9 @@ check_is_link = function(path, reuse_downloaded, raise_error = FALSE) {
# try download file if valid url
if (!is.null(con)) {
close(con)
-if (is.null(cache_temp_file[[actual_url]]))
-cache_temp_file[[actual_url]] <- tempfile()
+if (is.null(cache_temp_file[[actual_url]])) {
+cache_temp_file[[actual_url]] = tempfile()
+}
if (isFALSE(reuse_downloaded) || isFALSE(file.exists(cache_temp_file[[actual_url]]))) {
download.file(url = actual_url, destfile = cache_temp_file[[actual_url]])
message(paste("tmp file placed in \n", cache_temp_file[[actual_url]]))
28 changes: 26 additions & 2 deletions R/io_parquet.R
@@ -14,7 +14,28 @@
#' and use them to prune reads.
#' @param use_statistics Use statistics in the parquet file to determine if pages
#' can be skipped from reading.
#' @param storage_options Experimental. List of options necessary to scan
#' parquet files from different cloud storage providers (GCP, AWS, Azure).
#' See the 'Details' section.
#' @rdname IO_scan_parquet
#' @details
#' ## Connecting to cloud providers
#'
#' Polars supports scanning parquet files from different cloud providers.
#' The cloud providers currently supported are AWS, GCP, and Azure.
#' The supported keys to pass to the `storage_options` argument can be found
#' here:
#'
#' - [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
#' - [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
#' - [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
#'
#' ### Implementation details
#'
#' - Currently it is impossible to scan public parquet files from GCP without
#' a valid service account. Be sure to always include a service account in the
#' `storage_options` argument.
#'
#' @examplesIf requireNamespace("arrow", quietly = TRUE) && arrow::arrow_with_dataset() && arrow::arrow_with_parquet()
#' temp_dir = tempfile()
#' # Write a hive-style partitioned parquet dataset
@@ -46,6 +67,7 @@ pl_scan_parquet = function(
hive_partitioning = TRUE,
rechunk = FALSE,
low_memory = FALSE,
storage_options = NULL,
use_statistics = TRUE,
cache = TRUE) {
new_from_parquet(
@@ -58,15 +80,16 @@
row_index = row_index_offset,
low_memory = low_memory,
use_statistics = use_statistics,
-hive_partitioning = hive_partitioning
+hive_partitioning = hive_partitioning,
+storage_options = storage_options
) |>
unwrap("in pl$scan_parquet():")
}

#' Read a parquet file
#' @rdname IO_read_parquet
#' @inherit pl_read_csv return
-#' @inheritParams pl_scan_parquet
+#' @inherit pl_scan_parquet params details
#' @examplesIf requireNamespace("arrow", quietly = TRUE) && arrow::arrow_with_dataset() && arrow::arrow_with_parquet()
#' temp_dir = tempfile()
#' # Write a hive-style partitioned parquet dataset
@@ -98,6 +121,7 @@ pl_read_parquet = function(
hive_partitioning = TRUE,
rechunk = TRUE,
low_memory = FALSE,
storage_options = NULL,
use_statistics = TRUE,
cache = TRUE) {
.args = as.list(environment())
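Following the "Implementation details" note in the roxygen block above, scanning parquet files from GCP requires a valid service account even for public data. A hedged sketch (not from the commit): the bucket, object path, and service-account file are placeholders, and `service_account` is assumed to be one of the accepted `GoogleConfigKey` spellings from the page linked in the documentation.

```r
library(polars)

# Hypothetical GCS location; the service-account JSON path is a placeholder.
df = pl$read_parquet(
  "gs://my-bucket/dataset/part-0.parquet",
  storage_options = list(
    service_account = "/path/to/service-account.json"
  )
)
df$head(5)
```
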
27 changes: 27 additions & 0 deletions man/IO_read_parquet.Rd


27 changes: 27 additions & 0 deletions man/IO_scan_parquet.Rd


2 changes: 1 addition & 1 deletion src/Makevars.win
@@ -6,7 +6,7 @@ LIBNAME = libr_polars.a
TARGET_DIR = $(CURDIR)/rust/target
LIBDIR = $(TARGET_DIR)/$(TARGET)/$(LIBR_POLARS_PROFILE)
STATLIB = $(LIBDIR)/$(LIBNAME)
-PKG_LIBS = -L$(LIBDIR) -lr_polars -lws2_32 -ladvapi32 -luserenv -lbcrypt -lole32 -lntdll -lpsapi -liphlpapi -lpdh -lpowrprof -loleaut32 -lnetapi32 -lsecur32 -lsynchronization -t
+PKG_LIBS = -L$(LIBDIR) -lr_polars -lws2_32 -lncrypt -lcrypt32 -ladvapi32 -luserenv -lbcrypt -lole32 -lntdll -lpsapi -liphlpapi -lpdh -lpowrprof -loleaut32 -lnetapi32 -lsecur32 -lsynchronization -t

# Rtools42 doesn't have the linker in the location that cargo expects, so we
# need to overwrite it via configuration.
