diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 73c6a3b6d..eea49782d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -103,7 +103,7 @@ jobs: run: task build-website - name: upload docs - if: ${{ github.event_name == 'pull_request' }} + if: always() uses: actions/upload-artifact@v4 with: name: docs diff --git a/NEWS.md b/NEWS.md index 8d396a350..4e1c4ef47 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ ### New features - New method `$register_globals()` (#1064). +- New experimental method `$sql()` for DataFrame and LazyFrame (#1065). ## Polars R Package 0.16.2 diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 28e2a2c8a..0743cf430 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -2435,3 +2435,54 @@ DataFrame_clear = function(n = 0) { out } + + +# TODO: we can't use % in the SQL query +# +#' Execute a SQL query against the DataFrame +#' +#' @inherit LazyFrame_sql description details params seealso +#' @inherit pl_DataFrame return +#' @examplesIf polars_info()$features$sql +#' df1 = pl$DataFrame( +#' a = 1:3, +#' b = c("zz", "yy", "xx"), +#' c = as.Date(c("1999-12-31", "2010-10-10", "2077-08-08")) +#' ) +#' +#' # Query the DataFrame using SQL: +#' df1$sql("SELECT c, b FROM self WHERE a > 1") +#' +#' # Join two DataFrames using SQL. +#' df2 = pl$DataFrame(a = 3:1, d = c(125, -654, 888)) +#' df1$sql( +#' " +#' SELECT self.*, d +#' FROM self +#' INNER JOIN df2 USING (a) +#' WHERE a > 1 AND EXTRACT(year FROM c) < 2050 +#' " +#' ) +#' +#' # Apply transformations to a DataFrame using SQL, aliasing "self" to "frame". +#' df1$sql( +#' query = r"( +#' SELECT +#' a, +#' MOD(a, 2) == 0 AS a_is_even, +#' CONCAT_WS(':', b, b) AS b_b, +#' EXTRACT(year FROM c) AS year, +#' 0::float AS 'zero' +#' FROM frame +#' )", +#' table_name = "frame" +#' ) +DataFrame_sql = function(query, ..., table_name = NULL, envir = parent.frame()) { + self$lazy()$sql( + query, + table_name = table_name, + envir = envir + )$collect() |> + result() |> + unwrap("in $sql():") +} diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index dc3324862..7c00410fd 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -169,8 +169,7 @@ LazyFrame_width = method_as_active_binding(\() length(self$schema)) #' #' @param ... Anything that is accepted by `pl$DataFrame()` #' -#' @return LazyFrame -#' @keywords LazyFrame_new +#' @return [LazyFrame][LazyFrame_class] #' #' @examples #' pl$LazyFrame( @@ -2078,3 +2077,69 @@ LazyFrame_to_dot = function( LazyFrame_clear = function(n = 0) { pl$DataFrame(schema = self$schema)$clear(n)$lazy() } + + +# TODO: we can't use % in the SQL query +# +#' Execute a SQL query against the LazyFrame +#' +#' The calling frame is automatically registered as a table in the SQL context +#' under the name `"self"`. All [DataFrames][DataFrame_class] and +#' [LazyFrames][LazyFrame_class] found in the `envir` are also registered, +#' using their variable name. +#' More control over registration and execution behaviour is available by +#' the [SQLContext][SQLContext_class] object. +#' +#' This functionality is considered **unstable**, although it is close to +#' being considered stable. It may be changed at any point without it being +#' considered a breaking change. +#' @inherit pl_LazyFrame return +#' @inheritParams SQLContext_execute +#' @inheritParams SQLContext_register_globals +#' @param table_name `NULL` (default) or a character of an explicit name for the table +#' that represents the calling frame (the alias `"self"` will always be registered/available). +#' @seealso +#' - [SQLContext][SQLContext_class] +#' @examplesIf polars_info()$features$sql +#' lf1 = pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x")) +#' lf2 = pl$LazyFrame(a = 3:1, d = c(125, -654, 888)) +#' +#' # Query the LazyFrame using SQL: +#' lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect() +#' +#' # Join two LazyFrames: +#' lf1$sql( +#' " +#' SELECT self.*, d +#' FROM self +#' INNER JOIN lf2 USING (a) +#' WHERE a > 1 AND b < 8 +#' " +#' )$collect() +#' +#' # Apply SQL transforms (aliasing "self" to "frame") and subsequently +#' # filter natively (you can freely mix SQL and native operations): +#' lf1$sql( +#' query = r"( +#' SELECT +#' a, +#' MOD(a, 2) == 0 AS a_is_even, +#' (b::float / 2) AS 'b/2', +#' CONCAT_WS(':', c, c, c) AS c_c_c +#' FROM frame +#' ORDER BY a +#' )", +#' table_name = "frame" +#' )$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect() +LazyFrame_sql = function(query, ..., table_name = NULL, envir = parent.frame()) { + result({ + ctx = pl$SQLContext()$register_globals(envir = envir)$register("self", self) + + if (!is.null(table_name)) { + ctx$register(table_name, self) + } + + ctx$execute(query) + }) |> + unwrap("in $sql():") +} diff --git a/R/sql.R b/R/sql.R index a0b640d66..83639a326 100644 --- a/R/sql.R +++ b/R/sql.R @@ -60,7 +60,7 @@ pl_SQLContext = function(...) { #' Execute SQL query against the registered data #' #' Parse the given SQL query and execute it against the registered frame data. -#' @param query A valid string SQL query. +#' @param query A character of the SQL query to execute. #' @return A [LazyFrame][LazyFrame_class] #' @examplesIf polars_info()$features$sql #' query = "SELECT * FROM mtcars WHERE cyl = 4" @@ -174,7 +174,8 @@ SQLContext_tables = function() { #' Automatically maps variable names to table names. #' @inherit SQLContext_register details return #' @param ... Ignored. -#' @param envir The environment to search for polars DataFrames/LazyFrames. +#' @param envir The environment to search for polars +#' [DataFrames][DataFrame_class]/[LazyFrames][LazyFrame_class]. #' @seealso #' - [`$register()`][SQLContext_register] #' - [`$register_many()`][SQLContext_register_many] diff --git a/man/DataFrame_sql.Rd b/man/DataFrame_sql.Rd new file mode 100644 index 000000000..8c109b627 --- /dev/null +++ b/man/DataFrame_sql.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_sql} +\alias{DataFrame_sql} +\title{Execute a SQL query against the DataFrame} +\usage{ +DataFrame_sql(query, ..., table_name = NULL, envir = parent.frame()) +} +\arguments{ +\item{query}{A character of the SQL query to execute.} + +\item{...}{Ignored.} + +\item{table_name}{\code{NULL} (default) or a character of an explicit name for the table +that represents the calling frame (the alias \code{"self"} will always be registered/available).} + +\item{envir}{The environment to search for polars +\link[=DataFrame_class]{DataFrames}/\link[=LazyFrame_class]{LazyFrames}.} +} +\value{ +\link[=DataFrame_class]{DataFrame} +} +\description{ +The calling frame is automatically registered as a table in the SQL context +under the name \code{"self"}. All \link[=DataFrame_class]{DataFrames} and +\link[=LazyFrame_class]{LazyFrames} found in the \code{envir} are also registered, +using their variable name. +More control over registration and execution behaviour is available by +the \link[=SQLContext_class]{SQLContext} object. +} +\details{ +This functionality is considered \strong{unstable}, although it is close to +being considered stable. It may be changed at any point without it being +considered a breaking change. +} +\examples{ +\dontshow{if (polars_info()$features$sql) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +df1 = pl$DataFrame( + a = 1:3, + b = c("zz", "yy", "xx"), + c = as.Date(c("1999-12-31", "2010-10-10", "2077-08-08")) +) + +# Query the DataFrame using SQL: +df1$sql("SELECT c, b FROM self WHERE a > 1") + +# Join two DataFrames using SQL. +df2 = pl$DataFrame(a = 3:1, d = c(125, -654, 888)) +df1$sql( + " +SELECT self.*, d +FROM self +INNER JOIN df2 USING (a) +WHERE a > 1 AND EXTRACT(year FROM c) < 2050 +" +) + +# Apply transformations to a DataFrame using SQL, aliasing "self" to "frame". +df1$sql( + query = r"( +SELECT +a, +MOD(a, 2) == 0 AS a_is_even, +CONCAT_WS(':', b, b) AS b_b, +EXTRACT(year FROM c) AS year, +0::float AS 'zero' +FROM frame +)", + table_name = "frame" +) +\dontshow{\}) # examplesIf} +} +\seealso{ +\itemize{ +\item \link[=SQLContext_class]{SQLContext} +} +} diff --git a/man/LazyFrame_sql.Rd b/man/LazyFrame_sql.Rd new file mode 100644 index 000000000..b8fbce85f --- /dev/null +++ b/man/LazyFrame_sql.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_sql} +\alias{LazyFrame_sql} +\title{Execute a SQL query against the LazyFrame} +\usage{ +LazyFrame_sql(query, ..., table_name = NULL, envir = parent.frame()) +} +\arguments{ +\item{query}{A character of the SQL query to execute.} + +\item{...}{Ignored.} + +\item{table_name}{\code{NULL} (default) or a character of an explicit name for the table +that represents the calling frame (the alias \code{"self"} will always be registered/available).} + +\item{envir}{The environment to search for polars +\link[=DataFrame_class]{DataFrames}/\link[=LazyFrame_class]{LazyFrames}.} +} +\value{ +\link[=LazyFrame_class]{LazyFrame} +} +\description{ +The calling frame is automatically registered as a table in the SQL context +under the name \code{"self"}. All \link[=DataFrame_class]{DataFrames} and +\link[=LazyFrame_class]{LazyFrames} found in the \code{envir} are also registered, +using their variable name. +More control over registration and execution behaviour is available by +the \link[=SQLContext_class]{SQLContext} object. +} +\details{ +This functionality is considered \strong{unstable}, although it is close to +being considered stable. It may be changed at any point without it being +considered a breaking change. +} +\examples{ +\dontshow{if (polars_info()$features$sql) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +lf1 = pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x")) +lf2 = pl$LazyFrame(a = 3:1, d = c(125, -654, 888)) + +# Query the LazyFrame using SQL: +lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect() + +# Join two LazyFrames: +lf1$sql( + " +SELECT self.*, d +FROM self +INNER JOIN lf2 USING (a) +WHERE a > 1 AND b < 8 +" +)$collect() + +# Apply SQL transforms (aliasing "self" to "frame") and subsequently +# filter natively (you can freely mix SQL and native operations): +lf1$sql( + query = r"( +SELECT + a, +MOD(a, 2) == 0 AS a_is_even, +(b::float / 2) AS 'b/2', +CONCAT_WS(':', c, c, c) AS c_c_c +FROM frame +ORDER BY a +)", + table_name = "frame" +)$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect() +\dontshow{\}) # examplesIf} +} +\seealso{ +\itemize{ +\item \link[=SQLContext_class]{SQLContext} +} +} diff --git a/man/SQLContext_execute.Rd b/man/SQLContext_execute.Rd index ff266a509..735217449 100644 --- a/man/SQLContext_execute.Rd +++ b/man/SQLContext_execute.Rd @@ -7,7 +7,7 @@ SQLContext_execute(query) } \arguments{ -\item{query}{A valid string SQL query.} +\item{query}{A character of the SQL query to execute.} } \value{ A \link[=LazyFrame_class]{LazyFrame} diff --git a/man/SQLContext_register_globals.Rd b/man/SQLContext_register_globals.Rd index 2c520983b..620969d32 100644 --- a/man/SQLContext_register_globals.Rd +++ b/man/SQLContext_register_globals.Rd @@ -9,7 +9,8 @@ SQLContext_register_globals(..., envir = parent.frame()) \arguments{ \item{...}{Ignored.} -\item{envir}{The environment to search for polars DataFrames/LazyFrames.} +\item{envir}{The environment to search for polars +\link[=DataFrame_class]{DataFrames}/\link[=LazyFrame_class]{LazyFrames}.} } \value{ Returns the \link[=SQLContext_class]{SQLContext} object invisibly. diff --git a/man/pl_LazyFrame.Rd b/man/pl_LazyFrame.Rd index fcce955cd..11efc24fc 100644 --- a/man/pl_LazyFrame.Rd +++ b/man/pl_LazyFrame.Rd @@ -10,7 +10,7 @@ pl_LazyFrame(...) \item{...}{Anything that is accepted by \code{pl$DataFrame()}} } \value{ -LazyFrame +\link[=LazyFrame_class]{LazyFrame} } \description{ This is simply a convenience function to create \code{LazyFrame}s in a quick way. @@ -39,4 +39,3 @@ pl$LazyFrame( schema = list(Sepal.Length = pl$Float32, Species = pl$String) )$collect() } -\keyword{LazyFrame_new} diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 331d918aa..90695389c 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -89,12 +89,12 @@ [41] "quantile" "rechunk" "rename" "reverse" [45] "rolling" "sample" "schema" "select" [49] "select_seq" "shape" "shift" "shift_and_fill" - [53] "slice" "sort" "std" "sum" - [57] "tail" "to_data_frame" "to_list" "to_series" - [61] "to_struct" "transpose" "unique" "unnest" - [65] "var" "width" "with_columns" "with_columns_seq" - [69] "with_row_index" "write_csv" "write_ipc" "write_json" - [73] "write_ndjson" "write_parquet" + [53] "slice" "sort" "sql" "std" + [57] "sum" "tail" "to_data_frame" "to_list" + [61] "to_series" "to_struct" "transpose" "unique" + [65] "unnest" "var" "width" "with_columns" + [69] "with_columns_seq" "with_row_index" "write_csv" "write_ipc" + [73] "write_json" "write_ndjson" "write_parquet" --- @@ -164,13 +164,13 @@ [41] "shift_and_fill" "sink_csv" [43] "sink_ipc" "sink_ndjson" [45] "sink_parquet" "slice" - [47] "sort" "std" - [49] "sum" "tail" - [51] "to_dot" "unique" - [53] "unnest" "var" - [55] "width" "with_columns" - [57] "with_columns_seq" "with_context" - [59] "with_row_index" + [47] "sort" "sql" + [49] "std" "sum" + [51] "tail" "to_dot" + [53] "unique" "unnest" + [55] "var" "width" + [57] "with_columns" "with_columns_seq" + [59] "with_context" "with_row_index" --- diff --git a/tests/testthat/test-sql.R b/tests/testthat/test-sql.R index a7d6d0038..491604f5a 100644 --- a/tests/testthat/test-sql.R +++ b/tests/testthat/test-sql.R @@ -75,3 +75,35 @@ test_that("SQLContext_register_globals", { func2(ctx3) expect_equal(ctx3$tables(), c("f1", "f2")) }) + + +test_that("sql method for DataFrame and LazyFrame", { + df1 = pl$DataFrame(x = 1) + lf1 = pl$LazyFrame(x = 2) + + expect_true(df1$equals( + df1$sql("select * from self") + )) + expect_true(df1$equals( + df1$sql("select * from foo", table_name = "foo") + )) + expect_true(df1$equals( + lf1$sql("select * from df1")$collect() + )) + expect_true(df1$equals( + lf1$sql("select x/2 from self")$collect() + )) + + # Test the envir argument works correctly + func1 = function(data) { + df1 = pl$DataFrame(foo = "bar") + data$sql("select * from self join df1 using (x)", envir = parent.frame()) + } + + expect_true(df1$equals( + func1(df1) + )) + expect_true(pl$DataFrame(x = numeric(0))$equals( + func1(lf1)$collect() + )) +})