Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve digest_to_checksum for SQL Server #97

Merged
merged 16 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ S3method(as.character,Id)
S3method(db_timestamp,"NULL")
S3method(db_timestamp,SQLiteConnection)
S3method(db_timestamp,default)
S3method(digest_to_checksum,data.frame)
S3method(digest_to_checksum,default)
S3method(digest_to_checksum,tbl_PqConnection)
S3method(dplyr::anti_join,tbl_sql)
S3method(dplyr::full_join,tbl_sql)
S3method(dplyr::inner_join,tbl_sql)
Expand Down Expand Up @@ -55,7 +58,6 @@ import(RSQLite)
import(dbplyr)
import(lubridate)
import(tidyverse)
import(utils)
importFrom(R6,R6Class)
importFrom(methods,setGeneric)
importFrom(rlang,":=")
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
- A `character` input for `port` is allowed if it is a string of digits.
- Now checks if `timezone` and `timezone_out` is an IANA time zone.

* `digest_to_checksum()` has improved performance on Microsoft SQL Server by use of the built-in `HashBytes` function (#97).

* `get_connection()` now checks the value of any `timezone` and `timezone_out` arguments (#83).

* `table_exists()` now correctly gives ambiguity warning on Microsoft SQL Server and PostgreSQL backends (#80).
Expand Down
5 changes: 0 additions & 5 deletions R/SCDB-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,3 @@

#' @import RSQLite
NULL

#' @import utils
utils::globalVariables(".")
utils::globalVariables("NULLIF")
utils::globalVariables("CONCAT_WS")
116 changes: 75 additions & 41 deletions R/digest_to_checksum.R
Original file line number Diff line number Diff line change
@@ -1,89 +1,123 @@
#' Computes an MD5 checksum from columns
#' Computes an checksum from columns
#'
#' @details
#' In most cases, the md5 algorithm is used to compute the checksums.
#' For Microsoft SQL Server, the SHA-256 algorithm is used.
#'
#' @name digest_to_checksum
#'
#' @template .data
#' @param col Name of the column to put the checksums in
#' @param warn Flag to warn if target column already exists in data
LasseEngboChr marked this conversation as resolved.
Show resolved Hide resolved
#' @param exclude Columns to exclude from the checksum generation
#' @examples
#' digest_to_checksum(mtcars)
#'
#' @return .data with an checksum column added
#' @export
digest_to_checksum <- function(.data, col = "checksum", exclude = NULL, warn = TRUE) {
digest_to_checksum <- function(.data, col = "checksum", exclude = NULL) {

# Check arguments
assert_data_like(.data)
checkmate::assert_character(col)
checkmate::assert_logical(warn)
checkmate::assert_character(exclude, null.ok = TRUE)

if (as.character(dplyr::ensym(col)) %in% colnames(.data) && warn) {
warning("Column ",
as.character(dplyr::ensym(col)),
" already exists in data and will be overwritten!")
if (as.character(dplyr::ensym(col)) %in% colnames(.data)) {
warning(glue::glue("Column {as.character(dplyr::ensym(col))} already exists in data and will be overwritten!"))
}

colnames <- .data |>
dplyr::select(!tidyselect::any_of(c(col, exclude))) |>
colnames()
UseMethod("digest_to_checksum", .data)
}

.data <- .data |>
dplyr::select(!tidyselect::any_of(col)) |>
dplyr::mutate(dplyr::across(
tidyselect::all_of(colnames),
~ dplyr::coalesce(as.character(.), ""),
.names = "{.col}.__chr"
))

return(digest_to_checksum_internal(.data, col))
}
# Resolve visible binding warning for SQL commands
utils::globalVariables(c("CONVERT", "VARCHAR"))

#' @template .data
#' @param col The name of column the checksums will be placed in
#' @inherit digest_to_checksum return
#' @noRd
digest_to_checksum_internal <- function(.data, col) {
UseMethod("digest_to_checksum_internal")
`digest_to_checksum.tbl_Microsoft SQL Server` <- function(
.data,
col = formals(digest_to_checksum)$col,
exclude = formals(digest_to_checksum)$exclude) {

conn <- dbplyr::remote_con(.data)

hash_cols <- dbplyr::ident(setdiff(colnames(.data), c(col, exclude)))

.data <- .data |>
dplyr::mutate(
{{ col }} := !!dbplyr::sql_call2(
"HashBytes",
"SHA2_256",
dbplyr::build_sql("(SELECT ", hash_cols, " FOR XML RAW)", con = conn),
con = conn
)
)

return(.data)
}

#' @noRd
digest_to_checksum_internal.default <- function(.data, col) {
#' @export
digest_to_checksum.default <- function(
.data,
col = formals(digest_to_checksum)$col,
exclude = formals(digest_to_checksum)$exclude) {

# Compute checksums locally then join back onto original data
hash_cols <- setdiff(colnames(.data), c(col, exclude))

# The md5 algorithm needs character inputs, so we convert the hash columns to character and concatenate
checksums <- .data |>
dplyr::mutate(dplyr::across(
tidyselect::all_of(hash_cols),
~ dplyr::coalesce(as.character(.), ""),
.names = "{.col}.__chr"
))

# Compute checksums locally then join back onto original data
checksums <- checksums |>
dplyr::collect() |>
tidyr::unite(col, tidyselect::ends_with(".__chr")) |>
dplyr::transmute(id__ = dplyr::row_number(),
checksum = openssl::md5({{ col }}))
tidyr::unite(!!col, tidyselect::ends_with(".__chr"), remove = FALSE) |>
dplyr::transmute(
id__ = dplyr::row_number(),
dplyr::across(tidyselect::all_of(col), openssl::md5)
)

.data <- .data |>
dplyr::mutate(id__ = dplyr::row_number()) |>
dplyr::left_join(checksums, by = "id__", copy = TRUE) |>
dplyr::select(!c(tidyselect::ends_with(".__chr"), "id__"))
dplyr::select(!"id__")

return(.data)
}

#'
# It seems we need to do more hacking since
# @importFrom openssl md5 does not work in the below usecase.
# defining md5 here succesfully causes local objects to use the openssl md5 function
# @importFrom openssl md5 does not work in the below use case.
# defining md5 here successfully causes local objects to use the openssl md5 function
# and remote objects to use their own md5 functions.
md5 <- openssl::md5

# Some backends have native md5 support, these use this function
# Some backends have native md5 support, these use this function.
#' @noRd
digest_to_checksum_native_md5 <- function(.data, col) {
digest_to_checksum_native_md5 <- function(
.data,
col = formals(digest_to_checksum)$col,
exclude = formals(digest_to_checksum)$exclude) {

hash_cols <- setdiff(colnames(.data), c(col, exclude))

# The md5 algorithm needs character inputs, so we convert the hash columns to character and concatenate
.data <- .data |>
dplyr::mutate(dplyr::across(
tidyselect::all_of(hash_cols),
~ dplyr::coalesce(as.character(.), ""),
.names = "{.col}.__chr"
)) |>
tidyr::unite(!!col, tidyselect::ends_with(".__chr"), remove = TRUE) |>
dplyr::mutate(dplyr::across(tidyselect::all_of(col), md5))

return(.data)
}

digest_to_checksum_internal.tbl_PqConnection <- digest_to_checksum_native_md5

digest_to_checksum_internal.data.frame <- digest_to_checksum_native_md5
#' @export
digest_to_checksum.tbl_PqConnection <- digest_to_checksum_native_md5

digest_to_checksum_internal.tibble <- digest_to_checksum_native_md5
#' @export
digest_to_checksum.data.frame <- digest_to_checksum_native_md5
2 changes: 1 addition & 1 deletion R/getTableSignature.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
until_ts = "TEXT"
),
"Microsoft SQL Server" = c(
checksum = "varchar(32)",
checksum = "varchar(40)",

Check warning on line 22 in R/getTableSignature.R

View check run for this annotation

Codecov / codecov/patch

R/getTableSignature.R#L22

Added line #L22 was not covered by tests
from_ts = "DATETIME2",
until_ts = "DATETIME2"
)
Expand Down
3 changes: 3 additions & 0 deletions R/unite.tbl_dbi.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Resolve visible binding warning for SQL commands
utils::globalVariables(c("NULLIF", "CONCAT_WS"))

#' tidyr::unite for tbl_dbi
#'
#' @inheritParams tidyr::unite
Expand Down
11 changes: 10 additions & 1 deletion R/update_snapshot.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,21 @@
dplyr::select(
colnames(dplyr::select(db_table, !tidyselect::any_of(c("checksum", "from_ts", "until_ts"))))
) |>
digest_to_checksum(col = "checksum", warn = TRUE) |>
digest_to_checksum(col = "checksum") |>
filter_keys(filters) |>
dplyr::compute()

if (!identical(dbplyr::remote_con(.data), conn)) {
if (table_exists(conn, "update_snapshot_patch")) DBI::dbRemoveTable(conn, "update_snapshot_patch")
.data <- dplyr::copy_to(conn, .data, name = "SCDB_update_snapshot_patch", temporary = TRUE)

Check warning on line 131 in R/update_snapshot.R

View check run for this annotation

Codecov / codecov/patch

R/update_snapshot.R#L130-L131

Added lines #L130 - L131 were not covered by tests
}

# Apply filter to current records
if (!is.null(filters) && !identical(dbplyr::remote_con(filters), conn)) {
if (table_exists(conn, "update_snapshot_patch_filters")) DBI::dbRemoveTable(conn, "update_snapshot_patch_filters")

Check warning on line 136 in R/update_snapshot.R

View check run for this annotation

Codecov / codecov/patch

R/update_snapshot.R#L136

Added line #L136 was not covered by tests

filters <- dplyr::copy_to(conn, filters, name = "SCDB_update_snapshot_filters", temporary = TRUE)

Check warning on line 138 in R/update_snapshot.R

View check run for this annotation

Codecov / codecov/patch

R/update_snapshot.R#L138

Added line #L138 was not covered by tests
}
db_table <- filter_keys(db_table, filters)

# Determine the next timestamp in the data (can be NA if none is found)
Expand Down
1 change: 1 addition & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Rasmus
realpath
RPostgres

SHA
Skytte
sql
SQLiteConnections
Expand Down
12 changes: 7 additions & 5 deletions man/digest_to_checksum.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test-digest_to_checksum.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ test_that("digest_to_checksum() warns works correctly when overwriting", {

expect_warning(checksum_vector2 <- mtcars |>
digest_to_checksum(col = "checksum") |>
digest_to_checksum(col = "checksum", warn = TRUE) |>
digest_to_checksum(col = "checksum") |>
dplyr::pull(checksum))

expect_identical(checksum_vector, checksum_vector2)
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-update_snapshot.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ test_that("update_snapshot() works", {

target <- mtcars |>
dplyr::copy_to(conn, df = _, name = "temp", overwrite = TRUE) |>
digest_to_checksum(col = "checksum", warn = FALSE) |>
digest_to_checksum(col = "checksum") |>
dplyr::mutate(from_ts = !!db_timestamp("2022-10-01 09:00:00", conn),
until_ts = !!db_timestamp(NA, conn))

Expand Down
Loading