From d4ac807676046282560a1850c68c7f219a9eb9ea Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 11 Jun 2024 12:08:10 -0500 Subject: [PATCH 1/2] Support explicit timestamps in fromDataFrame convenience helper --- R/DataFrame.R | 16 +++++++++++++--- inst/tinytest/test_dataframe.R | 24 ++++++++++++++++++++++++ inst/tinytest/test_timetravel.R | 2 +- man/fromDataFrame.Rd | 8 +++++++- 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/R/DataFrame.R b/R/DataFrame.R index de184fbaea..cef973d4b4 100644 --- a/R/DataFrame.R +++ b/R/DataFrame.R @@ -68,6 +68,10 @@ ##' @param offsets_filters A character vector with filters for coordinates, default is \code{ZSTD}. ##' @param validity_filters A character vector with filters for coordinates, default is \code{RLE}. ##' @param debug Logical flag to select additional output. +##' @param timestamps Vector with up to two \code{POSIXct} variables denoting open intervals; default +##' is length zero where start and end are set (implicitly) to current time; in case of one value it +##' is used as the interval end, and in case of two values they are taken as start and end. This +##' applies to write and append modes only and not to schema creation. ##' @return Null, invisibly. ##' @examples ##' \dontshow{ctx <- tiledb_ctx(limitTileDBCores())} @@ -83,10 +87,13 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa capacity = 10000L, tile_domain = NULL, tile_extent = NULL, mode = c("ingest", "schema_only", "append"), filter_list = NULL, coords_filters = "ZSTD", offsets_filters = "ZSTD", - validity_filters = "RLE", debug = FALSE) { + validity_filters = "RLE", debug = FALSE, + timestamps = as.POSIXct(double(), origin="1970-01-01")) { stopifnot("Argument 'obj' should be a 'data.frame' (or a related object)" = inherits(obj, "data.frame"), - "Argument 'uri' should be a character variable" = is.character(uri)) + "Argument 'uri' should be a character variable" = is.character(uri), + "Argument 'timestamps' must be a POSIXct vector" = inherits(timestamps, "POSIXct"), + "Argument 'timestamps' must be 0, 1 or 2 values" = length(timestamps) %in% c(0L, 1L, 2L)) if (!is.null(col_index) && is.character(col_index)) col_index <- match(col_index, colnames(obj)) dims <- dim(obj) mode <- match.arg(mode) @@ -276,7 +283,10 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa tiledb_array_create(uri, schema) if (mode != "schema_only") { - df <- tiledb_array(uri, query_type = "WRITE") + df <- switch(length(timestamps) + 1, # switch takes ints starting at one + tiledb_array(uri, query_type = "WRITE"), + tiledb_array(uri, query_type = "WRITE", timestamp_end=timestamps[1]), + tiledb_array(uri, query_type = "WRITE", timestamp_start=timestamps[1], timestamp_end=timestamps[2])) ## when setting an index when likely want 'sparse write to dense array if (!is.null(col_index) && !sparse) query_layout(df) <- "UNORDERED" diff --git a/inst/tinytest/test_dataframe.R b/inst/tinytest/test_dataframe.R index 846af739cd..fc3a296700 100644 --- a/inst/tinytest/test_dataframe.R +++ b/inst/tinytest/test_dataframe.R @@ -340,3 +340,27 @@ fromDataFrame(D, uri, col_index=1) arr <- tiledb_array(uri, return_as="data.frame") res <- arr[] expect_equivalent(res, D) + + +## fromDataFrame with timestamps +D <- data.frame(key=(1:10)*10, value=letters[1:10]) +uri <- tempfile() +now <- Sys.time() +fromDataFrame(D, uri) # no timestamps +expect_equal(nrow(tiledb_array(uri, return_as="data.frame")[]), 10) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_end=as.POSIXct(100, origin="1970-01-01"))[]), 0) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_start=now + 1)[]), 0) +unlink(uri, recursive=TRUE) + +fromDataFrame(D, uri, timestamps=as.POSIXct(100, origin="1970-01-01")) # end timestamps +expect_equal(nrow(tiledb_array(uri, return_as="data.frame")[]), 10) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_end=as.POSIXct(50, origin="1970-01-01"))[]), 0) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_start=as.POSIXct(50, origin="1970-01-01"))[]), 10) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_start=as.POSIXct(150, origin="1970-01-01"))[]), 0) +unlink(uri, recursive=TRUE) + +fromDataFrame(D, uri, timestamps=c(as.POSIXct(100, origin="1970-01-01"), as.POSIXct(100, origin="1970-01-01"))) # start and end +expect_equal(nrow(tiledb_array(uri, return_as="data.frame")[]), 10) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_end=as.POSIXct(50, origin="1970-01-01"))[]), 0) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_start=as.POSIXct(50, origin="1970-01-01"))[]), 10) +expect_equal(nrow(tiledb_array(uri, return_as="data.frame", timestamp_start=as.POSIXct(150, origin="1970-01-01"))[]), 0) diff --git a/inst/tinytest/test_timetravel.R b/inst/tinytest/test_timetravel.R index d4cf893ab8..ec10f8f074 100644 --- a/inst/tinytest/test_timetravel.R +++ b/inst/tinytest/test_timetravel.R @@ -172,7 +172,7 @@ invisible( tiledb_array_create(tmp, schema) ) I <- c(1, 2, 2) J <- c(1, 4, 3) data <- c(1L, 2L, 3L) -now1 <- as.POSIXct(60, tz="UTC") # the epoch plus one minute +now1 <- as.POSIXct(60, tz="UTC", origin="1970-01-01") # the epoch plus one minute A <- tiledb_array(uri = tmp, timestamp_start=now1, timestamp_end=now1) A[I, J] <- data diff --git a/man/fromDataFrame.Rd b/man/fromDataFrame.Rd index 80773ce186..50e75b4c94 100644 --- a/man/fromDataFrame.Rd +++ b/man/fromDataFrame.Rd @@ -21,7 +21,8 @@ fromDataFrame( coords_filters = "ZSTD", offsets_filters = "ZSTD", validity_filters = "RLE", - debug = FALSE + debug = FALSE, + timestamps = as.POSIXct(double(), origin = "1970-01-01") ) } \arguments{ @@ -74,6 +75,11 @@ or attributes. The \code{filter} argument still applies for all unnamed argument \item{validity_filters}{A character vector with filters for coordinates, default is \code{RLE}.} \item{debug}{Logical flag to select additional output.} + +\item{timestamps}{Vector with up to two \code{POSIXct} variables denoting open intervals; default +is length zero where start and end are set (implicitly) to current time; in case of one value it +is used as the interval end, and in case of two values they are taken as start and end. This +applies to write and append modes only and not to schema creation.} } \value{ Null, invisibly. From 26605f09516784f6a4dd162bc44f19eddc93e91b Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Tue, 11 Jun 2024 12:36:05 -0500 Subject: [PATCH 2/2] Condition new tests on 2.15.0 or later --- inst/tinytest/test_dataframe.R | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tinytest/test_dataframe.R b/inst/tinytest/test_dataframe.R index fc3a296700..e94c3d50c5 100644 --- a/inst/tinytest/test_dataframe.R +++ b/inst/tinytest/test_dataframe.R @@ -343,6 +343,7 @@ expect_equivalent(res, D) ## fromDataFrame with timestamps +if (tiledb_version(TRUE) < "2.15.0") exit_file("Remaining tests require TileDB 2.15.0 or later") D <- data.frame(key=(1:10)*10, value=letters[1:10]) uri <- tempfile() now <- Sys.time()