From c6cb9281ad4c1e5142a7c1cc877b33227e1c1600 Mon Sep 17 00:00:00 2001 From: Rich FitzJohn Date: Fri, 25 Aug 2023 11:10:38 +0100 Subject: [PATCH 1/4] Add support for validating packets --- R/location.R | 26 ++++--- R/outpack_hash.R | 1 + R/outpack_index.R | 6 +- R/validate.R | 124 +++++++++++++++++++++++++++++++++ tests/testthat/test-validate.R | 110 +++++++++++++++++++++++++++++ 5 files changed, 258 insertions(+), 9 deletions(-) create mode 100644 R/validate.R create mode 100644 tests/testthat/test-validate.R diff --git a/R/location.R b/R/location.R index 1a4f2a8c..235b2f18 100644 --- a/R/location.R +++ b/R/location.R @@ -152,7 +152,6 @@ orderly_location_remove <- function(name, root = NULL, locate = TRUE) { name)) } location_check_exists(root, name) - config <- root$config index <- root$index$data() known_here <- index$location$packet[index$location$location == name] @@ -160,13 +159,6 @@ orderly_location_remove <- function(name, root = NULL, locate = TRUE) { only_here <- setdiff(known_here, known_elsewhere) if (length(only_here) > 0) { - if (!location_exists(root, "orphan")) { - config$location <- rbind( - config$location, - new_location_entry(orphan, "orphan", NULL)) - rownames(config$location) <- NULL - } - mark_packets_orphaned(name, only_here, root) } @@ -178,6 +170,7 @@ orderly_location_remove <- function(name, root = NULL, locate = TRUE) { ## rebuild() anywhere; it can probably be relaxed if the refresh was ## more careful, but this is a rare operation. root$index$rebuild() + config <- root$config config$location <- config$location[config$location$name != name, ] config_update(config, root) invisible() @@ -728,6 +721,14 @@ location_exists <- function(root, name) { mark_packets_orphaned <- function(location, packet_id, root) { + if (!location_exists(root, "orphan")) { + config <- root$config + config$location <- rbind( + config$location, + new_location_entry(orphan, "orphan", NULL)) + rownames(config$location) <- NULL + config_update(config, root) + } src <- file.path(root$path, ".outpack", "location", location, packet_id) dest <- file.path(root$path, ".outpack", "location", "orphan", packet_id) fs::dir_create(dirname(dest)) @@ -735,6 +736,15 @@ mark_packets_orphaned <- function(location, packet_id, root) { } +drop_local_packet <- function(packet_id, root) { + location <- root$index$location(NULL) + known_at <- location$location[location$packet == packet_id] + if (!any(known_at != local)) { + mark_packets_orphaned(local, packet_id, root) + } +} + + ## This approach may be suboptimal in the case where the user does not ## already have a file store, as it means that files will be copied ## around and hashed more than ideal: diff --git a/R/outpack_hash.R b/R/outpack_hash.R index 0cc278fc..60bfdc98 100644 --- a/R/outpack_hash.R +++ b/R/outpack_hash.R @@ -32,6 +32,7 @@ hash_validate_file <- function(path, expected, body = NULL, call = NULL) { } + hash_validate_data <- function(data, expected, name = deparse(substitute(x)), body = NULL, call = NULL) { hash_validate(rehash(data, hash_data, expected), expected, name, diff --git a/R/outpack_index.R b/R/outpack_index.R index 8a89d831..d0b3cb86 100644 --- a/R/outpack_index.R +++ b/R/outpack_index.R @@ -36,7 +36,11 @@ outpack_index <- R6::R6Class( location = function(name) { self$refresh() - private$data_$location[private$data_$location$location %in% name, ] + if (is.null(name)) { + private$data_$location + } else { + private$data_$location[private$data_$location$location %in% name, ] + } }, unpacked = function() { diff --git a/R/validate.R b/R/validate.R new file mode 100644 index 00000000..666df873 --- /dev/null +++ b/R/validate.R @@ -0,0 +1,124 @@ +##' Validate unpacked packets. Over time, expect this function to +##' become more fully featured, validating more. +##' +##' The actions that we can take on finding an invalid packet are: +##' +##' * `inform` (the default): just print information about the problem +##' +##' * `orphan`: mark the packet as orphaned within the metadata, but +##' do not touch the files in your archive (by default the directory +##' `archive/`) - this is a safe option and will leave you in a +##' consistent state without deleting anything. +##' +##' * `delete`: in addition to marking the packet as an orphan, also +##' delete the files from your archive. +##' +##' Later, we will add a "repair" option to try and fix broken +##' packets. +##' +##' The validation interacts with the option +##' `core.require_complete_tree`; if this option is `TRUE`, then a +##' packet is only valid if all its (recursive) dependencies are also +##' valid, so the action will apply to packets that have also had +##' their upstream dependencies invalidated. This validation will +##' happen even if the query implied by `...` does not include these +##' packets if a complete tree is required. +##' +##' The validation will also interact with `core.use_file_store` once +##' repair is supported, as this becomes trivial. +##' +##' @title Validate unpacked packets. +##' +##' @param ... Either arguments that a search can be constructed from +##' (useful options here include `name = "something"`), a character +##' vector of ids, or leave empty to validate everything. +##' +##' @param action The action to take on finding an invalid packet. See +##' Details. +##' +##' @inheritParams orderly_metadata +##' +##' @return Invisibly, a character vector of repaired (or invalid) +##' packets. +##' +##' @export +orderly_validate_archive <- function(..., action = "inform", + root = NULL, locate = TRUE) { + root <- root_open(root, locate = TRUE, require_orderly = FALSE) + action <- match_value(action, c("inform", "orphan", "delete", "repair")) + + if (is.null(root$config$core$path_archive)) { + cli::cli_abort("You have no archive to validate") + } + + if (dots_is_literal_id(...)) { + ids <- ..1 + } else { + options <- orderly_search_options(location = local) + ids <- orderly_search(..., options = options, root = root) + } + + cache <- new.env(parent = emptyenv()) + for (id in sort(ids)) { + cache[[id]] <- orderly_validate_archive_packet(id, action, cache, root) + } + res <- as.list(cache) + valid <- vlapply(res, "[[", "valid") + invisible(sort(names(valid)[!valid])) +} + + +orderly_validate_archive_packet <- function(id, action, cache, root) { + if (!is.null(cache[[id]])) { + return(cache[[id]]) + } + + res <- orderly_validate_archive_packet_check(id, action, cache, root) + cache[[id]] <- res + + if (res$valid) { + cli::cli_alert_success("{res$id} ({res$name}) is valid") + } else { + reason <- c(if (!all(res$files$valid)) "files", + if (!all(res$depends$valid)) "upstream packets") + cli::cli_alert_danger( + "{res$id} ({res$name}) is invalid due to its {reason}") + if (action == "orphan") { + drop_local_packet(id, root) + } else if (action == "delete") { + drop_local_packet(id, root) + unlink(file.path(root$path, root$config$core$path_archive, res$name, id), + recursive = TRUE) + } + } + + res +} + + +orderly_validate_archive_packet_check <- function(id, action, cache, root) { + meta <- outpack_metadata_core(id, root) + name <- meta$name + path <- file.path(root$path, root$config$core$path_archive, name, id) + + depends <- meta$depends + depends$valid <- rep(NA, nrow(depends)) + if (root$config$core$require_complete_tree) { + for (i in seq_len(nrow(depends))) { + depends$valid[[i]] <- orderly_validate_archive_packet( + depends$packet[[i]], action, cache, root)$valid + } + } + + hash_compare <- function(path, hash) { + if (!file.exists(path)) NA_character_ else rehash(path, hash_file, hash) + } + files <- meta$files + files$hash_found <- list_to_character( + Map(hash_compare, file.path(path, files$path), files$hash)) + files$valid <- !is.na(files$hash_found) & files$hash_found == files$hash + + valid <- all(files$valid) && all(depends$valid) + + list(valid = valid, id = id, name = name, files = files, depends = depends) +} diff --git a/tests/testthat/test-validate.R b/tests/testthat/test-validate.R new file mode 100644 index 00000000..8f1ad948 --- /dev/null +++ b/tests/testthat/test-validate.R @@ -0,0 +1,110 @@ +test_that("Can validate a packet that is valid", { + root <- create_temporary_root() + id <- create_random_packet(root = root) + res <- evaluate_promise(orderly_validate_archive(id, root = root)) + expect_match(res$messages, sprintf("%s (data) is valid", id), fixed = TRUE) + expect_equal(res$result, character()) +}) + + +test_that("Can validate a packet that is invalid", { + root <- create_temporary_root() + id <- create_random_packet(root = root) + path <- file.path(root$path, "archive", "data", id, "data.rds") + file.create(path) # truncate file + res <- evaluate_promise(orderly_validate_archive(id, root = root)) + expect_match(res$messages, sprintf("%s (data) is invalid", id), fixed = TRUE) + expect_equal(res$result, id) + expect_true(file.exists(path)) # not deleted + expect_equal(root$index$rebuild()$unpacked(), id) # still there +}) + + +test_that("Can orphan an invalid packet", { + root <- create_temporary_root() + ids <- replicate(3, create_random_packet(root = root)) + id <- ids[[2]] + path <- file.path(root$path, "archive", "data", id, "data.rds") + file.create(path) # truncate file + res <- evaluate_promise( + orderly_validate_archive(id, action = "orphan", root = root)) + expect_match(res$messages, sprintf("%s (data) is invalid", id), fixed = TRUE) + expect_equal(res$result, id) + expect_true(file.exists(path)) # not deleted + expect_equal(root$index$rebuild()$unpacked(), ids[-2]) + expect_equal(root$index$location(orphan)$packet, id) +}) + + +test_that("Can delete an invalid packet", { + root <- create_temporary_root() + ids <- replicate(3, create_random_packet(root = root)) + id <- ids[[2]] + path <- file.path(root$path, "archive", "data", id, "data.rds") + file.create(path) # truncate file + res <- evaluate_promise( + orderly_validate_archive(action = "delete", root = root)) + expect_match(res$messages, sprintf("%s (data) is invalid", id), fixed = TRUE, + all = FALSE) + expect_equal(res$result, id) + expect_false(file.exists(path)) + expect_equal(root$index$rebuild()$unpacked(), ids[-2]) + expect_equal(root$index$location(orphan)$packet, id) +}) + + +test_that("nothing to validate if no archive", { + root <- create_temporary_root(use_file_store = TRUE, path_archive = NULL) + ids <- replicate(3, create_random_packet(root = root)) + expect_error(orderly_validate_archive(root = root), + "You have no archive to validate") +}) + + +test_that("recursively validate, errors in upstream are a problem", { + root <- create_temporary_root(require_complete_tree = TRUE) + ids <- create_random_packet_chain(5, root = root) + + file.create(file.path(root$path, "archive", "c", ids[["c"]], "data.rds")) + + res <- evaluate_promise( + orderly_validate_archive(ids[["d"]], root = root)) + expect_equal(res$result, unname(ids[3:4])) + expect_length(res$messages, 4) + expect_match(res$messages[[1]], sprintf("%s (a) is valid", ids[["a"]]), + fixed = TRUE) + expect_match(res$messages[[2]], sprintf("%s (b) is valid", ids[["b"]]), + fixed = TRUE) + expect_match(res$messages[[3]], + sprintf("%s (c) is invalid due to its files", ids[["c"]]), + fixed = TRUE) + expect_match(res$messages[[4]], + sprintf("%s (d) is invalid due to its upstream packets", + ids[["d"]]), + fixed = TRUE) + + res2 <- evaluate_promise( + orderly_validate_archive(ids[["d"]], action = "orphan", root = root)) + expect_equal(res, res2) + + expect_equal(root$index$rebuild()$unpacked(), unname(ids[-(3:4)])) + expect_equal(root$index$location(orphan)$packet, unname(ids[3:4])) +}) + + +test_that("invalidate all children of corrupt parent", { + root <- create_temporary_root(require_complete_tree = TRUE) + id1 <- create_random_packet(root) + id2 <- replicate(3, create_random_dependent_packet(root, "child", id1)) + id3 <- create_random_dependent_packet(root, "grandchild", id2) + res <- evaluate_promise(orderly_validate_archive(root = root)) + expect_equal(res$result, character()) + expect_length(res$messages, 5) + re <- "^.+ ([0-9]{8}-[0-9]{6}-[[:xdigit:]]{8}) \\([a-z]+\\) is valid\\n$" + expect_match(res$messages, re, all = TRUE) + expect_equal(sub(re, "\\1", res$messages), c(id1, id2, id3)) + ## Same if we start from the end + expect_equal( + evaluate_promise(orderly_validate_archive(id3, root = root)), + res) +}) From f5ea8a85a64a687893ffc895aa15f6932b1ee885 Mon Sep 17 00:00:00 2001 From: Rich FitzJohn Date: Fri, 25 Aug 2023 12:15:27 +0100 Subject: [PATCH 2/4] Add docs --- NAMESPACE | 1 + _pkgdown.yml | 1 + man/orderly_validate_archive.Rd | 62 +++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 man/orderly_validate_archive.Rd diff --git a/NAMESPACE b/NAMESPACE index 7c24b9fe..7069d421 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,4 +39,5 @@ export(orderly_search) export(orderly_search_options) export(orderly_shared_resource) export(orderly_strict_mode) +export(orderly_validate_archive) importFrom(R6,R6Class) diff --git a/_pkgdown.yml b/_pkgdown.yml index 9023f44e..7e9056f4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -80,3 +80,4 @@ reference: contents: - orderly_copy_files - orderly_example + - orderly_validate_archive diff --git a/man/orderly_validate_archive.Rd b/man/orderly_validate_archive.Rd new file mode 100644 index 00000000..335c3e32 --- /dev/null +++ b/man/orderly_validate_archive.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{orderly_validate_archive} +\alias{orderly_validate_archive} +\title{Validate unpacked packets.} +\usage{ +orderly_validate_archive(..., action = "inform", root = NULL, locate = TRUE) +} +\arguments{ +\item{...}{Either arguments that a search can be constructed from +(useful options here include \code{name = "something"}), a character +vector of ids, or leave empty to validate everything.} + +\item{action}{The action to take on finding an invalid packet. See +Details.} + +\item{root}{The path to the root directory, or \code{NULL} (the +default) to search for one from the current working directory if +\code{locate} is \code{TRUE}. This function does not require that the +directory is configured for orderly, and can be any \code{outpack} +root (see \link{orderly_init} for details).} + +\item{locate}{Logical, indicating if the root should be searched +for. If \code{TRUE}, then we looks in the directory given for \code{root} +(or the working directory if \code{NULL}) and then up through its +parents until it finds an \code{.outpack} directory or +\code{orderly_config.yml}} +} +\value{ +Invisibly, a character vector of repaired (or invalid) +packets. +} +\description{ +Validate unpacked packets. Over time, expect this function to +become more fully featured, validating more. +} +\details{ +The actions that we can take on finding an invalid packet are: +\itemize{ +\item \code{inform} (the default): just print information about the problem +\item \code{orphan}: mark the packet as orphaned within the metadata, but +do not touch the files in your archive (by default the directory +\verb{archive/}) - this is a safe option and will leave you in a +consistent state without deleting anything. +\item \code{delete}: in addition to marking the packet as an orphan, also +delete the files from your archive. +} + +Later, we will add a "repair" option to try and fix broken +packets. + +The validation interacts with the option +\code{core.require_complete_tree}; if this option is \code{TRUE}, then a +packet is only valid if all its (recursive) dependencies are also +valid, so the action will apply to packets that have also had +their upstream dependencies invalidated. This validation will +happen even if the query implied by \code{...} does not include these +packets if a complete tree is required. + +The validation will also interact with \code{core.use_file_store} once +repair is supported, as this becomes trivial. +} From 8cef9874e1f7ecb772bf975765943639d570bcb1 Mon Sep 17 00:00:00 2001 From: Rich FitzJohn Date: Fri, 25 Aug 2023 14:12:19 +0100 Subject: [PATCH 3/4] Rebuild index after orphaning packets --- R/location.R | 3 --- R/validate.R | 7 +++++-- tests/testthat/test-validate.R | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R/location.R b/R/location.R index 235b2f18..30986e01 100644 --- a/R/location.R +++ b/R/location.R @@ -166,9 +166,6 @@ orderly_location_remove <- function(name, root = NULL, locate = TRUE) { if (fs::dir_exists(location_path)) { fs::dir_delete(location_path) } - ## This forces a rebuild of the index, and is the only call with - ## rebuild() anywhere; it can probably be relaxed if the refresh was - ## more careful, but this is a rare operation. root$index$rebuild() config <- root$config config$location <- config$location[config$location$name != name, ] diff --git a/R/validate.R b/R/validate.R index 666df873..7bbf582a 100644 --- a/R/validate.R +++ b/R/validate.R @@ -63,8 +63,11 @@ orderly_validate_archive <- function(..., action = "inform", cache[[id]] <- orderly_validate_archive_packet(id, action, cache, root) } res <- as.list(cache) - valid <- vlapply(res, "[[", "valid") - invisible(sort(names(valid)[!valid])) + invalid <- sort(names(Filter(function(x) !x$valid, res))) + if (length(invalid) > 0) { + root$index$rebuild() + } + invisible(invalid) } diff --git a/tests/testthat/test-validate.R b/tests/testthat/test-validate.R index 8f1ad948..deeb5227 100644 --- a/tests/testthat/test-validate.R +++ b/tests/testthat/test-validate.R @@ -16,7 +16,7 @@ test_that("Can validate a packet that is invalid", { expect_match(res$messages, sprintf("%s (data) is invalid", id), fixed = TRUE) expect_equal(res$result, id) expect_true(file.exists(path)) # not deleted - expect_equal(root$index$rebuild()$unpacked(), id) # still there + expect_equal(root$index$unpacked(), id) # still there }) @@ -31,7 +31,7 @@ test_that("Can orphan an invalid packet", { expect_match(res$messages, sprintf("%s (data) is invalid", id), fixed = TRUE) expect_equal(res$result, id) expect_true(file.exists(path)) # not deleted - expect_equal(root$index$rebuild()$unpacked(), ids[-2]) + expect_equal(root$index$unpacked(), ids[-2]) expect_equal(root$index$location(orphan)$packet, id) }) @@ -48,7 +48,7 @@ test_that("Can delete an invalid packet", { all = FALSE) expect_equal(res$result, id) expect_false(file.exists(path)) - expect_equal(root$index$rebuild()$unpacked(), ids[-2]) + expect_equal(root$index$unpacked(), ids[-2]) expect_equal(root$index$location(orphan)$packet, id) }) @@ -87,7 +87,7 @@ test_that("recursively validate, errors in upstream are a problem", { orderly_validate_archive(ids[["d"]], action = "orphan", root = root)) expect_equal(res, res2) - expect_equal(root$index$rebuild()$unpacked(), unname(ids[-(3:4)])) + expect_equal(root$index$unpacked(), unname(ids[-(3:4)])) expect_equal(root$index$location(orphan)$packet, unname(ids[3:4])) }) From 9311ea02ea7d26825d8fbea2b8dec7298b87d8e8 Mon Sep 17 00:00:00 2001 From: Rich FitzJohn Date: Fri, 25 Aug 2023 15:21:04 +0100 Subject: [PATCH 4/4] Tidy up logic around upstream packets --- R/validate.R | 2 +- tests/testthat/test-validate.R | 36 +++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/R/validate.R b/R/validate.R index 7bbf582a..41d873cd 100644 --- a/R/validate.R +++ b/R/validate.R @@ -105,7 +105,7 @@ orderly_validate_archive_packet_check <- function(id, action, cache, root) { path <- file.path(root$path, root$config$core$path_archive, name, id) depends <- meta$depends - depends$valid <- rep(NA, nrow(depends)) + depends$valid <- rep(TRUE, nrow(depends)) if (root$config$core$require_complete_tree) { for (i in seq_len(nrow(depends))) { depends$valid[[i]] <- orderly_validate_archive_packet( diff --git a/tests/testthat/test-validate.R b/tests/testthat/test-validate.R index deeb5227..23e366d8 100644 --- a/tests/testthat/test-validate.R +++ b/tests/testthat/test-validate.R @@ -103,7 +103,41 @@ test_that("invalidate all children of corrupt parent", { re <- "^.+ ([0-9]{8}-[0-9]{6}-[[:xdigit:]]{8}) \\([a-z]+\\) is valid\\n$" expect_match(res$messages, re, all = TRUE) expect_equal(sub(re, "\\1", res$messages), c(id1, id2, id3)) - ## Same if we start from the end + ## Same if we start from the end; all 5 in the same order + expect_equal( + evaluate_promise(orderly_validate_archive(id3, root = root)), + res) + + file.create(file.path(root$path, "archive", "child", id2[[1]], "data.rds")) + res <- evaluate_promise(orderly_validate_archive(id3, root = root)) + expect_match(res$messages[c(1, 3, 4)], re, all = TRUE) + expect_match(res$messages[[2]], + sprintf("%s (child) is invalid due to its files", id2[[1]]), + fixed = TRUE) + expect_match( + res$messages[[5]], + sprintf("%s (grandchild) is invalid due to its upstream packets", id3), + fixed = TRUE) +}) + + +test_that("don't invalidate children when complete tree off", { + root <- create_temporary_root(require_complete_tree = FALSE) + id1 <- create_random_packet(root) + id2 <- replicate(3, create_random_dependent_packet(root, "child", id1)) + id3 <- create_random_dependent_packet(root, "grandchild", id2) + res <- evaluate_promise(orderly_validate_archive(root = root)) + expect_equal(res$result, character()) + expect_length(res$messages, 5) + re <- "^.+ ([0-9]{8}-[0-9]{6}-[[:xdigit:]]{8}) \\([a-z]+\\) is valid\\n$" + expect_match(res$messages, re, all = TRUE) + expect_equal(sub(re, "\\1", res$messages), c(id1, id2, id3)) + ## No longer recurses if called from the end + res <- evaluate_promise(orderly_validate_archive(id3, root = root)) + expect_length(res$messages, 1) + + ## We no longer know, or care, if this is invalid due to children + file.create(file.path(root$path, "archive", "child", id2[[1]], "data.rds")) expect_equal( evaluate_promise(orderly_validate_archive(id3, root = root)), res)