diff --git a/R/outpack_helpers.R b/R/outpack_helpers.R index 0fdce5d3..68cd5057 100644 --- a/R/outpack_helpers.R +++ b/R/outpack_helpers.R @@ -120,13 +120,25 @@ orderly_copy_files <- function(..., files, dest, overwrite = TRUE, tryCatch( file_export(root, id, plan$there, plan$here, dest, overwrite), not_found_error = function(e) { - if (!as_orderly_search_options(options)$allow_remote) { - stop(paste0( - "Unable to copy files, as they are not available locally\n", - "To fetch from a location, try again with", - " 'options = list(allow_remote = TRUE)'\n", - "Original error:\n", e$message), - call. = FALSE) + if (id %in% root$index$unpacked()) { + ## The most likely reason for things to have failed is that + ## the user has deleted part of the archive. + name <- outpack_metadata_core(id, root)$name + packet_exists <- file.exists( + file.path(root$path, root$config$core$path_archive, name, id)) + reason <- if (packet_exists) "locally modified" else "deleted" + cmd <- sprintf( + 'orderly2::orderly_validate_archive("%s", action = "orphan")', id) + cli::cli_abort( + c("Unable to copy files, due to {reason} packet {id}", + i = "Consider '{cmd}' to remove this packet from consideration"), + parent = e) + } else if (!as_orderly_search_options(options)$allow_remote) { + cli::cli_abort( + c("Unable to copy files, as they are not available locally", + i = paste("To fetch from a location, try again with", + "options = list(allow_remote = TRUE)")), + parent = e) } copy_files_from_remote(id, plan$there, plan$here, dest, overwrite, root, environment()) diff --git a/R/outpack_packet.R b/R/outpack_packet.R index 056b51a5..5683436c 100644 --- a/R/outpack_packet.R +++ b/R/outpack_packet.R @@ -87,7 +87,7 @@ outpack_packet_end <- function(packet, insert = TRUE) { packet$time$end <- Sys.time() hash_algorithm <- packet$root$config$core$hash_algorithm elapsed_str <- format(packet$time$end - packet$time$start) - cli::cli_alert_success( + cli::cli_alert_info( "Finished {packet$id} at {packet$time$end} ({elapsed_str})") json <- outpack_metadata_create(packet$path, packet$name, packet$id, packet$time, diff --git a/R/outpack_root.R b/R/outpack_root.R index 87f9d6b6..4a29bc10 100644 --- a/R/outpack_root.R +++ b/R/outpack_root.R @@ -26,7 +26,7 @@ outpack_root <- R6::R6Class( ## Not just for the file store, but this is how we can interact with ## the files safely: -file_export <- function(root, id, there, here, dest, overwrite) { +file_export <- function(root, id, there, here, dest, overwrite, call = NULL) { ## This validation *always* occurs; does the packet even claim to ## have this path? validate_packet_has_file(root, id, there) @@ -51,22 +51,24 @@ file_export <- function(root, id, there, here, dest, overwrite) { } } else { there_full <- file.path(root$path, root$config$core$path_archive, - meta$name, meta$id, there) + meta$name, meta$id, there) if (!all(file.exists(there_full))) { - missing <- hash[!file.exists(there_full)] - message <- paste("File not found in archive:\n%s", - paste(sprintf(" - %s", missing), collapse = "\n")) - stop(not_found_error(message, missing)) + cli::cli_abort( + c("File not found in archive", + set_names(there[!file.exists(there_full)], "x")), + class = "not_found_error", + call = call) } - ## TODO: Ideally we would have an argument/option support a faster - ## possibility here if requested (e.g., no validation validate just - ## size, validate hash); this only applies to this non-file-store - ## using branch, so typically would affect users running "draft" - ## type analyses for (i in seq_along(here_full)) { tryCatch( hash_validate_file(there_full[[i]], hash[[i]]), - error = function(e) stop(not_found_error(e$message, there_full[[i]]))) + error = function(e) { + cli::cli_abort( + "File '{there}' in '{meta$name}/{meta$id}' is corrupt", + parent = e, + class = "not_found_error", + call = call) + }) } fs::file_copy(there_full, here_full, overwrite) } diff --git a/R/util.R b/R/util.R index 5ff53269..098ea7d3 100644 --- a/R/util.R +++ b/R/util.R @@ -30,6 +30,9 @@ vnapply <- function(X, FUN, ...) { # nolint set_names <- function(x, nms) { + if (length(nms) == 1 && length(x) != 1) { + nms <- rep_len(nms, length(x)) + } names(x) <- nms x } diff --git a/tests/testthat/test-run.R b/tests/testthat/test-run.R index 468f6cc4..e6ea573b 100644 --- a/tests/testthat/test-run.R +++ b/tests/testthat/test-run.R @@ -1009,3 +1009,76 @@ test_that("can run example with artefacts and no resources", { expect_true(file.exists( file.path(path, "archive", "implicit", id, "mygraph.png"))) }) + + +test_that("cope with manually deleted packets, exclude from deps", { + path <- test_prepare_orderly_example(c("data", "depends")) + ids <- vcapply(1:3, function(i) { + orderly_run_quietly("data", envir = new.env(), root = path) + }) + + id <- ids[[3]] + unlink(file.path(path, "archive", "data", id), recursive = TRUE) + + err <- expect_error( + orderly_run_quietly("depends", root = path, envir = new.env()), + "Failed to run report") + expect_equal( + err$parent$message, + set_names(paste("Unable to copy files, due to deleted packet", id), + "")) + cmd <- sprintf('orderly2::orderly_validate_archive("%s", action = "orphan")', + id) + expect_equal( + err$parent$body, + c(i = sprintf("Consider '%s' to remove this packet from consideration", + cmd))) + + expect_s3_class(err$parent$parent, "not_found_error") + expect_equal( + err$parent$parent$message, + set_names("File not found in archive", "")) + expect_equal(err$parent$parent$body, c(x = "data.rds")) + + suppressMessages(orderly_validate_archive(id, action = "orphan", root = path)) + id2 <- orderly_run_quietly("depends", root = path, envir = new.env()) + expect_equal(orderly_metadata(id2, path)$depends$packet, ids[[2]]) +}) + + +test_that("cope with corrupted packets, exclude from deps", { + path <- test_prepare_orderly_example(c("data", "depends")) + ids <- vcapply(1:3, function(i) { + orderly_run_quietly("data", envir = new.env(), root = path) + }) + + id <- ids[[3]] + file.create(file.path(path, "archive", "data", id, "data.rds")) # truncate + + err <- expect_error( + orderly_run_quietly("depends", root = path, envir = new.env()), + "Failed to run report") + expect_equal( + err$parent$message, + set_names(paste("Unable to copy files, due to locally modified packet", id), + "")) + cmd <- sprintf('orderly2::orderly_validate_archive("%s", action = "orphan")', + id) + expect_equal( + err$parent$body, + c(i = sprintf("Consider '%s' to remove this packet from consideration", + cmd))) + + expect_s3_class(err$parent$parent, "not_found_error") + expect_equal( + err$parent$parent$message, + sprintf("File 'data.rds' in 'data/%s' is corrupt", id)) + expect_null(err$parent$parent$body) + expect_match( + err$parent$parent$parent$message, + "Hash of '.+/data.rds' does not match!") + + suppressMessages(orderly_validate_archive(id, action = "orphan", root = path)) + id2 <- orderly_run_quietly("depends", root = path, envir = new.env()) + expect_equal(orderly_metadata(id2, path)$depends$packet, ids[[2]]) +}) diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R index 4577c981..02672683 100644 --- a/tests/testthat/test-util.R +++ b/tests/testthat/test-util.R @@ -282,3 +282,15 @@ test_that("can print pretty bytes", { expect_equal(pretty_bytes(5000000000), "5,000 MB") expect_equal(pretty_bytes(5123456789), "5,123.5 MB") }) + + +test_that("set_names copes with common pathologies", { + expect_equal(set_names(character(), "x"), + structure(character(), names = character())) + expect_equal(set_names("a", "x"), + c("x" = "a")) + expect_equal(set_names(c("a", "b"), "x"), + c("x" = "a", x = "b")) + expect_equal(set_names(c("a", "b"), c("x", "y")), + c("x" = "a", y = "b")) +}) diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd index 883c44d3..4ec9fed8 100644 --- a/vignettes/introduction.Rmd +++ b/vignettes/introduction.Rmd @@ -22,6 +22,10 @@ orderly_file <- function(...) { system.file(..., package = "orderly2", mustWork = TRUE) } +inline <- function(x) { + sprintf("`%s`", format(x)) +} + knitr::opts_chunk$set( collapse = TRUE) ``` @@ -64,7 +68,7 @@ An orderly report is a directory `src/` containing a file `orderly.R`. Tha ```{r, include = FALSE} fs::dir_create(file.path(path, "src", "incoming_data")) -write.csv(data.frame(x = 1:10, y = 1:10 + rnorm(10)), +write.csv(data.frame(x = 1:10, y = 1:10 + rnorm(10)), file.path(path, "src", "incoming_data", "data.csv"), row.names = FALSE) writeLines(c( @@ -227,7 +231,7 @@ This creates a report that has a single parameter `n_samples` with a default val orderly2::orderly_parameters(n_samples = NULL)` ``` -to define a parameter with no default, or defined multiple parameters with +to define a parameter with no default, or defined multiple parameters with ```r orderly2::orderly_parameters(n_samples = 10, distribution = "normal") @@ -382,6 +386,49 @@ orderly2::orderly_gitignore_update("incoming_data", root = path) This creates (or updates) a `.gitignore` file within the report so that generated files will not be included by git. If you have already accidentally committed them then the gitignore has no real effect and you should do some git surgery, see the git manuals or this [handy, if profane, guide](https://ohshitgit.com/). +# Deleting things from the archive + +If you delete packets from your `archive/` directory then this puts `orderly2` into an inconsistent state with its metadata store. Sometimes this does not matter (e.g., if you delete old copies that would never be candidates for inclusion with `orderly2::orderly_dependency` you will never notice). However, if you delete the most recent copy of a packet and then try and depend on it, you will get an error. + +At the moment, we have two copies of the `incoming_data` task: + +```{r} +orderly2::orderly_metadata_extract( + name = "incoming_data", + extract = c(time = "time.start"), + root = path) +``` + +```{r include = FALSE} +id_latest <- orderly2::orderly_search("latest", name = "incoming_data", + root = path) +unlink(file.path(path, "archive", "incoming_data", id_latest), recursive = TRUE) +``` + +When we run the `analysis` task, it will pull in the most recent version (`r inline(id_latest)`). However, if you had deleted this manually (e.g., to save space or accidentally) or corrupted it (e.g., by opening some output in Excel and letting it save changes) it will not be able to be included, and running `analysis` will fail: + +```{r error = TRUE} +orderly2::orderly_run("analysis", root = path) +``` + +The error here tries to be fairly informative, telling us that we failed because when copying files from `r inline(id_latest)` we found that the packet was corrupt, because the file `data.rds` was not found in the archive. It also suggests a fix; we can tell `orderly2` that `r inline(id_latest)` is "orphaned" and should not be considered for inclusion when we look for dependencies. + +We can carry out the suggestion and just validate this packet by running + +```{r echo = FALSE, results = "asis"} +r_output( + sprintf('orderly2::orderly_validate_archive("%s", action = "orphan")', + id_latest)) +``` + +or we can validate *all* the packets we have: + +```{r} +orderly2::orderly_validate_archive(action = "orphan", root = path) +``` + +If we had the option `core.require_complete_tree` enabled, then this process would also look for any packets that used our now-deleted packet and orphan those too, as we no longer have a complete tree that includes them. + # Debugging and coping with errors (To be written) diff --git a/vignettes/migrating.Rmd b/vignettes/migrating.Rmd index 17c4c9ed..5a4481fd 100644 --- a/vignettes/migrating.Rmd +++ b/vignettes/migrating.Rmd @@ -75,6 +75,8 @@ In version 1, we had built-in support for accessing data from SQL databases; thi `orderly2` no longer requires a separate `orderly_commit()` call after `orderly_run()`; we no longer make a distinction between local draft and archive packets. Instead, we have added finer-grained control over where dependencies are resolved from (locally, or from some subset of your servers), which generalises the way that draft/archive was used in practice. See `?orderly_run` for more details on how dependencies are resolved. +This has implications for deleting things; the draft directory was always an easy target for deletion, but now after deletion you will need to tell `orderly2` that you have deleted things. See `vignette("introduction")` for details on this (section "Deleting things from the archive"). + ## No more testing or development mode We have had two different, but unsatisfactory, mechanisms for developing an orderly report: @@ -120,3 +122,5 @@ We will merge `orderly2` into the `orderly` package, so once we are ready for re * The YAML format is inflexible, error prone for users, and leads to duplication * It was too focussed around our initial needs with the [Vaccine Impact Modelling Consortium](https://vaccineimpact.org) +* It was fairly easy to get your archive and SQLite database into an inconsistent state (e.g., by deleting or moving files from the archive) +* The SQLite database behaved poorly on shared file systems