diff --git a/DESCRIPTION b/DESCRIPTION index 6c8a23d9..560cd5c9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,7 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: dplyr, data.table, diff --git a/NAMESPACE b/NAMESPACE index 85ab7af7..9bb56a2c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -86,6 +86,7 @@ export(register_study_files) export(remanifest) export(remove_button) export(remove_wiki_subpage) +export(set_annotations) export(summarize_attribute) export(summarize_file_access) export(swap_col) diff --git a/R/annotation_qc.R b/R/annotation_qc.R index 6d6589d0..1e2e757f 100644 --- a/R/annotation_qc.R +++ b/R/annotation_qc.R @@ -163,7 +163,7 @@ infer_data_type <- function(dataset_id) { children <- first(children, 3) data_type <- c() for (entity in children) { - e <- .syn$getAnnotations(entity) + e <- .syn$get_annotations(entity) data_type <- append(data_type, e$Component) } data_type <- unique(data_type) diff --git a/R/annotations.R b/R/annotations.R index 17d14b9d..e331b281 100644 --- a/R/annotations.R +++ b/R/annotations.R @@ -1,16 +1,32 @@ +#' Wrapper around the Python `set_annotations` that pulls current annotations +#' and adds new annotations with given annotations data or replaces +#' data for annotations with the same keys existing on the entity. +#' @param id Synapse entity id. +#' @param annotations A flat list representing annotation key-value pairs, +#' e.g. `list(foo = "bar", rank = 1, authors = c("jack", "jane"))` +#' @export +set_annotations <- function(id, annotations) { + e_annotations <- .syn$get_annotations(id) + for (k in names(annotations)) { + e_annotations[k] <- annotations[[k]] + } + .syn$set_annotations(e_annotations) +} + #' Set annotations from a manifest -#' -#' The [Synapse docs](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html) -#' suggest doing batch annotations through a fileview. 
However, it is often simpler to -#' modify or set new annotations directly given a table of just the entities (rows) and props (cols) we want. +#' +#' The [Synapse docs](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html) +#' suggest doing batch annotations through a fileview. However, it is often simpler to +#' modify or set new annotations directly given a table of just the entities (rows) and props (cols) we want. #' This is like how schematic works, except without any validation (so works best for power-users who know the data model well). -#' Some desired defaults are taken into account, such as not submitting key-values with `NA` and empty strings. -#' -#' @param manifest A table manifest. Needs to contain `entityId`. +#' Some desired defaults are taken into account, such as not submitting key-values with `NA` and empty strings. +#' +#' @param manifest A `data.frame` representing a manifest. +#' Needs to contain `entityId` (if parsed from a standard manifest.csv, the df should already contain `entityId`). #' @param ignore_na Whether to ignore annotations that are `NA`; default TRUE. #' @param ignore_blank Whether to ignore annotations that are that empty strings; default TRUE. #' @param verbose Be chatty, default FALSE. -#' @export +#' @export annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TRUE, verbose = FALSE) { # Split by `entityId` annotations <- as.data.table(manifest) @@ -20,47 +36,45 @@ annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TR filterNA <- if(ignore_na) function(x) !any(is.na(x)) else TRUE # will ignore entirely if list with NA, e.g. 
c(NA, 1, 2) -- should warn if list filterBlank <- if(ignore_blank) function(x) !any(x == "") else TRUE # same as above annotations <- lapply(annotations, function(x) Filter(function(x) filterNA(x) & filterBlank(x) & length(x), unlist(x, recursive = F))) - for(entity in names(annotations)) { - .syn$setAnnotations(entity = entity, annotations = as.list(annotations[[entity]])) + for(entity_id in names(annotations)) { + set_annotations(entity_id, annotations[[entity_id]]) } if (verbose) message("Annotations submitted") } #' Copy annotations -#' +#' #' Copy annotations (all or selectively) from a source entity to one or more target entities. -#' If annotations already exist on target entities, the copy will replace the current values. -#' -#' @param entity_from Syn id from which to copy. -#' @param entity_to One or more syn ids to copy annotations to. -#' @param select Vector of properties to selectively copy if present on the entity. +#' If same annotation keys already exist on target entities, the copy will replace the current values. +#' +#' @param entity_from Syn id from which to copy. +#' @param entity_to One or more syn ids to copy annotations to. +#' @param select Vector of properties to selectively copy if present on the entity. #' If not specified, will copy over everything, which may not be desirable. -#' @param update Whether to immediately update or return annotation objects only. +#' @param update Whether to immediately update or return annotation objects only. 
#' @export copy_annotations <- function(entity_from, entity_to, select = NULL, update = FALSE) { - + .check_login() - - annotations <- .syn$get_annotations(entity_from) + + from_annotations <- .syn$get_annotations(entity_from) + # Check `select` if(is.null(select)) { - cp <- annotations + select <- names(from_annotations) } else { - cp <- reticulate::dict() - for(k in names(annotations)) { - if(k %in% select) cp[k] <- annotations[k] - } + select <- select[select %in% names(from_annotations)] } - - if(update) { - for(e in entity_to) { - .syn$setAnnotations(e, annotations = cp) + + for(id in entity_to) { + to_annotations <- .syn$get_annotations(id) + for(k in select) { + to_annotations[k] <- from_annotations[k] } - } else { - return(cp) + if(update) .syn$set_annotations(to_annotations) else return(to_annotations) } } diff --git a/R/calculate_related_studies.R b/R/calculate_related_studies.R index 2f02ceaa..72c506cc 100644 --- a/R/calculate_related_studies.R +++ b/R/calculate_related_studies.R @@ -111,9 +111,9 @@ calculate_related_studies <- function(study_table_id, for(i in 1:nrow(studies_updated)) { id <- studies_updated[i, "studyId"] relatedStudies <- studies_updated[i, "relatedStudies"] - annotations <- .syn$getAnnotations(id) + annotations <- .syn$get_annotations(id) annotations$relatedStudies <- relatedStudies - invisible(.syn$setAnnotations(id, annotations)) + invisible(set_annotations(id, annotations)) } } else { studies_updated diff --git a/R/register_study.R b/R/register_study.R index 84dabcdb..2ef95127 100644 --- a/R/register_study.R +++ b/R/register_study.R @@ -68,7 +68,7 @@ add_new_study_meta <- function(id, study_meta) { if(is.null(study_meta$studyStatus) || is.na(study_meta$studyStatus)) study_meta$studyStatus <- "Active" if(is.null(study_meta$dataStatus) || is.na(study_meta$dataStatus)) study_meta$dataStatus <- "Data Pending" - study <- .syn$setAnnotations(id, study_meta) + study <- set_annotations(id, study_meta) invisible(study) } diff --git 
a/README.md b/README.md index 88690d7c..cdc17761 100644 --- a/README.md +++ b/README.md @@ -8,20 +8,27 @@ The goal of `nfportalutils` is to provide convenience functions for project and (meta)data management in the NF-OSI data portal scope. Currently, `develop` branch is default so package install and docs refer to code in this branch. +The package interops with the [Python synapse client](https://github.com/Sage-Bionetworks/synapsePythonClient) via reticulate. +You will have to set up both (see #Installation). Outside of the tested versions, there may be some issues. The tested versions are: +- Python Synapse Client == 4.3.1 +- reticulate == 1.39.0 + ## Docs :point_right: [Package documentation!](https://nf-osi.github.io/nfportalutils/) ## Installation -You can install `nfportalutils` from here: +This presumes you have already set up R with RStudio. -``` r -remotes::install_github("nf-osi/nfportalutils") -``` - -The package interops with the [Python synapse client](https://github.com/Sage-Bionetworks/synapsePythonClient) via reticulate. -You will have to download the Python synapse client first. +1. Install `reticulate` following guide at https://rstudio.github.io/reticulate/index.html#installation. +2. Install `synapseclient==4.3.1` following https://rstudio.github.io/reticulate/articles/python_packages.html, which will use a default environment "r-reticulate". +3. Lastly, install `nfportalutils`. At startup, `nfportalutils` imports `synapseclient` from the default "r-reticulate". + - As regular users: `remotes::install_github("nf-osi/nfportalutils", build_vignettes = TRUE)` or `remotes::install_github("nf-osi/nfportalutils@some-branch", build_vignettes = TRUE)` + - For developers, presumably working with `devtools`: + - Clone the repo, checkout your desired development branch. + - Make sure the package repo root is working directory, then in R run `devtools::install()`. +4. Browse some vignettes: `browseVignettes("nfportalutils")`. 
## For Users diff --git a/_pkgdown.yml b/_pkgdown.yml index 8fe504be..b8825da4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -32,6 +32,7 @@ reference: - subtitle: General annotations desc: Add and manage annotations on Synapse entities - contents: + - set_annotations - update_study_annotations - annotate_with_manifest - copy_annotations diff --git a/man/annotate_with_manifest.Rd b/man/annotate_with_manifest.Rd index e499bb96..fb699ac6 100644 --- a/man/annotate_with_manifest.Rd +++ b/man/annotate_with_manifest.Rd @@ -12,7 +12,8 @@ annotate_with_manifest( ) } \arguments{ -\item{manifest}{A table manifest. Needs to contain \code{entityId}.} +\item{manifest}{A \code{data.frame} representing a manifest. +Needs to contain \code{entityId} (if parsed from a standard manifest.csv, the df should already contain \code{entityId}).} \item{ignore_na}{Whether to ignore annotations that are \code{NA}; default TRUE.} diff --git a/man/copy_annotations.Rd b/man/copy_annotations.Rd index 5c343464..9e6230d2 100644 --- a/man/copy_annotations.Rd +++ b/man/copy_annotations.Rd @@ -18,5 +18,5 @@ If not specified, will copy over everything, which may not be desirable.} } \description{ Copy annotations (all or selectively) from a source entity to one or more target entities. -If annotations already exist on target entities, the copy will replace the current values. +If same annotation keys already exist on target entities, the copy will replace the current values. 
} diff --git a/man/set_annotations.Rd b/man/set_annotations.Rd new file mode 100644 index 00000000..ff0db764 --- /dev/null +++ b/man/set_annotations.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/annotations.R +\name{set_annotations} +\alias{set_annotations} +\title{Wrapper around the Python \code{set_annotations} that pulls current annotations +and adds new annotations with given annotations data or replaces +data for annotations with the same keys existing on the entity.} +\usage{ +set_annotations(id, annotations) +} +\arguments{ +\item{id}{Synapse entity id.} + +\item{annotations}{A flat list representing annotation key-value pairs, +e.g. \code{list(foo = "bar", rank = 1, authors = c("jack", "jane"))}} +} +\description{ +Wrapper around the Python \code{set_annotations} that pulls current annotations +and adds new annotations with given annotations data or replaces +data for annotations with the same keys existing on the entity. +} diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R index c3fce801..73f37743 100644 --- a/tests/testthat/helpers.R +++ b/tests/testthat/helpers.R @@ -1,12 +1,12 @@ # Implementing skips according to suggested handling when using reticulate # See https://rstudio.github.io/reticulate/articles/package.html # Skips tests on CRAN machines or other incompatible testing environments -# where Python can't be configured so package checks don't fail +# where Python can't be configured so package checks don't fail # Skip if Python synapseclient module not installed/accessible # This is normally imported upon package load, see `zzz.R` skip_if_no_synapseclient <- function() { - have_synapseclient <- py_module_available("synapseclient") + have_synapseclient <- py_module_available("synapseclient") if(!have_synapseclient) skip("synapseclient not available for testing") } @@ -14,14 +14,14 @@ skip_if_no_synapseclient <- function() { # Skip if Python synapseutils module not 
installed/accessible # This is normally imported upon package load, see `zzz.R` skip_if_no_synapseutils <- function() { - have_synapseutils <- py_module_available("synapseclient") + have_synapseutils <- py_module_available("synapseclient") if(!have_synapseutils) skip("synapseutils not available for testing") } # Skip if no pandas; pandas is needed for smaller subset of functions in the package skip_if_no_pandas <- function() { - have_pandas <- py_module_available("pandas") + have_pandas <- py_module_available("pandas") if(!have_pandas) skip("pandas not available for testing") } @@ -37,6 +37,6 @@ skip_if_no_token <- function() { # (e.g. someone pasted in wrong token), this creates a skip cascade for tests that presume # successful login. skip_if_no_login <- function() { - if(!exists(".syn")) + if(!exists(".syn") || is.null(.syn$username)) skip("not logged in for tests") } diff --git a/tests/testthat/test-add_pubmed_publications.R b/tests/testthat/test-add_pubmed_publications.R deleted file mode 100644 index 8849056e..00000000 --- a/tests/testthat/test-add_pubmed_publications.R +++ /dev/null @@ -1,3 +0,0 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) -}) diff --git a/tests/testthat/test_copy_annotations.R b/tests/testthat/test_copy_annotations.R new file mode 100644 index 00000000..9a2ed138 --- /dev/null +++ b/tests/testthat/test_copy_annotations.R @@ -0,0 +1,47 @@ +test_that("Copy annotations works", { + skip_if_no_synapseclient() + skip_if_no_login() + + PARENT_TEST_PROJECT <- "syn26462036" + # Create some folder objects with some annotations + entity_a <- synapseclient$Folder("Entity A", + parent = PARENT_TEST_PROJECT, + annotations = list(foo = "bar", favorites = c("raindrops", "whiskers"))) + entity_a <- .syn$store(entity_a) + + entity_b <- synapseclient$Folder("Entity B", + parent = PARENT_TEST_PROJECT, + annotations = list(favorites = c("kettles", "mittens"), after_a = TRUE)) + entity_b <- .syn$store(entity_b) + + entity_c <- 
synapseclient$Folder("Entity C", + parent = PARENT_TEST_PROJECT) + entity_c <- .syn$store(entity_c) + + # when copying all annotations from A->B (default) + copy_annotations(entity_from = entity_a$properties$id, + entity_to = entity_b$properties$id, + select = NULL, + update = TRUE) + + # when copying selective annotations from A->C + copy_annotations(entity_from = entity_a$properties$id, + entity_to = entity_c$properties$id, + select = c("favorites", "key_not_on_a"), + update = TRUE) + + result_b <- .syn$get_annotations(entity_b) + result_c <- .syn$get_annotations(entity_c) + .syn$delete(entity_a) + .syn$delete(entity_b) + .syn$delete(entity_c) + testthat::expect_equal(result_b$foo, "bar") + testthat::expect_equal(result_b$favorites, c("raindrops", "whiskers")) + testthat::expect_equal(result_b$after_a, TRUE) + testthat::expect_error(result_c$foo) # Expect KeyError since key should not be present + testthat::expect_equal(result_c$favorites, c("raindrops", "whiskers")) + testthat::expect_error(result_c$key_not_on_a) + +}) + + diff --git a/tests/testthat/test_manifest_annotations.R b/tests/testthat/test_manifest_annotations.R new file mode 100644 index 00000000..0d6088c6 --- /dev/null +++ b/tests/testthat/test_manifest_annotations.R @@ -0,0 +1,23 @@ +test_that("Annotate with manifest works", { + skip_if_no_synapseclient() + skip_if_no_login() + + PARENT_TEST_PROJECT <- "syn26462036" + # Use some folders to represent objects to annotate + objs <- make_folder(parent = PARENT_TEST_PROJECT, folders = c("mock_file_1", "mock_file_2", "mock_file_3")) + ids <- sapply(objs, function(x) x$properties$id) + # Partial manifest as a data.table with list columns + manifest <- data.table( + entityId = ids, + assay = "drugScreen", + experimentalTimepoint = c(1L, 3L, 7L), + experimentalTimepointUnit = "days", + cellType = list(c("schwann", "macrophage"), c("schwann", "macrophage"), c("schwann", "macrophage")) + ) + annotate_with_manifest(manifest) + remanifested <- list() + for(i in 
ids) { + remanifested[[i]] <- .syn$get_annotations(i) + } + for(i in ids) .syn$delete(i) +}) diff --git a/vignettes/annotate-data-intro.Rmd b/vignettes/annotate-data-intro.Rmd new file mode 100644 index 00000000..588f7526 --- /dev/null +++ b/vignettes/annotate-data-intro.Rmd @@ -0,0 +1,90 @@ +--- +title: "Introduction to utils for annotating data" +output: rmarkdown::html_vignette +date: 2022-10-17 +vignette: > + %\VignetteIndexEntry{Introduction to utils for annotating data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Intro + +This introduces the annotation utilities with typical examples. +This is expected to be the more useful starting point for using nfportalutils for annotation tasks, to be followed by the more specialized vignette for annotating NF processed data if needed. + +### Set up +```{r, eval=FALSE} +library(nfportalutils) + +syn_login() + +# Change this to a dev project you have access to +PROJECT <- "syn26462036" +``` + +### Set annotations on a single file + +Create a demo entity. +```{r, eval=FALSE} + +synapseclient <- reticulate::import("synapseclient") +# Create an entity with some initial annotations +entity <- synapseclient$Folder("Demo Entity", + parent = PROJECT, + annotations = list(foo = "bar", favorites = c("raindrops", "whiskers"))) + +entity <- .syn$store(entity) +``` + +`set_annotations` can be used to add new annotations or correct an existing annotation on an entity. +This wraps the Python client to make it more intuitive to pass in an R list as the annotations as above. +Here, add another annotation *and* correct the `favorites` to "chocolate". +The returned data shows the unchanged `foo`, the updated `favorites`, and a new `n`. +```{r, eval=FALSE} +set_annotations(id = entity$properties$id, annotations = list(favorites = "chocolate", n = 7L)) +``` + +Cleanup. 
+```{r, eval=FALSE} +.syn$delete(entity) +``` + +### Annotate in batch using a manifest + +A better way to use `set_annotations` for a set of entities, usually files. + +First create multiple entities that need to be annotated or corrected in batch. +```{r, eval=FALSE} +objs <- make_folder(parent = PROJECT, folders = c("mock_file_1", "mock_file_2", "mock_file_3")) +ids <- sapply(objs, function(x) x$properties$id) +``` + +Create example manifest. Note: Another way includes reading in a schematic csv manifest with entityIds and Filenames. +```{r, eval=FALSE} +manifest <- data.table( + entityId = ids, + assay = "drugScreen", + experimentalTimepoint = c(1L, 3L, 7L), + experimentalTimepointUnit = "days", + cellType = list(c("schwann", "macrophage"), c("schwann", "macrophage"), c("schwann", "macrophage")) + ) +manifest +``` + +Apply: +```{r, eval=FALSE} +annotate_with_manifest(manifest) +``` + +Cleanup. +```{r, eval=FALSE} +for (id in ids) .syn$delete(id) +```