Skip to content

Commit

Permalink
Merge pull request #171 from nf-osi/patch/synapser-phase-a
Browse files Browse the repository at this point in the history
synapser refactor phase a
  • Loading branch information
anngvu authored Jul 15, 2024
2 parents 4320d35 + 9b87c11 commit 0ba25ec
Show file tree
Hide file tree
Showing 36 changed files with 929 additions and 471 deletions.
17 changes: 9 additions & 8 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@ jobs:
fail-fast: false
matrix:
config:
- {os: macOS-13, r: 'release'}
- {os: windows-2022, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}
- {os: macOS-13, r: '4.2.1'}
- {os: windows-2022, r: '4.2.1'}
- {os: ubuntu-latest, r: '4.3.3'}
- {os: ubuntu-latest, r: '4.2.1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
Expand All @@ -46,12 +45,14 @@ jobs:
extra-packages: any::rcmdcheck
needs: check

- name: If source build fails on macOS or Windows, fall back to typical install
if: steps.install-deps.outcome == 'failure'
- name: Install working archive version for macOS and Windows
if: runner.os == 'Windows' || runner.os == 'macOS'
shell: Rscript {0}
run: |
reticulate::install_miniconda()
install.packages("synapser", repos="http://ran.synapse.org")
reticulate::install_python("3.9.12")
reticulate::py_install("synapseclient==3.1.1", pip = TRUE)
install.packages("https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.tar.gz", repos=NULL, type="source")
- uses: r-lib/actions/check-r-package@v2
with:
Expand Down
13 changes: 12 additions & 1 deletion .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@ jobs:
- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true


- name: Install synapseclient
run: |
pip install synapseclient
synapse --version
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::pkgdown, local::.
Expand All @@ -37,6 +42,12 @@ jobs:
run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
shell: Rscript {0}

- name: Deploy preview for PRs
if: github.event_name == 'pull_request'
uses: rossjrw/pr-preview-action@v1
with:
source-dir: ./docs

- name: Deploy to GitHub pages 🚀
if: github.event_name != 'pull_request'
uses: JamesIves/[email protected]
Expand Down
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ Imports:
plyr,
readxl,
yaml,
synapser (>= 1.0.0)
synapser (>= 1.0.0),
synapser (< 2.0.0)
URL: https://github.com/nf-osi/nfportalutils
BugReports: https://github.com/nf-osi/nfportalutils/issues
Suggests:
Expand All @@ -52,4 +53,4 @@ Suggests:
Config/testthat/edition: 2
VignetteBuilder: knitr
Remotes:
github::Sage-Bionetworks/synapser
github::Sage-Bionetworks/synapser@1.3.0
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ export(check_readpair_validity)
export(check_wiki_links)
export(convert_to_stringlist)
export(copy_annotations)
export(copy_table)
export(data_curator_app_subpage)
export(delete_provenance)
export(dsp_dataset_mapping)
Expand Down
85 changes: 43 additions & 42 deletions R/access_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ summarize_file_access <- function(principal_id, # 3378999 for NF-OSI
fileview_id # "syn16858331"
) {

.check_login()
tryCatch({
view <- .syn$tableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}"))
view <- synapser::synTableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}")) %>%
synapser::as.data.frame() %>%
as.data.table()
}, error = function(e) stop("Could not query view!"))
view <- as.data.table(view$asDataFrame())
files_by_benefactor <- view[type == "file", .N, by = .(benefactorId)]
access <- view[, check_access(benefactorId, principal_id, access_type), by = .(benefactorId)]
# files_by_benefactor can be smaller than access because there are folders without files
Expand All @@ -56,7 +56,7 @@ check_access <- function(id,
stopifnot(is.numeric(principal_id))

acl_result <- tryCatch({
.syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{id}/acl"))$resourceAccess %>%
synapser::synRestGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{id}/acl"))$resourceAccess %>%
rbindlist(.)
}, error = function(e) stop(glue::glue("Error for {id}: {e$message}")))

Expand All @@ -69,26 +69,26 @@ check_access <- function(id,

# -- SETTING ACCESS -------------------------------------------------------------#

#' Set public access to VIEW (READ) only for an entity
#'
#' Set both registered users and non-registered users to have VIEW-only permissions.
#' Set public access to VIEW (READ) only for an entity
#'
#' Set both registered users and non-registered users to have VIEW-only permissions.
#' See \code{\link{make_public}} for more permissive permissions that also allow download (for registered users), which are usually set later at data release time.
#'
#'
#' @param id Synapse entity id.
#' @export
make_public_viewable <- function(id) {
.check_login()

ALL_REGISTERED_SYNAPSE_USERS_GROUP <- "273948"
PUBLIC_GROUP <- "273949"
# set registered synapse users to view only
.syn$setPermissions(entity = id,
principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP,
accessType = list("READ"))
synapser::synSetPermissions(entity = id,
principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP,
accessType = list("READ"))

# set public to view
.syn$setPermissions(entity = id,
principalId = PUBLIC_GROUP,
accessType = list("READ"))
synapser::synSetPermissions(entity = id,
principalId = PUBLIC_GROUP,
accessType = list("READ"))
}


Expand All @@ -101,18 +101,18 @@ make_public_viewable <- function(id) {
#' @param id Synapse entity id.
#' @export
make_public <- function(id) {
.check_login()

ALL_REGISTERED_SYNAPSE_USERS_GROUP <- "273948"
PUBLIC_GROUP <- "273949"
# set registered synapse users to view, download
.syn$setPermissions(entity = id,
principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP,
accessType = list("READ","DOWNLOAD"))
synapser::synSetPermissions(entity = id,
principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP,
accessType = list("READ","DOWNLOAD"))

# set public to view
.syn$setPermissions(entity = id,
principalId = PUBLIC_GROUP,
accessType = list("READ"))
synapser::synSetPermissions(entity = id,
principalId = PUBLIC_GROUP,
accessType = list("READ"))
}


Expand All @@ -130,49 +130,50 @@ make_public <- function(id) {
#' @param dataset_name Optional name for dataset to be created
#' @export
grant_specific_file_access <- function(principal_id, entity_ids, create_dataset = F, project_id = NULL, dataset_name = NULL) {
# .check_login()

if(create_dataset & is.null(project_id)){
if(create_dataset && is.null(project_id)){
stop("project_id must be provided if create_dataset = T")
}

# grant the specified principal view and download access on each entity
sapply(entity_ids, function(id){
.syn$setPermissions(entity = id,
principalId = principal_id,
accessType = list("READ","DOWNLOAD"))
synapser::synSetPermissions(entity = id,
principalId = principal_id,
accessType = list("READ","DOWNLOAD"))
})

##need to grab the current versions for dataset creation
dataset_items <- lapply(entity_ids, function(id){
vsn <- .syn$get(id, downloadFile = F)$versionNumber
list(entityId = id, versionNumber = vsn)
})

if(is.null(dataset_name)){
dataset_name <- glue::glue("Dataset {Sys.Date()} for {principal_id}")
}

if(create_dataset){
tryCatch({
# First attempt with addAnnotationColumns = TRUE
dataset <- .syn$store(synapseclient$Dataset(name = dataset_name,
parent = project_id, dataset_items = dataset_items, addAnnotationColumns = TRUE))
dataset <- new_dataset(name = dataset_name,
parent = project_id,
items = entity_ids,
addAnnotationColumns = TRUE,
dry_run = FALSE)
message(glue::glue("{emoji::emoji(\"thumbsup\")} Dataset created with annotation columns at {dataset$properties$id}"))
}, error = function(e) {
# If error, retry with addAnnotationColumns = FALSE
dataset <- .syn$store(synapseclient$Dataset(name = dataset_name,
parent = project_id, dataset_items = dataset_items, addAnnotationColumns = FALSE))
.syn$setPermissions(entity = dataset$properties$id, principalId = principal_id,
accessType = list("READ", "DOWNLOAD"))
message(glue::glue("{emoji::emoji(\"warning\")} Dataset created without annotation columns at {dataset$properties$id}. Annotation columns will need to be added manually."))
dataset <- new_dataset(name = dataset_name,
parent = project_id,
items = entity_ids,
addAnnotationColumns = FALSE,
dry_run = FALSE)
synapser::synSetPermissions(entity = dataset$properties$id,
principalId = principal_id,
accessType = list("READ", "DOWNLOAD"))
message(glue::glue("{emoji::emoji(\"warning\")} Dataset created without annotation columns at {dataset$properties$id}.
Annotation columns will need to be added manually."))
})
}

message(glue::glue('{emoji::emoji("astonished")} Principal {principal_id} added to {length(entity_ids)} entities'))

#TODO: set schema programmatically? might be easier to add annotations to schema in web client as needed to support principal_id...
## Note Dec 2023; schema is automatically defined unless there is an error caused by the way synapse detects annotation schemas, e.g. a type collision that causes duplicate columns with the same name.
## Note Dec 2023; schema is automatically defined unless there is an error caused by the way synapse detects annotation schemas, e.g. a type collision that causes duplicate columns with the same name.

}

29 changes: 18 additions & 11 deletions R/add_publication_from_pubmed.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@
.add_publication_from_pubmed <- function(batch = 0L, cache = batch) { # implement logging for batch?
pmids <- new_data <- NULL
counter <- 0L
function(pmid, study_id, disease_focus, manifestation,
function(pmid, study_id,
disease_focus = c(""), manifestation = c(""),
publication_table_id, study_table_id, dry_run = T) {

.check_login()

counter <<- counter + 1L
# cat("current record:", counter) # make verbose?
# Query only for data needed, i.e. PMID to check non-dup; result can be cached
if(is.null(pmids)) {
pmids <- table_query(publication_table_id, "pmid") %>% unlist(use.names = F)
pmids <- synapser::synTableQuery(glue::glue("select pmid from {publication_table_id}")) %>%
synapser::as.data.frame() %>%
unlist(use.names = F)
pmids <- gsub("PMID:", "", pmids)
if(cache) pmids <<- pmids
}

Expand All @@ -25,9 +27,14 @@
if(!length(record)) return()

study_id_set <- glue::glue_collapse(glue::single_quote(study_id), sep = ", ")
study <- .syn$tableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))$asDataFrame()
record <- cbind(record, diseaseFocus = I(list(disease_focus)), manifestation = I(list(manifestation)),
studyId = I(list(study$studyId)), studyName = I(list(study$studyName)), fundingAgency = I(list(study$fundingAgency)))
study <- synapser::synTableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"), includeRowIdAndRowVersion = F)%>%
synapser::as.data.frame()
record <- cbind(record,
diseaseFocus = I(list(disease_focus)),
manifestation = I(list(manifestation)),
studyId = I(list(study$studyId)),
studyName = I(list(study$studyName)),
fundingAgency = I(unique(sapply(study$fundingAgency, jsonlite::fromJSON))))

# If batch mode, rbind and defer table schemafication until all records processed
if(batch) {
Expand All @@ -37,8 +44,8 @@
new_data <- as_table_schema(record, publication_table_id)
}
if(!dry_run) {
new_data <- .syn$store(new_data)
message(glue::glue('PMID:{new_data$asDataFrame()$pmid} added!'))
new_data <- synapser::synStore(new_data)
message(glue::glue('Added new pmid(s)!'))
} else {
new_data
}
Expand All @@ -54,8 +61,8 @@
#'
#' @param pmid PubMed ID (*not* PMCID) of the publication to be added.
#' @param study_id Synapse id(s) of the study that are associated with the publication.
#' @param disease_focus The disease focus(s) that are associated with the publication.
#' @param manifestation The manifestation(s) that are associated with the publication.
#' @param disease_focus (Optional) The disease focus(es) that are associated with the publication.
#' @param manifestation (Optional) The manifestation(s) that are associated with the publication.
#' @param publication_table_id Synapse id of the portal publication table. Must have write access.
#' @param study_table_id Synapse id of the portal study table. Need read access.
#' @param dry_run Default = TRUE. Skips upload to table and instead prints formatted publication metadata.
Expand Down
6 changes: 2 additions & 4 deletions R/add_publication_from_unpaywall.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,9 @@ add_publication_from_unpaywall <- function(publication_table_id,

#TODO: Check schema up-front and convert metadata to json in correct format

.check_login()
schema <- synapser::synGet(entity = publication_table_id)

schema <- .syn$get(entity = publication_table_id)

pub_table <- .syn$tableQuery(glue::glue('select * from {publication_table_id}'))$filepath %>%
pub_table <- synapser::synTableQuery(glue::glue('select * from {publication_table_id}'))$filepath %>%
readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as columns

if(doi %in% pub_table$doi){
Expand Down
23 changes: 12 additions & 11 deletions R/annotation_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#' @param output_format Format of 'excel', 'google_sheet', or 'dataframe'. Defaults to 'excel'.
#' @param use_annotations Use annotations if filling out manifest for existing dataset. Defaults to TRUE for NF.
#' @param service Service endpoint to use. Defaults to the schematic production endpoint.
#' @param access_token Synapse auth token, defaults to `SYNAPSE_AUTH_TOKEN` set in env.
#' @returns For excel, path to local file; for google_sheet, URL to sheet; for dataframe, JSON string of data.
#' @export
manifest_generate <- function(data_type,
Expand All @@ -20,11 +21,11 @@ manifest_generate <- function(data_type,
asset_view = "syn16858331",
output_format = "excel",
use_annotations = TRUE,
service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate") {
service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate",
access_token = Sys.getenv("SYNAPSE_AUTH_TOKEN")) {

# yes, param needs to be re-encoded like this for 'dataframe'
output_format_param <- if (output_format == "dataframe") "dataframe (only if getting existing manifests)" else output_format
access_token <- .syn$credentials$secret
use_annotations <- tolower(as.character(use_annotations))

req <- httr::GET(service,
Expand Down Expand Up @@ -157,13 +158,13 @@ manifest_passed <- function(result) {
#' @export
infer_data_type <- function(dataset_id) {

children <- .syn$getChildren(dataset_id)
children <- reticulate::iterate(children)
children <- synapser::synGetChildren(dataset_id)
children <- synapser::as.list(children)
if(!length(children)) return(list(result = NA, notes = "Empty dataset folder"))
children <- first(children, 3)
data_type <- c()
for (entity in children) {
e <- .syn$getAnnotations(entity)
e <- synapser::synGetAnnotations(entity)
data_type <- append(data_type, e$Component)
}
data_type <- unique(data_type)
Expand Down Expand Up @@ -202,9 +203,9 @@ meta_qc_dataset <- function(dataset_id,
schema_url = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld",
cleanup = TRUE) {

dataset_name <- .syn$get(dataset_id)$properties$name
dataset_name <- synapser::synGet(dataset_id)$properties$name

files <- reticulate::iterate(.syn$getChildren(dataset_id))
files <- synapser::as.list(synapser::synGetChildren(dataset_id))
if(!length(files)) {
return(list(result = NA,
notes = "Empty dataset with no files",
Expand Down Expand Up @@ -304,15 +305,15 @@ list_project_datasets <- function(project_id,

} else {

in_data <- .syn$getChildren(data_root)
in_data <- reticulate::iterate(in_data)
in_data <- synapser::synGetChildren(data_root)
in_data <- synapser::as.list(in_data)
datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.Folder", in_data)
if(!length(datasets)) warning("No datasets found under data root.")
datasets
}
} else {
children <- .syn$getChildren(project_id)
datasets <- reticulate::iterate(children)
children <- synapser::synGetChildren(project_id)
datasets <- synapser::as.list(children)
datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.table.Dataset", datasets)
if(!length(datasets)) warning("No dataset entities found in project.")
datasets
Expand Down
Loading

0 comments on commit 0ba25ec

Please sign in to comment.