[BIOMAGE-1016] Sample uuids fix (#72)

* Sample ids property sent as list instead of string
* Fix
* Draft fix for uuids
* Fix
* fix
* fix
* checks

Co-authored-by: Martin Fosco <[email protected]>
Co-authored-by: Oliver Gibson <[email protected]>
Co-authored-by: Martin Fosco <[email protected]>
hms-dbmi-cellenics · Jun 15, 2021 · 909c112 · 909c112
1 parent 4c2f338
commit 909c112
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 94 deletions.
diff --git a/pipeline-runner/src/data-ingest/0-download_gem2s.r b/pipeline-runner/src/data-ingest/0-download_gem2s.r
@@ -20,7 +20,7 @@ task <- function(input, pipeline_config) {
             message(gem_key)
             sample_name = sample_names[[match(sample,sample_uuids)]]
             #Preparing directories
-            local_dir <- file.path('/input',sample_name)
+            local_dir <- file.path('/input',sample)
             #unlink(local_dir, recursive = TRUE)
             dir.create('/input')
             dir.create(local_dir)
@@ -50,7 +50,7 @@ task <- function(input, pipeline_config) {
     #     Key = meta_key
     #)
     #writeBin(body, con = "/input/meta.json")
-    config <- list(name = input$experimentName, samples=input$sampleNames,
+    config <- list(name = input$experimentName, samples=input$sampleIds,
     organism = input$organism,
     input = list(type="10x")
     )

diff --git a/pipeline-runner/src/data-ingest/5_Upload-to-aws.r b/pipeline-runner/src/data-ingest/5_Upload-to-aws.r
@@ -4,87 +4,12 @@ color_pool <- RJSONIO::fromJSON("/src/data-ingest/color_pool.json")
 input_dir <- '/input'
 output_dir <- '/output'
 
-# creates the table information for samples
-create_samples_table <- function(config, experiment_id, project_id) {
-  # In samples_table we are going to add the core of the information
-  samples_table <- list()
-
-  # flag_filtered information
-  df_prefiltered <- read.csv(file.path(output_dir, 'df_flag_filtered.txt'),
-                             sep = '\t',
-                             row.names = 'samples')
-
-  samples <- row.names(df_prefiltered)
-  samples_table$ids <- lapply(samples, function(x) paste0("sample-", x, sep = ""))
-
-  # For the current datasets it could happen that they are not in the gz format, so we leave the alternative tsv format.
-  mime_options = c(
-    "tsv" = "application/tsv",
-    "gz" = "application/gzip",
-    "mtx" = "application/mtx"
-  )
-
-  for (sample in samples) {
-
-    prefiltered <- df_prefiltered[sample, 'flag_filtered'] == 'Filtered'
-
-    # Identify datetime
-    cdate <- mdate <- Sys.time()
-    fnames <- list()
-
-    # files that are not hidden
-    sample_files <- file.path(
-      sample,
-      list.files(file.path(input_dir, sample))
-    )
-
-    # Iterate over each file to create the slot
-    for (sample_file in sample_files) {
-
-      fext <- tail(strsplit(sample_file, '[.]')[[1]], 1)
-
-      fnames[[sample_file]] <- list(
-        objectKey = '',
-        name = sample_file,
-        size = file.info(file.path(input_dir, sample_file))$size,
-        mime = mime_options[[fext]],
-        success = TRUE,
-        error = FALSE
-      )
-    }
-
-    # Add the whole information to each sample
-    samples_table[[paste0("sample-", sample)]] <- list(
-      name = sample,
-      uuid = uuid::UUIDgenerate(),
-      species = config$organism,
-      type = config$input[['type']],
-      createdDate = strftime(cdate, usetz = TRUE),
-      lastModified = strftime(mdate, usetz = TRUE),
-      complete = TRUE,
-      error = FALSE,
-      fileNames = sample_files,
-      files = fnames,
-      preFiltered = prefiltered
-    )
-
-  }
-
-
-  return(list(
-    "experimentId" = experiment_id,
-    "samples" = samples_table,
-    "projectUuid" = project_id
-  ))
-}
-
-
-samples_sets <- function(){
+samples_sets <- function(config){
   sample_annotations <- read.csv(file.path(output_dir, "samples-cells.csv"),
                                  sep = "\t",
                                  col.names = c("Cells_ID","Value"),
                                  na.strings = "None")
-
+  
   cell_set <- list(key = "sample",
                    name = "Samples",
                    rootNode = TRUE,
@@ -95,15 +20,14 @@ samples_sets <- function(){
 
   for (sample in samples) {
     view <- sample_annotations[sample_annotations$Value == sample, "Cells_ID"]
-    child <- list(key = paste0("sample-", sample),
-                  name = sample,
+    child <- list(key = paste0(sample),
+                  name = config$sampleNames[[match(sample,config$sampleIds)]],
                   color = color_pool[1],
                   cellIds = view)
 
     color_pool <- color_pool[-1]
     cell_set$children[[length(cell_set$children)+1]] <- child
   }
-
   return(cell_set)
 }
 
@@ -151,8 +75,6 @@ task <- function(input, pipeline_config) {
 
   experiment_id <- input$experimentId
   project_id <- input$projectId
-  sample_names <- input$sampleNames
-  sample_uuids <- input$sampleUuids
 
   # save experiment_id for record-keeping
   writeLines(experiment_id, file.path(output_dir, "experiment_id.txt"))
@@ -172,9 +94,7 @@ task <- function(input, pipeline_config) {
     type = "cellSets"
   )
 
-  # TODO: maybe we don't need samples_data
-  samples_data <- create_samples_table(config, experiment_id, project_id)
-  samples_set <- samples_sets()
+  samples_set <- samples_sets(input)
 
   # Design cell_set meta_data for DynamoDB
   cell_sets <- list(scratchpad,samples_set)
@@ -220,13 +140,6 @@ task <- function(input, pipeline_config) {
                             item = experiment_data,
                             task_name = "uploadToAWS")
 
-  # samples data to dynamodb
-  send_dynamodb_item_to_api(pipeline_config,
-                            experiment_id = experiment_id,
-                            table = pipeline_config$samples_table,
-                            item = samples_data,
-                            task_name = "uploadToAWS")
-
   if (cluster_env == "production")
     print(sprintf("https://scp.biomage.net/experiments/%s/data-exploration", experiment_id))