stemangiola · Biomiha · Aug 23, 2023 · Aug 23, 2023 · Aug 23, 2023 · Aug 23, 2023
diff --git a/R/methods.R b/R/methods.R
@@ -41,23 +41,25 @@ setMethod("join_features", "SingleCellExperiment", function(.data,
     # CRAN Note
     .cell <- NULL
     .feature <- NULL
-
+    arg_list <- c(mget(ls(environment(), sorted=F)), match.call(expand.dots=F)$...)
+    all_assays <- get_all_assays(.data)$assay_id
+    if(is.null(arg_list$assays)) assays_from_join_call <- all_assays
     # Shape is long
     if (shape == "long") {
-
         # Suppress generic data frame creation message produced by left_join
         suppressMessages({
-            .data <-
-                .data %>%
-                    left_join(
-                        by=c_(.data)$name,
-                        get_abundance_sc_long(
-                            .data=.data,
-                            features=features,
-                            all=all,
-                            exclude_zeros=exclude_zeros)) %>%
-                    select(!!c_(.data)$symbol, .feature,
-                        contains(".abundance"), everything())
+          .data <-
+            .data %>%
+            left_join(
+              by=c_(.data)$name,
+              get_abundance_sc_long(
+                .data=.data,
+                features=features,
+                all=all,
+                exclude_zeros=exclude_zeros, 
+                ...)) %>%
+            select(!!c_(.data)$symbol, .feature,
+                   contains(".abundance"), everything())
         })
 
         # Provide data frame creation and abundance column message
@@ -79,14 +81,16 @@ setMethod("join_features", "SingleCellExperiment", function(.data,
         .data
 
     # Shape if wide
-    } else {
+    } else if (shape == "wide"){
+        if(is.null(arg_list$assays)) stop("Please provide assays")
         .data  %>%
             left_join(
                 by=c_(.data)$name,
                 get_abundance_sc_wide(
                     .data=.data,
                     features=features,
-                    all=all, ...))
+                    all=all, 
+                    ...))
     }
 })
 
@@ -138,86 +142,158 @@ tidy.SingleCellExperiment <- function(object) {
 #' @importFrom SummarizedExperiment assays assays<- assayNames
 #' @importFrom S4Vectors split
 #' @importFrom stringr str_remove
-#' @importFrom dplyr group_split
+#' @importFrom dplyr full_join
+#' @importFrom dplyr left_join
+#' @importFrom dplyr group_by
+#' @importFrom dplyr pick
+#' @importFrom dplyr group_rows
+#' @importFrom dplyr group_keys
+#' @importFrom dplyr bind_rows
+#' @importFrom dplyr pull
+#' @importFrom tidyr unite
+#' @importFrom tidyr separate
+#' @importFrom purrr reduce
+#' @importFrom purrr map
+#' @importFrom purrr set_names
+#' @importFrom purrr list_transpose
 #'
 #'
 #' @export
-setMethod("aggregate_cells", "SingleCellExperiment", function(.data,
-    .sample=NULL, slot="data", assays=NULL,
-    aggregation_function=Matrix::rowSums,
-    ...) {
-
-    # Fix NOTEs
-    feature <- NULL
-    .sample <- enquo(.sample)
-
-    # Subset only wanted assays
-    if (!is.null(assays)) {
-        assays(.data) <- assays(.data)[assays]
-    }
+setMethod("aggregate_cells", "SingleCellExperiment",  function(.data,
+                               .sample = NULL, assays = NULL,
+                               aggregation_function = Matrix::rowSums,
+                               ...) {
+  # Pseudobulk aggregation: cells are grouped by the `.sample` column(s), every
+  # selected assay is collapsed per group with `aggregation_function`, and the
+  # combined table is converted with tidybulk::as_SummarizedExperiment().
+  # Fix NOTEs
+  feature <- NULL
+  .sample <- enquo(.sample)
+
+  # `assays` is an ordinary formal, so read it directly; default to the last assay.
+  assays_to_use <- assays
+  if (is.null(assays_to_use)) assays_to_use <- tail(names(assays(.data)), n = 1)
+
+  # Group the tibble view of `.data` by `.sample`; the per-group row indices are
+  sample_groups <- group_by(as_tibble(.data), pick({{ .sample }}))
+  sample_group_idx <- group_rows(sample_groups)
 
+  # used below as column indices into `.data`. Keys: one row per group.
+  sample_group_keys <- group_keys(sample_groups)
 
-    grouping_factor =
-      .data |>
-      colData() |>
-      as_tibble() |>
-      select(!!.sample) |>
-      suppressMessages() |>
-      unite("my_id_to_split_by___", !!.sample, sep = "___") |>
-      pull(my_id_to_split_by___) |>
-      as.factor()
-
-    list_count_cells = table(grouping_factor) |> as.list()
-
-    # New method
-    list_assays =
-      .data |>
-      assays() |>
-      as.list() |>
-      map(~ .x |> splitColData(grouping_factor)) |>
-      unlist(recursive=FALSE)
-
-    list_assays =
-      list_assays |>
-      map2(names(list_assays), ~ {
-        # Get counts
-        .x %>%
-          aggregation_function(na.rm=TRUE) %>%
-          enframe(
-            name =".feature",
-            value="x") %>% # sprintf("%s", .y)) %>%
-
-          # In case we don't have rownames
-          mutate(.feature=as.character(.feature))
-      }) |>
-      enframe(name = ".sample") |>
-
-      # Clean groups
-      mutate(assay_name = assayNames(!!.data) |> rep(each=length(levels(grouping_factor)))) |>
-      mutate(.sample = .sample |> str_remove(assay_name) |> str_remove("\\.")) |>
-      group_split(.sample) |>
-      map(~ .x |>  unnest(value) |> pivot_wider(names_from = assay_name, values_from = x) ) |>
-
-      # Add cell count
-      map2(
-        list_count_cells,
-        ~ .x |> mutate(.aggregated_cells = .y)
-      )
-
-
-    do.call(rbind, list_assays) |>
-
-        left_join(
-          .data |>
-            colData() |>
-            as_tibble() |>
-            subset(!!.sample) |>
-            unite("my_id_to_split_by___", !!.sample, remove=FALSE, sep = "___"),
-            by= join_by(".sample" == "my_id_to_split_by___")
-        ) |>
-
-        as_SummarizedExperiment(
-            .sample=.sample,
-            .transcript=.feature,
-            .abundance=!!as.symbol(names(.data@assays)))
+  .sample_names <- colnames(sample_group_keys)
+
+  grouping_factor_names <- sample_group_keys |>
+    unite(col = "grouping_factor", !!.sample, sep = "___") |>
+    pull(grouping_factor)
+  # One sub-object per group (cells subset by column), named by its "___"-joined key.
+  sce_split <- map(.x = seq_along(sample_group_idx), .f = \(.num) .data[, sample_group_idx[[.num]]]) |>
+    purrr::set_names(grouping_factor_names)
+  # NOTE(review): grouping_factor and list_count_cells are not referenced again in
+  # this method; the previous version attached the counts as .aggregated_cells.
+  grouping_factor <-
+    .data |>
+    colData() |>
+    as_tibble() |>
+    select(!!.sample) |>
+    suppressMessages() |>
+    unite("my_id_to_split_by___", !!.sample, sep = "___") |>
+    pull(my_id_to_split_by___) |>
+    as.factor()
+  list_count_cells <- table(grouping_factor) |>
+    enframe(name = "grouping_factor", value = ".aggregated_cells") |>
+    mutate(.aggregated_cells = as.integer(.aggregated_cells))
+  # Features of the requested assays, split by experiment with "Main" moved first.
+  feature_df <- get_all_features(.data)
+  selected_features <- feature_df[feature_df$assay_id %in% assays_to_use, ]
+  selected_experiments_list <- split(x = selected_features, f = as.character(selected_features$exp_id))
+  if ("Main" %in% names(selected_experiments_list)) selected_experiments_list <- selected_experiments_list[c("Main", setdiff(names(selected_experiments_list), "Main"))]
+  # Aggregates the selected assays of one experiment (the feature_df rows of one exp_id).
+  aggregate_assays_fun <- function(exp) {
+    selected_exp <- unique(exp$exp_id)
+    selected_assays <- exp |> distinct(assay_name, .keep_all = TRUE)
+    if (selected_exp == "Main") {
+      aggregate_sce_fun <- function(sce) {
+        aggregated_vals <- assays(sce)[selected_assays$assay_name] |>
+          as.list() |>
+          map(.f = \(.list) aggregation_function(.list))
+        # Yields one tibble per assay, left un-joined: list_transpose() below needs a list.
+        map(.x = seq_along(aggregated_vals), \(.num) enframe(x = aggregated_vals[[.num]], name = ".feature", value = selected_assays$assay_id[[.num]]))
+      }
+      aggregated_list <- lapply(sce_split, aggregate_sce_fun) |>
+        purrr::list_transpose() |>
+        map(.f = \(.list) .list |> bind_rows(.id = "grouping_factor"))
+      interim_res <- map(.x = seq_along(aggregated_list), .f = \(.num) aggregated_list[[.num]] |>
+            separate(col = grouping_factor, into = .sample_names, sep = "___")) |>
+        purrr::set_names(nm = selected_exp)
+      map(.x = seq_along(interim_res), .f = \(.num) interim_res[[.num]] |> mutate(assay_type = names(interim_res)[[.num]])) |>
+        purrr::reduce(full_join) |>
+        mutate(assay_type = ifelse(assay_type == "Main", yes = "RNA", no = assay_type)) |>
+        select(assay_type, everything())
+    } else {
+      aggregate_sce_fun <- function(sce) {
+        aggregated_vals <- assays(altExps(sce)[[selected_exp]])[selected_assays$assay_name] |>
+          as.list() |>
+          set_names(selected_assays$assay_id) |>
+          map(.f = \(.list) aggregation_function(.list))
+        # Yields one tibble per assay, left un-joined: list_transpose() below needs a list.
+        map(.x = seq_along(aggregated_vals), \(.num) enframe(x = aggregated_vals[[.num]], name = ".feature", value = selected_assays$assay_id[[.num]]))
+      }
+      aggregated_list <- lapply(sce_split, aggregate_sce_fun) |>
+        list_transpose() |>
+        map(.f = \(.list) .list |> bind_rows(.id = "grouping_factor"))
+      interim_res <- map(.x = seq_along(aggregated_list), .f = \(.num) aggregated_list[[.num]] |>
+                           separate(col = grouping_factor, into = .sample_names, sep = "___")) |>
+        purrr::set_names(nm = selected_exp)
+      map(.x = seq_along(interim_res), .f = \(.num) interim_res[[.num]] |>
+            mutate(assay_type = names(interim_res)[[.num]])) |>
+        purrr::reduce(full_join) |>
+        mutate(assay_type = ifelse(assay_type == "Main", yes = "RNA", no = assay_type)) |>
+        select(assay_type, everything())
+    }
+  }
+  se <- lapply(selected_experiments_list, aggregate_assays_fun) |>
+    purrr::reduce(full_join) |>
+    suppressMessages()
+  # Feature names shared between assay types are prefixed with "<assay_type>..".
+  if(se |>
+     distinct(assay_type, .feature) |>
+     pull(.feature) |>
+     duplicated() |>
+     any()) {
+    warning("tidySingleCellExperiment says: The selected assays have overlapping feature names. The feature names have been combined with the selected assay_type, to keep the rownames of the SingleCellExperiment unique. You can find the original feature names in the orig.feature.names column of the rowData slot of your object.")
+    orig_features <- se |>
+      distinct(assay_type, .feature)
+    dup_features <- orig_features |>
+      filter(duplicated(.feature)) |>
+      pull(.feature)
+    se <- se |>
+      mutate(.feature = case_when(.feature %in% dup_features ~ str_c(assay_type, .feature, sep = ".."), .default = .feature))
+  }
+  # Convert to a SummarizedExperiment; if assay_type landed in colData, move it to rowData.
+  se <- se |>
+    tidybulk::as_SummarizedExperiment(
+      .sample = .sample_names,
+      .transcript = .feature,
+      .abundance = setdiff(colnames(se), c("assay_type", .sample_names, ".feature")))
+  if ("assay_type" %in% colnames(colData(se))) {
+    rowData(se) <- rownames(se) |>
+      enframe(name = NULL, value = "rowname") |>
+      mutate(assay_type = unique(colData(se)$assay_type)) |>
+      tibble::column_to_rownames() |>
+      as.data.frame() |>
+      as(Class = "DataFrame")
+    colData(se)$assay_type <- NULL
+  }
+  if (any(grepl("\\.\\.", rownames(se)))) {
+    rowData(se) <- rowData(se) |>
+      as.data.frame() |>
+      rownames_to_column() |>
+      # Strip everything up to and including the ".." separator to recover the original name.
+      mutate(orig.feature.names = str_remove_all(string = rowname, pattern = ".+(?=\\.\\.)"),
+             orig.feature.names = str_remove_all(string = orig.feature.names, pattern = "^\\.\\.")) |>
+      column_to_rownames() |>
+      as(Class = "DataFrame")
+  }
+  return(se)
 })