Merge pull request #196 from RECETOX/small_refactorings
Small refactorings on combine.seq.3 and general adaptive.bin
hechth authored Apr 19, 2023
2 parents 463e1cd + a555966 commit 387c165
Showing 13 changed files with 138 additions and 143 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [dev] - unreleased
 ### Added
 ### Changed
+- refactored adaptive.bin and combine.seq.3 [#196](https://github.com/RECETOX/recetox-aplcms/pull/196)
 - refactored find.match [#193](https://github.com/RECETOX/recetox-aplcms/pull/193)
 - Simplified Evaluation Conditions For Test Cases [#197](https://github.com/RECETOX/recetox-aplcms/pull/197)
 - Renamed `proc.cdf` to `remove_noise` [#190](https://github.com/RECETOX/recetox-aplcms/pull/190)
2 changes: 1 addition & 1 deletion NAMESPACE
@@ -4,11 +4,11 @@ S3method(solve,a)
 S3method(solve,sigma)
 export(adaptive.bin)
 export(adjust.time)
+export(aggregate_by_rt)
 export(bigauss.esti)
 export(bigauss.esti.EM)
 export(bigauss.mix)
 export(comb)
-export(combine.seq.3)
 export(compute_boundaries)
 export(compute_bounds)
 export(compute_breaks)
43 changes: 22 additions & 21 deletions R/adaptive.bin.R
@@ -78,15 +78,15 @@ adaptive.bin <- function(features,

   cat(c("m/z tolerance is: ", mz_tol, "\n"))

-  times <- sort(unique(features$rt))
+  scan_times <- sort(unique(features$rt))

-  min_time <- min(times)
-  max_time <- max(times)
-  time_range <- (max_time - min_time)
+  num_scans <- length(scan_times)
+  time_range <- span(scan_times)

   # calculate function parameters
-  min.count.run <- min_run * length(times) / time_range
-  aver.time.range <- (time_range) / length(times)
+  scans_per_second <- num_scans / time_range
+  min_scans <- min_run * scans_per_second
+  aver_cycle_time <- (time_range) / num_scans

   # init data
   newprof <- matrix(0, nrow = length(features$mz), ncol = 4)
@@ -106,7 +106,7 @@

     this_table <- dplyr::slice(features, (start:end))

-    if (length(unique(this_table$rt)) >= min.count.run * min_pres) {
+    if (length(unique(this_table$rt)) >= min_scans * min_pres) {
       # reorder in order of rt (scan number)
       this_table <- this_table |> dplyr::arrange_at("rt")
       mass.den <- compute_densities(this_table$mz, mz_tol, intensity_weighted, this_table$intensities, median)
@@ -127,39 +127,40 @@
         }

         # get rows which fulfill condition
-        that <- this_table |> dplyr::filter(mz > boundaries$lower & mz <= boundaries$upper)
+        that <- this_table |> dplyr::filter(dplyr::between(mz, boundaries$lower, boundaries$upper))

         if (nrow(that) > 0) {
-          that <- combine.seq.3(that) |> dplyr::arrange_at("mz")
+          that <- aggregate_by_rt(that) |> dplyr::arrange_at("mz")

           that.range <- span(that$rt)

-          if (that.range > 0.5 * time_range & length(that$rt) > that.range * min_pres & length(that$rt) / (that.range / aver.time.range) > min_pres) {
+          if (that.range > 0.5 * time_range & length(that$rt) > that.range * min_pres & length(that$rt) / (that.range / aver_cycle_time) > min_pres) {
             that$intensities <- rm.ridge(that$rt, that$intensities, bw = max(10 * min_run, that.range / 2))

             that <- that |> dplyr::filter(intensities != 0)
           }

-          that.n <- length(that$mz)
+          num_pts_in_group <- length(that$mz)

-          newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that$mz, that$rt, that$intensities, rep(pointers$curr.label, that.n))
-          height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that$intensities))
+          newprof[pointers$prof.pointer:(pointers$prof.pointer + num_pts_in_group - 1), ] <- cbind(that$mz, that$rt, that$intensities, rep(pointers$curr.label, num_pts_in_group))
+          height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, num_pts_in_group, max(that$intensities))

           # increment counters
-          pointers <- increment_counter(pointers, that.n)
+          pointers <- increment_counter(pointers, num_pts_in_group)
         }
       }
-    } else {
+    } else { # not enough points in profile
       if (runif(1) < 0.05) {
         this_table <- this_table |> dplyr::arrange_at("rt")

-        that.merged <- combine.seq.3(this_table)
-        that.n <- nrow(that.merged)
+        that.merged <- aggregate_by_rt(this_table)
+        num_pts_in_group <- nrow(that.merged)

-        newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that.merged$mz, that.merged$rt, that.merged$intensities, rep(pointers$curr.label, that.n))
-        height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that.merged$intensities))
+        newprof[pointers$prof.pointer:(pointers$prof.pointer + num_pts_in_group - 1), ] <- cbind(that.merged$mz, that.merged$rt, that.merged$intensities, rep(pointers$curr.label, num_pts_in_group))
+        height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, num_pts_in_group, max(that.merged$intensities))

         # increment counters
-        pointers <- increment_counter(pointers, that.n)
+        pointers <- increment_counter(pointers, num_pts_in_group)
       }
     }
   }
@@ -174,7 +175,7 @@
   raw.prof <- new("list")
   raw.prof$height.rec <- height.rec
   raw.prof$features <- newprof_tibble
-  raw.prof$min.count.run <- min.count.run
+  raw.prof$min.count.run <- min_scans

   return(raw.prof)
 }
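
The renaming above preserves the arithmetic: the old `min.count.run` and the new `min_scans` compute the same threshold, only factored through `scans_per_second`. A minimal sketch with invented scan times, assuming `span(x)` returns `max(x) - min(x)`:

```r
# Invented data: 10 scans spread over 9 seconds.
scan_times <- seq(0, 9)
min_run <- 2

num_scans <- length(scan_times)
time_range <- max(scan_times) - min(scan_times)

# old formulation
min.count.run <- min_run * num_scans / time_range

# new formulation: same value, expressed via scans per second
scans_per_second <- num_scans / time_range
min_scans <- min_run * scans_per_second

stopifnot(all.equal(min.count.run, min_scans))
```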
17 changes: 17 additions & 0 deletions R/aggregate_by_rt.R
@@ -0,0 +1,17 @@
+#' Aggregate the intensities and select median mz for features with identical rt.
+#'
+#' @description This function computes the median mz and the sum of intensities over features with the same rt.
+#' @param features dataframe of retention time, m/z ratio, signal strength.
+#' @return returns a tibble with the following columns
+#' \itemize{
+#' \item mz - m/z ratio
+#' \item rt - retention time
+#' \item intensities - signal strength
+#' }
+#' @export
+aggregate_by_rt <- function(features) {
+  features |>
+    dplyr::group_by(rt) |>
+    dplyr::summarize(mass = median(mz[which.max(intensities)]), area = sum(intensities)) |>
+    dplyr::rename(mz = mass, intensities = area)
+}
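
A usage sketch for the new helper, with invented data: the two points sharing rt = 1.0 collapse into a single row whose intensity is the group sum and whose mz is the one at the intensity maximum (the median of a single value):

```r
library(tibble)

features <- tibble(
  mz          = c(100.001, 100.003, 100.002),
  rt          = c(1.0, 1.0, 2.0),
  intensities = c(10, 30, 20)
)

aggregate_by_rt(features)
#> expected: rt = 1.0 -> mz = 100.003, intensities = 40; rt = 2.0 unchanged
```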
34 changes: 0 additions & 34 deletions R/combine.seq.3.R

This file was deleted.

27 changes: 26 additions & 1 deletion R/compute_clusters.R
@@ -103,4 +103,29 @@ compute_clusters <- function(feature_tables,
   feature_tables <- sort_data(sample_names, feature_tables)

   return(list(feature_tables = feature_tables, rt_tol_relative = rt_tol_relative, mz_tol_relative = mz_tol_relative))
-}
+}
+
+
+
+# compute_clusters_v2 <- function(feature_tables, mz_tol_ppm, rt_tol) {
+
+#   mz_tol <- mz_tol_ppm * 1e-06
+
+#   match_fun <- list(
+#     ~ abs(.x - .y) < mz_tol * .x,
+#     ~ abs(.x - .y) < rt_tol
+#   )
+
+#   join_and_update <- function(df_a, df_b) {
+#     merged <- fuzzyjoin::fuzzy_full_join(df_a, df_b, by=c("mz", "rt"), match_fun = match_fun) |>
+#       dplyr::rename(mz = mz.x, rt = rt.x)
+#   }
+
+#   aligned <- purrr::reduce(feature_tables, join_and_update)
+# }
+
+# multi_match_fun <- function(x, y) {
+#   return(abs(x[,1] - y[,1]) < 10e-05 & abs(x[,2] - y[,2]) < 2)
+# }
+
+# f4 <- fuzzyjoin::fuzzy_full_join(f1, f2, by=NULL, multi_by=c("mz", "rt"), match_fun = NULL, multi_match_fun = multi_match_fun)
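
The multi-column variant sketched in the last commented line can be run standalone; a minimal example with invented feature tables `f1` and `f2`, using the same tolerances as the comment:

```r
library(fuzzyjoin)

# Invented feature tables: the first rows should pair up.
f1 <- data.frame(mz = c(100.00000, 200.00000), rt = c(10, 20))
f2 <- data.frame(mz = c(100.00005, 300.00000), rt = c(11, 30))

# rows match when both the mz and rt differences are within tolerance
multi_match_fun <- function(x, y) {
  abs(x[, 1] - y[, 1]) < 10e-05 & abs(x[, 2] - y[, 2]) < 2
}

fuzzy_full_join(f1, f2, multi_by = c("mz", "rt"), multi_match_fun = multi_match_fun)
#> unmatched rows from either table survive with NAs, as in a full join
```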
2 changes: 1 addition & 1 deletion R/recover.weaker.R
@@ -521,7 +521,7 @@ compute_rectangle <- function(data_table,

   # get values in RT region of interest?
   if (nrow(that) > recover_min_count) {
-    that.prof <- combine.seq.3(that)
+    that.prof <- aggregate_by_rt(that)
     that.mass <- sum(that.prof$mz * that.prof$intensities) / sum(that.prof$intensities)
     curr.rec <- c(that.mass, NA, NA)

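
The recovered mass on the line after the swap is an intensity-weighted mean over the aggregated profile; a worked example with invented numbers:

```r
# Two aggregated points; the heavier peak pulls the mean toward its mz.
that.prof <- data.frame(
  mz          = c(100.001, 100.003),
  intensities = c(10, 30)
)

that.mass <- sum(that.prof$mz * that.prof$intensities) / sum(that.prof$intensities)
that.mass
#> 100.0025
```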
17 changes: 10 additions & 7 deletions R/remove_noise.R
@@ -11,6 +11,7 @@ load_file <- function(filename) {
 }

 #' Load data either from cache or load raw file and detect peaks.
+#' @description THIS FUNCTION IS DEPRECATED!
 #' @export
 load_data <- function(filename,
                       cache,
@@ -70,13 +71,14 @@ remove_noise <- function(filename,
                          intensity_weighted,
                          do.plot,
                          cache) {
-  raw.prof <- load_data(
-    filename,
-    cache,
-    min_run,
-    min_pres,
-    mz_tol,
-    intensity_weighted
+  raw.data <- load_file(filename)
+
+  raw.prof <- adaptive.bin(
+    raw.data,
+    min_run = min_run,
+    min_pres = min_pres,
+    mz_tol = mz_tol,
+    intensity_weighted = intensity_weighted
   )

   newprof <- cbind(
@@ -85,6 +87,7 @@
     raw.prof$features$intensities,
     raw.prof$features$grps
   )
+
   run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_pres & raw.prof$height.rec[, 3] > baseline_correct), 1]

   newprof <- newprof[newprof[, 4] %in% run.sel, ]
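
The dense `run.sel` line keeps only groups with enough points and a peak above baseline. A small sketch of that selection with invented thresholds and `height.rec` rows (columns: group label, point count, max intensity):

```r
height.rec <- rbind(
  c(1, 50, 1e6),  # enough points, above baseline -> kept
  c(2,  3, 1e6),  # too few points                -> dropped
  c(3, 50, 10)    # peak below baseline           -> dropped
)
min.count.run <- 40
min_pres <- 0.5
baseline_correct <- 100

run.sel <- height.rec[which(height.rec[, 2] >= min.count.run * min_pres &
                              height.rec[, 3] > baseline_correct), 1]
run.sel
#> [1] 1
```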