RECETOX · hechth · Aug 23, 2022 · Aug 22, 2022 · Aug 22, 2022 · Aug 22, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64)[#102](https://github.com/RECETOX/recetox-aplcms/pull/102)
 - refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
 - refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
+- refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137)
 ### Removed
 
 ## [0.9.4] - 2022-05-10

diff --git a/NAMESPACE b/NAMESPACE
@@ -70,6 +70,7 @@ export(rev_cum_sum)
 export(rm.ridge)
 export(semi.sup)
 export(sort_samples_by_acquisition_number)
+export(span)
 export(two.step.hybrid)
 export(unsupervised)
 export(validate_inputs)

diff --git a/R/adaptive.bin.R b/R/adaptive.bin.R
@@ -62,9 +62,9 @@ increment_counter <- function(pointers, that.n){
 #' \itemize{
 #'   \item height.rec - The records of the height of each EIC.
 #'   \item masses - The vector of m/z values after binning.
-#'   \item labels - The vector of retention time after binning.
+#'   \item rt - The vector of retention time after binning.
 #'   \item intensi - The vector of intensity values after binning.
-#'   \item grps - The EIC labels, i.e. which EIC each observed data point belongs to.
+#'   \item grps - The EIC rt, i.e. which EIC each observed data point belongs to.
 #'   \item times - All the unique retention time values, ordered.
 #'   \item tol - The m/z tolerance level.
 #'   \item min.count.run - The minimum number of elution time points for a series of signals grouped by m/z to be considered a peak.
@@ -73,16 +73,16 @@ increment_counter <- function(pointers, that.n){
 #' @examples
 #' adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
 adaptive.bin <- function(features,
-                         min.run,
-                         min.pres,
-                         tol,
-                         baseline.correct,
-                         weighted = FALSE) {
+                         min_elution_length,
+                         min_presence,
+                         mz_tol,
+                         baseline_correct,
+                         intensity_weighted = FALSE) {
   # order inputs after mz values
   features <- features |> dplyr::arrange_at("mz")
 
 
-  cat(c("m/z tolerance is: ", tol, "\n"))
+  cat(c("m/z tolerance is: ", mz_tol, "\n"))
 
   times <- sort(unique(features$rt))
 
@@ -91,7 +91,7 @@ adaptive.bin <- function(features,
   time_range <- (max_time - min_time)
 
   # calculate function parameters
-  min.count.run <- min.run * length(times) / time_range
+  min.count.run <- min_elution_length * length(times) / time_range
   aver.time.range <- (time_range) / length(times)
 
   # init data
@@ -101,7 +101,7 @@ adaptive.bin <- function(features,
   # init counters
   pointers <- list(curr.label = 1, prof.pointer = 1, height.pointer = 1)
 
-  breaks <- compute_breaks(tol, features$mz, features$intensities, weighted)
+  breaks <- compute_breaks(mz_tol, features$mz, features$intensities, intensity_weighted)
 
   for (i in 1:(length(breaks) - 1))
   {
@@ -110,12 +110,12 @@ adaptive.bin <- function(features,
     start <- breaks[i] + 1
     end <- breaks[i + 1]
 
-    this_table <- data.frame(labels = features$rt[start:end], mz = features$mz[start:end], intensities = features$intensities[start:end])
+    this_table <- dplyr::slice(features, (start:end))
 
-    if (length(unique(this_table$labels)) >= min.count.run * min.pres) {
-      # reorder in order of labels (scan number)
-      this_table <- this_table |> dplyr::arrange_at("labels")
-      mass.den <- compute_densities(this_table$mz, tol, weighted, this_table$intensities, median)
+    if (length(unique(this_table$rt)) >= min.count.run * min_presence) {
+      # reorder in order of rt (scan number)
+      this_table <- this_table |> dplyr::arrange_at("rt")
+      mass.den <- compute_densities(this_table$mz, mz_tol, intensity_weighted, this_table$intensities, median)
 
       mass.den$y[mass.den$y < min(this_table$intensities) / 10] <- 0
       mass.turns <- find.turn.point(mass.den$y)
@@ -137,17 +137,17 @@ adaptive.bin <- function(features,
 
         if (nrow(that) > 0) {
           that <- combine.seq.3(that) |> dplyr::arrange_at("mz")
-          that.range <- diff(range(that$labels))
+          that.range <- span(that$rt)
 
-          if (that.range > 0.5 * time_range & length(that$labels) > that.range * min.pres & length(that$labels) / (that.range / aver.time.range) > min.pres) {
-            that$intensities <- rm.ridge(that$labels, that$intensities, bw = max(10 * min.run, that.range / 2))
+          if (that.range > 0.5 * time_range & length(that$rt) > that.range * min_presence & length(that$rt) / (that.range / aver.time.range) > min_presence) {
+            that$intensities <- rm.ridge(that$rt, that$intensities, bw = max(10 *min_elution_length, that.range / 2))
 
             that <- that |> dplyr::filter(intensities != 0)
           }
 
           that.n <- length(that$mz)
 
-          newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that$mz, that$labels, that$intensities, rep(pointers$curr.label, that.n))
+          newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that$mz, that$rt, that$intensities, rep(pointers$curr.label, that.n))
           height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that$intensities))
 
           # increment counters
@@ -156,12 +156,12 @@ adaptive.bin <- function(features,
       }
     } else {
       if (runif(1) < 0.05) {
-        this_table <- this_table |> dplyr::arrange_at("labels")
+        this_table <- this_table |> dplyr::arrange_at("rt")
 
         that.merged <- combine.seq.3(this_table)
         that.n <- nrow(that.merged)
 
-        newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that.merged$mz, that.merged$labels, that.merged$intensities, rep(pointers$curr.label, that.n))
+        newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that.merged$mz, that.merged$rt, that.merged$intensities, rep(pointers$curr.label, that.n))
         height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that.merged$intensities))
 
         # increment counters
@@ -175,14 +175,11 @@ adaptive.bin <- function(features,
 
   newprof <- newprof[order(newprof[, 1], newprof[, 2]), ]
 
+  newprof_tibble <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensities = newprof[, 3], grps = newprof[, 4])
+
   raw.prof <- new("list")
   raw.prof$height.rec <- height.rec
-  raw.prof$masses <- newprof[, 1]
-  raw.prof$labels <- newprof[, 2]
-  raw.prof$intensi <- newprof[, 3]
-  raw.prof$grps <- newprof[, 4]
-  raw.prof$times <- times
-  raw.prof$tol <- tol
+  raw.prof$features <- newprof_tibble
   raw.prof$min.count.run <- min.count.run
 
   return(raw.prof)

diff --git a/R/combine.seq.3.R b/R/combine.seq.3.R
@@ -6,18 +6,18 @@
 #' @return returns
 #' \itemize{
 #'   \item masses - m/z ratio
-#'   \item labels - retention time
+#'   \item rt - retention time
 #'   \item intensi - signal strength
 #' }
 #' @export
 #' @examples
 #' combine.seq.3(table)
 combine.seq.3 <- function(features) {
     l <- nrow(features)
-    breaks <- compute_breaks_3(features$labels)
+    breaks <- compute_breaks_3(features$rt)
     new_table <- tibble::tibble(
         mz = rep(0, length(breaks) - 1),
-        labels = unique(features$labels),
+        rt = unique(features$rt),
         intensities = rep(0, length(breaks) - 1)
     )
 

diff --git a/R/extract_features.R b/R/extract_features.R
@@ -39,8 +39,8 @@ NULL
 extract_features <- function(
   cluster,
   filenames,
-  min_pres,
-  min_run,
+  min_presence,
+  min_elution_length,
   mz_tol,
   baseline_correct,
   baseline_correct_noise_percentile,
@@ -92,20 +92,21 @@ extract_features <- function(
     'compute_start_bound',
     'compute_end_bound',
     'compute_bounds',
-    'compute_scale'
+    'compute_scale',
+    'span'
   ))
   snow::clusterEvalQ(cluster, library("dplyr"))
 
 
   snow::parLapply(cluster, filenames, function(filename) {
     profile <- proc.cdf(
       filename = filename,
-      min.pres = min_pres,
-      min.run = min_run,
-      tol = mz_tol,
-      baseline.correct = baseline_correct,
-      baseline.correct.noise.percentile = baseline_correct_noise_percentile,
-      intensity.weighted = intensity_weighted,
+      min_presence = min_presence,
+      min_elution_length = min_elution_length,
+      mz_tol = mz_tol,
+      baseline_correct = baseline_correct,
+      baseline_correct_noise_percentile = baseline_correct_noise_percentile,
+      intensity_weighted = intensity_weighted,
       do.plot = FALSE,
       cache = FALSE
     )

diff --git a/R/hybrid.R b/R/hybrid.R
@@ -239,8 +239,8 @@ hybrid <- function(
   extracted <- extract_features(
     cluster = cluster,
     filenames = filenames,
-    min_pres = min_pres,
-    min_run = min_run,
+    min_presence = min_pres,
+    min_elution_length = min_run,
     mz_tol = mz_tol,
     baseline_correct = baseline_correct,
     baseline_correct_noise_percentile = baseline_correct_noise_percentile,

diff --git a/R/load.lcms.R b/R/load.lcms.R
@@ -58,7 +58,6 @@ load.lcms <- function(filename) {
     labels <- c(labels, this_labels)
   }
 
-  times <- b[!is.na(b)]
   mzR::close(mz_conn)
 
   features <- tibble::tibble(mz = masses, rt = labels, intensities = intensi)

diff --git a/R/proc.cdf.R b/R/proc.cdf.R
@@ -14,18 +14,25 @@ load_file <- function(filename) {
 #' @export
 load_data <- function(filename,
                       cache,
-                      min.run,
-                      min.pres,
-                      tol,
-                      baseline.correct,
-                      intensity.weighted) {
-  rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min.run, "_", min.pres, "_", tol, ".rawprof", sep = "")
+                      min_elution_length,
+                      min_presence,
+                      mz_tol,
+                      baseline_correct,
+                      intensity_weighted) {
+  rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min_elution_length, "_", min_presence, "_", mz_tol, ".rawprof", sep = "")
 
   if (cache && file.exists(rawprof_filename)) {
     load(rawprof_filename)
   } else {
     raw.data <- load_file(filename)
-    raw.prof <- adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
+    raw.prof <- adaptive.bin(
+      raw.data,
+      min_elution_length = min_elution_length,
+      min_presence = min_presence,
+      mz_tol = mz_tol,
+      baseline_correct = baseline_correct,
+      intensity_weighted = intensity_weighted
+    )
   }
 
   if (cache && !file.exists(rawprof_filename)) {
@@ -40,11 +47,11 @@ load_data <- function(filename,
 #' This function applies the run filter to remove noise. Data points are grouped into EICs in this step.
 #' 
 #' @param filename The CDF file name. If the file is not in the working directory, the path needs to be given.
-#' @param min.pres Run filter parameter. The minimum proportion of presence in the time period for a series of 
+#' @param min_presence Run filter parameter. The minimum proportion of presence in the time period for a series of 
 #'  signals grouped by m/z to be considered a peak.
-#' @param min.run Run filter parameter. The minimum length of elution time for a series of signals grouped by 
+#' @param min_elution_length Run filter parameter. The minimum length of elution time for a series of signals grouped by 
 #'  m/z to be considered a peak.
-#' @param tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of 
+#' @param mz_tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of 
 #'  the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is 
 #'  the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended.
 #' @param baseline.correct After grouping the observations, the highest intensity in each group is found. If 
@@ -60,32 +67,56 @@ load_data <- function(filename,
 #' @examples
 #' proc.cdf(input_path, min_pres, min_run, tol, intensity.weighted = intensity_weighted)
 proc.cdf <- function(filename,
-                     min.pres = 0.5,
-                     min.run = 12,
-                     tol = 1e-05,
-                     baseline.correct = 0.0,
-                     baseline.correct.noise.percentile = 0.05,
+                     min_presence = 0.5,
+                     min_elution_length = 12,
+                     mz_tol = 1e-05,
+                     baseline_correct = 0.0,
+                     baseline_correct_noise_percentile = 0.05,
                      do.plot = FALSE,
-                     intensity.weighted = FALSE,
+                     intensity_weighted = FALSE,
                      cache = FALSE) {
-  raw.prof <- load_data(filename, cache, min.run, min.pres, tol, baseline.correct, intensity.weighted)
+  raw.prof <- load_data(
+    filename,
+    cache,
+    min_elution_length,
+    min_presence,
+    mz_tol,
+    baseline_correct,
+    intensity_weighted
+  )
 
-  newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps)
-  run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres & raw.prof$height.rec[, 3] > baseline.correct), 1]
+  newprof <- cbind(
+    raw.prof$features$mz,
+    raw.prof$features$rt,
+    raw.prof$features$intensities,
+    raw.prof$features$grps
+  )
+  run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_presence & raw.prof$height.rec[, 3] > baseline_correct), 1]
 
   newprof <- newprof[newprof[, 4] %in% run.sel, ]
-  new.prof <- cont.index(newprof, min.pres = min.pres, min.run = min.run)
+  new.prof <- cont.index(
+    newprof,
+    min.pres = min_presence,
+    min.run = min_elution_length
+  )
 
   if (do.plot) {
     plot_raw_profile_histogram(
       raw.prof,
-      min.pres,
-      baseline.correct,
-      baseline.correct.noise.percentile,
-      tol,
+      min_presence,
+      baseline_correct,
+      baseline_correct_noise_percentile,
+      mz_tol,
       new.prof
     )
   }
 
-  return(new.prof$new.rec)
+  new_rec_tibble <- tibble::tibble(
+    mz = new.prof$new.rec[, 1],
+    rt = new.prof$new.rec[, 2],
+    intensity = new.prof$new.rec[, 3],
+    group_number = new.prof$new.rec[, 4]
+  )
+
+  return(new_rec_tibble)
 }