
Returned tibble from adaptive.bin and proc.cdf #137

Merged 16 commits on Aug 23, 2022
33 changes: 15 additions & 18 deletions R/adaptive.bin.R
@@ -73,16 +73,16 @@ increment_counter <- function(pointers, that.n){
#' @examples
#' adaptive.bin(raw.data, min_elution_length = min_elution_length, min_presence = min_presence, mz_tol = mz_tol, baseline_correct = baseline_correct, intensity_weighted = intensity_weighted)
adaptive.bin <- function(features,
min.run,
min.pres,
tol,
baseline.correct,
weighted = FALSE) {
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted = FALSE) {
# order inputs after mz values
features <- features |> dplyr::arrange_at("mz")


cat(c("m/z tolerance is: ", tol, "\n"))
cat(c("m/z tolerance is: ", mz_tol, "\n"))

times <- sort(unique(features$rt))

@@ -91,7 +91,7 @@ adaptive.bin <- function(features,
time_range <- (max_time - min_time)

# calculate function parameters
min.count.run <- min.run * length(times) / time_range
min.count.run <- min_elution_length * length(times) / time_range
aver.time.range <- (time_range) / length(times)

# init data
@@ -101,7 +101,7 @@ adaptive.bin <- function(features,
# init counters
pointers <- list(curr.label = 1, prof.pointer = 1, height.pointer = 1)

breaks <- compute_breaks(tol, features$mz, features$intensities, weighted)
breaks <- compute_breaks(mz_tol, features$mz, features$intensities, intensity_weighted)

for (i in 1:(length(breaks) - 1))
{
@@ -112,10 +112,10 @@ adaptive.bin <- function(features,

this_table <- data.frame(labels = features$rt[start:end], mz = features$mz[start:end], intensities = features$intensities[start:end])

if (length(unique(this_table$labels)) >= min.count.run * min.pres) {
if (length(unique(this_table$labels)) >= min.count.run * min_presence) {
# reorder in order of labels (scan number)
this_table <- this_table |> dplyr::arrange_at("labels")
mass.den <- compute_densities(this_table$mz, tol, weighted, this_table$intensities, median)
mass.den <- compute_densities(this_table$mz, mz_tol, intensity_weighted, this_table$intensities, median)

mass.den$y[mass.den$y < min(this_table$intensities) / 10] <- 0
mass.turns <- find.turn.point(mass.den$y)
@@ -139,8 +139,8 @@ adaptive.bin <- function(features,
that <- combine.seq.3(that) |> dplyr::arrange_at("mz")
that.range <- diff(range(that$labels))

if (that.range > 0.5 * time_range & length(that$labels) > that.range * min.pres & length(that$labels) / (that.range / aver.time.range) > min.pres) {
that$intensities <- rm.ridge(that$labels, that$intensities, bw = max(10 * min.run, that.range / 2))
if (that.range > 0.5 * time_range & length(that$labels) > that.range * min_presence & length(that$labels) / (that.range / aver.time.range) > min_presence) {
that$intensities <- rm.ridge(that$labels, that$intensities, bw = max(10 * min_elution_length, that.range / 2))

that <- that |> dplyr::filter(intensities != 0)
}
@@ -175,14 +175,11 @@ adaptive.bin <- function(features,

newprof <- newprof[order(newprof[, 1], newprof[, 2]), ]

newprof_tibble <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensities = newprof[, 3], grps = newprof[, 4])

raw.prof <- new("list")
raw.prof$height.rec <- height.rec
raw.prof$masses <- newprof[, 1]
raw.prof$labels <- newprof[, 2]
raw.prof$intensi <- newprof[, 3]
raw.prof$grps <- newprof[, 4]
raw.prof$times <- times
raw.prof$tol <- tol
raw.prof$features <- newprof_tibble
raw.prof$min.count.run <- min.count.run

return(raw.prof)
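With this change adaptive.bin exposes the binned data points as a tibble under $features instead of the separate masses/labels/intensi/grps vectors. A minimal sketch of how the new return value might be consumed downstream (raw_data and the parameter values are placeholders, and the package is assumed to be loaded):

    raw_prof <- adaptive.bin(
      raw_data,                      # features table with mz, rt and intensities columns
      min_elution_length = 12,
      min_presence = 0.5,
      mz_tol = 1e-05,
      baseline_correct = 0.0,
      intensity_weighted = FALSE
    )

    # one row per retained data point, with its bin/group id
    raw_prof$features |>
      dplyr::group_by(grps) |>
      dplyr::summarise(n_points = dplyr::n(), max_intensity = max(intensities))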
81 changes: 56 additions & 25 deletions R/proc.cdf.R
@@ -14,18 +14,25 @@ load_file <- function(filename) {
#' @export
load_data <- function(filename,
cache,
min.run,
min.pres,
tol,
baseline.correct,
intensity.weighted) {
rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min.run, "_", min.pres, "_", tol, ".rawprof", sep = "")
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted) {
rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min_elution_length, "_", min_presence, "_", mz_tol, ".rawprof", sep = "")

if (cache && file.exists(rawprof_filename)) {
load(rawprof_filename)
} else {
raw.data <- load_file(filename)
raw.prof <- adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
raw.prof <- adaptive.bin(
raw.data,
min_elution_length = min_elution_length,
min_presence = min_presence,
mz_tol = mz_tol,
baseline_correct = baseline_correct,
intensity_weighted = intensity_weighted
)
}

if (cache && !file.exists(rawprof_filename)) {
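The cache file name is still derived from the lower-cased input name plus the three run-filter parameters, so only the argument names change, not the on-disk format. A worked example with a hypothetical input file:

    filename <- "sample1.mzML"
    rawprof_filename <- paste(
      strsplit(tolower(filename), "\\.")[[1]][1],
      "_", 12, "_", 0.5, "_", 1e-05, ".rawprof",
      sep = ""
    )
    rawprof_filename
    #> [1] "sample1_12_0.5_1e-05.rawprof"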
@@ -40,11 +47,11 @@ load_data <- function(filename,
#' This function applies the run filter to remove noise. Data points are grouped into EICs in this step.
#'
#' @param filename The CDF file name. If the file is not in the working directory, the path needs to be given.
#' @param min.pres Run filter parameter. The minimum proportion of presence in the time period for a series of
#' @param min_presence Run filter parameter. The minimum proportion of presence in the time period for a series of
#' signals grouped by m/z to be considered a peak.
#' @param min.run Run filter parameter. The minimum length of elution time for a series of signals grouped by
#' @param min_elution_length Run filter parameter. The minimum length of elution time for a series of signals grouped by
#' m/z to be considered a peak.
#' @param tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of
#' @param mz_tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of
#' the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is
#' the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended.
#' @param baseline.correct After grouping the observations, the highest intensity in each group is found. If
@@ -60,32 +67,56 @@ load_data <- function(filename,
#' @examples
#' proc.cdf(input_path, min_presence, min_elution_length, mz_tol, intensity_weighted = intensity_weighted)
proc.cdf <- function(filename,
min.pres = 0.5,
min.run = 12,
tol = 1e-05,
baseline.correct = 0.0,
baseline.correct.noise.percentile = 0.05,
min_presence = 0.5,
min_elution_length = 12,
mz_tol = 1e-05,
baseline_correct = 0.0,
baseline_correct_noise_percentile = 0.05,
do.plot = FALSE,
intensity.weighted = FALSE,
intensity_weighted = FALSE,
cache = FALSE) {
raw.prof <- load_data(filename, cache, min.run, min.pres, tol, baseline.correct, intensity.weighted)
raw.prof <- load_data(
filename,
cache,
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted
)

newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps)
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres & raw.prof$height.rec[, 3] > baseline.correct), 1]
newprof <- cbind(
raw.prof$features$mz,
raw.prof$features$rt,
raw.prof$features$intensities,
raw.prof$features$grps
)
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_presence & raw.prof$height.rec[, 3] > baseline_correct), 1]

newprof <- newprof[newprof[, 4] %in% run.sel, ]
new.prof <- cont.index(newprof, min.pres = min.pres, min.run = min.run)
new.prof <- cont.index(
newprof,
min.pres = min_presence,
min.run = min_elution_length
)

if (do.plot) {
plot_raw_profile_histogram(
raw.prof,
min.pres,
baseline.correct,
baseline.correct.noise.percentile,
tol,
min_presence,
baseline_correct,
baseline_correct_noise_percentile,
mz_tol,
new.prof
)
}

return(new.prof$new.rec)
new_rec_tibble <- tibble::tibble(
mz = new.prof$new.rec[, 1],
rt = new.prof$new.rec[, 2],
intensity = new.prof$new.rec[, 3],
group_number = new.prof$new.rec[, 4]
)

return(new_rec_tibble)
}
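For callers, the visible changes are the snake_case argument names and the tibble return value. A rough usage sketch (the input path and ppm accuracy are assumptions, not part of this PR):

    # mz_tol is a fraction of the m/z value: divide the instrument's ppm accuracy by 1e6
    mz_tol <- 10 / 1e6

    filtered <- proc.cdf(
      "path/to/sample.mzML",         # hypothetical input file
      min_presence = 0.5,
      min_elution_length = 12,
      mz_tol = mz_tol,
      intensity_weighted = FALSE,
      cache = FALSE
    )

    # tibble with columns mz, rt, intensity, group_number
    head(filtered)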
8 changes: 4 additions & 4 deletions tests/remote-files/filtered.txt
@@ -1,4 +1,4 @@
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/mbr_test0_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_06_shortened_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_07_shortened_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_08_shortened_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/mbr_test0.parquet
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_06_shortened.parquet
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_07_shortened.parquet
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_08_shortened.parquet
44 changes: 21 additions & 23 deletions tests/testthat/test-proc.cdf.R
@@ -6,58 +6,56 @@ patrick::with_parameters_test_that(
testdata <- file.path("..", "testdata")
input_path <- file.path(testdata, "input", filename)

actual <- proc.cdf(
sut <- proc.cdf(
input_path,
min.pres = min_pres,
min.run = min_run,
tol = mz_tol,
intensity.weighted = intensity_weighted,
min_presence = min_presence,
min_elution_length = min_elution_length,
mz_tol = mz_tol,
intensity_weighted = intensity_weighted,
cache = cache
)

expected_path <- file.path(testdata, "filtered", expected_filename)
expected <- readRDS(expected_path)
expected_path <- file.path(testdata, "filtered", paste0(.test_name, ".parquet"))

# exclude last column from comparison as there lies the stochastic nature
expect_equal(actual[, 1:3], expected[, 1:3])
expected <- arrow::read_parquet(expected_path) |> dplyr::select(-group_number)
actual <- sut |> dplyr::select(-group_number)

expect_equal(actual, expected)
},
patrick::cases(
mbr_test0 = list(
filename = c("mbr_test0.mzml"),
expected_filename = "mbr_test0_cdf.Rds",
mz_tol = 1e-05,
min_pres = 0.5,
min_run = 12,
min_presence = 0.5,
min_elution_length = 12,
intensity_weighted = FALSE,
cache = FALSE,
ci_skip = FALSE
),
RCX_06_shortened_v2 = list(
RCX_06_shortened = list(
filename = c("RCX_06_shortened.mzML"),
expected_filename = "RCX_06_shortened_cdf.Rds",
mz_tol = 1e-06,
min_pres = 0.7,
min_run = 4,
min_presence = 0.7,
min_elution_length = 4,
intensity_weighted = TRUE,
cache = FALSE,
ci_skip = FALSE
),
RCX_07_shortened_v2 = list(
RCX_07_shortened = list(
filename = c("RCX_07_shortened.mzML"),
expected_filename = "RCX_07_shortened_cdf.Rds",
mz_tol = 1e-06,
min_pres = 0.7,
min_run = 4,
min_presence = 0.7,
min_elution_length = 4,
intensity_weighted = TRUE,
cache = FALSE,
ci_skip = TRUE
),
RCX_08_shortened_v2 = list(
RCX_08_shortened = list(
filename = c("RCX_08_shortened.mzML"),
expected_filename = "RCX_08_shortened_cdf.Rds",
mz_tol = 1e-06,
min_pres = 0.7,
min_run = 4,
min_presence = 0.7,
min_elution_length = 4,
intensity_weighted = TRUE,
cache = FALSE,
ci_skip = TRUE
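Because the expected outputs are now parquet files rather than Rds, they can be regenerated directly from a proc.cdf run. A sketch of how that might look for the mbr_test0 case (not part of this PR, paths are placeholders):

    filtered <- proc.cdf(
      file.path("..", "testdata", "input", "mbr_test0.mzml"),
      min_presence = 0.5,
      min_elution_length = 12,
      mz_tol = 1e-05
    )
    arrow::write_parquet(
      filtered,
      file.path("..", "testdata", "filtered", "mbr_test0.parquet")
    )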
12 changes: 6 additions & 6 deletions tests/testthat/test-prof.to.features.R
@@ -3,7 +3,7 @@ patrick::with_parameters_test_that(
{
testdata <- file.path("..", "testdata")
input_path <- file.path(testdata, "filtered", filename)
extracted_features <- readRDS(input_path)
extracted_features <- arrow::read_parquet(input_path)

actual <- prof.to.features(
profile = extracted_features,
@@ -20,39 +20,39 @@ patrick::with_parameters_test_that(
},
patrick::cases(
mbr_test0 = list(
filename = c("mbr_test0_cdf.Rds"),
filename = c("mbr_test0.parquet"),
expected_filename = "mbr_test0_features.Rds",
sd_cut = c(0.1, 100),
sigma_ratio_lim = c(0.1, 10),
shape_model = "bi-Gaussian",
do.plot = FALSE
),
RCX_06_shortened_gaussian = list(
filename = c("RCX_06_shortened_cdf.Rds"),
filename = c("RCX_06_shortened.parquet"),
expected_filename = "RCX_06_shortened_gaussian_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),
shape_model = "Gaussian",
do.plot = FALSE
),
RCX_06_shortened_v2 = list(
filename = c("RCX_06_shortened_cdf.Rds"),
filename = c("RCX_06_shortened.parquet"),
expected_filename = "RCX_06_shortened_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),
shape_model = "bi-Gaussian",
do.plot = FALSE
),
RCX_07_shortened_v2 = list(
filename = c("RCX_07_shortened_cdf.Rds"),
filename = c("RCX_07_shortened.parquet"),
expected_filename = "RCX_07_shortened_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),
shape_model = "bi-Gaussian",
do.plot = FALSE
),
RCX_8_shortened_v2 = list(
filename = c("RCX_08_shortened_cdf.Rds"),
filename = c("RCX_08_shortened.parquet"),
expected_filename = "RCX_08_shortened_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),