diff --git a/CHANGELOG.md b/CHANGELOG.md index 450181a0..6fcd8bb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64)[#102](https://github.com/RECETOX/recetox-aplcms/pull/102) - refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) - refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91) +- refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137) ### Removed ## [0.9.4] - 2022-05-10 diff --git a/NAMESPACE b/NAMESPACE index 96b3b4dd..d01b6c1e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,6 +70,7 @@ export(rev_cum_sum) export(rm.ridge) export(semi.sup) export(sort_samples_by_acquisition_number) +export(span) export(two.step.hybrid) export(unsupervised) export(validate_inputs) diff --git a/R/adaptive.bin.R b/R/adaptive.bin.R index 3f9431e7..ec94dd32 100644 --- a/R/adaptive.bin.R +++ b/R/adaptive.bin.R @@ -62,9 +62,9 @@ increment_counter <- function(pointers, that.n){ #' \itemize{ #' \item height.rec - The records of the height of each EIC. #' \item masses - The vector of m/z values after binning. -#' \item labels - The vector of retention time after binning. +#' \item rt - The vector of retention time after binning. #' \item intensi - The vector of intensity values after binning. -#' \item grps - The EIC labels, i.e. which EIC each observed data point belongs to. +#' \item grps - The EIC group labels, i.e. which EIC each observed data point belongs to. #' \item times - All the unique retention time values, ordered. #' \item tol - The m/z tolerance level. #' \item min.count.run - The minimum number of elution time points for a series of signals grouped by m/z to be considered a peak.
@@ -73,16 +73,16 @@ increment_counter <- function(pointers, that.n){ #' @examples -#' adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted) +#' adaptive.bin(raw.data, min_elution_length = min_elution_length, min_presence = min_presence, mz_tol = mz_tol, baseline_correct = baseline_correct, intensity_weighted = intensity_weighted) adaptive.bin <- function(features, - min.run, - min.pres, - tol, - baseline.correct, - weighted = FALSE) { + min_elution_length, + min_presence, + mz_tol, + baseline_correct, + intensity_weighted = FALSE) { # order inputs after mz values features <- features |> dplyr::arrange_at("mz") - cat(c("m/z tolerance is: ", tol, "\n")) + cat(c("m/z tolerance is: ", mz_tol, "\n")) times <- sort(unique(features$rt)) @@ -91,7 +91,7 @@ adaptive.bin <- function(features, time_range <- (max_time - min_time) # calculate function parameters - min.count.run <- min.run * length(times) / time_range + min.count.run <- min_elution_length * length(times) / time_range aver.time.range <- (time_range) / length(times) # init data @@ -101,7 +101,7 @@ adaptive.bin <- function(features, # init counters pointers <- list(curr.label = 1, prof.pointer = 1, height.pointer = 1) - breaks <- compute_breaks(tol, features$mz, features$intensities, weighted) + breaks <- compute_breaks(mz_tol, features$mz, features$intensities, intensity_weighted) for (i in 1:(length(breaks) - 1)) { @@ -110,12 +110,12 @@ adaptive.bin <- function(features, start <- breaks[i] + 1 end <- breaks[i + 1] - this_table <- data.frame(labels = features$rt[start:end], mz = features$mz[start:end], intensities = features$intensities[start:end]) + this_table <- dplyr::slice(features, (start:end)) - if (length(unique(this_table$labels)) >= min.count.run * min.pres) { - # reorder in order of labels (scan number) - this_table <- this_table |> dplyr::arrange_at("labels") - mass.den <- compute_densities(this_table$mz, tol, weighted, this_table$intensities, median) + if (length(unique(this_table$rt)) >= min.count.run * min_presence) { + # reorder in order of rt (scan number) + this_table <- this_table |> dplyr::arrange_at("rt") + mass.den <- compute_densities(this_table$mz, mz_tol, intensity_weighted, this_table$intensities, median) mass.den$y[mass.den$y < min(this_table$intensities) / 10] <- 0 mass.turns <- find.turn.point(mass.den$y) @@ -137,17 +137,17 @@ adaptive.bin <- function(features, if (nrow(that) > 0) { that <- combine.seq.3(that) |> dplyr::arrange_at("mz") - that.range <- diff(range(that$labels)) + that.range <- span(that$rt) - if (that.range > 0.5 * time_range & length(that$labels) > that.range * min.pres & length(that$labels) / (that.range / aver.time.range) > min.pres) { - that$intensities <- rm.ridge(that$labels, that$intensities, bw = max(10 * min.run, that.range / 2)) + if (that.range > 0.5 * time_range & length(that$rt) > that.range * min_presence & length(that$rt) / (that.range / aver.time.range) > min_presence) { + that$intensities <- rm.ridge(that$rt, that$intensities, bw = max(10 * min_elution_length, that.range / 2)) that <- that |> dplyr::filter(intensities != 0) } that.n <- length(that$mz) - newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that$mz, that$labels, that$intensities, rep(pointers$curr.label, that.n)) + newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that$mz, that$rt, that$intensities, rep(pointers$curr.label, that.n)) height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that$intensities)) # increment counters @@ -156,12 +156,12 @@ adaptive.bin <- function(features, } } else { if (runif(1) < 0.05) { - this_table <- this_table |>
dplyr::arrange_at("labels") + this_table <- this_table |> dplyr::arrange_at("rt") that.merged <- combine.seq.3(this_table) that.n <- nrow(that.merged) - newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that.merged$mz, that.merged$labels, that.merged$intensities, rep(pointers$curr.label, that.n)) + newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that.merged$mz, that.merged$rt, that.merged$intensities, rep(pointers$curr.label, that.n)) height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that.merged$intensities)) # increment counters @@ -175,14 +175,11 @@ adaptive.bin <- function(features, newprof <- newprof[order(newprof[, 1], newprof[, 2]), ] + newprof_tibble <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensities = newprof[, 3], grps = newprof[, 4]) + raw.prof <- new("list") raw.prof$height.rec <- height.rec - raw.prof$masses <- newprof[, 1] - raw.prof$labels <- newprof[, 2] - raw.prof$intensi <- newprof[, 3] - raw.prof$grps <- newprof[, 4] - raw.prof$times <- times - raw.prof$tol <- tol + raw.prof$features <- newprof_tibble raw.prof$min.count.run <- min.count.run return(raw.prof) diff --git a/R/combine.seq.3.R b/R/combine.seq.3.R index 17b781ce..51588e2c 100644 --- a/R/combine.seq.3.R +++ b/R/combine.seq.3.R @@ -6,7 +6,7 @@ #' @return returns #' \itemize{ #' \item masses - m/z ratio -#' \item labels - retention time +#' \item rt - retention time #' \item intensi - signal strength #' } #' @export @@ -14,10 +14,10 @@ #' combine.seq.3(table) combine.seq.3 <- function(features) { l <- nrow(features) - breaks <- compute_breaks_3(features$labels) + breaks <- compute_breaks_3(features$rt) new_table <- tibble::tibble( mz = rep(0, length(breaks) - 1), - labels = unique(features$labels), + rt = unique(features$rt), intensities = rep(0, length(breaks) - 1) ) diff --git a/R/extract_features.R b/R/extract_features.R index 02f95763..261445fe 100644 --- a/R/extract_features.R +++ b/R/extract_features.R @@ -39,8 +39,8 @@ NULL extract_features <- function( cluster, filenames, - min_pres, - min_run, + min_presence, + min_elution_length, mz_tol, baseline_correct, baseline_correct_noise_percentile, @@ -92,7 +92,8 @@ extract_features <- function( 'compute_start_bound', 'compute_end_bound', 'compute_bounds', - 'compute_scale' + 'compute_scale', + 'span' )) snow::clusterEvalQ(cluster, library("dplyr")) @@ -100,12 +101,12 @@ extract_features <- function( snow::parLapply(cluster, filenames, function(filename) { profile <- proc.cdf( filename = filename, - min.pres = min_pres, - min.run = min_run, - tol = mz_tol, - baseline.correct = baseline_correct, - baseline.correct.noise.percentile = baseline_correct_noise_percentile, - intensity.weighted = intensity_weighted, + min_presence = min_presence, + min_elution_length = min_elution_length, + mz_tol = mz_tol, + baseline_correct = baseline_correct, + baseline_correct_noise_percentile = baseline_correct_noise_percentile, + intensity_weighted = intensity_weighted, do.plot = FALSE, cache = FALSE ) diff --git a/R/hybrid.R b/R/hybrid.R index 0b164863..b677f314 100644 --- a/R/hybrid.R +++ b/R/hybrid.R @@ -239,8 +239,8 @@ hybrid <- function( extracted <- extract_features( cluster = cluster, filenames = filenames, - min_pres = min_pres, - min_run = min_run, + min_presence = min_pres, + min_elution_length = min_run, mz_tol = mz_tol, baseline_correct = baseline_correct, baseline_correct_noise_percentile = baseline_correct_noise_percentile, diff --git a/R/load.lcms.R b/R/load.lcms.R 
index a3e18f7c..5b4ce4a2 100644 --- a/R/load.lcms.R +++ b/R/load.lcms.R @@ -58,7 +58,6 @@ load.lcms <- function(filename) { labels <- c(labels, this_labels) } - times <- b[!is.na(b)] mzR::close(mz_conn) features <- tibble::tibble(mz = masses, rt = labels, intensities = intensi) diff --git a/R/proc.cdf.R b/R/proc.cdf.R index a33a76d5..c210e020 100644 --- a/R/proc.cdf.R +++ b/R/proc.cdf.R @@ -14,18 +14,25 @@ load_file <- function(filename) { #' @export load_data <- function(filename, cache, - min.run, - min.pres, - tol, - baseline.correct, - intensity.weighted) { - rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min.run, "_", min.pres, "_", tol, ".rawprof", sep = "") + min_elution_length, + min_presence, + mz_tol, + baseline_correct, + intensity_weighted) { + rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min_elution_length, "_", min_presence, "_", mz_tol, ".rawprof", sep = "") if (cache && file.exists(rawprof_filename)) { load(rawprof_filename) } else { raw.data <- load_file(filename) - raw.prof <- adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted) + raw.prof <- adaptive.bin( + raw.data, + min_elution_length = min_elution_length, + min_presence = min_presence, + mz_tol = mz_tol, + baseline_correct = baseline_correct, + intensity_weighted = intensity_weighted + ) } if (cache && !file.exists(rawprof_filename)) { @@ -40,11 +47,11 @@ load_data <- function(filename, #' This function applies the run filter to remove noise. Data points are grouped into EICs in this step. #' #' @param filename The CDF file name. If the file is not in the working directory, the path needs to be given. -#' @param min.pres Run filter parameter. The minimum proportion of presence in the time period for a series of +#' @param min_presence Run filter parameter. The minimum proportion of presence in the time period for a series of #' signals grouped by m/z to be considered a peak. -#' @param min.run Run filter parameter. The minimum length of elution time for a series of signals grouped by +#' @param min_elution_length Run filter parameter. The minimum length of elution time for a series of signals grouped by #' m/z to be considered a peak. -#' @param tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of +#' @param mz_tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of #' the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is #' the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended. #' @param baseline.correct After grouping the observations, the highest intensity in each group is found. 
If @@ -60,32 +67,56 @@ load_data <- function(filename, #' @examples -#' proc.cdf(input_path, min_pres, min_run, tol, intensity.weighted = intensity_weighted) +#' proc.cdf(input_path, min_presence, min_elution_length, mz_tol, intensity_weighted = intensity_weighted) proc.cdf <- function(filename, - min.pres = 0.5, - min.run = 12, - tol = 1e-05, - baseline.correct = 0.0, - baseline.correct.noise.percentile = 0.05, + min_presence = 0.5, + min_elution_length = 12, + mz_tol = 1e-05, + baseline_correct = 0.0, + baseline_correct_noise_percentile = 0.05, do.plot = FALSE, - intensity.weighted = FALSE, + intensity_weighted = FALSE, cache = FALSE) { - raw.prof <- load_data(filename, cache, min.run, min.pres, tol, baseline.correct, intensity.weighted) + raw.prof <- load_data( + filename, + cache, + min_elution_length, + min_presence, + mz_tol, + baseline_correct, + intensity_weighted + ) - newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps) - run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres & raw.prof$height.rec[, 3] > baseline.correct), 1] + newprof <- cbind( + raw.prof$features$mz, + raw.prof$features$rt, + raw.prof$features$intensities, + raw.prof$features$grps + ) + run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_presence & raw.prof$height.rec[, 3] > baseline_correct), 1] newprof <- newprof[newprof[, 4] %in% run.sel, ] - new.prof <- cont.index(newprof, min.pres = min.pres, min.run = min.run) + new.prof <- cont.index( + newprof, + min.pres = min_presence, + min.run = min_elution_length + ) if (do.plot) { plot_raw_profile_histogram( raw.prof, - min.pres, - baseline.correct, - baseline.correct.noise.percentile, - tol, + min_presence, + baseline_correct, + baseline_correct_noise_percentile, + mz_tol, new.prof ) } - return(new.prof$new.rec) + new_rec_tibble <- tibble::tibble( + mz = new.prof$new.rec[, 1], + rt = new.prof$new.rec[, 2], + intensity = new.prof$new.rec[, 3], + group_number = new.prof$new.rec[, 4] + ) + + return(new_rec_tibble) } diff --git a/R/prof.to.features.R b/R/prof.to.features.R index 74939e1c..46663fed 100644 --- a/R/prof.to.features.R +++ b/R/prof.to.features.R @@ -596,7 +596,7 @@ bigauss.mix <- function(rt_profile, power = 1, do.plot = FALSE, sigma.ratio.lim normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, aver_diff) { x <- that.curve[, 1] y <- that.curve[, 2] - rt_int_list <- list(labels = x, intensities = y) + rt_int_list <- list(rt = x, intensities = y) if (length(pks) == 1) { mu_sc_std <- compute_mu_sc_std(rt_int_list, aver_diff) @@ -610,13 +610,14 @@ normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, aver_diff miu <- sigma <- sc <- pks w <- matrix(0, nrow = l, ncol = length(x)) + for (m in 1:l) { # this pattern occurs multiple times in other scripts this.low <- max(vlys[vlys <= pks[m]]) this.high <- min(vlys[vlys >= pks[m]]) - indices <- between(x, this.low, this.high) + indices <- dplyr::between(x, this.low, this.high) this.x <- x[indices] this.y <- y[indices] @@ -625,7 +626,7 @@ normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, aver_diff sigma[m] <- NaN sc[m] <- 1 } else { - rt_int_list_this <- list(labels = this.x, intensities = this.y) + rt_int_list_this <- list(rt = this.x, intensities = this.y) mu_sc_std <- compute_mu_sc_std(rt_int_list_this, aver_diff) miu[m] <- mu_sc_std$label sc[m] <- mu_sc_std$intensity @@ -678,7 +679,7 @@ normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, aver_diff for (m in 1:l) { this.y <- y * w[m, ] - rt_int_list_this <- list(labels = x,
intensities = this.y) + rt_int_list_this <- list(rt = x, intensities = this.y) mu_sc_std <- compute_mu_sc_std(rt_int_list_this, aver_diff) miu[m] <- mu_sc_std$label sc[m] <- mu_sc_std$intensity diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 68f134c4..be43a882 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -249,14 +249,14 @@ get_mzrange_bound_indices <- function(aligned_feature_mass, #' Get indices where rt in `features` is within `rt_tol` of `target_time`. #' @param target_time float Target retention time region. -#' @param features tibble Feature table including `labels` column. +#' @param features tibble Feature table including `rt` column. #' @param rt_tol float Retention time tolerance. #' @return vector Indices which are within `rt_tol` from `target_time` or #' 1 if `target_time` is NA. #' @export get_rt_region_indices <- function(target_time, features, rt_tol) { if (!is.null(target_time) && !is.na(target_time)) { - selection <- which(abs(features$labels - target_time) < rt_tol) + selection <- which(abs(features$rt - target_time) < rt_tol) } else { selection <- 1:nrow(features) } @@ -265,7 +265,7 @@ get_rt_region_indices <- function(target_time, features, rt_tol) { #' Get peaks and valleys of smoothed rt values in range. #' -#' @param features tibble Data table with `labels` and `intensities` columns. +#' @param features tibble Data table with `rt` and `intensities` columns. #' @param times vector Raw retention time data from raw data file. #' @param bw float Bandwidth to use for kernel smoothing. #' @return Returns a list object with the following objects in it: @@ -274,10 +274,10 @@ get_rt_region_indices <- function(target_time, features, rt_tol) { #' \item vlys - vector - The points in the data where the density is low #' (forming a valley in the function). get_features_in_rt_range <- function(features, times, bw) { - time_curve <- times[between(times, min(features$labels), max(features$labels))] + time_curve <- times[between(times, min(features$rt), max(features$rt))] this.curve <- cbind(time_curve, time_curve * 0) - this.curve[this.curve[, 1] %in% features$labels, 2] <- features$intensities + this.curve[this.curve[, 1] %in% features$rt, 2] <- features$intensities this.smooth <- ksmooth( this.curve[, 1], @@ -307,7 +307,7 @@ count_peaks <- function(roi, times) { #' Compute peaks and valleys which have at least `recover_min_count` peaks. #' -#' @param features tibble Features with `mz`, `labels` and `intensities`. +#' @param features tibble Features with `mz`, `rt` and `intensities`. #' @param times vector Retention time values from the raw data file. #' @param bandwidth float Bandwidth to use in smoothing. #' @param target_rt float Retention time at which to recover the intensity. @@ -327,7 +327,7 @@ compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_m pks <- roi$pks vlys <- roi$vlys - num_peaks <- count_peaks(roi, features$labels) + num_peaks <- count_peaks(roi, features$rt) if (!is.null (target_rt) && !is.na(target_rt)) { pks.d <- abs(pks - target_rt) # distance from the target peak location @@ -342,7 +342,7 @@ compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_m #' Compute interpolated retention time, its standard deviation, and intensity values,. #' -#' @param features tibble Features with `labels` and `intensities` columns. +#' @param features tibble Features with `rt` and `intensities` columns. #' @param aver_diff float Average retention time difference. 
#' @return Returns a list object with the following objects in it: #' \itemize{ @@ -351,7 +351,7 @@ compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_m #' \item sigma - float - Standard deviation of retention times #' @export compute_mu_sc_std <- function(features, aver_diff) { - x <- features$labels + x <- features$rt y <- features$intensities sum_y <- sum(y) @@ -375,7 +375,7 @@ compute_mu_sc_std <- function(features, aver_diff) { #' @param mz Mz value of the feature. #' @param peak Peak around which to detect the new feature. #' @param valleys Valley points to compute the boundary region. -#' @param features tibble Tibble with `labels` and `intensities` column. +#' @param features tibble Tibble with `rt` and `intensities` column. #' @param aver_diff float Average retention time difference. #' @param times vector Raw retention time values from raw data file. #' @param delta_rt vector Differences between consecutive retention time values (diff(times)). @@ -392,23 +392,23 @@ compute_curr_rec_with_enough_peaks <- function(mz, boundaries <- compute_boundaries(valleys, peak) subset <- features |> - dplyr::filter(between(labels, boundaries$lower, boundaries$upper)) + dplyr::filter(between(rt, boundaries$lower, boundaries$upper)) if (nrow(subset) == 1) { intensity <- subset$intensities * aver_diff - label <- subset$labels + label <- subset$rt } else if (nrow(subset) >= 10) { res <- compute_mu_sc_std(subset, aver_diff) intensity <- res$intensity label <- res$label } else { intensity <- interpol.area( - subset$labels, + subset$rt, subset$intensities, times, delta_rt ) - label <- median(subset$labels) + label <- median(subset$rt) } return(c(mz, label, intensity)) @@ -456,7 +456,7 @@ compute_peaks_and_valleys <- function(dens) { #' Compute rectangle around feature with `aligned_feature_mz` and `target_rt` for recovery. #' -#' @param data_table tibble Feature table with `mz`, `labels` and `intensities` column. +#' @param data_table tibble Feature table with `mz`, `rt` and `intensities` column. #' @param aligned_feature_mz float Mz value of feature in aligned feature table. #' @param breaks vector Integer boundaries of clusters in mz values. #' @param custom_mz_tol float Custom mz tolerance for the feature. @@ -471,7 +471,7 @@ compute_peaks_and_valleys <- function(dens) { #' @param bandwidth float Bandwidth to use in smoothing. #' @param min.bw float Minimum bandwidth to use. #' @param max.bw float Maximum bandwidth to use. -#' @return tibble Tibble with `mz`, `labels` and `intensities` columns. +#' @return tibble Tibble with `mz`, `rt` and `intensities` columns. 
compute_rectangle <- function(data_table, aligned_feature_mz, breaks, @@ -497,7 +497,7 @@ compute_rectangle <- function(data_table, features <- dplyr::slice( data_table, (breaks[bounds$start] + 1):breaks[bounds$end] - ) |> dplyr::arrange_at("labels") + ) |> dplyr::arrange_at("rt") mass.den <- compute_mass_density( features, @@ -510,7 +510,7 @@ compute_rectangle <- function(data_table, pks_in_tol <- abs(mass_range$pks - aligned_feature_mz) < custom_mz_tol / 1.5 mass_range$pks <- mass_range$pks[pks_in_tol] - this.rec <- tibble::tibble(mz = Inf, labels = Inf, intensities = Inf) + this.rec <- tibble::tibble(mz = Inf, rt = Inf, intensities = Inf) for (peak in mass_range$pks) { # get mass values of valleys the closest to the peak mass <- compute_boundaries(mass_range$vlys, peak) @@ -533,7 +533,7 @@ compute_rectangle <- function(data_table, if (length(thee.sel) > recover_min_count) { if (length(thee.sel) > 1) { curr.rec[3] <- interpol.area( - that.prof$labels[thee.sel], + that.prof$rt[thee.sel], that.prof$intensities[thee.sel], times, delta_rt @@ -541,25 +541,25 @@ compute_rectangle <- function(data_table, } else { curr.rec[3] <- that.prof$intensities[thee.sel] * aver_diff } - curr.rec[2] <- median(that.prof$labels[thee.sel]) + curr.rec[2] <- median(that.prof$rt[thee.sel]) this.rec <- tibble::add_row( this.rec, tibble::tibble_row( mz = curr.rec[1], - labels = curr.rec[2], + rt = curr.rec[2], intensities = curr.rec[3] ) ) } } else { - labels_intensities <- dplyr::select( + rt_intensities <- dplyr::select( that.prof, - c("labels", "intensities") - ) |> dplyr::arrange_at("labels") - bw <- min(max(bandwidth * (span(labels_intensities$labels)), min.bw), max.bw) + c("rt", "intensities") + ) |> dplyr::arrange_at("rt") + bw <- min(max(bandwidth * (span(rt_intensities$rt)), min.bw), max.bw) all <- compute_pks_vlys_rt( - labels_intensities, + rt_intensities, times, bw, target_rt, @@ -571,7 +571,7 @@ compute_rectangle <- function(data_table, that.mass, peak, all$vlys, - labels_intensities, + rt_intensities, aver_diff, times, delta_rt @@ -581,7 +581,7 @@ compute_rectangle <- function(data_table, this.rec, tibble::tibble_row( mz = curr.rec[1], - labels = curr.rec[2], + rt = curr.rec[2], intensities = curr.rec[3] ) ) @@ -594,7 +594,7 @@ compute_rectangle <- function(data_table, #' Refine the selection based on mz and rt differences. #' @param target_rt float Target retention time value. -#' @param rectangle tibble Features with columns `labels` and `mz`. +#' @param rectangle tibble Features with columns `rt` and `mz`. #' @param aligned_mz float Mz value in the aligned feature table of the #' feature to be recovered. #' @param rt_tol float Retention time tolerance. @@ -602,7 +602,7 @@ compute_rectangle <- function(data_table, #' @return int Index of value in rectable closest to `target_rt` and `aligned_mz`. 
refine_selection <- function(target_rt, rectangle, aligned_mz, rt_tol, mz_tol) { if (!is.na(target_rt)) { - rt_term <- (rectangle$labels - target_rt)^2 / rt_tol^2 + rt_term <- (rectangle$rt - target_rt)^2 / rt_tol^2 mz_term <- (rectangle$mz - aligned_mz)^2 / mz_tol^2 this.d <- rt_term + mz_term } else { @@ -669,8 +669,8 @@ recover.weaker <- function(filename, intensity.weighted = FALSE) { # load raw data - data_table <- load_file(filename) |> dplyr::rename(labels = rt) |> dplyr::arrange_at("mz") - times <- sort(unique(data_table$labels)) + data_table <- load_file(filename) |> dplyr::arrange_at("mz") + times <- sort(unique(data_table$rt)) # Initialize parameters with default values if (is.na(recover_mz_range)) recover_mz_range <- 1.5 * align.mz.tol @@ -763,10 +763,10 @@ recover.weaker <- function(filename, ) } - this.pos.diff <- which.min(abs(extracted_features$pos - this.rec$labels[this.sel])) + this.pos.diff <- which.min(abs(extracted_features$pos - this.rec$rt[this.sel])) extracted_features <- extracted_features |> tibble::add_row( mz = this.rec$mz[this.sel], - pos = this.rec$labels[this.sel], + pos = this.rec$rt[this.sel], area = this.rec$intensities[this.sel] ) @@ -774,13 +774,13 @@ recover.weaker <- function(filename, adjusted_features <- adjusted_features |> tibble::add_row( mz = this.rec$mz[this.sel], - rt = this.rec$labels[this.sel] + this.time.adjust, + rt = this.rec$rt[this.sel] + this.time.adjust, area = this.rec$intensities[this.sel], sample_id = grep(sample_name, colnames(aligned.ftrs)) - 4 # offset for other columns `mz`, `rt` etc ) sample_intensities[i] <- this.rec$intensities[this.sel] - sample_times[i] <- this.rec$labels[this.sel] + this.time.adjust + sample_times[i] <- this.rec$rt[this.sel] + this.time.adjust this.mz[i] <- this.rec$mz[this.sel] } } diff --git a/R/unsupervised.R b/R/unsupervised.R index b5f1810b..5a6cc75a 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -215,8 +215,8 @@ unsupervised <- function( extracted <- extract_features( cluster = cluster, filenames = filenames, - min_pres = min_pres, - min_run = min_run, + min_presence = min_pres, + min_elution_length = min_run, mz_tol = mz_tol, baseline_correct = baseline_correct, baseline_correct_noise_percentile = baseline_correct_noise_percentile, diff --git a/R/utils.R b/R/utils.R index 5384f0fe..83e1706e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -92,6 +92,7 @@ create_feature_sample_table <- function(features) { return(table) } +#' @export span <- function(x) { diff(range(x, na.rm = TRUE)) } diff --git a/tests/remote-files/filtered.txt b/tests/remote-files/filtered.txt index f748950d..7c2e6777 100644 --- a/tests/remote-files/filtered.txt +++ b/tests/remote-files/filtered.txt @@ -1,4 +1,4 @@ -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/mbr_test0_cdf.Rds -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_06_shortened_cdf.Rds -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_07_shortened_cdf.Rds -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_08_shortened_cdf.Rds \ No newline at end of file +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/mbr_test0.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_06_shortened.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_07_shortened.parquet 
+https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_08_shortened.parquet \ No newline at end of file diff --git a/tests/testthat/test-extract_features.R b/tests/testthat/test-extract_features.R index bc8a600c..ee6ed724 100644 --- a/tests/testthat/test-extract_features.R +++ b/tests/testthat/test-extract_features.R @@ -32,8 +32,8 @@ patrick::with_parameters_test_that( actual <- extract_features( cluster = cluster, filenames, - min_pres = min_pres, - min_run = min_run, + min_presence = min_presence, + min_elution_length = min_elution_length, mz_tol = mz_tol, baseline_correct = 0, baseline_correct_noise_percentile = 0.05, @@ -79,8 +79,8 @@ patrick::with_parameters_test_that( files = c("RCX_06_shortened.mzML", "RCX_07_shortened.mzML", "RCX_08_shortened.mzML"), expected_files = c("RCX_06_shortened.parquet", "RCX_07_shortened.parquet", "RCX_08_shortened.parquet"), mz_tol = 1e-05, - min_pres = 0.5, - min_run = 12, + min_presence = 0.5, + min_elution_length = 12, intensity_weighted = FALSE, sd_cut = c(0.01, 500), sigma_ratio_lim = c(0.01, 100), @@ -90,8 +90,8 @@ patrick::with_parameters_test_that( files = c("8_qc_no_dil_milliq.mzml", "21_qc_no_dil_milliq.mzml", "29_qc_no_dil_milliq.mzml"), expected_files = c("8_qc_no_dil_milliq.parquet", "21_qc_no_dil_milliq.parquet", "29_qc_no_dil_milliq.parquet"), mz_tol = 1e-05, - min_pres = 0.5, - min_run = 12, + min_presence = 0.5, + min_elution_length = 12, intensity_weighted = FALSE, sd_cut = c(0.01, 500), sigma_ratio_lim = c(0.01, 100), diff --git a/tests/testthat/test-proc.cdf.R b/tests/testthat/test-proc.cdf.R index 285ac7a1..490774c1 100644 --- a/tests/testthat/test-proc.cdf.R +++ b/tests/testthat/test-proc.cdf.R @@ -6,58 +6,56 @@ patrick::with_parameters_test_that( testdata <- file.path("..", "testdata") input_path <- file.path(testdata, "input", filename) - actual <- proc.cdf( + sut <- proc.cdf( input_path, - min.pres = min_pres, - min.run = min_run, - tol = mz_tol, - intensity.weighted = intensity_weighted, + min_presence = min_presence, + min_elution_length = min_elution_length, + mz_tol = mz_tol, + intensity_weighted = intensity_weighted, cache = cache ) - expected_path <- file.path(testdata, "filtered", expected_filename) - expected <- readRDS(expected_path) + expected_path <- file.path(testdata, "filtered", paste0(.test_name, ".parquet")) # exclude last column from comparison as there lies the stochastic nature - expect_equal(actual[, 1:3], expected[, 1:3]) + expected <- arrow::read_parquet(expected_path) |> dplyr::select(-group_number) + actual <- sut |> dplyr::select(-group_number) + + expect_equal(actual, expected) }, patrick::cases( mbr_test0 = list( filename = c("mbr_test0.mzml"), - expected_filename = "mbr_test0_cdf.Rds", mz_tol = 1e-05, - min_pres = 0.5, - min_run = 12, + min_presence = 0.5, + min_elution_length = 12, intensity_weighted = FALSE, cache = FALSE, ci_skip = FALSE ), - RCX_06_shortened_v2 = list( + RCX_06_shortened = list( filename = c("RCX_06_shortened.mzML"), - expected_filename = "RCX_06_shortened_cdf.Rds", mz_tol = 1e-06, - min_pres = 0.7, - min_run = 4, + min_presence = 0.7, + min_elution_length = 4, intensity_weighted = TRUE, cache = FALSE, ci_skip = FALSE ), - RCX_07_shortened_v2 = list( + RCX_07_shortened = list( filename = c("RCX_07_shortened.mzML"), - expected_filename = "RCX_07_shortened_cdf.Rds", mz_tol = 1e-06, - min_pres = 0.7, - min_run = 4, + min_presence = 0.7, + min_elution_length = 4, intensity_weighted = TRUE, cache = FALSE, ci_skip = TRUE ), - RCX_08_shortened_v2 = 
list( + RCX_08_shortened = list( filename = c("RCX_08_shortened.mzML"), - expected_filename = "RCX_08_shortened_cdf.Rds", mz_tol = 1e-06, - min_pres = 0.7, - min_run = 4, + min_presence = 0.7, + min_elution_length = 4, intensity_weighted = TRUE, cache = FALSE, ci_skip = TRUE diff --git a/tests/testthat/test-prof.to.features.R b/tests/testthat/test-prof.to.features.R index 4bbd7478..e3949cd0 100644 --- a/tests/testthat/test-prof.to.features.R +++ b/tests/testthat/test-prof.to.features.R @@ -3,7 +3,7 @@ patrick::with_parameters_test_that( { testdata <- file.path("..", "testdata") input_path <- file.path(testdata, "filtered", filename) - extracted_features <- readRDS(input_path) + extracted_features <- arrow::read_parquet(input_path) actual <- prof.to.features( profile = extracted_features, @@ -20,7 +20,7 @@ patrick::with_parameters_test_that( }, patrick::cases( mbr_test0 = list( - filename = c("mbr_test0_cdf.Rds"), + filename = c("mbr_test0.parquet"), expected_filename = "mbr_test0_features.Rds", sd_cut = c(0.1, 100), sigma_ratio_lim = c(0.1, 10), @@ -28,7 +28,7 @@ patrick::with_parameters_test_that( do.plot = FALSE ), RCX_06_shortened_gaussian = list( - filename = c("RCX_06_shortened_cdf.Rds"), + filename = c("RCX_06_shortened.parquet"), expected_filename = "RCX_06_shortened_gaussian_features.Rds", sd_cut = c(0.01, 500), sigma_ratio_lim = c(0.01, 100), @@ -36,7 +36,7 @@ patrick::with_parameters_test_that( do.plot = FALSE ), RCX_06_shortened_v2 = list( - filename = c("RCX_06_shortened_cdf.Rds"), + filename = c("RCX_06_shortened.parquet"), expected_filename = "RCX_06_shortened_features.Rds", sd_cut = c(0.01, 500), sigma_ratio_lim = c(0.01, 100), @@ -44,7 +44,7 @@ patrick::with_parameters_test_that( do.plot = FALSE ), RCX_07_shortened_v2 = list( - filename = c("RCX_07_shortened_cdf.Rds"), + filename = c("RCX_07_shortened.parquet"), expected_filename = "RCX_07_shortened_features.Rds", sd_cut = c(0.01, 500), sigma_ratio_lim = c(0.01, 100), @@ -52,7 +52,7 @@ patrick::with_parameters_test_that( do.plot = FALSE ), RCX_8_shortened_v2 = list( - filename = c("RCX_08_shortened_cdf.Rds"), + filename = c("RCX_08_shortened.parquet"), expected_filename = "RCX_08_shortened_features.Rds", sd_cut = c(0.01, 500), sigma_ratio_lim = c(0.01, 100), diff --git a/tests/testthat/test-recover-weaker.R b/tests/testthat/test-recover-weaker.R index 09c6b066..be7a3134 100644 --- a/tests/testthat/test-recover-weaker.R +++ b/tests/testthat/test-recover-weaker.R @@ -22,7 +22,6 @@ patrick::with_parameters_test_that( adjusted <- lapply(filenames, function(x) { arrow::read_parquet(x) |> dplyr::rename(rt = pos, sample_id = V6) }) - #adjusted <- lapply(adjusted, as.data.frame) aligned <- load_aligned_features( file.path(testdata, "aligned", "rt_cross_table.parquet"), @@ -51,9 +50,7 @@ patrick::with_parameters_test_that( intensity.weighted = intensity.weighted ) }) - - - intensity.weighted = FALSE + feature_table <- aligned$rt_crosstab[, 1:4] rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times)) int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs))