
Returned tibble from adaptive.bin and proc.cdf #137

Merged 16 commits on Aug 23, 2022
33 changes: 15 additions & 18 deletions R/adaptive.bin.R
@@ -73,16 +73,16 @@ increment_counter <- function(pointers, that.n){
#' @examples
#' adaptive.bin(raw.data, min_elution_length = min_elution_length, min_presence = min_presence, mz_tol = mz_tol, baseline_correct = baseline_correct, intensity_weighted = intensity_weighted)
adaptive.bin <- function(features,
min.run,
min.pres,
tol,
baseline.correct,
weighted = FALSE) {
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted = FALSE) {
# order inputs after mz values
features <- features |> dplyr::arrange_at("mz")


cat(c("m/z tolerance is: ", tol, "\n"))
cat(c("m/z tolerance is: ", mz_tol, "\n"))

times <- sort(unique(features$rt))

@@ -91,7 +91,7 @@ adaptive.bin <- function(features,
time_range <- (max_time - min_time)

# calculate function parameters
min.count.run <- min.run * length(times) / time_range
min.count.run <- min_elution_length * length(times) / time_range
aver.time.range <- (time_range) / length(times)

# init data
@@ -101,7 +101,7 @@ adaptive.bin <- function(features,
# init counters
pointers <- list(curr.label = 1, prof.pointer = 1, height.pointer = 1)

breaks <- compute_breaks(tol, features$mz, features$intensities, weighted)
breaks <- compute_breaks(mz_tol, features$mz, features$intensities, intensity_weighted)

for (i in 1:(length(breaks) - 1))
{
@@ -112,10 +112,10 @@ adaptive.bin <- function(features,

this_table <- data.frame(labels = features$rt[start:end], mz = features$mz[start:end], intensities = features$intensities[start:end])

if (length(unique(this_table$labels)) >= min.count.run * min.pres) {
if (length(unique(this_table$labels)) >= min.count.run * min_presence) {
# reorder in order of labels (scan number)
this_table <- this_table |> dplyr::arrange_at("labels")
mass.den <- compute_densities(this_table$mz, tol, weighted, this_table$intensities, median)
mass.den <- compute_densities(this_table$mz, mz_tol, intensity_weighted, this_table$intensities, median)

mass.den$y[mass.den$y < min(this_table$intensities) / 10] <- 0
mass.turns <- find.turn.point(mass.den$y)
@@ -139,8 +139,8 @@ adaptive.bin <- function(features,
that <- combine.seq.3(that) |> dplyr::arrange_at("mz")
that.range <- diff(range(that$labels))

if (that.range > 0.5 * time_range & length(that$labels) > that.range * min.pres & length(that$labels) / (that.range / aver.time.range) > min.pres) {
that$intensities <- rm.ridge(that$labels, that$intensities, bw = max(10 * min.run, that.range / 2))
if (that.range > 0.5 * time_range & length(that$labels) > that.range * min_presence & length(that$labels) / (that.range / aver.time.range) > min_presence) {
that$intensities <- rm.ridge(that$labels, that$intensities, bw = max(10 * min_elution_length, that.range / 2))

that <- that |> dplyr::filter(intensities != 0)
}
@@ -175,14 +175,11 @@ adaptive.bin <- function(features,

newprof <- newprof[order(newprof[, 1], newprof[, 2]), ]

newprof_tibble <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensities = newprof[, 3], grps = newprof[, 4])

raw.prof <- new("list")
raw.prof$height.rec <- height.rec
raw.prof$masses <- newprof[, 1]
raw.prof$labels <- newprof[, 2]
raw.prof$intensi <- newprof[, 3]
raw.prof$grps <- newprof[, 4]
raw.prof$times <- times
raw.prof$tol <- tol
raw.prof$features <- newprof_tibble
raw.prof$min.count.run <- min.count.run

return(raw.prof)
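With this change adaptive.bin exposes the binned data points as a tibble under $features instead of the separate masses/labels/intensi/grps vectors. A minimal sketch of how the new return value might be consumed downstream (raw_data and the parameter values are placeholders, and the package is assumed to be loaded):

    raw_prof <- adaptive.bin(
      raw_data,                      # features table with mz, rt and intensities columns
      min_elution_length = 12,
      min_presence = 0.5,
      mz_tol = 1e-05,
      baseline_correct = 0.0,
      intensity_weighted = FALSE
    )

    # one row per retained data point, with its bin/group id
    raw_prof$features |>
      dplyr::group_by(grps) |>
      dplyr::summarise(n_points = dplyr::n(), max_intensity = max(intensities))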
81 changes: 56 additions & 25 deletions R/proc.cdf.R
@@ -14,18 +14,25 @@ load_file <- function(filename) {
#' @export
load_data <- function(filename,
cache,
min.run,
min.pres,
tol,
baseline.correct,
intensity.weighted) {
rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min.run, "_", min.pres, "_", tol, ".rawprof", sep = "")
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted) {
rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min_elution_length, "_", min_presence, "_", mz_tol, ".rawprof", sep = "")

if (cache && file.exists(rawprof_filename)) {
load(rawprof_filename)
} else {
raw.data <- load_file(filename)
raw.prof <- adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
raw.prof <- adaptive.bin(
raw.data,
min_elution_length = min_elution_length,
min_presence = min_presence,
mz_tol = mz_tol,
baseline_correct = baseline_correct,
intensity_weighted = intensity_weighted
)
}

if (cache && !file.exists(rawprof_filename)) {
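The cache file name is still derived from the lower-cased input name plus the three run-filter parameters, so only the argument names change, not the on-disk format. A worked example with a hypothetical input file:

    filename <- "sample1.mzML"
    rawprof_filename <- paste(
      strsplit(tolower(filename), "\\.")[[1]][1],
      "_", 12, "_", 0.5, "_", 1e-05, ".rawprof",
      sep = ""
    )
    rawprof_filename
    #> [1] "sample1_12_0.5_1e-05.rawprof"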
@@ -40,11 +47,11 @@ load_data <- function(filename,
#' This function applies the run filter to remove noise. Data points are grouped into EICs in this step.
#'
#' @param filename The CDF file name. If the file is not in the working directory, the path needs to be given.
#' @param min.pres Run filter parameter. The minimum proportion of presence in the time period for a series of
#' @param min_presence Run filter parameter. The minimum proportion of presence in the time period for a series of
#' signals grouped by m/z to be considered a peak.
#' @param min.run Run filter parameter. The minimum length of elution time for a series of signals grouped by
#' @param min_elution_length Run filter parameter. The minimum length of elution time for a series of signals grouped by
#' m/z to be considered a peak.
#' @param tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of
#' @param mz_tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of
#' the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is
#' the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended.
#' @param baseline.correct After grouping the observations, the highest intensity in each group is found. If
@@ -60,32 +67,56 @@ load_data <- function(filename,
#' @examples
#' proc.cdf(input_path, min_presence, min_elution_length, mz_tol, intensity_weighted = intensity_weighted)
proc.cdf <- function(filename,
min.pres = 0.5,
min.run = 12,
tol = 1e-05,
baseline.correct = 0.0,
baseline.correct.noise.percentile = 0.05,
min_presence = 0.5,
min_elution_length = 12,
mz_tol = 1e-05,
baseline_correct = 0.0,
baseline_correct_noise_percentile = 0.05,
do.plot = FALSE,
intensity.weighted = FALSE,
intensity_weighted = FALSE,
cache = FALSE) {
raw.prof <- load_data(filename, cache, min.run, min.pres, tol, baseline.correct, intensity.weighted)
raw.prof <- load_data(
filename,
cache,
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted
)

newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps)
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres & raw.prof$height.rec[, 3] > baseline.correct), 1]
newprof <- cbind(
raw.prof$features$mz,
raw.prof$features$rt,
raw.prof$features$intensities,
raw.prof$features$grps
)
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_presence & raw.prof$height.rec[, 3] > baseline_correct), 1]

newprof <- newprof[newprof[, 4] %in% run.sel, ]
new.prof <- cont.index(newprof, min.pres = min.pres, min.run = min.run)
new.prof <- cont.index(
newprof,
min.pres = min_presence,
min.run = min_elution_length
)

if (do.plot) {
plot_raw_profile_histogram(
raw.prof,
min.pres,
baseline.correct,
baseline.correct.noise.percentile,
tol,
min_presence,
baseline_correct,
baseline_correct_noise_percentile,
mz_tol,
new.prof
)
}

return(new.prof$new.rec)
new_rec_tibble <- tibble::tibble(
mz = new.prof$new.rec[, 1],
rt = new.prof$new.rec[, 2],
intensity = new.prof$new.rec[, 3],
group_number = new.prof$new.rec[, 4]
)

return(new_rec_tibble)
}
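For callers, the visible changes are the snake_case argument names and the tibble return value. A rough usage sketch (the input path and ppm accuracy are assumptions, not part of this PR):

    # mz_tol is a fraction of the m/z value: divide the instrument's ppm accuracy by 1e6
    mz_tol <- 10 / 1e6

    filtered <- proc.cdf(
      "path/to/sample.mzML",         # hypothetical input file
      min_presence = 0.5,
      min_elution_length = 12,
      mz_tol = mz_tol,
      intensity_weighted = FALSE,
      cache = FALSE
    )

    # tibble with columns mz, rt, intensity, group_number
    head(filtered)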
8 changes: 4 additions & 4 deletions tests/remote-files/filtered.txt
@@ -1,4 +1,4 @@
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/mbr_test0_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_06_shortened_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_07_shortened_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_08_shortened_cdf.Rds
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/mbr_test0.parquet
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_06_shortened.parquet
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_07_shortened.parquet
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/RCX_08_shortened.parquet
44 changes: 21 additions & 23 deletions tests/testthat/test-proc.cdf.R
@@ -6,58 +6,56 @@ patrick::with_parameters_test_that(
testdata <- file.path("..", "testdata")
input_path <- file.path(testdata, "input", filename)

actual <- proc.cdf(
sut <- proc.cdf(
input_path,
min.pres = min_pres,
min.run = min_run,
tol = mz_tol,
intensity.weighted = intensity_weighted,
min_presence = min_presence,
min_elution_length = min_elution_length,
mz_tol = mz_tol,
intensity_weighted = intensity_weighted,
cache = cache
)

expected_path <- file.path(testdata, "filtered", expected_filename)
expected <- readRDS(expected_path)
expected_path <- file.path(testdata, "filtered", paste0(.test_name, ".parquet"))

# exclude last column from comparison as there lies the stochastic nature
expect_equal(actual[, 1:3], expected[, 1:3])
expected <- arrow::read_parquet(expected_path) |> dplyr::select(-group_number)
actual <- sut |> dplyr::select(-group_number)

expect_equal(actual, expected)
},
patrick::cases(
mbr_test0 = list(
filename = c("mbr_test0.mzml"),
expected_filename = "mbr_test0_cdf.Rds",
mz_tol = 1e-05,
min_pres = 0.5,
min_run = 12,
min_presence = 0.5,
min_elution_length = 12,
intensity_weighted = FALSE,
cache = FALSE,
ci_skip = FALSE
),
RCX_06_shortened_v2 = list(
RCX_06_shortened = list(
filename = c("RCX_06_shortened.mzML"),
expected_filename = "RCX_06_shortened_cdf.Rds",
mz_tol = 1e-06,
min_pres = 0.7,
min_run = 4,
min_presence = 0.7,
min_elution_length = 4,
intensity_weighted = TRUE,
cache = FALSE,
ci_skip = FALSE
),
RCX_07_shortened_v2 = list(
RCX_07_shortened = list(
filename = c("RCX_07_shortened.mzML"),
expected_filename = "RCX_07_shortened_cdf.Rds",
mz_tol = 1e-06,
min_pres = 0.7,
min_run = 4,
min_presence = 0.7,
min_elution_length = 4,
intensity_weighted = TRUE,
cache = FALSE,
ci_skip = TRUE
),
RCX_08_shortened_v2 = list(
RCX_08_shortened = list(
filename = c("RCX_08_shortened.mzML"),
expected_filename = "RCX_08_shortened_cdf.Rds",
mz_tol = 1e-06,
min_pres = 0.7,
min_run = 4,
min_presence = 0.7,
min_elution_length = 4,
intensity_weighted = TRUE,
cache = FALSE,
ci_skip = TRUE
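Because the expected outputs are now parquet files rather than Rds, they can be regenerated directly from a proc.cdf run. A sketch of how that might look for the mbr_test0 case (not part of this PR, paths are placeholders):

    filtered <- proc.cdf(
      file.path("..", "testdata", "input", "mbr_test0.mzml"),
      min_presence = 0.5,
      min_elution_length = 12,
      mz_tol = 1e-05
    )
    arrow::write_parquet(
      filtered,
      file.path("..", "testdata", "filtered", "mbr_test0.parquet")
    )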
12 changes: 6 additions & 6 deletions tests/testthat/test-prof.to.features.R
@@ -3,7 +3,7 @@ patrick::with_parameters_test_that(
{
testdata <- file.path("..", "testdata")
input_path <- file.path(testdata, "filtered", filename)
extracted_features <- readRDS(input_path)
extracted_features <- arrow::read_parquet(input_path)

actual <- prof.to.features(
profile = extracted_features,
@@ -20,39 +20,39 @@ patrick::with_parameters_test_that(
},
patrick::cases(
mbr_test0 = list(
filename = c("mbr_test0_cdf.Rds"),
filename = c("mbr_test0.parquet"),
expected_filename = "mbr_test0_features.Rds",
sd_cut = c(0.1, 100),
sigma_ratio_lim = c(0.1, 10),
shape_model = "bi-Gaussian",
do.plot = FALSE
),
RCX_06_shortened_gaussian = list(
filename = c("RCX_06_shortened_cdf.Rds"),
filename = c("RCX_06_shortened.parquet"),
expected_filename = "RCX_06_shortened_gaussian_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),
shape_model = "Gaussian",
do.plot = FALSE
),
RCX_06_shortened_v2 = list(
filename = c("RCX_06_shortened_cdf.Rds"),
filename = c("RCX_06_shortened.parquet"),
expected_filename = "RCX_06_shortened_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),
shape_model = "bi-Gaussian",
do.plot = FALSE
),
RCX_07_shortened_v2 = list(
filename = c("RCX_07_shortened_cdf.Rds"),
filename = c("RCX_07_shortened.parquet"),
expected_filename = "RCX_07_shortened_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),
shape_model = "bi-Gaussian",
do.plot = FALSE
),
RCX_8_shortened_v2 = list(
filename = c("RCX_08_shortened_cdf.Rds"),
filename = c("RCX_08_shortened.parquet"),
expected_filename = "RCX_08_shortened_features.Rds",
sd_cut = c(0.01, 500),
sigma_ratio_lim = c(0.01, 100),