Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Returned tibble from adaptive.bin and proc.cdf #137

Merged
merged 16 commits into from
Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64)[#102](https://github.com/RECETOX/recetox-aplcms/pull/102)
- refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137)
### Removed

## [0.9.4] - 2022-05-10
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ export(rev_cum_sum)
export(rm.ridge)
export(semi.sup)
export(sort_samples_by_acquisition_number)
export(span)
export(two.step.hybrid)
export(unsupervised)
export(validate_inputs)
Expand Down
51 changes: 24 additions & 27 deletions R/adaptive.bin.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ increment_counter <- function(pointers, that.n){
#' \itemize{
#' \item height.rec - The records of the height of each EIC.
#' \item masses - The vector of m/z values after binning.
#' \item labels - The vector of retention time after binning.
#' \item rt - The vector of retention time after binning.
#' \item intensi - The vector of intensity values after binning.
#' \item grps - The EIC labels, i.e. which EIC each observed data point belongs to.
#' \item grps - The EIC rt, i.e. which EIC each observed data point belongs to.
#' \item times - All the unique retention time values, ordered.
#' \item tol - The m/z tolerance level.
#' \item min.count.run - The minimum number of elution time points for a series of signals grouped by m/z to be considered a peak.
Expand All @@ -73,16 +73,16 @@ increment_counter <- function(pointers, that.n){
#' @examples
#' adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
adaptive.bin <- function(features,
min.run,
min.pres,
tol,
baseline.correct,
weighted = FALSE) {
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted = FALSE) {
# order inputs after mz values
features <- features |> dplyr::arrange_at("mz")


cat(c("m/z tolerance is: ", tol, "\n"))
cat(c("m/z tolerance is: ", mz_tol, "\n"))

times <- sort(unique(features$rt))

Expand All @@ -91,7 +91,7 @@ adaptive.bin <- function(features,
time_range <- (max_time - min_time)

# calculate function parameters
min.count.run <- min.run * length(times) / time_range
min.count.run <- min_elution_length * length(times) / time_range
aver.time.range <- (time_range) / length(times)

# init data
Expand All @@ -101,7 +101,7 @@ adaptive.bin <- function(features,
# init counters
pointers <- list(curr.label = 1, prof.pointer = 1, height.pointer = 1)

breaks <- compute_breaks(tol, features$mz, features$intensities, weighted)
breaks <- compute_breaks(mz_tol, features$mz, features$intensities, intensity_weighted)

for (i in 1:(length(breaks) - 1))
{
Expand All @@ -110,12 +110,12 @@ adaptive.bin <- function(features,
start <- breaks[i] + 1
end <- breaks[i + 1]

this_table <- data.frame(labels = features$rt[start:end], mz = features$mz[start:end], intensities = features$intensities[start:end])
this_table <- dplyr::slice(features, (start:end))

if (length(unique(this_table$labels)) >= min.count.run * min.pres) {
# reorder in order of labels (scan number)
this_table <- this_table |> dplyr::arrange_at("labels")
mass.den <- compute_densities(this_table$mz, tol, weighted, this_table$intensities, median)
if (length(unique(this_table$rt)) >= min.count.run * min_presence) {
# reorder in order of rt (scan number)
this_table <- this_table |> dplyr::arrange_at("rt")
mass.den <- compute_densities(this_table$mz, mz_tol, intensity_weighted, this_table$intensities, median)

mass.den$y[mass.den$y < min(this_table$intensities) / 10] <- 0
mass.turns <- find.turn.point(mass.den$y)
Expand All @@ -137,17 +137,17 @@ adaptive.bin <- function(features,

if (nrow(that) > 0) {
that <- combine.seq.3(that) |> dplyr::arrange_at("mz")
that.range <- diff(range(that$labels))
that.range <- span(that$rt)

if (that.range > 0.5 * time_range & length(that$labels) > that.range * min.pres & length(that$labels) / (that.range / aver.time.range) > min.pres) {
that$intensities <- rm.ridge(that$labels, that$intensities, bw = max(10 * min.run, that.range / 2))
if (that.range > 0.5 * time_range & length(that$rt) > that.range * min_presence & length(that$rt) / (that.range / aver.time.range) > min_presence) {
that$intensities <- rm.ridge(that$rt, that$intensities, bw = max(10 *min_elution_length, that.range / 2))

that <- that |> dplyr::filter(intensities != 0)
}

that.n <- length(that$mz)

newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that$mz, that$labels, that$intensities, rep(pointers$curr.label, that.n))
newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that$mz, that$rt, that$intensities, rep(pointers$curr.label, that.n))
height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that$intensities))

# increment counters
Expand All @@ -156,12 +156,12 @@ adaptive.bin <- function(features,
}
} else {
if (runif(1) < 0.05) {
this_table <- this_table |> dplyr::arrange_at("labels")
this_table <- this_table |> dplyr::arrange_at("rt")

that.merged <- combine.seq.3(this_table)
that.n <- nrow(that.merged)

newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that.merged$mz, that.merged$labels, that.merged$intensities, rep(pointers$curr.label, that.n))
newprof[pointers$prof.pointer:(pointers$prof.pointer + that.n - 1), ] <- cbind(that.merged$mz, that.merged$rt, that.merged$intensities, rep(pointers$curr.label, that.n))
height.rec[pointers$height.pointer, ] <- c(pointers$curr.label, that.n, max(that.merged$intensities))

# increment counters
Expand All @@ -175,14 +175,11 @@ adaptive.bin <- function(features,

newprof <- newprof[order(newprof[, 1], newprof[, 2]), ]

newprof_tibble <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensities = newprof[, 3], grps = newprof[, 4])

raw.prof <- new("list")
raw.prof$height.rec <- height.rec
raw.prof$masses <- newprof[, 1]
raw.prof$labels <- newprof[, 2]
raw.prof$intensi <- newprof[, 3]
raw.prof$grps <- newprof[, 4]
raw.prof$times <- times
raw.prof$tol <- tol
raw.prof$features <- newprof_tibble
raw.prof$min.count.run <- min.count.run

return(raw.prof)
Expand Down
6 changes: 3 additions & 3 deletions R/combine.seq.3.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@
#' @return returns
#' \itemize{
#' \item masses - m/z ratio
#' \item labels - retention time
#' \item rt - retention time
#' \item intensi - signal strength
#' }
#' @export
#' @examples
#' combine.seq.3(table)
combine.seq.3 <- function(features) {
l <- nrow(features)
breaks <- compute_breaks_3(features$labels)
breaks <- compute_breaks_3(features$rt)
new_table <- tibble::tibble(
mz = rep(0, length(breaks) - 1),
labels = unique(features$labels),
rt = unique(features$rt),
intensities = rep(0, length(breaks) - 1)
)

Expand Down
19 changes: 10 additions & 9 deletions R/extract_features.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ NULL
extract_features <- function(
cluster,
filenames,
min_pres,
min_run,
min_presence,
min_elution_length,
mz_tol,
baseline_correct,
baseline_correct_noise_percentile,
Expand Down Expand Up @@ -92,20 +92,21 @@ extract_features <- function(
'compute_start_bound',
'compute_end_bound',
'compute_bounds',
'compute_scale'
'compute_scale',
'span'
))
snow::clusterEvalQ(cluster, library("dplyr"))


snow::parLapply(cluster, filenames, function(filename) {
profile <- proc.cdf(
filename = filename,
min.pres = min_pres,
min.run = min_run,
tol = mz_tol,
baseline.correct = baseline_correct,
baseline.correct.noise.percentile = baseline_correct_noise_percentile,
intensity.weighted = intensity_weighted,
min_presence = min_presence,
min_elution_length = min_elution_length,
mz_tol = mz_tol,
baseline_correct = baseline_correct,
baseline_correct_noise_percentile = baseline_correct_noise_percentile,
intensity_weighted = intensity_weighted,
do.plot = FALSE,
cache = FALSE
)
Expand Down
4 changes: 2 additions & 2 deletions R/hybrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,8 @@ hybrid <- function(
extracted <- extract_features(
cluster = cluster,
filenames = filenames,
min_pres = min_pres,
min_run = min_run,
min_presence = min_pres,
min_elution_length = min_run,
mz_tol = mz_tol,
baseline_correct = baseline_correct,
baseline_correct_noise_percentile = baseline_correct_noise_percentile,
Expand Down
1 change: 0 additions & 1 deletion R/load.lcms.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ load.lcms <- function(filename) {
labels <- c(labels, this_labels)
}

times <- b[!is.na(b)]
mzR::close(mz_conn)

features <- tibble::tibble(mz = masses, rt = labels, intensities = intensi)
Expand Down
81 changes: 56 additions & 25 deletions R/proc.cdf.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,25 @@ load_file <- function(filename) {
#' @export
load_data <- function(filename,
cache,
min.run,
min.pres,
tol,
baseline.correct,
intensity.weighted) {
rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min.run, "_", min.pres, "_", tol, ".rawprof", sep = "")
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted) {
rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min_elution_length, "_", min_presence, "_", mz_tol, ".rawprof", sep = "")

if (cache && file.exists(rawprof_filename)) {
load(rawprof_filename)
} else {
raw.data <- load_file(filename)
raw.prof <- adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
raw.prof <- adaptive.bin(
raw.data,
min_elution_length = min_elution_length,
min_presence = min_presence,
mz_tol = mz_tol,
baseline_correct = baseline_correct,
intensity_weighted = intensity_weighted
)
}

if (cache && !file.exists(rawprof_filename)) {
Expand All @@ -40,11 +47,11 @@ load_data <- function(filename,
#' This function applies the run filter to remove noise. Data points are grouped into EICs in this step.
#'
#' @param filename The CDF file name. If the file is not in the working directory, the path needs to be given.
#' @param min.pres Run filter parameter. The minimum proportion of presence in the time period for a series of
#' @param min_presence Run filter parameter. The minimum proportion of presence in the time period for a series of
#' signals grouped by m/z to be considered a peak.
#' @param min.run Run filter parameter. The minimum length of elution time for a series of signals grouped by
#' @param min_elution_length Run filter parameter. The minimum length of elution time for a series of signals grouped by
#' m/z to be considered a peak.
#' @param tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of
#' @param mz_tol m/z tolerance level for the grouping of data points. This value is expressed as the fraction of
#' the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is
#' the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended.
#' @param baseline.correct After grouping the observations, the highest intensity in each group is found. If
Expand All @@ -60,32 +67,56 @@ load_data <- function(filename,
#' @examples
#' proc.cdf(input_path, min_pres, min_run, tol, intensity.weighted = intensity_weighted)
proc.cdf <- function(filename,
min.pres = 0.5,
min.run = 12,
tol = 1e-05,
baseline.correct = 0.0,
baseline.correct.noise.percentile = 0.05,
min_presence = 0.5,
min_elution_length = 12,
mz_tol = 1e-05,
baseline_correct = 0.0,
baseline_correct_noise_percentile = 0.05,
do.plot = FALSE,
intensity.weighted = FALSE,
intensity_weighted = FALSE,
cache = FALSE) {
raw.prof <- load_data(filename, cache, min.run, min.pres, tol, baseline.correct, intensity.weighted)
raw.prof <- load_data(
filename,
cache,
min_elution_length,
min_presence,
mz_tol,
baseline_correct,
intensity_weighted
)

newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps)
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres & raw.prof$height.rec[, 3] > baseline.correct), 1]
newprof <- cbind(
raw.prof$features$mz,
raw.prof$features$rt,
raw.prof$features$intensities,
raw.prof$features$grps
)
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_presence & raw.prof$height.rec[, 3] > baseline_correct), 1]

newprof <- newprof[newprof[, 4] %in% run.sel, ]
new.prof <- cont.index(newprof, min.pres = min.pres, min.run = min.run)
new.prof <- cont.index(
newprof,
min.pres = min_presence,
min.run = min_elution_length
)

if (do.plot) {
plot_raw_profile_histogram(
raw.prof,
min.pres,
baseline.correct,
baseline.correct.noise.percentile,
tol,
min_presence,
baseline_correct,
baseline_correct_noise_percentile,
mz_tol,
new.prof
)
}

return(new.prof$new.rec)
new_rec_tibble <- tibble::tibble(
mz = new.prof$new.rec[, 1],
rt = new.prof$new.rec[, 2],
intensity = new.prof$new.rec[, 3],
group_number = new.prof$new.rec[, 4]
)

return(new_rec_tibble)
}
Loading