diff --git a/DESCRIPTION b/DESCRIPTION index 3b1effc..5acef3f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach, - snow, dplyr, tidyr, stringr, tibble, tools, arrow + snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr biocViews: Technology, MassSpectrometry License: GPL-2 LazyLoad: yes diff --git a/NAMESPACE b/NAMESPACE index 1061d6a..c5c8173 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,15 @@ # Generated by roxygen2: do not edit by hand export(adaptive.bin) +export(add_feature_ids) export(adjust.time) export(aggregate_by_rt) +export(as_feature_sample_table) export(bigauss.esti) export(bigauss.esti.EM) export(bigauss.mix) +export(check_files) +export(clean_data_matrix) export(comb) export(compute_boundaries) export(compute_bounds) @@ -16,6 +20,7 @@ export(compute_clusters) export(compute_clusters_simple) export(compute_comb) export(compute_corrected_features) +export(compute_corrected_features_v2) export(compute_curr_rec_with_enough_peaks) export(compute_delta_rt) export(compute_densities) @@ -40,10 +45,14 @@ export(compute_template) export(compute_template_adjusted_rt) export(compute_uniq_grp) export(correct_time) +export(correct_time_v2) export(count_peaks) export(create_aligned_feature_table) +export(create_features_from_cluster) +export(create_intensity_row) +export(create_metadata) export(create_output) -export(create_rows) +export(create_rt_row) export(draw_rt_correction_plot) export(draw_rt_normal_peaks) export(duplicate.row.remove) @@ -59,6 +68,7 @@ export(get_features_in_rt_range) export(get_mzrange_bound_indices) export(get_num_workers) export(get_rt_region_indices) +export(get_sample_name) export(get_single_occurrence_mask) export(get_times_to_use) export(hybrid) @@ -90,8 +100,6 @@ export(remove_noise) export(rev_cum_sum) export(rm.ridge) export(run_filter) -export(select_mz) -export(select_rt) export(semi.sup) export(solve_a) export(solve_sigma) diff --git a/R/adjust.time.R b/R/adjust.time.R index fe896d1..1d0f5a8 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -2,6 +2,10 @@ NULL #> NULL +#' Combine template and sample features +#' @param template_features Tibble Template feature table (mz, rt, cluster, sample_id). +#' @param features Tibble Sample feature table (mz, rt, cluster, sample_id). +#' @return Tibble Combined feature table (rbind). #' @export compute_comb <- function(template_features, features) { combined <- dplyr::bind_rows( @@ -11,6 +15,12 @@ compute_comb <- function(template_features, features) { return(combined) } +#' Select features to use for retention time alignment +#' @description This function selects features present in both the sample +#' feature table and template feature table given they have the same cluster, +#' are adjacent in the combined table. +#' @param combined Tibble Table with (mz, rt, cluster, sample_id). +#' @return List of bool Returns list of bools with TRUE at each index where this condition is met. #' @export compute_sel <- function(combined) { l <- nrow(combined) @@ -19,6 +29,11 @@ compute_sel <- function(combined) { return(sel) } +#' Create two column table with paired sample and template retention times. +#' @param combined Tibble Table with features from sample and template. +#' @param sel list of bools List of bools indiciating which features to pair. +#' See 'compute_sel'. +#' @param j string Template sample_id. #' @export compute_template_adjusted_rt <- function(combined, sel, j) { all_features <- cbind(combined$rt[sel], combined$rt[sel + 1]) @@ -34,6 +49,13 @@ compute_template_adjusted_rt <- function(combined, sel, j) { return(all_features) } +#' Correct the rt in feature table based on paired feature rts and differences. +#' @description This is a newer implementation based on dplyr which might be more efficient than the other function. +#' @param features Tibble The feature table for which to correct rts. +#' @param template_rt List of floats Template retention times for the paired features. +#' @param delta_rt List of floats Differences between the paired rts. +#' @return Tibble A table with corrected retention times. +#' @export compute_corrected_features_v2 <- function(features, template_rt, delta_rt) { features <- features |> dplyr::arrange_at(c("rt", "mz")) idx <- dplyr::between(features$rt, min(template_rt), max(template_rt)) @@ -59,20 +81,25 @@ compute_corrected_features_v2 <- function(features, template_rt, delta_rt) { return(features |> dplyr::arrange_at(c("mz", "rt"))) } +#' Correct the rt in feature table based on paired feature rts and differences. +#' @param features Tibble The feature table for which to correct rts. +#' @param template_rt List of floats Template retention times for the paired features. +#' @param delta_rt List of floats Differences between the paired rts. +#' @return Tibble A table with corrected retention times. #' @export -compute_corrected_features <- function(features, delta_rt, avg_time) { +compute_corrected_features <- function(features, template_rt, delta_rt) { features <- features |> dplyr::arrange_at(c("rt", "mz")) corrected <- features$rt original <- features$rt - idx <- dplyr::between(original, min(delta_rt), max(delta_rt)) + idx <- dplyr::between(original, min(template_rt), max(template_rt)) to_correct <- original[idx] this.smooth <- ksmooth( + template_rt, delta_rt, - avg_time, kernel = "normal", - bandwidth = (max(delta_rt) - min(delta_rt)) / 5, + bandwidth = (max(template_rt) - min(template_rt)) / 5, x.points = to_correct ) @@ -80,8 +107,8 @@ compute_corrected_features <- function(features, delta_rt, avg_time) { lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) - idx_lower <- original < min(delta_rt) - idx_upper <- original > max(delta_rt) + idx_lower <- original < min(template_rt) + idx_upper <- original > max(template_rt) corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment @@ -91,6 +118,10 @@ compute_corrected_features <- function(features, delta_rt, avg_time) { return(features) } +#' Fill missing values based on original retention times. +#' @param orig.features Non-corrected feature table. +#' @param this.features Feature table with eventual missing values. +#' @return Tibble Feature table with filles values. #' @export fill_missing_values <- function(orig.feature, this.feature) { missing_values <- which(is.na(this.feature$rt)) @@ -104,6 +135,10 @@ fill_missing_values <- function(orig.feature, this.feature) { return(this.feature) } +#' Function to perform retention time correction +#' @param this.feature Tibble Feature table for which to correct rt. +#' @param template_features Tibble Template feature table to use for correction. +#' @return Tibble this.feature table with corrected rt values. #' @export correct_time <- function(this.feature, template_features) { orig.features <- this.feature @@ -137,6 +172,10 @@ correct_time <- function(this.feature, template_features) { return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster"))) } +#' Select the template feature table. +#' @description The current implementation selects the table with the most features as the template. +#' @param extracted_features List of tables Tables from which to select the template. +#' @return Tibble Template feature table. #' @export compute_template <- function(extracted_features) { num.ftrs <- sapply(extracted_features, nrow) @@ -149,6 +188,14 @@ compute_template <- function(extracted_features) { return(tibble::as_tibble(template_features)) } +#' Rewritten version of 'correct_time' +#' @description This function uses dplyr to do the same as +#' 'correct_time', just with less code. Most functions used in the original +#' function are replaced with simple data transformations. +#' @param features Tibble Table with features to correct. +#' @param template Tibble Template feature table to use for correction. +#' @return Tibble Corrected feature table. +#' @export correct_time_v2 <- function(features, template) { if (unique(features$sample_id) == unique(template$sample_id)) return(tibble::as_tibble(features)) diff --git a/R/feature.align.R b/R/feature.align.R index acc239b..5f541a0 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -1,114 +1,181 @@ -#' @import foreach - -create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) { - features <- new("list") - features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames) - features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames) - features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames) - return(features) +#' Create a metadata row tibble with min, max and mean mz and RT values. +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @param sample_names A list of sample names. +#' @export +create_metadata <- function(sample_grouped, sample_names) { + sample_presence <- sapply(sample_names, + FUN = function(x) { + as.numeric(any(sample_grouped$sample_id == x)) + } + ) + + metadata_row <- dplyr::summarise( + sample_grouped, + mzmean = mean(mz), + mzmin = min(mz), + mzmax = max(mz), + rtmean = mean(rt), + rtmin = min(rt), + rtmax = max(rt), + npeaks = n() + ) %>% rename(mz = "mzmean", rt = "rtmean") + + metadata_row <- dplyr::bind_cols(metadata_row, as.list(sample_presence)) + return(metadata_row) } +#' Compute summed area for each sample +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @return Summed area for each sample. #' @export -create_output <- function(sample_grouped, sample_names) { - number_of_samples <- length(sample_names) - intensity_row <- rep(0, number_of_samples) - rt_row <- rep(0, number_of_samples) - sample_presence <- rep(0, number_of_samples) - - for (i in seq_along(intensity_row)) { - filtered <- filter(sample_grouped, sample_id == sample_names[i]) - if (nrow(filtered) != 0) { - sample_presence[i] <- 1 - intensity_row[i] <- sum(filtered$area) - rt_row[i] <- median(filtered$rt) - } - } +create_intensity_row <- function(sample_grouped) { + sample_grouped %>% + dplyr::group_by(sample_id) %>% + dplyr::summarise(intensity = sum(area)) %>% + tidyr::pivot_wider(names_from = "sample_id", values_from = "intensity") +} - mz <- sample_grouped$mz - rt <- sample_grouped$rt - metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence) +#' Compute median RT for each sample +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @return Median RT for each sample. +#' @export +create_rt_row <- function(sample_grouped) { + sample_grouped %>% + dplyr::group_by(sample_id) %>% + dplyr::summarise(rt = median(rt)) %>% + tidyr::pivot_wider(names_from = "sample_id", values_from = "rt") +} - return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row)) +#' Create a list containing 3 tibbles: metadata, intensities and RTs. +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @param sample_names A list of sample names. +#' @return A list containing 3 tibbles: metadata, intensities and RTs. +#' @export +create_output <- function(sample_grouped, sample_names) { + metadata_row <- create_metadata(sample_grouped, sample_names) + intensity_row <- create_intensity_row(sample_grouped) + rt_row <- create_rt_row(sample_grouped) + + return(list( + metadata_row = metadata_row, + intensity_row = intensity_row, + rt_row = rt_row + )) } +#' Validates if the data is present in more than "min_occurence" of samples. +#' @param samples A subset of the features_table. +#' @param min_occurrence A minimal number of profiles a feature has to be present in. +#' @return boolean value whether it is TRUE or FALSE. #' @export validate_contents <- function(samples, min_occurrence) { - # validate whether data is still from at least 'min_occurrence' number of samples - if (!is.null(nrow(samples))) { - if (length(unique(samples$sample_id)) >= min_occurrence) { - return(TRUE) - } - return(FALSE) + if (!is.null(nrow(samples))) { + if (length(unique(samples$sample_id)) >= min_occurrence) { + return(TRUE) } return(FALSE) + } + return(FALSE) } +#' Compute the kernel density estimation and find the peaks and valleys of a smooth curve. +#' @param data A vector of m/z or RTs for a particular cluster. +#' @param bandwidth A bandwidth value for the KDE computation. +#' @return A list of peaks and valleys positions. #' @export find_optima <- function(data, bandwidth) { - # Kernel Density Estimation - den <- density(data, bw = bandwidth) - # select statistically significant points - turns <- find.turn.point(den$y) - return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys])) + den <- density(data, bw = bandwidth) + turns <- find.turn.point(den$y) + return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys])) } +#' Subset data within lower and upper bound from density estimation +#' @param sample A subset of the features_table. +#' @param turns A list of peaks and valleys positions. +#' @param index Whether it subsets on m/z [1] or RT [2] column. +#' @param i Iterates over the peaks in the turns list. +#' @return Dataframe subsetted within lower and upper bound from density estimation. #' @export filter_based_on_density <- function(sample, turns, index, i) { - # select data within lower and upper bound from density estimation - lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]]) - upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]]) - selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound) - return(sample[selected, ]) + lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]]) + upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]]) + selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound) + return(sample[selected, ]) } +#' Group the mz and RT for particular cluster. +#' @param features The features table subsetted for a particular cluster. +#' @param mz_tol_relative The m/z tolerance level for peak alignment. +#' @param rt_tol_relative The retention time tolerance level for peak alignment. +#' @param min_occurrence A minimal number of profiles a feature has to be present in. +#' @param sample_names A list of sample names. +#' @return A list containing 3 tibbles: metadata, intensities and RTs. #' @export -select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) { - turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414) - for (i in seq_along(turns$peaks)) { - sample_grouped <- filter_based_on_density(sample, turns, 2, i) - if (validate_contents(sample_grouped, min_occurrence)) { - return(create_output(sample_grouped, sample_names)) +create_features_from_cluster <- function(features, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + sample_names) { + if (!validate_contents(features, min_occurrence)) { + return(NULL) + } + + # create empty tibble rows + metadata <- NULL + intensity <- NULL + rt <- NULL + + # split according to mz values + turns_mz <- find_optima(features$mz, bandwidth = mz_tol_relative * median(features$mz)) + for (i in seq_along(turns_mz$peaks)) { + sample_grouped_mz <- filter_based_on_density(features, turns_mz, 1, i) + if (validate_contents(sample_grouped_mz, min_occurrence)) { + # split according to rt values + turns_rt <- find_optima(sample_grouped_mz$rt, bandwidth = rt_tol_relative / 1.414) + for (ii in seq_along(turns_rt$peaks)) { + sample_grouped_rt <- filter_based_on_density(sample_grouped_mz, turns_rt, 2, ii) + + # create output rows if valid + if (validate_contents(sample_grouped_rt, min_occurrence)) { + metadata <- dplyr::bind_rows(metadata, create_metadata(sample_grouped_rt, sample_names)) + intensity <- dplyr::bind_rows(intensity, create_intensity_row(sample_grouped_rt)) + rt <- dplyr::bind_rows(rt, create_rt_row(sample_grouped_rt)) } + } } + } + + return(list(metadata_row = metadata, intensity_row = intensity, rt_row = rt)) } +#' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble. +#' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters. #' @export -select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) { - turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz)) - for (i in seq_along(turns$peaks)) { - sample_grouped <- filter_based_on_density(sample, turns, 1, i) - if (validate_contents(sample_grouped, min_occurrence)) { - return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names)) - } - } +comb <- function(x, ...) { + mapply(plyr::rbind.fill, x, ..., SIMPLIFY = FALSE) } +#' Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble +#' @param x A dataframe +#' @param sample_names List of sample names. +#' @return Cleaned tibble. #' @export -create_rows <- function(features, - i, - sel.labels, - mz_tol_relative, - rt_tol_relative, - min_occurrence, - sample_names) { - if (i %% 100 == 0) { - gc() - } # call Garbage Collection for performance improvement? - - sample <- dplyr::filter(features, cluster == sel.labels[i]) - if (nrow(sample) > 1) { - if (validate_contents(sample, min_occurrence)) { - return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names)) - } - } else if (min_occurrence == 1) { - return(create_output(sample_grouped, sample_names)) - } - return(NULL) +clean_data_matrix <- function(x, sample_names) { + x <- x %>% + replace(is.na(.), 0) %>% + dplyr::relocate(sample_names) |> + add_feature_ids() + return(x) } +#' Add `id` column to a dataframe +#' @param x A dataframe +#' @return The same dataframe but with an additional `id` column +#' in first place which contains the rownames. #' @export -comb <- function(x, ...) { - mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE))) +add_feature_ids <- function(x) { + x$id <- as.numeric(rownames(x)) + return(tibble::as_tibble(x |> dplyr::relocate(id))) } #' Align peaks from spectra into a feature table. @@ -122,7 +189,7 @@ comb <- function(x, ...) { #' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which #' allows the program to search for the tolerance level based on the data. #' @param cluster The number of CPU cores to be used -#' @return A tibble with three tables containing aligned metadata, intensities an RTs. +#' @return A list of 3 tibbles containing aligned metadata, intensities an RTs. #' #' @export create_aligned_feature_table <- function(features_table, @@ -131,55 +198,37 @@ create_aligned_feature_table <- function(features_table, rt_tol_relative, mz_tol_relative, cluster = 4) { - if (!is(cluster, "cluster")) { - cluster <- parallel::makeCluster(cluster) - on.exit(parallel::stopCluster(cluster)) - - # NOTE: side effect (doParallel has no functionality to clean up) - doParallel::registerDoParallel(cluster) - register_functions_to_cluster(cluster) - } - - - - number_of_samples <- length(sample_names) - metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names) - intensity_colnames <- c("id", sample_names) - rt_colnames <- c("id", sample_names) - - aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) - - # table with number of values per group - groups_cardinality <- table(features_table$cluster) - # count those with minimal occurrence - sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) - - # retention time alignment - aligned_features <- foreach::foreach( - i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE - ) %dopar% { - rows <- create_rows( - features_table, - i, - sel.labels, - mz_tol_relative, - rt_tol_relative, - min_occurrence, - sample_names - ) - - if (!is.null(rows)) { - rows$metadata_row <- c(i, rows$metadata_row) - rows$intensity_row <- c(i, rows$intensity_row) - rows$rt_row <- c(i, rows$rt_row) - } - - list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row) - } - - colnames(aligned_features$metadata) <- metadata_colnames - colnames(aligned_features$intensity) <- intensity_colnames - colnames(aligned_features$rt) <- rt_colnames - - return(aligned_features) + if (!is(cluster, "cluster")) { + cluster <- parallel::makeCluster(cluster) + on.exit(parallel::stopCluster(cluster)) + + # NOTE: side effect (doParallel has no functionality to clean up) + doParallel::registerDoParallel(cluster) + register_functions_to_cluster(cluster) + } + + # table with number of values per group + groups_cardinality <- table(features_table$cluster) + # count those with minimal occurrence + sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) + + # retention time alignment + aligned_features <- foreach::foreach( + i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE + ) %do% { + rows <- create_features_from_cluster( + dplyr::filter(features_table, cluster == sel.labels[i]), + mz_tol_relative, + rt_tol_relative, + min_occurrence, + sample_names + ) + list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row) + } + + aligned_features$intensity <- clean_data_matrix(aligned_features$intensity, sample_names) + aligned_features$rt <- clean_data_matrix(aligned_features$rt, sample_names) + aligned_features$metadata <- add_feature_ids(aligned_features$metadata) + + return(aligned_features) } diff --git a/R/unsupervised.R b/R/unsupervised.R index 3b7070b..4093b9c 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -2,6 +2,13 @@ NULL #> NULL +#' Read the metadata table, retention time data matrix and intensity data matrix +#' and combine them into a single table +#' @param metadata Tibble Feature metadata table with information concerning the peaks. +#' @param rt_crosstab Tibble Data matrix with features on rows and samples on columns holding rt data. +#' @param int_crosstab Tibble Data matrix with features on rows and samples on columns holding intensity data. +#' @return Tibble A merged table containing all information. +#' @export as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) { feature_names <- as.character(rt_crosstab$id) sample_names <- colnames(metadata)[-c(1:8)] @@ -27,6 +34,9 @@ as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) { return(data) } +#' Check files whether they exist. +#' @param filenames list of filenames Filenames to check whether they exist. +#' @export check_files <- function(filenames) { missing <- !file.exists(filenames) missing_filenames <- paste0('\t', filenames[missing], collapse = '\n') @@ -36,6 +46,10 @@ check_files <- function(filenames) { } } +#' Get the sample name as basename of the file. +#' @param filename string Name of the file. +#' @return string Sample name. +#' @export get_sample_name <- function(filename) { tools::file_path_sans_ext(basename(filename)) } diff --git a/R/utils.R b/R/utils.R index 6441ad0..db06a3b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -8,6 +8,7 @@ register_functions_to_cluster <- function(cluster) { 'prof.to.features', 'load.lcms', 'adaptive.bin', + 'add_feature_ids', 'find.turn.point', 'msExtrema', 'find_local_maxima', @@ -45,14 +46,17 @@ register_functions_to_cluster <- function(cluster) { 'compute_uniq_grp', 'predict_smoothed_rt', 'label_val_to_keep', - "create_rows", + "create_features_from_cluster", "validate_contents", - "select_mz", - "select_rt", "find_optima", "filter_based_on_density", "create_output", + "create_metadata", + "create_rt_row", + "create_intensity_row", "comb", + "clean_data_matrix", + "create_aligned_feature_table", 'bigauss.esti.EM', 'solve_sigma', 'prep_uv', @@ -81,7 +85,10 @@ register_functions_to_cluster <- function(cluster) { 'get_mzrange_bound_indices', 'compute_mass_density', 'l2normalize', - 'compute_peaks_and_valleys' + 'compute_peaks_and_valleys', + 'as_feature_sample_table', + 'check_files', + 'get_sample_name' )) snow::clusterEvalQ(cluster, library("dplyr")) snow::clusterEvalQ(cluster, library("stringr")) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index c903498..1e98906 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -29,3 +29,4 @@ dependencies: - r-httpgd - r-microbenchmark - r-covr + - r-plyr diff --git a/conda/environment.yaml b/conda/environment.yaml index 3ab1060..1b2776b 100644 --- a/conda/environment.yaml +++ b/conda/environment.yaml @@ -18,4 +18,5 @@ dependencies: - r-tidyr - r-stringr - r-tibble + - r-plyr diff --git a/man/clean_data_matrix.Rd b/man/clean_data_matrix.Rd new file mode 100644 index 0000000..e535404 --- /dev/null +++ b/man/clean_data_matrix.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{clean_data_matrix} +\alias{clean_data_matrix} +\title{Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble} +\usage{ +clean_data_matrix(x, sample_names) +} +\arguments{ +\item{x}{A dataframe} + +\item{sample_names}{List of sample names.} +} +\value{ +Cleaned tibble. +} +\description{ +Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble +} diff --git a/man/comb.Rd b/man/comb.Rd new file mode 100644 index 0000000..2087909 --- /dev/null +++ b/man/comb.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{comb} +\alias{comb} +\title{Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.} +\usage{ +comb(x, ...) +} +\value{ +Tibbles combining the output (metadata, intensity and RT respectively) from different clusters. +} +\description{ +Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble. +} diff --git a/man/compute_clusters_simple.Rd b/man/compute_clusters_simple.Rd index 09ab9f3..3a4855b 100644 --- a/man/compute_clusters_simple.Rd +++ b/man/compute_clusters_simple.Rd @@ -7,7 +7,7 @@ compute_clusters_simple(feature_tables, sample_names, mz_tol_ppm, rt_tol) } \arguments{ -\item{feature_tables}{list of tibbles List of feature tables coming from all samples.} +\item{feature_tables}{list of tibbles feature tables coming from all samples.} \item{sample_names}{list of strings Sample names of the feature tables used to distinguish the samples.} diff --git a/man/create_aligned_feature_table.Rd b/man/create_aligned_feature_table.Rd index 034df37..66a676d 100644 --- a/man/create_aligned_feature_table.Rd +++ b/man/create_aligned_feature_table.Rd @@ -30,7 +30,7 @@ percentage of the m/z value. This value, multiplied by the m/z value, becomes th \item{cluster}{The number of CPU cores to be used} } \value{ -A tibble with three tables containing aligned metadata, intensities an RTs. +A list of 3 tibbles containing aligned metadata, intensities an RTs. } \description{ Align peaks from spectra into a feature table. diff --git a/man/create_features_from_cluster.Rd b/man/create_features_from_cluster.Rd new file mode 100644 index 0000000..84bebba --- /dev/null +++ b/man/create_features_from_cluster.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_features_from_cluster} +\alias{create_features_from_cluster} +\title{Group the mz and RT for particular cluster.} +\usage{ +create_features_from_cluster( + features, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + sample_names +) +} +\arguments{ +\item{features}{The features table subsetted for a particular cluster.} + +\item{mz_tol_relative}{The m/z tolerance level for peak alignment.} + +\item{rt_tol_relative}{The retention time tolerance level for peak alignment.} + +\item{min_occurrence}{A minimal number of profiles a feature has to be present in.} + +\item{sample_names}{A list of sample names.} +} +\value{ +A list containing 3 tibbles: metadata, intensities and RTs. +} +\description{ +Group the mz and RT for particular cluster. +} diff --git a/man/create_intensity_row.Rd b/man/create_intensity_row.Rd new file mode 100644 index 0000000..0976c45 --- /dev/null +++ b/man/create_intensity_row.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_intensity_row} +\alias{create_intensity_row} +\title{Compute summed area for each sample} +\usage{ +create_intensity_row(sample_grouped) +} +\arguments{ +\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.} +} +\value{ +Summed area for each sample. +} +\description{ +Compute summed area for each sample +} diff --git a/man/create_output.Rd b/man/create_output.Rd new file mode 100644 index 0000000..9c1223e --- /dev/null +++ b/man/create_output.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_output} +\alias{create_output} +\title{Create a list containing 3 tibbles: metadata, intensities and RTs.} +\usage{ +create_output(sample_grouped, sample_names) +} +\arguments{ +\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.} + +\item{sample_names}{A list of sample names.} +} +\value{ +A list containing 3 tibbles: metadata, intensities and RTs. +} +\description{ +Create a list containing 3 tibbles: metadata, intensities and RTs. +} diff --git a/man/create_rt_row.Rd b/man/create_rt_row.Rd new file mode 100644 index 0000000..0059fb9 --- /dev/null +++ b/man/create_rt_row.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_rt_row} +\alias{create_rt_row} +\title{Compute median RT for each sample} +\usage{ +create_rt_row(sample_grouped) +} +\arguments{ +\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.} +} +\value{ +Median RT for each sample. +} +\description{ +Compute median RT for each sample +} diff --git a/man/filter_based_on_density.Rd b/man/filter_based_on_density.Rd new file mode 100644 index 0000000..fe85d5c --- /dev/null +++ b/man/filter_based_on_density.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{filter_based_on_density} +\alias{filter_based_on_density} +\title{Subset data within lower and upper bound from density estimation} +\usage{ +filter_based_on_density(sample, turns, index, i) +} +\arguments{ +\item{sample}{A subset of the features_table.} + +\item{turns}{A list of peaks and valleys positions.} + +\item{index}{Whether it subsets on m/z [1] or RT [2] column.} + +\item{i}{Iterates over the peaks in the turns list.} +} +\value{ +Dataframe subsetted within lower and upper bound from density estimation. +} +\description{ +Subset data within lower and upper bound from density estimation +} diff --git a/man/find_optima.Rd b/man/find_optima.Rd new file mode 100644 index 0000000..81e681b --- /dev/null +++ b/man/find_optima.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{find_optima} +\alias{find_optima} +\title{Compute the kernel density estimation and find the peaks and valleys of a smooth curve.} +\usage{ +find_optima(data, bandwidth) +} +\arguments{ +\item{data}{A vector of m/z or RTs for a particular cluster.} + +\item{bandwidth}{A bandwidth value for the KDE computation.} +} +\value{ +A list of peaks and valleys positions. +} +\description{ +Compute the kernel density estimation and find the peaks and valleys of a smooth curve. +} diff --git a/man/remove_noise.Rd b/man/remove_noise.Rd index debe393..e310fcd 100644 --- a/man/remove_noise.Rd +++ b/man/remove_noise.Rd @@ -13,7 +13,8 @@ remove_noise( baseline_correct_noise_percentile, intensity_weighted, do.plot, - cache + cache, + grouping_threshold = Inf ) } \arguments{ @@ -40,6 +41,8 @@ run filter, to be used as the baseline threshold of signal strength.} \item{do.plot}{Indicates whether plot should be drawn.} \item{cache}{Whether to use cache} + +\item{grouping_threshold}{The maximum difference between two scans to be considered the same EIC. Default is Inf.} } \value{ A matrix with four columns: m/z value, retention time, intensity, and group number. diff --git a/man/validate_contents.Rd b/man/validate_contents.Rd new file mode 100644 index 0000000..e8f9889 --- /dev/null +++ b/man/validate_contents.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{validate_contents} +\alias{validate_contents} +\title{Validates if the data is present in more than "min_occurence" of samples.} +\usage{ +validate_contents(samples, min_occurrence) +} +\arguments{ +\item{samples}{A subset of the features_table.} + +\item{min_occurrence}{A minimal number of profiles a feature has to be present in.} +} +\value{ +boolean value whether it is TRUE or FALSE. +} +\description{ +Validates if the data is present in more than "min_occurence" of samples. +} diff --git a/tests/testdata/aligned/output_create-features.rds b/tests/testdata/aligned/output_create-features.rds new file mode 100644 index 0000000..d184014 Binary files /dev/null and b/tests/testdata/aligned/output_create-features.rds differ diff --git a/tests/testdata/input/feature-align_create-features.parquet b/tests/testdata/input/feature-align_create-features.parquet new file mode 100644 index 0000000..32e5c5d Binary files /dev/null and b/tests/testdata/input/feature-align_create-features.parquet differ diff --git a/tests/testthat/test-feature-align.R b/tests/testthat/test-feature-align.R index b60f27b..a267541 100644 --- a/tests/testthat/test-feature-align.R +++ b/tests/testthat/test-feature-align.R @@ -1,3 +1,9 @@ +update_expected <- function(actual) { + arrow::write_parquet(actual$metadata, file.path("..", "testdata", "aligned", "metadata_table.parquet")) + arrow::write_parquet(actual$intensity, file.path("..", "testdata", "aligned", "intensity_table.parquet")) + arrow::write_parquet(actual$rt, file.path("..", "testdata", "aligned", "rt_table.parquet")) +} + patrick::with_parameters_test_that( "feature.align test", { @@ -66,12 +72,16 @@ patrick::with_parameters_test_that( get_num_workers() ) - aligned_expected <- list( - metadata = arrow::read_parquet(file.path(testdata, "aligned", "metadata_table.parquet")), - intensity = arrow::read_parquet(file.path(testdata, "aligned", "intensity_table.parquet")), - rt = arrow::read_parquet(file.path(testdata, "aligned", "rt_table.parquet")) + aligned_expected <- load_aligned_features( + file.path(testdata, "aligned", "metadata_table.parquet"), + file.path(testdata, "aligned", "intensity_table.parquet"), + file.path(testdata, "aligned", "rt_table.parquet"), + file.path(testdata, "aligned", "tolerances.parquet") ) + aligned_expected["mz_tol_relative"] <- NULL + aligned_expected["rt_tol_relative"] <- NULL + expect_equal(aligned_actual, aligned_expected) }, patrick::cases( diff --git a/tests/testthat/test-feature-align_select-mz.R b/tests/testthat/test-feature-align_select-mz.R new file mode 100644 index 0000000..59261c5 --- /dev/null +++ b/tests/testthat/test-feature-align_select-mz.R @@ -0,0 +1,16 @@ +test_that("create_features_from_cluster() function works", { + sample <- read_parquet("../testdata/input/feature-align_create-features.parquet") + sample_names <- c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened") + min_occurrence <- 2 + mz_tol_relative <- 6.85676325338646e-06 + rt_tol_relative <- 2.17918873407775 + + actual <- create_features_from_cluster(sample, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + sample_names) + + expected <- readRDS("../testdata/aligned/output_create-features.rds") + expect_equal(actual, expected) +}) \ No newline at end of file diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R index ed28fd3..8a064ba 100644 --- a/tests/testthat/test-hybrid.R +++ b/tests/testthat/test-hybrid.R @@ -25,6 +25,7 @@ patrick::with_parameters_test_that("basic hybrid test", { actual <- as_tibble(result$recovered_feature_sample_table) keys <- c("mz", "rt", "sample", "sample_rt", "sample_intensity") + # arrow::write_parquet(actual, file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet"))) expected <- arrow::read_parquet( file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet")) )