diff --git a/DESCRIPTION b/DESCRIPTION
index 3b1effc..5acef3f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu.
         It takes the adaptive processing of LC/MS metabolomics data further
         with focus on high resolution MS for both LC and GC applications.
 Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach,
-        snow, dplyr, tidyr, stringr, tibble, tools, arrow
+        snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr
 biocViews: Technology, MassSpectrometry
 License: GPL-2
 LazyLoad: yes
diff --git a/NAMESPACE b/NAMESPACE
index 1061d6a..c5c8173 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,11 +1,15 @@
 # Generated by roxygen2: do not edit by hand
 
 export(adaptive.bin)
+export(add_feature_ids)
 export(adjust.time)
 export(aggregate_by_rt)
+export(as_feature_sample_table)
 export(bigauss.esti)
 export(bigauss.esti.EM)
 export(bigauss.mix)
+export(check_files)
+export(clean_data_matrix)
 export(comb)
 export(compute_boundaries)
 export(compute_bounds)
@@ -16,6 +20,7 @@ export(compute_clusters)
 export(compute_clusters_simple)
 export(compute_comb)
 export(compute_corrected_features)
+export(compute_corrected_features_v2)
 export(compute_curr_rec_with_enough_peaks)
 export(compute_delta_rt)
 export(compute_densities)
@@ -40,10 +45,14 @@ export(compute_template)
 export(compute_template_adjusted_rt)
 export(compute_uniq_grp)
 export(correct_time)
+export(correct_time_v2)
 export(count_peaks)
 export(create_aligned_feature_table)
+export(create_features_from_cluster)
+export(create_intensity_row)
+export(create_metadata)
 export(create_output)
-export(create_rows)
+export(create_rt_row)
 export(draw_rt_correction_plot)
 export(draw_rt_normal_peaks)
 export(duplicate.row.remove)
@@ -59,6 +68,7 @@ export(get_features_in_rt_range)
 export(get_mzrange_bound_indices)
 export(get_num_workers)
 export(get_rt_region_indices)
+export(get_sample_name)
 export(get_single_occurrence_mask)
 export(get_times_to_use)
 export(hybrid)
@@ -90,8 +100,6 @@ export(remove_noise)
 export(rev_cum_sum)
 export(rm.ridge)
 export(run_filter)
-export(select_mz)
-export(select_rt)
 export(semi.sup)
 export(solve_a)
 export(solve_sigma)
diff --git a/R/adjust.time.R b/R/adjust.time.R
index fe896d1..1d0f5a8 100644
--- a/R/adjust.time.R
+++ b/R/adjust.time.R
@@ -2,6 +2,10 @@
 NULL
 #> NULL
 
+#' Combine template and sample features
+#' @param template_features Tibble Template feature table (mz, rt, cluster, sample_id).
+#' @param features Tibble Sample feature table (mz, rt, cluster, sample_id).
+#' @return Tibble Combined feature table (rbind).
 #' @export
 compute_comb <- function(template_features, features) {
   combined <- dplyr::bind_rows(
@@ -11,6 +15,12 @@ compute_comb <- function(template_features, features) {
   return(combined)
 }
 
+#' Select features to use for retention time alignment
+#' @description This function selects features present in both the sample
+#' feature table and template feature table given they have the same cluster,
+#' are adjacent in the combined table.
+#' @param combined Tibble Table with (mz, rt, cluster, sample_id).
+#' @return List of bool Returns list of bools with TRUE at each index where this condition is met.
 #' @export
 compute_sel <- function(combined) {
   l <- nrow(combined)
@@ -19,6 +29,11 @@ compute_sel <- function(combined) {
   return(sel)
 }
 
+#' Create two column table with paired sample and template retention times.
+#' @param combined Tibble Table with features from sample and template.
+#' @param sel list of bools List of bools indiciating which features to pair.
+#' See 'compute_sel'.
+#' @param j string Template sample_id.
 #' @export
 compute_template_adjusted_rt <- function(combined, sel, j) {
   all_features <- cbind(combined$rt[sel], combined$rt[sel + 1])
@@ -34,6 +49,13 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
   return(all_features)
 }
 
+#' Correct the rt in feature table based on paired feature rts and differences.
+#' @description This is a newer implementation based on dplyr which might be more efficient than the other function.
+#' @param features Tibble The feature table for which to correct rts.
+#' @param template_rt List of floats Template retention times for the paired features.
+#' @param delta_rt List of floats Differences between the paired rts.
+#' @return Tibble A table with corrected retention times.
+#' @export
 compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
   features <- features |> dplyr::arrange_at(c("rt", "mz"))
   idx <- dplyr::between(features$rt, min(template_rt), max(template_rt))
@@ -59,20 +81,25 @@ compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
   return(features |> dplyr::arrange_at(c("mz", "rt")))
 }
 
+#' Correct the rt in feature table based on paired feature rts and differences.
+#' @param features Tibble The feature table for which to correct rts.
+#' @param template_rt List of floats Template retention times for the paired features.
+#' @param delta_rt List of floats Differences between the paired rts.
+#' @return Tibble A table with corrected retention times.
 #' @export
-compute_corrected_features <- function(features, delta_rt, avg_time) {
+compute_corrected_features <- function(features, template_rt, delta_rt) {
   features <- features |> dplyr::arrange_at(c("rt", "mz"))
 
   corrected <- features$rt
   original <- features$rt
 
-  idx <- dplyr::between(original, min(delta_rt), max(delta_rt))
+  idx <- dplyr::between(original, min(template_rt), max(template_rt))
   to_correct <- original[idx]
   this.smooth <- ksmooth(
+    template_rt,
     delta_rt,
-    avg_time,
     kernel = "normal",
-    bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
+    bandwidth = (max(template_rt) - min(template_rt)) / 5,
     x.points = to_correct
   )
 
@@ -80,8 +107,8 @@ compute_corrected_features <- function(features, delta_rt, avg_time) {
   lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
   upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
 
-  idx_lower <- original < min(delta_rt)
-  idx_upper <- original > max(delta_rt)
+  idx_lower <- original < min(template_rt)
+  idx_upper <- original > max(template_rt)
 
   corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment
   corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment
@@ -91,6 +118,10 @@ compute_corrected_features <- function(features, delta_rt, avg_time) {
   return(features)
 }
 
+#' Fill missing values based on original retention times.
+#' @param orig.features Non-corrected feature table.
+#' @param this.features Feature table with eventual missing values.
+#' @return Tibble Feature table with filles values.
 #' @export
 fill_missing_values <- function(orig.feature, this.feature) {
   missing_values <- which(is.na(this.feature$rt))
@@ -104,6 +135,10 @@ fill_missing_values <- function(orig.feature, this.feature) {
   return(this.feature)
 }
 
+#' Function to perform retention time correction
+#' @param this.feature Tibble Feature table for which to correct rt.
+#' @param template_features Tibble Template feature table to use for correction.
+#' @return Tibble this.feature table with corrected rt values.
 #' @export
 correct_time <- function(this.feature, template_features) {
     orig.features <- this.feature
@@ -137,6 +172,10 @@ correct_time <- function(this.feature, template_features) {
   return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster")))
 }
 
+#' Select the template feature table.
+#' @description The current implementation selects the table with the most features as the template.
+#' @param extracted_features List of tables Tables from which to select the template.
+#' @return Tibble Template feature table.
 #' @export
 compute_template <- function(extracted_features) {
   num.ftrs <- sapply(extracted_features, nrow)
@@ -149,6 +188,14 @@ compute_template <- function(extracted_features) {
   return(tibble::as_tibble(template_features))
 }
 
+#' Rewritten version of 'correct_time'
+#' @description This function uses dplyr to do the same as
+#' 'correct_time', just with less code. Most functions used in the original
+#' function are replaced with simple data transformations.
+#' @param features Tibble Table with features to correct.
+#' @param template Tibble Template feature table to use for correction.
+#' @return Tibble Corrected feature table.
+#' @export
 correct_time_v2 <- function(features, template) {
   if (unique(features$sample_id) == unique(template$sample_id))
     return(tibble::as_tibble(features))
diff --git a/R/feature.align.R b/R/feature.align.R
index acc239b..5f541a0 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -1,114 +1,181 @@
-#' @import foreach
-
-create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) {
-    features <- new("list")
-    features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames)
-    features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames)
-    features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames)
-    return(features)
+#' Create a metadata row tibble with min, max and mean mz and RT values.
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @param sample_names A list of sample names.
+#' @export
+create_metadata <- function(sample_grouped, sample_names) {
+  sample_presence <- sapply(sample_names,
+    FUN = function(x) {
+      as.numeric(any(sample_grouped$sample_id == x))
+    }
+  )
+
+  metadata_row <- dplyr::summarise(
+    sample_grouped,
+    mzmean = mean(mz),
+    mzmin = min(mz),
+    mzmax = max(mz),
+    rtmean = mean(rt),
+    rtmin = min(rt),
+    rtmax = max(rt),
+    npeaks = n()
+  ) %>% rename(mz = "mzmean", rt = "rtmean")
+
+  metadata_row <- dplyr::bind_cols(metadata_row, as.list(sample_presence))
+  return(metadata_row)
 }
 
+#' Compute summed area for each sample
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @return Summed area for each sample.
 #' @export
-create_output <- function(sample_grouped, sample_names) {
-    number_of_samples <- length(sample_names)
-    intensity_row <- rep(0, number_of_samples)
-    rt_row <- rep(0, number_of_samples)
-    sample_presence <- rep(0, number_of_samples)
-
-    for (i in seq_along(intensity_row)) {
-        filtered <- filter(sample_grouped, sample_id == sample_names[i])
-        if (nrow(filtered) != 0) {
-            sample_presence[i] <- 1
-            intensity_row[i] <- sum(filtered$area)
-            rt_row[i] <- median(filtered$rt)
-        }
-    }
+create_intensity_row <- function(sample_grouped) {
+  sample_grouped %>%
+    dplyr::group_by(sample_id) %>%
+    dplyr::summarise(intensity = sum(area)) %>%
+    tidyr::pivot_wider(names_from = "sample_id", values_from = "intensity")
+}
 
-    mz <- sample_grouped$mz
-    rt <- sample_grouped$rt
-    metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence)
+#' Compute median RT for each sample
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @return Median RT for each sample.
+#' @export
+create_rt_row <- function(sample_grouped) {
+  sample_grouped %>%
+    dplyr::group_by(sample_id) %>%
+    dplyr::summarise(rt = median(rt)) %>%
+    tidyr::pivot_wider(names_from = "sample_id", values_from = "rt")
+}
 
-    return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row))
+#' Create a list containing 3 tibbles: metadata, intensities and RTs.
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @param sample_names A list of sample names.
+#' @return A list containing 3 tibbles: metadata, intensities and RTs.
+#' @export
+create_output <- function(sample_grouped, sample_names) {
+  metadata_row <- create_metadata(sample_grouped, sample_names)
+  intensity_row <- create_intensity_row(sample_grouped)
+  rt_row <- create_rt_row(sample_grouped)
+
+  return(list(
+    metadata_row = metadata_row,
+    intensity_row = intensity_row,
+    rt_row = rt_row
+  ))
 }
 
+#' Validates if the data is present in more than "min_occurence" of samples.
+#' @param samples A subset of the features_table.
+#' @param min_occurrence A minimal number of profiles a feature has to be present in.
+#' @return boolean value whether it is TRUE or FALSE.
 #' @export
 validate_contents <- function(samples, min_occurrence) {
-    # validate whether data is still from at least 'min_occurrence' number of samples
-    if (!is.null(nrow(samples))) {
-        if (length(unique(samples$sample_id)) >= min_occurrence) {
-            return(TRUE)
-        }
-        return(FALSE)
+  if (!is.null(nrow(samples))) {
+    if (length(unique(samples$sample_id)) >= min_occurrence) {
+      return(TRUE)
     }
     return(FALSE)
+  }
+  return(FALSE)
 }
 
+#' Compute the kernel density estimation and find the peaks and valleys of a smooth curve.
+#' @param data A vector of m/z or RTs for a particular cluster.
+#' @param bandwidth A bandwidth value for the KDE computation.
+#' @return A list of peaks and valleys positions.
 #' @export
 find_optima <- function(data, bandwidth) {
-    # Kernel Density Estimation
-    den <- density(data, bw = bandwidth)
-    # select statistically significant points
-    turns <- find.turn.point(den$y)
-    return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys]))
+  den <- density(data, bw = bandwidth)
+  turns <- find.turn.point(den$y)
+  return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys]))
 }
 
+#' Subset data within lower and upper bound from density estimation
+#' @param sample A subset of the features_table.
+#' @param turns A list of peaks and valleys positions.
+#' @param index Whether it subsets on m/z [1] or RT [2] column.
+#' @param i Iterates over the peaks in the turns list.
+#' @return Dataframe subsetted within lower and upper bound from density estimation.
 #' @export
 filter_based_on_density <- function(sample, turns, index, i) {
-    # select data within lower and upper bound from density estimation
-    lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]])
-    upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]])
-    selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound)
-    return(sample[selected, ])
+  lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]])
+  upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]])
+  selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound)
+  return(sample[selected, ])
 }
 
+#' Group the mz and RT for particular cluster.
+#' @param features The features table subsetted for a particular cluster.
+#' @param mz_tol_relative The m/z tolerance level for peak alignment.
+#' @param rt_tol_relative The retention time tolerance level for peak alignment.
+#' @param min_occurrence A minimal number of profiles a feature has to be present in.
+#' @param sample_names A list of sample names.
+#' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
-select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
-    turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
-    for (i in seq_along(turns$peaks)) {
-        sample_grouped <- filter_based_on_density(sample, turns, 2, i)
-        if (validate_contents(sample_grouped, min_occurrence)) {
-            return(create_output(sample_grouped, sample_names))
+create_features_from_cluster <- function(features,
+                                         mz_tol_relative,
+                                         rt_tol_relative,
+                                         min_occurrence,
+                                         sample_names) {
+  if (!validate_contents(features, min_occurrence)) {
+    return(NULL)
+  }
+
+  # create empty tibble rows
+  metadata <- NULL
+  intensity <- NULL
+  rt <- NULL
+
+  # split according to mz values
+  turns_mz <- find_optima(features$mz, bandwidth = mz_tol_relative * median(features$mz))
+  for (i in seq_along(turns_mz$peaks)) {
+    sample_grouped_mz <- filter_based_on_density(features, turns_mz, 1, i)
+    if (validate_contents(sample_grouped_mz, min_occurrence)) {
+      # split according to rt values
+      turns_rt <- find_optima(sample_grouped_mz$rt, bandwidth = rt_tol_relative / 1.414)
+      for (ii in seq_along(turns_rt$peaks)) {
+        sample_grouped_rt <- filter_based_on_density(sample_grouped_mz, turns_rt, 2, ii)
+
+        # create output rows if valid
+        if (validate_contents(sample_grouped_rt, min_occurrence)) {
+          metadata <- dplyr::bind_rows(metadata, create_metadata(sample_grouped_rt, sample_names))
+          intensity <- dplyr::bind_rows(intensity, create_intensity_row(sample_grouped_rt))
+          rt <- dplyr::bind_rows(rt, create_rt_row(sample_grouped_rt))
         }
+      }
     }
+  }
+
+  return(list(metadata_row = metadata, intensity_row = intensity, rt_row = rt))
 }
 
+#' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.
+#' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters.
 #' @export
-select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) {
-    turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
-    for (i in seq_along(turns$peaks)) {
-        sample_grouped <- filter_based_on_density(sample, turns, 1, i)
-        if (validate_contents(sample_grouped, min_occurrence)) {
-            return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names))
-        }
-    }
+comb <- function(x, ...) {
+  mapply(plyr::rbind.fill, x, ..., SIMPLIFY = FALSE)
 }
 
+#' Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble
+#' @param x A dataframe
+#' @param sample_names List of sample names.
+#' @return Cleaned tibble.
 #' @export
-create_rows <- function(features,
-                        i,
-                        sel.labels,
-                        mz_tol_relative,
-                        rt_tol_relative,
-                        min_occurrence,
-                        sample_names) {
-    if (i %% 100 == 0) {
-        gc()
-    } # call Garbage Collection for performance improvement?
-
-    sample <- dplyr::filter(features, cluster == sel.labels[i])
-    if (nrow(sample) > 1) {
-        if (validate_contents(sample, min_occurrence)) {
-            return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
-        }
-    } else if (min_occurrence == 1) {
-        return(create_output(sample_grouped, sample_names))
-    }
-    return(NULL)
+clean_data_matrix <- function(x, sample_names) {
+  x <- x %>%
+    replace(is.na(.), 0) %>%
+    dplyr::relocate(sample_names) |>
+    add_feature_ids()
+  return(x)
 }
 
+#' Add `id` column to a dataframe
+#' @param x A dataframe
+#' @return The same dataframe but with an additional `id` column 
+#' in first place which contains the rownames.
 #' @export
-comb <- function(x, ...) {
-    mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE)))
+add_feature_ids <- function(x) {
+  x$id <- as.numeric(rownames(x))
+  return(tibble::as_tibble(x |> dplyr::relocate(id)))
 }
 
 #' Align peaks from spectra into a feature table.
@@ -122,7 +189,7 @@ comb <- function(x, ...) {
 #' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which
 #'  allows the program to search for the tolerance level based on the data.
 #' @param cluster The number of CPU cores to be used
-#' @return A tibble with three tables containing aligned metadata, intensities an RTs.
+#' @return A list of 3 tibbles containing aligned metadata, intensities an RTs.
 #'
 #' @export
 create_aligned_feature_table <- function(features_table,
@@ -131,55 +198,37 @@ create_aligned_feature_table <- function(features_table,
                                          rt_tol_relative,
                                          mz_tol_relative,
                                          cluster = 4) {
-    if (!is(cluster, "cluster")) {
-        cluster <- parallel::makeCluster(cluster)
-        on.exit(parallel::stopCluster(cluster))
-        
-        # NOTE: side effect (doParallel has no functionality to clean up)
-        doParallel::registerDoParallel(cluster)
-        register_functions_to_cluster(cluster)
-    }
-
-
-
-    number_of_samples <- length(sample_names)
-    metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
-    intensity_colnames <- c("id", sample_names)
-    rt_colnames <- c("id", sample_names)
-
-    aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames)
-
-    # table with number of values per group
-    groups_cardinality <- table(features_table$cluster)
-    # count those with minimal occurrence
-    sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
-
-    # retention time alignment
-    aligned_features <- foreach::foreach(
-        i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE
-    ) %dopar% {
-        rows <- create_rows(
-            features_table,
-            i,
-            sel.labels,
-            mz_tol_relative,
-            rt_tol_relative,
-            min_occurrence,
-            sample_names
-        )
-
-        if (!is.null(rows)) {
-            rows$metadata_row <- c(i, rows$metadata_row)
-            rows$intensity_row <- c(i, rows$intensity_row)
-            rows$rt_row <- c(i, rows$rt_row)
-        }
-
-        list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row)
-    }
-
-    colnames(aligned_features$metadata) <- metadata_colnames
-    colnames(aligned_features$intensity) <- intensity_colnames
-    colnames(aligned_features$rt) <- rt_colnames
-
-    return(aligned_features)
+  if (!is(cluster, "cluster")) {
+    cluster <- parallel::makeCluster(cluster)
+    on.exit(parallel::stopCluster(cluster))
+
+    # NOTE: side effect (doParallel has no functionality to clean up)
+    doParallel::registerDoParallel(cluster)
+    register_functions_to_cluster(cluster)
+  }
+
+  # table with number of values per group
+  groups_cardinality <- table(features_table$cluster)
+  # count those with minimal occurrence
+  sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
+
+  # retention time alignment
+  aligned_features <- foreach::foreach(
+    i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE
+  ) %do% {
+    rows <- create_features_from_cluster(
+      dplyr::filter(features_table, cluster == sel.labels[i]),
+      mz_tol_relative,
+      rt_tol_relative,
+      min_occurrence,
+      sample_names
+    )
+    list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row)
+  }
+
+  aligned_features$intensity <- clean_data_matrix(aligned_features$intensity, sample_names)
+  aligned_features$rt <- clean_data_matrix(aligned_features$rt, sample_names)
+  aligned_features$metadata <- add_feature_ids(aligned_features$metadata)
+
+  return(aligned_features)
 }
diff --git a/R/unsupervised.R b/R/unsupervised.R
index 3b7070b..4093b9c 100644
--- a/R/unsupervised.R
+++ b/R/unsupervised.R
@@ -2,6 +2,13 @@
 NULL
 #> NULL
 
+#' Read the metadata table, retention time data matrix and intensity data matrix
+#' and combine them into a single table
+#' @param metadata Tibble Feature metadata table with information concerning the peaks.
+#' @param rt_crosstab Tibble Data matrix with features on rows and samples on columns holding rt data.
+#' @param int_crosstab Tibble Data matrix with features on rows and samples on columns holding intensity data.
+#' @return Tibble A merged table containing all information.
+#' @export 
 as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) {
   feature_names <- as.character(rt_crosstab$id)
   sample_names <- colnames(metadata)[-c(1:8)]
@@ -27,6 +34,9 @@ as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) {
   return(data)
 }
 
+#' Check files whether they exist.
+#' @param filenames list of filenames Filenames to check whether they exist.
+#' @export
 check_files <- function(filenames) {
   missing <- !file.exists(filenames)
   missing_filenames <- paste0('\t', filenames[missing], collapse = '\n')
@@ -36,6 +46,10 @@ check_files <- function(filenames) {
   }
 }
 
+#' Get the sample name as basename of the file.
+#' @param filename string Name of the file.
+#' @return string Sample name.
+#' @export
 get_sample_name <- function(filename) {
   tools::file_path_sans_ext(basename(filename))
 }
diff --git a/R/utils.R b/R/utils.R
index 6441ad0..db06a3b 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -8,6 +8,7 @@ register_functions_to_cluster <- function(cluster) {
         'prof.to.features',
         'load.lcms',
         'adaptive.bin',
+        'add_feature_ids',
         'find.turn.point',
         'msExtrema',
         'find_local_maxima',
@@ -45,14 +46,17 @@ register_functions_to_cluster <- function(cluster) {
         'compute_uniq_grp',
         'predict_smoothed_rt',
         'label_val_to_keep',
-        "create_rows",
+        "create_features_from_cluster",
         "validate_contents",
-        "select_mz",
-        "select_rt",
         "find_optima",
         "filter_based_on_density",
         "create_output",
+        "create_metadata",
+        "create_rt_row",
+        "create_intensity_row",
         "comb",
+        "clean_data_matrix",
+        "create_aligned_feature_table",
         'bigauss.esti.EM',
         'solve_sigma',
         'prep_uv',
@@ -81,7 +85,10 @@ register_functions_to_cluster <- function(cluster) {
         'get_mzrange_bound_indices',
         'compute_mass_density',
         'l2normalize',
-        'compute_peaks_and_valleys'
+        'compute_peaks_and_valleys',
+        'as_feature_sample_table',
+        'check_files',
+        'get_sample_name'
     ))
     snow::clusterEvalQ(cluster, library("dplyr"))
     snow::clusterEvalQ(cluster, library("stringr"))
diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml
index c903498..1e98906 100644
--- a/conda/environment-dev.yaml
+++ b/conda/environment-dev.yaml
@@ -29,3 +29,4 @@ dependencies:
   - r-httpgd
   - r-microbenchmark
   - r-covr
+  - r-plyr
diff --git a/conda/environment.yaml b/conda/environment.yaml
index 3ab1060..1b2776b 100644
--- a/conda/environment.yaml
+++ b/conda/environment.yaml
@@ -18,4 +18,5 @@ dependencies:
   - r-tidyr
   - r-stringr
   - r-tibble
+  - r-plyr
 
diff --git a/man/clean_data_matrix.Rd b/man/clean_data_matrix.Rd
new file mode 100644
index 0000000..e535404
--- /dev/null
+++ b/man/clean_data_matrix.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{clean_data_matrix}
+\alias{clean_data_matrix}
+\title{Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble}
+\usage{
+clean_data_matrix(x, sample_names)
+}
+\arguments{
+\item{x}{A dataframe}
+
+\item{sample_names}{List of sample names.}
+}
+\value{
+Cleaned tibble.
+}
+\description{
+Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble
+}
diff --git a/man/comb.Rd b/man/comb.Rd
new file mode 100644
index 0000000..2087909
--- /dev/null
+++ b/man/comb.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{comb}
+\alias{comb}
+\title{Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.}
+\usage{
+comb(x, ...)
+}
+\value{
+Tibbles combining the output (metadata, intensity and RT respectively) from different clusters.
+}
+\description{
+Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.
+}
diff --git a/man/compute_clusters_simple.Rd b/man/compute_clusters_simple.Rd
index 09ab9f3..3a4855b 100644
--- a/man/compute_clusters_simple.Rd
+++ b/man/compute_clusters_simple.Rd
@@ -7,7 +7,7 @@
 compute_clusters_simple(feature_tables, sample_names, mz_tol_ppm, rt_tol)
 }
 \arguments{
-\item{feature_tables}{list of tibbles List of feature tables coming from all samples.}
+\item{feature_tables}{list of tibbles feature tables coming from all samples.}
 
 \item{sample_names}{list of strings Sample names of the feature tables used to distinguish the samples.}
 
diff --git a/man/create_aligned_feature_table.Rd b/man/create_aligned_feature_table.Rd
index 034df37..66a676d 100644
--- a/man/create_aligned_feature_table.Rd
+++ b/man/create_aligned_feature_table.Rd
@@ -30,7 +30,7 @@ percentage of the m/z value. This value, multiplied by the m/z value, becomes th
 \item{cluster}{The number of CPU cores to be used}
 }
 \value{
-A tibble with three tables containing aligned metadata, intensities an RTs.
+A list of 3 tibbles containing aligned metadata, intensities an RTs.
 }
 \description{
 Align peaks from spectra into a feature table.
diff --git a/man/create_features_from_cluster.Rd b/man/create_features_from_cluster.Rd
new file mode 100644
index 0000000..84bebba
--- /dev/null
+++ b/man/create_features_from_cluster.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_features_from_cluster}
+\alias{create_features_from_cluster}
+\title{Group the mz and RT for particular cluster.}
+\usage{
+create_features_from_cluster(
+  features,
+  mz_tol_relative,
+  rt_tol_relative,
+  min_occurrence,
+  sample_names
+)
+}
+\arguments{
+\item{features}{The features table subsetted for a particular cluster.}
+
+\item{mz_tol_relative}{The m/z tolerance level for peak alignment.}
+
+\item{rt_tol_relative}{The retention time tolerance level for peak alignment.}
+
+\item{min_occurrence}{A minimal number of profiles a feature has to be present in.}
+
+\item{sample_names}{A list of sample names.}
+}
+\value{
+A list containing 3 tibbles: metadata, intensities and RTs.
+}
+\description{
+Group the mz and RT for particular cluster.
+}
diff --git a/man/create_intensity_row.Rd b/man/create_intensity_row.Rd
new file mode 100644
index 0000000..0976c45
--- /dev/null
+++ b/man/create_intensity_row.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_intensity_row}
+\alias{create_intensity_row}
+\title{Compute summed area for each sample}
+\usage{
+create_intensity_row(sample_grouped)
+}
+\arguments{
+\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.}
+}
+\value{
+Summed area for each sample.
+}
+\description{
+Compute summed area for each sample
+}
diff --git a/man/create_output.Rd b/man/create_output.Rd
new file mode 100644
index 0000000..9c1223e
--- /dev/null
+++ b/man/create_output.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_output}
+\alias{create_output}
+\title{Create a list containing 3 tibbles: metadata, intensities and RTs.}
+\usage{
+create_output(sample_grouped, sample_names)
+}
+\arguments{
+\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.}
+
+\item{sample_names}{A list of sample names.}
+}
+\value{
+A list containing 3 tibbles: metadata, intensities and RTs.
+}
+\description{
+Create a list containing 3 tibbles: metadata, intensities and RTs.
+}
diff --git a/man/create_rt_row.Rd b/man/create_rt_row.Rd
new file mode 100644
index 0000000..0059fb9
--- /dev/null
+++ b/man/create_rt_row.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_rt_row}
+\alias{create_rt_row}
+\title{Compute median RT for each sample}
+\usage{
+create_rt_row(sample_grouped)
+}
+\arguments{
+\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.}
+}
+\value{
+Median RT for each sample.
+}
+\description{
+Compute median RT for each sample
+}
diff --git a/man/filter_based_on_density.Rd b/man/filter_based_on_density.Rd
new file mode 100644
index 0000000..fe85d5c
--- /dev/null
+++ b/man/filter_based_on_density.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{filter_based_on_density}
+\alias{filter_based_on_density}
+\title{Subset data within lower and upper bound from density estimation}
+\usage{
+filter_based_on_density(sample, turns, index, i)
+}
+\arguments{
+\item{sample}{A subset of the features_table.}
+
+\item{turns}{A list of peaks and valleys positions.}
+
+\item{index}{Whether it subsets on m/z [1] or RT [2] column.}
+
+\item{i}{Iterates over the peaks in the turns list.}
+}
+\value{
+Dataframe subsetted within lower and upper bound from density estimation.
+}
+\description{
+Subset data within lower and upper bound from density estimation
+}
diff --git a/man/find_optima.Rd b/man/find_optima.Rd
new file mode 100644
index 0000000..81e681b
--- /dev/null
+++ b/man/find_optima.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{find_optima}
+\alias{find_optima}
+\title{Compute the kernel density estimation and find the peaks and valleys of a smooth curve.}
+\usage{
+find_optima(data, bandwidth)
+}
+\arguments{
+\item{data}{A vector of m/z or RTs for a particular cluster.}
+
+\item{bandwidth}{A bandwidth value for the KDE computation.}
+}
+\value{
+A list of peaks and valleys positions.
+}
+\description{
+Compute the kernel density estimation and find the peaks and valleys of a smooth curve.
+}
diff --git a/man/remove_noise.Rd b/man/remove_noise.Rd
index debe393..e310fcd 100644
--- a/man/remove_noise.Rd
+++ b/man/remove_noise.Rd
@@ -13,7 +13,8 @@ remove_noise(
   baseline_correct_noise_percentile,
   intensity_weighted,
   do.plot,
-  cache
+  cache,
+  grouping_threshold = Inf
 )
 }
 \arguments{
@@ -40,6 +41,8 @@ run filter, to be used as the baseline threshold of signal strength.}
 \item{do.plot}{Indicates whether plot should be drawn.}
 
 \item{cache}{Whether to use cache}
+
+\item{grouping_threshold}{The maximum difference between two scans to be considered the same EIC. Default is Inf.}
 }
 \value{
 A matrix with four columns: m/z value, retention time, intensity, and group number.
diff --git a/man/validate_contents.Rd b/man/validate_contents.Rd
new file mode 100644
index 0000000..e8f9889
--- /dev/null
+++ b/man/validate_contents.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{validate_contents}
+\alias{validate_contents}
+\title{Validates if the data is present in more than "min_occurence" of samples.}
+\usage{
+validate_contents(samples, min_occurrence)
+}
+\arguments{
+\item{samples}{A subset of the features_table.}
+
+\item{min_occurrence}{A minimal number of profiles a feature has to be present in.}
+}
+\value{
+boolean value whether it is TRUE or FALSE.
+}
+\description{
+Validates if the data is present in more than "min_occurence" of samples.
+}
diff --git a/tests/testdata/aligned/output_create-features.rds b/tests/testdata/aligned/output_create-features.rds
new file mode 100644
index 0000000..d184014
Binary files /dev/null and b/tests/testdata/aligned/output_create-features.rds differ
diff --git a/tests/testdata/input/feature-align_create-features.parquet b/tests/testdata/input/feature-align_create-features.parquet
new file mode 100644
index 0000000..32e5c5d
Binary files /dev/null and b/tests/testdata/input/feature-align_create-features.parquet differ
diff --git a/tests/testthat/test-feature-align.R b/tests/testthat/test-feature-align.R
index b60f27b..a267541 100644
--- a/tests/testthat/test-feature-align.R
+++ b/tests/testthat/test-feature-align.R
@@ -1,3 +1,9 @@
+update_expected <- function(actual) {
+  arrow::write_parquet(actual$metadata, file.path("..", "testdata", "aligned", "metadata_table.parquet"))
+  arrow::write_parquet(actual$intensity, file.path("..", "testdata", "aligned", "intensity_table.parquet"))
+  arrow::write_parquet(actual$rt, file.path("..", "testdata", "aligned", "rt_table.parquet"))
+}
+
 patrick::with_parameters_test_that(
   "feature.align test",
   {
@@ -66,12 +72,16 @@ patrick::with_parameters_test_that(
         get_num_workers()
     )
 
-    aligned_expected <- list(
-      metadata = arrow::read_parquet(file.path(testdata, "aligned", "metadata_table.parquet")),
-      intensity = arrow::read_parquet(file.path(testdata, "aligned", "intensity_table.parquet")),
-      rt = arrow::read_parquet(file.path(testdata, "aligned", "rt_table.parquet"))
+    aligned_expected <- load_aligned_features(
+      file.path(testdata, "aligned", "metadata_table.parquet"),
+      file.path(testdata, "aligned", "intensity_table.parquet"),
+      file.path(testdata, "aligned", "rt_table.parquet"),
+      file.path(testdata, "aligned", "tolerances.parquet")
     )
 
+    aligned_expected["mz_tol_relative"] <- NULL
+    aligned_expected["rt_tol_relative"] <- NULL
+
     expect_equal(aligned_actual, aligned_expected)
   },
   patrick::cases(
diff --git a/tests/testthat/test-feature-align_select-mz.R b/tests/testthat/test-feature-align_select-mz.R
new file mode 100644
index 0000000..59261c5
--- /dev/null
+++ b/tests/testthat/test-feature-align_select-mz.R
@@ -0,0 +1,16 @@
+test_that("create_features_from_cluster() function works", {
+  sample <- read_parquet("../testdata/input/feature-align_create-features.parquet")
+  sample_names <- c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened")
+  min_occurrence <- 2
+  mz_tol_relative <- 6.85676325338646e-06
+  rt_tol_relative <- 2.17918873407775
+
+  actual <- create_features_from_cluster(sample,
+                      mz_tol_relative,
+                      rt_tol_relative,
+                      min_occurrence,
+                      sample_names)
+  
+  expected <- readRDS("../testdata/aligned/output_create-features.rds")
+  expect_equal(actual, expected)
+})
\ No newline at end of file
diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R
index ed28fd3..8a064ba 100644
--- a/tests/testthat/test-hybrid.R
+++ b/tests/testthat/test-hybrid.R
@@ -25,6 +25,7 @@ patrick::with_parameters_test_that("basic hybrid test", {
   actual <- as_tibble(result$recovered_feature_sample_table)
   keys <- c("mz", "rt", "sample", "sample_rt", "sample_intensity")
 
+  # arrow::write_parquet(actual, file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet")))
   expected <- arrow::read_parquet(
     file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet"))
   )