microbiome · TuomasBorman · Oct 4, 2024 · Sep 5, 2024 · Sep 6, 2024 · Sep 6, 2024
diff --git a/inst/extdata/PRISM.RData b/inst/extdata/PRISM.RData
diff --git a/inst/pages/cross_correlation.qmd b/inst/pages/cross_correlation.qmd
@@ -48,15 +48,6 @@ data(HintikkaXOData, package = "mia")
 mae <- HintikkaXOData
 ```
 
-```{r cross-association2}
-library(stringr)
-# Drop those bacteria that do not include information in Phylum or lower levels
-mae[[1]] <- mae[[1]][!is.na(rowData(mae[[1]])$Phylum), ]
-# Clean taxonomy data, so that names do not include additional characters
-rowData(mae[[1]]) <- DataFrame(
-    apply(rowData(mae[[1]]), 2, str_remove, pattern = "._[0-9]__"))
-```
-
 ```{r}
 # Available alternative experiments
 experiments(mae)
@@ -84,7 +75,7 @@ bacterium X is present, is the concentration of metabolite Y lower or higher"?
 
 ```{r cross-correlation5}
 # Agglomerate microbiome data at family level
-mae[[1]] <- agglomerateByPrevalence(mae[[1]], rank = "Family")
+mae[[1]] <- agglomerateByPrevalence(mae[[1]], rank = "Family", na.rm = TRUE)
 # Does log10 transform for microbiome data
 mae[[1]] <- transformAssay(mae[[1]], method = "log10", pseudocount = TRUE)
 

diff --git a/inst/pages/integrated_learner.qmd b/inst/pages/integrated_learner.qmd
@@ -5,8 +5,6 @@ library(rebook)
 chapterPreamble()
 ```
 
-## Multi-omics Integration
-
 In multiview data analysis, there are two main types of approaches: early fusion
 and late fusion. \underline{Early fusion} combines all datasets into a single
 representation, which then serves as input for a supervised learning model. In
@@ -19,161 +17,203 @@ advanced method, called cooperative learning [@Ding2022], which is also known as
 predictions from different data views to align through an agreement parameter
 ($\rho$).
 
-## Multi-omics Prediction and Classification of Binary IBD Disease Status
+## Prediction of fat and prebiotics in diet of rats
 
-In this chapter, we showcase examples of various integration paradigms (early,
-late, and intermediate fusion) using the R packages `multiview` [@Ding2022]
-and `SuperLearner` [@Van2007]. We make use of the publicly available source code
-from the multi-omics integrated framework (`IntegratedLearner`) proposed by
-@Mallick2024.
+As an example we would build a random forest classifier based on theHintikkaxOData 
+multi-omics profiles to classify fat and prebiotics in diet of rats. 
 
-## Input data
+### With validation set
 
-We use the publicly available Inflammatory Bowel Diseases (IBD) data from the 
-`curatedMetagenomicData` package [@Lloyd-Price2019], where we aim to 
-predict IBD disease status based on both taxonomic (species abundances) and 
-functional (pathway abundances) profiles.
+```{r valid-set}
 
-```{r load-pkg-data}
+###########################
+# load required libraries #
+###########################
+library(mia)
+library(SuperLearner)
+library(IntegratedLearner)
+library(tidyverse)
+library(cowplot)
 
-##################
-# Load iHMP data #
-##################
+########################
+# load multi-omic data #
+########################
+data("HintikkaXOData")
+mae <- HintikkaXOData
 
-library(curatedMetagenomicData)
-library(dplyr)
-library(tidyverse)
+######################################
+# Add validation set info to colData #
+######################################
 
-se_relative <- sampleMetadata |>
-  filter(study_name == "HMP_2019_ibdmdb") |>
-  returnSamples("relative_abundance", rownames = "short")
+colData(mae)[["valid"]] <- sample(c(TRUE, FALSE), nrow(colData(mae)), 
+                                  replace = TRUE)
 
-se_pathway <- sampleMetadata |>
-  filter(study_name == "HMP_2019_ibdmdb") |>
-  returnSamples("pathway_abundance", rownames = "short")
+###########################
+# Run with validation set #
+###########################
+
+fit1 <- IntegratedLearnerFromMAE(
+    mae,
+    experiment = c(1, 2, 3),
+    assay.type = c("counts", "nmr", "signals"),
+    outcome.col = "Fat",
+    valid.col = "valid",
+    folds = 10,
+    na.rm = TRUE,
+    base_learner = 'SL.randomForest',
+    meta_learner = 'SL.nnls.auc',
+    verbose = TRUE,
+    family=binomial()
+)
 
 ```
 
-We will first prepare the input sample metadata and feature table for both
-relative abundance and pathway abundance data.
+IntegratedLearner offers easily accessible and interpretable summary outputs 
+including 1) computation time, 2) per-layer AUC/ R2 scores on training , 3) AUC/ R2
+metrics for stacked and concatenated model if run_stacked=TRUE and r
+un_concat=TRUE and 4) estimated per-layer weights from meta learner in stacked 
+model.
 
-```{r prepare-metadata-feature}
+We can visualize the classification performance by constructing layer-wise ROC 
+curves for both train and test set using plot.learner() function that takes 
+IntegratedLearner object as input.
 
-##########################
-# Create sample metadata #
-##########################
+```{r plot fit1}
+###################
+# Summary outputs #
+###################
 
-sample_metadata <- select(
-    as.data.frame(colData(se_relative)),
-    c("study_name", "disease", "subject_id"))
+plot <- IntegratedLearner:::plot.learner(fit1)
 
-# Define response variable & sample id
-sample_metadata$Y <- ifelse(sample_metadata$disease == "IBD", 1, 0)
-sample_metadata <- sample_metadata %>%
-    select(subject_id, Y) %>% rename(subjectID = subject_id)
+```
 
-###########################
-# Create Species Features #
-###########################
+The stacked model achieves superior accuracy to individual layers and concatenation 
+model in independent validation, indicating that the stacked multi-omics classifier 
+leads to a competitive or superior cross-validated and independent validation 
+classification accuracy than its single-omics counterparts.
 
-feature_species <- as.data.frame(assay(se_relative))
-rownames(feature_species) <- sub('.*s__', '', rownames(feature_species))
+### Without validataion set
 
-###########################
-# Create Pathway Features #
-###########################
+The default base model recommendation for IntegratedLearner is Bayesian additive 
+regression trees or BART (base_learner='SL.BART'). Using BART as base-model yields 
+uncertainty estimates (i.e. credible intervals) of the prediction and model 
+parameters in addition to reporting a small set of interpretable features for 
+follow-up experiments (i.e. feature importance scores).
 
-feature_pwys <- as.data.frame(assay(se_pathway))
-feature_pwys <- rownames_to_column(feature_pwys, "ID")
-feature_pwys <- feature_pwys %>%
-  filter(!grepl("\\|", ID)) %>%
-  filter(!ID %in% c('UNMAPPED', 'UNINTEGRATED')) %>%
-  column_to_rownames('ID') %>%
-  as.data.frame()
+```{r no_valid set}
 
-################################
-# Combine Species and Pathways #
-################################
+##############################
+# Run without validation set #
+##############################
 
-feature_table <- bind_rows(feature_species, feature_pwys)
+fit2 <- IntegratedLearnerFromMAE(
+    mae,
+    experiment = c(1, 2, 3),
+    assay.type = c("counts", "nmr", "signals"),
+    outcome.col = "Diet",
+    folds = 10,
+    na.rm = TRUE # The data has missing values that are removed
+)
 
-########################################################
-# Check row names of feature_table and sample_metadata #
-########################################################
+```
+
+```{r plot fit2}
+
+###################
+# Summary outputs #
+###################
 
-all(colnames(feature_table) == rownames(sample_metadata))
+plot <- IntegratedLearner:::plot.learner(fit2)
 
 ```
 
-We will then create a metadata table for the features. This table captures
-high-level information related to the features (e.g., which layer they belong
-to).
+As before, the multi-omics stacked prediction model leads to a better 
+cross-validated accuracy than its single-omics counterparts and concatenation 
+model.
 
-```{r create-metadata-table}
+When base_learner='SL.BART', in addition to point predictions, we can also 
+generate 1) credible intervals for all observations, 2) estimated layer weights 
+in the meta model and 3) feature importance scores.
 
-######################################
-# Create metadata table for features #
-######################################
+```{r weights fit2}
+weights <- fit2$weights
 
-rowID <- rep(
-    c('Species', 'Pathways'),
-    c(nrow(feature_species), nrow(feature_pwys)))
-feature_metadata <- cbind.data.frame(
-    featureID = rownames(feature_table), featureType = rowID)
-rownames(feature_metadata) <- feature_metadata$featureID
+dataX <- fit2$X_train_layers
+dataY <- fit2$Y_train
 
-################
-# Sanity check #
-################
 
-all(rownames(feature_table) == rownames(feature_metadata)) # TRUE
-all(colnames(feature_table) == rownames(sample_metadata)) # TRUE
+post.samples <- vector("list", length(weights))
+names(post.samples) <- names(dataX)
 
+for(i in seq_along(post.samples)){
+post.samples[[i]] <- bartMachine::bart_machine_get_posterior(fit2$model_fits$model_layers[[i]],dataX[[i]])$y_hat_posterior_samples
+}
+
+weighted.post.samples <-Reduce('+', Map('*', post.samples, weights))
+rownames(weighted.post.samples) <- rownames(dataX[[1]])
+names(dataY) <- rownames(dataX[[1]])
 ```
 
-Further data pre-processing is necessary to handle near-zero-variance features
-in this dataset. Ultimately, 483 features are retained, consisting of 360
-pathways and 123 species. A combination of variance and prevalence filtering
-is applied to the feature table, while related metadata is cleaned to ensure the
-retention of matching samples.
+```{r capture output}
 
-```{r create-feature-table}
+library(mcmcplots)
+capture.output(temp <- caterplot(t(weighted.post.samples), add = FALSE))
 
-#################
-# feature_table #
-#################
+```
 
-library(caret)
+We show below the 68% and 95% credible intervals obtained from stacked model for 
+all 40 observations. The filled circle indicates the posterior median and empty 
+circle indicates the true value of the observation.
 
-feature_table_t <- as.data.frame(t(feature_table))
-abd_threshold = 0
-prev_threshold = 0.1
-nzv <- nearZeroVar(feature_table_t)
-features_var_filtered <- feature_table_t[, -nzv]
-features_var_filtered <- as.data.frame(features_var_filtered)
-features_filtered <- features_var_filtered[
-  , colSums(features_var_filtered > abd_threshold) >
-    nrow(features_var_filtered) * prev_threshold
-  ]
-feature_table <- as.data.frame(t(features_filtered))
+```{r plot capture}
 
-####################
-# feature_metadata #
-####################
+temp<-caterplot(t(weighted.post.samples),
+          horizontal = FALSE, add = FALSE)
+points(dataY[temp])
+title(main ="", xlab = "Observations", ylab = "Fat and prebiotics in rats",
+       line = NA, outer = FALSE)
+
+```
 
-feature_metadata <- feature_metadata[rownames(feature_table), ]
-table(feature_metadata$featureType)
 
-######################
-# Sanity check again #
-######################
+## Prediction of Binary IBD Disease Status
 
-all(rownames(feature_table) == rownames(feature_metadata)) # TRUE
-all(colnames(feature_table) == rownames(sample_metadata)) # TRUE
+In this example, we showcase various integration paradigms (early,
+late, and intermediate fusion) using the R packages `multiview` [@Ding2022]
+and `SuperLearner` [@Van2007]. We make use of the publicly available source code
+from the multi-omics integrated framework (`IntegratedLearner`) proposed by
+@Mallick2024.
+
+## Input data
+
+We use the publicly available Inflammatory Bowel Diseases (IBD) data from the 
+`curatedMetagenomicData` package [@Lloyd-Price2019], where we aim to 
+predict IBD disease status based on both taxonomic (species abundances) and 
+functional (pathway abundances) profiles.
+
+```{r load_data_prism}
+
+#################
+# load IBD data #
+#################
+
+pcl <- system.file(
+    "extdata", "PRISM.RData", package = "OMA")
+
+load(pcl)
+
+########################################################################
+# Get feature table,  sample metadata  and feature metadata respectively #
+########################################################################
+
+feature_table <- pcl$feature_table
+sample_metadata <- pcl$sample_metadata
+feature_metadata <- pcl$feature_metadata
 
 ```
 
-Following the preprocessing, we conduct additional input data preparation,
+
+Following the data loading, we conduct additional input data preparation,
 which includes configuring parameters for sample splitting and 5-fold
 cross-validation. It's important to note that this dataset contains repeated
 measures (i.e., multiple samples per subject). To address this, we will conduct
@@ -389,8 +429,8 @@ names(dataList) <- name_layers
 table(feature_metadata$featureType)
 
 # Select indices of Pathways and Species rows
-pathways_ind <- which(feature_metadata$featureType == "Pathways")
-species_ind <- which(feature_metadata$featureType == "Species")
+pathways_ind <- which(feature_metadata$featureType == "metabolites")
+species_ind <- which(feature_metadata$featureType == "species")
 
 # Create two data lists for pathways and species
 dataList[[1]] <- t(feature_table[pathways_ind, ])

diff --git a/inst/pages/multiassay_ordination.qmd b/inst/pages/multiassay_ordination.qmd
@@ -37,18 +37,6 @@ data(HintikkaXOData, package = "mia")
 mae <- HintikkaXOData
 ```
 
-```{r MOFA2, message=FALSE, warning=FALSE}
-library(MOFA2)
-
-# For inter-operability between Python and R, and setting Python dependencies,
-# reticulate package is needed
-library(reticulate)
-# Let us assume that these have been installed already.
-reticulate::install_miniconda(force = TRUE)
-reticulate::use_miniconda(condaenv = "env1", required = FALSE)
-reticulate::py_install(packages = c("mofapy2"), pip = TRUE, python_version=3.6)
-```
-
 The `mae` object could be used straight to create the MOFA model. Yet,
 we transform our assays since the model assumes normality per
 default, and Gaussian model is recommended