From b1ccaaa299ec2b03c12eabbb3b3d8d02883a226a Mon Sep 17 00:00:00 2001 From: Muluh <127390183+Daenarys8@users.noreply.github.com> Date: Fri, 4 Oct 2024 07:14:12 +0100 Subject: [PATCH] Update examples 2- IL (#617) Signed-off-by: Daena Rys Co-authored-by: TuomasBorman --- DESCRIPTION | 7 +- inst/pages/cross_correlation.qmd | 11 +- inst/pages/integrated_learner.qmd | 635 +++++++-------------------- inst/pages/multiassay_ordination.qmd | 12 - 4 files changed, 175 insertions(+), 490 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d812a687..c45f0a95 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: OMA Title: Orchestrating Microbiome Analysis with Bioconductor -Version: 0.98.26 +Version: 0.98.27 Date: 2024-10-01 Authors@R: c(person("Leo", "Lahti", role = c("aut"), @@ -62,6 +62,7 @@ Suggests: gtools, gsEasy, igraph, + IntegratedLearner, knitr, Maaslin2, mia, @@ -97,7 +98,6 @@ Suggests: SPRING, stats, stringr, - SuperLearner, tidyverse, topGO, vegan, @@ -109,7 +109,8 @@ Remotes: github::microbiome/miaTime, github::stefpeschel/NetCoMi, github::zdk123/SpiecEasi, - github::GraceYoon/SPRING + github::GraceYoon/SPRING, + github::himelmallick/IntegratedLearner VignetteBuilder: knitr RoxygenNote: 7.3.1 BiocType: Book diff --git a/inst/pages/cross_correlation.qmd b/inst/pages/cross_correlation.qmd index 5a150bc8..f3b1f245 100644 --- a/inst/pages/cross_correlation.qmd +++ b/inst/pages/cross_correlation.qmd @@ -48,15 +48,6 @@ data(HintikkaXOData, package = "mia") mae <- HintikkaXOData ``` -```{r cross-association2} -library(stringr) -# Drop those bacteria that do not include information in Phylum or lower levels -mae[[1]] <- mae[[1]][!is.na(rowData(mae[[1]])$Phylum), ] -# Clean taxonomy data, so that names do not include additional characters -rowData(mae[[1]]) <- DataFrame( - apply(rowData(mae[[1]]), 2, str_remove, pattern = "._[0-9]__")) -``` - ```{r} # Available alternative experiments experiments(mae) @@ -84,7 +75,7 @@ bacterium X is present, is the concentration of metabolite Y lower or higher"? ```{r cross-correlation5} # Agglomerate microbiome data at family level -mae[[1]] <- agglomerateByPrevalence(mae[[1]], rank = "Family") +mae[[1]] <- agglomerateByPrevalence(mae[[1]], rank = "Family", na.rm = TRUE) # Does log10 transform for microbiome data mae[[1]] <- transformAssay(mae[[1]], method = "log10", pseudocount = TRUE) diff --git a/inst/pages/integrated_learner.qmd b/inst/pages/integrated_learner.qmd index a42213b3..21e99eda 100644 --- a/inst/pages/integrated_learner.qmd +++ b/inst/pages/integrated_learner.qmd @@ -1,11 +1,13 @@ -# Multi-omics Integration {#sec-multi-omics-integration} +# Multi-omics prediction and classification {#sec-multi-omics-integration} ```{r setup, echo=FALSE, results="asis"} library(rebook) chapterPreamble() ``` -## Multi-omics Integration +This chapter illustrates how we can create a classification model that utilizes +multiomics data. In contrast [@sec-machine_learning] shows classification models +that utilizes single omic data. In multiview data analysis, there are two main types of approaches: early fusion and late fusion. \underline{Early fusion} combines all datasets into a single @@ -19,518 +21,221 @@ advanced method, called cooperative learning [@Ding2022], which is also known as predictions from different data views to align through an agreement parameter ($\rho$). -## Multi-omics Prediction and Classification of Binary IBD Disease Status - -In this chapter, we showcase examples of various integration paradigms (early, -late, and intermediate fusion) using the R packages `multiview` [@Ding2022] -and `SuperLearner` [@Van2007]. We make use of the publicly available source code -from the multi-omics integrated framework (`IntegratedLearner`) proposed by -@Mallick2024. - -## Input data +## Import and preprocess data We use the publicly available Inflammatory Bowel Diseases (IBD) data from the `curatedMetagenomicData` package [@Lloyd-Price2019], where we aim to predict IBD disease status based on both taxonomic (species abundances) and functional (pathway abundances) profiles. -```{r load-pkg-data} - -################## -# Load iHMP data # -################## +```{r} +#| label: load-pkg-data library(curatedMetagenomicData) library(dplyr) -library(tidyverse) +library(mia) -se_relative <- sampleMetadata |> +tse1 <- sampleMetadata |> filter(study_name == "HMP_2019_ibdmdb") |> returnSamples("relative_abundance", rownames = "short") -se_pathway <- sampleMetadata |> +tse2 <- sampleMetadata |> filter(study_name == "HMP_2019_ibdmdb") |> returnSamples("pathway_abundance", rownames = "short") +# Create a MAE object +mae <- MultiAssayExperiment( + ExperimentList(microbiota = tse1, pathway = tse2) + ) +mae ``` -We will first prepare the input sample metadata and feature table for both -relative abundance and pathway abundance data. - -```{r prepare-metadata-feature} - -########################## -# Create sample metadata # -########################## - -sample_metadata <- select( - as.data.frame(colData(se_relative)), - c("study_name", "disease", "subject_id")) - -# Define response variable & sample id -sample_metadata$Y <- ifelse(sample_metadata$disease == "IBD", 1, 0) -sample_metadata <- sample_metadata |> - select(subject_id, Y) |> rename(subjectID = subject_id) - -########################### -# Create Species Features # -########################### - -feature_species <- as.data.frame(assay(se_relative)) -rownames(feature_species) <- sub('.*s__', '', rownames(feature_species)) - -########################### -# Create Pathway Features # -########################### - -feature_pwys <- as.data.frame(assay(se_pathway)) -feature_pwys <- rownames_to_column(feature_pwys, "ID") -feature_pwys <- feature_pwys |> - filter(!grepl("\\|", ID)) |> - filter(!ID %in% c('UNMAPPED', 'UNINTEGRATED')) |> - column_to_rownames('ID') |> - as.data.frame() - -################################ -# Combine Species and Pathways # -################################ - -feature_table <- bind_rows(feature_species, feature_pwys) - -######################################################## -# Check row names of feature_table and sample_metadata # -######################################################## - -all(colnames(feature_table) == rownames(sample_metadata)) +Let's first check how many patients are in each group. +```{r} +#| label: check_diagnoses +table(mae[[1]]$disease) / ncol(mae[[1]]) ``` -We will then create a metadata table for the features. This table captures -high-level information related to the features (e.g., which layer they belong -to). - -```{r create-metadata-table} - -###################################### -# Create metadata table for features # -###################################### - -rowID <- rep( - c('Species', 'Pathways'), - c(nrow(feature_species), nrow(feature_pwys))) -feature_metadata <- cbind.data.frame( - featureID = rownames(feature_table), featureType = rowID) -rownames(feature_metadata) <- feature_metadata$featureID - -################ -# Sanity check # -################ - -all(rownames(feature_table) == rownames(feature_metadata)) # TRUE -all(colnames(feature_table) == rownames(sample_metadata)) # TRUE - -``` +The dataset appears to be imbalanced, with nearly three-quarters of the +patients having IBD. This imbalance may impact the training process and how +the model learns to predict outcomes for underrepresented group members. Further data pre-processing is necessary to handle near-zero-variance features -in this dataset. Ultimately, 483 features are retained, consisting of 360 -pathways and 123 species. A combination of variance and prevalence filtering -is applied to the feature table, while related metadata is cleaned to ensure the -retention of matching samples. - -```{r create-feature-table} - -################# -# feature_table # -################# - -library(caret) - -feature_table_t <- as.data.frame(t(feature_table)) -abd_threshold = 0 -prev_threshold = 0.1 -nzv <- nearZeroVar(feature_table_t) -features_var_filtered <- feature_table_t[, -nzv] -features_var_filtered <- as.data.frame(features_var_filtered) -features_filtered <- features_var_filtered[ - , colSums(features_var_filtered > abd_threshold) > - nrow(features_var_filtered) * prev_threshold - ] -feature_table <- as.data.frame(t(features_filtered)) - -#################### -# feature_metadata # -#################### - -feature_metadata <- feature_metadata[rownames(feature_table), ] -table(feature_metadata$featureType) - -###################### -# Sanity check again # -###################### - -all(rownames(feature_table) == rownames(feature_metadata)) # TRUE -all(colnames(feature_table) == rownames(sample_metadata)) # TRUE - +in this dataset. A combination of variance and prevalence filtering +is applied to the dataset to include only meaningful features. + +```{r} +#| label: preprocess + +# Subset by prevalence +mae[[1]] <- subsetByPrevalent( + mae[[1]], assay.type = "relative_abundance", prevalence = 0.1, + include.lowest = TRUE) +mae[[2]] <- subsetByPrevalent( + mae[[2]], assay.type = "pathway_abundance", prevalence = 0.1, + include.lowest = TRUE) + +# Subset those features that have near zero variance +rowData(mae[[1]])[["sd"]] <- rowSds(assay(mae[[1]], "relative_abundance")) +rowData(mae[[2]])[["sd"]] <- rowSds(assay(mae[[2]], "pathway_abundance")) +mae[[1]] <- mae[[1]][ rowData(mae[[1]])[["sd"]] > 0.001, ] +mae[[2]] <- mae[[2]][ rowData(mae[[2]])[["sd"]] > 0.001, ] + +# Transform the data with CLR +mae[[1]] <- transformAssay( + mae[[1]], assay.type = "relative_abundance", method = "clr", + pseudocount = TRUE) +mae[[2]] <- transformAssay( + mae[[2]], assay.type = "pathway_abundance", method = "clr", + pseudocount = TRUE) ``` -Following the preprocessing, we conduct additional input data preparation, -which includes configuring parameters for sample splitting and 5-fold -cross-validation. It's important to note that this dataset contains repeated -measures (i.e., multiple samples per subject). To address this, we will conduct -5-fold cross-validation at the subject level. The source code is derived from -the `IntegratedLearner` R package [@Mallick2024]. +Ultimately, `r nrow(mae[[1]])+nrow(mae[[1]])` features are retained, consisting +of `r nrow(mae[[1]])` pathways and `r nrow(mae[[2]])` species. -```{r additional-data-preprocessing} +## Fit model -library(caret) +We randomly select 20% of the samples for the validation set, ensuring they are +not used in training. This approach provides a more robust estimate of how well +the model generalizes to unseen data. -# Set parameters and extract subject IDs for sample splitting -seed <- 1 -set.seed(seed) -subjectID <- unique(sample_metadata$subjectID) - -################################## -# Trigger 5-fold CV (Outer Loop) # -################################## - -subjectCvFoldsIN <- createFolds(1:length(subjectID), k = 5, returnTrain=TRUE) - -######################################### -# Curate subject-level samples per fold # -######################################### - -obsIndexIn <- vector("list", 5) -for(k in 1:length(obsIndexIn)){ - x <- which(!sample_metadata$subjectID %in% subjectID[subjectCvFoldsIN[[k]]]) - obsIndexIn[[k]] <- x -} -names(obsIndexIn) <- sapply(1:5, function(x) paste(c("fold", x), collapse = '')) - -############################### -# Set up data for SL training # -############################### - -cvControl = list(V = 5, shuffle = FALSE, validRows = obsIndexIn) - -################################################# -# Stacked generalization input data preparation # -################################################# - -feature_metadata$featureType <- as.factor(feature_metadata$featureType) -name_layers <- with( - droplevels(feature_metadata), - list(levels = levels(featureType)), nlevels = nlevels(featureType))$levels -SL_fit_predictions <- vector("list", length(name_layers)) -SL_fit_layers <- vector("list", length(name_layers)) -names(SL_fit_layers) <- name_layers -names(SL_fit_predictions) <- name_layers -X_train_layers <- vector("list", length(name_layers)) -names(X_train_layers) <- name_layers -layer_wise_predictions_train <- vector("list", length(name_layers)) -names(layer_wise_predictions_train) <- name_layers +```{r} +#| label: select_validation +set.seed(377) +colData(mae)[["validation_set"]] <- sample( + c(TRUE, FALSE), size = ncol(mae[[1]]), replace = TRUE, prob = c(0.2, 0.8)) ``` -Next, we run the late fusion algorithm described in @Mallick2024. -To this end, we subset the data for each feature type and conduct the analysis -on each layer for the first-stage learning to predict the outcome. In other -words, we fit a machine learning algorithm per layer (called a 'base learner'), -utilizing the SuperLearner R package. - -```{r late-fusion} - -######################################################## -# Subset data per omics and run each individual layers # -######################################################## - -for (i in seq_along(name_layers)){ - - # Prepare single-omic input data - include_list <- feature_metadata |> filter(featureType == name_layers[i]) - t_dat_slice <- feature_table[ - rownames(feature_table) %in% include_list$featureID, ] - dat_slice <- as.data.frame(t(t_dat_slice)) - Y = sample_metadata$Y - X = dat_slice - X_train_layers[[i]] <- X - - # Run user-specified base learner - library(SuperLearner) - - SL_fit_layers[[i]] <- SuperLearner( - Y = Y, - X = X, - cvControl = cvControl, - SL.library = c('SL.randomForest'), - family = binomial()) - - # Append the corresponding y and X to the results - SL_fit_layers[[i]]$Y <- sample_metadata['Y'] - SL_fit_layers[[i]]$X <- X - - # Remove redundant data frames and collect pre-stack predictions - rm(t_dat_slice) - rm(dat_slice) - rm(X) - SL_fit_predictions[[i]] <- SL_fit_layers[[i]]$Z - - # Re-fit to entire dataset for final predictions - layer_wise_predictions_train[[i]] <- SL_fit_layers[[i]]$SL.predict - -} - +Now, we are ready to fit the model using the `IntegratedLearner` package, which +supports early, late, and intermediate fusion methods [@Mallick2024]. Under the +hood, it leverages the `SuperLearner` package. The process begins by fitting +"base learners" for each layer, with separate models trained for each omic +dataset. Then, a "meta learner" integrates the outputs from these individual +models to make the final predictions. For "base learners" and "meta learner", +we use _random forest_ and _rank loss minimization_ models, respectively. + +```{r} +#| label: fit_model + +library(IntegratedLearner) +library(SuperLearner) + +# Fit the model +fit <- IntegratedLearnerFromMAE( + mae, + # Select experiments and assays + experiment = names(experiments(mae)), + assay.type = c("clr", "clr"), + # Select columns from sample metadata + outcome.col = "disease", + valid.col = "validation_set", + # Options for base and meta models + base_learner = "SL.randomForest", + meta_learner = "SL.nnls.auc", + # The outcome is binary + family = binomial() +) ``` -The next step of the analysis is to combine the layer-wise cross-validated -predictions with a meta-model (meta_learner) to generate final predictions -based on all available data points for the stacked model. Here, we will use -`glmnet` as the meta-learner. However, other choices are also possible. - -```{r layer-wise-cross-validated-prediction} +The output include the following models: -############################## -# Prepare stacked input data # -############################## + - Individual models: Trained separately for each omic dataset. + - Stacked model (late fusion): Combines the predictions from each individual model into a single meta-model. + - Concatenated model (early fusion): Trains a model where all features are merged before training. -combo <- as.data.frame(do.call(cbind, SL_fit_predictions)) -names(combo) <- name_layers +## Visualize results -############################### -# Set aside final predictions # -############################### +We can summarize the model performance using a Receiver Operating +Characteristic (ROC) plot. -combo_final <- as.data.frame(do.call(cbind, layer_wise_predictions_train)) -names(combo_final) <- name_layers +```{r} +#| label: visualize_result -#################### -# Stack all models # -#################### - -# Run user-specified meta learner -SL_fit_stacked <- SuperLearner( - Y = Y, - X = combo, - cvControl = cvControl, - verbose = TRUE, - SL.library = 'SL.glmnet', - family = binomial()) - -# Extract the fit object from superlearner -model_stacked <- SL_fit_stacked$fitLibrary[[1]]$object -stacked_prediction_train <- predict.SuperLearner( - SL_fit_stacked, newdata = combo_final)$pred - -# Append the corresponding y and X to the results -SL_fit_stacked$Y <- sample_metadata['Y'] -SL_fit_stacked$X <- combo +library(tidyverse) +library(cowplot) +p <- IntegratedLearner:::plot.learner(fit) ``` -In contrast to late fusion, in early fusion, we will simply supply a combined -representation of the data and we will use the `random forest` method for -building an integrated prediction model. +Based on the ROC plot described below, we observe that the AUC is +`r fit$AUC.test[[2]]` when considering only the pathway abundance data in the +model, and `r fit$AUC.test[[1]]` for the model including only the species +abundance data. The AUC increases to `r fit$AUC.test[[4]]` when using +the early fusion model and reaches `r fit$AUC.test[[3]]` with the late fusion +model. Overall, most integrated classifiers outperform individual layers in +distinguishing between T2D and healthy controls. -```{r early-fusion} +The model's performance can also be evaluated and summarized using a confusion +matrix. -##################################### -# EARLY FUSION (CONCATENATED MODEL) # -##################################### +```{r} +#| label: conf_matrix -fulldat<-as.data.frame(t(feature_table)) - -# Early Fusion using Random Forest -SL_fit_concat <- SuperLearner( - Y = Y, - X = fulldat, - cvControl = cvControl, - SL.library = 'SL.randomForest', - family = binomial()) - -# Extract the fit object from SuperLearner -model_concat <- SL_fit_concat$fitLibrary[[1]]$object - -# Append the corresponding y and X to the results -SL_fit_concat$Y <- sample_metadata['Y'] -SL_fit_concat$X <- fulldat -``` - -Finally, we consider the intermediate fusion approach, which combines ideas -from both late fusion and early fusion by integrating the usual squared-error -loss of predictions with an "agreement" (fusion) penalty ($\rho$) so that the -predictions from different data views agree. - -The intermediate fusion adjusts the degree of fusion in an adaptive manner, -where the test set prediction error is estimated with a cross-validation method. -By varying the weight of the fusion penalty - hyperparameter $\rho$, we obtain -the early and late fusion approaches as special cases. If $\rho = 0$, we have a -simple form of early fusion; if $\rho = 1$, we obtain a simple form of late -fusion. For $0 < \rho < 1$, we obtain the intermediate fusion. - -Here, as examples, we will prepare the input data, fit the multiview model, and -run the cross-validation in this section. - -```{r prepare-input-data} - -############################## -# Prepare data for multiview # -############################## - -# Separate omics layers -feature_metadata$featureType <- as.factor(feature_metadata$featureType) -name_layers <- with( - droplevels(feature_metadata), - list(levels = levels(featureType)), - nlevels = nlevels(featureType) - )$levels - -# Define a list -dataList <- vector("list", length = length(name_layers)) -names(dataList) <- name_layers -table(feature_metadata$featureType) - -# Select indices of Pathways and Species rows -pathways_ind <- which(feature_metadata$featureType == "Pathways") -species_ind <- which(feature_metadata$featureType == "Species") - -# Create two data lists for pathways and species -dataList[[1]] <- t(feature_table[pathways_ind, ]) -dataList[[2]] <- t(feature_table[species_ind, ]) - -# Extract y and X's -dataList <- lapply(dataList, as.matrix) -dataList <- lapply(dataList, scale) - -######################## -# Run cross-validation # -######################## - -set.seed(1234) -library(multiview) -library(glmnet) - -cvfit <- cv.multiview(dataList, Y, family = binomial(), alpha = 0.5) -DD <- as.data.frame(as.matrix(coef_ordered(cvfit, s="lambda.min", alpha = 0.5))) -DD$standardized_coef <- as.numeric(DD$standardized_coef) -DD$coef <- as.numeric(DD$coef) +library(caret) +obs <- factor(fit$Y_test) +pred <- ifelse(fit$yhat.test$stacked > 0.5, levels(obs)[[2]], levels(obs)[[1]]) +pred <- factor(pred, levels = levels(obs)) +conf <- confusionMatrix(data = pred, reference = obs) +conf ``` -In this section, we visualize the prediction performance by gathering all -predictions and extracting the data necessary for ROC plotting, corresponding -to each integration paradigm. - -```{r visualization1} -######################################################### -# Gather all predictions and ROC curve data preparation # -######################################################### - -coop_pred <- predict( - cvfit, newx = dataList, s = "lambda.min", - alpha = 0.5, type = "response") -yhat.train <- cbind( - combo,stacked_prediction_train, SL_fit_concat$Z, coop_pred) -colnames(yhat.train) <- c( - colnames(combo), "Late Fusion", "Early Fusion", "Intermediate Fusion") - -# Extract ROC plot data -list.ROC <- vector("list", length = ncol(yhat.train)) -names(list.ROC) <- colnames(yhat.train) - -# Loop over layers -library(ROCR) - -for(k in 1:length(list.ROC)){ - preds <- yhat.train[ ,k] - pred <- prediction(preds, Y) - AUC <- round(performance(pred, "auc")@y.values[[1]], 2) - perf <- performance(pred, "sens", "spec") - list.ROC[[k]] <- data.frame( - sensitivity = slot(perf, "y.values")[[1]], - specificity = 1 - slot(perf, "x.values")[[1]], - AUC = AUC, - layer = names(list.ROC)[k]) -} - -# Combine -ROC_table <- do.call('rbind', list.ROC) - +The model appears to be performing well the the accuracy being +`r paste0(round(conf$overall[[1]]*100, 2), "%")`. The model seems to precict +correctly almost all IBD patients. It is also worth noting that over +three-quarters of the controls are classified correctly + +Lastly, we may be interested in identifying the features that contribute most +to predicting the outcome. To do this, we first extract feature importance +scores from the individual models. Next, we scale these importance scores based +on the weights assigned to each layer in the stacked model. This process +provides us with the overall importance of each feature in the final model. + +```{r} +#| label: feat_importance + +library(ggplot2) + +# Get individual models +models <- fit$model_fits$model_layers +# Get importances +importances <- lapply(seq_len(length(models)), function(i){ + # Get importances + temp <- models[[i]]$importance + # Scale based on weight in stacked model + temp <- temp * fit$weights[[i]] + return(temp) + }) +# Combine and order to most important features +importances <- do.call(rbind, importances) +importances <- importances[ + order(importances, decreasing = TRUE), , drop = FALSE] +# Add features to column +importances <- importances |> as.data.frame() +importances[["Feature"]] <- factor( + rownames(importances), levels = rownames(importances)) +# Convert to 0-1 scale +importances[[1]] <- importances[[1]] / sum(importances[[1]]) +# Get top 20 importances +top_n <- 20 +importances <- importances[ seq_len(top_n), ] + +# Plot as a bar plot +p <- ggplot(importances, aes(x = MeanDecreaseGini, y = Feature)) + + geom_bar(stat = "identity") +p ``` -Based on the ROC plot described below, we observe that the AUC is 0.65 when -considering only the pathway abundance data in the model, and 0.64 for the model -including only the species abundance data. The AUC increases to 0.68 when using -the early fusion model and reaches 1 with the late fusion model. In contrast, -the intermediate fusion model achieves an AUC of 0.99. Overall, most integrated -classifiers outperform individual layers in distinguishing between IBD and -non-IBD controls. - -```{r visualization2} - -# Prepare data for plotting -plot_data <- ROC_table -plot_data$displayItem <- paste(plot_data$layer, " AUC = ", plot_data$AUC, sep="") -plot_data$displayItem <- factor( - plot_data$displayItem, levels = unique(plot_data$displayItem)) - -# ROC curves -p <- ggplot( - plot_data, - aes(x = specificity, - y = sensitivity, - group = displayItem)) + - geom_line(aes(x = specificity,y = sensitivity,color = displayItem)) + - theme( - legend.position = "bottom", - legend.background=element_blank(), - legend.box.background=element_rect(colour = "black")) + - theme_bw() + - xlab("False Positive Rate") + - ylab("True Positive Rate") + - theme(legend.position = "right", legend.direction = "vertical") + - labs(color = '') - -# Print -print(p) +From the plot, we can observe that `r importances[1, "Feature"]` and +`r importances[1, "Feature"]` appear to have the greatest predictive power +among all the features in determining the outcome. However, the predictive +power appears to be fairly evenly distributed across all features. -``` +::: {.callout-note} -Finally, we visualize the results for the top 20 features for each layer based -on the intermediate fusion. - -```{r visualization3, fig.width = 6, fig.height = 7} -# Only plot 20 features per layer -DD <- DD |> - group_by(view) |> - top_n(n = 20, wt = abs(standardized_coef)) - -DD$view_col <- gsub(":.*", "", DD$view_col) - -# Visualization -library(forcats) - -p <- DD |> - mutate(view_col = fct_reorder(view_col, standardized_coef)) |> - ggplot(aes(x = view_col, y = standardized_coef, fill = view, width = 0.75)) + - geom_bar(stat = "identity", show.legend = FALSE, width = 1) + - coord_flip() + - facet_wrap(~ view, scales = 'free_y', nrow = 2) + - ylab('Standardized LFC') + - xlab('') + - #labs(title = "IBD-associated\nmulti-omics features") + - ggtitle('IBD-associated\nmulti-omics features') + - theme_bw() + theme( - plot.title = element_text(hjust = 0.5, vjust = 0.5), - strip.background = element_blank(), - panel.grid.major = element_line(colour = "grey80"), - panel.border = element_blank(), - axis.ticks = element_line(size = 0), - panel.grid.minor.y = element_blank(), - panel.grid.major.y = element_blank()) -p -``` +For more details on modeling with `IntegratedLearner`, refer to +[IntegratedLearner vignette](http://htmlpreview.github.io/?https://github.com/himelmallick/IntegratedLearner/blob/master/vignettes/IntegratedLearner.html). -Based on the barplot, we observe that \emph{Proteobacteria_bacterium_CAG.139} -is the species most positively associated, while \emph{Gemmiger_formicilis} is -the most negatively associated species. Additionally, L-lysine biosynthesis VI -is the pathway most positively associated, and PYRIDNUSCYN-PWY: NAD biosynthesis -I (from aspartate) is the pathway most positively associated, respectively. +::: diff --git a/inst/pages/multiassay_ordination.qmd b/inst/pages/multiassay_ordination.qmd index 27e33928..2fa4d2e2 100644 --- a/inst/pages/multiassay_ordination.qmd +++ b/inst/pages/multiassay_ordination.qmd @@ -37,18 +37,6 @@ data(HintikkaXOData, package = "mia") mae <- HintikkaXOData ``` -```{r MOFA2, message=FALSE, warning=FALSE} -library(MOFA2) - -# For inter-operability between Python and R, and setting Python dependencies, -# reticulate package is needed -library(reticulate) -# Let us assume that these have been installed already. -reticulate::install_miniconda(force = TRUE) -reticulate::use_miniconda(condaenv = "env1", required = FALSE) -reticulate::py_install(packages = c("mofapy2"), pip = TRUE, python_version=3.6) -``` - The `mae` object could be used straight to create the MOFA model. Yet, we transform our assays since the model assumes normality per default, and Gaussian model is recommended