From b4b8c92882d8dbbef79a8d61a2ece5beca879b80 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Thu, 30 May 2024 10:31:23 +0300 Subject: [PATCH 01/18] Updated exercises --- inst/pages/98_exercises.qmd | 276 ++++++++++++++++++++++++++++++++++-- 1 file changed, 265 insertions(+), 11 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 3ee568d27..b0d57cdab 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -508,7 +508,7 @@ However, such data are not always included. ```{r} #| label: Other-elements-ex1 -#| eval: FALSE +#| eval: TRUE #| code-fold: true #| code-summary: "Show solution" @@ -565,10 +565,59 @@ colData(tse)$Barcode_full_length ### Subsetting +In this exercise, we'll go over the basics of subsetting a TSE object. +Keep in mind that it's good practice to always keep the original data intact. +Therefore, we suggest creating a new tse object whenever you subset. + +Please have a try at the following exercises. + 1. Subset the TreeSE object to specific samples. + +```{r} +#| label: subsetting-ex1 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +tse_subset_by_sample <- tse[ , tse$SampleType %in% + c("Feces", "Skin", "Tongue")] + +unique(tse_subset_by_sample$SampleType) +``` + 2. Subset the TreeSE object to specific features. + +```{r} +#| label: subsetting-ex2 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +tse_subset_by_sample_and_feature <- tse[rowData(tse)$Phylum %in% c("Crenarchaeota", + "Euryarchaeota", + "Actinobacteria") + ,] + +dim(tse_subset_by_sample_and_feature) +``` + 3. Subset the TreeSE object to specific samples and features. +```{r} +#| label: subsetting-ex3 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +tse_subset_by_feature <- tse[rowData(tse)$Phylum %in% c("Crenarchaeota", + "Euryarchaeota", + "Actinobacteria") + , + tse$SampleType %in% + c("Feces", "Skin", "Tongue")] + +unique(rowData(tse_subset_by_feature)$Phylum) +``` ### Library sizes @@ -580,12 +629,64 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::add ### Prevalent and core taxonomic features -1. Estimate prevalence for your chosen feature (row, taxonomic group) -2. Identify all prevalent features and subset the data accordingly -3. Report the thresholds and the dimensionality of the data before and after subsetting -4. Visualize prevalence +1. Estimate prevalence for your chosen feature (specific row and taxonomic group) +```{r} +#| label: prevalence-ex1 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +# First, merge the features by rank +altExp(tse,"Phylum") <- mergeFeaturesByRank(tse, "Phylum") + +# Then get the prevalence for a specific sample +getPrevalence(altExp(tse,"Phylum"), detection = 1/100, + sort = TRUE, assay.type = "counts", + as_relative = TRUE)["Phylum:Proteobacteria"] +``` +2. Identify all prevalent features and subset the data accordingly. + +```{r} +#| label: prevalence-ex2 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +# Get the prevalent features +prevalentFeatures <- getPrevalentFeatures(altExp(tse,"Phylum"), detection = 0, + prevalence = 50/100) +# Subset the data to the prevalent features +prevalenceTse <- tse[rowData(tse)$Phylum %in% prevalentFeatures,] -Useful functions: getPrevalence, getPrevalent, subsetByPrevalent +# Alternatively subsetByPrevalentFeatures can be used to subset directly +prevalenceTse <- subsetByPrevalentFeatures(altExp(tse,"Phylum"), detection = 0, + prevalence = 50/100) +``` +3. Visualize prevalence using a violin/bean plot. + +```{r} +#| label: prevalence-ex3 +#| eval: False +#| code-fold: true +#| code-summary: "Show solution" + +# Import the scater library +library(scater) + +# Store the prevalence of each taxon +rowData(altExp(tse,"Phylum"))$prevalence <- + getPrevalence(altExp(tse,"Phylum"), detection = 1/100, + sort = FALSE, + assay.type = "counts", as_relative = TRUE) + +# Then plot the row data +plotRowData(altExp(tse,"Phylum"), "prevalence", + colour_by = "Phylum") +``` + + + +Useful functions: getPrevalence, getPrevalentFeatures, subsetByPrevalentFeatures ### Data exploration @@ -651,6 +752,86 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, transformAs As for assays, you can access the desired altExp by name or index. 6. **Extra**: Split the data based on other features with `splitOn`. +### Beta Diversity using Alternative experiments + +This Exercise will focus on calcuating dissimilarities or beta diversity using alternative experiments. + +1. Import the mia and vegan packages and load a dataset. This can be your own or one of the packages built in to mia. + +```{r} +#| label: beta-AltExp-ex1 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +library(mia) +library(vegan) + +data("Tengeler2020", package = "mia") +tse <- Tengeler2020 +``` + +2. Calculate relative abundances and create alternative experiments using `splitByRanks` or `splitOn` + + +```{r} +#| label: beta-AltExp-ex2 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") + +altExps(tse) <- splitByRanks(tse) +altExps(tse) +``` + +3. Run PCoA on on one of the created alternative experiments using Bray-Curtis dissimilarity. + +```{r} +#| label: beta-AltExp-ex3 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +tse <- runMDS(tse, + FUN = vegan::vegdist, + method = "bray", + assay.type = "relabundance", + name = "MDS_bray") +``` + +4. Import the scater library, to plot the first two coordinates using the `reducedDim`function and plot the coordinates. + +```{r} +#| label: beta-AltExp-ex4 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +p <- plotReducedDim(tse, "MDS_bray") +p +``` + +5. Add the explained variance ratios for each coordinate on the axes labels and plot the PCoA again. + +```{r} +#| label: beta-AltExp-ex5 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +# Calculate explained variance +e <- attr(reducedDim(tse, "MDS_bray"), "eig") +rel_eig <- e / sum(e[e > 0]) + +# Add explained variance for each axis +p <- p + labs(x = paste("PCoA 1 (", round(100 * rel_eig[[1]], 1), "%", ")", sep = ""), + y = paste("PCoA 2 (", round(100 * rel_eig[[2]], 1), "%", ")", sep = "")) + +p +``` + ## Community (alpha) diversity @@ -848,15 +1029,88 @@ the contributions to beta diversity. ### Beta diversity analysis -This exercise prompts you to implement a workflow with distance-based RDA -(dbRDA). You can refer to chapter [@sec-dbrda-workflow] for a step-by-step +In this exercise, we'll ask you to implement a distance-based Redundancy Analysis +(dbRDA) to understand the relationships between microbial community composition and various environmental or experimental factors within your dataset. You can refer to chapter [@sec-dbrda-workflow] for a step-by-step walkthrough, which may be simplified in the future. 1. Import the mia and vegan packages, load any of the example data sets mentioned in Chapter 3.3 with `data` and store it into a variable named `tse`. -4. Create dbRDA with Bray-Curtis dissimilarities on relative abundances. Use PERMANOVA. Can differences between samples be explained with variables of sample meta data? -5. Analyze diets' association on beta diversity. Calculate dbRDA and then PERMANOVA. Visualize coefficients. Which taxa's abundances differ the most between samples? -6. Interpret your results. Is there association between community composition and location? What are those taxa that differ the most; find information from literature. + +```{r} +#| label: dbRDA-ex1 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +library(mia) +library(vegan) + +data("GlobalPatterns", package = "mia") # Feel free to use any dataset +tse <- GlobalPatterns +``` + +2. Create a new assay with the relative abundance for your dataset and merge the +features by a rank of your choice. + +```{r} +#| label: dbRDA-ex2 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") + +tse <- mergeFeaturesByRank(tse, + rank = "Species") +``` + +3. Select a group of samples and perform a permutational analysis on the calculated relative abundance using Bray-Curtis dissimilarities + +```{r} +#| label: dbRDA-ex3 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +tse$Group <- tse$SampleType == "Feces" + +permanova <- adonis2(t(assay(tse, "relabundance")) ~ Group, + by = "margin", # each term (here only 'Group') analyzed individually + data = colData(tse), + method = "bray", + permutations = 99) +``` + +4. Implement dbRDA on relative abundances and perform another permutational analysis on the redundancy analysis. + +```{r} +#| label: dbRDA-ex4 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +dbrda <- dbrda(t(assay(tse,"relabundance")) ~ Group, + data = colData(tse)) + +permanova2 <- anova.cca(dbrda, + by = "margin", + method = "bray", + permutations = 99) +``` + +5. Extract the p-values. Can the differences between samples be explained with variables from their metadata? +```{r} +#| label: dbRDA-ex5 +#| eval: FALSE +#| code-fold: true +#| code-summary: "Show solution" + +p_values <- c(permanova["Group", "Pr(>F)"], permanova2["Group", "Pr(>F)"]) + +p_values <-as.data.frame(p_values) + +rownames(p_values) <- c("adonis2", "dbRDA+anova.cca") +``` Useful functions: scater::runMDS, runRDA, vegan::anova.cca, transformAssay, mergeFeaturesByRank, ggplot, scater::plotReducedDim, vegan::adonis2 From 4b2bffd2ea0f2dd0de23e6aa300ae676fcbbebcd Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Thu, 30 May 2024 10:56:57 +0300 Subject: [PATCH 02/18] Updated exercises --- inst/pages/98_exercises.qmd | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 8d5ddbdbe..834df89db 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -662,10 +662,40 @@ prevalenceTse <- tse[rowData(tse)$Phylum %in% prevalentFeatures,] prevalenceTse <- subsetByPrevalentFeatures(altExp(tse,"Phylum"), detection = 0, prevalence = 50/100) ``` -3. Visualize prevalence using a violin/bean plot. +3. How many features are left ? ```{r} #| label: prevalence-ex3 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +length(assay(prevalenceTse, "counts")) +``` + +4. Recalculate the most prevalent features based on relative abundance. +How many are left? + +```{r} +#| label: prevalence-ex4 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") + +altExp(tse,"Phylum2") <- mergeFeaturesByRank(tse, "Phylum") + +prevalenceTse <- subsetByPrevalentFeatures(altExp(tse,"Phylum2"), detection = 0, + prevalence = 50/100) + +length(assay(prevalenceTse, "relabundance")) +``` + +5. Visualize prevalence using a violin/bean plot. + +```{r} +#| label: prevalence-ex5 #| eval: False #| code-fold: true #| code-summary: "Show solution" From c6a9000d08052a22503f23e048b87f75799addc6 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Thu, 30 May 2024 11:48:47 +0300 Subject: [PATCH 03/18] removed AltExp usage from prevalence exercise --- inst/pages/98_exercises.qmd | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 834df89db..24578bfa1 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -636,7 +636,7 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::add #| code-fold: true #| code-summary: "Show solution" -# First, merge the features by rank +# First, merge the features by rank and add it as an alternative experiment. altExp(tse,"Phylum") <- mergeFeaturesByRank(tse, "Phylum") # Then get the prevalence for a specific sample @@ -653,13 +653,13 @@ getPrevalence(altExp(tse,"Phylum"), detection = 1/100, #| code-summary: "Show solution" # Get the prevalent features -prevalentFeatures <- getPrevalentFeatures(altExp(tse,"Phylum"), detection = 0, +prevalentFeatures <- getPrevalentFeatures(tse, detection = 0, prevalence = 50/100) # Subset the data to the prevalent features prevalenceTse <- tse[rowData(tse)$Phylum %in% prevalentFeatures,] # Alternatively subsetByPrevalentFeatures can be used to subset directly -prevalenceTse <- subsetByPrevalentFeatures(altExp(tse,"Phylum"), detection = 0, +prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0, prevalence = 50/100) ``` 3. How many features are left ? @@ -684,9 +684,7 @@ How many are left? tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") -altExp(tse,"Phylum2") <- mergeFeaturesByRank(tse, "Phylum") - -prevalenceTse <- subsetByPrevalentFeatures(altExp(tse,"Phylum2"), detection = 0, +prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0, prevalence = 50/100) length(assay(prevalenceTse, "relabundance")) @@ -704,13 +702,13 @@ length(assay(prevalenceTse, "relabundance")) library(scater) # Store the prevalence of each taxon -rowData(altExp(tse,"Phylum"))$prevalence <- - getPrevalence(altExp(tse,"Phylum"), detection = 1/100, +rowData(tse)$prevalence <- + getPrevalence(tse, detection = 1/100, sort = FALSE, assay.type = "counts", as_relative = TRUE) # Then plot the row data -plotRowData(altExp(tse,"Phylum"), "prevalence", +plotRowData(tse, "prevalence", colour_by = "Phylum") ``` From 1d04e52a94daa6fb0fcb68bcd5b1680b81663536 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Fri, 31 May 2024 09:25:06 +0300 Subject: [PATCH 04/18] made requested changes --- inst/pages/98_exercises.qmd | 38 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 24578bfa1..74c6d2065 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -565,9 +565,9 @@ colData(tse)$Barcode_full_length ### Subsetting -In this exercise, we'll go over the basics of subsetting a TSE object. +In this exercise, we'll go over the basics of subsetting a TreeSE object. Keep in mind that it's good practice to always keep the original data intact. -Therefore, we suggest creating a new tse object whenever you subset. +Therefore, we suggest creating a new TreeSE object whenever you subset. Please have a try at the following exercises. @@ -625,7 +625,7 @@ unique(rowData(tse_subset_by_feature)$Phylum) 1. Calculate library sizes 2. Subsample / rarify the counts (see: rarefyAssay) -Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::addPerCellQC, mia::mergeFeaturesByRank +Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::addPerCellQC, mia::agglomerateByRank ### Prevalent and core taxonomic features @@ -637,7 +637,7 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::add #| code-summary: "Show solution" # First, merge the features by rank and add it as an alternative experiment. -altExp(tse,"Phylum") <- mergeFeaturesByRank(tse, "Phylum") +altExp(tse,"Phylum") <- agglomerateByRank(tse, "Phylum") # Then get the prevalence for a specific sample getPrevalence(altExp(tse,"Phylum"), detection = 1/100, @@ -702,8 +702,8 @@ length(assay(prevalenceTse, "relabundance")) library(scater) # Store the prevalence of each taxon -rowData(tse)$prevalence <- - getPrevalence(tse, detection = 1/100, +rowData(prevalenceTse)$prevalence <- + getPrevalence(prevalenceTse, detection = 1/100, sort = FALSE, assay.type = "counts", as_relative = TRUE) @@ -712,8 +712,6 @@ plotRowData(tse, "prevalence", colour_by = "Phylum") ``` - - Useful functions: getPrevalence, getPrevalentFeatures, subsetByPrevalentFeatures ### Data exploration @@ -722,14 +720,13 @@ Useful functions: getPrevalence, getPrevalentFeatures, subsetByPrevalentFeatures 2. Create two histograms. Another shows the distribution of absolute counts, another shows how CLR transformed values are distributed. 3. Visualize how relative abundances are distributed between taxa in samples. -Useful functions: nrow, ncol, dim, summary, table, quantile, unique, transformAssay, ggplot, wilcox.test, mergeFeaturesByRank, miaViz::plotAbundance +Useful functions: nrow, ncol, dim, summary, table, quantile, unique, transformAssay, ggplot, wilcox.test, agglomerateByRank, miaViz::plotAbundance ### Other functions 1. Merge data objects (merge, mergeSEs) 2. Melt the data for visualization purposes (meltSE) - ### Assay transformation 1. Import the mia package, load any of the example data sets mentioned in Chapter 3.3 @@ -754,7 +751,7 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, transformAs with `data` (you need one with taxonomic information at Phylum level) and store it into a variable named `tse`. 2. List the available taxonomic ranks in the data with `taxonomyRanks`. -3. Agglomerate the data to Phylum level with `mergeFeaturesByRank` and the +3. Agglomerate the data to Phylum level with `agglomerateByRank` and the appropriate value for `Rank`. 4. Report the dimensions of the TreeSE before and after agglomerating. You can use `dim` for that. @@ -801,7 +798,6 @@ tse <- Tengeler2020 2. Calculate relative abundances and create alternative experiments using `splitByRanks` or `splitOn` - ```{r} #| label: beta-AltExp-ex2 #| eval: FALSE @@ -873,7 +869,7 @@ p contain the alpha diversity indices? 4. **Extra**: Agglomerate the TreeSE by phylum and compare the mean Shannon diversity of the original experiment with its agglomerated version. You can - use `mergeFeaturesByRank` to perform agglomeration and `mean` to calculate the + use `agglomerateByRank` to perform agglomeration and `mean` to calculate the mean values of the respective columns in colData. ### Visualization @@ -1088,7 +1084,7 @@ features by a rank of your choice. tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") -tse <- mergeFeaturesByRank(tse, +tse <- agglomerateByRank(tse, rank = "Species") ``` @@ -1101,12 +1097,6 @@ tse <- mergeFeaturesByRank(tse, #| code-summary: "Show solution" tse$Group <- tse$SampleType == "Feces" - -permanova <- adonis2(t(assay(tse, "relabundance")) ~ Group, - by = "margin", # each term (here only 'Group') analyzed individually - data = colData(tse), - method = "bray", - permutations = 99) ``` 4. Implement dbRDA on relative abundances and perform another permutational analysis on the redundancy analysis. @@ -1124,8 +1114,14 @@ permanova2 <- anova.cca(dbrda, by = "margin", method = "bray", permutations = 99) +permanova2 ``` +The output of the anova holds information on the variation differences between +the chosen groups, in our case w have a p-value of 0.01, indicating that +the groups are statistically different and are unlikely to be due to random +variation + 5. Extract the p-values. Can the differences between samples be explained with variables from their metadata? ```{r} #| label: dbRDA-ex5 @@ -1140,7 +1136,7 @@ p_values <-as.data.frame(p_values) rownames(p_values) <- c("adonis2", "dbRDA+anova.cca") ``` -Useful functions: scater::runMDS, runRDA, vegan::anova.cca, transformAssay, mergeFeaturesByRank, ggplot, scater::plotReducedDim, vegan::adonis2 +Useful functions: scater::runMDS, runRDA, vegan::anova.cca, transformAssay, agglomerateByRank, ggplot, scater::plotReducedDim, vegan::adonis2 ## Differential abundance From 6d2d3aa2c03610b7884582bfb9864188091618e8 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Fri, 31 May 2024 10:50:04 +0300 Subject: [PATCH 05/18] made requested changes --- inst/pages/98_exercises.qmd | 53 ++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 74c6d2065..baa8b8c87 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -603,18 +603,33 @@ dim(tse_subset_by_sample_and_feature) 3. Subset the TreeSE object to specific samples and features. +3.1. Find some samples and features to subset with. + ```{r} -#| label: subsetting-ex3 +#| label: subsetting-ex3.1 #| eval: True #| code-fold: true #| code-summary: "Show solution" -tse_subset_by_feature <- tse[rowData(tse)$Phylum %in% c("Crenarchaeota", - "Euryarchaeota", - "Actinobacteria") +head(unique(rowData(tse)$Phylum)) # Features + +unique(tse$SampleType) # Samples +``` + +3.2. Subset the TreeSE with your found samples and features. + +```{r} +#| label: subsetting-ex3.2 +#| eval: True +#| code-fold: true +#| code-summary: "Show solution" + +features <- c("Crenarchaeota","Euryarchaeota","Actinobacteria") +samples <- c("Feces", "Skin", "Tongue") + +tse_subset_by_feature <- tse[rowData(tse)$Phylum %in% features , - tse$SampleType %in% - c("Feces", "Skin", "Tongue")] + tse$SampleType %in% samples] unique(rowData(tse_subset_by_feature)$Phylum) ``` @@ -1084,8 +1099,7 @@ features by a rank of your choice. tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") -tse <- agglomerateByRank(tse, - rank = "Species") +tse <- agglomerateByRank(tse, rank = "Species") ``` 3. Select a group of samples and perform a permutational analysis on the calculated relative abundance using Bray-Curtis dissimilarities @@ -1107,20 +1121,21 @@ tse$Group <- tse$SampleType == "Feces" #| code-fold: true #| code-summary: "Show solution" -dbrda <- dbrda(t(assay(tse,"relabundance")) ~ Group, - data = colData(tse)) +tse2 <- runRDA(tse, + assay.type = "relabundance", + formula = assay ~ Group, + distance = "bray", + na.action = na.exclude) + +rda_info <- attr(reducedDim(tse2, "RDA"), "significance") -permanova2 <- anova.cca(dbrda, - by = "margin", - method = "bray", - permutations = 99) -permanova2 +rda_info ``` -The output of the anova holds information on the variation differences between -the chosen groups, in our case w have a p-value of 0.01, indicating that -the groups are statistically different and are unlikely to be due to random -variation +rda_info now holds information on the variation differences between +the chosen groups, in our case, the homogeneity test has a p-value of 0.01, +indicating that the groups are statistically different and it's unlikely to be +due to random variation 5. Extract the p-values. Can the differences between samples be explained with variables from their metadata? ```{r} From dbc85a7af2cec48c87cc7ddce635099b260d5cb2 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:11:38 +0300 Subject: [PATCH 06/18] removed agglomeration and fixed the p-values --- inst/pages/98_exercises.qmd | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index baa8b8c87..81cbdb69a 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1098,8 +1098,6 @@ features by a rank of your choice. #| code-summary: "Show solution" tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") - -tse <- agglomerateByRank(tse, rank = "Species") ``` 3. Select a group of samples and perform a permutational analysis on the calculated relative abundance using Bray-Curtis dissimilarities @@ -1121,13 +1119,13 @@ tse$Group <- tse$SampleType == "Feces" #| code-fold: true #| code-summary: "Show solution" -tse2 <- runRDA(tse, +tse <- runRDA(tse, assay.type = "relabundance", formula = assay ~ Group, distance = "bray", na.action = na.exclude) -rda_info <- attr(reducedDim(tse2, "RDA"), "significance") +rda_info <- attr(reducedDim(tse, "RDA"), "significance") rda_info ``` @@ -1144,11 +1142,7 @@ due to random variation #| code-fold: true #| code-summary: "Show solution" -p_values <- c(permanova["Group", "Pr(>F)"], permanova2["Group", "Pr(>F)"]) - -p_values <-as.data.frame(p_values) - -rownames(p_values) <- c("adonis2", "dbRDA+anova.cca") +rda_info$permanova$`Pr(>F)`[1:2] ``` Useful functions: scater::runMDS, runRDA, vegan::anova.cca, transformAssay, agglomerateByRank, ggplot, scater::plotReducedDim, vegan::adonis2 From 2db16b656e4da67f067c6741163512a768253f9e Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:15:16 +0300 Subject: [PATCH 07/18] changed interpretation --- inst/pages/98_exercises.qmd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 81cbdb69a..d86ddf1ff 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1132,10 +1132,11 @@ rda_info rda_info now holds information on the variation differences between the chosen groups, in our case, the homogeneity test has a p-value of 0.01, -indicating that the groups are statistically different and it's unlikely to be -due to random variation +indicating that the groups have different variances, making it so that the +permanova results are compromised and do not correctly explain whether there are +significant differences between groups. -5. Extract the p-values. Can the differences between samples be explained with variables from their metadata? +5. Nevertheless, extract the p-values. Can the differences between samples be explained with variables from their metadata? ```{r} #| label: dbRDA-ex5 #| eval: FALSE From 4605e6ace606adb69b9ae3a6c6cf22017a86b938 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:01:42 +0300 Subject: [PATCH 08/18] Added heatmap exercices --- inst/pages/98_exercises.qmd | 140 ++++++++++++++++++++++++++++++++++-- 1 file changed, 136 insertions(+), 4 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index d86ddf1ff..94c1ad49b 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1434,13 +1434,145 @@ Useful functions: scater::runMDS, runNMDS, transformAssay, ggplot, scater::plotR ### Heatmap visualization -1. Load experimental dataset from mia. -2. Visualize abundances with heatmap. -3. Visualize abundances with heatmap after CLR + Z transformation. +1. Load one of the datasets from mia as well as miaViz. + +```{r} +#| label: heatmap-ex1 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +library(mia) +library(miaViz) +data("GlobalPatterns", package = "mia") +tse <- GlobalPatterns +``` -See the OMA book for examples. +2. Add a relative abundance assay to the TSE. + +```{r} +#| label: heatmap-ex2 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") +``` + +3. Melt the tse into a dataframe, order it by decreasing relative abundance and subset to the first 25 rows for better visibility when plotting the heatmap. + +```{r} +#| label: heatmap-ex3 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +df <- meltAssay(tse, assay.type = "relabundance") + +df <- df[order(abs(df$relabundance), decreasing=TRUE), ] + +top_taxa_samples <- df[1:20,1:2] + +df <- df[df$FeatureID %in%top_taxa_samples$FeatureID & + df$SampleID %in%top_taxa_samples$SampleID, ] +``` + +4. Calculate and store the limits for the heatmap's color scale as well as it's colors breaks and the colors to be used for the color scale. + +```{r} +#| label: heatmap-ex4 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +maxval <- max(abs(df$relabundance)) +limits <- c(0, maxval) +breaks <- seq(from = 0, to = max(limits), by = 0.05) +colours <- c("#F0F0FF", "#007562") +``` + +5. Visualize the relative abundances using the previously created variables. + +```{r} +#| label: heatmap-ex5 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +ggplot(df, aes(x = SampleID, y = FeatureID, fill = relabundance)) + + geom_tile() + + scale_fill_gradientn(name = "Relative abundance", + breaks = breaks, limits = limits, colours = colours) + + theme(text = element_text(size=10), + axis.text.x = element_text(angle=45, hjust=1), + legend.key.size = unit(1, "cm")) + + labs(x = "Samples", y = "Taxa") +``` +### Advanced Heatmaps +1. Import mia, Miaviz, complexHeatmap as well as a dataset of your choice + + +```{r} +#| label: advanced-heatmap-ex1 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +library(mia) +library(miaViz) +library(ComplexHeatmap) + +data("Tengeler2020", package = "mia") +tse <- Tengeler2020 +``` + + +2. Create an assay with relative abundance, then merge the feature by rank Òrder`. + +```{r} +#| label: advanced-heatmap-ex2 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +tse <- transformAssay(tse, assay.type = "counts", + method = "relabundance", pseudocount = 1) + +tse <- mergeFeaturesByRank(tse, rank = "Order") + +``` + +3. Add CLR-Z transformation to the TreeSE. + +```{r} +#| label: heatmap-ex6 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +tse<- transformAssay(tse, assay.type = "relabundance", + method = "clr", pseudocount = TRUE) + +# Add z-transformation on features (taxa) +tse <- transformAssay(tse, assay.type = "clr", + MARGIN = "features", + method = "z", name = "clr_z") +``` + +4. plot a heatmap using the CLR-Z transformation + + + +```{r} +#| label: heatmap-ex6 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +Heatmap(assay(tse, "clr_z"), name = "clr-z") +``` ## Multiomics From 074d7a41b17b8e08af8a45fc341cfa6f5989c2c1 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:02:35 +0300 Subject: [PATCH 09/18] Added heatmap exercices --- inst/pages/98_exercises.qmd | 4 ---- 1 file changed, 4 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 94c1ad49b..dca187b44 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1513,7 +1513,6 @@ ggplot(df, aes(x = SampleID, y = FeatureID, fill = relabundance)) + 1. Import mia, Miaviz, complexHeatmap as well as a dataset of your choice - ```{r} #| label: advanced-heatmap-ex1 #| code-fold: true @@ -1528,7 +1527,6 @@ data("Tengeler2020", package = "mia") tse <- Tengeler2020 ``` - 2. Create an assay with relative abundance, then merge the feature by rank Òrder`. ```{r} @@ -1563,8 +1561,6 @@ tse <- transformAssay(tse, assay.type = "clr", 4. plot a heatmap using the CLR-Z transformation - - ```{r} #| label: heatmap-ex6 #| code-fold: true From 06ca49bf3594dc6ff479880c5054813b2873b425 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Fri, 7 Jun 2024 13:21:41 +0300 Subject: [PATCH 10/18] updated heatmap exercise --- inst/pages/98_exercises.qmd | 61 ++++++++++++------------------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index dca187b44..428f25f6a 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -508,7 +508,7 @@ However, such data are not always included. ```{r} #| label: Other-elements-ex1 -#| eval: TRUE +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -575,7 +575,7 @@ Please have a try at the following exercises. ```{r} #| label: subsetting-ex1 -#| eval: True +#| eval: true #| code-fold: true #| code-summary: "Show solution" @@ -811,7 +811,7 @@ data("Tengeler2020", package = "mia") tse <- Tengeler2020 ``` -2. Calculate relative abundances and create alternative experiments using `splitByRanks` or `splitOn` +2. Agglomerate data to all available taxonomy ranks by using `agglomerateByRanks()` ```{r} #| label: beta-AltExp-ex2 @@ -819,9 +819,12 @@ tse <- Tengeler2020 #| code-fold: true #| code-summary: "Show solution" + + +altExps(tse) <- agglomerateByRanks(tse) + tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") -altExps(tse) <- splitByRanks(tse) altExps(tse) ``` @@ -1434,7 +1437,7 @@ Useful functions: scater::runMDS, runNMDS, transformAssay, ggplot, scater::plotR ### Heatmap visualization -1. Load one of the datasets from mia as well as miaViz. +1. Load one of the datasets from mia and the miaViz and sechm packages. ```{r} #| label: heatmap-ex1 @@ -1448,18 +1451,7 @@ data("GlobalPatterns", package = "mia") tse <- GlobalPatterns ``` -2. Add a relative abundance assay to the TSE. - -```{r} -#| label: heatmap-ex2 -#| code-fold: true -#| code-summary: "Show solution" -#| eval: false - -tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") -``` - -3. Melt the tse into a dataframe, order it by decreasing relative abundance and subset to the first 25 rows for better visibility when plotting the heatmap. +2. Get the most prevalent features. Subset your TreeSe to those features and the first twenty taxa. ```{r} #| label: heatmap-ex3 @@ -1467,31 +1459,23 @@ tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") #| code-summary: "Show solution" #| eval: false -df <- meltAssay(tse, assay.type = "relabundance") - -df <- df[order(abs(df$relabundance), decreasing=TRUE), ] +most_prevalent <- names(head(getPrevalence(tse,sort=TRUE))) -top_taxa_samples <- df[1:20,1:2] - -df <- df[df$FeatureID %in%top_taxa_samples$FeatureID & - df$SampleID %in%top_taxa_samples$SampleID, ] +tse <- tse[most_prevalent, 1:20] ``` -4. Calculate and store the limits for the heatmap's color scale as well as it's colors breaks and the colors to be used for the color scale. +3. Add a relative abundance assay to the TreeSE. ```{r} -#| label: heatmap-ex4 +#| label: heatmap-ex3 #| code-fold: true #| code-summary: "Show solution" #| eval: false -maxval <- max(abs(df$relabundance)) -limits <- c(0, maxval) -breaks <- seq(from = 0, to = max(limits), by = 0.05) -colours <- c("#F0F0FF", "#007562") +tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") ``` -5. Visualize the relative abundances using the previously created variables. +5. Visualize the relative abundances with a heatmap using the sechm library ```{r} #| label: heatmap-ex5 @@ -1499,19 +1483,14 @@ colours <- c("#F0F0FF", "#007562") #| code-summary: "Show solution" #| eval: false -ggplot(df, aes(x = SampleID, y = FeatureID, fill = relabundance)) + - geom_tile() + - scale_fill_gradientn(name = "Relative abundance", - breaks = breaks, limits = limits, colours = colours) + - theme(text = element_text(size=10), - axis.text.x = element_text(angle=45, hjust=1), - legend.key.size = unit(1, "cm")) + - labs(x = "Samples", y = "Taxa") +setSechmOption("hmcols", value=c("#F0F0FF","#007562")) + +sechm(tse,features = most_prevalent , assayName="relabundance", show_colnames=TRUE) ``` ### Advanced Heatmaps -1. Import mia, Miaviz, complexHeatmap as well as a dataset of your choice +1. Import mia, miaviz, complexHeatmap as well as a dataset of your choice ```{r} #| label: advanced-heatmap-ex1 @@ -1538,7 +1517,7 @@ tse <- Tengeler2020 tse <- transformAssay(tse, assay.type = "counts", method = "relabundance", pseudocount = 1) -tse <- mergeFeaturesByRank(tse, rank = "Order") +tse <- agglomerateByRank(tse, rank = "Order") ``` From 23ed887fbb41d5a8cee0a3448c78a651e5870801 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Fri, 7 Jun 2024 14:19:50 +0300 Subject: [PATCH 11/18] updated heatmap exercise --- inst/pages/98_exercises.qmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 428f25f6a..67d4cc207 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1447,6 +1447,8 @@ Useful functions: scater::runMDS, runNMDS, transformAssay, ggplot, scater::plotR library(mia) library(miaViz) +library(sechm) + data("GlobalPatterns", package = "mia") tse <- GlobalPatterns ``` From b77a3bc67392bd4eace72d04c716663c05befa96 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:58:53 +0300 Subject: [PATCH 12/18] advanced heatmap is now more adanced --- inst/pages/98_exercises.qmd | 83 ++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 67d4cc207..15afb16ca 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -575,7 +575,7 @@ Please have a try at the following exercises. ```{r} #| label: subsetting-ex1 -#| eval: true +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -589,7 +589,7 @@ unique(tse_subset_by_sample$SampleType) ```{r} #| label: subsetting-ex2 -#| eval: True +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -607,7 +607,7 @@ dim(tse_subset_by_sample_and_feature) ```{r} #| label: subsetting-ex3.1 -#| eval: True +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -620,7 +620,7 @@ unique(tse$SampleType) # Samples ```{r} #| label: subsetting-ex3.2 -#| eval: True +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -647,7 +647,7 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::add 1. Estimate prevalence for your chosen feature (specific row and taxonomic group) ```{r} #| label: prevalence-ex1 -#| eval: True +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -663,7 +663,7 @@ getPrevalence(altExp(tse,"Phylum"), detection = 1/100, ```{r} #| label: prevalence-ex2 -#| eval: True +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -681,7 +681,7 @@ prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0, ```{r} #| label: prevalence-ex3 -#| eval: True +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -693,7 +693,7 @@ How many are left? ```{r} #| label: prevalence-ex4 -#| eval: True +#| eval: FALSE #| code-fold: true #| code-summary: "Show solution" @@ -1453,17 +1453,15 @@ data("GlobalPatterns", package = "mia") tse <- GlobalPatterns ``` -2. Get the most prevalent features. Subset your TreeSe to those features and the first twenty taxa. +2. Get the most prevalent features. Subset your TreeSe to those features and the first 5. ```{r} -#| label: heatmap-ex3 +#| label: heatmap-ex2 #| code-fold: true #| code-summary: "Show solution" #| eval: false -most_prevalent <- names(head(getPrevalence(tse,sort=TRUE))) - -tse <- tse[most_prevalent, 1:20] +tse <- subsetByPrevalent(tse,prevalence=0.99999)[1:5,] ``` 3. Add a relative abundance assay to the TreeSE. @@ -1477,10 +1475,10 @@ tse <- tse[most_prevalent, 1:20] tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") ``` -5. Visualize the relative abundances with a heatmap using the sechm library +4. Visualize the relative abundances with a heatmap using the sechm library ```{r} -#| label: heatmap-ex5 +#| label: heatmap-ex4 #| code-fold: true #| code-summary: "Show solution" #| eval: false @@ -1492,7 +1490,7 @@ sechm(tse,features = most_prevalent , assayName="relabundance", show_coln ### Advanced Heatmaps -1. Import mia, miaviz, complexHeatmap as well as a dataset of your choice +1. Import mia, miaviz, complexHeatmap, shadowtext as well as a dataset of your choice ```{r} #| label: advanced-heatmap-ex1 @@ -1502,13 +1500,14 @@ sechm(tse,features = most_prevalent , assayName="relabundance", show_coln library(mia) library(miaViz) -library(ComplexHeatmap) +library(shadowtext) +library(ComplexHeatmap) data("Tengeler2020", package = "mia") tse <- Tengeler2020 ``` -2. Create an assay with relative abundance, then merge the feature by rank Òrder`. +2. Agglomerate the data by prevalence to a rank of your choice, apply log10 transformation and assure unique rownames for when we plot the heatmap. ```{r} #| label: advanced-heatmap-ex2 @@ -1516,39 +1515,55 @@ tse <- Tengeler2020 #| code-summary: "Show solution" #| eval: false -tse <- transformAssay(tse, assay.type = "counts", - method = "relabundance", pseudocount = 1) +tse <- agglomerateByPrevalence(tse, rank = "Family") -tse <- agglomerateByRank(tse, rank = "Order") +tse <- transformAssay(tse, method = "log10", pseudocount = TRUE) + +rownames(tse) <- getTaxonomyLabels(tse) ``` -3. Add CLR-Z transformation to the TreeSE. +3. Calculate the correlations and their respective p-values between the log0 and count assays. ```{r} -#| label: heatmap-ex6 +#| label: advanced-heatmap-ex3 #| code-fold: true #| code-summary: "Show solution" #| eval: false -tse<- transformAssay(tse, assay.type = "relabundance", - method = "clr", pseudocount = TRUE) - -# Add z-transformation on features (taxa) -tse <- transformAssay(tse, assay.type = "clr", - MARGIN = "features", - method = "z", name = "clr_z") +correlations <- getCrossAssociation(tse, + test_significance=TRUE, + assay.type1 = "log10", + assay.type2 = "counts", + method = "spearman", + p_adj_threshold = NULL, + cor_threshold = NULL, + # Remove when mia is fixed + mode = "matrix", + sort = TRUE, + show_warnings = FALSE) ``` -4. plot a heatmap using the CLR-Z transformation - +4. plot a heatmap using the correlations and mark significant correlations (p<0.05)with a cross. ```{r} -#| label: heatmap-ex6 +#| label: advanced-heatmap-ex4 #| code-fold: true #| code-summary: "Show solution" #| eval: false -Heatmap(assay(tse, "clr_z"), name = "clr-z") +# Create a heatmap and store it +plot <- Heatmap(correlations$cor, + # Print values to cells + cell_fun = function(j, i, x, y, width, height, fill) { + # If the p-value is under threshold + if( !is.na(correlations$pval[i, j]) & correlations$pval[i, j] < 0.05 ){ + # Print "X" + grid.shadowtext(sprintf("%s", "X"), x, y, gp = gpar(fontsize = 8, col = "#f5f5f5")) + } + }, + heatmap_legend_param = list(title = "", legend_height = unit(5, "cm")) +) +plot ``` ## Multiomics From 59960d1c75b857e7a87b2a29a47d6bbac2a9d2db Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Thu, 13 Jun 2024 09:35:38 +0300 Subject: [PATCH 13/18] Fixed some minor bugs --- inst/pages/98_exercises.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 15afb16ca..21f70314c 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1453,7 +1453,7 @@ data("GlobalPatterns", package = "mia") tse <- GlobalPatterns ``` -2. Get the most prevalent features. Subset your TreeSe to those features and the first 5. +2. Get the most prevalent features. Subset your TreeSe to those features and the first 5 taxa. ```{r} #| label: heatmap-ex2 From 3848ba0d391a7ef61da79911d8814edc7775fd39 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Wed, 19 Jun 2024 09:53:49 +0300 Subject: [PATCH 14/18] final changes --- inst/pages/98_exercises.qmd | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 57b191c85..513cc0c4f 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -582,10 +582,10 @@ Please have a try at the following exercises. #| code-fold: true #| code-summary: "Show solution" -tse_subset_by_sample <- tse[ , tse$SampleType %in% +tse.sub <- tse[ , tse$SampleType %in% c("Feces", "Skin", "Tongue")] -unique(tse_subset_by_sample$SampleType) +unique(tse.sub$SampleType) ``` 2. Subset the TreeSE object to specific features. @@ -596,12 +596,12 @@ unique(tse_subset_by_sample$SampleType) #| code-fold: true #| code-summary: "Show solution" -tse_subset_by_sample_and_feature <- tse[rowData(tse)$Phylum %in% c("Crenarchaeota", +tse.sub.feat <- tse[rowData(tse)$Phylum %in% c("Crenarchaeota", "Euryarchaeota", "Actinobacteria") ,] -dim(tse_subset_by_sample_and_feature) +dim(tse.sub.feat) ``` 3. Subset the TreeSE object to specific samples and features. @@ -657,7 +657,7 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::add # First, merge the features by rank and add it as an alternative experiment. altExp(tse,"Phylum") <- agglomerateByRank(tse, "Phylum") -# Then get the prevalence for a specific sample +# Then get the prevalence for a specific feature getPrevalence(altExp(tse,"Phylum"), detection = 1/100, sort = TRUE, assay.type = "counts", as_relative = TRUE)["Phylum:Proteobacteria"] @@ -671,14 +671,15 @@ getPrevalence(altExp(tse,"Phylum"), detection = 1/100, #| code-summary: "Show solution" # Get the prevalent features -prevalentFeatures <- getPrevalentFeatures(tse, detection = 0, - prevalence = 50/100) +prevalentFeatures <- as.integer(getPrevalentFeatures(tse, detection = 1, + prevalence = 50/100)) # Subset the data to the prevalent features -prevalenceTse <- tse[rowData(tse)$Phylum %in% prevalentFeatures,] +tse.prevalent <- tse[rownames(rowData(tse)) %in% prevalentFeatures,] # Alternatively subsetByPrevalentFeatures can be used to subset directly -prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0, - prevalence = 50/100) +tse.prevalent2 <- subsetByPrevalentFeatures(tse, detection = 1, + prevalence = 50/100) +identical(tse.prevalent,tse.prevalent2) ``` 3. How many features are left ? @@ -688,7 +689,7 @@ prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0, #| code-fold: true #| code-summary: "Show solution" -length(assay(prevalenceTse, "counts")) +length(assay(tse.prevalent, "counts")) ``` 4. Recalculate the most prevalent features based on relative abundance. @@ -702,7 +703,7 @@ How many are left? tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") -prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0, +prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0.1/100, prevalence = 50/100) length(assay(prevalenceTse, "relabundance")) @@ -723,10 +724,10 @@ library(scater) rowData(prevalenceTse)$prevalence <- getPrevalence(prevalenceTse, detection = 1/100, sort = FALSE, - assay.type = "counts", as_relative = TRUE) + assay.type = "relabundance") # Then plot the row data -plotRowData(tse, "prevalence", +plotRowData(prevalenceTse, "prevalence", colour_by = "Phylum") ``` @@ -1485,9 +1486,7 @@ tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") #| code-fold: true #| code-summary: "Show solution" #| eval: false - setSechmOption("hmcols", value=c("#F0F0FF","#007562")) - sechm(tse,features = most_prevalent , assayName="relabundance", show_colnames=TRUE) ``` @@ -1568,8 +1567,6 @@ plot <- Heatmap(correlations$cor, ) plot ``` - - ======= **Extra** Cool Exercise : Identify Patterns and Clusters To make the exercise more engaging, let's do an exploratory data analysis task: From ceda759bc0a6586bd9657c8e354dfa877698b72f Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Wed, 19 Jun 2024 11:07:33 +0300 Subject: [PATCH 15/18] final changes --- inst/pages/98_exercises.qmd | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 513cc0c4f..d339ef8c3 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -574,39 +574,9 @@ Therefore, we suggest creating a new TreeSE object whenever you subset. Please have a try at the following exercises. -1. Subset the TreeSE object to specific samples. +1. Subset the TreeSE object to specific samples and features. -```{r} -#| label: subsetting-ex1 -#| eval: FALSE -#| code-fold: true -#| code-summary: "Show solution" - -tse.sub <- tse[ , tse$SampleType %in% - c("Feces", "Skin", "Tongue")] - -unique(tse.sub$SampleType) -``` - -2. Subset the TreeSE object to specific features. - -```{r} -#| label: subsetting-ex2 -#| eval: FALSE -#| code-fold: true -#| code-summary: "Show solution" - -tse.sub.feat <- tse[rowData(tse)$Phylum %in% c("Crenarchaeota", - "Euryarchaeota", - "Actinobacteria") - ,] - -dim(tse.sub.feat) -``` - -3. Subset the TreeSE object to specific samples and features. - -3.1. Find some samples and features to subset with. +1.1. Find some samples and features to subset with. ```{r} #| label: subsetting-ex3.1 @@ -619,7 +589,7 @@ head(unique(rowData(tse)$Phylum)) # Features unique(tse$SampleType) # Samples ``` -3.2. Subset the TreeSE with your found samples and features. +1.2. Subset the TreeSE with your found samples and features. ```{r} #| label: subsetting-ex3.2 From 79bcb2a8436f1e59e3808f31462afdcfa8666fa2 Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Wed, 19 Jun 2024 11:46:11 +0300 Subject: [PATCH 16/18] corrected spelling mistakes --- inst/pages/98_exercises.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index d339ef8c3..a52ba27d7 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1495,7 +1495,7 @@ rownames(tse) <- getTaxonomyLabels(tse) ``` -3. Calculate the correlations and their respective p-values between the log0 and count assays. +3. Calculate the correlations and their respective p-values between the log10 and count assays. ```{r} #| label: advanced-heatmap-ex3 @@ -1516,7 +1516,7 @@ correlations <- getCrossAssociation(tse, show_warnings = FALSE) ``` -4. plot a heatmap using the correlations and mark significant correlations (p<0.05)with a cross. +4. plot a heatmap using the correlations and mark significant correlations (p<0.05) with a cross. ```{r} #| label: advanced-heatmap-ex4 #| code-fold: true From d602983597b3aeeae00f153c3fc2eec8247de8ea Mon Sep 17 00:00:00 2001 From: Insaynoah <74536546+Insaynoah@users.noreply.github.com> Date: Wed, 26 Jun 2024 07:38:58 +0300 Subject: [PATCH 17/18] removed equal signs --- inst/pages/98_exercises.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index a52ba27d7..039a0f67d 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -1537,7 +1537,7 @@ plot <- Heatmap(correlations$cor, ) plot ``` -======= + **Extra** Cool Exercise : Identify Patterns and Clusters To make the exercise more engaging, let's do an exploratory data analysis task: From 6024a80889ce3d317d278d5f3512d55182c4c3db Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Mon, 1 Jul 2024 19:13:06 +0300 Subject: [PATCH 18/18] up --- inst/pages/98_exercises.qmd | 221 ++++++++++++++++++++++-------------- 1 file changed, 135 insertions(+), 86 deletions(-) diff --git a/inst/pages/98_exercises.qmd b/inst/pages/98_exercises.qmd index 039a0f67d..9b2b583c1 100644 --- a/inst/pages/98_exercises.qmd +++ b/inst/pages/98_exercises.qmd @@ -584,6 +584,10 @@ Please have a try at the following exercises. #| code-fold: true #| code-summary: "Show solution" +library(mia) +data("GlobalPatterns", package = "mia") +tse <- GlobalPatterns + head(unique(rowData(tse)$Phylum)) # Features unique(tse$SampleType) # Samples @@ -597,12 +601,12 @@ unique(tse$SampleType) # Samples #| code-fold: true #| code-summary: "Show solution" -features <- c("Crenarchaeota","Euryarchaeota","Actinobacteria") +features <- c("Crenarchaeota", "Euryarchaeota", "Actinobacteria") samples <- c("Feces", "Skin", "Tongue") -tse_subset_by_feature <- tse[rowData(tse)$Phylum %in% features - , - tse$SampleType %in% samples] +rows <- rowData(tse)$Phylum %in% features +cols <- tse$SampleType %in% samples +tse_subset_by_feature <- tse[rows, cols] unique(rowData(tse_subset_by_feature)$Phylum) ``` @@ -618,20 +622,28 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, scater::add ### Prevalent and core taxonomic features 1. Estimate prevalence for your chosen feature (specific row and taxonomic group) + ```{r} #| label: prevalence-ex1 #| eval: FALSE #| code-fold: true #| code-summary: "Show solution" +library(mia) +data("GlobalPatterns", package = "mia") +tse <- GlobalPatterns + # First, merge the features by rank and add it as an alternative experiment. -altExp(tse,"Phylum") <- agglomerateByRank(tse, "Phylum") +tse_phylum <- agglomerateByRank(tse, "Phylum") + +# Calculate relative abundance +tse_phylum <- transformAssay(tse_phylum, method = "relabundance") + +# Then get the prevalence for a specific feature +res <- getPrevalence(tse_phylum, detection = 1/100, sort = TRUE, assay.type = "relabundance") +res["Proteobacteria"] +``` -# Then get the prevalence for a specific feature -getPrevalence(altExp(tse,"Phylum"), detection = 1/100, - sort = TRUE, assay.type = "counts", - as_relative = TRUE)["Phylum:Proteobacteria"] -``` 2. Identify all prevalent features and subset the data accordingly. ```{r} @@ -641,16 +653,16 @@ getPrevalence(altExp(tse,"Phylum"), detection = 1/100, #| code-summary: "Show solution" # Get the prevalent features -prevalentFeatures <- as.integer(getPrevalentFeatures(tse, detection = 1, - prevalence = 50/100)) +prevalent <- getPrevalent(tse, assay.type = "counts", detection = 1, prevalence = 0.1) # Subset the data to the prevalent features -tse.prevalent <- tse[rownames(rowData(tse)) %in% prevalentFeatures,] +tse_prevalent1 <- tse[rownames(tse) %in% prevalent, ] # Alternatively subsetByPrevalentFeatures can be used to subset directly -tse.prevalent2 <- subsetByPrevalentFeatures(tse, detection = 1, - prevalence = 50/100) -identical(tse.prevalent,tse.prevalent2) +tse_prevalent2 <- subsetByPrevalent(tse, assay.type = "counts", detection = 1, prevalence = 0.1) + +identical(tse_prevalent1, tse_prevalent2) ``` + 3. How many features are left ? ```{r} @@ -659,8 +671,8 @@ identical(tse.prevalent,tse.prevalent2) #| code-fold: true #| code-summary: "Show solution" -length(assay(tse.prevalent, "counts")) -``` +nrow(tse_prevalent1) +``` 4. Recalculate the most prevalent features based on relative abundance. How many are left? @@ -673,13 +685,12 @@ How many are left? tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") -prevalenceTse <- subsetByPrevalentFeatures(tse, detection = 0.1/100, - prevalence = 50/100) +tse_prevalent <- subsetByPrevalent(tse, assay.type = "relabundance", detection = 0.01/100, prevalence = 50/100) -length(assay(prevalenceTse, "relabundance")) +nrow(tse_prevalent) ``` -5. Visualize prevalence using a violin/bean plot. +5. Visualize prevalence of most prevalent Phyla using a violin/bean plot. ```{r} #| label: prevalence-ex5 @@ -690,18 +701,17 @@ length(assay(prevalenceTse, "relabundance")) # Import the scater library library(scater) +# Agglomerate based on prevalence. Agglomerate to Phylum level. +tse_phylum <- subsetByPrevalent(tse, rank = "Phylum", assay.type = "relabundance", detection = 0.001, prevalence = 0.2) # Store the prevalence of each taxon -rowData(prevalenceTse)$prevalence <- - getPrevalence(prevalenceTse, detection = 1/100, - sort = FALSE, - assay.type = "relabundance") +res <- getPrevalence(tse_phylum, detection = 0.001, sort = FALSE, assay.type = "relabundance") +rowData(tse_phylum)$prevalence <- res # Then plot the row data -plotRowData(prevalenceTse, "prevalence", - colour_by = "Phylum") +plotRowData(tse_phylum, "prevalence", colour_by = "Phylum") ``` -Useful functions: getPrevalence, getPrevalentFeatures, subsetByPrevalentFeatures +Useful functions: getPrevalence, getPrevalent, subsetByPrevalent ### Data exploration @@ -766,9 +776,9 @@ Useful functions: nrow, ncol, dim, summary, table, quantile, unique, transformAs As for assays, you can access the desired altExp by name or index. 6. **Extra**: Split the data based on other features with `splitOn`. -### Beta Diversity using Alternative experiments +### Beta Diversity -This Exercise will focus on calcuating dissimilarities or beta diversity using alternative experiments. +This Exercise will focus on calculating dissimilarities or beta diversity. 1. Import the mia and vegan packages and load a dataset. This can be your own or one of the packages built in to mia. @@ -793,16 +803,11 @@ tse <- Tengeler2020 #| code-fold: true #| code-summary: "Show solution" - - -altExps(tse) <- agglomerateByRanks(tse) - -tse <- transformAssay(tse, MARGIN = "samples", method="relabundance") - -altExps(tse) +tse <- agglomerateByRanks(tse) +altExpNames(tse) ``` -3. Run PCoA on on one of the created alternative experiments using Bray-Curtis dissimilarity. +3. Import the scater library and run PCoA on on one of the created alternative experiments using Bray-Curtis dissimilarity. ```{r} #| label: beta-AltExp-ex3 @@ -810,14 +815,24 @@ altExps(tse) #| code-fold: true #| code-summary: "Show solution" -tse <- runMDS(tse, - FUN = vegan::vegdist, - method = "bray", - assay.type = "relabundance", - name = "MDS_bray") +# We use scater library for calculating ordination +library(scater) + +# Choose rank +rank <- "Genus" +# Calculate relative abundance +altExp(tse, rank) <- transformAssay(altExp(tse, rank), MARGIN = "samples", method="relabundance") + +# Calculate ordination +altExp(tse, rank) <- runMDS( + altExp(tse, rank), + FUN = vegan::vegdist, + method = "bray", + assay.type = "relabundance", + name = "MDS_bray") ``` -4. Import the scater library, to plot the first two coordinates using the `reducedDim`function and plot the coordinates. +4. Plot the first two coordinates using the `reducedDim` function and plot the coordinates. ```{r} #| label: beta-AltExp-ex4 @@ -825,7 +840,7 @@ tse <- runMDS(tse, #| code-fold: true #| code-summary: "Show solution" -p <- plotReducedDim(tse, "MDS_bray") +p <- plotReducedDim(altExp(tse, rank), "MDS_bray") p ``` @@ -838,17 +853,18 @@ p #| code-summary: "Show solution" # Calculate explained variance -e <- attr(reducedDim(tse, "MDS_bray"), "eig") +e <- attr(reducedDim(altExp(tse, rank), "MDS_bray"), "eig") rel_eig <- e / sum(e[e > 0]) # Add explained variance for each axis -p <- p + labs(x = paste("PCoA 1 (", round(100 * rel_eig[[1]], 1), "%", ")", sep = ""), - y = paste("PCoA 2 (", round(100 * rel_eig[[2]], 1), "%", ")", sep = "")) +p <- p + + labs( + x = paste("PCoA 1 (", round(100 * rel_eig[[1]], 1), "%", ")", sep = ""), + y = paste("PCoA 2 (", round(100 * rel_eig[[2]], 1), "%", ")", sep = "")) p ``` - ## Community (alpha) diversity ### Estimation @@ -922,7 +938,6 @@ represent an important analysis in several studies. that you are using an appropriate statistical test for the number of groups and distribution. - ## Community similarity ### Reduced dimensions retrieval @@ -1096,11 +1111,12 @@ tse$Group <- tse$SampleType == "Feces" #| code-fold: true #| code-summary: "Show solution" -tse <- runRDA(tse, - assay.type = "relabundance", - formula = assay ~ Group, - distance = "bray", - na.action = na.exclude) +tse <- runRDA( + tse, + assay.type = "relabundance", + formula = assay ~ Group, + distance = "bray", + na.action = na.exclude) rda_info <- attr(reducedDim(tse, "RDA"), "significance") @@ -1113,18 +1129,18 @@ indicating that the groups have different variances, making it so that the permanova results are compromised and do not correctly explain whether there are significant differences between groups. -5. Nevertheless, extract the p-values. Can the differences between samples be explained with variables from their metadata? +5. Nevertheless, extract the p-values. Can the differences between samples be explained with variables from their metadata? + ```{r} #| label: dbRDA-ex5 #| eval: FALSE #| code-fold: true #| code-summary: "Show solution" -rda_info$permanova$`Pr(>F)`[1:2] +rda_info[["permanova"]][["Pr(>F)"]] ``` -Useful functions: scater::runMDS, runRDA, vegan::anova.cca, transformAssay, agglomerateByRank, ggplot, scater::plotReducedDim, vegan::adonis2 - +Useful functions: scater::runMDS, runRDA, transformAssay, agglomerateByRank ## Differential abundance @@ -1427,7 +1443,7 @@ data("GlobalPatterns", package = "mia") tse <- GlobalPatterns ``` -2. Get the most prevalent features. Subset your TreeSe to those features and the first 5 taxa. +2. Agglomerate your TreeSE to include the most prevalent Phyla. ```{r} #| label: heatmap-ex2 @@ -1435,7 +1451,7 @@ tse <- GlobalPatterns #| code-summary: "Show solution" #| eval: false -tse <- subsetByPrevalent(tse,prevalence=0.99999)[1:5,] +tse <- agglomerateByPrevalence(tse, rank = "Phylum", prevalence = 0.9) ``` 3. Add a relative abundance assay to the TreeSE. @@ -1457,7 +1473,7 @@ tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") #| code-summary: "Show solution" #| eval: false setSechmOption("hmcols", value=c("#F0F0FF","#007562")) -sechm(tse,features = most_prevalent , assayName="relabundance", show_colnames=TRUE) +sechm(tse, features = rownames(tse), assayName="relabundance", show_colnames=TRUE) ``` ### Advanced Heatmaps @@ -1492,7 +1508,6 @@ tse <- agglomerateByPrevalence(tse, rank = "Family") tse <- transformAssay(tse, method = "log10", pseudocount = TRUE) rownames(tse) <- getTaxonomyLabels(tse) - ``` 3. Calculate the correlations and their respective p-values between the log10 and count assays. @@ -1503,20 +1518,22 @@ rownames(tse) <- getTaxonomyLabels(tse) #| code-summary: "Show solution" #| eval: false -correlations <- getCrossAssociation(tse, - test_significance=TRUE, - assay.type1 = "log10", - assay.type2 = "counts", - method = "spearman", - p_adj_threshold = NULL, - cor_threshold = NULL, - # Remove when mia is fixed - mode = "matrix", - sort = TRUE, - show_warnings = FALSE) +correlations <- getCrossAssociation( + tse, + test.signif=TRUE, + assay.type1 = "log10", + assay.type2 = "counts", + method = "spearman", + p.adj.threshold = NULL, + cor.threshold = NULL, + # Remove when mia is fixed + mode = "matrix", + sort = TRUE, + show.warnings = FALSE) ``` 4. plot a heatmap using the correlations and mark significant correlations (p<0.05) with a cross. + ```{r} #| label: advanced-heatmap-ex4 #| code-fold: true @@ -1524,18 +1541,50 @@ correlations <- getCrossAssociation(tse, #| eval: false # Create a heatmap and store it -plot <- Heatmap(correlations$cor, - # Print values to cells - cell_fun = function(j, i, x, y, width, height, fill) { - # If the p-value is under threshold - if( !is.na(correlations$pval[i, j]) & correlations$pval[i, j] < 0.05 ){ - # Print "X" - grid.shadowtext(sprintf("%s", "X"), x, y, gp = gpar(fontsize = 8, col = "#f5f5f5")) - } - }, - heatmap_legend_param = list(title = "", legend_height = unit(5, "cm")) +p <- Heatmap( + correlations$cor, + # Print values to cells + cell_fun = function(j, i, x, y, width, height, fill) { + # If the p-value is under threshold + if( !is.na(correlations$pval[i, j]) & correlations$pval[i, j] < 0.05 ){ + # Print "X" + grid.shadowtext(sprintf("%s", "X"), x, y, gp = gpar(fontsize = 8, col = "#f5f5f5")) + } + }, + heatmap_legend_param = list(title = "", legend_height = unit(5, "cm")) ) -plot +p +``` + +5. Plot similar heatmap but adjust color scale to ensure that 0 is in the middle + +Hint: utilize circlize::colorRamp2() + +```{r} +#| label: advanced-heatmap-ex5 +#| code-fold: true +#| code-summary: "Show solution" +#| eval: false + +library(circlize) +# Update color scale to ensure 0 is in the middle +col_fun<-colorRamp2(c(-1, -0.5, 0, 0.5, 1), c("darkblue","skyblue","white", "brown1", "darkred")) + +# Create a heatmap and store it +p <- Heatmap( + correlations$cor, + # Print values to cells + cell_fun = function(j, i, x, y, width, height, fill) { + # If the p-value is under threshold + if( !is.na(correlations$pval[i, j]) & correlations$pval[i, j] < 0.05 ){ + # Print "X" + grid.shadowtext(sprintf("%s", "X"), x, y, gp = gpar(fontsize = 8, col = "#f5f5f5")) + } + }, + heatmap_legend_param = list(title = "", legend_height = unit(5, "cm")), + col = col_fun +) +p ``` **Extra** Cool Exercise : Identify Patterns and Clusters