-
Notifications
You must be signed in to change notification settings - Fork 86
WoM_2023_workshop
- 1.1.1 Taxonomic profile of one metagenome
- 1.1.2 Compute taxonomic profiles for all samples
- 1.1.3 Merge all taxonomic profiles
- 1.1.4 Filter taxonomic profiles to only SGB-level information
- 1.1.5 Visualize the species-level taxonomic profiles with a heatmap
- 1.1.6 Link metadata information (body site and life style) to each sample
- 1.1.7 Visualize all taxonomic profiles in a heatmap with metadata information
From short reads to a relative abudnace table detailing the microbial composition and their contribution to the community.
%%bash
cd Desktop/
sample="sample_o_w_1"
input=/home/hutlab_public/Tutorials/metaphlan4/input
output=metaphlan_output/${sample}
mkdir -p ${output}
# MetaPhlAn profiling
bzcat $(ls -1 ${input}/${sample}/${sample}*.fastq.bz2 | paste -sd ' ' -) | \
metaphlan \
--unclassified_estimation \
--nproc 4 \
--input_type fastq \
--bowtie2out ${output}/${sample}.bowtie2.bz2 \
--samout ${output}/${sample}.sam.bz2 > ${output}/${sample}_profile.tsv
%%bash
# input=/home/hutlab_public/Tutorials/metaphlan4/input
for s in $(ls -d ${input}/*/); do
sample=$(basename ${s})
# skip the sample already profiled above
if [ 'sample_o_w_1' == ${sample} ]; then
continue;
fi;
output=metaphlan_output/${sample}
mkdir ${output}
echo "Computing MetaPhlAn profile for: "${sample}
bzcat $(ls -1 ${input}/${sample}/${sample}*.fastq.bz2 | paste -sd ' ' -) | \
metaphlan \
--unclassified_estimation \
--nproc 4 \
--input_type fastq \
--bowtie2out ${output}/${sample}.bowtie2.bz2 \
--samout ${output}/${sample}.sam.bz2 > ${output}/${sample}_profile.tsv
done
Put all taxonomic profiles into one single table. Species identified in only one fo the samples will be reported with a relative abundance of 0 in all the others.
%%bash
merge_metaphlan_tables.py metaphlan_output/*/*_profile.tsv > metaphlan_output/merged_abundance_table.txt
%%bash
grep -E "sample|t__" metaphlan_output/merged_abundance_table.txt > metaphlan_output/merged_abundance_species.txt
echo "Total number of samples in the merged table:" $(head metaphlan_output/merged_abundance_table.txt | awk -F $'\t' '{ print NF+1 }' )
echo "Total number of species profiled across all samples:" $(wc -l metaphlan_output/merged_abundance_species.txt )
%%bash
hclust2.py \
-i metaphlan_output/merged_abundance_species.txt \
-o metaphlan_output/MetaPhlAn_heatmap.png \
--f_dist_f correlation \
--s_dist_f braycurtis \
--cell_aspect_ratio 1 \
--log_scale \
--flabel_size 7 \
--slabel_size 7 \
--max_flabel_len 100 \
--max_slabel_len 20 \
--dpi 150
import pandas as pd
import numpy as np
df_prof = pd.read_csv('metaphlan_output/merged_abundance_species.txt', header=0, sep='\t')
df_meta = pd.read_csv('/home/hutlab_public/Tutorials/metaphlan4/input/metadata_samples.tsv', header=0, sep='\t')
df_prof['species'] = df_prof['Unnamed: 0'].str.split('|').str[-2].str.strip().replace("s__", "", regex=True) + \
"__" + \
df_prof['Unnamed: 0'].str.split('|').str[-1].str.strip().replace("t__", "", regex=True)
df_prof.drop(set(['NCBI_tax_id', 'Unnamed: 0']).intersection(df_prof.columns.to_list()), axis=1, inplace=True)
df_all = df_prof.set_index('species').T
df_all.reset_index(level=0, inplace=True)
df_all.rename(columns={'index': 'sampleID'}, inplace=True)
df_all = df_all.merge(df_meta[['sampleID', 'body_site', 'life_style']] , on='sampleID', how='left', indicator=True)
subset = df_all[['sampleID', 'body_site', 'life_style' ]]
tuples = [tuple(x) for x in subset.to_numpy()]
df_prof.set_index("species", inplace=True)
df_prof.index.name = None
df_prof.columns = pd.MultiIndex.from_tuples(tuples, names=['sampleID', 'body_site', 'life_style'])
df_prof.to_csv('metaphlan_output/merged_abundance_table_species_metadata.txt', sep='\t', header=True, index=True)
%%bash
hclust2.py \
-i metaphlan_output/merged_abundance_table_species_metadata.txt \
-o metaphlan_output/MetaPhlAn_heatmap_metadata.png \
--legend_file metaphlan_output/MetaPhlAn_heatmap_metadata_legend.png \
--f_dist_f braycurtis \ #correlation
--s_dist_f braycurtis \
--cell_aspect_ratio 1 \
--log_scale \
--flabel_size 7 \
--slabel_size 7 \
--max_flabel_len 100 \
--max_slabel_len 20 \
--dpi 150 \
--metadata_height 0.05 \
--metadata_rows 1,2
See example of a different type and more complex dataset: The 25 most prevalent SGBs in 2,500 food microbiome samples.
- 1.2.1 Single-sample microbial composition (alpha diversity)
- 1.2.2 Visualize alpha diversity per group
- 1.2.3 Between-sample community composition comparison (beta diversity)
- 1.2.4 Dimensionality reduction of beta diversity microbial composition comparison and visualization
%%bash
mpa_path=/home/hutlab_public/.local/lib/python3.6/site-packages/metaphlan/utils
output=metaphlan_output/alpha_diversity
mkdir ${output}
for metric in richness shannon; do
Rscript ${mpa_path}/calculate_diversity.R \
-f metaphlan_output/merged_abundance_species.txt \
-o ${output} \
-d alpha \
-p alpha \
-m ${metric}
done
Back to 1.2 Microbiome composition analysis
Back to top
ls metaphlan_output/alpha_diversity/
# df_meta = pd.read_csv('/home/hutlab_public/Tutorials/metaphlan4/input/metadata_samples.tsv', header=0, sep='\t')
df_alpha = df_alpha.read_csv('metaphlan_output/alpha_diversity/alpha_shannon.tsv', sep='\t', header=0, index_col=0)
df_alpha = df_alpha.reset_index().rename(columns={'index':'sampleID'}).merge(df_meta[['sampleID', 'bs_ls' ]], on='sampleID', how='left')
plt.figure(figsize=(6, 5), dpi=150)
ax = sns.boxplot(data=df_alpha, y="diversity_shannon", x='bs_ls', hue='bs_ls')
ax.set_title('Shannon-Index')
plt.savefig('metaphlan_output/alpha_diversity/alpha_shannon.png', dpi=150)
Back to 1.2 Microbiome composition analysis
Back to top
%%bash
mpa_path=/home/hutlab_public/.local/lib/python3.6/site-packages/metaphlan/utils
output=metaphlan_output/beta_diversity
mkdir -p ${output}
for metric in bray-curtis weighted-unifrac; do
Rscript ${mpa_path}/calculate_diversity.R \
-f metaphlan_output/merged_abundance_species.txt \
-o ${output} \
-d beta \
-p beta \
-t /home/hutlab_public/.local/lib/python3.6/site-packages/metaphlan/utils/mpa_vJan21_CHOCOPhlAnSGB_202103.nwk \
-m ${metric}
done
Back to 1.2 Microbiome composition analysis
Back to top
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns; sns.set(style='ticks')
import matplotlib
import matplotlib.pyplot as plt
# load beta diversity distance-matric (Bray-Curtis)
bc_distmat = pd.read_csv('metaphlan_output/beta_diversity/beta_bray-curtis.tsv', sep='\t', header=0, index_col=0)
# calculate dimensionality reduction using t-SNE (t-Stochastic Neighbour Embedding)
coords_tsne = TSNE(n_components=2, metric="precomputed", random_state=42).fit_transform(bc_distmat)
bc_points = pd.DataFrame(bc_distmat.columns, columns=['sampleID'])
bc_points['tSNE1'] = [i[0] for i in coords_tsne]
bc_points['tSNE2'] = [i[1] for i in coords_tsne]
bc_points.to_csv('metaphlan_output/beta_diversity/beta_bray-curtis_tSNE.tsv', sep='\t', index=False)
# df_meta = pd.read_csv('metaphlan_output/metadata_samples.tsv', header=0, sep='\t')
bc_points = bc_points.merge(df_meta[['sampleID', 'bs_ls' ]], on='sampleID', how='left')
plt.figure(figsize=(6, 5), dpi=150)
ax = sns.scatterplot(data=bc_points, x=bc_points.columns[1], y=bc_points.columns[2], hue='bs_ls', alpha=0.8)
ax.set_title('t-SNE Bray-Curtis')
plt.savefig('metaphlan_output/beta_diversity/beta_bray-curtis_tSNE.png')
See a more complex dimensionality reduction of taxonomic profiles of over 20k human mmicrobiome samples.
- 1.3.1 Run sample2markers on each sample
- 1.3.2 Extract the sequences for each marker of SGB4933
- 1.3.3 StrainPhlAn
- 1.3.4 Plot strain-level phylogenetic trees
Let's focus on the Eubacterium rectale species, SGB ID: 4933 (t__SGB4933
).
%%bash
mkdir -p strainphlan_output/consensus_marker
for s in $(ls -d ${input}/*/); do
sample=$(basename ${s})
sample2markers.py -i metaphlan_output/${sample}/${sample}.sam.bz2 \
-f bz2 \
-o strainphlan_output/consensus_marker \
-n 4 \
--clades t__SGB4933
done
Back to 1.3 Strain-level analysis
Back to top
%%bash
mkdir strainphlan_output/db_markers
extract_markers.py -c t__SGB4933 \
-o strainphlan_output/db_markers
Back to 1.3 Strain-level analysis
Back to top
%%bash
mkdir strainphlan_output/t__SGB4933
strainphlan -s strainphlan_output/consensus_marker/*.pkl \
-m strainphlan_output/db_markers/t__SGB4933.fna \
-o strainphlan_output/t__SGB4933 \
-n 4 \
-c t__SGB4933 \
--marker_in_n_samples 4 \
--sample_with_n_markers 30 \
--sample_with_n_markers_after_filt 30
Back to 1.3 Strain-level analysis
Back to top
%%bash
for var_annot in bs_ls sam; do
echo "Plotting StrainPhlAn4 t__SGB4933 tree with ${var_annot}"
add_metadata_tree.py -t strainphlan_output/t__SGB4933/RAxML_bestTree.t__SGB4933.StrainPhlAn4.tre \
-f /home/hutlab_public/Tutorials/metaphlan4/input/metadata_samples.tsv \
-m ${var_annot} \
--string_to_remove _filtered
plot_tree_graphlan.py -t strainphlan_output/t__SGB4933/RAxML_bestTree.t__SGB4933.StrainPhlAn4.tre.metadata \
-m ${var_annot}
mv strainphlan_output/t__SGB4933/RAxML_bestTree.t__SGB4933.StrainPhlAn4.tre.metadata.png \
strainphlan_output/t__SGB4933/RAxML_bestTree.t__SGB4933.StrainPhlAn4.tre.metadata_${var_annot}.png
done
You can find more on E. rectale species-level diversity here and you can check out the pubblication below.
Karcher, N., Pasolli, E., Asnicar, F. et al. Analysis of 1321 Eubacterium rectale genomes from metagenomes uncovers complex phylogeographic population structure and subspecies functional adaptations. Genome Biol 21, 138 (2020). https://doi.org/10.1186/s13059-020-02042-y
More strain-level analysis on the Lacticaseibacillus paracasei species from both human and food microbiomes and isolate genomes.
Example inspired by Colorectal cancer stool microbiome species prevalence
and Meta-analysis of age-related microbial species using cMD3.0
by Davide Golzato and Paolo Manghi
%%R
# BiocManager::install("curatedMetagenomicData")
suppressPackageStartupMessages({
library(table1)
library(DT)
library(curatedMetagenomicData)
library(dplyr)
library(tidyverse)
library(SummarizedExperiment)
library(TreeSummarizedExperiment)
library(mia)
})
# see type of data per dataset
curatedMetagenomicData("Feng+")
# info about what's inside a dataset
curatedMetagenomicData("FengQ_2015.relative_abundance", dryrun = FALSE, rownames = "short")
Retrieval of datasets from the cMD3 meeting a series of requirements
%%R
#Filter on Age, body_site, study_condition and remove samples missing BMI
metadata <- curatedMetagenomicData::sampleMetadata %>%
filter(age >= 16 &
body_site == "stool" &
study_condition %in% c("control", "CRC") &
is.na(BMI) != TRUE &
is.na(gender) != TRUE &
days_from_first_collection %in% c(0,NA) &
!study_name == "ThomasAM_2019_c")
metadata <- metadata %>%
group_by(study_name, subject_id) %>%
filter(row_number() == 1) %>%
ungroup()
#Apply function to grouped selected dataset
datasets_tokeep <- metadata %>%
select(study_name, gender,BMI, age, study_condition) %>%
group_by(study_name) %>%
summarise(N=n(),age_IQR=IQR(age),bmi_IQR=IQR(BMI), study_condition_type=n_distinct(study_condition) ) %>%
mutate(keep=(N >= 40) & (age_IQR >= 15) & (study_condition_type>=2)) %>%
filter(keep == TRUE)
datasets_tokeep <- datasets_tokeep$study_name
metadata <- metadata %>%
filter(study_name %in% datasets_tokeep)
Download of datasets and data wrangling to structure them in a format suitable for this meta-analysis
Now that we have filtered the metadata, we apply the cMD function "returnSamples()". This function allows us to obtain a TreeSummarizedExperiment object, in which is stored the specified data-type (in this example: relative abundances of species) for the samples for which we have selected the metadata.
%%R
tse <- curatedMetagenomicData::returnSamples(metadata, dataType = "relative_abundance")
The TreeSummarizedExperiment object of taxonomic relative abundances stores the entire taxonomy (from Kingdom to species-level) of each microbial species. We therefore use the function splitByRanks from the mia package which allows us to select higher taxonomic level. We will focus on Species.
We are interested in species only, so we modify the TSE object accordingly Back to top
%%R
species_tse <- splitByRanks(tse, rank = "species")[[1]]
Analysis of microbial species by a linear model and computation of an effect size for each species in each dataset Defining functions used to perform the meta-analysis These functions will be used to compute correlation (R) from the t-value of linear models and transform relative abundance data.
This function is used to transform species relative abundances according to the arcsin(sqrt(RelAb/100)) function to correct for compositionality-related issues
%%R
#Compute Correlation (R) from t-value
R_fromlm <- function(n,t) {
r <- t / sqrt((t^2) + (n - 1))
return(r)
}
#Transform relative abundances
asin_trans <- function(rel_ab){
return(asin(sqrt(rel_ab/100)))
}
In order to proceed with the meta-analysis, we have to compute an effect-size for each population. Each dataset effect-size is computed with the function we define after. This function estimates a per-feature relationship between age and the arcsin-square-root relative abundance of each species. The estimates is extracted by an Ordinary Least Squares (OLS) model, in which the microbial feature (in this case, the species) is the response, and the age of the subject is the predictor. Using this method we can control our model by age & by gender of the patient.
The model has indeed the shape: species∼sex+age+BMI
We compute the effect-size of “age” with respect to “species” (corrected for “gender” & “BMI”) via the formula:
effect-size=t/sqrt(t^2+(n−1))
where
n= is the number of samples in the considered dataset
t= is the T-statistics for age returned by the software
%%R
computeStandardizedMeanDifference <- function(tse_object){
tse_datasets <- unique(colData(tse_object)$study_name)
#Build linear models for all species of each dataset present in tse_object
singleDatasetAnalyze <- function(dataset) {
single_dataset_tse <- tse_object[,tse_object$study_name == dataset]
#Select vectors of relative abundances and metadata
exprs_df <- asin_trans(
assay(single_dataset_tse))
exprs_df <- exprs_df[rowSums(is.na(exprs_df)) != ncol(exprs_df), ]
species <- rownames(exprs_df)
age <- as.double(colData(single_dataset_tse)$age)
bmi <- as.double(colData(single_dataset_tse)$BMI)
gender <- as.factor(colData(single_dataset_tse)$gender)
#
study_condition <- as.factor(colData(single_dataset_tse)$study_condition)
compute_lm <- function(exprs_row){
if(nlevels(gender) >1){
lm_res <- broom::tidy(lm(exprs_row ~ bmi + age + gender + study_condition))
} else {
lm_res <- broom::tidy(lm(exprs_row ~ bmi + age + study_condition))
}
lm_res <- column_to_rownames(lm_res, var = "term")
res <- lm_res["study_condition" ,c("statistic","p.value"),drop=TRUE]
return(res)
}
lmResults <- t(
sapply(species,
FUN = function(x) {
species_relabs <- exprs_df[x,]
res <- compute_lm(species_relabs)
return(res)
}))
n <- ncol(exprs_df)
#Compute effect size and respective standard error for each species in single
#dataset
#Compute effect size for each species in single dataset
R_List <- as.vector(sapply(lmResults[,"statistic"], function(x) R_fromlm(n, x)))
wald_list <- unname(unlist(lmResults[,"p.value"]))
#FDR-correction with Benjamini-Hochberg method for Wald p-values
final_df <- as.matrix(cbind(R_List,
wald_list,
p.adjust(wald_list,method = "BH")))
#Finalize results for the single dataset
colnames(final_df) <- c(paste0(dataset,"_Correlation"),
paste0(dataset,"_pvalue"),
paste0(dataset,"_Qvalue"))
#final_df <-na.omit(final_df)
rownames(final_df) <- species
return(final_df)
}
linear_models <- lapply(tse_datasets, singleDatasetAnalyze)
names(linear_models) <- tse_datasets
return(linear_models)
}
Now that we have computed effect-sizes for each population, we can meta-analyze each microbial feature. In order to do this, we define a function applying the metacor function from the package metafor.
RE: Random Effect
SE: Standard Error
CI: Confidence Interval
%%R
runMetaanalysis <- function(R_vector, n) {
a <- meta::metacor(cor=R_vector,
n=n,
studlab=rownames(R_vector),
method.tau="PM",
sm="ZCOR")
final_vector <-c(a$TE.random,
a$seTE.random,
paste(a$lower.random,a$upper.random,sep=";"),
a$zval.random,
a$pval.random)
names(final_vector) <- c("RE_Correlation","SE_RE","CI_RE","Zscore","p-value")
return(final_vector)
}
Application of the described procedure in the meta-analysis of CRC-related microbial species
%%R
#Computing Correlation (R) of species relative abundances with age for all datasets
Corr_list <- computeStandardizedMeanDifference(species_tse)
#Merging outputs of all datasets so to have a single dataframe with all the
#species found across the cohorts
final_df <- Reduce(function(x, y) merge(x, y, all=TRUE),
lapply(Corr_list, function(x) data.frame(x, rn = row.names(x))))
final_df <- final_df %>% column_to_rownames(var="rn")
#Dataframes with random effect and respective vector containing sample size of
#each dataset
d_matrix <- final_df %>%
select(contains("Correlation"))
d_matrix <- t(d_matrix)
n_vector <- as.data.frame(colData(species_tse)) %>%
group_by(study_name) %>%
summarize(N= n())
n_vector <- n_vector[match(str_replace(rownames(d_matrix), "_Correlation",""),
n_vector$study_name ),]$N
meta_analysis <- t(as.data.frame(apply(FUN=function(x) {runMetaanalysis(x,n_vector)},
MARGIN=2,
d_matrix)))
final_df <- cbind(final_df, meta_analysis)
final_df$FDR_Qvalue <- p.adjust(final_df$`p-value`,method = "BH")
%%R
write.csv(final_df, row.names = TRUE, file = "Desktop/CRC_meta_analyses.csv")
Have fun with meta analyses! Back to top