diff --git a/tools/MultiPEN/MultiPEN-StringDBNetwork.xml b/tools/MultiPEN/MultiPEN-StringDBNetwork.xml deleted file mode 100644 index 6d74a525..00000000 --- a/tools/MultiPEN/MultiPEN-StringDBNetwork.xml +++ /dev/null @@ -1,32 +0,0 @@ - - Compiles network from list of genes - - MultiPEN - - - - - - - - - - - - - - - - - - - - - - 10.1093/nar/gks1094 - - diff --git a/tools/MultiPEN/MultiPEN-feature-selection.xml b/tools/MultiPEN/MultiPEN-feature-selection.xml index cc6f4d8a..e68615a0 100644 --- a/tools/MultiPEN/MultiPEN-feature-selection.xml +++ b/tools/MultiPEN/MultiPEN-feature-selection.xml @@ -7,24 +7,20 @@ - - + diff --git a/tools/MultiPEN/MultiPEN-enrichment-GO.xml b/tools/MultiPEN/Rscript-enrichment-GO.xml similarity index 61% rename from tools/MultiPEN/MultiPEN-enrichment-GO.xml rename to tools/MultiPEN/Rscript-enrichment-GO.xml index 4df51ae6..a3c0f12c 100644 --- a/tools/MultiPEN/MultiPEN-enrichment-GO.xml +++ b/tools/MultiPEN/Rscript-enrichment-GO.xml @@ -1,16 +1,19 @@ - - (enrichment with Gene Ontology) + + over-representation and GSE analysis with Gene Ontology - MultiPEN + bioconductor-clusterProfiler + r-BBmisc + bioconductor-GO.db + bioconductor-org.Hs.eg.db - + @@ -30,9 +33,12 @@ run_MultiPEN_slurm.sh EnrichmentGO ./ '$rankings' 10.1089/omi.2011.0118 + 10.1038/75556 + 10.1093/nar/gku1179 + diff --git a/tools/MultiPEN/MultiPEN-enrichment-KEGG.xml b/tools/MultiPEN/Rscript-enrichment-KEGG.xml similarity index 51% rename from tools/MultiPEN/MultiPEN-enrichment-KEGG.xml rename to tools/MultiPEN/Rscript-enrichment-KEGG.xml index e06b58a8..190da58b 100644 --- a/tools/MultiPEN/MultiPEN-enrichment-KEGG.xml +++ b/tools/MultiPEN/Rscript-enrichment-KEGG.xml @@ -1,16 +1,19 @@ - - (enrichment with KEGG) + + over-representation and GSE analysis with KEGG - MultiPEN + r + bioconductor-clusterProfiler + r-BBmisc + bioconductor-org.Hs.eg.db - + @@ -26,9 +29,12 @@ run_MultiPEN_slurm.sh EnrichmentKEGG ./ '$rankings' 10.1089/omi.2011.0118 + 10.1093/nar/gkw1092 + 10.1093/nar/gkv1070 + 10.1093/nar/28.1.27 diff --git a/tools/MultiPEN/compileNetworkStringDB.R b/tools/MultiPEN/compileNetworkStringDB.R new file mode 100644 index 00000000..6b5545bf --- /dev/null +++ b/tools/MultiPEN/compileNetworkStringDB.R @@ -0,0 +1,89 @@ +# Script to compile a Protein-Protein Interaction network using STRINGdb: +# "STRINGdb (Search Tool for the Retrieval of Interacting proteins database)" +# al. FAe (2013). “STRING v9.1: protein-protein interaction networks, with increased coverage and integration.” Nucleic Acids Research (Database issue), 41. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +#Inputs: +# fileName - table with column 'name' +# speciesCode = 9606 #homo sapiens +# threshold = minimum combined score +# networkFileName = "SI_network.human.NormalisedExpressionLevels.csv" + +# It requieres R Packages: +# STRINGdb, http://bioconductor.org/packages/release/bioc/html/STRINGdb.html +# +# To run script from a terminal use the command: +# Rscript copileNetworkStringDB.R 'path-to-directory/fileName.txt' speciesCode threshold 'path-to-output-folder/networkFileName.txt' + + + +# Input arguments +args = commandArgs(trailingOnly=TRUE) + +# User must provide all four input parameters +if (length(args)!=4) { + stop("Please specify file name, species code, threshold and the name of the network", call.=FALSE) +} + +fileName <- args[1] +speciesCode <- as.numeric(args[2]); +threshold <- as.numeric(args[3]); +networkFileName <- args[4]; + + +# Read data, which needs to have at least the following two columns: [gene_id, shortName] +inputData <- read.delim( fileName, header = TRUE, sep = '\t', stringsAsFactors = FALSE) + + +#### begin compiling network #### +library(STRINGdb) +string_db <- STRINGdb$new( version="10", species = speciesCode, score_threshold=threshold, input_directory="" ) +mapped <- string_db$map( inputData, "name", removeUnmappedRows = TRUE ) + +#get interactions +inter<-string_db$get_interactions(mapped$STRING_id) + +#annotate source and target nodes +s <- paste(speciesCode, '.', sep = "") +from <- gsub(s, "", inter$from) +to <- gsub(s,"",inter$to) +#normalise combined_score values: divide by 1000 +network <- data.frame(from = from, to = to, score = inter$combined_score/1000) +subNetwork <- network[network$score > threshold,] + +#edit STRING_id (speciesCode.ENSPxxxxx) to remove speciesCode +mapped$StringID <- gsub(s, "", mapped$STRING_id) +mapped$STRING_id <- NULL + + + +#### network with gene names #### +nn <- dim(subNetwork)[1] +interactions <- matrix(data=NA,nrow=dim(subNetwork)[1], ncol=3) +for(ii in 1:nn){ + interactions[ii,1] = mapped$name[mapped$StringID==subNetwork$from[ii]] + interactions[ii,2] = mapped$name[mapped$StringID==subNetwork$to[ii]] + interactions[ii,3] = subNetwork$score[ii] +} + +edges <- data.frame(source = interactions[,1], target = interactions[,2], score = interactions[,3]) + +#write two files to run with GenePEN +cat(sprintf('\nSaving network (edges) to file: %s', networkFileName)) +cat('. . .') +#fileName <- paste(networkFileName, '.txt', sep = "") +write.table(edges, networkFileName, sep = '\t', col.names = T, row.names = FALSE, quote = FALSE) +cat(sprintf('Done!')) + diff --git a/tools/MultiPEN/enrichmentGO.R b/tools/MultiPEN/enrichmentGO.R new file mode 100755 index 00000000..69a4a55e --- /dev/null +++ b/tools/MultiPEN/enrichmentGO.R @@ -0,0 +1,177 @@ +# Script to run over-representation and gene set enrichment (GSE) analysis +# with Gene Ontology for homo sapiens - using clusterProfiler package [Yu et al, 2012] +# [Yu et al., 2012] Yu G, Wang L, Han Y and He Q (2012), clusterProfiler: an R package +# for comparing biological themes among gene clusters.” OMICS: A Journal of Integrative Biology, 16(5), +# pp. 284-287. doi: 10.1089/omi.2011.0118. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Inputs: +# File name Tabular (txt) file with columns: (gene) name, value and ranking +# Output directory +# Outputs: +# enrichment-GO.txt list of over-represented categories with GO +# enrichment-GO_BP.pdf figure with the first over-representated biological processes terms +# enrichment-GO_MF.pdf figure with the first over-representated molecular functions terms +# enrichment-GO_CC.pdf figure with the first over-representated cellular components terms +# gse-GO.txt results for gse analysis +# +# +# It requieres R Packages: +# clusterProfiler, https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html +# BBmisc +# GO.db +# org.Hs.eg.db +# +# To run from a terminal use following command: +# Rscript enrichmentGO.R '/path-to-file/file-name.txt' 'output-folder/' + + +# Input arguments +args = commandArgs(trailingOnly=TRUE) + +# User must provide at least an input file, and optionally the output directory +if (length(args)==0) { + stop("At least one argument must be supplied (input file).n", call.=FALSE) +} else if (length(args)==1) { + # if no output file is provided, use current directory + outputDir = "./" +} else if (length(args)==2) { + outputDir <- args[2] +} + +dataFile <- args[1] +cat(sprintf("Loading data from file: %s\n", dataFile)) + + +if(!dir.exists(outputDir)){ + cat(sprintf("Creating output directory: %s\n", outputDir)) + dir.create(outputDir) +} + +# Load file with data for analysis +data <- read.table(dataFile, header=TRUE, sep = "\t", stringsAsFactors=FALSE) +cat(sprintf("Number of genes: %i\n",nrow(data))) + + +library(clusterProfiler) +library(BBmisc) +library(GO.db) + +D <- sortByCol(data, 'ranking') +D <- D[,c(1,2,3)] # Only interested in the first three columns [name, value, ranking] +entrez<-bitr(D$name, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db", drop = FALSE) +ranked<-merge(D,entrez,by.x='name',by.y='SYMBOL') +ranked <- sortByCol(ranked, 'ranking') +geneList <- ranked$value +names(geneList) <- ranked$ENTREZID + + +#### Over-representation Analysis #### +cat(sprintf("Performing over-representation analysis (enrichGO) ... ")) +cat(sprintf("Results saved to folder: %s\n", outputDir)) + +#Enrichment for subontology BP (Biological Process) +subclassOnt <- "BP" +enrichment_BP <- enrichGO(ranked$ENTREZID, OrgDb="org.Hs.eg.db", ont=subclassOnt, readable=TRUE) +enrichmentSummary_BP <- as.data.frame(enrichment_BP) +head(enrichmentSummary_BP) +if(nrow(enrichmentSummary_BP)>0){ + aux <- cbind(enrichmentSummary_BP, "BP") + colnames(aux)[10]<- 'subontology' + enrichmentSummary_BP <- aux +} +#add to results: enrichment for BP category +results <- enrichmentSummary_BP + +#Enrichment for subontology MF (Molecular Function) +subclassOnt <- "MF" +enrichment_MF <- enrichGO(ranked$ENTREZID, OrgDb="org.Hs.eg.db", ont=subclassOnt, readable=TRUE) +enrichmentSummary_MF <- as.data.frame(enrichment_MF) +head(enrichmentSummary_MF) +if(nrow(enrichmentSummary_MF)>0){ + aux <- cbind(enrichmentSummary_MF, "MF") + colnames(aux)[10]<- 'subontology' + enrichmentSummary_MF <- aux + #add to results: enrichment for MF category + results <- rbind(results, enrichmentSummary_MF) +} + + +#Enrichment for subclass CC (Cellular Component) +subclassOnt <- "CC" +enrichment_CC <- enrichGO(ranked$ENTREZID, OrgDb="org.Hs.eg.db", ont=subclassOnt, readable=TRUE) +enrichmentSummary_CC <- as.data.frame(enrichment_CC) +head(enrichmentSummary_CC) +if(nrow(enrichmentSummary_CC)>0){ + aux <- cbind(enrichmentSummary_CC, "CC") + colnames(aux)[10]<- 'subontology' + enrichmentSummary_CC <- aux + #add to results: enrichment for CC category + results <- rbind(results, enrichmentSummary_CC) +} + +# modify column names for consistency with MultiPEN and for valid MATLAB identifiers +# change: pvalue to pValue, p.adjust to pAdjust, qvalue to qValue +aux <- colnames(results) +aux[c(5,6,7)] <- c("pValue", "pAdjust", "qValue") +colnames(results) <- aux + +#write results to file: +fileName <- paste(outputDir, "enrichment-GO.txt", sep = "") +cat(sprintf("writing results to file: %s\n", fileName)) +write.table(results, fileName, sep = '\t', row.names = FALSE) + + +fileName <- paste(outputDir, 'enrichment-GO_BP.pdf', sep = "") +pdf(fileName) +barplot(enrichment_BP, showCategory=20) +dev.off() + +fileName <- paste(outputDir, 'enrichment-GO_MF.pdf', sep = "") +pdf(fileName) +barplot(enrichment_MF, drop=TRUE, showCategory=20) +dev.off() + +fileName <- paste(outputDir, 'enrichment-GO_CC.pdf', sep = "") +pdf(fileName) +barplot(enrichment_CC, showCategory=20) +dev.off() + + + +#### Gene Set Enrichment Analysis #### +# GSE for all ontologies: BP, MF and CC +kk <- gseGO(geneList, ont = "ALL", OrgDb="org.Hs.eg.db", keytype = "ENTREZID") +results <- as.data.frame(kk) +results <- sortByCol(results, 'setSize', asc = F) +head(results) + + + +# modify column names for consistency for valid MATLAB identifiers +# change: pvalue to pValue, p.adjust to pAdjust, qvalues to qValues +aux <- colnames(results) +aux[c(7,8,9)] <- c("pValue", "pAdjust", "qValue") +colnames(results) <- aux + +#write results (table) to file: +fileName <- paste(outputDir, "gse-GO.txt", sep = "") +cat(sprintf("writing results to file: %s\n", fileName)) +write.table(results, fileName, sep = '\t', row.names = FALSE) + +# save plot to file (currently not supported!): +# fileName <- paste(outputDir, 'gse-GO.pdf', sep = "") +# pdf(fileName) +# barplot(kk) diff --git a/tools/MultiPEN/enrichmentKEGG.R b/tools/MultiPEN/enrichmentKEGG.R new file mode 100644 index 00000000..3e74aeae --- /dev/null +++ b/tools/MultiPEN/enrichmentKEGG.R @@ -0,0 +1,121 @@ +# Script to run over-representation and gene set enrichment (GSE) analysis +# with KEGG for homo sapiens - using clusterProfiler package [Yu et al, 2012] +# [Yu et al., 2012] Yu G, Wang L, Han Y and He Q (2012), clusterProfiler: an R package +# for comparing biological themes among gene clusters.” OMICS: A Journal of Integrative Biology, 16(5), +# pp. 284-287. doi: 10.1089/omi.2011.0118. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Inputs: +# File name Tabular (txt) file with columns: (gene) name, value and ranking +# Output directory +# Outputs: +# enrichment-KEGG.txt list of over-represented categories with KEGG +# enrichment-KEGG.pdf figure with the first over-representated categories +# gse-KEGG.txt results for gse analysis +# +# It requieres R Packages: +# clusterProfiler, https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html +# BBmisc +# org.Hs.eg.db +# +# To run script from a terminal use the command: +# Rscript enrichmentKEGG.R 'path-to-directory/file-name.txt' 'path-to-output-folder/' + +# Input Arguments +args = commandArgs(trailingOnly=TRUE) + +# User must provide at least an input file, and optionally the output directory +if (length(args)==0) { + stop("At least one argument must be supplied (input file).n", call.=FALSE) +} else if (length(args)==1) { + # if no output file is provided, use current directory + outputDir = "./" +} else if (length(args)==2) { + outputDir <- args[2] +} + +dataFile <- args[1] +cat(sprintf("Loading data from file: %s\n", dataFile)) + +if(!dir.exists(outputDir)){ + cat(sprintf("Creating output directory: %s\n", outputDir)) + dir.create(outputDir) +} + +# Load file with data for analysis +data <- read.table(dataFile, header=TRUE, sep = "\t", stringsAsFactors=FALSE) +cat(sprintf("Number of genes: %i\n",nrow(data))) + + +library(clusterProfiler) +library(BBmisc) + +D <- sortByCol(data, 'ranking') +D <- D[D[,2]!=0,] +D <- D[,c(1,2,3)] # Only interested in the first three columns [name, value, ranking] +entrez<-bitr(D$name, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db", drop = FALSE) +ranked<-merge(D,entrez,by.x='name',by.y='SYMBOL') +ranked <- sortByCol(ranked, 'ranking') +geneList <- ranked$value +names(geneList) <- ranked$ENTREZID + +cat(sprintf("Performing over-representation analysis (with KEGG) ... ")) +cat(sprintf("Results saved to folder: %s\n", outputDir)) + + +#### Enrichment with KEGG #### +enrichment_kegg <- enrichKEGG(ranked$ENTREZID, organism = 'hsa', keyType = "kegg") +results <- summary(enrichment_kegg) +head(results) + + +# modify column names for consistency with MultiPEN and for valid MATLAB identifiers +# change: pvalue to pValue, p.adjust to pAdjust, qvalue to qValue +aux <- colnames(results) +aux[c(5,6,7)] <- c("pValue", "pAdjust", "qValue") +colnames(results) <- aux + +#write results (table) to file: +fileName <- paste(outputDir, "enrichment-KEGG.txt", sep = "") +cat(sprintf("writing results to file: %s\n", fileName)) +write.table(results, fileName, sep = '\t', row.names = FALSE) + +# save plot to file: +fileName <- paste(outputDir, 'enrichment-KEGG.pdf', sep = "") +pdf(fileName) +barplot(enrichment_kegg, showCategory=20) + + +#### Gene Set Enrichment with KEGG #### +kk <- gseKEGG(geneList, organism = 'hsa', keyType = "kegg") +results <- summary(kk) +results + + +# modify column names for consistency for valid MATLAB identifiers +# change: p.adjust to pAdjust +aux <- colnames(results) +aux[c(6,7,8)] <- c("pValue", "pAdjust", "qValue") +colnames(results) <- aux + +#write results (table) to file: +fileName <- paste(outputDir, "gse-KEGG.txt", sep = "") +cat(sprintf("writing results to file: %s\n", fileName)) +write.table(results, fileName, sep = '\t', row.names = FALSE) + +# save plot to file (currently not supported!): +# fileName <- paste(outputDir, 'gse-KEGG.pdf', sep = "") +# pdf(fileName) +# barplot(kk) \ No newline at end of file diff --git a/tools/MultiPEN/stringdb-network.xml b/tools/MultiPEN/stringdb-network.xml new file mode 100644 index 00000000..11059b7d --- /dev/null +++ b/tools/MultiPEN/stringdb-network.xml @@ -0,0 +1,33 @@ + + Protein interaction network from gene list + + bioconductor-stringdb + + + + + + + + + + + + + + + + + + + + + `_ (Search Tool for the Retrieval of Interacting Proteins database). + ]]> + + 10.1093/nar/gks1094 + + diff --git a/tools/MultiPEN/test-data/MultiPEN-Rankings_lambda0.0001-onlyGenes.txt b/tools/MultiPEN/test-data/MultiPEN-Rankings_lambda0.0001-onlyGenes.txt index 35c14bbb..1a361b65 100644 --- a/tools/MultiPEN/test-data/MultiPEN-Rankings_lambda0.0001-onlyGenes.txt +++ b/tools/MultiPEN/test-data/MultiPEN-Rankings_lambda0.0001-onlyGenes.txt @@ -1,4 +1,4 @@ -name weight ranking foldChange higherIn case1 case2 case3 control1 control2 control3 +name value ranking foldChange higherIn case1 case2 case3 control1 control2 control3 SF3B1 0.99906704216243 1 -0.236470472 control 0.283198634 0.27896994 0.405474972 0.443509247 0.650759237 0.173061082 PDIA2 0.998078594941955 2 -0.247818562 control 0.123381382 0.291498034 0.57227493 0.264566354 0.514119992 0.533702257 TGFBR2 0.994602559989464 3 0.221089433 case 0.986051067 0.724247034 0.477134352 0.888992345 0.867667063 0.03471835