TGAC · PerlaTroncosoRey · Apr 8, 2017 · Apr 8, 2017 · Apr 8, 2017 · Apr 8, 2017
diff --git a/tools/MultiPEN/MultiPEN-enrichment-KEGG.xml b/tools/MultiPEN/MultiPEN-enrichment-KEGG.xml
diff --git a/tools/MultiPEN/MultiPEN-StringDBNetwork.xml → tools/MultiPEN/Rscript-STRINGdb-network.xml b/tools/MultiPEN/MultiPEN-StringDBNetwork.xml → tools/MultiPEN/Rscript-STRINGdb-network.xml
@@ -1,17 +1,18 @@
-<tool id="MultiPEN-StringDBNetwork" name="StringDB Network" version="0.0.3">
-    <description> Compiles network from list of genes</description>
+<tool id="STRINGdb-network" name="STRINGdb-network" version="0.0.3">
+    <description> Compile network from list of genes</description>
     <requirements>
-        <requirement type="package" version="0.0.3">MultiPEN</requirement>
+        <requirement type="package" version="3.3.1">r</requirement>
+        <requirement type="package" version="3.0.5">bioconductor-STRINGdb</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:" level="fatal" />
     </stdio>
     <command><![CDATA[
-run_MultiPEN_slurm.sh StringDBNetwork ./ '$geneList' $SpeciesCode $threshold '$networkFileName'
+        Rscript '$__tool_directory__'/compileNetworkStringDB.R ./ '$geneList' $SpeciesCode $threshold '$networkFileName'
     ]]></command>
     <inputs>
-        <param name="geneList" type="data" format="tabular" label="list of genes" help="List of genes (use symbol names for genes) to compile a network" />
-        <param name="SpeciesCode" type="integer" value="" label="Code for your species (i.e., 9606 for humans)" help="Use symbol names for genes to compile a network" />
+        <param name="geneList" type="data" format="tabular" label="list of genes" help="List of genes (use symbol gene names) to compile a network" />
+        <param name="SpeciesCode" type="integer" value="" label="NCBI taxonomy identifiers for your organism (i.e., 9606 for Human, 10090 for mouse)" help="If you don’t know your organism's identifier, you can search the NCBI Taxonomy from http://www.nlm.nih.gov/taxonomy"/>
         <param name="threshold" type="float" value="0.60" label="threshold for the combined score for the interactions" help="Default value set to 0.60" />
     </inputs>
     <outputs>
@@ -24,7 +25,7 @@ run_MultiPEN_slurm.sh StringDBNetwork ./ '$geneList' $SpeciesCode $threshold '$n
         </test>
     </tests>
     <help><![CDATA[
-MultiPEN includes a Wrapper to use the R package STRINGdb (bioconductor): Search Tool for the Retrieval of Interacting Proteins database (see http://bioconductor.org/packages/release/bioc/html/STRINGdb.html).
+        This wrapper compiles a PPI network from a list of genes, using the bioconductor R package STRINGdb: Search Tool for the Retrieval of Interacting Proteins database (see http://bioconductor.org/packages/release/bioc/html/STRINGdb.html).
     ]]></help>
     <citations>
         <citation type="doi">10.1093/nar/gks1094</citation>

diff --git a/tools/MultiPEN/MultiPEN-enrichment-GO.xml → tools/MultiPEN/Rscript-enrichment-GO.xml b/tools/MultiPEN/MultiPEN-enrichment-GO.xml → tools/MultiPEN/Rscript-enrichment-GO.xml
@@ -1,16 +1,20 @@
-<tool id="MultiPEN-enrichment-GO" name="Enrichment with GO" version="0.0.3">
-    <description> (enrichment with Gene Ontology)</description>
+<tool id="enrichGO" name="enrichGO" version="0.0.3">
+    <description> over-representation and GSE analysis with Gene Ontology</description>
     <requirements>
-        <requirement type="package" version="0.0.3">MultiPEN</requirement>
+        <requirement type="package" version="3.3.1">r</requirement>
+        <requirement type="package" version="3.0.5">bioconductor-clusterProfiler</requirement>
+        <requirement type="package" version="1.10">r-BBmisc</requirement>
+        <requirement type="package" version="3.3.0">bioconductor-GO.db</requirement>
+        <requirement type="package" version="3.3.0">bioconductor-org.Hs.eg.db</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:" level="fatal" />
     </stdio>
     <command><![CDATA[
-run_MultiPEN_slurm.sh EnrichmentGO ./ '$rankings'
+        Rscript '$__tool_directory__'/enrichmentGO.R '$rankings'
     ]]></command>
     <inputs>
-        <param name="rankings" type="data" format="tabular" label="Rankings for all features" help="Ranking of features (genes and/or metabolites). The rankings must be specified in a tab delimited text file with (at least) three columns: 'name' (use symbol names for genes or chEBI IDs for metabolites), 'value' (used to rank the features) and 'ranking' (ranking 1 represent the most important feature)." />
+        <param name="rankings" type="data" format="tabular" label="Rankings for all features" help="Ranking of genes which must be specified in a tab delimited text file with (at least) three columns: 'name' (this is the gene names), 'value' (used to rank the genes) and 'ranking' (where ranking 1 represent the most important feature)." />
     </inputs>
     <outputs>
         <data name="enrichment-GO" format="txt" label="${tool.name}: over-representation analysis with Gene Ontology" from_work_dir="enrichment-GO.txt" />
@@ -30,9 +34,12 @@ run_MultiPEN_slurm.sh EnrichmentGO ./ '$rankings'
         </test>
     </tests>
     <help><![CDATA[
-MultiPEN includes a Wrapper to use the R package clusterProfiler to perform over-representation analysis with Gene Ontology.
+        This wrapper performs over-representation and gene set enrichment analysis from a list of genes using the bioconductor R package clusterProfiler and Gene Ontology.
     ]]></help>
     <citations>
         <citation type="doi">10.1089/omi.2011.0118</citation>
+        <citation type="doi">10.1038/75556</citation>
+        <citation type="doi">10.1093/nar/gku1179</citation>
+
     </citations>
 </tool>
diff --git a/tools/MultiPEN/Rscript-enrichment-KEGG.xml b/tools/MultiPEN/Rscript-enrichment-KEGG.xml
@@ -0,0 +1,52 @@
+<tool id="enrichKEGG" name="enrichKEGG" version="0.0.3">
+    <!-- Galaxy wrapper that runs enrichmentKEGG.R (shipped next to this file) on a
+         tabular rankings file and collects the files the script leaves in the
+         job working directory. -->
+    <description> over-representation and GSE analysis with KEGG</description>
+    <requirements>
+        <requirement type="package" version="3.3.1">r</requirement>
+        <requirement type="package" version="3.0.5">bioconductor-clusterProfiler</requirement>
+        <requirement type="package" version="1.10">r-BBmisc</requirement>
+        <requirement type="package" version="3.3.0">bioconductor-org.Hs.eg.db</requirement>
+    </requirements>
+    <stdio>
+        <!-- Any non-zero exit code from Rscript marks the job as failed. -->
+        <exit_code range="1:" level="fatal" />
+    </stdio>
+    <command><![CDATA[
+        Rscript '$__tool_directory__'/enrichmentKEGG.R '$rankings'
+    ]]></command>
+    <inputs>
+        <param name="rankings" type="data" format="tabular" label="Rankings for all features" help="Ranking of genes which should be specified in a tabular delimited text file with (at least) three columns: 'name' (this are gene names), 'value' (used to rank the genes) and 'ranking' (where ranking 1 represent the most important feature)" />
+    </inputs>
+    <outputs>
+        <!-- Output names must be unique within a tool, so the PDF plot gets its
+             own name instead of sharing "enrichment-KEGG" with the text table. -->
+        <data name="enrichment-KEGG" format="txt" label="${tool.name}: over-representation analysis with KEGG" from_work_dir="enrichment-KEGG.txt" />
+        <data name="enrichment-KEGG-plot" format="pdf" label="${tool.name}: over-representation analysis with KEGG" from_work_dir="enrichment-KEGG_BP.pdf" />
+        <data name="gse-KEGG" format="txt" label="Gene set enrichment analysis with KEGG" from_work_dir="gse-KEGG.txt" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="rankings" ftype="tabular" value="MultiPEN-Rankings_lambda0.0001.txt" />
+            <output name="enrichment-KEGG" file="enrichment-KEGG" ftype="txt" />
+            <!-- PDF files written by R typically embed a creation date, so the
+                 plot is compared by size rather than byte-for-byte. -->
+            <output name="enrichment-KEGG-plot" file="enrichment-KEGG_BP" ftype="pdf" compare="sim_size" />
+            <!-- ftype matches the declared format of the gse-KEGG output (txt). -->
+            <output name="gse-KEGG" file="gse-KEGG" ftype="txt" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        This wrapper performs over-representation and gene set enrichment analysis from a list of genes using the bioconductor R package clusterProfiler and KEGG.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1089/omi.2011.0118</citation>
+        <citation type="doi">10.1093/nar/gkw1092</citation>
+        <citation type="doi">10.1093/nar/gkv1070</citation>
+        <citation type="bibtex">@article{Kanehisa2000,
+            Author = {Kanehisa, M and Goto, S},
+            Crdt = {1999/12/11 09:00},
+            Date = {2000 Jan 01},
+            Journal = {Nucleic Acids Res},
+            Month = {Jan},
+            Number = {1},
+            Oid = {NLM: PMC102409},
+            Pages = {27--30},
+            Title = {KEGG: kyoto encyclopedia of genes and genomes.},
+            Volume = {28},
+            Year = {2000}}
+            </citation>
+    </citations>
+</tool>
diff --git a/tools/MultiPEN/compileNetworkStringDB.R b/tools/MultiPEN/compileNetworkStringDB.R
@@ -0,0 +1,89 @@
+# Script to compile a Protein-Protein Interaction network using STRINGdb: 
+# "STRINGdb (Search Tool for the Retrieval of Interacting proteins database)"
+#    Franceschini A et al. (2013). “STRING v9.1: protein-protein interaction networks, with increased coverage and integration.” Nucleic Acids Research (Database issue), 41. 
+# 
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#Inputs: 
+# fileName - table with column 'name'
+# speciesCode = 9606  #homo sapiens
+# threshold = minimum combined score
+# networkFileName = "SI_network.human.NormalisedExpressionLevels.csv"
+
+# It requires the R package:
+# STRINGdb, http://bioconductor.org/packages/release/bioc/html/STRINGdb.html
+#
+# To run script from a terminal use the command:
+# Rscript compileNetworkStringDB.R 'path-to-directory/fileName.txt' speciesCode threshold 'path-to-output-folder/networkFileName.txt'
+
+
+
+# Input arguments
+#
+# Accepted (trailing) command-line forms:
+#   fileName speciesCode threshold networkFileName
+#   outputDir fileName speciesCode threshold networkFileName
+# The second form exists because the Galaxy wrapper
+# (Rscript-STRINGdb-network.xml) passes "./" as a leading argument; with a
+# strict four-argument check every wrapper invocation would abort here.
+args <- commandArgs(trailingOnly = TRUE)
+
+# Peel off the optional leading output directory.
+outputDir <- NULL
+if (length(args) == 5) {
+  outputDir <- args[1]
+  args <- args[-1]
+}
+
+# User must provide all four input parameters
+if (length(args) != 4) {
+  stop("Please specify file name, species code, threshold and the name of the network", call. = FALSE)
+}
+
+fileName <- args[1]
+# as.numeric() yields NA (with a warning) for non-numeric text; the NA is
+# turned into an explicit error just below instead of propagating silently.
+speciesCode <- suppressWarnings(as.numeric(args[2]))
+threshold <- suppressWarnings(as.numeric(args[3]))
+networkFileName <- args[4]
+
+if (is.na(speciesCode) || is.na(threshold)) {
+  stop("Species code and threshold must be numeric values", call. = FALSE)
+}
+
+# A relative output name is resolved against the output directory when one was
+# given; absolute paths (/..., ~..., C:/...) are left untouched.
+if (!is.null(outputDir) && !grepl("^(/|~|[A-Za-z]:[/\\\\])", networkFileName)) {
+  networkFileName <- file.path(outputDir, networkFileName)
+}
+
+
+# Read data: a tab-delimited table with a header row. It must contain a column
+# called 'name', which is the column handed to STRINGdb for mapping below.
+inputData <- read.delim(fileName, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+if (!("name" %in% colnames(inputData))) {
+  stop("The input file must contain a column called 'name'", call. = FALSE)
+}
+
+
+#### begin compiling network ####
+library(STRINGdb)
+# NOTE(review): 'threshold' is compared against combined_score / 1000 further
+# down, i.e. it is on a 0-1 scale; confirm which scale STRINGdb's
+# score_threshold expects before relying on it as a filter.
+string_db <- STRINGdb$new(version = "10", species = speciesCode, score_threshold = threshold, input_directory = "")
+mapped <- string_db$map(inputData, "name", removeUnmappedRows = TRUE)
+
+# get interactions
+inter <- string_db$get_interactions(mapped$STRING_id)
+
+# annotate source and target nodes
+# STRING ids have the form speciesCode.ENSPxxxxx. Only that leading prefix is
+# removed: the pattern is anchored and the dot is escaped, so digits inside the
+# protein id that happen to resemble the species code are left alone.
+# format(..., scientific = FALSE) keeps large codes from printing as e.g. 3e+05.
+prefix <- paste0("^", format(speciesCode, scientific = FALSE, trim = TRUE), "\\.")
+from <- sub(prefix, "", inter$from)
+to <- sub(prefix, "", inter$to)
+# normalise combined_score values: divide by 1000
+network <- data.frame(from = from, to = to, score = inter$combined_score / 1000,
+                      stringsAsFactors = FALSE)
+subNetwork <- network[network$score > threshold, ]
+
+# edit STRING_id (speciesCode.ENSPxxxxx) to remove speciesCode
+mapped$StringID <- sub(prefix, "", mapped$STRING_id)
+mapped$STRING_id <- NULL
+
+
+
+#### network with gene names ####
+# Translate both endpoints of every edge from STRING id to the gene name given
+# in the input table. match() is vectorised, handles an empty network (no
+# 1:0 loop), and picks the first name when several input names share an id.
+sourceNames <- mapped$name[match(as.character(subNetwork$from), mapped$StringID)]
+targetNames <- mapped$name[match(as.character(subNetwork$to), mapped$StringID)]
+
+# Edges whose endpoints cannot be translated back to a name are dropped with a
+# warning rather than aborting the whole run.
+resolved <- !is.na(sourceNames) & !is.na(targetNames)
+if (any(!resolved)) {
+  warning(sprintf("%d interaction(s) dropped: no gene name for a STRING id", sum(!resolved)),
+          call. = FALSE)
+}
+
+# Scores stay numeric (no detour through a character matrix).
+edges <- data.frame(source = sourceNames[resolved],
+                    target = targetNames[resolved],
+                    score = subNetwork$score[resolved],
+                    stringsAsFactors = FALSE)
+
+# Write the edge list (columns: source, target, score) as a tab-delimited text
+# file with a header row and no quoting; progress is reported on stdout.
+cat(sprintf('\nSaving network (edges) to file: %s', networkFileName))
+cat('. . .')
+# TRUE is spelled out: T is an ordinary variable that can be reassigned.
+write.table(edges, networkFileName, sep = '\t', col.names = TRUE, row.names = FALSE, quote = FALSE)
+cat('Done!')
+