TGAC · PerlaTroncosoRey · Apr 8, 2017 · Apr 8, 2017 · Apr 8, 2017 · Apr 8, 2017
diff --git a/tools/MultiPEN/MultiPEN-enrichment-KEGG.xml b/tools/MultiPEN/MultiPEN-enrichment-KEGG.xml
diff --git a/tools/MultiPEN/MultiPEN-feature-selection.xml b/tools/MultiPEN/MultiPEN-feature-selection.xml
@@ -7,24 +7,20 @@
         <exit_code range="1:" level="fatal" />
     </stdio>
     <command><![CDATA[
-run_MultiPEN_slurm.sh FeatureSelection ./ '$ExpressionData' '$Interactions' '$SampleClass' $lambda $DecisionThreshold
-#if str($MaxIter)
-    $MaxIter
-#end if
+run_MultiPEN_slurm.sh FeatureSelection ./ '$ExpressionData' '$Interactions' '$SampleClass' $lambda $optionalParameters
 &&
-mv MultiPEN-performance_feature-selection_lambda${lambda}.txt Performance.txt &&
-mv MultiPEN-Rankings_lambda${lambda}.txt Rankings.txt &&
-mv MultiPEN-vts_lambda${lambda}.txt vts.txt &&
-mv MultiPEN-Rankings_lambda${lambda}_genes-higher-in-cases.txt Rankings-higherInCases.txt &&
-mv MultiPEN-Rankings_lambda${lambda}_genes-higher-in-control.txt Rankings-higherInControl.txt
+mv MultiPEN-performance_feature-selection_lambda${lambda}_${optionalParameters}.txt Performance.txt &&
+mv MultiPEN-Rankings_lambda${lambda}_${optionalParameters}.txt Rankings.txt &&
+mv MultiPEN-vts_lambda${lambda}_${optionalParameters}.txt vts.txt &&
+mv MultiPEN-Rankings_lambda${lambda}_genes-higher-in-cases_${optionalParameters}.txt Rankings-higherInCases.txt &&
+mv MultiPEN-Rankings_lambda${lambda}_genes-higher-in-control_${optionalParameters}.txt Rankings-higherInControl.txt
     ]]></command>
     <inputs>
         <param name="ExpressionData" type="data" format="tabular" label="Expression and/or levels for features (genes and/or metabolites)" help="Gene expression and/or metabolite levels" />
         <param name="Interactions" type="data" format="tabular" label="Interaction Network" help="Molecular interaction network, where every interaction is defined by three elements: the source  node (name of gene and/or metabolite), the target node (name of gene and/or metabolite) and the weight for the interaction (a real number in the range [0,1]), i.e., the confidence level. The interaction matrix is provided as a tabular file with three columns: source, target and weight, and where each row corresponds to an interaction."  />
         <param name="SampleClass" type="data" format="txt" label="SampleClass for the samples" help="Tabular file with two columns: sample and class. The sample column contains the names of all samples, which corresponds to the columns in 'ExpressionData'. The class for each sample is 1 for cases and 0 for control" />
         <param name="lambda" type="float" value="" label="Lambda for the optimisation function" help="" />
-        <param name="DecisionThreshold" type="float" value="0.5" label="Decision threshold for classification" help="Optional parameter, the default value is 0.5" />
-        <param name="MaxIter" type="integer" value="100" optional="true" label="Maximum number of iterations for optimisation" help="Optional parameter, the default value is 100" />
+        <param name="optionalParameters" type="string" value="000.500100" label="Optional parameters" help="The digits of this string represent a sequence of parameters. The following triples define the position in the string, the parameter name and its possible values (i.e., the left most value in the string states whether the log2 transform has been used or not): {1, log2 transform, [0,1]}, {2, z-scores, [0,1]}, {3 to 6, decision threshold, e.g.: 0.60}, {7 to 10, max number of iterations, e.g.: 0300 (default value is set to 0100)}. For example, the default string shown, 000.500100, states that no log 2 transformed is used, no z-scores are used, decision threshold is 0.50 and a maximum of 100 iterations are used." />
     </inputs>
     <outputs>
         <data name="Config" format="txt" label="${tool.name} on ${on_string} (lambda $lambda): Configuration" from_work_dir="MultiPEN-feature-selection_config.txt" />

diff --git a/tools/MultiPEN/MultiPEN-StringDBNetwork.xml → tools/MultiPEN/Rscript-STRINGdb-network.xml b/tools/MultiPEN/MultiPEN-StringDBNetwork.xml → tools/MultiPEN/Rscript-STRINGdb-network.xml
@@ -1,17 +1,18 @@
-<tool id="MultiPEN-StringDBNetwork" name="StringDB Network" version="0.0.3">
-    <description> Compiles network from list of genes</description>
+<tool id="stringdb network" name="stringdb-network" version="0.0.3">
+    <description> Compile network from list of genes</description>
     <requirements>
-        <requirement type="package" version="0.0.3">MultiPEN</requirement>
+        <requirement type="package" version="3.3.1">r</requirement>
+        <requirement type="package" version="3.0.5">bioconductor-STRINGdb</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:" level="fatal" />
     </stdio>
     <command><![CDATA[
-run_MultiPEN_slurm.sh StringDBNetwork ./ '$geneList' $SpeciesCode $threshold '$networkFileName'
+        Rscript '$__tool_directory__'/compileNetworkStringDB.R '$geneList' $SpeciesCode $threshold '$networkFileName'
     ]]></command>
     <inputs>
-        <param name="geneList" type="data" format="tabular" label="list of genes" help="List of genes (use symbol names for genes) to compile a network" />
-        <param name="SpeciesCode" type="integer" value="" label="Code for your species (i.e., 9606 for humans)" help="Use symbol names for genes to compile a network" />
+        <param name="geneList" type="data" format="tabular" label="list of genes" help="List of genes (use symbol gene names) to compile a network" />
+        <param name="SpeciesCode" type="integer" value="" label="NCBI taxonomy identifiers for your organism (i.e., 9606 for Human, 10090 for mouse)" help="If you don’t know your organism's identifier, you can search the NCBI Taxonomy from http://www.nlm.nih.gov/taxonomy"/>
         <param name="threshold" type="float" value="0.60" label="threshold for the combined score for the interactions" help="Default value set to 0.60" />
     </inputs>
     <outputs>
@@ -24,7 +25,7 @@ run_MultiPEN_slurm.sh StringDBNetwork ./ '$geneList' $SpeciesCode $threshold '$n
         </test>
     </tests>
     <help><![CDATA[
-MultiPEN includes a Wrapper to use the R package STRINGdb (bioconductor): Search Tool for the Retrieval of Interacting Proteins database (see http://bioconductor.org/packages/release/bioc/html/STRINGdb.html).
+        This wrapper compiles a PPI network from a list of genes, using the bioconductor R package STRINGdb: Search Tool for the Retrieval of Interacting Proteins database (see http://bioconductor.org/packages/release/bioc/html/STRINGdb.html).
     ]]></help>
     <citations>
         <citation type="doi">10.1093/nar/gks1094</citation>

diff --git a/tools/MultiPEN/MultiPEN-enrichment-GO.xml → tools/MultiPEN/Rscript-enrichment-GO.xml b/tools/MultiPEN/MultiPEN-enrichment-GO.xml → tools/MultiPEN/Rscript-enrichment-GO.xml
@@ -1,16 +1,20 @@
-<tool id="MultiPEN-enrichment-GO" name="Enrichment with GO" version="0.0.3">
-    <description> (enrichment with Gene Ontology)</description>
+<tool id="enrich go" name="enrich go" version="0.0.3">
+    <description> over-representation and GSE analysis with Gene Ontology</description>
     <requirements>
-        <requirement type="package" version="0.0.3">MultiPEN</requirement>
+        <requirement type="package" version="3.3.1">r</requirement>
+        <requirement type="package" version="3.0.5">bioconductor-clusterProfiler</requirement>
+        <requirement type="package" version="1.10">r-BBmisc</requirement>
+        <requirement type="package" version="3.3.0">bioconductor-GO.db</requirement>
+        <requirement type="package" version="3.3.0">bioconductor-org.Hs.eg.db</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:" level="fatal" />
     </stdio>
     <command><![CDATA[
-run_MultiPEN_slurm.sh EnrichmentGO ./ '$rankings'
+        Rscript '$__tool_directory__'/enrichmentGO.R '$rankings'
     ]]></command>
     <inputs>
-        <param name="rankings" type="data" format="tabular" label="Rankings for all features" help="Ranking of features (genes and/or metabolites). The rankings must be specified in a tab delimited text file with (at least) three columns: 'name' (use symbol names for genes or chEBI IDs for metabolites), 'value' (used to rank the features) and 'ranking' (ranking 1 represent the most important feature)." />
+        <param name="rankings" type="data" format="tabular" label="Rankings for all features" help="Ranking of genes which must be specified in a tab delimited text file with (at least) three columns: 'name' (this is the gene names), 'value' (used to rank the genes) and 'ranking' (where ranking 1 represent the most important feature)." />
     </inputs>
     <outputs>
         <data name="enrichment-GO" format="txt" label="${tool.name}: over-representation analysis with Gene Ontology" from_work_dir="enrichment-GO.txt" />
@@ -30,9 +34,12 @@ run_MultiPEN_slurm.sh EnrichmentGO ./ '$rankings'
         </test>
     </tests>
     <help><![CDATA[
-MultiPEN includes a Wrapper to use the R package clusterProfiler to perform over-representation analysis with Gene Ontology.
+        This wrapper performs over-representation and gene set enrichment analysis from a list of genes using the bioconductor R package clusterProfiler and Gene Ontology.
     ]]></help>
     <citations>
         <citation type="doi">10.1089/omi.2011.0118</citation>
+        <citation type="doi">10.1038/75556</citation>
+        <citation type="doi">10.1093/nar/gku1179</citation>
+
     </citations>
 </tool>
diff --git a/tools/MultiPEN/Rscript-enrichment-KEGG.xml b/tools/MultiPEN/Rscript-enrichment-KEGG.xml
@@ -0,0 +1,52 @@
+<tool id="enrich kegg" name="enrich kegg" version="0.0.3">
+    <description> over-representation and GSE analysis with KEGG</description>
+    <requirements>
+        <requirement type="package" version="3.3.1">r</requirement>
+        <requirement type="package" version="3.0.5">bioconductor-clusterProfiler</requirement>
+        <requirement type="package" version="1.10">r-BBmisc</requirement>
+        <requirement type="package" version="3.3.0">bioconductor-org.Hs.eg.db</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" level="fatal" />
+    </stdio>
+    <command><![CDATA[
+        Rscript '$__tool_directory__'/enrichmentKEGG.R '$rankings'
+    ]]></command>
+    <inputs>
+        <param name="rankings" type="data" format="tabular" label="Rankings for all features" help="Ranking of genes which should be specified in a tabular delimited text file with (at least) three columns: 'name' (this are gene names), 'value' (used to rank the genes) and 'ranking' (where ranking 1 represent the most important feature)" />
+    </inputs>
+    <!-- Output names must be unique within a tool: the over-representation table and its PDF plot get distinct names. -->
+    <outputs>
+        <data name="enrichment-KEGG" format="txt" label="${tool.name}: over-representation analysis with KEGG" from_work_dir="enrichment-KEGG.txt" />
+        <data name="enrichment-KEGG-plot" format="pdf" label="${tool.name}: over-representation analysis with KEGG" from_work_dir="enrichment-KEGG_BP.pdf" />
+        <data name="gse-KEGG" format="txt" label="Gene set enrichment analysis with KEGG" from_work_dir="gse-KEGG.txt" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="rankings" ftype="tabular" value="MultiPEN-Rankings_lambda0.0001.txt" />
+            <output name="enrichment-KEGG" file="enrichment-KEGG" ftype="txt" />
+            <output name="enrichment-KEGG-plot" file="enrichment-KEGG_BP" ftype="pdf" />
+            <output name="gse-KEGG" file="gse-KEGG" ftype="txt" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        This wrapper performs over-representation and gene set enrichment analysis from a list of genes using the bioconductor R package clusterProfiler and KEGG.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1089/omi.2011.0118</citation>
+        <citation type="doi">10.1093/nar/gkw1092</citation>
+        <citation type="doi">10.1093/nar/gkv1070</citation>
+        <citation type="bibtex">@article{Kanehisa2000,
+            Author = {Kanehisa, M and Goto, S},
+            Crdt = {1999/12/11 09:00},
+            Date = {2000 Jan 01},
+            Journal = {Nucleic Acids Res},
+            Month = {Jan},
+            Number = {1},
+            Oid = {NLM: PMC102409},
+            Pages = {27--30},
+            Title = {KEGG: kyoto encyclopedia of genes and genomes.},
+            Volume = {28},
+            Year = {2000}}</citation>
+    </citations>
+</tool>
diff --git a/tools/MultiPEN/compileNetworkStringDB.R b/tools/MultiPEN/compileNetworkStringDB.R
@@ -0,0 +1,89 @@
+# Script to compile a Protein-Protein Interaction network using STRINGdb: 
+# "STRINGdb (Search Tool for the Retrieval of Interacting proteins database)"
+#    Franceschini A, et al. (2013). “STRING v9.1: protein-protein interaction networks, with increased coverage and integration.” Nucleic Acids Research, 41(Database issue), D808-D815.
+# 
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#Inputs: 
+# fileName - table with column 'name'
+# speciesCode = 9606  #homo sapiens
+# threshold = minimum combined score
+# networkFileName = "SI_network.human.NormalisedExpressionLevels.csv"
+
+# It requires the R package:
+# STRINGdb, http://bioconductor.org/packages/release/bioc/html/STRINGdb.html
+#
+# To run the script from a terminal use the command:
+# Rscript compileNetworkStringDB.R 'path-to-directory/fileName.txt' speciesCode threshold 'path-to-output-folder/networkFileName.txt'
+
+
+
+# Input arguments: <fileName> <speciesCode> <threshold> <networkFileName>
+args <- commandArgs(trailingOnly = TRUE)
+
+# User must provide all four input parameters
+if (length(args) != 4) {
+  stop("Please specify file name, species code, threshold and the name of the network", call. = FALSE)
+}
+
+fileName <- args[1]
+speciesCode <- as.numeric(args[2])
+threshold <- as.numeric(args[3])
+networkFileName <- args[4]
+if (is.na(speciesCode) || is.na(threshold)) {
+  stop("Species code and threshold must be numeric", call. = FALSE)
+}
+
+# Read the tab-delimited input table; the genes are taken from its 'name' column
+inputData <- read.delim(fileName, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+if (!"name" %in% colnames(inputData)) {
+  stop("The input table must contain a column called 'name'", call. = FALSE)
+}
+
+
+#### begin compiling network ####
+library(STRINGdb)
+string_db <- STRINGdb$new(version = "10", species = speciesCode, score_threshold = threshold, input_directory = "")
+mapped <- string_db$map(inputData, "name", removeUnmappedRows = TRUE)
+
+# get interactions
+inter <- string_db$get_interactions(mapped$STRING_id)
+
+# STRING ids look like "<speciesCode>.<protein id>"; strip only that leading prefix.
+# The dot is escaped and the pattern anchored so no other part of an id is touched;
+# format() keeps codes such as 100000 from being pasted as "1e+05".
+prefix <- paste0("^", format(speciesCode, scientific = FALSE), "\\.")
+# normalise combined_score values: divide by 1000
+network <- data.frame(from = sub(prefix, "", inter$from), to = sub(prefix, "", inter$to),
+                      score = inter$combined_score / 1000, stringsAsFactors = FALSE)
+subNetwork <- network[network$score > threshold, ]
+
+# edit STRING_id (speciesCode.ENSPxxxxx) to remove speciesCode
+mapped$StringID <- sub(prefix, "", mapped$STRING_id)
+mapped$STRING_id <- NULL
+
+
+#### network with gene names ####
+# match() is vectorised, copes with an empty network and uses the first name when
+# several input names map to the same STRING id (a row-by-row lookup fails on both).
+edges <- data.frame(source = mapped$name[match(subNetwork$from, mapped$StringID)],
+                    target = mapped$name[match(subNetwork$to, mapped$StringID)],
+                    score = subNetwork$score, stringsAsFactors = FALSE)
+
+# write the network (one edge per row: source, target, score) as a tab-delimited file
+cat(sprintf('\nSaving network (edges) to file: %s', networkFileName))
+cat('. . .')
+write.table(edges, networkFileName, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
+cat(sprintf('Done!'))
+