Merge pull request #35 from lemaslab/xinsong
add unknown search
XinsongDu authored Apr 24, 2020
2 parents 7bac1ab + 358fe47 commit 701cad5
Showing 9 changed files with 222 additions and 24 deletions.
4 changes: 2 additions & 2 deletions .travis/RUMP-test_aftermzmine.sh
@@ -1,2 +1,2 @@
# Test processes after MZmine with sample data
./nextflow run_aftermzmine.nf --input_dir_pos .travis/data/POS/ --input_dir_neg .travis/data/NEG --POS_design_path .travis/pos_design.csv --NEG_design_path .travis/neg_design.csv --cutoff 1 --pos_mzmine_peak_output .travis/pos_data.csv --neg_mzmine_peak_output .travis/neg_data.csv -with-docker galaxydream/metabolomics_pipeline
# Test processes MZmine output files with sample data
./nextflow run_aftermzmine.nf --input_dir_pos .travis/data/POS/ --input_dir_neg .travis/data/NEG --POS_design_path .travis/pos_design.csv --NEG_design_path .travis/neg_design.csv --cutoff 1 --pos_mzmine_peak_output .travis/pos_data.csv --neg_mzmine_peak_output .travis/neg_data.csv -with-docker xinsongdu/lemaslab_rump:v1.0.0
2 changes: 1 addition & 1 deletion .travis/RUMP-test_all.sh
@@ -4,5 +4,5 @@
wget https://github.com/mzmine/mzmine2/releases/download/v2.53/MZmine-2.53-Linux.zip && unzip MZmine-2.53-Linux.zip && rm MZmine-2.53-Linux.zip

# Test all processes with sample data
./nextflow main.nf --input_dir_pos .travis/data/POS/ --input_dir_neg .travis/data/NEG --POS_design_path .travis/pos_design.csv --NEG_design_path .travis/neg_design.csv --cutoff 1 -with-docker galaxydream/metabolomics_pipeline
./nextflow main.nf --input_dir_pos .travis/data/POS/ --input_dir_neg .travis/data/NEG --POS_design_path .travis/pos_design.csv --NEG_design_path .travis/neg_design.csv --cutoff 1 -with-docker xinsongdu/lemaslab_rump:v1.0.0

10 changes: 7 additions & 3 deletions Dockerfile
@@ -1,6 +1,6 @@
# Dockerfile for UMPIRE
# Dockerfile for RUMP

FROM rocker/r-ver:3.5.2
FROM rocker/rstudio:3.6.3

MAINTAINER [email protected]

@@ -58,7 +58,11 @@ WORKDIR /app
COPY accessibility.properties /app

# Fix a bug for java
RUN mv accessibility.properties /etc/java-8-openjdk/
# RUN mv accessibility.properties /etc/java-8-openjdk/

# install R packages
COPY r_package_install.R /app
RUN Rscript r_package_install.R

# Install mummichog
RUN pip install --upgrade setuptools
16 changes: 8 additions & 8 deletions README.md
@@ -29,7 +29,7 @@ wget https://github.com/mzmine/mzmine2/releases/download/v2.53/MZmine-2.53-Linux
```
4. Pull singularity image if using high-performance computing (**if using local machine, skip this step**)
```
mkdir -p work/singularity && singularity pull --name work/singularity/xinsongdu-lemaslab_reump.img docker://xinsongdu/lemaslab_rump:v0.0.0
mkdir -p work/singularity && singularity pull --name work/singularity/xinsongdu-lemaslab_reump.img docker://xinsongdu/lemaslab_rump:v1.0.0
```

# General Behavior
@@ -78,11 +78,11 @@ Negative mode:
- Create design files for positive data and negative data, indicating the group of each file, and save them to `data/pos_design.csv` and `data/neg_design.csv`. Sample design files can be found in `data/sample_data/pos_design.csv` and `data/sample_data/neg_design.csv`
- Process your data with default parameters using local machine
```
nextflow main.nf -with-docker xinsongdu/lemaslab_rump:v0.0.0
nextflow main.nf -with-docker xinsongdu/lemaslab_rump:v1.0.0
```
- Process your data with default parameters using high-performance computing (when doing so, it is recommended to maximize CPU and memory for the `pos_peakDetection_mzmine` and `neg_peakDetection_mzmine` processes in `nextflow.config`; see the sketch below the command)
```
nextflow main.nf --container singularity -with-singularity docker://xinsongdu/lemaslab_rump:v0.0.0
nextflow main.nf --container singularity -with-singularity docker://xinsongdu/lemaslab_rump:v1.0.0
```
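A minimal sketch of such overrides in `nextflow.config` (the CPU and memory values below are illustrative assumptions, not tuned recommendations; set them to whatever your HPC allocation allows):
```
process
{
    // illustrative values only - adjust to your cluster allocation
    withName: pos_peakDetection_mzmine
    {
        cpus = 16
        memory = '32 GB'
    }
    withName: neg_peakDetection_mzmine
    {
        cpus = 16
        memory = '32 GB'
    }
}
```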

### Process dataframes generated by MZmine-2.53
@@ -91,7 +91,7 @@ nextflow main.nf --container singularity -with-singularity docker://xinsongdu/le
- Create design files describing the group of each column of positive/negative data, save them to `data/pos_design.csv` and `data/neg_design.csv`
- Get statistical analysis and pathway analysis
```
nextflow run_aftermzmine.nf -with-docker xinsongdu/lemaslab_rump:v0.0.0
nextflow run_aftermzmine.nf -with-docker xinsongdu/lemaslab_rump:v1.0.0
```

### Help message
@@ -113,7 +113,7 @@ Check https://github.com/lemaslab/RUMP for updates, and refer to
https://github.com/lemaslab/RUMP/wiki
Usage:
nextflow run_all.nf [options] -with-docker xinsongdu/lemaslab_rump:v0.0.0
nextflow run_all.nf [options] -with-docker xinsongdu/lemaslab_rump:v1.0.0
Arguments (it is mandatory to change `input_file` and `mzmine_dir` before running):
----------------------------- common parameters ----------------------------------
@@ -128,7 +128,7 @@ Please refer to nextflow.config for more options.
Container:
Docker image to use with -with-docker|-with-singularity options is
'docker://xinsongdu/lemaslab_rump:v0.0.0'
'docker://xinsongdu/lemaslab_rump:v1.0.0'
RUMP supports .mzXML format files.
```
@@ -163,13 +163,13 @@ RUMP returns the following exit status values:
### Running tests on local machine

```
nextflow main.nf --input_dir_pos functional_test/sample_data/POS/ --input_dir_neg functional_test/sample_data/NEG --POS_design_path functional_test/sample_data/pos_design.csv --NEG_design_path functional_test/sample_data/neg_design.csv -with-docker xinsongdu/lemaslab_rump:v0.0.0
nextflow main.nf --input_dir_pos functional_test/sample_data/POS/ --input_dir_neg functional_test/sample_data/NEG --POS_design_path functional_test/sample_data/pos_design.csv --NEG_design_path functional_test/sample_data/neg_design.csv -with-docker xinsongdu/lemaslab_rump:v1.0.0
```

### Running tests on high-performance computing

```
nextflow main.nf --input_dir_pos functional_test/sample_data/POS/ --input_dir_neg functional_test/sample_data/NEG --POS_design_path functional_test/sample_data/pos_design.csv --NEG_design_path functional_test/sample_data/neg_design.csv --container singularity -with-singularity docker://xinsongdu/lemaslab_rump:v0.0.0
nextflow main.nf --input_dir_pos functional_test/sample_data/POS/ --input_dir_neg functional_test/sample_data/NEG --POS_design_path functional_test/sample_data/pos_design.csv --NEG_design_path functional_test/sample_data/neg_design.csv --container singularity -with-singularity docker://xinsongdu/lemaslab_rump:v1.0.0
```

# Bug reporting and feature requests
63 changes: 58 additions & 5 deletions main.nf
@@ -83,6 +83,10 @@ MQC_CONFIG = Channel.fromPath(params.mqc_config)
PYTHON_MUMMICHOG_INPUT_PREPARE = Channel.fromPath(params.python_mummichog_input_prepare)
PYTHON_MUMMICHOG_INPUT_PREPARE.into{PYTHON_MUMMICHOG_INPUT_PREPARE_NOBG; PYTHON_MUMMICHOG_INPUT_PREPARE_WITHBG}

// R code for unknown search
R_UNKNOWN_SEARCH = Channel.fromPath(params.r_unknown_search)
R_UNKNOWN_SEARCH.into{R_UNKNOWN_SEARCH_NOBG; R_UNKNOWN_SEARCH_WITHBG}

// Result files used by MultiQC to generate report.
// MQC_DIR = Channel.fromPath(params.mqc_dir, type: 'dir')

@@ -143,7 +147,7 @@ if (params.help) {
exit 1
}

// Unit tests
// Check appropriateness of input
process input_check {

echo true
@@ -278,8 +282,8 @@ process add_stats {
"""
}

POS_DATA_NOBG.into{POS_NOBG_FOR_BS; POS_NOBG_FOR_MQC; POS_NOBG_FOR_PCA; POS_NOBG_FOR_HCLUSTERING; POS_NOBG_FOR_VD; POS_NOBG_FOR_BARPLOT; POS_NOBG_FOR_MUMMICHOG}
NEG_DATA_NOBG.into{NEG_NOBG_FOR_BS; NEG_NOBG_FOR_MQC; NEG_NOBG_FOR_PCA; NEG_NOBG_FOR_HCLUSTERING; NEG_NOBG_FOR_VD; NEG_NOBG_FOR_BARPLOT; NEG_NOBG_FOR_MUMMICHOG}
POS_DATA_NOBG.into{POS_NOBG_FOR_BS; POS_NOBG_FOR_MQC; POS_NOBG_FOR_PCA; POS_NOBG_FOR_HCLUSTERING; POS_NOBG_FOR_VD; POS_NOBG_FOR_BARPLOT; POS_NOBG_FOR_MUMMICHOG; POS_NOBG_FOR_UNKNOWN_SEARCH}
NEG_DATA_NOBG.into{NEG_NOBG_FOR_BS; NEG_NOBG_FOR_MQC; NEG_NOBG_FOR_PCA; NEG_NOBG_FOR_HCLUSTERING; NEG_NOBG_FOR_VD; NEG_NOBG_FOR_BARPLOT; NEG_NOBG_FOR_MUMMICHOG; NEG_NOBG_FOR_UNKNOWN_SEARCH}

// Background subtraction
process blank_subtraction {
@@ -311,8 +315,8 @@ process blank_subtraction {


// split channel content for multiple-time use
POS_DATA_WITHBG.into{POS_WITHBG_FOR_MQC; POS_WITHBG_FOR_PCA; POS_WITHBG_FOR_HCLUSTERING; POS_WITHBG_FOR_VD; POS_WITHBG_FOR_BARPLOT; POS_WITHBG_FOR_MUMMICHOG}
NEG_DATA_WITHBG.into{NEG_WITHBG_FOR_MQC; NEG_WITHBG_FOR_PCA; NEG_WITHBG_FOR_HCLUSTERING; NEG_WITHBG_FOR_VD; NEG_WITHBG_FOR_BARPLOT; NEG_WITHBG_FOR_MUMMICHOG}
POS_DATA_WITHBG.into{POS_WITHBG_FOR_MQC; POS_WITHBG_FOR_PCA; POS_WITHBG_FOR_HCLUSTERING; POS_WITHBG_FOR_VD; POS_WITHBG_FOR_BARPLOT; POS_WITHBG_FOR_MUMMICHOG; POS_WITHBG_FOR_UNKNOWN_SEARCH}
NEG_DATA_WITHBG.into{NEG_WITHBG_FOR_MQC; NEG_WITHBG_FOR_PCA; NEG_WITHBG_FOR_HCLUSTERING; NEG_WITHBG_FOR_VD; NEG_WITHBG_FOR_BARPLOT; NEG_WITHBG_FOR_MUMMICHOG; NEG_WITHBG_FOR_UNKNOWN_SEARCH}

// Process for generating files that can be parsed by MultiQC regarding peak numbers of different steps.
process mqc_peak_number_comparison {
@@ -568,6 +572,55 @@ process bar_plot_withbg {

}

// Unknown search for metabolites identified before blank subtraction
process unknown_search_nobg {

    publishDir './results/peak_table/', mode: 'copy'

    input:
    file data_pos from POS_NOBG_FOR_UNKNOWN_SEARCH
    file data_neg from NEG_NOBG_FOR_UNKNOWN_SEARCH
    file r_unknown_search from R_UNKNOWN_SEARCH_NOBG

    output:
    file params.unknown_search_pos_nobg into UNKNOWN_SEARCH_POS_NOBG
    file params.unknown_search_neg_nobg into UNKNOWN_SEARCH_NEG_NOBG

    shell:
    """
    Rscript ${r_unknown_search} -i ${data_pos} -n positive -c ${params.mz_col_pos_nobg} -o ${params.unknown_search_pos_nobg} &&
    Rscript ${r_unknown_search} -i ${data_neg} -n negative -c ${params.mz_col_neg_nobg} -o ${params.unknown_search_neg_nobg}
    """

}

// Unknown search for metabolites identified after blank subtraction
process unknown_search_withbg {

    publishDir './results/peak_table/', mode: 'copy'

    input:
    file data_pos from POS_WITHBG_FOR_UNKNOWN_SEARCH
    file data_neg from NEG_WITHBG_FOR_UNKNOWN_SEARCH
    file r_unknown_search from R_UNKNOWN_SEARCH_WITHBG

    output:
    file params.unknown_search_pos_withbg into UNKNOWN_SEARCH_POS_WITHBG
    file params.unknown_search_neg_withbg into UNKNOWN_SEARCH_NEG_WITHBG

    when:
    params.bs == "1"

    shell:
    """
    Rscript ${r_unknown_search} -i ${data_pos} -n positive -c ${params.mz_col_pos_withbg} -o ${params.unknown_search_pos_withbg} &&
    Rscript ${r_unknown_search} -i ${data_neg} -n negative -c ${params.mz_col_neg_withbg} -o ${params.unknown_search_neg_withbg}
    """

}

process mqc_figs {

publishDir './results/mqc/', mode: 'copy'
27 changes: 26 additions & 1 deletion nextflow.config
@@ -106,8 +106,13 @@ params
python_barplot = "./rump/bar_plot.py"
data_info = "./rump/data_info.py"
peak_number_comparison_path = "./rump/peak_number_comparison.py"

python_bs = "./rump/blank_subtraction.py"
r_unknown_search = "./rump/unknown_search.R"

mz_col_pos_nobg = "row.m.z"
mz_col_neg_nobg = "row.m.z"
mz_col_pos_withbg = "row.m.z"
mz_col_neg_withbg = "row.m.z"

mqc_dir = "./results/mqc/"
experiments_info = "./rump/software_descriptions_mqc.txt"
@@ -193,6 +198,14 @@ params
barplot_neg_withbg = "neg_barplot_group1_withbg.png"
barplot_neg_withbg_om = "neg_onlymatched_barplot_group1_withbg.png"

// outputs for unknown_search_nobg
unknown_search_pos_nobg = "unknown_search_pos_nobg.csv"
unknown_search_neg_nobg = "unknown_search_neg_nobg.csv"

// outputs for unknown_search_withbg
unknown_search_pos_withbg = "unknown_search_pos_withbg.csv"
unknown_search_neg_withbg = "unknown_search_neg_withbg.csv"

// regarding mummichog
python_mummichog_input_prepare = "./rump/mummichog_input_prepare.py"
data_pos_nobg_group1_mummichog = "data_pos_nobg_group1_mummichog.txt"
@@ -288,6 +301,18 @@ process
cpus = 1
memory = '4 GB'
}
withName: unknown_search_nobg
{
time = '15m'
cpus = 1
memory = '4 GB'
}
withName: unknown_search_withbg
{
time = '15m'
cpus = 1
memory = '4 GB'
}
withName: mqc_figs
{
time = '15m'
4 changes: 4 additions & 0 deletions r_package_install.R
@@ -0,0 +1,4 @@
# install necessary packages
list.of.packages <- c("cmmr", "optparse")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
59 changes: 59 additions & 0 deletions rump/unknown_search.R
@@ -0,0 +1,59 @@
# 2018.12.19. ask
# rm(list=ls(all=TRUE))

# 20 Digits Precision Representation
options(scipen=20)

# Setting the correct working directory.
# NOTE!!! -> Can be linked differently on different computers.
# setwd("/Users/xinsongdu/mnt/projects/beach01/secimtools")

library(optparse) # add this library to enable argparse arguments
library(cmmr)
options(warn=-1)

## Define input and output arguments
option_list = list(
  make_option(c("-i", "--input"), type="character", default="bovine_enriched_unknown.csv",
              help="input data file"),
  make_option(c("-c", "--mz_col"), type="character", default="row.m.z",
              help="column name indicating m/z values"),
  make_option(c("-n", "--ion"), type="character", default="positive",
              help="ion mode"),
  make_option(c("-o", "--output"), type="character", default="searched_unknown_pos_after_blank_subtraction.csv",
              help="output csv file name")
);

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

# read data
data <- read.csv(file=opt$input)

# extract m/z values from the column specified by --mz_col (defaults to "row.m.z")
mzs <- as.vector(data[[opt$mz_col]])
# mzs = lapply(mzs,round,4)

# choose the adduct set based on ionization mode
if (opt$ion == "negative"){
  adduct <- '["M-H"]'
} else {
  adduct <- '["M+H"]'
}

# batch search against the CEU Mass Mediator API (arguments follow cmmr::batch_search)
batch_df <- batch_search('http://ceumass.eps.uspceu.es/mediator/api/v3/batch', # API endpoint
                         'all-except-peptides',  # metabolite types to include
                         '["all-except-mine"]',  # databases to query
                         'mz',                   # search by m/z value
                         opt$ion,                # ionization mode
                         adduct,                 # adduct set chosen above
                         5,                      # mass tolerance
                         'ppm',                  # tolerance unit
                         mzs)                    # m/z values from the input table
# if batch_search returned a character value (e.g. an error message) instead of a table, write an empty data frame
if (typeof(batch_df)=="character"){
  data_merge <- data.frame(Empty=character())
} else {
  data_merge <- merge(data, batch_df, by.x=opt$mz_col, by.y='experimental_mass')
}

write.csv(data_merge, opt$output, row.names=TRUE)