add version info for each classification tool

microbiomedata · Jan 30, 2023 · f4bdd33 · f4bdd33
1 parent 3defae7
commit f4bdd33
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 17 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,8 +1,8 @@
-FROM continuumio/miniconda3:4.8.2
+FROM continuumio/miniconda3:latest
 
 LABEL developer="Po-E Li"
 LABEL email="[email protected]"
-LABEL version="1.0.1"
+LABEL version="1.0.4"
 LABEL software="nmdc_taxa_profilers"
 LABEL tags="metagenome, bioinformatics, NMDC, taxonomy"
 
@@ -18,26 +18,23 @@ RUN conda config --add channels conda-forge \
 
 # install gottcha2
 RUN conda install minimap2 pandas
-RUN wget https://github.com/poeli/GOTTCHA2/archive/2.1.7.tar.gz \
-    && tar -xzf 2.1.7.tar.gz \
-    && cp GOTTCHA2-2.1.7/*.py /usr/local/bin \
-    && rm -rf GOTTCHA2-2.1.7/ 2.1.7.zip
+# fetch the GOTTCHA2 release, copy its Python scripts into /usr/local/bin, then remove the unpacked tree and the downloaded tarball
+RUN wget https://github.com/poeli/GOTTCHA2/archive/2.1.8.1.tar.gz \
+    && tar -xzf 2.1.8.1.tar.gz \
+    && cp GOTTCHA2-2.1.8.1/*.py /usr/local/bin \
+    && rm -rf GOTTCHA2-2.1.8.1/ 2.1.8.1.tar.gz
 
 # install kraken2
-RUN conda install kraken2=2.1.0
+RUN conda install kraken2=2.1.2
 
 # install centrifuge
-RUN wget https://github.com/DaehwanKimLab/centrifuge/archive/v1.0.4-beta.tar.gz \
-    && tar -xzf v1.0.4-beta.tar.gz \
-    && cd centrifuge-1.0.4-beta \
-    && make install prefix=/usr/local
+RUN conda create -n centrifuge centrifuge=1.0.4_beta
 
 # install krona
 RUN conda install krona \
     && ktUpdateTaxonomy.sh
 
 # install additional libs
-RUN conda install click
+RUN conda install pandas click
 ADD *.py /opt/conda/bin/
 
 CMD ["/bin/bash"]
diff --git a/ReadbasedAnalysis.wdl b/ReadbasedAnalysis.wdl
@@ -14,7 +14,7 @@ workflow ReadbasedAnalysis {
     String? outdir
     Boolean? paired = false
     String bbtools_container="microbiomedata/bbtools:38.96"
-    String? docker = "microbiomedata/nmdc_taxa_profilers:1.0.2p1"
+    String? docker = "microbiomedata/nmdc_taxa_profilers:1.0.4"
 
     call stage {
         input:
@@ -53,6 +53,19 @@ workflow ReadbasedAnalysis {
         }
     }
 
+    # Summarize tool and database versions from the profiler outputs into one info file.
+    call make_info_file {
+        input: enabled_tools = enabled_tools,
+            db = db,
+            docker = docker,
+            gottcha2_report_tsv = profilerGottcha2.report_tsv,
+            gottcha2_info = profilerGottcha2.info,
+            centrifuge_report_tsv = profilerCentrifuge.report_tsv,
+            centrifuge_info = profilerCentrifuge.info,
+            kraken2_report_tsv = profilerKraken2.report_tsv,
+            kraken2_info = profilerKraken2.info
+        }
+
     call finish_reads {
             input:
             proj=proj,
@@ -85,12 +98,14 @@ workflow ReadbasedAnalysis {
         File final_kraken2_report_tsv = finish_reads.kr_report_tsv
         File final_kraken2_krona_html = finish_reads.kr_krona_html
         File reads_objects = finish_reads.objects
+        File? info_file = make_info_file.profiler_info
+        String? info = make_info_file.profiler_info_text
     }
 
     meta {
         author: "Po-E Li, B10, LANL"
         email: "[email protected]"
-        version: "1.0.2"
+        version: "1.0.4"
     }
 }
 
@@ -245,3 +260,60 @@ task make_outputs{
     }
 }
 
+
+# Writes a short text file listing, for every enabled profiler, the software
+# version (read from the per-tool *_info file) and the database version (read
+# from a db_ver.info file located via the `db` map).
+# NOTE(review): `docker` and the *_report_tsv inputs are accepted but not
+# referenced inside this task.
+task make_info_file {
+    Map[String, Boolean] enabled_tools
+    Map[String, String] db
+    String? docker
+    File? gottcha2_report_tsv
+    File? gottcha2_info
+    File? centrifuge_report_tsv
+    File? centrifuge_info
+    File? kraken2_report_tsv
+    File? kraken2_info
+    String info_filename = "profiler.info"
+
+    command <<<
+        set -euo pipefail
+
+        # Output layout: one header line, then one line per enabled tool of the form
+        #   "<Tool> v<software version> (database version: <db version>)"
+        # Every path is quoted so locations containing spaces still work.
+        echo "Taxonomy profiling tools and databases used:" > "${info_filename}"
+
+        # $1: display name, $2: file holding the tool version, $3: file holding the database version
+        append_tool_info() {
+            local software_ver db_ver
+            # assigned separately from `local` so a failing cat still aborts under `set -e`
+            software_ver=$(cat "$2")
+            db_ver=$(cat "$3")
+            echo "$1 v$software_ver (database version: $db_ver)" >> "${info_filename}"
+        }
+
+        if [[ ${enabled_tools['kraken2']} == true ]]
+        then
+            append_tool_info "Kraken2" "${kraken2_info}" "${db['kraken2']}/db_ver.info"
+        fi
+
+        if [[ ${enabled_tools['centrifuge']} == true ]]
+        then
+            # db_ver.info is looked up in the parent directory of the centrifuge db path
+            append_tool_info "Centrifuge" "${centrifuge_info}" "$(dirname "${db['centrifuge']}")/db_ver.info"
+        fi
+
+        if [[ ${enabled_tools['gottcha2']} == true ]]
+        then
+            append_tool_info "Gottcha2" "${gottcha2_info}" "${db['gottcha2']}/db_ver.info"
+        fi
+    >>>
+
+    output {
+        File profiler_info = "${info_filename}"
+        String profiler_info_text = read_string("${info_filename}")
+    }
+    runtime {
+        memory: "2G"
+        cpu:  1
+        maxRetries: 1
+    }
+}
diff --git a/ReadbasedAnalysisTasks.wdl b/ReadbasedAnalysisTasks.wdl
@@ -8,6 +8,8 @@ task profilerGottcha2 {
 
     command <<<
         set -euo pipefail
+        . /opt/conda/etc/profile.d/conda.sh
+        conda activate gottcha2
 
         gottcha2.py -r ${RELABD_COL} \
                     -i ${sep=' ' READS} \
@@ -17,11 +19,14 @@ task profilerGottcha2 {
                     --database ${DB}
 
         grep "^species" ${PREFIX}.tsv | ktImportTaxonomy -t 3 -m 9 -o ${PREFIX}.krona.html - || true
+
+        gottcha2.py --version > ${PREFIX}.info
     >>>
     output {
         File report_tsv = "${PREFIX}.tsv"
         File full_tsv = "${PREFIX}.full.tsv"
         File krona_html = "${PREFIX}.krona.html"
+        File info = "${PREFIX}.info"
     }
     runtime {
         docker: DOCKER
@@ -46,6 +51,8 @@ task profilerCentrifuge {
 
     command <<<
         set -euo pipefail
+        . /opt/conda/etc/profile.d/conda.sh
+        conda activate centrifuge
 
         centrifuge -x ${DB} \
                    -p ${CPU} \
@@ -54,11 +61,14 @@ task profilerCentrifuge {
                    --report-file ${PREFIX}.report.tsv
 
         ktImportTaxonomy -m 5 -t 2 -o ${PREFIX}.krona.html ${PREFIX}.report.tsv
+
+        centrifuge --version | head -1 | cut -d ' ' -f3 > ${PREFIX}.info
     >>>
     output {
       File classification_tsv="${PREFIX}.classification.tsv"
       File report_tsv="${PREFIX}.report.tsv"
       File krona_html="${PREFIX}.krona.html"
+      File info = "${PREFIX}.info"
     }
     runtime {
         docker: DOCKER
@@ -84,20 +94,26 @@ task profilerKraken2 {
 
     command <<<
         set -euo pipefail
+        . /opt/conda/etc/profile.d/conda.sh
+        conda activate kraken2
 
         kraken2 ${true="--paired" false='' PAIRED} \
                 --threads ${CPU} \
                 --db ${DB} \
                 --output ${PREFIX}.classification.tsv \
                 --report ${PREFIX}.report.tsv \
                 ${sep=' ' READS}
+        # record the version while the kraken2 environment is still active; after deactivation the kraken2 on PATH may differ or be missing
+        kraken2 --version | head -1 | cut -d ' ' -f3 > ${PREFIX}.info
+        conda deactivate
 
         ktImportTaxonomy -m 3 -t 5 -o ${PREFIX}.krona.html ${PREFIX}.report.tsv
     >>>
     output {
       File classification_tsv = "${PREFIX}.classification.tsv"
       File report_tsv = "${PREFIX}.report.tsv"
       File krona_html = "${PREFIX}.krona.html"
+      File info = "${PREFIX}.info"
     }
     runtime {
         docker: DOCKER

diff --git a/outputTsv2json.py b/outputTsv2json.py
@@ -2,15 +2,13 @@
 import os
 import json
 import pandas as pd
-import numpy as np
 import click
 
 @click.command()
 @click.option('--meta', type=click.File('r'), help='JSON of the metadata of output files')
 
 def output2json(meta):
     """ Simple converter that takes TSV files to generate a summary JSON. """
-    df = pd.DataFrame()
     out_dict = {}
 
     tsvfile_lod = json.load(meta)
@@ -21,6 +19,7 @@ def output2json(meta):
         tool = tsvmeta['tool']
         idx_col = 'taxRank'
         read_cnt_col = 'numReads'
+        df = pd.DataFrame()
 
         result = {
             'classifiedReadCount': 0,
@@ -33,7 +32,6 @@ def output2json(meta):
         def reduceDf(df, cols, ranks=['species','genus','family'], top=10):
             """
             Report top # rows of ranks respectively and return a dict
-
             df: results in dataframe
             cols: (rnk_col, name_col, read_count_col, abu_col, taxid_col)
             """