diff --git a/Dockerfile b/Dockerfile index b4de3b5..ed38887 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ -FROM continuumio/miniconda3:4.8.2 +FROM continuumio/miniconda3:4.10.3 LABEL developer="Po-E Li" LABEL email="po-e@lanl.gov" -LABEL version="1.0.1" +LABEL version="1.0.4" LABEL software="nmdc_taxa_profilers" LABEL tags="metagenome, bioinformatics, NMDC, taxonomy" @@ -18,26 +18,23 @@ RUN conda config --add channels conda-forge \ # install gottcha2 RUN conda install minimap2 pandas -RUN wget https://github.com/poeli/GOTTCHA2/archive/2.1.7.tar.gz \ - && tar -xzf 2.1.7.tar.gz \ - && cp GOTTCHA2-2.1.7/*.py /usr/local/bin \ - && rm -rf GOTTCHA2-2.1.7/ 2.1.7.zip +RUN wget https://github.com/poeli/GOTTCHA2/archive/2.1.8.1.tar.gz \ + && tar -xzf 2.1.8.1.tar.gz \ + && cp GOTTCHA2-2.1.8.1/*.py /usr/local/bin \ + && rm -rf GOTTCHA2-2.1.8.1/ 2.1.8.1.tar.gz # install kraken2 -RUN conda install kraken2=2.1.0 +RUN conda install kraken2=2.1.2 # install centrifuge -RUN wget https://github.com/DaehwanKimLab/centrifuge/archive/v1.0.4-beta.tar.gz \ - && tar -xzf v1.0.4-beta.tar.gz \ - && cd centrifuge-1.0.4-beta \ - && make install prefix=/usr/local +RUN conda create -n centrifuge centrifuge=1.0.4_beta # install krona RUN conda install krona \ && ktUpdateTaxonomy.sh # install additional libs -RUN conda install click +RUN conda install pandas click ADD *.py /opt/conda/bin/ CMD ["/bin/bash"] diff --git a/ReadbasedAnalysis.wdl b/ReadbasedAnalysis.wdl index 45970a9..4ccd305 100644 --- a/ReadbasedAnalysis.wdl +++ b/ReadbasedAnalysis.wdl @@ -14,7 +14,7 @@ workflow ReadbasedAnalysis { String? outdir Boolean? paired = false String bbtools_container="microbiomedata/bbtools:38.96" - String? docker = "microbiomedata/nmdc_taxa_profilers:1.0.2p1" + String?
docker = "microbiomedata/nmdc_taxa_profilers:1.0.4" call stage { input: @@ -53,6 +53,18 @@ workflow ReadbasedAnalysis { } } + call make_info_file { + input: enabled_tools = enabled_tools, + db = db, + docker = docker, + gottcha2_report_tsv = profilerGottcha2.report_tsv, + gottcha2_info = profilerGottcha2.info, + centrifuge_report_tsv = profilerCentrifuge.report_tsv, + centrifuge_info = profilerCentrifuge.info, + kraken2_report_tsv = profilerKraken2.report_tsv, + kraken2_info = profilerKraken2.info + } + call finish_reads { input: proj=proj, @@ -85,12 +98,14 @@ workflow ReadbasedAnalysis { File final_kraken2_report_tsv = finish_reads.kr_report_tsv File final_kraken2_krona_html = finish_reads.kr_krona_html File reads_objects = finish_reads.objects + File? info_file = make_info_file.profiler_info + String? info = make_info_file.profiler_info_text } meta { author: "Po-E Li, B10, LANL" email: "po-e@lanl.gov" - version: "1.0.2" + version: "1.0.4" } } @@ -245,3 +260,61 @@ task make_outputs{ } } + +task make_info_file { + Map[String, Boolean] enabled_tools + Map[String, String] db + String? docker + File? gottcha2_report_tsv + File? gottcha2_info + File? centrifuge_report_tsv + File? centrifuge_info + File? kraken2_report_tsv + File?
kraken2_info + String info_filename = "profiler.info" + + command <<< + set -euo pipefail + + # generate output info file + + info_text="Taxonomy profiling tools and databases used: " + echo $info_text > ${info_filename} + + if [[ ${enabled_tools['kraken2']} == true ]] + then + software_ver=`cat ${kraken2_info}` + #db_ver=`echo "${db['kraken2']}" | rev | cut -d'/' -f 1 | rev` + db_ver=`cat ${db['kraken2']}/db_ver.info` + info_text="Kraken2 v$software_ver (database version: $db_ver)" + echo $info_text >> ${info_filename} + fi + + if [[ ${enabled_tools['centrifuge']} == true ]] + then + software_ver=`cat ${centrifuge_info}` + db_ver=`cat $(dirname ${db['centrifuge']})/db_ver.info` + info_text="Centrifuge v$software_ver (database version: $db_ver)" + echo $info_text >> ${info_filename} + fi + + if [[ ${enabled_tools['gottcha2']} == true ]] + then + software_ver=`cat ${gottcha2_info}` + db_ver=`cat ${db['gottcha2']}/db_ver.info` + info_text="Gottcha2 v$software_ver (database version: $db_ver)" + echo $info_text >> ${info_filename} + fi + >>> + + output { + File profiler_info = "${info_filename}" + String profiler_info_text = read_string("${info_filename}") + } + runtime { + docker: docker + memory: "2G" + cpu: 1 + maxRetries: 1 + } +} diff --git a/ReadbasedAnalysisTasks.wdl b/ReadbasedAnalysisTasks.wdl index 9cb8be2..d5f2a46 100644 --- a/ReadbasedAnalysisTasks.wdl +++ b/ReadbasedAnalysisTasks.wdl @@ -8,6 +8,8 @@ task profilerGottcha2 { command <<< set -euo pipefail + .
/opt/conda/etc/profile.d/conda.sh + conda activate gottcha2 gottcha2.py -r ${RELABD_COL} \ -i ${sep=' ' READS} \ @@ -17,11 +19,14 @@ task profilerGottcha2 { --database ${DB} grep "^species" ${PREFIX}.tsv | ktImportTaxonomy -t 3 -m 9 -o ${PREFIX}.krona.html - || true + + gottcha2.py --version > ${PREFIX}.info >>> output { File report_tsv = "${PREFIX}.tsv" File full_tsv = "${PREFIX}.full.tsv" File krona_html = "${PREFIX}.krona.html" + File info = "${PREFIX}.info" } runtime { docker: DOCKER @@ -46,6 +51,8 @@ task profilerCentrifuge { command <<< set -euo pipefail + . /opt/conda/etc/profile.d/conda.sh + conda activate centrifuge centrifuge -x ${DB} \ -p ${CPU} \ @@ -54,11 +61,14 @@ task profilerCentrifuge { --report-file ${PREFIX}.report.tsv ktImportTaxonomy -m 5 -t 2 -o ${PREFIX}.krona.html ${PREFIX}.report.tsv + + centrifuge --version | head -1 | cut -d ' ' -f3 > ${PREFIX}.info >>> output { File classification_tsv="${PREFIX}.classification.tsv" File report_tsv="${PREFIX}.report.tsv" File krona_html="${PREFIX}.krona.html" + File info = "${PREFIX}.info" } runtime { docker: DOCKER @@ -84,6 +94,8 @@ task profilerKraken2 { command <<< set -euo pipefail + . 
/opt/conda/etc/profile.d/conda.sh + conda activate kraken2 kraken2 ${true="--paired" false='' PAIRED} \ --threads ${CPU} \ @@ -91,13 +103,17 @@ --output ${PREFIX}.classification.tsv \ --report ${PREFIX}.report.tsv \ ${sep=' ' READS} + + kraken2 --version | head -1 | cut -d ' ' -f3 > ${PREFIX}.info + conda deactivate ktImportTaxonomy -m 3 -t 5 -o ${PREFIX}.krona.html ${PREFIX}.report.tsv >>> output { File classification_tsv = "${PREFIX}.classification.tsv" File report_tsv = "${PREFIX}.report.tsv" File krona_html = "${PREFIX}.krona.html" + File info = "${PREFIX}.info" } runtime { docker: DOCKER diff --git a/outputTsv2json.py b/outputTsv2json.py index 3c2e55c..e8fbe2b 100755 --- a/outputTsv2json.py +++ b/outputTsv2json.py @@ -2,7 +2,6 @@ import os import json import pandas as pd -import numpy as np import click @click.command() @@ -10,7 +9,6 @@ def output2json(meta): """ Simple converter that takes TSV files to generate a summary JSON. """ - df = pd.DataFrame() out_dict = {} tsvfile_lod = json.load(meta) @@ -21,6 +19,7 @@ def output2json(meta): tool = tsvmeta['tool'] idx_col = 'taxRank' read_cnt_col = 'numReads' + df = pd.DataFrame() result = { 'classifiedReadCount': 0, @@ -33,7 +32,6 @@ def reduceDf(df, cols, ranks=['species','genus','family'], top=10): """ Report top # rows of ranks respectively and return a dict - df: results in dataframe cols: (rnk_col, name_col, read_count_col, abu_col, taxid_col) """