diff --git a/Dockerfile b/Dockerfile index b4de3b5..ed38887 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ -FROM continuumio/miniconda3:4.8.2 +FROM continuumio/miniconda3:4.12.0 LABEL developer="Po-E Li" LABEL email="po-e@lanl.gov" -LABEL version="1.0.1" +LABEL version="1.0.4" LABEL software="nmdc_taxa_profilers" LABEL tags="metagenome, bioinformatics, NMDC, taxonomy" @@ -18,26 +18,23 @@ RUN conda config --add channels conda-forge \ # install gottcha2 RUN conda install minimap2 pandas -RUN wget https://github.com/poeli/GOTTCHA2/archive/2.1.7.tar.gz \ - && tar -xzf 2.1.7.tar.gz \ - && cp GOTTCHA2-2.1.7/*.py /usr/local/bin \ - && rm -rf GOTTCHA2-2.1.7/ 2.1.7.zip +RUN wget https://github.com/poeli/GOTTCHA2/archive/2.1.8.1.tar.gz \ + && tar -xzf 2.1.8.1.tar.gz \ + && cp GOTTCHA2-2.1.8.1/*.py /usr/local/bin \ + && rm -rf GOTTCHA2-2.1.8.1/ 2.1.8.1.tar.gz # install kraken2 -RUN conda install kraken2=2.1.0 +RUN conda install kraken2=2.1.2 # install centrifuge -RUN wget https://github.com/DaehwanKimLab/centrifuge/archive/v1.0.4-beta.tar.gz \ - && tar -xzf v1.0.4-beta.tar.gz \ - && cd centrifuge-1.0.4-beta \ - && make install prefix=/usr/local +RUN conda create -n centrifuge centrifuge=1.0.4_beta # install krona RUN conda install krona \ && ktUpdateTaxonomy.sh # install additional libs -RUN conda install click +RUN conda install pandas click ADD *.py /opt/conda/bin/ CMD ["/bin/bash"] diff --git a/ReadbasedAnalysis.wdl b/ReadbasedAnalysis.wdl index 1de443e..c08df65 100644 --- a/ReadbasedAnalysis.wdl +++ b/ReadbasedAnalysis.wdl @@ -1,83 +1,250 @@ import "ReadbasedAnalysisTasks.wdl" as tasks workflow ReadbasedAnalysis { - Map[String, Boolean] enabled_tools - Map[String, String] db - Array[File] reads - Int cpu + Boolean enabled_tools_gottcha2 = true + Boolean enabled_tools_kraken2 = true + Boolean enabled_tools_centrifuge = true + String db_gottcha2 = "/refdata/gottcha2/RefSeq-r90.cg.BacteriaArchaeaViruses.species.fna" + String db_kraken2 = "/refdata/kraken2/" + String
db_centrifuge = "/refdata/centrifuge/p_compressed" + Int cpu = 8 + String input_file + String proj + String resource + String informed_by + String? git_url="https://github.com/microbiomedata/mg_annotation/releases/tag/0.1" + String? url_root="https://data.microbiomedata.org/data/" String prefix - String outdir + String? outdir Boolean? paired = false - String? docker = "microbiomedata/nmdc_taxa_profilers:1.0.2" + String bbtools_container="microbiomedata/bbtools:38.96" + String? docker = "microbiomedata/nmdc_taxa_profilers:1.0.4" + + call stage { + input: + container=bbtools_container, + input_file=input_file + } - if (enabled_tools["gottcha2"] == true) { + if (enabled_tools_gottcha2 == true) { call tasks.profilerGottcha2 { - input: READS = reads, - DB = db["gottcha2"], + input: READS = stage.reads, + DB = db_gottcha2, PREFIX = prefix, CPU = cpu, DOCKER = docker } } - if (enabled_tools["kraken2"] == true) { + if (enabled_tools_kraken2 == true) { call tasks.profilerKraken2 { - input: READS = reads, + input: READS = stage.reads, PAIRED = paired, - DB = db["kraken2"], + DB = db_kraken2, PREFIX = prefix, CPU = cpu, DOCKER = docker } } - if (enabled_tools["centrifuge"] == true) { + if (enabled_tools_centrifuge == true) { call tasks.profilerCentrifuge { - input: READS = reads, - DB = db["centrifuge"], + input: READS = stage.reads, + DB = db_centrifuge, PREFIX = prefix, CPU = cpu, DOCKER = docker } } -# call tasks.generateSummaryJson { -# input: TSV_META_JSON = [profilerGottcha2.results, profilerCentrifuge.results, profilerKraken2.results], -# PREFIX = prefix, -# OUTPATH = outdir, -# DOCKER = docker -# } - call make_outputs { - input: gottcha2_report_tsv = profilerGottcha2.report_tsv, - gottcha2_full_tsv = profilerGottcha2.full_tsv, - gottcha2_krona_html = profilerGottcha2.krona_html, - centrifuge_classification_tsv = profilerCentrifuge.classification_tsv, - centrifuge_report_tsv = profilerCentrifuge.report_tsv, - centrifuge_krona_html = profilerCentrifuge.krona_html, - 
kraken2_classification_tsv = profilerKraken2.classification_tsv, - kraken2_report_tsv = profilerKraken2.report_tsv, - kraken2_krona_html = profilerKraken2.krona_html, - outdir = outdir, - container = docker - } + call make_info_file { + input: docker = docker, + enabled_tools_gottcha2 = enabled_tools_gottcha2, + enabled_tools_kraken2 = enabled_tools_kraken2, + enabled_tools_centrifuge = enabled_tools_centrifuge, + db_gottcha2 = db_gottcha2, + db_kraken2 = db_kraken2, + db_centrifuge = db_centrifuge, + gottcha2_report_tsv = profilerGottcha2.report_tsv, + gottcha2_info = profilerGottcha2.info, + centrifuge_report_tsv = profilerCentrifuge.report_tsv, + centrifuge_info = profilerCentrifuge.info, + kraken2_report_tsv = profilerKraken2.report_tsv, + kraken2_info = profilerKraken2.info + } + + call finish_reads { + input: + proj=proj, + start=stage.start, + git_url=git_url, + url_root=url_root, + input_file=stage.read_in, + container="microbiomedata/workflowmeta:1.1.1", + informed_by=informed_by, + resource=resource, + gottcha2_report_tsv=profilerGottcha2.report_tsv, + gottcha2_full_tsv=profilerGottcha2.full_tsv, + gottcha2_krona_html=profilerGottcha2.krona_html, + centrifuge_classification_tsv=profilerCentrifuge.classification_tsv, + centrifuge_report_tsv=profilerCentrifuge.report_tsv, + centrifuge_krona_html=profilerCentrifuge.krona_html, + kraken2_classification_tsv=profilerKraken2.classification_tsv, + kraken2_report_tsv=profilerKraken2.report_tsv, + kraken2_krona_html=profilerKraken2.krona_html + } output { - File? gottcha2_report_tsv = profilerGottcha2.report_tsv - File? gottcha2_full_tsv = profilerGottcha2.full_tsv - File? gottcha2_krona_html = profilerGottcha2.krona_html - File? centrifuge_classification_tsv = profilerCentrifuge.classification_tsv - File? centrifuge_report_tsv = profilerCentrifuge.report_tsv - File? centrifuge_krona_html = profilerCentrifuge.krona_html - File?
kraken2_classification_tsv = profilerKraken2.classification_tsv - File? kraken2_report_tsv = profilerKraken2.report_tsv - File? kraken2_krona_html = profilerKraken2.krona_html -# File summary_json = generateSummaryJson.summary_json + File final_gottcha2_report_tsv = finish_reads.g2_report_tsv + File final_gottcha2_full_tsv = finish_reads.g2_full_tsv + File final_gottcha2_krona_html = finish_reads.g2_krona_html + File final_centrifuge_classification_tsv = finish_reads.cent_classification_tsv + File final_centrifuge_report_tsv = finish_reads.cent_report_tsv + File final_centrifuge_krona_html = finish_reads.cent_krona_html + File final_kraken2_classification_tsv = finish_reads.kr_classification_tsv + File final_kraken2_report_tsv = finish_reads.kr_report_tsv + File final_kraken2_krona_html = finish_reads.kr_krona_html + File reads_objects = finish_reads.objects + File? info_file = make_info_file.profiler_info + String? info = make_info_file.profiler_info_text } meta { author: "Po-E Li, B10, LANL" email: "po-e@lanl.gov" - version: "1.0.2" + version: "1.0.4" + } +} + + +task stage { + String container + String input_file + String? 
memory = "4G" + String target = "staged.fastq.gz" + String output1 = "input.left.fastq.gz" + String output2 = "input.right.fastq.gz" + + command <<< + set -e + if [ $( echo ${input_file}|egrep -c "https*:") -gt 0 ] ; then + wget ${input_file} -O ${target} + else + ln ${input_file} ${target} || cp ${input_file} ${target} + fi + + reformat.sh -Xmx${default="10G" memory} in=${target} out1=${output1} out2=${output2} + # Capture the start time + date --iso-8601=seconds > start.txt + + >>> + + output{ + File read_in = target + Array[File] reads = [output1, output2] + String start = read_string("start.txt") + } + runtime { + cpu: 2 + maxRetries: 1 + docker: container + } +} + +task finish_reads { + String input_file + String container + String git_url + String informed_by + String proj + String prefix=sub(proj, ":", "_") + String resource + String url_root + String start + File gottcha2_report_tsv + File gottcha2_full_tsv + File gottcha2_krona_html + File centrifuge_classification_tsv + File centrifuge_report_tsv + File centrifuge_krona_html + File kraken2_classification_tsv + File kraken2_report_tsv + File kraken2_krona_html + + command <<< + + set -e + end=`date --iso-8601=seconds` + # Set names + if [[ $(head -2 ${gottcha2_report_tsv}|wc -l) -eq 1 ]] ; then + echo "Nothing found in gottcha2 for ${proj} $end" >> ${prefix}_gottcha2_report.tsv + else + ln ${gottcha2_report_tsv} ${prefix}_gottcha2_report.tsv + fi + ln ${gottcha2_full_tsv} ${prefix}_gottcha2_full_tsv + ln ${gottcha2_krona_html} ${prefix}_gottcha2_krona.html + + ln ${centrifuge_classification_tsv} ${prefix}_centrifuge_classification.tsv + if [[ $(head -2 ${centrifuge_report_tsv}|wc -l) -eq 1 ]] ; then + echo "Nothing found in centrifuge for ${proj} $end" >> ${prefix}_centrifuge_report.tsv + else + ln ${centrifuge_report_tsv} ${prefix}_centrifuge_report.tsv + fi + ln ${centrifuge_krona_html} ${prefix}_centrifuge_krona.html + + ln ${kraken2_classification_tsv} ${prefix}_kraken2_classification.tsv + if [[ 
$(head -2 ${kraken2_report_tsv}|wc -l) -eq 1 ]] ; then + echo "Nothing found in kraken2 for ${proj} $end" >> ${prefix}_kraken2_report.tsv + else + ln ${kraken2_report_tsv} ${prefix}_kraken2_report.tsv + fi + ln ${kraken2_krona_html} ${prefix}_kraken2_krona.html + + /scripts/generate_object_json.py \ + --type "nmdc:ReadBasedAnalysisActivity" \ + --set read_based_taxonomy_analysis_activity_set \ + --part ${proj} \ + -p "name=ReadBased Analysis Activity for ${proj}" \ + was_informed_by=${informed_by} \ + started_at_time=${start} \ + ended_at_time=$end \ + execution_resource="${resource}" \ + git_url=${git_url} \ + version="v1.0.2-beta" \ + --url ${url_root}${proj}/ReadbasedAnalysis/ \ + --inputs ${input_file} \ + --outputs \ + ${prefix}_gottcha2_report.tsv "GOTTCHA2 classification report file" "GOTTCHA2 Classification Report" "GOTTCHA2 Classification for ${proj}"\ + ${prefix}_gottcha2_full_tsv "GOTTCHA2 report file" "GOTTCHA2 Report Full" "GOTTCHA2 Full Report for ${proj}" \ + ${prefix}_gottcha2_krona.html "GOTTCHA2 krona plot HTML file" "GOTTCHA2 Krona Plot" "GOTTCHA2 Krona for ${proj}"\ + ${prefix}_centrifuge_classification.tsv "Centrifuge output read classification file" "Centrifuge Taxonomic Classification" "Centrifuge Classification for ${proj}"\ + ${prefix}_centrifuge_report.tsv "Centrifuge Classification Report" "Centrifuge output report file" "Centrifuge Report for ${proj}"\ + ${prefix}_centrifuge_krona.html "Centrifug krona plot HTML file" "Centrifuge Krona Plot" "Centrifuge Krona for ${proj}"\ + ${prefix}_kraken2_classification.tsv "Kraken2 output read classification file" "Kraken2 Taxonomic Classification" "Kraken2 Classification for ${proj}" \ + ${prefix}_kraken2_report.tsv "Kraken2 output report file" "Kraken2 Classification Report" "Kraken2 Report for ${proj}" \ + ${prefix}_kraken2_krona.html "Kraken2 Krona plot HTML file" "Kraken2 Krona Plot" "Kraken2 Krona for ${proj}" + >>> + + output { + + File objects="objects.json" + File 
g2_report_tsv="${prefix}_gottcha2_report.tsv" + File g2_full_tsv="${prefix}_gottcha2_full_tsv" + File g2_krona_html="${prefix}_gottcha2_krona.html" + File cent_classification_tsv="${prefix}_centrifuge_classification.tsv" + File cent_report_tsv="${prefix}_centrifuge_report.tsv" + File cent_krona_html="${prefix}_centrifuge_krona.html" + File kr_classification_tsv="${prefix}_kraken2_classification.tsv" + File kr_report_tsv="${prefix}_kraken2_report.tsv" + File kr_krona_html="${prefix}_kraken2_krona.html" + } + + runtime { + docker: container + memory: "1 GiB" + cpu: 1 } } @@ -96,6 +263,7 @@ task make_outputs{ String container command<<< + mkdir -p ${outdir}/gottcha2 cp ${gottcha2_report_tsv} ${gottcha2_full_tsv} ${gottcha2_krona_html} \ ${outdir}/gottcha2 @@ -116,3 +284,65 @@ task make_outputs{ } } + +task make_info_file { + Boolean enabled_tools_gottcha2 + Boolean enabled_tools_kraken2 + Boolean enabled_tools_centrifuge + String db_gottcha2 + String db_kraken2 + String db_centrifuge + String docker + File? gottcha2_report_tsv + File? gottcha2_info + File? centrifuge_report_tsv + File? centrifuge_info + File? kraken2_report_tsv + File? 
kraken2_info + String info_filename = "profiler.info" + + command <<< + set -euo pipefail + + # generate output info file + + info_text="Taxonomy profiling tools and databases used: " + echo $info_text > ${info_filename} + + if [[ ${enabled_tools_kraken2} == true ]] + then + software_ver=`cat ${kraken2_info}` + #db_ver=`echo "${db_kraken2}" | rev | cut -d'/' -f 1 | rev` + db_ver=`cat ${db_kraken2}/db_ver.info` + info_text="Kraken2 v$software_ver (database version: $db_ver)" + echo $info_text >> ${info_filename} + fi + + if [[ ${enabled_tools_centrifuge} == true ]] + then + software_ver=`cat ${centrifuge_info}` + db_ver=`cat $(dirname ${db_centrifuge})/db_ver.info` + info_text="Centrifuge v$software_ver (database version: $db_ver)" + echo $info_text >> ${info_filename} + fi + + if [[ ${enabled_tools_gottcha2} == true ]] + then + software_ver=`cat ${gottcha2_info}` + db_ver=`cat $(dirname ${db_gottcha2})/db_ver.info` + info_text="Gottcha2 v$software_ver (database version: $db_ver)" + echo $info_text >> ${info_filename} + fi + >>> + + output { + File profiler_info = "${info_filename}" + String profiler_info_text = read_string("${info_filename}") + } + runtime { + docker: docker + memory: "2G" + cpu: 1 + maxRetries: 1 + } +} diff --git a/ReadbasedAnalysisTasks.wdl b/ReadbasedAnalysisTasks.wdl index e745e17..81e7d65 100644 --- a/ReadbasedAnalysisTasks.wdl +++ b/ReadbasedAnalysisTasks.wdl @@ -8,6 +8,8 @@ task profilerGottcha2 { command <<< set -euo pipefail + . 
/opt/conda/etc/profile.d/conda.sh + conda activate gottcha2 gottcha2.py -r ${RELABD_COL} \ -i ${sep=' ' READS} \ @@ -17,18 +19,21 @@ --database ${DB} grep "^species" ${PREFIX}.tsv | ktImportTaxonomy -t 3 -m 9 -o ${PREFIX}.krona.html - || true + || true + + gottcha2.py --version > ${PREFIX}.info >>> output { File report_tsv = "${PREFIX}.tsv" File full_tsv = "${PREFIX}.full.tsv" File krona_html = "${PREFIX}.krona.html" + File info = "${PREFIX}.info" } runtime { docker: DOCKER cpu: CPU node: 1 nwpn: 1 - mem: "45G" + memory: "45G" time: "04:00:00" } meta { @@ -46,6 +51,8 @@ command <<< set -euo pipefail + . /opt/conda/etc/profile.d/conda.sh + conda activate centrifuge centrifuge -x ${DB} \ -p ${CPU} \ @@ -54,18 +61,21 @@ --report-file ${PREFIX}.report.tsv ktImportTaxonomy -m 5 -t 2 -o ${PREFIX}.krona.html ${PREFIX}.report.tsv + + centrifuge --version | head -1 | cut -d ' ' -f3 > ${PREFIX}.info >>> output { File classification_tsv="${PREFIX}.classification.tsv" File report_tsv="${PREFIX}.report.tsv" File krona_html="${PREFIX}.krona.html" + File info = "${PREFIX}.info" } runtime { docker: DOCKER cpu: CPU node: 1 nwpn: 1 - mem: "45G" + memory: "45G" time: "04:00:00" } meta { @@ -84,6 +94,8 @@ command <<< set -euo pipefail + .
/opt/conda/etc/profile.d/conda.sh + conda activate kraken2 kraken2 ${true="--paired" false='' PAIRED} \ --threads ${CPU} \ @@ -91,20 +103,24 @@ task profilerKraken2 { --output ${PREFIX}.classification.tsv \ --report ${PREFIX}.report.tsv \ ${sep=' ' READS} + kraken2 --version | head -1 | cut -d ' ' -f3 > ${PREFIX}.info + conda deactivate ktImportTaxonomy -m 3 -t 5 -o ${PREFIX}.krona.html ${PREFIX}.report.tsv + >>> output { File classification_tsv = "${PREFIX}.classification.tsv" File report_tsv = "${PREFIX}.report.tsv" File krona_html = "${PREFIX}.krona.html" + File info = "${PREFIX}.info" } runtime { docker: DOCKER cpu: CPU node: 1 nwpn: 1 - mem: "45G" + memory: "45G" time: "04:00:00" } meta { @@ -128,7 +144,7 @@ task generateSummaryJson { docker: DOCKER node: 1 nwpn: 1 - mem: "45G" + memory: "45G" time: "04:00:00" } meta { @@ -136,3 +152,32 @@ task generateSummaryJson { email: "po-e@lanl.gov" } } + +task stage { + String container + String target="raw.fastq.gz" + String input_file + + command <<< + set -e + if [ $( echo ${input_file}|egrep -c "https*:") -gt 0 ] ; then + wget ${input_file} -O ${target} + else + ln ${input_file} ${target} || cp ${input_file} ${target} + fi + # Capture the start time + date --iso-8601=seconds > start.txt + + >>> + + output{ + File read = "${target}" + String start = read_string("start.txt") + } + runtime { + memory: "1 GiB" + cpu: 2 + maxRetries: 1 + docker: container + } +} diff --git a/ReadbasedAnalysis_inputs.json b/ReadbasedAnalysis_inputs.json index 769c230..32e3119 100644 --- a/ReadbasedAnalysis_inputs.json +++ b/ReadbasedAnalysis_inputs.json @@ -1,20 +1,9 @@ { - "ReadbasedAnalysis.enabled_tools": { - "gottcha2": true, - "kraken2": true, - "centrifuge": true - }, - "ReadbasedAnalysis.db": { - "gottcha2": "/global/cfs/projectdirs/m3408/aim2/database/gottcha2/RefSeq-r90.cg.BacteriaArchaeaViruses.species.fna", - "kraken2": "/global/cfs/projectdirs/m3408/aim2/database/kraken2/", - "centrifuge": 
"/global/cfs/projectdirs/m3408/aim2/database/centrifuge/p_compressed" - }, - "ReadbasedAnalysis.reads": [ - "/path/to/SRR7877884.1.fastq.gz", - "/path/to/SRR7877884.2.fastq.gz" - ], + "ReadbasedAnalysis.input_file": "/path/to/test.fastq.gz", "ReadbasedAnalysis.paired": true, - "ReadbasedAnalysis.prefix": "SRR7877884", - "ReadbasedAnalysis.outdir": "/global/cfs/projectdirs/m3408/aim2/metagenome/ReadbasedAnalysis/SRR7877884", - "ReadbasedAnalysis.cpu": 8 + "ReadbasedAnalysis.prefix": "TEST", + "ReadbasedAnalysis.cpu": 8, + "ReadbasedAnalysis.proj": "TEST", + "ReadbasedAnalysis.resource": "NERSC - CORI", + "ReadbasedAnalysis.informed_by": "None" } diff --git a/outputTsv2json.py b/outputTsv2json.py index 3c2e55c..e8fbe2b 100755 --- a/outputTsv2json.py +++ b/outputTsv2json.py @@ -2,7 +2,6 @@ import os import json import pandas as pd -import numpy as np import click @click.command() @@ -10,7 +9,6 @@ def output2json(meta): """ Simple converter that takes TSV files to generate a summary JSON. """ - df = pd.DataFrame() out_dict = {} tsvfile_lod = json.load(meta) @@ -21,6 +19,7 @@ def output2json(meta): tool = tsvmeta['tool'] idx_col = 'taxRank' read_cnt_col = 'numReads' + df = pd.DataFrame() result = { 'classifiedReadCount': 0, @@ -33,7 +32,6 @@ def output2json(meta): def reduceDf(df, cols, ranks=['species','genus','family'], top=10): """ Report top # rows of ranks respectively and return a dict - df: results in dataframe cols: (rnk_col, name_col, read_count_col, abu_col, taxid_col) """