diff --git a/Docker/Dockerfile b/Docker/Dockerfile new file mode 100644 index 0000000..4b2e877 --- /dev/null +++ b/Docker/Dockerfile @@ -0,0 +1,36 @@ +FROM ubuntu:20.04 + +LABEL base.image="ubuntu:20.04" +LABEL dockerfile.version="1" +LABEL software="BBTools" +LABEL software.version="38.96" +LABEL description="A set of tools labeled as \"Bestus Bioinformaticus\"" +LABEL website="https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/" +LABEL license="https://jgi.doe.gov/disclaimer/" +LABEL maintainer="Chienchi Lo" +LABEL maintainer.email="chienchi@lanl.gov" + +ENV DEBIAN_FRONTEND=noninteractive +ENV LANG=en_US.UTF-8 +ENV JAVA_HOME=/usr/java/openjdk-13 +ENV PATH=/usr/java/openjdk-13/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV JAVA_VERSION=13.0.1 +ENV JAVA_URL=https://download.java.net/java/GA/jdk13.0.1/cec27d702aa74d5a8630c65ae61e4305/9/GPL/openjdk-13.0.1_linux-x64_bin.tar.gz +ENV JAVA_SHA256=2e01716546395694d3fad54c9b36d1cd46c5894c06f72d156772efbcf4b41335 + + +RUN apt-get update && apt-get install -y build-essential file python \ + wget samtools curl && \ + apt-get autoclean && rm -rf /var/lib/apt/lists/* + +# openjdk-13-jre \ +RUN /bin/sh -c set -eux; curl -fL -o /openjdk.tgz "$JAVA_URL"; echo "$JAVA_SHA256 */openjdk.tgz" | sha256sum -c -; mkdir -p "$JAVA_HOME"; tar --extract --file /openjdk.tgz --directory "$JAVA_HOME" --strip-components 1; rm /openjdk.tgz; ln -sfT "$JAVA_HOME" /usr/java/default; ln -sfT "$JAVA_HOME" /usr/java/latest; for bin in "$JAVA_HOME/bin/"*; do base="$(basename "$bin")"; [ ! 
-e "/usr/bin/$base" ]; update-alternatives --install "/usr/bin/$base" "$base" "$bin" 20000; done; java -Xshare:dump; java --version; javac --version + +RUN wget https://sourceforge.net/projects/bbmap/files/BBMap_38.96.tar.gz && \ +	tar -xzf BBMap_38.96.tar.gz && \ +	rm BBMap_38.96.tar.gz + +ENV PATH="${PATH}:/bbmap"\ +    LC_ALL=C + +WORKDIR /data diff --git a/README.md b/README.md index 7256bd3..994af3c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Summary -This workflow is replicate the [QA protocol](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/data-preprocessing/) implemented at JGI for Illumina reads and use the program “rqcfilter2” from BBTools(38:44) which implements them as a pipeline. +This workflow replicates the [QA protocol](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/data-preprocessing/) implemented at JGI for Illumina reads and uses the program “rqcfilter2” from BBTools(38:96) which implements them as a pipeline. ## Required Database @@ -27,15 +27,18 @@ Description of the files: ## The Docker image and Dockerfile can be found here -[microbiomedata/bbtools:38.44](https://hub.docker.com/r/microbiomedata/bbtools) +[microbiomedata/bbtools:38.96](https://hub.docker.com/r/microbiomedata/bbtools) ## Input files 1. database path, 2. fastq (illumina paired-end interleaved fastq), 3. output path -4. memory (optional) ex: "jgi_rqcfilter.memory": "35G" -5. threads (optional) ex: "jgi_rqcfilter.threads": "16" +4. input_interleaved (boolean) +5. forward reads fastq file (when input_interleaved is false) +6. reverse reads fastq file (when input_interleaved is false) +7. memory (optional) ex: "jgi_rqcfilter.memory": "35G" +8. 
threads (optional) ex: "jgi_rqcfilter.threads": "16" ``` { @@ -46,6 +49,9 @@ Description of the files: "/global/cfs/cdirs/m3408/ficus/8434.3.102077.ATGTCA.fastq.gz" ], "jgi_rqcfilter.outdir": "/global/cfs/cdirs/m3408/ficus_rqcfiltered", + "jgi_rqcfilter.input_interleaved": true, + "jgi_rqcfilter.input_fq1":[], + "jgi_rqcfilter.input_fq2":[], "jgi_rqcfilter.memory": "35G", "jgi_rqcfilter.threads": "16" } diff --git a/docs/index.rst b/docs/index.rst index 2cf7937..73b5b93 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,6 +11,28 @@ Workflow Overview This workflow utilizes the program “rqcfilter2” from BBTools to perform quality control on raw Illumina reads. The workflow performs quality trimming, artifact removal, linker trimming, adapter trimming, and spike-in removal (using BBDuk), and performs human/cat/dog/mouse/microbe removal (using BBMap). +The following parameters are used for "rqcfilter2" in this workflow:: + - qtrim=r : Quality-trim from right ends before mapping. + - trimq=0 : Trim quality threshold. + - maxns=3 : Reads with more Ns than this will be discarded. + - maq=3 : Reads with average quality (before trimming) below this will be discarded. + - minlen=51 : Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter. + - mlf=0.33 : Reads shorter than this fraction of original length after trimming will be discarded. + - phix=true : Remove reads containing phiX kmers. + - khist=true : Generate a kmer-frequency histogram of the output data. + - kapa=true : Remove and quantify kapa tag + - trimpolyg=5 : Trim reads that start or end with a G polymer at least this long + - clumpify=true : Run clumpify; all deduplication flags require this. + - removehuman=true : Remove human reads via mapping. + - removedog=true : Remove dog reads via mapping. + - removecat=true : Remove cat reads via mapping. + - removemouse=true : Remove mouse reads via mapping. 
+ - barcodefilter=false : Disable improper barcodes filter + - chastityfilter=false: Remove illumina reads failing chastity filter. + - trimfragadapter=true: Trim all known Illumina adapter sequences, including TruSeq and Nextera. + - removemicrobes=true : Remove common contaminant microbial reads via mapping, and place them in a separate file. + + Workflow Availability --------------------- @@ -39,7 +61,7 @@ Workflow Dependencies Third party software (This is included in the Docker image.) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- `BBTools v38.90 `_ (License: `BSD-3-Clause-LBNL `_) +- `BBTools v38.96 `_ (License: `BSD-3-Clause-LBNL `_) Requisite database ~~~~~~~~~~~~~~~~~~ @@ -79,8 +101,11 @@ A JSON file containing the following information: 1. the path to the database 2. the path to the interleaved fastq file (input data) 3. the path to the output directory -4. (optional) parameters for memory -5. (optional) number of threads requested +4. input_interleaved (boolean) +5. forwards reads fastq file (when input_interleaved is false) +6. reverse reads fastq file (when input_interleaved is false) +7. (optional) parameters for memory +8. 
(optional) number of threads requested An example input JSON file is shown below: @@ -92,6 +117,9 @@ An example input JSON file is shown below: "jgi_rqcfilter.input_files": [ "/path/to/SRR7877884-int-0.1.fastq.gz " ], + "jgi_rqcfilter.input_interleaved": true, + "jgi_rqcfilter.input_fq1":[], + "jgi_rqcfilter.input_fq2":[], "jgi_rqcfilter.outdir": "/path/to/rqcfiltered", "jgi_rqcfilter.memory": "35G", "jgi_rqcfilter.threads": "16" diff --git a/input.json b/input.json index b7023e5..61c4981 100755 --- a/input.json +++ b/input.json @@ -10,5 +10,8 @@ "/global/cfs/cdirs/m3408/ficus/8434.1.102069.ACAGTG.fastq.gz", "/global/cfs/cdirs/m3408/ficus/8434.3.102077.ATGTCA.fastq.gz" ], - "jgi_rqcfilter.outdir": "/global/cfs/cdirs/m3408/ficus_rqcfiltered" -} \ No newline at end of file + "jgi_rqcfilter.input_fq1":[], + "jgi_rqcfilter.input_fq2":[], + "jgi_rqcfilter.outdir": "/global/cfs/cdirs/m3408/ficus_rqcfiltered", + "jgi_rqcfilter.input_interleaved": true +} diff --git a/rqcfilter.wdl b/rqcfilter.wdl index 099d985..e4207e0 100644 --- a/rqcfilter.wdl +++ b/rqcfilter.wdl @@ -3,52 +3,120 @@ workflow jgi_rqcfilter { String? outdir String bbtools_container="microbiomedata/bbtools:38.96" String database="/refdata" - Boolean chastityfilter=true + Boolean chastityfilter=false String? memory String? threads + String? proj = "ReadsQC" + String? 
informed_by = "${proj}" # "nmdc:xxxxxxxxxxxxxxx" + String resource = "NERSC - Cori" + String url_root = "https://data.microbiomedata.org/data/" + String git_url = "https://github.com/microbiomedata/ReadsQC/releases/tag/1.0.4" - scatter(file in input_files) { - call rqcfilter{ - input: input_file=file, + + Boolean input_interleaved = true + Array[File] input_fq1 + Array[File] input_fq2 + + if (!input_interleaved) { + ## the zip() function generates an array of pairs, use .left and .right to access + scatter(file in zip(input_fq1,input_fq2)){ + call interleave_reads { + input: + input_files = [file.left,file.right], + output_file = basename(file.left) + "_" + basename(file.right), + container = bbtools_container + } + call rqcfilter as rqcPE { + input: input_file=interleave_reads.out_fastq, container=bbtools_container, database=database, - chastityfilter_flag=chastityfilter, + chastityfilter_flag=chastityfilter, memory=memory, threads=threads + + } + call generate_objects as goPE { + input: container="microbiomedata/workflowmeta:1.0.5.1", + proj = proj, + start = rqcPE.start, + informed_by = "${informed_by}", + resource = "${resource}", + url_base = "${url_root}", + git_url = "${git_url}", + read = [file.left,file.right], + filtered = rqcPE.filtered, + filtered_stats = rqcPE.stat, + filtered_stats_json = rqcPE.json_out, + prefix = rqcPE.input_prefix + } + } + } + + if (input_interleaved) { + scatter(file in input_files) { + call rqcfilter as rqcInt { + input: input_file=file, + container=bbtools_container, + database=database, + chastityfilter_flag=chastityfilter, + memory=memory, + threads=threads + } + call generate_objects as goInt { + input: container="microbiomedata/workflowmeta:1.0.5.1", + proj = proj, + start = rqcInt.start, + informed_by = "${informed_by}", + resource = "${resource}", + url_base = "${url_root}", + git_url = "${git_url}", + read = [file], + filtered = rqcInt.filtered, + filtered_stats = rqcInt.stat, + filtered_stats_json = rqcInt.json_out, + 
prefix = rqcInt.input_prefix + } } } # rqcfilter.stat implicit as Array because of scatter # Optional staging to an output directory if (defined(outdir)){ + call make_output { - input: outdir=outdir, - filtered=rqcfilter.filtered, - stats=rqcfilter.stat, - stats2=rqcfilter.stat2, - container=bbtools_container + input: outdir=outdir, + filtered= if (input_interleaved) then rqcInt.filtered else rqcPE.filtered, + activity_json= if (input_interleaved) then goInt.activity_json else goPE.activity_json, + object_json= if (input_interleaved) then goInt.data_object_json else goPE.data_object_json, + container=bbtools_container } } output{ - Array[File] filtered = rqcfilter.filtered - Array[File] stats = rqcfilter.stat - Array[File] stats2 = rqcfilter.stat2 + Array[File]? filtered = if (input_interleaved) then rqcInt.filtered else rqcPE.filtered + Array[File]? stats = if (input_interleaved) then rqcInt.stat else rqcPE.stat + Array[File]? stats2 = if (input_interleaved) then rqcInt.stat2 else rqcPE.stat2 + Array[File]? statsjson = if (input_interleaved) then rqcInt.json_out else rqcPE.json_out + Array[File]? activityjson = if (input_interleaved) then goInt.activity_json else goPE.activity_json + Array[File]? objectjson = if (input_interleaved) then goInt.data_object_json else goPE.data_object_json Array[File]? 
clean_fastq_files = make_output.fastq_files } parameter_meta { input_files: "illumina paired-end interleaved fastq files" - outdir: "The final output directory path" + outdir: "The final output directory path" database : "database path to RQCFilterData directory" clean_fastq_files: "after QC fastq files" + stats : "summary statistics" + activityjson: "nmdc activity json file" + objectjson: "nmdc data object json file" memory: "optional for jvm memory for bbtools, ex: 32G" threads: "optional for jvm threads for bbtools ex: 16" } meta { author: "Chienchi Lo, B10, LANL" email: "chienchi@lanl.gov" - version: "1.0.2" + version: "1.0.4" } } @@ -59,6 +127,7 @@ task rqcfilter { Boolean chastityfilter_flag=true String? memory String? threads + String prefix=sub(basename(input_file), ".fa?s?t?q.?g?z?$", "") String filename_outlog="stdout.log" String filename_errlog="stderr.log" String filename_stat="filtered/filterStats.txt" @@ -80,12 +149,16 @@ task rqcfilter { #sleep 30 export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P" set -eo pipefail + # Capture the start time + date --iso-8601=seconds > start.txt rqcfilter2.sh -Xmx${default="60G" memory} -da threads=${jvm_threads} ${chastityfilter} jni=t in=${input_file} path=filtered rna=f trimfragadapter=t qtrim=r trimq=0 maxns=3 maq=3 minlen=51 mlf=0.33 phix=t removehuman=t removedog=t removecat=t removemouse=t khist=t removemicrobes=t sketch kapa=t clumpify=t tmpdir= barcodefilter=f trimpolyg=5 usejni=f rqcfilterdata=${database}/RQCFilterData > >(tee -a ${filename_outlog}) 2> >(tee -a ${filename_errlog} >&2) + python <>> - runtime { - docker: container - memory: "1 GiB" - cpu: 1 - } - output{ - Array[String] fastq_files = glob("${outdir}/*.fastq*") - } + command<<< + mkdir -p ${outdir} + for i in ${sep=' ' filtered} + do + f=${dollar}(basename $i) + dir=${dollar}(dirname $i) + prefix=${dollar}{f%.anqdpht*} + mkdir -p ${outdir}/$prefix + cp -f $dir/../filtered/filterStats.txt ${outdir}/$prefix + 
cp -f $dir/../filtered/filterStats2.txt ${outdir}/$prefix + cp -f $dir/../filtered/filterStats.json ${outdir}/$prefix + cp -f $i ${outdir}/$prefix + echo ${outdir}/$prefix/$f + done + for i in ${sep=' ' activity_json} + do + f=${dollar}(basename $i) + prefix=${dollar}{f%_activity.json*} + cp -f $i ${outdir}/$prefix/activity.json + done + for i in ${sep=' ' object_json} + do + f=${dollar}(basename $i) + prefix=${dollar}{f%_data_objects.json*} + cp -f $i ${outdir}/$prefix/data_objects.json + done + chmod 755 -R ${outdir} + >>> + runtime { + docker: container + memory: "1 GiB" + cpu: 1 + } + output{ + Array[String] fastq_files = read_lines(stdout()) + } } +task interleave_reads{ + Array[File] input_files + String output_file = "interleaved.fastq.gz" + String container + + command <<< + if file --mime -b ${input_files[0]} | grep gzip > /dev/null ; then + paste <(gunzip -c ${input_files[0]} | paste - - - -) <(gunzip -c ${input_files[1]} | paste - - - -) | tr '\t' '\n' | gzip -c > ${output_file} + echo ${output_file} + else + if [[ "${output_file}" == *.gz ]]; then + paste <(cat ${input_files[0]} | paste - - - -) <(cat ${input_files[1]} | paste - - - -) | tr '\t' '\n' | gzip -c > ${output_file} + echo ${output_file} + else + paste <(cat ${input_files[0]} | paste - - - -) <(cat ${input_files[1]} | paste - - - -) | tr '\t' '\n' | gzip -c > ${output_file}.gz + echo ${output_file}.gz + fi + fi + >>> + + runtime { + docker: container + memory: "1 GiB" + cpu: 1 + } + + output { + File out_fastq = read_string(stdout()) + } +} + + diff --git a/test/README.md b/test/README.md index af3780b..a6e5479 100644 --- a/test/README.md +++ b/test/README.md @@ -15,7 +15,7 @@ The ReadsQC validation workflow is meant to compare test data in json format ava 7. 
url for NMDC test json (provided for small test data) ``` workflow rqctest { - String container="microbiomedata/bbtools:38.90" + String container="microbiomedata/bbtools:38.96" String validate_container="microbiomedata/comparejson" String database="/vol_b/nmdc_workflows/data/test_refdata" Boolean flag=false @@ -25,7 +25,7 @@ workflow rqctest { String ref_json="https://raw.githubusercontent.com/microbiomedata/ReadsQC/master/test/small_test_filterStats.json" ``` ## Docker contianers can be found here: -Bbtools: [microbiomedata/bbtools:38.44](https://hub.docker.com/r/microbiomedata/bbtools) +Bbtools: [microbiomedata/bbtools:38.96](https://hub.docker.com/r/microbiomedata/bbtools) Comparjson: [microbiomedata/comparejson](https://hub.docker.com/r/microbiomedata/comparejson) ## Running Testing Validation Workflow diff --git a/test/rqc_test.wdl b/test/rqc_test.wdl index bb451d9..927ab03 100644 --- a/test/rqc_test.wdl +++ b/test/rqc_test.wdl @@ -1,6 +1,6 @@ import "rqcfilter.wdl" as rqc workflow rqctest { - String container="microbiomedata/bbtools:38.90" + String container="microbiomedata/bbtools:38.96" String validate_container="microbiomedata/comparejson" String database="/vol_b/nmdc_workflows/data/test_refdata" Boolean flag=false