From d02c5d1c63f77a2f4df73a7b90d3496da326f6dc Mon Sep 17 00:00:00 2001 From: Tyghe Vallard Date: Fri, 29 Jan 2016 11:56:06 -0500 Subject: [PATCH] Single pipe from reads->cutadapt->bwa->samtools using interleaved format --- pyjip/bwa_mem.jip | 13 +++++++--- pyjip/cutadapt.jip | 47 +++++++++++++++++----------------- pyjip/paired_to_interleave.jip | 39 ++++++++++++++++++++++++++++ pyjip/simplepipe.jip | 12 ++++++--- 4 files changed, 79 insertions(+), 32 deletions(-) create mode 100755 pyjip/paired_to_interleave.jip diff --git a/pyjip/bwa_mem.jip b/pyjip/bwa_mem.jip index ded4a40..e8a97ad 100755 --- a/pyjip/bwa_mem.jip +++ b/pyjip/bwa_mem.jip @@ -2,11 +2,16 @@ # bwa mem # # Usage: -# bwa_mem -r -f ... [-o ] +# bwa_mem -r -f ... [-p] [-o ] # # Options: # -r, --reference Fasta reference path -# -o, --output Output path [Default: stdout] -# -f, --fastq ... Fastq files to map to reference +# -o, --output Output path [default: stdout] +# -f, --fastq ... Fastq files to map to reference [default: stdin] +# -p, --interleaved Flag to specify interleaved input [default: False] -bwa mem ${reference} ${fastq} +#%begin init +options['fastq'].streamable = True +#%end + +bwa mem ${interleaved|arg|else('')} ${reference} ${fastq|else("/dev/stdin")} ${output|arg(">")} diff --git a/pyjip/cutadapt.jip b/pyjip/cutadapt.jip index ed73e9e..1d58508 100755 --- a/pyjip/cutadapt.jip +++ b/pyjip/cutadapt.jip @@ -2,40 +2,39 @@ # Runs cutadapt with only quality filter # # usage: -# cutadapt -q ... -f ... [-o ] [-p ] [--pipe] +# cutadapt -q ... [-i ] [-o ] [-p ] [--interleave] # # Options: -# -f, --fastq The input fastq file[s] +# -i, --input The input fastq [Default: stdin] # -q, --qualcutoff The quality cutoff [Default: 25] -# -o, --outr1 R1 output fastq [Default: output_r1.cutadapt] -# -p, --outr2 R2 output fastq -# --pipe Flag to create named pipes for output [Default: False] +# -o, --output Output fastq for single read or interleaved(paired)[Default: stdout] +# -p, --outr2 Reverse output for paired fastq(non-interleaved) +# --interleave Interleave output [Default: False] #%begin init -add_output('output', [options['outr1']], nargs='+') +#%end + +#%begin validate +#if options['interleave'] and len(options['fastq']) > 1: +# validation_error("Cannot specify more than 1 fastq file with --interleave") + +if options['interleave'] and len(options['outr2']) > 1: + validation_error("Cannot specify outr2 and --interleave") + +#if len(options['fastq']) != len(options['qualcutoff']): +# validation_error("qualcutoff must match number of fastq given") #%end #%begin setup +''' if len(options['fastq']) == 2: options['qualcutoff'].join = ',' - if len(options['qualcutoff']) != len(options['fastq']): - qualcutoff = options['qualcutoff'].value[0] - options['qualcutoff'].set([qualcutoff,qualcutoff]) - if not options['outr2']: - options['outr2'].set('output_r2.cutadapt') - options['output'].append(options['outr2']) -#%end -#%begin validate -assert len(options['fastq']) == len(options['qualcutoff']), validation_error('qualcuttoff must have same number of entries as fastq') + if not options['interleave']: + if not options['outr2']: + options['outr2'].set('output_r2.cutadapt') + options['output'].append(options['outr2']) +''' #%end -if [ "${pipe|arg}" == "--pipe" ] -then - mkfifo ${outr1} ${outr2|else('')} -fi -cutadapt ${outr1|arg} ${outr2|arg|else('')} -q ${qualcutoff} ${fastq} -if [ "${pipe|arg}" == "--pipe" ] -then - rm ${outr1} ${outr2|else('')} -fi +cutadapt ${interleave|arg|else('')} ${output|arg(">")} ${outr2|arg|else('')} -q ${qualcutoff} -f fastq ${input|else("/dev/stdin")} diff --git a/pyjip/paired_to_interleave.jip b/pyjip/paired_to_interleave.jip new file mode 100755 index 0000000..1ca75be --- /dev/null +++ b/pyjip/paired_to_interleave.jip @@ -0,0 +1,39 @@ +#!/usr/bin/env jip +# +# Converts 2 fastq paired files into single interleave +# +# Usage: +# paired_to_interleave -f -r [--outformat ] [-o ] +# +# Options: +# -f, --forward The forward fastq input +# -r, --reverse The reverse fastq input +# -o, --output The interleaved output [default: stdout] +# --outformat The output format [default: fastq] + +#%begin validate +if options['outformat'] not in ('fasta', 'fastq'): + validation_error( + "output format can only be fasta or fastq. You provided '%s'" + % options['outformat'] + ) +#%end + +#%begin command python +import itertools + +from Bio import SeqIO + +def interleave(iter1, iter2): + for forward, reverse in itertools.izip(iter1, iter2): + assert forward.id == reverse.id, "%s did not match %s" % \ + (forward.id, reverse.id) + yield forward + yield reverse + +f1, f2 = open("${forward}"), open("${reverse}") +records = interleave(SeqIO.parse(f1, 'fastq'), SeqIO.parse(f2, 'fastq')) +outfile = open("${output|else('/dev/stdout')}", 'w') +count = SeqIO.write(records, outfile, "${outformat}") +outfile.close() +#%end diff --git a/pyjip/simplepipe.jip b/pyjip/simplepipe.jip index 4bd20b1..336ba01 100755 --- a/pyjip/simplepipe.jip +++ b/pyjip/simplepipe.jip @@ -3,7 +3,7 @@ # Simple pipeline to run cutadapt, bwa index, bwa mem, samtools view # # Usage: -# simplepipe -r -f ... [-q ...] [-o ] [--pipe] +# simplepipe -r -f ... [-q ...] [-o ] [--interleave] # # Options: # -r, --reference Reference file to index and map to @@ -12,13 +12,17 @@ # [Default: 25] # -o, --output The output bam file # [Default: mapped.bam] -# -p, --pipe To use named pipe(have to submit jip job for this to work) +# -i, --interleave To use interleave format as much as possible for stream # [Default: False] #%begin pipeline -trimmed = run('cutadapt', fastq=options['fastq'], qualcutoff=options['qualcutoff'], pipe=options['pipe']) +if options['interleave']: + fastq = run('paired_to_interleave', forward=options['fastq'].value[0], reverse=options['fastq'].value[1]) +else: + fastq = options['fastq'] +trimmed = run('cutadapt', input=fastq, qualcutoff=options['qualcutoff'], interleave=options['interleave']) index_ref = run('bwa_index', reference=options['reference']) -sam = run('bwa_mem', reference=reference, fastq=['output_r1.cutadapt', 'output_r2.cutadapt']) +sam = run('bwa_mem', reference=reference, fastq=trimmed) bam = run('sam_to_bam', input=sam, output=options['output']) index_ref >> (sam | bam)