VDBWRAIR · necrolyte2 · Feb 11, 2016 · Feb 3, 2016 · Feb 3, 2016 · Feb 3, 2016
diff --git a/.gitignore b/.gitignore
@@ -69,3 +69,6 @@ target/
 *~
 *.swp
 *.swo
+
+# Project ignores
+tests/testoutput
diff --git a/bioframework/index_filter.py b/bioframework/index_filter.py
@@ -0,0 +1,51 @@
+from toolz.itertoolz import second, first, partition
+import re
+import os
+from itertools import ifilter, imap, izip, chain
+from toolz import compose
+from seqio import write_zip_results
+from functools import partial
+
+def filter_on_index(predicate, seqs, index_seqs):
+    pred = compose(predicate, second)
+    return imap(first, ifilter(pred, izip(seqs, index_seqs)))
+
+def write_index_filter(input, output, predicate):
+    def get_index(path):
+        index_name = re.sub(r"_R([12])_", r"_I\1_", path)
+        if os.path.exists(index_name): return index_name
+    index = get_index(input)
+    if index is None:
+        raise ValueError("Index %s for file %s not found" % (index, input))
+    return write_zip_results(partial(filter_on_index, predicate),
+                      output, 'fastq', input, index)
+
+def filter_on_index_quality(input, output, minimum):
+    """Removes reads from a paired read file if its associated
+    index file has a quality lower than minimum for that read index."""
+    below_qual = lambda x: min(x.quality) < minimum
+    write_index_filter(input, output, below_qual)
+    return 0
+
+def filter_on_index_quality_interleaved(interleaved, index1, index2, output, minimum):
+    '''enpair the interleaved read file,zip that enpaired with the two indexes
+    drop pairs from the interleaved file if either *index* is below the minimum'''
+    minimum = int(minimum)
+    return write_zip_results(partial(qual_filter, minimum), output, 'fastq', interleaved, index1, index2)
+
+def qual_filter(minimum, interleaved, idx1, idx2):
+    above_min = lambda x: min(x.letter_annotations['phred_quality']) >= minimum
+    def indexes_above_min(seq_index):
+        seq, idx1, idx2 = seq_index
+        return above_min(idx1) and above_min(idx2)
+    # pair together forward/reverse reads
+    #  [SeqRecord(forward), SeqRecord(reverse)]
+    interleaved = partition(2, interleaved)
+    # Zip together indexes with pairs
+    #  [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)]
+    zipped = izip(interleaved, idx1, idx2)
+    # Filter out all indexes with less than min qual and then grab
+    # only the interleaved sequence tuple
+    filtered = imap(first, ifilter(indexes_above_min, zipped))
+    # Chain together all sequences
+    return chain.from_iterable(filtered)
diff --git a/bioframework/seqio.py b/bioframework/seqio.py
@@ -1,16 +1,23 @@
 import itertools
 from Bio import SeqIO
-
+from functools import partial
 def interleave(iter1, iter2):
     for forward, reverse in itertools.izip(iter1, iter2):
         assert forward.id == reverse.id, "%s did not match %s" % \
             (forward.id, reverse.id)
         yield forward
         yield reverse
 
+def write_zip_results(func, output, out_format, *files):
+    parse_fastq = partial(SeqIO.parse, format='fastq')
+    inputs = map(parse_fastq, files)
+    results = func(*inputs)
+    count = SeqIO.write(results, output, out_format)
+    return count
+
 def paired_to_interleave(forward, reverse, output, out_format):
-    f1, f2 = open(forward), open(reverse)
-    records = interleave(SeqIO.parse(f1, 'fastq'), SeqIO.parse(f2, 'fastq'))
-    outfile = open(output, 'w')
-    count = SeqIO.write(records, outfile, out_format)
-    outfile.close()
+    return write_zip_results(interleave, output, out_format, forward, reverse)
+
+
+
+
diff --git a/jip_modules/convert_format.jip b/jip_modules/convert_format.jip
@@ -4,20 +4,22 @@
 # extensions of the files
 #
 # Usage:
-#    convert_format [-i <input>] [--in-format <informat>] [--out-format <outformat>] [-o <output>]
+#    convert_format [<input>] [--in-format <informat>] [--out-format <outformat>] [-o <output>]
 #
 # Options:
-#    -i, --input=<input>     The input to convert. File extension dictates format
-#                            unless stdin is specified, then you must supply
-#                            output format option.
-#                            [Default: stdin]
 #    --in-format=<informat>  This is autodetected via the extension of the output
 #                            file, but can be overidden here.
 #    --out-format=<informat> Same as input-format, but specifies the output format
 #    -o, --output=<output>   The output file path. Extension will dictate format.
 #                            If unspecified the output will go to standard output
 #                            as fastq.
 #                            [Default: stdout]
+#
+# Help:
+#    <input>     The input to convert. File extension dictates format
+#                unless stdin is specified, then you must supply
+#                output format option.
+#                [Default: stdin]
 #%begin validate
 if not options.input.get() and not options.in_format:
     validation_error("Have to supply input format if input is stdin")

diff --git a/jip_modules/cutadapt.jip b/jip_modules/cutadapt.jip
diff --git a/jip_modules/index_filter_quality.jip b/jip_modules/index_filter_quality.jip
@@ -0,0 +1,14 @@
+#!/usr/bin/env jip
+#
+# Usage:  
+#    index_filter_quality <input> [-o <output>] --minimum <minimum>
+#
+# Options:
+#  -o, --output=<output>     Output files [default: stdout]
+#  -q, --minimum=<minimum>  Minimum quality of the index file
+
+#%begin command python
+from bioframework.index_filter import filter_on_index_quality 
+filter_index("${input}", "${output}", "${min_qual}") 
+#%end
+
diff --git a/jip_modules/interleaved_index_filter.jip b/jip_modules/interleaved_index_filter.jip
@@ -0,0 +1,19 @@
+#!/usr/bin/env jip
+#
+# Usage:  
+#    index_filter_quality [<input>] [-o <output>] --minimum <minimum> --index1 <index1> --index2 <index2>
+#
+# Options:
+#  -o, --output=<output>    Output files [default: stdout]
+#  -q, --minimum=<minimum>  Minimum quality of the index file
+#  -1, --index1=<index1>    Index file from MiSeq
+#  -2, --index2=<index2>    Index file from MiSeq
+# 
+# Help:
+#  <input>   Interleaved fastq file
+
+#%begin command python
+from bioframework.index_filter import filter_on_index_quality_interleaved
+filter_on_index_quality_interleaved("${input|else('/dev/stdin')}", "${index1}", "${index2}", "${output|else('/dev/stdout')}", "${minimum}") 
+#%end
+
diff --git a/jip_modules/paired_to_interleave.jip b/jip_modules/paired_to_interleave.jip
@@ -3,27 +3,44 @@
 # Converts 2 fastq paired files into single interleave
 #
 # Usage:
-#   paired_to_interleave -f <forward> -r <reverse> [--out-format <out-format>] [-o <output>]
+#   paired_to_interleave [<input>...] [--out-format <out-format>] [-o <output>]
 #
 # Options:
 #   -f, --forward=<forward>   The forward fastq input
 #   -r, --reverse=<reverse>   The reverse fastq input
 #   -o, --output=<output>     The interleaved output [default: stdout]
 #   --out-format=<outformat>  The output format [default: fastq]
+#
+# Help:
+#    <input>    Must be two input files where the first input file represents the
+#               forward reads and the second input file represents the reverse reads
+#               The read files must contain the same amount of reads and all read
+#               identifiers must be identical in the forward and reverse
+#               I.E.
+#               forward.fastq
+#                 >id1 whatever123
+#                 ATGC
+#               reverse.fastq
+#                 >id1 whatever321
+#                 CGTA
 
 #%begin validate
 if options['out_format'] not in ('fasta', 'fastq'):
     validation_error(
         "output format can only be fasta or fastq. You provided '%s'" 
             % options['out_format']
     )
+if len(options.input.value) != 2:
+    validation_error(
+        "Must supply two input files to convert to interleave"
+    )
 #%end
 
 #%begin command python
 from bioframework import seqio
 seqio.paired_to_interleave(
-    "${forward}",
-    "${reverse}",
+    "${input.value|first}",
+    "${input.value|last}",
     "${output|else('/dev/stdout')}",
     "${out_format}"
 )

diff --git a/jip_modules/render_template.jip b/jip_modules/render_template.jip
@@ -0,0 +1,25 @@
+#!/usr/bin/env jip
+#
+# Make jip job file from template + json
+#
+# Usage:
+#  mkjip -t <template> -j <json> [-o <output>]
+#
+# Options:
+#  -t, --tmplate=<template>  Jinja2 template to use
+#  -j, --json=<json>         Json file to use to replace values in tmplate
+#  -o, --output=<output>     Output location for job file [Default: stdout]
+
+#%begin command python
+import json
+from jinja2 import Template
+
+tmpl = Template(open("${tmplate}").read())
+jsn = json.load(open("${json}"))
+rendered = tmpl.render(jsn)
+
+fh = open("${output|else('/dev/stdout')}", 'w')
+fh.write(rendered)
+fh.close()
+
+#%end
diff --git a/jip_modules/trim_reads.jip b/jip_modules/trim_reads.jip
@@ -0,0 +1,56 @@
+#!/usr/bin/env jip
+#
+# Trim reads based on criteria via trimming
+#
+# Usage:
+#   filter_reads [<input>] [-p] [-t] [-q <qualcut>...] [-x <delete>...] [-o <output>]
+#
+# Options:
+#   -p, --paired                Indicates that the input is paired-interleave format otherwise
+#                               the input is assumed to be unpaired fastq
+#                               [Default: stdin]
+#   -o, --output=<output>       Output fastq. If input was interleave, output will be as well
+#                               [default: stdout]
+#   -t, --trim-n                Trim N's from ends of reads
+#   -q, --qualcutoff=<qualcut>  Quality score to cut from beginning and end of reads
+#                               If one value is given trim from 3', else trim from both
+#                               where the first value trims the 5' and the second trims the 3'
+#                               Trim 3': -q 25
+#                               Trim 5',3': -q 25 20
+#   -x, --removebases=<delete>  Remove X bases from all reads
+#                               If the number is positive remove from the beginning of reads
+#                               If the number is negative remove from the end of reads
+#                               Ex. -x 10 would remove 10 bases from the front
+#                               Ex. -x -10 would remove 10 bases from the end
+#                               Ex. -x 10 -10 would remove 10 bases from front and end
+#%begin init
+#%end
+
+#%begin validate
+if options.trim_n:
+    # See https://github.com/marcelm/cutadapt/pull/175
+    validation_error("--trim-n is not supported at this time")
+#%end
+
+#%begin setup
+# -u option has to be specified more than once, unlike -q option where you 
+# join together the args
+options.removebases.short = '-u'
+options.removebases.join = ' -u '
+options.qualcutoff.join = ','
+#%end
+
+#%begin command bash
+
+cutargs="${trim_n|arg('--trim-n')} ${qualcutoff|arg} ${removebases|arg} ${interleave}"
+if [ "${paired}" ]
+then
+    cmd="cutadapt --interleave $cutargs ${removebases|arg|upper} ${output|arg} ${input|else('--format fastq /dev/stdin')}"
+else
+    cmd="cutadapt $cutargs ${output|arg} ${forward} ${input|else('--format fastq /dev/stdin')}"
+fi
+
+echo $cmd >&2
+$cmd
+
+#%end
diff --git a/setup.py b/setup.py
@@ -22,6 +22,7 @@ def jip_modules(path=bioframework.JIP_PATH):
     },
     install_requires = [
         'pyjip',
+        'toolz',
     ],
     data_files=[
         (join(sys.prefix,'bin'), jip_modules()),

diff --git a/templates/filter.json b/templates/filter.json
@@ -0,0 +1,14 @@
+{
+    "cmd": "filter_reads",
+    "desc": "Filter reads based on criteria",
+    "opts": [{
+        "short": "n",
+        "long":  "max-n",
+        "name": "max_n",
+        "description": "Max number of n's to allow",
+        "required" : true
+    }],
+    "paired_cmd": "cutadapt ${forward} ${reverse} ${max_n|arg} -o ${forwardoutput} -p {reverseoutput}",
+    "unpaired_cmd": "cutadapt ${unpaired} ${max_n|arg} -o ${output}"
+}
+
diff --git a/templates/jip.jinja b/templates/jip.jinja
@@ -0,0 +1,36 @@
+#!/usr/bin/env jip
+#
+# {{desc}}
+#
+# Usage:
+#   {{cmd}} ([-f <forward> -r <reverse>] | [-u <unpaired>]) [-o <output>...] {% for opt in opts if opt.required %} [--{{opt.long}} <{{opt.name}}>] {% endfor %} {% for opt in opts if not opt.required %} --{{opt.long}} <{{opt.name}}> {% endfor %}
+#
+#
+#
+# Options:
+{% for opt in opts -%} 
+#   -{{opt.short}}, --{{opt.long}}=<{{opt.name}}>  {{opt.description}}
+{% endfor -%}
+#   -u, --unpaired=<unpaired>  The unpaired fastq input
+#   -f, --forward=<forward>    The forward fastq input
+#   -r, --reverse=<reverse>    The reverse fastq input
+#   -o, --output=<output>      Output files [default: stdout]
+
+
+#%begin validate
+
+if options.unpaired and (options.forward or options.reverse):
+    validation_error("Must choose either paired or unpaired")
+
+#%end
+
+#%begin command bash
+
+if [ "${unpaired}" ]
+then
+    {{unpaired_cmd}} 
+else
+    {{paired_cmd}} 
+fi
+
+#%end
-Original file line number
+Diff line change
@@ Expand Up / @@ -69,3 +69,6 @@ target/ @@
     *~
     *.swp
     *.swo
+    # Project ignores
+    tests/testoutput