Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filter & trim #16

Merged
merged 38 commits into from
Feb 11, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
abe38fd
added jip template
averagehat Feb 3, 2016
6d559f8
templating templates example
averagehat Feb 3, 2016
0ddb306
moved file
Feb 3, 2016
7176b0a
fix few issues
necrolyte2 Feb 3, 2016
e285376
fix mutually exclusive option
necrolyte2 Feb 3, 2016
5f71b6e
take jinja template and json and combine into output stream
necrolyte2 Feb 3, 2016
9a643a1
added index quality filter
averagehat Feb 3, 2016
ff2cf14
refactoring io
averagehat Feb 3, 2016
15b029b
Merge branch 'filter-trim' of https://github.com/VDBWRAIR/bioframewor…
Feb 3, 2016
df242b0
template now includes options in usage
averagehat Feb 3, 2016
e6216f5
added index filter jip file
averagehat Feb 3, 2016
f502f28
added interleaved template
averagehat Feb 5, 2016
d010cd0
write_zip_results now takes variable number of input files as args
averagehat Feb 5, 2016
1b451d0
added interleaved index filter
averagehat Feb 5, 2016
4e682f1
stdin support
averagehat Feb 5, 2016
d108795
better arg names
averagehat Feb 5, 2016
689ecf3
transform_reads tool
necrolyte2 Feb 5, 2016
00f5d77
fixed missing arg
averagehat Feb 5, 2016
ff38983
Merge branch 'filter-trim' of https://github.com/VDBWRAIR/bioframewor…
averagehat Feb 5, 2016
cf31ee6
Renaming transform_reads to trim_reads
necrolyte2 Feb 8, 2016
d2f8421
Set execute perms on jip modules. Add shebang to tools. Add toolz dep…
necrolyte2 Feb 8, 2016
14c1698
adding in generic test libraries and converted trim_reads tests to us…
necrolyte2 Feb 8, 2016
a6f33fb
removed redundant cutadapt tool. Updated convert_format.robot to use …
necrolyte2 Feb 8, 2016
98366b6
Fixed all tests to use common keywords and convert and paired to inte…
necrolyte2 Feb 8, 2016
6722d2c
fixed interleave index filter and some basic tests just to get bioler…
necrolyte2 Feb 9, 2016
797be9e
fix missing dependency for tests
necrolyte2 Feb 9, 2016
e14ebf6
check dir exist before removing
necrolyte2 Feb 9, 2016
a8dcd80
index_filter now lazy (uses generators)
averagehat Feb 9, 2016
3249d91
refactor index_filter
averagehat Feb 9, 2016
60152ac
hypothesis tests
averagehat Feb 9, 2016
a9cdb9b
filtering--with index--is not idempotent
averagehat Feb 10, 2016
ddeaa18
add hypothesis and sh to requirements
averagehat Feb 10, 2016
0bf52b6
fixed seqio error where output file could be empty string
necrolyte2 Feb 10, 2016
2d2efd7
more tests, all working locally
averagehat Feb 10, 2016
b370f0e
Merge pull request #17 from VDBWRAIR/hypothesis
necrolyte2 Feb 10, 2016
2ba326f
filterjip now uses [<input>] consistent with others
averagehat Feb 10, 2016
1180bca
working on fixing filter jip for tests
averagehat Feb 11, 2016
48dfc4c
hypothesis tests working
averagehat Feb 11, 2016
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,6 @@ target/
*~
*.swp
*.swo

# Project ignores
tests/testoutput
51 changes: 51 additions & 0 deletions bioframework/index_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from toolz.itertoolz import second, first, partition
import re
import os
from itertools import ifilter, imap, izip, chain
from toolz import compose
from seqio import write_zip_results
from functools import partial

def filter_on_index(predicate, seqs, index_seqs):
pred = compose(predicate, second)
return imap(first, ifilter(pred, izip(seqs, index_seqs)))

def write_index_filter(input, output, predicate):
def get_index(path):
index_name = re.sub(r"_R([12])_", r"_I\1_", path)
if os.path.exists(index_name): return index_name
index = get_index(input)
if index is None:
raise ValueError("Index %s for file %s not found" % (index, input))
return write_zip_results(partial(filter_on_index, predicate),
output, 'fastq', input, index)

def filter_on_index_quality(input, output, minimum):
"""Removes reads from a paired read file if its associated
index file has a quality lower than minimum for that read index."""
below_qual = lambda x: min(x.quality) < minimum
write_index_filter(input, output, below_qual)
return 0

def filter_on_index_quality_interleaved(interleaved, index1, index2, output, minimum):
'''enpair the interleaved read file,zip that enpaired with the two indexes
drop pairs from the interleaved file if either *index* is below the minimum'''
minimum = int(minimum)
return write_zip_results(partial(qual_filter, minimum), output, 'fastq', interleaved, index1, index2)

def qual_filter(minimum, interleaved, idx1, idx2):
above_min = lambda x: min(x.letter_annotations['phred_quality']) >= minimum
def indexes_above_min(seq_index):
seq, idx1, idx2 = seq_index
return above_min(idx1) and above_min(idx2)
# pair together forward/reverse reads
# [SeqRecord(forward), SeqRecord(reverse)]
interleaved = partition(2, interleaved)
# Zip together indexes with pairs
# [((SeqRecord(forward), SeqRecord(reverse)), SeqRecord(forwardindex), SeqRecord(reverseindex)]
zipped = izip(interleaved, idx1, idx2)
# Filter out all indexes with less than min qual and then grab
# only the interleaved sequence tuple
filtered = imap(first, ifilter(indexes_above_min, zipped))
# Chain together all sequences
return chain.from_iterable(filtered)
19 changes: 13 additions & 6 deletions bioframework/seqio.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
import itertools
from Bio import SeqIO

from functools import partial
def interleave(iter1, iter2):
for forward, reverse in itertools.izip(iter1, iter2):
assert forward.id == reverse.id, "%s did not match %s" % \
(forward.id, reverse.id)
yield forward
yield reverse

def write_zip_results(func, output, out_format, *files):
parse_fastq = partial(SeqIO.parse, format='fastq')
inputs = map(parse_fastq, files)
results = func(*inputs)
count = SeqIO.write(results, output, out_format)
return count

def paired_to_interleave(forward, reverse, output, out_format):
f1, f2 = open(forward), open(reverse)
records = interleave(SeqIO.parse(f1, 'fastq'), SeqIO.parse(f2, 'fastq'))
outfile = open(output, 'w')
count = SeqIO.write(records, outfile, out_format)
outfile.close()
return write_zip_results(interleave, output, out_format, forward, reverse)




12 changes: 7 additions & 5 deletions jip_modules/convert_format.jip
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,22 @@
# extensions of the files
#
# Usage:
# convert_format [-i <input>] [--in-format <informat>] [--out-format <outformat>] [-o <output>]
# convert_format [<input>] [--in-format <informat>] [--out-format <outformat>] [-o <output>]
#
# Options:
# -i, --input=<input> The input to convert. File extension dictates format
# unless stdin is specified, then you must supply
# output format option.
# [Default: stdin]
# --in-format=<informat> This is autodetected via the extension of the output
# file, but can be overidden here.
# --out-format=<informat> Same as input-format, but specifies the output format
# -o, --output=<output> The output file path. Extension will dictate format.
# If unspecified the output will go to standard output
# as fastq.
# [Default: stdout]
#
# Help:
# <input> The input to convert. File extension dictates format
# unless stdin is specified, then you must supply
# output format option.
# [Default: stdin]
#%begin validate
if not options.input.get() and not options.in_format:
validation_error("Have to supply input format if input is stdin")
Expand Down
40 changes: 0 additions & 40 deletions jip_modules/cutadapt.jip

This file was deleted.

14 changes: 14 additions & 0 deletions jip_modules/index_filter_quality.jip
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env jip
#
# Usage:
# index_filter_quality <input> [-o <output>] --minimum <minimum>
#
# Options:
# -o, --output=<output> Output files [default: stdout]
# -q, --minimum=<minimum> Minimum quality of the index file

#%begin command python
from bioframework.index_filter import filter_on_index_quality
filter_index("${input}", "${output}", "${min_qual}")
#%end

19 changes: 19 additions & 0 deletions jip_modules/interleaved_index_filter.jip
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env jip
#
# Usage:
# index_filter_quality [<input>] [-o <output>] --minimum <minimum> --index1 <index1> --index2 <index2>
#
# Options:
# -o, --output=<output> Output files [default: stdout]
# -q, --minimum=<minimum> Minimum quality of the index file
# -1, --index1=<index1> Index file from MiSeq
# -2, --index2=<index2> Index file from MiSeq
#
# Help:
# <input> Interleaved fastq file

#%begin command python
from bioframework.index_filter import filter_on_index_quality_interleaved
filter_on_index_quality_interleaved("${input|else('/dev/stdin')}", "${index1}", "${index2}", "${output|else('/dev/stdout')}", "${minimum}")
#%end

23 changes: 20 additions & 3 deletions jip_modules/paired_to_interleave.jip
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,44 @@
# Converts 2 fastq paired files into single interleave
#
# Usage:
# paired_to_interleave -f <forward> -r <reverse> [--out-format <out-format>] [-o <output>]
# paired_to_interleave [<input>...] [--out-format <out-format>] [-o <output>]
#
# Options:
# -f, --forward=<forward> The forward fastq input
# -r, --reverse=<reverse> The reverse fastq input
# -o, --output=<output> The interleaved output [default: stdout]
# --out-format=<outformat> The output format [default: fastq]
#
# Help:
# <input> Must be two input files where the first input file represents the
# forward reads and the second input file represents the reverse reads
# The read files must contain the same amount of reads and all read
# identifiers must be identical in the forward and reverse
# I.E.
# forward.fastq
# >id1 whatever123
# ATGC
# reverse.fastq
# >id1 whatever321
# CGTA

#%begin validate
if options['out_format'] not in ('fasta', 'fastq'):
validation_error(
"output format can only be fasta or fastq. You provided '%s'"
% options['out_format']
)
if len(options.input.value) != 2:
validation_error(
"Must supply two input files to convert to interleave"
)
#%end

#%begin command python
from bioframework import seqio
seqio.paired_to_interleave(
"${forward}",
"${reverse}",
"${input.value|first}",
"${input.value|last}",
"${output|else('/dev/stdout')}",
"${out_format}"
)
Expand Down
25 changes: 25 additions & 0 deletions jip_modules/render_template.jip
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env jip
#
# Make jip job file from template + json
#
# Usage:
# mkjip -t <template> -j <json> [-o <output>]
#
# Options:
# -t, --tmplate=<template> Jinja2 template to use
# -j, --json=<json> Json file to use to replace values in tmplate
# -o, --output=<output> Output location for job file [Default: stdout]

#%begin command python
import json
from jinja2 import Template

tmpl = Template(open("${tmplate}").read())
jsn = json.load(open("${json}"))
rendered = tmpl.render(jsn)

fh = open("${output|else('/dev/stdout')}", 'w')
fh.write(rendered)
fh.close()

#%end
56 changes: 56 additions & 0 deletions jip_modules/trim_reads.jip
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env jip
#
# Trim reads based on criteria via trimming
#
# Usage:
# filter_reads [<input>] [-p] [-t] [-q <qualcut>...] [-x <delete>...] [-o <output>]
#
# Options:
# -p, --paired Indicates that the input is paired-interleave format otherwise
# the input is assumed to be unpaired fastq
# [Default: stdin]
# -o, --output=<output> Output fastq. If input was interleave, output will be as well
# [default: stdout]
# -t, --trim-n Trim N's from ends of reads
# -q, --qualcutoff=<qualcut> Quality score to cut from beginning and end of reads
# If one value is given trim from 3', else trim from both
# where the first value trims the 5' and the second trims the 3'
# Trim 3': -q 25
# Trim 5',3': -q 25 20
# -x, --removebases=<delete> Remove X bases from all reads
# If the number is positive remove from the beginning of reads
# If the number is negative remove from the end of reads
# Ex. -x 10 would remove 10 bases from the front
# Ex. -x -10 would remove 10 bases from the end
# Ex. -x 10 -10 would remove 10 bases from front and end
#%begin init
#%end

#%begin validate
if options.trim_n:
# See https://github.com/marcelm/cutadapt/pull/175
validation_error("--trim-n is not supported at this time")
#%end

#%begin setup
# -u option has to be specified more than once, unlike -q option where you
# join together the args
options.removebases.short = '-u'
options.removebases.join = ' -u '
options.qualcutoff.join = ','
#%end

#%begin command bash

cutargs="${trim_n|arg('--trim-n')} ${qualcutoff|arg} ${removebases|arg} ${interleave}"
if [ "${paired}" ]
then
cmd="cutadapt --interleave $cutargs ${removebases|arg|upper} ${output|arg} ${input|else('--format fastq /dev/stdin')}"
else
cmd="cutadapt $cutargs ${output|arg} ${forward} ${input|else('--format fastq /dev/stdin')}"
fi

echo $cmd >&2
$cmd

#%end
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def jip_modules(path=bioframework.JIP_PATH):
},
install_requires = [
'pyjip',
'toolz',
],
data_files=[
(join(sys.prefix,'bin'), jip_modules()),
Expand Down
14 changes: 14 additions & 0 deletions templates/filter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"cmd": "filter_reads",
"desc": "Filter reads based on criteria",
"opts": [{
"short": "n",
"long": "max-n",
"name": "max_n",
"description": "Max number of n's to allow",
"required" : true
}],
"paired_cmd": "cutadapt ${forward} ${reverse} ${max_n|arg} -o ${forwardoutput} -p {reverseoutput}",
"unpaired_cmd": "cutadapt ${unpaired} ${max_n|arg} -o ${output}"
}

36 changes: 36 additions & 0 deletions templates/jip.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env jip
#
# {{desc}}
#
# Usage:
# {{cmd}} ([-f <forward> -r <reverse>] | [-u <unpaired>]) [-o <output>...] {% for opt in opts if opt.required %} [--{{opt.long}} <{{opt.name}}>] {% endfor %} {% for opt in opts if not opt.required %} --{{opt.long}} <{{opt.name}}> {% endfor %}
#
#
#
# Options:
{% for opt in opts -%}
# -{{opt.short}}, --{{opt.long}}=<{{opt.name}}> {{opt.description}}
{% endfor -%}
# -u, --unpaired=<unpaired> The unpaired fastq input
# -f, --forward=<forward> The forward fastq input
# -r, --reverse=<reverse> The reverse fastq input
# -o, --output=<output> Output files [default: stdout]


#%begin validate

if options.unpaired and (options.forward or options.reverse):
validation_error("Must choose either paired or unpaired")

#%end

#%begin command bash

if [ "${unpaired}" ]
then
{{unpaired_cmd}}
else
{{paired_cmd}}
fi

#%end
Loading