Skip to content

Commit

Permalink
v1.2.1: clean I/O handling, improved logs
Browse files Browse the repository at this point in the history
  • Loading branch information
Jaganathan committed Feb 21, 2019
1 parent 131fc38 commit 92378c4
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ build/
dist/
*.egg-info/
.eggs/
*.flat
*.gdx
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cat input.vcf | spliceai -R genome.fa -A grch37 > output.vcf

Options:
- **-I**: Input VCF with variants of interest.
- **-O**: Output VCF with SpliceAI predictions `SpliceAI=ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL` included in the INFO column (see table below for details). Only SNVs and simple INDELs (ref or alt must be a single base) within genes are annotated. Variants in multiple genes have separate predictions for each gene.
- **-O**: Output VCF with SpliceAI predictions `SpliceAI=ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL` included in the INFO column (see table below for details). Only SNVs and simple INDELs (REF or ALT must be a single base) within genes are annotated. Variants in multiple genes have separate predictions for each gene.
- **-R**: Reference genome fasta file.
- **-A**: Gene annotation file. Can instead provide `grch37` or `grch38` to use GENCODE canonical annotation files included with the package. To create custom annotation files, use `spliceai/annotations/grch37.txt` in repository as template.

Expand Down
2 changes: 1 addition & 1 deletion examples/output.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
##contig=<ID=22,length=51304566>
##contig=<ID=X,length=155270560>
##contig=<ID=Y,length=59373566>
##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.2 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.2.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
#CHROM POS ID REF ALT QUAL FILTER INFO
2 179642185 . G A . . SpliceAI=A|TTN|0.24|0.00|0.64|0.55|126|38|2|-38
19 38958362 . C T . . SpliceAI=T|RYR1|0.22|0.00|0.91|0.70|-107|-46|-2|90
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
description='SpliceAI: A deep learning-based tool to identify splice variants',
long_description=io.open('README.md', encoding='utf-8').read(),
long_description_content_type='text/markdown',
version='1.2',
version='1.2.1',
author='Kishore Jaganathan',
author_email='[email protected]',
license='GPLv3',
Expand Down
3 changes: 3 additions & 0 deletions spliceai/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import signal
from pkg_resources import get_distribution


signal.signal(signal.SIGINT, lambda x, y: exit(0))

name = 'spliceai'
__version__ = get_distribution(name).version
31 changes: 17 additions & 14 deletions spliceai/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,39 +27,42 @@ def get_options():
'or path to a similarly-constructed custom gene annotation file')
args = parser.parse_args()

try:
args.I = open(args.I, 'rt')
except TypeError:
pass

try:
args.O = open(args.O, 'wt')
except TypeError:
pass

return args


def main():

args = get_options()

vcf = pysam.VariantFile(args.I)
try:
vcf = pysam.VariantFile(args.I)
except IOError:
print('Input VCF file {} not found, exiting.'.format(args.I))
exit()

header = vcf.header
header.add_line('##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.2 variant '
header.add_line('##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.2.1 variant '
'annotation. These include delta scores (DS) and delta positions (DP) for '
'acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). '
'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">')
output = pysam.VariantFile(args.O, mode='w', header=header)

try:
output = pysam.VariantFile(args.O, mode='w', header=header)
except IOError:
print('Unable to create output VCF file {}, exiting.'.format(args.O))
exit()

ann = Annotator(args.R, args.A)

for record in vcf:

scores = get_delta_scores(record, ann)
if len(scores) > 0:
record.info['SpliceAI'] = scores
output.write(record)

vcf.close()
output.close()


if __name__ == '__main__':
main()
37 changes: 27 additions & 10 deletions spliceai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,26 @@ def __init__(self, ref_fasta, annotations):
annotations = resource_filename(__name__, 'annotations/grch37.txt')
elif annotations == 'grch38':
annotations = resource_filename(__name__, 'annotations/grch38.txt')
df = pd.read_csv(annotations, sep='\t', dtype={'CHROM': object})

self.genes = df['#NAME'].get_values()
self.chroms = df['CHROM'].get_values()
self.strands = df['STRAND'].get_values()
self.tx_starts = df['TX_START'].get_values()+1
self.tx_ends = df['TX_END'].get_values()

self.ref_fasta = Fasta(ref_fasta)
try:
df = pd.read_csv(annotations, sep='\t', dtype={'CHROM': object})
self.genes = df['#NAME'].get_values()
self.chroms = df['CHROM'].get_values()
self.strands = df['STRAND'].get_values()
self.tx_starts = df['TX_START'].get_values()+1
self.tx_ends = df['TX_END'].get_values()
except IOError:
print('Gene annotation file {} not found, exiting.'.format(annotations))
exit()
except KeyError:
print('Gene annotation file {} format incorrect, exiting.'.format(annotations))
exit()

try:
self.ref_fasta = Fasta(ref_fasta)
except IOError:
print('Reference genome fasta file {} not found, exiting.'.format(ref_fasta))
exit()

paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
self.models = [load_model(resource_filename(__name__, x)) for x in paths]
Expand Down Expand Up @@ -82,16 +93,22 @@ def get_delta_scores(record, ann, cov=1001):
try:
record.chrom, record.pos, record.ref, len(record.alts)
except TypeError:
print('Skipping record (bad input): {}'.format(record))
return delta_scores

(genes, strands, idxs) = ann.get_name_and_strand(record.chrom, record.pos)
if len(idxs) == 0 or len(record.ref) >= cov//2 or record.ref == '.':
if len(idxs) == 0:
return delta_scores

chrom = normalise_chrom(record.chrom, list(ann.ref_fasta.keys())[0])
try:
seq = ann.ref_fasta[chrom][record.pos-wid//2-1:record.pos+wid//2].seq
except IndexError:
except (IndexError, ValueError):
print('Skipping record (fasta issue): {}'.format(record))
return delta_scores

if seq[wid//2:wid//2+len(record.ref)].upper() != record.ref:
print('Skipping record (ref issue): {}'.format(record))
return delta_scores

for j in range(len(record.alts)):
Expand Down

0 comments on commit 92378c4

Please sign in to comment.