diff --git a/.gitignore b/.gitignore index 0134b26..47cffc0 100755 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ build/ dist/ *.egg-info/ .eggs/ +*.flat +*.gdx diff --git a/README.md b/README.md index 6ee1e78..bdbe5ed 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ cat input.vcf | spliceai -R genome.fa -A grch37 > output.vcf Options: - **-I**: Input VCF with variants of interest. - - **-O**: Output VCF with SpliceAI predictions `SpliceAI=ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL` included in the INFO column (see table below for details). Only SNVs and simple INDELs (ref or alt must be a single base) within genes are annotated. Variants in multiple genes have separate predictions for each gene. + - **-O**: Output VCF with SpliceAI predictions `SpliceAI=ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL` included in the INFO column (see table below for details). Only SNVs and simple INDELs (REF or ALT must be a single base) within genes are annotated. Variants in multiple genes have separate predictions for each gene. - **-R**: Reference genome fasta file. - **-A**: Gene annotation file. Can instead provide `grch37` or `grch38` to use GENCODE canonical annotation files included with the package. To create custom annotation files, use `spliceai/annotations/grch37.txt` in repository as template. diff --git a/examples/output.vcf b/examples/output.vcf index 25bb82f..39675d2 100644 --- a/examples/output.vcf +++ b/examples/output.vcf @@ -25,7 +25,7 @@ ##contig= ##contig= ##contig= -##INFO= +##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO 2 179642185 . G A . . SpliceAI=A|TTN|0.24|0.00|0.64|0.55|126|38|2|-38 19 38958362 . C T . . SpliceAI=T|RYR1|0.22|0.00|0.91|0.70|-107|-46|-2|90 diff --git a/setup.py b/setup.py index fe41bf1..7436d20 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ description='SpliceAI: A deep learning-based tool to identify splice variants', long_description=io.open('README.md', encoding='utf-8').read(), long_description_content_type='text/markdown', - version='1.2', + version='1.2.1', author='Kishore Jaganathan', author_email='kishorejaganathan@gmail.com', license='GPLv3', diff --git a/spliceai/__init__.py b/spliceai/__init__.py index 5799c63..b5d843e 100644 --- a/spliceai/__init__.py +++ b/spliceai/__init__.py @@ -1,5 +1,8 @@ +import signal from pkg_resources import get_distribution +signal.signal(signal.SIGINT, lambda x, y: exit(0)) + name = 'spliceai' __version__ = get_distribution(name).version diff --git a/spliceai/__main__.py b/spliceai/__main__.py index 258e28e..6b3a071 100644 --- a/spliceai/__main__.py +++ b/spliceai/__main__.py @@ -27,16 +27,6 @@ def get_options(): 'or path to a similarly-constructed custom gene annotation file') args = parser.parse_args() - try: - args.I = open(args.I, 'rt') - except TypeError: - pass - - try: - args.O = open(args.O, 'wt') - except TypeError: - pass - return args @@ -44,22 +34,35 @@ def main(): args = get_options() - vcf = pysam.VariantFile(args.I) + try: + vcf = pysam.VariantFile(args.I) + except IOError: + print('Input VCF file {} not found, exiting.'.format(args.I)) + exit() + header = vcf.header - header.add_line('##INFO=') - output = pysam.VariantFile(args.O, mode='w', header=header) + + try: + output = pysam.VariantFile(args.O, mode='w', header=header) + except IOError: + print('Unable to create output VCF file {}, exiting.'.format(args.O)) + exit() + ann = Annotator(args.R, args.A) for record in vcf: - scores = get_delta_scores(record, ann) if len(scores) > 0: record.info['SpliceAI'] = scores output.write(record) + vcf.close() + output.close() + if __name__ == '__main__': main() diff --git a/spliceai/utils.py b/spliceai/utils.py index 58bf35d..4d129ec 100644 --- a/spliceai/utils.py +++ b/spliceai/utils.py @@ -13,15 +13,26 @@ def __init__(self, ref_fasta, annotations): annotations = resource_filename(__name__, 'annotations/grch37.txt') elif annotations == 'grch38': annotations = resource_filename(__name__, 'annotations/grch38.txt') - df = pd.read_csv(annotations, sep='\t', dtype={'CHROM': object}) - self.genes = df['#NAME'].get_values() - self.chroms = df['CHROM'].get_values() - self.strands = df['STRAND'].get_values() - self.tx_starts = df['TX_START'].get_values()+1 - self.tx_ends = df['TX_END'].get_values() - - self.ref_fasta = Fasta(ref_fasta) + try: + df = pd.read_csv(annotations, sep='\t', dtype={'CHROM': object}) + self.genes = df['#NAME'].get_values() + self.chroms = df['CHROM'].get_values() + self.strands = df['STRAND'].get_values() + self.tx_starts = df['TX_START'].get_values()+1 + self.tx_ends = df['TX_END'].get_values() + except IOError: + print('Gene annotation file {} not found, exiting.'.format(annotations)) + exit() + except KeyError: + print('Gene annotation file {} format incorrect, exiting.'.format(annotations)) + exit() + + try: + self.ref_fasta = Fasta(ref_fasta) + except IOError: + print('Reference genome fasta file {} not found, exiting.'.format(ref_fasta)) + exit() paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6)) self.models = [load_model(resource_filename(__name__, x)) for x in paths] @@ -82,16 +93,22 @@ def get_delta_scores(record, ann, cov=1001): try: record.chrom, record.pos, record.ref, len(record.alts) except TypeError: + print('Skipping record (bad input): {}'.format(record)) return delta_scores (genes, strands, idxs) = ann.get_name_and_strand(record.chrom, record.pos) - if len(idxs) == 0 or len(record.ref) >= cov//2 or record.ref == '.': + if len(idxs) == 0: return delta_scores chrom = normalise_chrom(record.chrom, list(ann.ref_fasta.keys())[0]) try: seq = ann.ref_fasta[chrom][record.pos-wid//2-1:record.pos+wid//2].seq - except IndexError: + except (IndexError, ValueError): + print('Skipping record (fasta issue): {}'.format(record)) + return delta_scores + + if seq[wid//2:wid//2+len(record.ref)].upper() != record.ref: + print('Skipping record (ref issue): {}'.format(record)) return delta_scores for j in range(len(record.alts)):