From f0740e74e8a846f975d09ecc6f44327c7e9d8be4 Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Sat, 9 Jul 2022 18:54:16 -0400 Subject: [PATCH] Extend count-sv to work for gtf and generic BED --- athena/cli/master.py | 2 +- athena/cli/mutrate/commands.py | 60 ------------------------ athena/cli/utils/commands.py | 84 +++++++++++++++++++++++++++++++++- athena/dosage/__init__.py | 1 + athena/dosage/countsv.py | 52 +++++++++++++++++++++ athena/mutrate/countsv.py | 18 +++++--- athena/mutrate/muquery.py | 5 +- 7 files changed, 152 insertions(+), 70 deletions(-) create mode 100644 athena/dosage/__init__.py create mode 100644 athena/dosage/countsv.py diff --git a/athena/cli/master.py b/athena/cli/master.py index 072e47a..f89fc33 100644 --- a/athena/cli/master.py +++ b/athena/cli/master.py @@ -30,6 +30,7 @@ def utilscli(): utilscli.add_command(utils_commands.pairbins) utilscli.add_command(utils_commands.featurehists) utilscli.add_command(utils_commands.featurestats) +utilscli.add_command(utils_commands.countsv) utilscli.add_command(utils_commands.transform) utilscli.add_command(utils_commands.sliceremote) @@ -45,7 +46,6 @@ def mutratecli(): mutratecli.add_command(mutrate_commands.annotatebins) mutratecli.add_command(mutrate_commands.annotatepairs) mutratecli.add_command(mutrate_commands.annodecomp) -mutratecli.add_command(mutrate_commands.countsv) mutratecli.add_command(mutrate_commands.mutrain) mutratecli.add_command(mutrate_commands.mupredict) mutratecli.add_command(mutrate_commands.muquery) diff --git a/athena/cli/mutrate/commands.py b/athena/cli/mutrate/commands.py index 0db1112..f24ed48 100644 --- a/athena/cli/mutrate/commands.py +++ b/athena/cli/mutrate/commands.py @@ -370,66 +370,6 @@ def annodecomp(bins, bins_outfile, parameters_outfile, precomp_model, components skip_columns, maxfloat, max_pcs, stats, prefix, bgzip) -# Intersect SVs and bins -@click.command(name='count-sv') -@click.argument('bins', type=click.Path(exists=True)) -@click.argument('sv', type=click.Path(exists=True)) -@click.argument('outfile') -@click.option('--bin-format', type=click.Choice(['1D', '2D']), - help='Specify format of input: bins (1D) or bin-pairs (2D) [default: 2D]', - default='2D', required=True) -@click.option('--binsize', type=int, default=None, help='Size of bins [default: ' + - 'infer from spacing of start coordinates]') -@click.option('--comparison', type=click.Choice(['overlap', 'breakpoint']), - help='Specification of SV-to-bin comparison [default: breakpoint]', - default='breakpoint', required=True) -@click.option('-p', '--probabilities', 'probs', is_flag=True, default=False, - help='Annotate probability of SVs instead of count [default: count SVs]') -@click.option('--sv-ci', type=float, default=0.95, help='Specify confidence interval ' + - 'implied by CIPOS and CIEND if present in SV VCF input [default: 0.95]') -@click.option('--maxfloat', type=int, default=8, - help='Maximum precision of floating-point values. [default: 8]') -@click.option('-z', '--bgzip', is_flag=True, default=False, - help='Compress output with bgzip') -def countsv(bins, sv, outfile, bin_format, binsize, comparison, probs, sv_ci, - maxfloat, bgzip): - """ - Intersect SV and 1D bins or 2D bin-pairs - """ - - # Ensure --bin-format is specified - if bin_format not in '1D 2D'.split(): - if bin_format is None: - err = 'INPUT ERROR: --bin-format is required. Options: ' + \ - '"overlap" or "breakpoint".' - else: - err = 'INPUT ERROR: --bin-format "{0}" not recognized. Options: ' + \ - '"1D" or "2D".' - exit(err.format(bin_format)) - paired = (bin_format == '2D') - - # Ensure --comparison is specified - if comparison not in 'overlap breakpoint'.split(): - if comparison is None: - err = 'INPUT ERROR: --comparison is required. Options: ' + \ - '"overlap" or "breakpoint".' - else: - err = 'INPUT ERROR: --comparison "{0}" not recognized. Options: ' + \ - '"overlap" or "breakpoint".' - exit(err.format(comparison)) - breakpoints = (comparison == 'breakpoint') - - # Warn if --probabilities specified with --comparison breakpoint - if probs and comparison == 'overlap': - status_msg = '[{0}] athena count-sv: --probabilities is not compatible ' + \ - 'with --comparison "overlap". Returning binary indicator of ' + \ - 'overlap/non-overlap instead.' - print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'))) - - mutrate.count_sv(bins, sv, outfile, paired, binsize, breakpoints, probs, - sv_ci, maxfloat, bgzip) - - # Train mutation rate model @click.command(name='mu-train') @click.option('--training-data', type=click.Path(exists=True), required=True, diff --git a/athena/cli/utils/commands.py b/athena/cli/utils/commands.py index 68e38f0..bd521a3 100644 --- a/athena/cli/utils/commands.py +++ b/athena/cli/utils/commands.py @@ -10,7 +10,7 @@ import click -from athena import utils +from athena import utils, dosage # VCF filtering @@ -250,6 +250,88 @@ def featurestats(bed, outfile, skip_cols, trans_tsv, log_transform, sqrt_transfo exp_transform, square_transform, boxcox_transform, maxfloat) +# Intersect SVs and bins (or BED/GTF) +@click.command(name='count-sv') +@click.argument('sv', type=click.Path(exists=True)) +@click.argument('query', type=click.Path(exists=True)) +@click.option('-o', '--outfile', default='stdout', help='Path to output file. ' + + '[default: stdout]') +@click.option('--query-format', type=click.Choice(['bins', 'pairs', 'bed', 'gtf']), + help='Specify format of input: athena bins, athena bin-pairs ' + + ', generic bed, or gtf [default: pairs]', + default='pairs', required=True) +@click.option('--binsize', type=int, default=None, help='Size of bins. Only used ' + + 'for --query-format "bins" or "pairs". [default: infer from spacing ' + + 'of start coordinates]') +@click.option('--comparison', type=click.Choice(['overlap', 'breakpoint']), + help='Specification of SV-to-query interval comparison. Currently ' + + 'ignored for --query-format "bed" and "gtf". [default: "breakpoint" ' + + 'for --query-format "bins" or "pairs" and "overlap" for "bed" or "gtf"]', + default='breakpoint', required=True) +@click.option('-p', '--probabilities', 'probs', is_flag=True, default=False, + help='Annotate probability of SVs instead of count. Ignored for ' + + '--query-format "bed" and "gtf". [default: count SVs]') +@click.option('--group-by', help='Specify key by which queries will be grouped. ' + + 'Only used for --query-format "bed" or "gtf". [default: "gene_name" ' + + '(for GTF query), fourth column (for BED4+) or chrom_start_end ' + + '(for BED3)]') +@click.option('-f', '--fraction', 'ovr_frac', type=float, default=10e-9, + help='Minimum overlap of query interval required for SV to be ' + + 'counted. Only used for --query-format "bed" or "gtf". ' + + '[default: any overlap]') +@click.option('--sv-ci', type=float, default=0.95, help='Specify confidence interval ' + + 'implied by CIPOS and CIEND if present in SV VCF input. Ignored for ' + + '--query-format "bed" and "gtf". [default: 0.95]') +@click.option('--maxfloat', type=int, default=8, + help='Maximum precision of floating-point values. [default: 8]') +@click.option('-z', '--bgzip', is_flag=True, default=False, + help='Compress output with bgzip (or gzip for --query-format "bed" ' + + 'or "gtf")') +def countsv(sv, query, outfile, query_format, binsize, comparison, probs, group_by, + ovr_frac, sv_ci, maxfloat, bgzip): + """ + Intersect SVs with query regions + """ + + # Ensure --query-format is specified + if query_format not in 'bins pairs bed gtf'.split(): + if query_format is None: + err = 'INPUT ERROR: --query-format is required. Options: ' + \ + '"bins", "pairs", "bed", or "gtf".' + else: + err = 'INPUT ERROR: --query-format "{0}" not recognized. Options: ' + \ + '"bins", "pairs", "bed", or "gtf".' + exit(err.format(bin_format)) + + # Ensure --comparison is specified + if comparison not in 'overlap breakpoint'.split(): + if comparison is None: + err = 'INPUT ERROR: --comparison is required. Options: ' + \ + '"overlap" or "breakpoint".' + else: + err = 'INPUT ERROR: --comparison "{0}" not recognized. Options: ' + \ + '"overlap" or "breakpoint".' + exit(err.format(comparison)) + + # Warn if --probabilities specified with --comparison overlap + if probs and comparison == 'overlap': + status_msg = '[{0}] athena count-sv: --probabilities is not compatible ' + \ + 'with --comparison "overlap". Returning binary indicator of ' + \ + 'overlap/non-overlap instead.' + print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'))) + + # Call mutrate.count_sv_in_bins if --query-format is bins or pairs + if query_format in 'bins pairs'.split(): + paired = (bin_format == 'pairs') + breakpoints = (comparison == 'breakpoint') + mutrate.count_sv_in_bins(sv, query, outfile, paired, binsize, breakpoints, + probs, sv_ci, maxfloat, bgzip) + + # Otherwise, call dosage.count_sv_generic + else: + dosage.count_sv_generic(sv, query, outfile, group_by, ovr_frac, maxfloat, bgzip) + + # Transform binwise annotations collected with annotate-bins @click.command(name='transform') @click.argument('BED_in', type=click.Path(exists=True)) diff --git a/athena/dosage/__init__.py b/athena/dosage/__init__.py new file mode 100644 index 0000000..370d064 --- /dev/null +++ b/athena/dosage/__init__.py @@ -0,0 +1 @@ +from .countsv import * diff --git a/athena/dosage/countsv.py b/athena/dosage/countsv.py new file mode 100644 index 0000000..8d8edd7 --- /dev/null +++ b/athena/dosage/countsv.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019-Present Ryan L. Collins +# Distributed under terms of the MIT license. + +""" +Count overlap of SVs and generic BED or GTF files +""" + + +from athena.mutrate.countsv import _load_sv_from_bed +from athena.mutrate.muquery import _get_query_entity_name +from athena.utils.misc import determine_filetype, vcf2bed +import pybedtools as pbt +import pysam +import pandas as pd + + +def count_sv_generic(sv_in, query_in, outfile, group_by, ovr_frac, maxfloat, bgzip): + """ + Collect SV counts for every feature in query_in + """ + + # Load query file as pbt.BedTool + qbt = pbt.BedTool(query_in) + + # Load SVs and convert to pbt.BedTool + sv_format = determine_filetype(sv_in) + if 'vcf' in sv_format: + vcf = pysam.VariantFile(sv_in) + sv = vcf2bed(vcf, breakpoints=False, add_ci_to_bkpts=False) + elif 'bed' in sv_format: + sv = _load_sv_from_bed(sv_in, breakpoints=False) + + # Iterate over all hits in intersection of query & SVs + # For each hit, write SV ID to dict per query feature + hit_ids = {} + for hit in qbt.intersect(sv, loj=True, f=ovr_frac): + qname = _get_query_entity_name(hit, group_by) + if qname not in hit_ids.keys(): + hit_ids[qname] = set() + svid = hit[-1] + if svid != '.': + hit_ids[qname].add(svid) + + # Once hits have been processed, write out counts as .tsv + hit_counts = {qname: len(svids) for qname, svids in hit_ids.items()} + hit_df = pd.DataFrame.from_dict(hit_counts, orient='index').reset_index() + hit_df.columns = '#query n_svs'.split() + hit_df.to_csv(outfile, index=False, header=True, sep='\t') + diff --git a/athena/mutrate/countsv.py b/athena/mutrate/countsv.py index 259b6a4..af69489 100644 --- a/athena/mutrate/countsv.py +++ b/athena/mutrate/countsv.py @@ -19,6 +19,7 @@ import pandas as pd import numpy as np from scipy.stats import norm +from sys import stdout def _load_sv_from_bed(sv_in, breakpoints=False): @@ -213,10 +214,10 @@ def parse_breakpoint_hits(hits, paired, probs): return bkpt_res -def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs, sv_ci, - maxfloat, bgzip): +def count_sv_in_bins(sv_in, bins_in, outfile, paired, binsize, breakpoints, + probs, sv_ci, maxfloat, bgzip): """ - Master function to annotate bins_in with count (or probability) of SVs + Main function to annotate bins_in with count (or probability) of SVs """ # Load bins, split bin coordinates from annotations, and retain header @@ -270,11 +271,16 @@ def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs, sv_ci out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:] # Save bins with SV counts - if 'compressed' in determine_filetype(outfile): - outfile = path.splitext(outfile)[0] + if outfile in 'stdout /dev/stdout -'.split(): + outfile = stdout + outfile_is_stdout = True + else: + outfile_is_stdout = False + if 'compressed' in determine_filetype(outfile): + outfile = path.splitext(outfile)[0] out_df.to_csv(outfile, sep='\t', header=True, index=False) # Bgzip bins, if optioned - if bgzip: + if bgzip and not outfile_is_stdout: bgz(outfile) diff --git a/athena/mutrate/muquery.py b/athena/mutrate/muquery.py index c5545ca..a4855ea 100644 --- a/athena/mutrate/muquery.py +++ b/athena/mutrate/muquery.py @@ -17,6 +17,7 @@ import pysam import pandas as pd from numpy import log10, nansum, nanmax +from sys import stdout def _get_query_entity_name(feature, query_group_by): @@ -115,7 +116,7 @@ def mu_query(pairs, query, outfile, query_group_by, ovr_frac, raw_mu_in, query_results = float_cleanup(query_results, maxfloat, 1) # Write query to output file - if gzip and not outfile.endswith('.gz'): - outfile = outfile + '.gz' + if outfile in 'stdout /dev/stdout -'.split(): + outfile = stdout query_results.to_csv(outfile, header=True, index=False, sep='\t')