Extend count-sv to work for gtf and generic BED
RCollins13 committed Jul 9, 2022
1 parent 4ea8e8c commit f0740e7
Showing 7 changed files with 152 additions and 70 deletions.
2 changes: 1 addition & 1 deletion athena/cli/master.py
@@ -30,6 +30,7 @@ def utilscli():
utilscli.add_command(utils_commands.pairbins)
utilscli.add_command(utils_commands.featurehists)
utilscli.add_command(utils_commands.featurestats)
utilscli.add_command(utils_commands.countsv)
utilscli.add_command(utils_commands.transform)
utilscli.add_command(utils_commands.sliceremote)

@@ -45,7 +46,6 @@ def mutratecli():
mutratecli.add_command(mutrate_commands.annotatebins)
mutratecli.add_command(mutrate_commands.annotatepairs)
mutratecli.add_command(mutrate_commands.annodecomp)
mutratecli.add_command(mutrate_commands.countsv)
mutratecli.add_command(mutrate_commands.mutrain)
mutratecli.add_command(mutrate_commands.mupredict)
mutratecli.add_command(mutrate_commands.muquery)
60 changes: 0 additions & 60 deletions athena/cli/mutrate/commands.py
@@ -370,66 +370,6 @@ def annodecomp(bins, bins_outfile, parameters_outfile, precomp_model, components
skip_columns, maxfloat, max_pcs, stats, prefix, bgzip)


# Intersect SVs and bins
@click.command(name='count-sv')
@click.argument('bins', type=click.Path(exists=True))
@click.argument('sv', type=click.Path(exists=True))
@click.argument('outfile')
@click.option('--bin-format', type=click.Choice(['1D', '2D']),
help='Specify format of input: bins (1D) or bin-pairs (2D) [default: 2D]',
default='2D', required=True)
@click.option('--binsize', type=int, default=None, help='Size of bins [default: ' +
'infer from spacing of start coordinates]')
@click.option('--comparison', type=click.Choice(['overlap', 'breakpoint']),
help='Specification of SV-to-bin comparison [default: breakpoint]',
default='breakpoint', required=True)
@click.option('-p', '--probabilities', 'probs', is_flag=True, default=False,
help='Annotate probability of SVs instead of count [default: count SVs]')
@click.option('--sv-ci', type=float, default=0.95, help='Specify confidence interval ' +
'implied by CIPOS and CIEND if present in SV VCF input [default: 0.95]')
@click.option('--maxfloat', type=int, default=8,
help='Maximum precision of floating-point values. [default: 8]')
@click.option('-z', '--bgzip', is_flag=True, default=False,
help='Compress output with bgzip')
def countsv(bins, sv, outfile, bin_format, binsize, comparison, probs, sv_ci,
maxfloat, bgzip):
"""
Intersect SV and 1D bins or 2D bin-pairs
"""

# Ensure --bin-format is specified
if bin_format not in '1D 2D'.split():
if bin_format is None:
err = 'INPUT ERROR: --bin-format is required. Options: ' + \
'"overlap" or "breakpoint".'
else:
err = 'INPUT ERROR: --bin-format "{0}" not recognized. Options: ' + \
'"1D" or "2D".'
exit(err.format(bin_format))
paired = (bin_format == '2D')

# Ensure --comparison is specified
if comparison not in 'overlap breakpoint'.split():
if comparison is None:
err = 'INPUT ERROR: --comparison is required. Options: ' + \
'"overlap" or "breakpoint".'
else:
err = 'INPUT ERROR: --comparison "{0}" not recognized. Options: ' + \
'"overlap" or "breakpoint".'
exit(err.format(comparison))
breakpoints = (comparison == 'breakpoint')

# Warn if --probabilities specified with --comparison breakpoint
if probs and comparison == 'overlap':
status_msg = '[{0}] athena count-sv: --probabilities is not compatible ' + \
'with --comparison "overlap". Returning binary indicator of ' + \
'overlap/non-overlap instead.'
print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))

mutrate.count_sv(bins, sv, outfile, paired, binsize, breakpoints, probs,
sv_ci, maxfloat, bgzip)


# Train mutation rate model
@click.command(name='mu-train')
@click.option('--training-data', type=click.Path(exists=True), required=True,
84 changes: 83 additions & 1 deletion athena/cli/utils/commands.py
@@ -10,7 +10,7 @@


import click
from athena import utils
from athena import utils, dosage, mutrate
from datetime import datetime


# VCF filtering
@@ -250,6 +250,88 @@ def featurestats(bed, outfile, skip_cols, trans_tsv, log_transform, sqrt_transfo
exp_transform, square_transform, boxcox_transform, maxfloat)


# Intersect SVs and bins (or BED/GTF)
@click.command(name='count-sv')
@click.argument('sv', type=click.Path(exists=True))
@click.argument('query', type=click.Path(exists=True))
@click.option('-o', '--outfile', default='stdout', help='Path to output file. ' +
              '[default: stdout]')
@click.option('--query-format', type=click.Choice(['bins', 'pairs', 'bed', 'gtf']),
              help='Specify format of input: athena bins, athena bin-pairs, ' +
              'generic bed, or gtf [default: pairs]',
              default='pairs', required=True)
@click.option('--binsize', type=int, default=None, help='Size of bins. Only used ' +
              'for --query-format "bins" or "pairs". [default: infer from spacing ' +
              'of start coordinates]')
@click.option('--comparison', type=click.Choice(['overlap', 'breakpoint']),
              help='Specification of SV-to-query interval comparison. Currently ' +
              'ignored for --query-format "bed" and "gtf". [default: "breakpoint" ' +
              'for --query-format "bins" or "pairs" and "overlap" for "bed" or "gtf"]',
              default='breakpoint', required=True)
@click.option('-p', '--probabilities', 'probs', is_flag=True, default=False,
              help='Annotate probability of SVs instead of count. Ignored for ' +
              '--query-format "bed" and "gtf". [default: count SVs]')
@click.option('--group-by', help='Specify key by which queries will be grouped. ' +
              'Only used for --query-format "bed" or "gtf". [default: "gene_name" ' +
              '(for GTF query), fourth column (for BED4+) or chrom_start_end ' +
              '(for BED3)]')
@click.option('-f', '--fraction', 'ovr_frac', type=float, default=10e-9,
              help='Minimum overlap of query interval required for SV to be ' +
              'counted. Only used for --query-format "bed" or "gtf". ' +
              '[default: any overlap]')
@click.option('--sv-ci', type=float, default=0.95, help='Specify confidence interval ' +
              'implied by CIPOS and CIEND if present in SV VCF input. Ignored for ' +
              '--query-format "bed" and "gtf". [default: 0.95]')
@click.option('--maxfloat', type=int, default=8,
              help='Maximum precision of floating-point values. [default: 8]')
@click.option('-z', '--bgzip', is_flag=True, default=False,
              help='Compress output with bgzip (or gzip for --query-format "bed" ' +
              'or "gtf")')
def countsv(sv, query, outfile, query_format, binsize, comparison, probs, group_by,
            ovr_frac, sv_ci, maxfloat, bgzip):
    """
    Intersect SVs with query regions
    """

    # Ensure --query-format is specified
    if query_format not in 'bins pairs bed gtf'.split():
        if query_format is None:
            err = 'INPUT ERROR: --query-format is required. Options: ' + \
                  '"bins", "pairs", "bed", or "gtf".'
        else:
            err = 'INPUT ERROR: --query-format "{0}" not recognized. Options: ' + \
                  '"bins", "pairs", "bed", or "gtf".'
        exit(err.format(query_format))

    # Ensure --comparison is specified
    if comparison not in 'overlap breakpoint'.split():
        if comparison is None:
            err = 'INPUT ERROR: --comparison is required. Options: ' + \
                  '"overlap" or "breakpoint".'
        else:
            err = 'INPUT ERROR: --comparison "{0}" not recognized. Options: ' + \
                  '"overlap" or "breakpoint".'
        exit(err.format(comparison))

    # Warn if --probabilities specified with --comparison overlap
    if probs and comparison == 'overlap':
        status_msg = '[{0}] athena count-sv: --probabilities is not compatible ' + \
                     'with --comparison "overlap". Returning binary indicator of ' + \
                     'overlap/non-overlap instead.'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))

    # Call mutrate.count_sv_in_bins if --query-format is bins or pairs
    if query_format in 'bins pairs'.split():
        paired = (query_format == 'pairs')
        breakpoints = (comparison == 'breakpoint')
        mutrate.count_sv_in_bins(sv, query, outfile, paired, binsize, breakpoints,
                                 probs, sv_ci, maxfloat, bgzip)

    # Otherwise, call dosage.count_sv_generic
    else:
        dosage.count_sv_generic(sv, query, outfile, group_by, ovr_frac, maxfloat, bgzip)

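A minimal sketch of exercising the reworked command in-process with click's CliRunner; the SV VCF, GTF, and output paths below are hypothetical placeholders, not files shipped with athena.

from click.testing import CliRunner
from athena.cli.utils.commands import countsv

# Count SVs per gene in a GTF, writing a two-column TSV (hypothetical paths)
runner = CliRunner()
result = runner.invoke(countsv, ['sv.vcf.gz', 'genes.gtf.gz',
                                 '--query-format', 'gtf',
                                 '--group-by', 'gene_name',
                                 '-o', 'gene_sv_counts.tsv'])
print(result.exit_code, result.output)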

# Transform binwise annotations collected with annotate-bins
@click.command(name='transform')
@click.argument('BED_in', type=click.Path(exists=True))
1 change: 1 addition & 0 deletions athena/dosage/__init__.py
@@ -0,0 +1 @@
from .countsv import *
52 changes: 52 additions & 0 deletions athena/dosage/countsv.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019-Present Ryan L. Collins <[email protected]>
# Distributed under terms of the MIT license.

"""
Count overlap of SVs and generic BED or GTF files
"""


from athena.mutrate.countsv import _load_sv_from_bed
from athena.mutrate.muquery import _get_query_entity_name
from athena.utils.misc import determine_filetype, vcf2bed
import pybedtools as pbt
import pysam
import pandas as pd
from sys import stdout


def count_sv_generic(sv_in, query_in, outfile, group_by, ovr_frac, maxfloat, bgzip):
    """
    Collect SV counts for every feature in query_in
    """

    # Load query file as pbt.BedTool
    qbt = pbt.BedTool(query_in)

    # Load SVs and convert to pbt.BedTool
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf, breakpoints=False, add_ci_to_bkpts=False)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=False)

    # Iterate over all hits in intersection of query & SVs
    # For each hit, write SV ID to dict per query feature
    hit_ids = {}
    for hit in qbt.intersect(sv, loj=True, f=ovr_frac):
        qname = _get_query_entity_name(hit, group_by)
        if qname not in hit_ids.keys():
            hit_ids[qname] = set()
        svid = hit[-1]
        if svid != '.':
            hit_ids[qname].add(svid)

    # Once hits have been processed, write out counts as .tsv
    hit_counts = {qname: len(svids) for qname, svids in hit_ids.items()}
    hit_df = pd.DataFrame.from_dict(hit_counts, orient='index').reset_index()
    hit_df.columns = '#query n_svs'.split()
    if outfile in 'stdout /dev/stdout -'.split():
        outfile = stdout
    hit_df.to_csv(outfile, index=False, header=True, sep='\t')

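A hedged sketch of calling the new helper directly (the SV BED and GTF paths are hypothetical). Because the intersection uses loj=True, a left outer join, query features with no overlapping SV still appear in the output with a count of 0, which is why hits whose SV ID field is '.' are skipped rather than counted.

from athena.dosage import count_sv_generic

# Per-gene SV counts written as a tab-separated table (hypothetical inputs)
count_sv_generic('sv.bed.gz', 'genes.gtf.gz', 'gene_sv_counts.tsv',
                 group_by='gene_name', ovr_frac=10e-9, maxfloat=8, bgzip=False)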
18 changes: 12 additions & 6 deletions athena/mutrate/countsv.py
@@ -19,6 +19,7 @@
import pandas as pd
import numpy as np
from scipy.stats import norm
from sys import stdout


def _load_sv_from_bed(sv_in, breakpoints=False):
@@ -213,10 +214,10 @@ def parse_breakpoint_hits(hits, paired, probs):
return bkpt_res


def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs, sv_ci,
maxfloat, bgzip):
def count_sv_in_bins(sv_in, bins_in, outfile, paired, binsize, breakpoints,
probs, sv_ci, maxfloat, bgzip):
"""
Master function to annotate bins_in with count (or probability) of SVs
Main function to annotate bins_in with count (or probability) of SVs
"""

# Load bins, split bin coordinates from annotations, and retain header
@@ -270,11 +271,16 @@ def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs, sv_ci
out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

# Save bins with SV counts
if 'compressed' in determine_filetype(outfile):
outfile = path.splitext(outfile)[0]
if outfile in 'stdout /dev/stdout -'.split():
outfile = stdout
outfile_is_stdout = True
else:
outfile_is_stdout = False
if 'compressed' in determine_filetype(outfile):
outfile = path.splitext(outfile)[0]
out_df.to_csv(outfile, sep='\t', header=True, index=False)

# Bgzip bins, if optioned
if bgzip:
if bgzip and not outfile_is_stdout:
bgz(outfile)
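For the bin-level path, a hedged sketch of calling the renamed count_sv_in_bins directly with the new stdout passthrough; the SV and bin-pair paths are hypothetical and the keyword values mirror the CLI defaults above.

from athena.mutrate.countsv import count_sv_in_bins

# Annotate 2D bin-pairs with breakpoint-based SV counts, streaming the table to stdout
count_sv_in_bins('sv.vcf.gz', 'pairs.bed.gz', 'stdout',
                 paired=True, binsize=None, breakpoints=True,
                 probs=False, sv_ci=0.95, maxfloat=8, bgzip=False)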

5 changes: 3 additions & 2 deletions athena/mutrate/muquery.py
@@ -17,6 +17,7 @@
import pysam
import pandas as pd
from numpy import log10, nansum, nanmax
from sys import stdout


def _get_query_entity_name(feature, query_group_by):
Expand Down Expand Up @@ -115,7 +116,7 @@ def mu_query(pairs, query, outfile, query_group_by, ovr_frac, raw_mu_in,
query_results = float_cleanup(query_results, maxfloat, 1)

# Write query to output file
if gzip and not outfile.endswith('.gz'):
outfile = outfile + '.gz'
if outfile in 'stdout /dev/stdout -'.split():
outfile = stdout
query_results.to_csv(outfile, header=True, index=False, sep='\t')
