From f0740e74e8a846f975d09ecc6f44327c7e9d8be4 Mon Sep 17 00:00:00 2001
From: Ryan Collins <rlcollins@g.harvard.edu>
Date: Sat, 9 Jul 2022 18:54:16 -0400
Subject: [PATCH] Extend count-sv to work for gtf and generic BED

---
 athena/cli/master.py           |  2 +-
 athena/cli/mutrate/commands.py | 60 ------------------------
 athena/cli/utils/commands.py   | 84 +++++++++++++++++++++++++++++++++-
 athena/dosage/__init__.py      |  1 +
 athena/dosage/countsv.py       | 52 +++++++++++++++++++++
 athena/mutrate/countsv.py      | 18 +++++---
 athena/mutrate/muquery.py      |  5 +-
 7 files changed, 152 insertions(+), 70 deletions(-)
 create mode 100644 athena/dosage/__init__.py
 create mode 100644 athena/dosage/countsv.py

diff --git a/athena/cli/master.py b/athena/cli/master.py
index 072e47a..f89fc33 100644
--- a/athena/cli/master.py
+++ b/athena/cli/master.py
@@ -30,6 +30,7 @@ def utilscli():
 utilscli.add_command(utils_commands.pairbins)
 utilscli.add_command(utils_commands.featurehists)
 utilscli.add_command(utils_commands.featurestats)
+utilscli.add_command(utils_commands.countsv)
 utilscli.add_command(utils_commands.transform)
 utilscli.add_command(utils_commands.sliceremote)
 
@@ -45,7 +46,6 @@ def mutratecli():
 mutratecli.add_command(mutrate_commands.annotatebins)
 mutratecli.add_command(mutrate_commands.annotatepairs)
 mutratecli.add_command(mutrate_commands.annodecomp)
-mutratecli.add_command(mutrate_commands.countsv)
 mutratecli.add_command(mutrate_commands.mutrain)
 mutratecli.add_command(mutrate_commands.mupredict)
 mutratecli.add_command(mutrate_commands.muquery)
diff --git a/athena/cli/mutrate/commands.py b/athena/cli/mutrate/commands.py
index 0db1112..f24ed48 100644
--- a/athena/cli/mutrate/commands.py
+++ b/athena/cli/mutrate/commands.py
@@ -370,66 +370,6 @@ def annodecomp(bins, bins_outfile, parameters_outfile, precomp_model, components
                            skip_columns, maxfloat, max_pcs, stats, prefix, bgzip)
 
 
-# Intersect SVs and bins
-@click.command(name='count-sv')
-@click.argument('bins', type=click.Path(exists=True))
-@click.argument('sv', type=click.Path(exists=True))
-@click.argument('outfile')
-@click.option('--bin-format', type=click.Choice(['1D', '2D']),
-              help='Specify format of input: bins (1D) or bin-pairs (2D) [default: 2D]',
-              default='2D', required=True)
-@click.option('--binsize', type=int, default=None, help='Size of bins [default: ' +
-              'infer from spacing of start coordinates]')
-@click.option('--comparison', type=click.Choice(['overlap', 'breakpoint']),
-              help='Specification of SV-to-bin comparison [default: breakpoint]',
-              default='breakpoint', required=True)
-@click.option('-p', '--probabilities', 'probs', is_flag=True, default=False,
-              help='Annotate probability of SVs instead of count [default: count SVs]')
-@click.option('--sv-ci', type=float, default=0.95, help='Specify confidence interval ' + 
-              'implied by CIPOS and CIEND if present in SV VCF input [default: 0.95]')
-@click.option('--maxfloat', type=int, default=8, 
-              help='Maximum precision of floating-point values. [default: 8]')
-@click.option('-z', '--bgzip', is_flag=True, default=False, 
-              help='Compress output with bgzip')
-def countsv(bins, sv, outfile, bin_format, binsize, comparison, probs, sv_ci, 
-            maxfloat, bgzip):
-    """
-    Intersect SV and 1D bins or 2D bin-pairs
-    """
-
-    # Ensure --bin-format is specified
-    if bin_format not in '1D 2D'.split():
-        if bin_format is None:
-            err = 'INPUT ERROR: --bin-format is required. Options: ' + \
-                  '"overlap" or "breakpoint".'
-        else:
-            err = 'INPUT ERROR: --bin-format "{0}" not recognized. Options: ' + \
-                  '"1D" or "2D".'
-        exit(err.format(bin_format))
-    paired = (bin_format == '2D')
-
-    # Ensure --comparison is specified
-    if comparison not in 'overlap breakpoint'.split():
-        if comparison is None:
-            err = 'INPUT ERROR: --comparison is required. Options: ' + \
-                  '"overlap" or "breakpoint".'
-        else:
-            err = 'INPUT ERROR: --comparison "{0}" not recognized. Options: ' + \
-                  '"overlap" or "breakpoint".'
-        exit(err.format(comparison))
-    breakpoints = (comparison == 'breakpoint')
-
-    # Warn if --probabilities specified with --comparison breakpoint
-    if probs and comparison == 'overlap':
-      status_msg = '[{0}] athena count-sv: --probabilities is not compatible ' + \
-                   'with --comparison "overlap". Returning binary indicator of ' + \
-                   'overlap/non-overlap instead.'
-      print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
-
-    mutrate.count_sv(bins, sv, outfile, paired, binsize, breakpoints, probs, 
-                     sv_ci, maxfloat, bgzip)
-
-
 # Train mutation rate model
 @click.command(name='mu-train')
 @click.option('--training-data', type=click.Path(exists=True), required=True,
diff --git a/athena/cli/utils/commands.py b/athena/cli/utils/commands.py
index 68e38f0..bd521a3 100644
--- a/athena/cli/utils/commands.py
+++ b/athena/cli/utils/commands.py
@@ -10,7 +10,7 @@
 
 
 import click
-from athena import utils
+from athena import utils, dosage
 
 
 # VCF filtering
@@ -250,6 +250,88 @@ def featurestats(bed, outfile, skip_cols, trans_tsv, log_transform, sqrt_transfo
                       exp_transform, square_transform, boxcox_transform, maxfloat)
 
 
+# Intersect SVs and bins (or BED/GTF)
+@click.command(name='count-sv')
+@click.argument('sv', type=click.Path(exists=True))
+@click.argument('query', type=click.Path(exists=True))
+@click.option('-o', '--outfile', default='stdout', help='Path to output file. ' +
+              '[default: stdout]')
+@click.option('--query-format', type=click.Choice(['bins', 'pairs', 'bed', 'gtf']),
+              help='Specify format of input: athena bins, athena bin-pairs ' +
+              ', generic bed, or gtf [default: pairs]',
+              default='pairs', required=True)
+@click.option('--binsize', type=int, default=None, help='Size of bins. Only used ' +
+              'for --query-format "bins" or "pairs". [default: infer from spacing ' +
+              'of start coordinates]')
+@click.option('--comparison', type=click.Choice(['overlap', 'breakpoint']),
+              help='Specification of SV-to-query interval comparison. Currently ' +
+              'ignored for --query-format "bed" and "gtf". [default: "breakpoint" ' +
+              'for --query-format "bins" or "pairs" and "overlap" for "bed" or "gtf"]', 
+              default='breakpoint', required=True)
+@click.option('-p', '--probabilities', 'probs', is_flag=True, default=False,
+              help='Annotate probability of SVs instead of count. Ignored for ' +
+              '--query-format "bed" and "gtf". [default: count SVs]')
+@click.option('--group-by', help='Specify key by which queries will be grouped. ' +
+              'Only used for --query-format "bed" or "gtf". [default: "gene_name" ' +
+              '(for GTF query), fourth column (for BED4+) or chrom_start_end ' +
+              '(for BED3)]')
+@click.option('-f', '--fraction', 'ovr_frac', type=float, default=10e-9, 
+              help='Minimum overlap of query interval required for SV to be ' +
+              'counted. Only used for --query-format "bed" or "gtf". ' +
+              '[default: any overlap]')
+@click.option('--sv-ci', type=float, default=0.95, help='Specify confidence interval ' + 
+              'implied by CIPOS and CIEND if present in SV VCF input. Ignored for ' +
+              '--query-format "bed" and "gtf". [default: 0.95]')
+@click.option('--maxfloat', type=int, default=8, 
+              help='Maximum precision of floating-point values. [default: 8]')
+@click.option('-z', '--bgzip', is_flag=True, default=False, 
+              help='Compress output with bgzip (or gzip for --query-format "bed" ' +
+              'or "gtf")')
+def countsv(sv, query, outfile, query_format, binsize, comparison, probs, group_by,
+            ovr_frac, sv_ci, maxfloat, bgzip):
+    """
+    Intersect SVs with query regions
+    """
+
+    # Ensure --query-format is specified
+    if query_format not in 'bins pairs bed gtf'.split():
+        if query_format is None:
+            err = 'INPUT ERROR: --query-format is required. Options: ' + \
+                  '"bins", "pairs", "bed", or "gtf".'
+        else:
+            err = 'INPUT ERROR: --query-format "{0}" not recognized. Options: ' + \
+                  '"bins", "pairs", "bed", or "gtf".'
+        exit(err.format(bin_format))
+
+    # Ensure --comparison is specified
+    if comparison not in 'overlap breakpoint'.split():
+        if comparison is None:
+            err = 'INPUT ERROR: --comparison is required. Options: ' + \
+                  '"overlap" or "breakpoint".'
+        else:
+            err = 'INPUT ERROR: --comparison "{0}" not recognized. Options: ' + \
+                  '"overlap" or "breakpoint".'
+        exit(err.format(comparison))
+
+    # Warn if --probabilities specified with --comparison overlap
+    if probs and comparison == 'overlap':
+      status_msg = '[{0}] athena count-sv: --probabilities is not compatible ' + \
+                   'with --comparison "overlap". Returning binary indicator of ' + \
+                   'overlap/non-overlap instead.'
+      print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
+
+    # Call mutrate.count_sv_in_bins if --query-format is bins or pairs
+    if query_format in 'bins pairs'.split():
+      paired = (bin_format == 'pairs')
+      breakpoints = (comparison == 'breakpoint')
+      mutrate.count_sv_in_bins(sv, query, outfile, paired, binsize, breakpoints, 
+                               probs, sv_ci, maxfloat, bgzip)
+
+    # Otherwise, call dosage.count_sv_generic
+    else:
+      dosage.count_sv_generic(sv, query, outfile, group_by, ovr_frac, maxfloat, bgzip)
+
+
 # Transform binwise annotations collected with annotate-bins
 @click.command(name='transform')
 @click.argument('BED_in', type=click.Path(exists=True))
diff --git a/athena/dosage/__init__.py b/athena/dosage/__init__.py
new file mode 100644
index 0000000..370d064
--- /dev/null
+++ b/athena/dosage/__init__.py
@@ -0,0 +1 @@
+from .countsv import *
diff --git a/athena/dosage/countsv.py b/athena/dosage/countsv.py
new file mode 100644
index 0000000..8d8edd7
--- /dev/null
+++ b/athena/dosage/countsv.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2019-Present Ryan L. Collins <rlcollins@g.harvard.edu>
+# Distributed under terms of the MIT license.
+
+"""
+Count overlap of SVs and generic BED or GTF files
+"""
+
+
+from athena.mutrate.countsv import _load_sv_from_bed
+from athena.mutrate.muquery import _get_query_entity_name
+from athena.utils.misc import determine_filetype, vcf2bed
+import pybedtools as pbt
+import pysam
+import pandas as pd
+
+
+def count_sv_generic(sv_in, query_in, outfile, group_by, ovr_frac, maxfloat, bgzip):
+    """
+    Collect SV counts for every feature in query_in 
+    """
+
+    # Load query file as pbt.BedTool
+    qbt = pbt.BedTool(query_in)
+
+    # Load SVs and convert to pbt.BedTool
+    sv_format = determine_filetype(sv_in)
+    if 'vcf' in sv_format:
+        vcf = pysam.VariantFile(sv_in)
+        sv = vcf2bed(vcf, breakpoints=False, add_ci_to_bkpts=False)
+    elif 'bed' in sv_format:
+        sv = _load_sv_from_bed(sv_in, breakpoints=False)
+
+    # Iterate over all hits in intersection of query & SVs
+    # For each hit, write SV ID to dict per query feature
+    hit_ids = {}
+    for hit in qbt.intersect(sv, loj=True, f=ovr_frac):
+        qname  = _get_query_entity_name(hit, group_by)
+        if qname not in hit_ids.keys():
+            hit_ids[qname] = set()
+        svid = hit[-1]
+        if svid != '.':
+            hit_ids[qname].add(svid)
+
+    # Once hits have been processed, write out counts as .tsv
+    hit_counts = {qname: len(svids) for qname, svids in hit_ids.items()}
+    hit_df = pd.DataFrame.from_dict(hit_counts, orient='index').reset_index()
+    hit_df.columns = '#query n_svs'.split()
+    hit_df.to_csv(outfile, index=False, header=True, sep='\t')
+
diff --git a/athena/mutrate/countsv.py b/athena/mutrate/countsv.py
index 259b6a4..af69489 100644
--- a/athena/mutrate/countsv.py
+++ b/athena/mutrate/countsv.py
@@ -19,6 +19,7 @@
 import pandas as pd
 import numpy as np
 from scipy.stats import norm
+from sys import stdout
 
 
 def _load_sv_from_bed(sv_in, breakpoints=False):
@@ -213,10 +214,10 @@ def parse_breakpoint_hits(hits, paired, probs):
     return bkpt_res
 
 
-def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs, sv_ci, 
-             maxfloat, bgzip):
+def count_sv_in_bins(sv_in, bins_in, outfile, paired, binsize, breakpoints, 
+                     probs, sv_ci, maxfloat, bgzip):
     """
-    Master function to annotate bins_in with count (or probability) of SVs
+    Main function to annotate bins_in with count (or probability) of SVs
     """
 
     # Load bins, split bin coordinates from annotations, and retain header
@@ -270,11 +271,16 @@ def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs, sv_ci
     out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]
 
     # Save bins with SV counts
-    if 'compressed' in determine_filetype(outfile):
-        outfile = path.splitext(outfile)[0]
+    if outfile in 'stdout /dev/stdout -'.split():
+        outfile = stdout
+        outfile_is_stdout = True
+    else:
+        outfile_is_stdout = False
+        if 'compressed' in determine_filetype(outfile):
+            outfile = path.splitext(outfile)[0]
     out_df.to_csv(outfile, sep='\t', header=True, index=False)
 
     # Bgzip bins, if optioned
-    if bgzip:
+    if bgzip and not outfile_is_stdout:
         bgz(outfile)
 
diff --git a/athena/mutrate/muquery.py b/athena/mutrate/muquery.py
index c5545ca..a4855ea 100644
--- a/athena/mutrate/muquery.py
+++ b/athena/mutrate/muquery.py
@@ -17,6 +17,7 @@
 import pysam
 import pandas as pd
 from numpy import log10, nansum, nanmax
+from sys import stdout
 
 
 def _get_query_entity_name(feature, query_group_by):
@@ -115,7 +116,7 @@ def mu_query(pairs, query, outfile, query_group_by, ovr_frac, raw_mu_in,
     query_results = float_cleanup(query_results, maxfloat, 1)
     
     # Write query to output file
-    if gzip and not outfile.endswith('.gz'):
-        outfile = outfile + '.gz'
+    if outfile in 'stdout /dev/stdout -'.split():
+        outfile = stdout
     query_results.to_csv(outfile, header=True, index=False, sep='\t')