Skip to content

Commit

Permalink
Added clr_weight_name argument to coverage to calculate balanced cove…
Browse files Browse the repository at this point in the history
…rage (#385)

* Added `clr_weight_name` argument to coverage to calculate balanced coverage
* coverage function API change - `store_names` -> `store_prefix`
* Added tests
  • Loading branch information
efriman authored Sep 21, 2022
1 parent 2ba3c6e commit a5341aa
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 20 deletions.
67 changes: 55 additions & 12 deletions cooltools/api/coverage.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
import numpy as np
import cooler.tools
from ..lib.checks import is_cooler_balanced


def _apply_balancing(chunk, bias, balanced_column_name='balanced'):
"""
Multiply raw pixel counts by the balancing bias and return a modified
chunk with an additional column named balanced_column_name
"""
pixels = chunk["pixels"]
chunk['pixels'][balanced_column_name] = bias[pixels["bin1_id"]] * bias[pixels["bin2_id"]] * pixels["count"]
# returning modified chunks with an additional column
return chunk


def _zero_diags(chunk, n_diags):
Expand Down Expand Up @@ -55,15 +67,17 @@ def coverage(
chunksize=int(1e7),
map=map,
use_lock=False,
clr_weight_name=None,
store=False,
store_names=["cis_raw_cov", "tot_raw_cov"],
store_prefix="cov",
):

"""
Calculate the sums of cis and genome-wide contacts (aka coverage aka marginals) for
a sparse Hi-C contact map in Cooler HDF5 format.
Note that the sum(tot_cov) from this function is two times the number of reads
contributing to the cooler, as each side contributes to the coverage.
Note that for raw coverage (i.e. clr_weight_name=None) the sum(tot_cov) from this
function is two times the number of reads contributing to the cooler,
as each side contributes to the coverage.
Parameters
----------
Expand All @@ -76,14 +90,18 @@ def coverage(
Map function to dispatch the matrix chunks to workers.
Default is the builtin ``map``, but alternatives include parallel map
implementations from a multiprocessing pool.
clr_weight_name : str
Name of the weight column. Specify to calculate coverage of balanced cooler.
ignore_diags : int, optional
Drop elements occurring on the first ``ignore_diags`` diagonals of the
matrix (including the main diagonal).
If None, equals the number of diagonals ignored during IC balancing.
store : bool, optional
If True, store the results in the file when finished. Default is False.
store_names : list, optional
Names of the columns of the bin table to save cis and total coverages.
If True, store the results in the input cooler file when finished. If clr_weight_name=None,
also stores total cis counts in the cooler info. Default is False.
store_prefix : str, optional
Name prefix of the columns of the bin table to save cis and total coverages.
Will add suffixes _cis and _tot, as well as _raw in the default case or _clr_weight_name if specified.
Returns
-------
Expand All @@ -102,24 +120,49 @@ def coverage(
raise ValueError(
"Please, specify ignore_diags and/or IC balance this cooler! Cannot access the value used in IC balancing. "
)

if clr_weight_name is None:
# summing raw pixel counts
pixel_weight_key = "count"
elif is_cooler_balanced(clr, clr_weight_name):
# initialize balancing bias and masking
bias = clr.bins()[clr_weight_name][:]
bias_na_mask = np.isnan(bias) # remember masked bins
bias = np.nan_to_num(bias)
# summing balanced pixels
pixel_weight_key = "balanced"
else:
raise ValueError(
"cooler is not balanced, or"
f"balancing weight {clr_weight_name} is not available in the cooler."
)

chunks = cooler.tools.split(clr, chunksize=chunksize, map=map, use_lock=use_lock)

if ignore_diags:
chunks = chunks.pipe(_zero_diags, n_diags=ignore_diags)

if clr_weight_name is not None:
chunks = chunks.pipe(_apply_balancing, bias=bias, balanced_column_name=pixel_weight_key)

n_bins = clr.info["nbins"]
covs = chunks.pipe(_get_chunk_coverage).reduce(np.add, np.zeros((2, n_bins)))
covs = covs[0].astype(int), covs[1].astype(int)

covs = chunks.pipe(_get_chunk_coverage, pixel_weight_key=pixel_weight_key).reduce(np.add, np.zeros(n_bins))
if clr_weight_name is None:
covs = covs.astype(int)
store_names = [f"{store_prefix}_cis_raw", f"{store_prefix}_tot_raw" ]
else:
covs[:, bias_na_mask] = np.nan
store_names = [f"{store_prefix}_cis_{clr_weight_name}", f"{store_prefix}_tot_{clr_weight_name}" ]

if store:
with clr.open("r+") as grp:
dtype = int if clr_weight_name is None else float
for store_name, cov_arr in zip(store_names, covs):
if store_name in grp["bins"]:
del grp["bins"][store_name]
h5opts = dict(compression="gzip", compression_opts=6)
grp["bins"].create_dataset(
store_name, data=cov_arr, **h5opts, dtype=int
store_name, data=cov_arr, **h5opts, dtype=dtype
)
grp.attrs.create("cis", np.sum(covs[0]) // 2, dtype=int)
if clr_weight_name is None:
grp.attrs.create("cis", np.sum(covs[0]) // 2, dtype=int)
return covs
32 changes: 24 additions & 8 deletions cooltools/cli/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@
)
@click.option(
"--store",
help="Append columns with coverage (cis_raw_cov, tot_raw_cov) "
" to the cooler bin table.",
help="Append columns with coverage (cov_cis_raw, cov_tot_raw), or"
" (cov_cis_clr_weight_name, cov_tot_clr_weight_name) if calculating"
" balanced coverage, to the cooler bin table. If clr_weight_name=None,"
" also stores total cis counts in the cooler info",
is_flag=True,
)
@click.option(
Expand All @@ -49,6 +51,14 @@
is_flag=True,
default=False,
)
@click.option(
"--clr_weight_name",
help="Name of the weight column. Specify to calculate coverage of"
" balanced cooler.",
type=str,
default=None,
show_default=False,
)
@click.option(
"-p",
"--nproc",
Expand All @@ -58,7 +68,7 @@
type=int,
)
def coverage(
cool_path, output, ignore_diags, store, chunksize, bigwig, nproc,
cool_path, output, ignore_diags, store, chunksize, bigwig, clr_weight_name, nproc,
):
"""
Calculate the sums of cis and genome-wide contacts (aka coverage aka marginals) for
Expand All @@ -80,15 +90,21 @@ def coverage(

try:
cis_cov, tot_cov = api.coverage.coverage(
clr, ignore_diags=ignore_diags, chunksize=chunksize, map=_map, store=store
clr, ignore_diags=ignore_diags, chunksize=chunksize, map=_map, store=store, clr_weight_name=clr_weight_name
)
finally:
if nproc > 1:
pool.close()

coverage_table = clr.bins()[:][["chrom", "start", "end"]]
coverage_table["cis_raw_cov"] = cis_cov.astype(int)
coverage_table["tot_raw_cov"] = tot_cov.astype(int)
if clr_weight_name is None:
store_names = ["cov_cis_raw", "cov_tot_raw"]
coverage_table[store_names[0]] = cis_cov.astype(int)
coverage_table[store_names[1]] = tot_cov.astype(int)
else:
store_names = [f"cov_cis_{clr_weight_name}", f"cov_tot_{clr_weight_name}"]
coverage_table[store_names[0]] = cis_cov.astype(float)
coverage_table[store_names[1]] = tot_cov.astype(float)

# output to file if specified:
if output:
Expand All @@ -103,11 +119,11 @@ def coverage(
coverage_table,
clr.chromsizes,
f"{output}.cis.bw",
value_field="cis_raw_cov",
value_field=store_names[0],
)
bioframe.to_bigwig(
coverage_table,
clr.chromsizes,
f"{output}.tot.bw",
value_field="tot_raw_cov",
value_field=store_names[1],
)
33 changes: 33 additions & 0 deletions tests/test_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,36 @@ def test_coverage_symmetric_upper(request):
assert (tot_cov == np.array([3, 1, 2])).all()
assert clr.info["cis"] == 1
assert clr.info["sum"] == 3

def test_balanced_coverage(request):
# perform test:
clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
cis_cov_weight, tot_cov_weight = cooltools.api.coverage.coverage(
clr, ignore_diags=2, chunksize=int(1e7), clr_weight_name="weight"
)

# Test that mean total balanced coverage is 1.0
assert np.nanmean(tot_cov_weight) == 1.0

# Generate test matrix with weights
bins=pd.DataFrame(
[["chr1", 0, 1, 0.5],
["chr1", 1, 2, 1],
["chrX", 1, 2, 0.2],
["chrX", 2, 3, np.nan]],
columns=["chrom", "start", "end", "weight"],
)

pixels = pd.DataFrame(
[[0, 1, 1], [0, 2, 2], [1, 3, 2], [2, 3, 1]],
columns=["bin1_id", "bin2_id", "count"]
)

clr_file = op.join(request.fspath.dirname, "data/test_coverage.cool")
cooler.create_cooler(clr_file, bins, pixels)
clr = cooler.Cooler(clr_file)
cis_cov_weight, tot_cov_weight = cooltools.coverage(clr, ignore_diags=0, store=True, clr_weight_name="weight")
assert np.allclose(cis_cov_weight, np.array([0.5, 0.5, 0, np.nan]),
equal_nan=True)
assert np.allclose(tot_cov_weight, np.array([0.7, 0.5, 0.2, np.nan]),
equal_nan=True)

0 comments on commit a5341aa

Please sign in to comment.