From a5341aa03f1bbcc1087983f2919602d4f25c333a Mon Sep 17 00:00:00 2001 From: Elias Friman <40932182+efriman@users.noreply.github.com> Date: Wed, 21 Sep 2022 20:13:16 +0100 Subject: [PATCH] Added clr_weight_name argument to coverage to calculate balanced coverage (#385) * Added `clr_weight_name` argument to coverage to calculate balanced coverage * coverage function API change - `store_names` -> `store_prefix` * Added tests --- cooltools/api/coverage.py | 67 ++++++++++++++++++++++++++++++++------- cooltools/cli/coverage.py | 32 ++++++++++++++----- tests/test_coverage.py | 33 +++++++++++++++++++ 3 files changed, 112 insertions(+), 20 deletions(-) diff --git a/cooltools/api/coverage.py b/cooltools/api/coverage.py index 2cb4630a..6b8342a3 100644 --- a/cooltools/api/coverage.py +++ b/cooltools/api/coverage.py @@ -1,5 +1,17 @@ import numpy as np import cooler.tools +from ..lib.checks import is_cooler_balanced + + +def _apply_balancing(chunk, bias, balanced_column_name='balanced'): + """ + Multiply raw pixel counts by the balancing bias and return a modified + chunk with an additional column named balanced_column_name + """ + pixels = chunk["pixels"] + chunk['pixels'][balanced_column_name] = bias[pixels["bin1_id"]] * bias[pixels["bin2_id"]] * pixels["count"] + # returning modified chunks with an additional column + return chunk def _zero_diags(chunk, n_diags): @@ -55,15 +67,17 @@ def coverage( chunksize=int(1e7), map=map, use_lock=False, + clr_weight_name=None, store=False, - store_names=["cis_raw_cov", "tot_raw_cov"], + store_prefix="cov", ): """ Calculate the sums of cis and genome-wide contacts (aka coverage aka marginals) for a sparse Hi-C contact map in Cooler HDF5 format. - Note that the sum(tot_cov) from this function is two times the number of reads - contributing to the cooler, as each side contributes to the coverage. + Note that for raw coverage (i.e. clr_weight_name=None) the sum(tot_cov) from this + function is two times the number of reads contributing to the cooler, + as each side contributes to the coverage. Parameters ---------- @@ -76,14 +90,18 @@ def coverage( Map function to dispatch the matrix chunks to workers. Default is the builtin ``map``, but alternatives include parallel map implementations from a multiprocessing pool. + clr_weight_name : str + Name of the weight column. Specify to calculate coverage of balanced cooler. ignore_diags : int, optional Drop elements occurring on the first ``ignore_diags`` diagonals of the matrix (including the main diagonal). If None, equals the number of diagonals ignored during IC balancing. store : bool, optional - If True, store the results in the file when finished. Default is False. - store_names : list, optional - Names of the columns of the bin table to save cis and total coverages. + If True, store the results in the input cooler file when finished. If clr_weight_name=None, + also stores total cis counts in the cooler info. Default is False. + store_prefix : str, optional + Name prefix of the columns of the bin table to save cis and total coverages. + Will add suffixes _cis and _tot, as well as _raw in the default case or _clr_weight_name if specified. Returns ------- @@ -102,24 +120,49 @@ def coverage( raise ValueError( "Please, specify ignore_diags and/or IC balance this cooler! Cannot access the value used in IC balancing. " ) - + if clr_weight_name is None: + # summing raw pixel counts + pixel_weight_key = "count" + elif is_cooler_balanced(clr, clr_weight_name): + # initialize balancing bias and masking + bias = clr.bins()[clr_weight_name][:] + bias_na_mask = np.isnan(bias) # remember masked bins + bias = np.nan_to_num(bias) + # summing balanced pixels + pixel_weight_key = "balanced" + else: + raise ValueError( + "cooler is not balanced, or" + f"balancing weight {clr_weight_name} is not available in the cooler." + ) + chunks = cooler.tools.split(clr, chunksize=chunksize, map=map, use_lock=use_lock) if ignore_diags: chunks = chunks.pipe(_zero_diags, n_diags=ignore_diags) + if clr_weight_name is not None: + chunks = chunks.pipe(_apply_balancing, bias=bias, balanced_column_name=pixel_weight_key) + n_bins = clr.info["nbins"] - covs = chunks.pipe(_get_chunk_coverage).reduce(np.add, np.zeros((2, n_bins))) - covs = covs[0].astype(int), covs[1].astype(int) - + covs = chunks.pipe(_get_chunk_coverage, pixel_weight_key=pixel_weight_key).reduce(np.add, np.zeros(n_bins)) + if clr_weight_name is None: + covs = covs.astype(int) + store_names = [f"{store_prefix}_cis_raw", f"{store_prefix}_tot_raw" ] + else: + covs[:, bias_na_mask] = np.nan + store_names = [f"{store_prefix}_cis_{clr_weight_name}", f"{store_prefix}_tot_{clr_weight_name}" ] + if store: with clr.open("r+") as grp: + dtype = int if clr_weight_name is None else float for store_name, cov_arr in zip(store_names, covs): if store_name in grp["bins"]: del grp["bins"][store_name] h5opts = dict(compression="gzip", compression_opts=6) grp["bins"].create_dataset( - store_name, data=cov_arr, **h5opts, dtype=int + store_name, data=cov_arr, **h5opts, dtype=dtype ) - grp.attrs.create("cis", np.sum(covs[0]) // 2, dtype=int) + if clr_weight_name is None: + grp.attrs.create("cis", np.sum(covs[0]) // 2, dtype=int) return covs diff --git a/cooltools/cli/coverage.py b/cooltools/cli/coverage.py index b9b4b7a1..98d82fdb 100644 --- a/cooltools/cli/coverage.py +++ b/cooltools/cli/coverage.py @@ -30,8 +30,10 @@ ) @click.option( "--store", - help="Append columns with coverage (cis_raw_cov, tot_raw_cov) " - " to the cooler bin table.", + help="Append columns with coverage (cov_cis_raw, cov_tot_raw), or" + " (cov_cis_clr_weight_name, cov_tot_clr_weight_name) if calculating" + " balanced coverage, to the cooler bin table. If clr_weight_name=None," + " also stores total cis counts in the cooler info", is_flag=True, ) @click.option( @@ -49,6 +51,14 @@ is_flag=True, default=False, ) +@click.option( + "--clr_weight_name", + help="Name of the weight column. Specify to calculate coverage of" + " balanced cooler.", + type=str, + default=None, + show_default=False, +) @click.option( "-p", "--nproc", @@ -58,7 +68,7 @@ type=int, ) def coverage( - cool_path, output, ignore_diags, store, chunksize, bigwig, nproc, + cool_path, output, ignore_diags, store, chunksize, bigwig, clr_weight_name, nproc, ): """ Calculate the sums of cis and genome-wide contacts (aka coverage aka marginals) for @@ -80,15 +90,21 @@ def coverage( try: cis_cov, tot_cov = api.coverage.coverage( - clr, ignore_diags=ignore_diags, chunksize=chunksize, map=_map, store=store + clr, ignore_diags=ignore_diags, chunksize=chunksize, map=_map, store=store, clr_weight_name=clr_weight_name ) finally: if nproc > 1: pool.close() coverage_table = clr.bins()[:][["chrom", "start", "end"]] - coverage_table["cis_raw_cov"] = cis_cov.astype(int) - coverage_table["tot_raw_cov"] = tot_cov.astype(int) + if clr_weight_name is None: + store_names = ["cov_cis_raw", "cov_tot_raw"] + coverage_table[store_names[0]] = cis_cov.astype(int) + coverage_table[store_names[1]] = tot_cov.astype(int) + else: + store_names = [f"cov_cis_{clr_weight_name}", f"cov_tot_{clr_weight_name}"] + coverage_table[store_names[0]] = cis_cov.astype(float) + coverage_table[store_names[1]] = tot_cov.astype(float) # output to file if specified: if output: @@ -103,11 +119,11 @@ def coverage( coverage_table, clr.chromsizes, f"{output}.cis.bw", - value_field="cis_raw_cov", + value_field=store_names[0], ) bioframe.to_bigwig( coverage_table, clr.chromsizes, f"{output}.tot.bw", - value_field="tot_raw_cov", + value_field=store_names[1], ) diff --git a/tests/test_coverage.py b/tests/test_coverage.py index 9920f54f..f79fb990 100644 --- a/tests/test_coverage.py +++ b/tests/test_coverage.py @@ -53,3 +53,36 @@ def test_coverage_symmetric_upper(request): assert (tot_cov == np.array([3, 1, 2])).all() assert clr.info["cis"] == 1 assert clr.info["sum"] == 3 + +def test_balanced_coverage(request): + # perform test: + clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")) + cis_cov_weight, tot_cov_weight = cooltools.api.coverage.coverage( + clr, ignore_diags=2, chunksize=int(1e7), clr_weight_name="weight" + ) + + # Test that mean total balanced coverage is 1.0 + assert np.nanmean(tot_cov_weight) == 1.0 + + # Generate test matrix with weights + bins=pd.DataFrame( + [["chr1", 0, 1, 0.5], + ["chr1", 1, 2, 1], + ["chrX", 1, 2, 0.2], + ["chrX", 2, 3, np.nan]], + columns=["chrom", "start", "end", "weight"], + ) + + pixels = pd.DataFrame( + [[0, 1, 1], [0, 2, 2], [1, 3, 2], [2, 3, 1]], + columns=["bin1_id", "bin2_id", "count"] + ) + + clr_file = op.join(request.fspath.dirname, "data/test_coverage.cool") + cooler.create_cooler(clr_file, bins, pixels) + clr = cooler.Cooler(clr_file) + cis_cov_weight, tot_cov_weight = cooltools.coverage(clr, ignore_diags=0, store=True, clr_weight_name="weight") + assert np.allclose(cis_cov_weight, np.array([0.5, 0.5, 0, np.nan]), + equal_nan=True) + assert np.allclose(tot_cov_weight, np.array([0.7, 0.5, 0.2, np.nan]), + equal_nan=True)