From 0a7c01f9d186e36bb37cda22583e394fd0611e29 Mon Sep 17 00:00:00 2001 From: Yaoyx Date: Thu, 15 Feb 2024 12:25:14 -0800 Subject: [PATCH 1/6] 1. use n_elem for calculating count.avg 2. store count.avg.smoothed and count.avg.smoothed.agg in cvd table --- cooltools/api/expected.py | 17 ++++++++++------ cooltools/sandbox/expected_smoothing.py | 26 ++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py index 8d2fbeac..eeacd598 100644 --- a/cooltools/api/expected.py +++ b/cooltools/api/expected.py @@ -256,7 +256,8 @@ def _make_diag_table(n_bins, bad_locs): ) diags = diags[diags["n_elem"] > 0] - diags = diags.drop("n_elem", axis=1) + # keep diags["n_elem"] for calculating count.avg + # diags = diags.drop("n_elem", axis=1) return diags.astype(int) @@ -1009,7 +1010,11 @@ def expected_cis( # calculate actual averages by dividing sum by n_valid: for key in chain(["count"], transforms): - result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID] + if key == "count": + result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_elem"] + else: + result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID] + # additional smoothing and aggregating options would add columns only, not replace them if smooth: @@ -1017,9 +1022,9 @@ def expected_cis( result, sigma_log10=smooth_sigma, ) - # add smoothed columns to the result (only balanced for now) + # add smoothed columns to the result (only balanced for now) (include count as well) result = result.merge( - result_smooth[["balanced.avg.smoothed", _DIST]], + result_smooth[["balanced.avg.smoothed", "count.avg.smoothed", _DIST]], on=[_REGION1, _REGION2, _DIST], how="left", ) @@ -1028,10 +1033,10 @@ def expected_cis( result, groupby=None, sigma_log10=smooth_sigma, - ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"}) + ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"}) # add smoothed columns to the result result = result.merge( - result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]], + result_smooth_agg[["balanced.avg.smoothed.agg", "count.avg.smoothed.agg", _DIST]], on=[ _DIST, ], diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py index 630675cc..18c354b9 100644 --- a/cooltools/sandbox/expected_smoothing.py +++ b/cooltools/sandbox/expected_smoothing.py @@ -10,6 +10,9 @@ "n_contacts": "balanced.sum", "contact_freq": "balanced.avg", "smooth_suffix": ".smoothed", + "n_pixels_tot": "n_elem", + "n_contacts_raw": "count.sum", + "contact_freq_raw": "count.avg", } @@ -182,6 +185,8 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non { cols["n_pixels"]: "sum", cols["n_contacts"]: "sum", + cols["n_pixels_tot"]: "sum", + cols["n_contacts_raw"]: "sum", } ) .reset_index() @@ -198,6 +203,18 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non points_per_sigma=points_per_sigma, ) + smoothed_raw_sum, smoothed_n_elem = log_smooth( + cvd_smoothed[cols["dist"]].values.astype(np.float64), + [ + cvd_smoothed[cols["n_contacts_raw"]].values.astype(np.float64), + cvd_smoothed[cols["n_pixels_tot"]].values.astype(np.float64), + ], + sigma_log10=sigma_log10, + window_sigma=window_sigma, + points_per_sigma=points_per_sigma, + ) + + # cvd_smoothed[cols["contact_freq"]] = cvd_smoothed[cols["n_contacts"]] / cvd_smoothed[cols["n_pixels"]] cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid @@ -207,6 +224,13 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] ) + cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem + cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum + cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = ( + cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] + / cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] + ) + return cvd_smoothed @@ -291,7 +315,7 @@ def agg_smooth_cvd( ) cvd_smoothed.drop( - [cols["n_pixels"], cols["n_contacts"]], axis="columns", inplace=True + [cols["n_pixels"], cols["n_contacts"], cols["n_pixels_tot"], cols["n_contacts_raw"]], axis="columns", inplace=True ) # cvd = cvd.drop(cols["contact_freq"], axis='columns', errors='ignore') From 323d7681e8161d750cef54a3cb643711df9e1430 Mon Sep 17 00:00:00 2001 From: Yaoyx Date: Wed, 6 Mar 2024 14:18:28 -0800 Subject: [PATCH 2/6] 1. Add a test to check output columns of expected_cis() 2. Add notes for column names in cvd table to expected_cis() and agg_smooth_cvd() 3. Add dist_bp and contact_freq columns to cvd table and keep all other columns --- cooltools/api/expected.py | 42 ++++++++++++++-- cooltools/sandbox/expected_smoothing.py | 64 +++++++++++++++---------- tests/test_expected.py | 4 ++ 3 files changed, 82 insertions(+), 28 deletions(-) diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py index eeacd598..ab4cde6e 100644 --- a/cooltools/api/expected.py +++ b/cooltools/api/expected.py @@ -945,7 +945,36 @@ def expected_cis( ------- DataFrame with summary statistic of every diagonal of every symmetric or asymmetric block: - region1, region2, diag, n_valid, count.sum count.avg, etc + + Notes + ----- + dist: + Distance in bins. + dist_bp: + Distance in basepairs. + contact_freq: + The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; + if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg. + n_elem: + Number of total pixels at a given distance. + n_valid: + Number of valid pixels (with non-NaN values after balancing) at a given distance. + count.sum: + Sum up raw contact counts of all pixels at a given distance. + balanced.sum: + Sum up balanced contact values of valid pixels at a given distance. + count.avg: + The average raw contact count of pixels at a given distance. count.sum / n_elem. + balanced.avg: + The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. + count.avg.smoothed: + Log-smoothed count.avg. + balanced.avg.smoothed: + Log-smoothed balanced.avg. + count.avg.smoothed.agg: + Aggregate Log-smoothed count.avg of all genome regions. + balanced.avg.smoothed.agg: + Aggregate Log-smoothed balanced.avg of all genome regions. """ @@ -979,7 +1008,6 @@ def expected_cis( weight1 = clr_weight_name + "1" weight2 = clr_weight_name + "2" transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]} - else: raise ValueError( "cooler is not balanced, or" @@ -1015,6 +1043,9 @@ def expected_cis( else: result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID] + # add dist_bp column, which shows distance in basepairs + result.insert(3, 'dist_bp', result[_DIST]*clr.binsize) + # additional smoothing and aggregating options would add columns only, not replace them if smooth: @@ -1024,7 +1055,7 @@ def expected_cis( ) # add smoothed columns to the result (only balanced for now) (include count as well) result = result.merge( - result_smooth[["balanced.avg.smoothed", "count.avg.smoothed", _DIST]], + result_smooth[["count.avg.smoothed", "balanced.avg.smoothed", _DIST]], on=[_REGION1, _REGION2, _DIST], how="left", ) @@ -1036,12 +1067,15 @@ def expected_cis( ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"}) # add smoothed columns to the result result = result.merge( - result_smooth_agg[["balanced.avg.smoothed.agg", "count.avg.smoothed.agg", _DIST]], + result_smooth_agg[["count.avg.smoothed.agg", "balanced.avg.smoothed.agg", _DIST]], on=[ _DIST, ], how="left", ) + + # add contact_frequency columns to the result, which is the copy of the "most processing" contact values + result.insert(4, 'contact_frequency', result.iloc[:,-1]) return result diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py index 18c354b9..46f21b44 100644 --- a/cooltools/sandbox/expected_smoothing.py +++ b/cooltools/sandbox/expected_smoothing.py @@ -3,7 +3,7 @@ import numpy as np import numba - +# See notes in the docstring of agg_smooth_cvd() DEFAULT_CVD_COLS = { "dist": "dist", "n_pixels": "n_valid", @@ -213,23 +213,20 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non window_sigma=window_sigma, points_per_sigma=points_per_sigma, ) - - - # cvd_smoothed[cols["contact_freq"]] = cvd_smoothed[cols["n_contacts"]] / cvd_smoothed[cols["n_pixels"]] - - cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid - cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum - cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = ( - cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] - / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] - ) - + cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = ( cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] / cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] ) + + cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid + cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum + cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = ( + cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] + / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] + ) return cvd_smoothed @@ -292,6 +289,37 @@ def agg_smooth_cvd( ------- cvd_smoothed : pandas.DataFrame A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed"). + + Notes + ----- + dist: + Distance in bins. + dist_bp: + Distance in basepairs. + contact_freq: + The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; + if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg. + n_elem: + Number of total pixels at a given distance. + n_valid: + Number of valid pixels (with non-NaN values after balancing) at a given distance. + count.sum: + Sum up raw contact counts of all pixels at a given distance. + balanced.sum: + Sum up balanced contact values of valid pixels at a given distance. + count.avg: + The average raw contact count of pixels at a given distance. count.sum / n_elem. + balanced.avg: + The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. + count.avg.smoothed: + Log-smoothed count.avg. + balanced.avg.smoothed: + Log-smoothed balanced.avg. + count.avg.smoothed.agg: + Aggregate Log-smoothed count.avg of all genome regions. + balanced.avg.smoothed.agg: + Aggregate Log-smoothed balanced.avg of all genome regions. + """ cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols)) @@ -314,16 +342,4 @@ def agg_smooth_cvd( cols=cols, ) - cvd_smoothed.drop( - [cols["n_pixels"], cols["n_contacts"], cols["n_pixels_tot"], cols["n_contacts_raw"]], axis="columns", inplace=True - ) - - # cvd = cvd.drop(cols["contact_freq"], axis='columns', errors='ignore') - - # cvd = cvd.merge( - # cvd_smoothed, - # on=groupby + [cols["dist"]], - # how="left", - # ) - return cvd_smoothed diff --git a/tests/test_expected.py b/tests/test_expected.py index 31ee1fe7..adce9375 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -257,6 +257,10 @@ def test_expected_cis(request): chunksize=chunksize, ignore_diags=ignore_diags, ) + + # check column names + assert list(res_symm.columns) == ['region1', 'region2', 'dist', 'dist_bp', 'contact_frequency', 'n_elem', 'n_valid', 'count.sum', 'balanced.sum', 'count.avg', 'balanced.avg', 'count.avg.smoothed', 'balanced.avg.smoothed', 'count.avg.smoothed.agg', 'balanced.avg.smoothed.agg'] + # check results for every block grouped = res_symm.groupby(["region1", "region2"]) for (name1, name2), group in grouped: From c04f1005d57454065643238fe838f7e035791860 Mon Sep 17 00:00:00 2001 From: Yaoyx Date: Thu, 7 Mar 2024 21:42:38 -0800 Subject: [PATCH 3/6] update the structure of agg_smooth_cvd() --- cooltools/api/expected.py | 110 +++++++----- cooltools/sandbox/expected_smoothing.py | 219 +++++++++++------------- tests/test_expected.py | 26 ++- 3 files changed, 184 insertions(+), 171 deletions(-) diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py index ab4cde6e..283e164c 100644 --- a/cooltools/api/expected.py +++ b/cooltools/api/expected.py @@ -25,6 +25,26 @@ _DIST = list(diag_expected_dtypes)[2] _NUM_VALID = list(diag_expected_dtypes)[3] +# See notes in the docstring of agg_smooth_cvd() +DEFAULT_BALANCED_CVD_COLS = { + "region1": _REGION1, + "region2": _REGION2, + "dist": "dist", + "n_pixels": "n_valid", + "n_contacts": "balanced.sum", + "output_prefix": "balanced.avg", +} +DEFAULT_RAW_CVD_COLS = { + "region1": _REGION1, + "region2": _REGION2, + "dist": "dist", + "n_pixels": "n_total", + "n_contacts": "count.sum", + "output_prefix": "count.avg", +} +SMOOTH_SUFFIX = ".smoothed" +SMOOTH_AGG_SUFFIX = ".smoothed.agg" + def lattice_pdist_frequencies(n, points): """ @@ -199,7 +219,7 @@ def count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y): def make_diag_table(bad_mask, span1, span2): """ - Compute the total number of elements ``n_elem`` and the number of bad + Compute the total number of elements ``n_total`` and the number of bad elements ``n_bad`` per diagonal for a single contact area encompassing ``span1`` and ``span2`` on the same genomic scaffold (cis matrix). @@ -217,7 +237,7 @@ def make_diag_table(bad_mask, span1, span2): Returns ------- diags : pandas.DataFrame - Table indexed by 'diag' with columns ['n_elem', 'n_bad']. + Table indexed by 'diag' with columns ['n_total', 'n_bad']. """ @@ -226,8 +246,8 @@ def make_diag_table(bad_mask, span1, span2): def _make_diag_table(n_bins, bad_locs): diags = pd.DataFrame(index=pd.Series(np.arange(n_bins), name=_DIST)) - diags["n_elem"] = count_all_pixels_per_diag(n_bins) - diags[_NUM_VALID] = diags["n_elem"] - count_bad_pixels_per_diag( + diags["n_total"] = count_all_pixels_per_diag(n_bins) + diags[_NUM_VALID] = diags["n_total"] - count_bad_pixels_per_diag( n_bins, bad_locs ) return diags @@ -254,10 +274,10 @@ def _make_diag_table(n_bins, bad_locs): diags.add( _make_diag_table(lo2 - hi1, where(bad_mask[hi1:lo2])), fill_value=0 ) - diags = diags[diags["n_elem"] > 0] + diags = diags[diags["n_total"] > 0] - # keep diags["n_elem"] for calculating count.avg - # diags = diags.drop("n_elem", axis=1) + # keep diags["n_total"] for calculating count.avg + # diags = diags.drop("n_total", axis=1) return diags.astype(int) @@ -303,7 +323,9 @@ def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"): regions2 = bioframe.make_viewframe( regions2, check_bounds=clr.chromsizes ).to_numpy() - except ValueError: # If there are non-unique entries in regions1/2, possible only for asymmetric expected: + except ( + ValueError + ): # If there are non-unique entries in regions1/2, possible only for asymmetric expected: regions = pd.concat( [ bioframe.make_viewframe([region], check_bounds=clr.chromsizes) @@ -939,42 +961,44 @@ def expected_cis( map_functor : callable, optional Map function to dispatch the matrix chunks to workers. If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool; - If nproc=1 this defaults the builtin map. + If nproc=1 this defaults the builtin map. Returns ------- DataFrame with summary statistic of every diagonal of every symmetric or asymmetric block: - + Notes ----- - dist: + Output DataFrame columns correspond to the following quantities: + + dist: Distance in bins. dist_bp: Distance in basepairs. contact_freq: - The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; + The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg. - n_elem: + n_total: Number of total pixels at a given distance. - n_valid: + n_valid: Number of valid pixels (with non-NaN values after balancing) at a given distance. count.sum: Sum up raw contact counts of all pixels at a given distance. - balanced.sum: - Sum up balanced contact values of valid pixels at a given distance. + balanced.sum: + Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None. count.avg: - The average raw contact count of pixels at a given distance. count.sum / n_elem. + The average raw contact count of pixels at a given distance. count.sum / n_total. balanced.avg: - The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. + The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None. count.avg.smoothed: - Log-smoothed count.avg. + Log-smoothed count.avg. Returned if smooth=True. balanced.avg.smoothed: - Log-smoothed balanced.avg. + Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None. count.avg.smoothed.agg: - Aggregate Log-smoothed count.avg of all genome regions. + Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True. balanced.avg.smoothed.agg: - Aggregate Log-smoothed balanced.avg of all genome regions. + Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None. """ @@ -1000,9 +1024,11 @@ def expected_cis( raise ValueError("view_df is not a valid viewframe or incompatible") from e # define transforms - balanced and raw ('count') for now + cols = DEFAULT_BALANCED_CVD_COLS if clr_weight_name is None: # no transforms transforms = {} + cols = DEFAULT_RAW_CVD_COLS elif is_cooler_balanced(clr, clr_weight_name): # define balanced data transform: weight1 = clr_weight_name + "1" @@ -1012,6 +1038,7 @@ def expected_cis( raise ValueError( "cooler is not balanced, or" f"balancing weight {clr_weight_name} is not available in the cooler." + f"Pass clr_weight_name=None explicitly to calculate on raw counts" ) # using try-clause to close mp.Pool properly @@ -1039,43 +1066,32 @@ def expected_cis( # calculate actual averages by dividing sum by n_valid: for key in chain(["count"], transforms): if key == "count": - result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_elem"] - else: + result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_total"] + else: result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID] # add dist_bp column, which shows distance in basepairs - result.insert(3, 'dist_bp', result[_DIST]*clr.binsize) - + result.insert(3, "dist_bp", result[_DIST] * clr.binsize) # additional smoothing and aggregating options would add columns only, not replace them if smooth: - result_smooth = expected_smoothing.agg_smooth_cvd( + result = expected_smoothing.per_region_smooth_cvd( result, sigma_log10=smooth_sigma, + cols=cols, + suffix=SMOOTH_SUFFIX, ) - # add smoothed columns to the result (only balanced for now) (include count as well) - result = result.merge( - result_smooth[["count.avg.smoothed", "balanced.avg.smoothed", _DIST]], - on=[_REGION1, _REGION2, _DIST], - how="left", - ) + if aggregate_smoothed: - result_smooth_agg = expected_smoothing.agg_smooth_cvd( + result = expected_smoothing.genome_wide_smooth_cvd( result, - groupby=None, sigma_log10=smooth_sigma, - ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"}) - # add smoothed columns to the result - result = result.merge( - result_smooth_agg[["count.avg.smoothed.agg", "balanced.avg.smoothed.agg", _DIST]], - on=[ - _DIST, - ], - how="left", + cols=cols, + suffix=SMOOTH_AGG_SUFFIX, ) - + # add contact_frequency columns to the result, which is the copy of the "most processing" contact values - result.insert(4, 'contact_frequency', result.iloc[:,-1]) + result.insert(4, "contact_frequency", result.iloc[:, -1]) return result @@ -1121,7 +1137,7 @@ def expected_trans( map_functor : callable, optional Map function to dispatch the matrix chunks to workers. If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool; - If nproc=1 this defaults the builtin map. + If nproc=1 this defaults the builtin map. Returns ------- @@ -1156,7 +1172,7 @@ def expected_trans( weight1 = clr_weight_name + "1" weight2 = clr_weight_name + "2" transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]} - + else: raise ValueError( "cooler is not balanced, or" diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py index 46f21b44..af7bd4f7 100644 --- a/cooltools/sandbox/expected_smoothing.py +++ b/cooltools/sandbox/expected_smoothing.py @@ -3,18 +3,6 @@ import numpy as np import numba -# See notes in the docstring of agg_smooth_cvd() -DEFAULT_CVD_COLS = { - "dist": "dist", - "n_pixels": "n_valid", - "n_contacts": "balanced.sum", - "contact_freq": "balanced.avg", - "smooth_suffix": ".smoothed", - "n_pixels_tot": "n_elem", - "n_contacts_raw": "count.sum", - "contact_freq_raw": "count.avg", -} - def _log_interp(xs, xp, fp): """ @@ -66,7 +54,7 @@ def _log_thin(xs, min_log10_step=0.1): """ xs_thinned = [xs[0]] prev = xs[0] - min_ratio = 10 ** min_log10_step + min_ratio = 10**min_log10_step for x in xs[1:]: if x > prev * min_ratio: xs_thinned.append(x) @@ -103,9 +91,9 @@ def _log_smooth_numba( hi = np.searchsorted(log_xs, cur_log_x + sigma_log10 * window_sigma) smooth_weights = np.exp( -((cur_log_x - log_xs[lo:hi]) ** 2) / 2 / sigma_log10 / sigma_log10 - ) + ) norm = smooth_weights.sum() - + if norm > 0: smooth_weights /= norm @@ -156,9 +144,7 @@ def log_smooth( if ys.ndim not in (1, 2): raise ValueError('ys must be either a 1D vector or a "tall" 2D matrix') if xs.shape[0] != ys.shape[-1]: - raise ValueError( - "xs and ys must have the same number of observations" - ) + raise ValueError("xs and ys must have the same number of observations") ys = ys[np.newaxis, :] if ys.ndim == 1 else ys @@ -176,17 +162,15 @@ def log_smooth( return ys_smoothed -def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=None): - cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols)) - +def _smooth_cvd_group( + cvd, sigma_log10, window_sigma, points_per_sigma, cols=None, suffix="" +): cvd_smoothed = ( cvd.groupby(cols["dist"]) .agg( { cols["n_pixels"]: "sum", cols["n_contacts"]: "sum", - cols["n_pixels_tot"]: "sum", - cols["n_contacts_raw"]: "sum", } ) .reset_index() @@ -203,77 +187,31 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non points_per_sigma=points_per_sigma, ) - smoothed_raw_sum, smoothed_n_elem = log_smooth( - cvd_smoothed[cols["dist"]].values.astype(np.float64), - [ - cvd_smoothed[cols["n_contacts_raw"]].values.astype(np.float64), - cvd_smoothed[cols["n_pixels_tot"]].values.astype(np.float64), - ], - sigma_log10=sigma_log10, - window_sigma=window_sigma, - points_per_sigma=points_per_sigma, - ) - - cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem - cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum - cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = ( - cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] - / cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] - ) - - cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid - cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum - cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = ( - cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] - / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] + cvd_smoothed[cols["n_pixels"] + suffix] = smoothed_n_valid + cvd_smoothed[cols["n_contacts"] + suffix] = smoothed_balanced_sum + cvd_smoothed[cols["output_prefix"] + suffix] = ( + cvd_smoothed[cols["n_contacts"] + suffix] + / cvd_smoothed[cols["n_pixels"] + suffix] ) return cvd_smoothed -def _agg_smooth_cvd( - cvd, groupby, sigma_log10, window_sigma, points_per_sigma, cols=None -): - if groupby: - cvd = cvd.set_index(groupby).groupby(groupby).apply( - _smooth_cvd_group, - sigma_log10=sigma_log10, - window_sigma=window_sigma, - points_per_sigma=points_per_sigma, - cols=cols, - ) - else: - cvd = _smooth_cvd_group( - cvd, - sigma_log10=sigma_log10, - window_sigma=window_sigma, - points_per_sigma=points_per_sigma, - cols=cols, - ) - - return cvd - - -def agg_smooth_cvd( +def genome_wide_smooth_cvd( cvd, - groupby=["region1", "region2"], sigma_log10=0.1, window_sigma=5, points_per_sigma=10, cols=None, + suffix="", ): """ - Smooth the contact-vs-distance curve in the log-space. + Smooth the contact-vs-distance curve aggregated across all regions in log-space. Parameters ---------- cvd : pandas.DataFrame A dataframe with the expected values in the cooltools.expected format. - groupby : list of str - The list of column names to split the input table before smoothing. - This argument can be used to calculate separate smoothed CvD curves for - each region, Hi-C read orientation, etc. - If None or empty, a single CvD curve is calculated for the whole table. sigma_log10 : float, optional The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 window_sigma : int, optional @@ -287,59 +225,102 @@ def agg_smooth_cvd( Returns ------- - cvd_smoothed : pandas.DataFrame + cvd : pandas.DataFrame A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed"). Notes ----- - dist: - Distance in bins. - dist_bp: - Distance in basepairs. - contact_freq: - The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; - if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg. - n_elem: - Number of total pixels at a given distance. - n_valid: - Number of valid pixels (with non-NaN values after balancing) at a given distance. - count.sum: - Sum up raw contact counts of all pixels at a given distance. - balanced.sum: - Sum up balanced contact values of valid pixels at a given distance. - count.avg: - The average raw contact count of pixels at a given distance. count.sum / n_elem. - balanced.avg: - The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. - count.avg.smoothed: - Log-smoothed count.avg. - balanced.avg.smoothed: - Log-smoothed balanced.avg. - count.avg.smoothed.agg: - Aggregate Log-smoothed count.avg of all genome regions. - balanced.avg.smoothed.agg: - Aggregate Log-smoothed balanced.avg of all genome regions. - - """ + Parameters in "cols" + + dist: - cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols)) + n_pixels: - if groupby is None: - groupby = [] - elif isinstance(groupby, str): - groupby = [groupby] - elif isinstance(groupby, Iterable): - groupby = list(groupby) - else: - raise ValueError("groupby must be a string, a list of strings, or None") + n_contacts: - cvd_smoothed = _agg_smooth_cvd( + contact_freq: + + """ + cvd_smoothed = _smooth_cvd_group( cvd, - groupby=groupby, sigma_log10=sigma_log10, window_sigma=window_sigma, points_per_sigma=points_per_sigma, cols=cols, + suffix=suffix, ) - return cvd_smoothed + # add aggeragated smoothed columns to the result + cvd = cvd.merge( + cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], + on=[cols["dist"]], + how="left", + ) + + return cvd + + +def per_region_smooth_cvd( + cvd, + sigma_log10=0.1, + window_sigma=5, + points_per_sigma=10, + cols=None, + suffix="", +): + """ + Smooth the contact-vs-distance curve for each region in log-space. + + Parameters + ---------- + cvd : pandas.DataFrame + A dataframe with the expected values in the cooltools.expected format. + sigma_log10 : float, optional + The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 + window_sigma : int, optional + Width of the smoothing window, expressed in sigmas, by default 5 + points_per_sigma : int, optional + If provided, smoothing is done only for `points_per_sigma` points per sigma and the + rest of the values are interpolated (this results in a major speed-up). By default 10 + cols : dict, optional + If provided, use the specified column names instead of the standard ones. + See DEFAULT_CVD_COLS variable for the format of this argument. + + Returns + ------- + cvd : pandas.DataFrame + A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed"). + + Notes + ----- + Parameters in "cols" + + dist: + + n_pixels: + + n_contacts: + + contact_freq: + + """ + cvd_smoothed = ( + cvd.set_index([cols["region1"], cols["region2"]]) + .groupby([cols["region1"], cols["region2"]]) + .apply( + _smooth_cvd_group, + sigma_log10=sigma_log10, + window_sigma=window_sigma, + points_per_sigma=points_per_sigma, + cols=cols, + suffix=suffix, + ) + ) + # add smoothed columns to the result + cvd = cvd.merge( + cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], + on=[cols["region1"], cols["region2"], cols["dist"]], + how="left", + ) + + return cvd diff --git a/tests/test_expected.py b/tests/test_expected.py index adce9375..d831b9bd 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -259,7 +259,23 @@ def test_expected_cis(request): ) # check column names - assert list(res_symm.columns) == ['region1', 'region2', 'dist', 'dist_bp', 'contact_frequency', 'n_elem', 'n_valid', 'count.sum', 'balanced.sum', 'count.avg', 'balanced.avg', 'count.avg.smoothed', 'balanced.avg.smoothed', 'count.avg.smoothed.agg', 'balanced.avg.smoothed.agg'] + assert list(res_symm.columns) == [ + "region1", + "region2", + "dist", + "dist_bp", + "contact_frequency", + "n_elem", + "n_valid", + "count.sum", + "balanced.sum", + "count.avg", + "balanced.avg", + "count.avg.smoothed", + "balanced.avg.smoothed", + "count.avg.smoothed.agg", + "balanced.avg.smoothed.agg", + ] # check results for every block grouped = res_symm.groupby(["region1", "region2"]) @@ -304,7 +320,7 @@ def test_expected_cis(request): desired=desired_expected, equal_nan=True, ) - + # check multiprocessed result res_all_pooled = cooltools.api.expected.expected_cis( clr, @@ -313,7 +329,7 @@ def test_expected_cis(request): clr_weight_name=clr_weight_name, chunksize=chunksize, ignore_diags=ignore_diags, - nproc=3 + nproc=3, ) assert res_all.equals(res_all_pooled) @@ -368,7 +384,7 @@ def test_expected_trans(request): view_df=view_df, clr_weight_name=clr_weight_name, chunksize=chunksize, - nproc=3 + nproc=3, ) assert res.equals(res_pooled) @@ -696,4 +712,4 @@ def test_diagsum_from_array(): exp = _diagsum_symm_dense(ar, bad_bins=list(range(3, 5))) exp1 = diagsum_from_array(ar, ignore_diags=0) exp1["balanced.avg"] = exp1["balanced.sum"] / exp1["n_valid"] - assert np.allclose(exp, exp1["balanced.avg"].values, equal_nan=True) \ No newline at end of file + assert np.allclose(exp, exp1["balanced.avg"].values, equal_nan=True) From 2d137259987fa83d2066c6121a8efdeade9aaeca Mon Sep 17 00:00:00 2001 From: Yaoyx Date: Thu, 7 Mar 2024 21:54:11 -0800 Subject: [PATCH 4/6] update col names in test_expected.py --- tests/test_expected.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_expected.py b/tests/test_expected.py index d831b9bd..e6e86ca6 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -265,15 +265,13 @@ def test_expected_cis(request): "dist", "dist_bp", "contact_frequency", - "n_elem", + "n_total", "n_valid", "count.sum", "balanced.sum", "count.avg", "balanced.avg", - "count.avg.smoothed", "balanced.avg.smoothed", - "count.avg.smoothed.agg", "balanced.avg.smoothed.agg", ] From 2b4d03490ff62b2fc2ed7cc4dc32667e31952ea3 Mon Sep 17 00:00:00 2001 From: Yaoyx Date: Fri, 8 Mar 2024 13:40:03 -0800 Subject: [PATCH 5/6] update docstring and migragate functinos to expected.py --- cooltools/api/expected.py | 212 ++++++++++++++++++++---- cooltools/sandbox/expected_smoothing.py | 131 +-------------- tests/test_expected.py | 22 +++ 3 files changed, 202 insertions(+), 163 deletions(-) diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py index 283e164c..f20edca9 100644 --- a/cooltools/api/expected.py +++ b/cooltools/api/expected.py @@ -25,7 +25,7 @@ _DIST = list(diag_expected_dtypes)[2] _NUM_VALID = list(diag_expected_dtypes)[3] -# See notes in the docstring of agg_smooth_cvd() +# See notes in the docstring of expected_cis() and per_region_smooth_cvd() DEFAULT_BALANCED_CVD_COLS = { "region1": _REGION1, "region2": _REGION2, @@ -34,6 +34,7 @@ "n_contacts": "balanced.sum", "output_prefix": "balanced.avg", } +# See notes in the docstring of expected_cis() and genomewide_smooth_cvd() DEFAULT_RAW_CVD_COLS = { "region1": _REGION1, "region2": _REGION2, @@ -899,6 +900,141 @@ def blocksum_pairwise( ) +def genomewide_smooth_cvd( + cvd, + sigma_log10=0.1, + window_sigma=5, + points_per_sigma=10, + cols=None, + suffix=".smoothed", +): + """ + Smooth the contact-vs-distance curve aggregated across all regions in log-space. + + Parameters + ---------- + cvd : pandas.DataFrame + A dataframe with the expected values in the cooltools.expected format. + sigma_log10 : float, optional + The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 + window_sigma : int, optional + Width of the smoothing window, expressed in sigmas, by default 5 + points_per_sigma : int, optional + If provided, smoothing is done only for `points_per_sigma` points per sigma and the + rest of the values are interpolated (this results in a major speed-up). By default 10 + cols : dict, optional + If provided, use the specified column names instead of the standard ones. + See DEFAULT_CVD_COLS variable for the format of this argument. + suffix : string, optional + If provided, use the specified string as the suffix of the output column's name + + Returns + ------- + cvd : pandas.DataFrame + A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed.agg" or "balanced.avg.smoothed.agg" depends on whether balanced or not). + + Notes + ----- + Parameters in "cols" will be used: + + dist: + Indicate the name of the column that stores distance values (by default, "dist"). + n_pixels: + Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not). + n_contacts: + Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not). + output_prefix: + Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not). + """ + cvd_smoothed = expected_smoothing._smooth_cvd_group( + cvd, + sigma_log10=sigma_log10, + window_sigma=window_sigma, + points_per_sigma=points_per_sigma, + cols=cols, + suffix=suffix, + ) + + # add aggeragated smoothed columns to the result + cvd = cvd.merge( + cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], + on=[cols["dist"]], + how="left", + ) + + return cvd + + +def per_region_smooth_cvd( + cvd, + sigma_log10=0.1, + window_sigma=5, + points_per_sigma=10, + cols=None, + suffix="", +): + """ + Smooth the contact-vs-distance curve for each region in log-space. + + Parameters + ---------- + cvd : pandas.DataFrame + A dataframe with the expected values in the cooltools.expected format. + sigma_log10 : float, optional + The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 + window_sigma : int, optional + Width of the smoothing window, expressed in sigmas, by default 5 + points_per_sigma : int, optional + If provided, smoothing is done only for `points_per_sigma` points per sigma and the + rest of the values are interpolated (this results in a major speed-up). By default 10 + cols : dict, optional + If provided, use the specified column names instead of the standard ones. + See DEFAULT_CVD_COLS variable for the format of this argument. + suffix : string, optional + If provided, use the specified string as the suffix of the output column's name + + Returns + ------- + cvd : pandas.DataFrame + A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed" or "balanced.avg.smoothed" depends on whether balanced or not). + + Notes + ----- + Parameters in "cols" will be used: + + region1: + Indicate the name of the column that stores region1's locations (by default, "region1"). + region2: + Indicate the name of the column that stores region2's locations (by default, "region2"). + dist: + Indicate the name of the column that stores distance values (by default, "dist"). + n_pixels: + Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not). + n_contacts: + Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not). + output_prefix: + Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not). + """ + cvd_smoothed = ( + cvd.set_index([cols["region1"], cols["region2"]]) + .groupby([cols["region1"], cols["region2"]]) + .apply( + expected_smoothing._smooth_cvd_group, + sigma_log10=sigma_log10, + window_sigma=window_sigma, + points_per_sigma=points_per_sigma, + cols=cols, + suffix=suffix, + ) + ) + # add smoothed columns to the result + cvd = cvd.merge( + cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], + on=[cols["region1"], cols["region2"], cols["dist"]], + how="left", + ) + + return cvd # user-friendly wrapper for diagsum_symm and diagsum_pairwise - part of new "public" API @pool_decorator def expected_cis( @@ -970,35 +1106,45 @@ def expected_cis( Notes ----- - Output DataFrame columns correspond to the following quantities: - - dist: - Distance in bins. - dist_bp: - Distance in basepairs. - contact_freq: - The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; - if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg. - n_total: - Number of total pixels at a given distance. - n_valid: - Number of valid pixels (with non-NaN values after balancing) at a given distance. - count.sum: - Sum up raw contact counts of all pixels at a given distance. - balanced.sum: - Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None. - count.avg: - The average raw contact count of pixels at a given distance. count.sum / n_total. - balanced.avg: - The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None. - count.avg.smoothed: - Log-smoothed count.avg. Returned if smooth=True. - balanced.avg.smoothed: - Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None. - count.avg.smoothed.agg: - Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True. - balanced.avg.smoothed.agg: - Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None. + When clr_weight_name=None, smooth=False, aggregate_smoothed=False, the minimum output DataFrame includes the following quantities (columns): + + dist: + Distance in bins. + dist_bp: + Distance in basepairs. + contact_freq: + The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; + if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg. + n_total: + Number of total pixels at a given distance. + n_valid: + Number of valid pixels (with non-NaN values after balancing) at a given distance. + count.sum: + Sum up raw contact counts of all pixels at a given distance. + count.avg: + The average raw contact count of pixels at a given distance. count.sum / n_total. + + When clr_weigh_name is provided (by default, clr_weigh_name="weight"), the following quantities (columns) will be added into the DataFrame: + + balanced.sum: + Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None. + balanced.avg: + The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None. + + When smooth=True, the following quantities (columns) will be added into the DataFrame: + + count.avg.smoothed: + Log-smoothed count.avg. Returned if smooth=True and clr_weight_name=None. + balanced.avg.smoothed: + Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None. + + When aggregate_smoothed=True, the following quantities (columns) will be added into the DataFrame: + count.avg.smoothed.agg: + Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name=None. + balanced.avg.smoothed.agg: + Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None. + + By default, clr_weight_name="weight", smooth=True, aggregate_smoothed=True, the output DataFrame includes all quantities (columns). """ @@ -1038,7 +1184,7 @@ def expected_cis( raise ValueError( "cooler is not balanced, or" f"balancing weight {clr_weight_name} is not available in the cooler." - f"Pass clr_weight_name=None explicitly to calculate on raw counts" + f"Pass clr_weight_name=None explicitly to calculate expected on raw counts" ) # using try-clause to close mp.Pool properly @@ -1075,7 +1221,7 @@ def expected_cis( # additional smoothing and aggregating options would add columns only, not replace them if smooth: - result = expected_smoothing.per_region_smooth_cvd( + result = per_region_smooth_cvd( result, sigma_log10=smooth_sigma, cols=cols, @@ -1083,7 +1229,7 @@ def expected_cis( ) if aggregate_smoothed: - result = expected_smoothing.genome_wide_smooth_cvd( + result = genomewide_smooth_cvd( result, sigma_log10=smooth_sigma, cols=cols, diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py index af7bd4f7..385d22f9 100644 --- a/cooltools/sandbox/expected_smoothing.py +++ b/cooltools/sandbox/expected_smoothing.py @@ -194,133 +194,4 @@ def _smooth_cvd_group( / cvd_smoothed[cols["n_pixels"] + suffix] ) - return cvd_smoothed - - -def genome_wide_smooth_cvd( - cvd, - sigma_log10=0.1, - window_sigma=5, - points_per_sigma=10, - cols=None, - suffix="", -): - """ - Smooth the contact-vs-distance curve aggregated across all regions in log-space. - - Parameters - ---------- - cvd : pandas.DataFrame - A dataframe with the expected values in the cooltools.expected format. - sigma_log10 : float, optional - The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 - window_sigma : int, optional - Width of the smoothing window, expressed in sigmas, by default 5 - points_per_sigma : int, optional - If provided, smoothing is done only for `points_per_sigma` points per sigma and the - rest of the values are interpolated (this results in a major speed-up). By default 10 - cols : dict, optional - If provided, use the specified column names instead of the standard ones. - See DEFAULT_CVD_COLS variable for the format of this argument. - - Returns - ------- - cvd : pandas.DataFrame - A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed"). - - Notes - ----- - Parameters in "cols" - - dist: - - n_pixels: - - n_contacts: - - contact_freq: - - """ - cvd_smoothed = _smooth_cvd_group( - cvd, - sigma_log10=sigma_log10, - window_sigma=window_sigma, - points_per_sigma=points_per_sigma, - cols=cols, - suffix=suffix, - ) - - # add aggeragated smoothed columns to the result - cvd = cvd.merge( - cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], - on=[cols["dist"]], - how="left", - ) - - return cvd - - -def per_region_smooth_cvd( - cvd, - sigma_log10=0.1, - window_sigma=5, - points_per_sigma=10, - cols=None, - suffix="", -): - """ - Smooth the contact-vs-distance curve for each region in log-space. - - Parameters - ---------- - cvd : pandas.DataFrame - A dataframe with the expected values in the cooltools.expected format. - sigma_log10 : float, optional - The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 - window_sigma : int, optional - Width of the smoothing window, expressed in sigmas, by default 5 - points_per_sigma : int, optional - If provided, smoothing is done only for `points_per_sigma` points per sigma and the - rest of the values are interpolated (this results in a major speed-up). By default 10 - cols : dict, optional - If provided, use the specified column names instead of the standard ones. - See DEFAULT_CVD_COLS variable for the format of this argument. - - Returns - ------- - cvd : pandas.DataFrame - A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed"). - - Notes - ----- - Parameters in "cols" - - dist: - - n_pixels: - - n_contacts: - - contact_freq: - - """ - cvd_smoothed = ( - cvd.set_index([cols["region1"], cols["region2"]]) - .groupby([cols["region1"], cols["region2"]]) - .apply( - _smooth_cvd_group, - sigma_log10=sigma_log10, - window_sigma=window_sigma, - points_per_sigma=points_per_sigma, - cols=cols, - suffix=suffix, - ) - ) - # add smoothed columns to the result - cvd = cvd.merge( - cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], - on=[cols["region1"], cols["region2"], cols["dist"]], - how="left", - ) - - return cvd + return cvd_smoothed \ No newline at end of file diff --git a/tests/test_expected.py b/tests/test_expected.py index e6e86ca6..7804fd59 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -290,6 +290,28 @@ def test_expected_cis(request): desired=desired_expected, equal_nan=True, ) + + # check column names, when clr_weight_name = None, which is the unbalanced case + res_symm = cooltools.api.expected.expected_cis( + clr, + view_df=view_df, + clr_weight_name=None, + chunksize=chunksize, + ignore_diags=ignore_diags, + ) + assert list(res_symm.columns) == [ + "region1", + "region2", + "dist", + "dist_bp", + "contact_frequency", + "n_total", + "n_valid", + "count.sum", + "count.avg", + "count.avg.smoothed", + "count.avg.smoothed.agg", + ] # asymm and symm result together - engaging diagsum_pairwise res_all = cooltools.api.expected.expected_cis( From 1ec9eae1d4ee359ecde4c902f7e4891893e62bec Mon Sep 17 00:00:00 2001 From: Yaoyx Date: Fri, 8 Mar 2024 14:57:27 -0800 Subject: [PATCH 6/6] Update docstring --- cooltools/api/expected.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py index f20edca9..9aa60401 100644 --- a/cooltools/api/expected.py +++ b/cooltools/api/expected.py @@ -931,20 +931,20 @@ def genomewide_smooth_cvd( Returns ------- cvd : pandas.DataFrame - A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed.agg" or "balanced.avg.smoothed.agg" depends on whether balanced or not). + A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed.agg" if balanced, or "count.avg.smoothed.agg" if raw). Notes ----- Parameters in "cols" will be used: dist: - Indicate the name of the column that stores distance values (by default, "dist"). + Name of the column that stores distance values (by default, "dist"). n_pixels: - Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not). + Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw). n_contacts: - Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not). + Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw). output_prefix: - Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not). + Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw). """ cvd_smoothed = expected_smoothing._smooth_cvd_group( cvd, @@ -996,24 +996,24 @@ def per_region_smooth_cvd( Returns ------- cvd : pandas.DataFrame - A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed" or "balanced.avg.smoothed" depends on whether balanced or not). + A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed" if balanced, or "count.avg.smoothed" if raw). Notes ----- Parameters in "cols" will be used: region1: - Indicate the name of the column that stores region1's locations (by default, "region1"). + Name of the column that stores region1's locations (by default, "region1"). region2: - Indicate the name of the column that stores region2's locations (by default, "region2"). + Name of the column that stores region2's locations (by default, "region2"). dist: - Indicate the name of the column that stores distance values (by default, "dist"). + Name of the column that stores distance values (by default, "dist"). n_pixels: - Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not). + Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw). n_contacts: - Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not). + Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw). output_prefix: - Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not). + Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw). """ cvd_smoothed = ( cvd.set_index([cols["region1"], cols["region2"]])