open2c · gfudenberg · Mar 10, 2024 · Feb 15, 2024 · Mar 6, 2024 · Mar 8, 2024
diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py
@@ -25,6 +25,27 @@
 _DIST = list(diag_expected_dtypes)[2]
 _NUM_VALID = list(diag_expected_dtypes)[3]
 
+# See notes in the docstring of expected_cis() and per_region_smooth_cvd() 
+DEFAULT_BALANCED_CVD_COLS = {
+    "region1": _REGION1,
+    "region2": _REGION2,
+    "dist": "dist",
+    "n_pixels": "n_valid",
+    "n_contacts": "balanced.sum",
+    "output_prefix": "balanced.avg",
+}
+# See notes in the docstring of expected_cis() and genomewide_smooth_cvd()
+DEFAULT_RAW_CVD_COLS = {
+    "region1": _REGION1,
+    "region2": _REGION2,
+    "dist": "dist",
+    "n_pixels": "n_total",
+    "n_contacts": "count.sum",
+    "output_prefix": "count.avg",
+}
+SMOOTH_SUFFIX = ".smoothed"
+SMOOTH_AGG_SUFFIX = ".smoothed.agg"
+
 
 def lattice_pdist_frequencies(n, points):
     """
@@ -199,7 +220,7 @@ def count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y):
 
 def make_diag_table(bad_mask, span1, span2):
     """
-    Compute the total number of elements ``n_elem`` and the number of bad
+    Compute the total number of elements ``n_total`` and the number of bad
     elements ``n_bad`` per diagonal for a single contact area encompassing
     ``span1`` and ``span2`` on the same genomic scaffold (cis matrix).
 
@@ -217,7 +238,7 @@ def make_diag_table(bad_mask, span1, span2):
     Returns
     -------
     diags : pandas.DataFrame
-        Table indexed by 'diag' with columns ['n_elem', 'n_bad'].
+        Table indexed by 'diag' with columns ['n_total', 'n_bad'].
 
     """
 
@@ -226,8 +247,8 @@ def make_diag_table(bad_mask, span1, span2):
 
     def _make_diag_table(n_bins, bad_locs):
         diags = pd.DataFrame(index=pd.Series(np.arange(n_bins), name=_DIST))
-        diags["n_elem"] = count_all_pixels_per_diag(n_bins)
-        diags[_NUM_VALID] = diags["n_elem"] - count_bad_pixels_per_diag(
+        diags["n_total"] = count_all_pixels_per_diag(n_bins)
+        diags[_NUM_VALID] = diags["n_total"] - count_bad_pixels_per_diag(
             n_bins, bad_locs
         )
         return diags
@@ -254,9 +275,10 @@ def _make_diag_table(n_bins, bad_locs):
             diags.add(
                 _make_diag_table(lo2 - hi1, where(bad_mask[hi1:lo2])), fill_value=0
             )
-        diags = diags[diags["n_elem"] > 0]
+        diags = diags[diags["n_total"] > 0]
 
-    diags = diags.drop("n_elem", axis=1)
+    # keep diags["n_total"] for calculating count.avg
+    # diags = diags.drop("n_total", axis=1)
     return diags.astype(int)
 
 
@@ -302,7 +324,9 @@ def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"):
             regions2 = bioframe.make_viewframe(
                 regions2, check_bounds=clr.chromsizes
             ).to_numpy()
-    except ValueError:  # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
+    except (
+        ValueError
+    ):  # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
         regions = pd.concat(
             [
                 bioframe.make_viewframe([region], check_bounds=clr.chromsizes)
@@ -876,6 +900,141 @@ def blocksum_pairwise(
     )
 
 
+def genomewide_smooth_cvd(
+    cvd,
+    sigma_log10=0.1,
+    window_sigma=5,
+    points_per_sigma=10,
+    cols=None,
+    suffix=".smoothed",
+):
+    """
+    Smooth the contact-vs-distance curve aggregated across all regions in log-space.
+
+    Parameters
+    ----------
+    cvd : pandas.DataFrame
+        A dataframe with the expected values in the cooltools.expected format.
+    sigma_log10 : float, optional
+        The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
+    window_sigma : int, optional
+        Width of the smoothing window, expressed in sigmas, by default 5
+    points_per_sigma : int, optional
+        If provided, smoothing is done only for `points_per_sigma` points per sigma and the
+        rest of the values are interpolated (this results in a major speed-up). By default 10
+    cols : dict, optional
+        If provided, use the specified column names instead of the standard ones.
+        See DEFAULT_CVD_COLS variable for the format of this argument.
+    suffix : string, optional
+        If provided, use the specified string as the suffix of the output column's name
+
+    Returns
+    -------
+    cvd : pandas.DataFrame
+        A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed.agg" if balanced, or "count.avg.smoothed.agg" if raw).
+
+    Notes
+    -----
+    Parameters in "cols" will be used:
+
+    dist:
+        Name of the column that stores distance values (by default, "dist").
+    n_pixels:
+        Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw).
+    n_contacts:
+        Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw).
+    output_prefix:
+        Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw).
+    """
+    cvd_smoothed = expected_smoothing._smooth_cvd_group(
+        cvd,
+        sigma_log10=sigma_log10,
+        window_sigma=window_sigma,
+        points_per_sigma=points_per_sigma,
+        cols=cols,
+        suffix=suffix,
+    )
+
+    # add aggeragated smoothed columns to the result
+    cvd = cvd.merge(
+        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
+        on=[cols["dist"]],
+        how="left",
+    )
+
+    return cvd
+
+
+def per_region_smooth_cvd(
+    cvd,
+    sigma_log10=0.1,
+    window_sigma=5,
+    points_per_sigma=10,
+    cols=None,
+    suffix="",
+):
+    """
+    Smooth the contact-vs-distance curve for each region in log-space.
+
+    Parameters
+    ----------
+    cvd : pandas.DataFrame
+        A dataframe with the expected values in the cooltools.expected format.
+    sigma_log10 : float, optional
+        The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
+    window_sigma : int, optional
+        Width of the smoothing window, expressed in sigmas, by default 5
+    points_per_sigma : int, optional
+        If provided, smoothing is done only for `points_per_sigma` points per sigma and the
+        rest of the values are interpolated (this results in a major speed-up). By default 10
+    cols : dict, optional
+        If provided, use the specified column names instead of the standard ones.
+        See DEFAULT_CVD_COLS variable for the format of this argument.
+    suffix : string, optional
+        If provided, use the specified string as the suffix of the output column's name
+
+    Returns
+    -------
+    cvd : pandas.DataFrame
+        A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed" if balanced, or "count.avg.smoothed" if raw).
+
+    Notes
+    -----
+    Parameters in "cols" will be used:
+
+    region1:
+        Name of the column that stores region1's locations (by default, "region1").
+    region2:
+        Name of the column that stores region2's locations (by default, "region2").
+    dist:
+        Name of the column that stores distance values (by default, "dist").
+    n_pixels:
+        Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw).
+    n_contacts:
+        Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw).
+    output_prefix:
+        Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw).
+    """
+    cvd_smoothed = (
+        cvd.set_index([cols["region1"], cols["region2"]])
+        .groupby([cols["region1"], cols["region2"]])
+        .apply(
+            expected_smoothing._smooth_cvd_group,
+            sigma_log10=sigma_log10,
+            window_sigma=window_sigma,
+            points_per_sigma=points_per_sigma,
+            cols=cols,
+            suffix=suffix,
+        )
+    )
+    # add smoothed columns to the result
+    cvd = cvd.merge(
+        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
+        on=[cols["region1"], cols["region2"], cols["dist"]],
+        how="left",
+    )
+
+    return cvd
 # user-friendly wrapper for diagsum_symm and diagsum_pairwise - part of new "public" API
 @pool_decorator
 def expected_cis(
@@ -938,13 +1097,54 @@ def expected_cis(
     map_functor : callable, optional
         Map function to dispatch the matrix chunks to workers.
         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
-        If nproc=1 this defaults the builtin map. 
+        If nproc=1 this defaults the builtin map.
 
     Returns
     -------
     DataFrame with summary statistic of every diagonal of every symmetric
     or asymmetric block:
-    region1, region2, diag, n_valid, count.sum count.avg, etc
+
+    Notes
+    -----
+    When clr_weight_name=None, smooth=False, aggregate_smoothed=False, the minimum output DataFrame includes the following quantities (columns):
+
+        dist:
+            Distance in bins.
+        dist_bp:
+            Distance in basepairs.
+        contact_freq:
+            The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg;
+            if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
+        n_total:
+            Number of total pixels at a given distance.
+        n_valid:
+            Number of valid pixels (with non-NaN values after balancing) at a given distance.
+        count.sum:
+            Sum up raw contact counts of all pixels at a given distance.
+        count.avg:
+            The average raw contact count of pixels at a given distance. count.sum / n_total. 
+
+    When clr_weigh_name is provided (by default, clr_weigh_name="weight"), the following quantities (columns) will be added into the DataFrame:
+
+        balanced.sum:
+            Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None.
+        balanced.avg:
+            The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None.
+
+    When smooth=True, the following quantities (columns) will be added into the DataFrame:
+
+        count.avg.smoothed:
+            Log-smoothed count.avg. Returned if smooth=True and clr_weight_name=None.
+        balanced.avg.smoothed:
+            Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None.
+
+    When aggregate_smoothed=True, the following quantities (columns) will be added into the DataFrame:
+        count.avg.smoothed.agg:
+            Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name=None.
+        balanced.avg.smoothed.agg:
+            Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None.
+
+    By default, clr_weight_name="weight", smooth=True, aggregate_smoothed=True, the output DataFrame includes all quantities (columns).
 
     """
 
@@ -970,19 +1170,21 @@ def expected_cis(
             raise ValueError("view_df is not a valid viewframe or incompatible") from e
 
     # define transforms - balanced and raw ('count') for now
+    cols = DEFAULT_BALANCED_CVD_COLS
     if clr_weight_name is None:
         # no transforms
         transforms = {}
+        cols = DEFAULT_RAW_CVD_COLS
     elif is_cooler_balanced(clr, clr_weight_name):
         # define balanced data transform:
         weight1 = clr_weight_name + "1"
         weight2 = clr_weight_name + "2"
         transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
-
     else:
         raise ValueError(
             "cooler is not balanced, or"
             f"balancing weight {clr_weight_name} is not available in the cooler."
+            f"Pass clr_weight_name=None explicitly to calculate expected on raw counts"
         )
 
     # using try-clause to close mp.Pool properly
@@ -1009,35 +1211,34 @@ def expected_cis(
 
     # calculate actual averages by dividing sum by n_valid:
     for key in chain(["count"], transforms):
-        result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
+        if key == "count":
+            result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_total"]
+        else:
+            result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
+
+    # add dist_bp column, which shows distance in basepairs
+    result.insert(3, "dist_bp", result[_DIST] * clr.binsize)
 
     # additional smoothing and aggregating options would add columns only, not replace them
     if smooth:
-        result_smooth = expected_smoothing.agg_smooth_cvd(
+        result = per_region_smooth_cvd(
             result,
             sigma_log10=smooth_sigma,
+            cols=cols,
+            suffix=SMOOTH_SUFFIX,
         )
-        # add smoothed columns to the result (only balanced for now)
-        result = result.merge(
-            result_smooth[["balanced.avg.smoothed", _DIST]],
-            on=[_REGION1, _REGION2, _DIST],
-            how="left",
-        )
+
         if aggregate_smoothed:
-            result_smooth_agg = expected_smoothing.agg_smooth_cvd(
+            result = genomewide_smooth_cvd(
                 result,
-                groupby=None,
                 sigma_log10=smooth_sigma,
-            ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"})
-            # add smoothed columns to the result
-            result = result.merge(
-                result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]],
-                on=[
-                    _DIST,
-                ],
-                how="left",
+                cols=cols,
+                suffix=SMOOTH_AGG_SUFFIX,
             )
 
+    # add contact_frequency columns to the result, which is the copy of the "most processing" contact values
+    result.insert(4, "contact_frequency", result.iloc[:, -1])
+
     return result
 
 
@@ -1082,7 +1283,7 @@ def expected_trans(
     map_functor : callable, optional
         Map function to dispatch the matrix chunks to workers.
         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
-        If nproc=1 this defaults the builtin map. 
+        If nproc=1 this defaults the builtin map.
 
     Returns
     -------
@@ -1117,7 +1318,7 @@ def expected_trans(
         weight1 = clr_weight_name + "1"
         weight2 = clr_weight_name + "2"
         transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
-        
+
     else:
         raise ValueError(
             "cooler is not balanced, or"