open2c · gfudenberg · Mar 10, 2024 · Feb 15, 2024 · Mar 6, 2024 · Mar 8, 2024
diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py
@@ -256,7 +256,8 @@ def _make_diag_table(n_bins, bad_locs):
             )
         diags = diags[diags["n_elem"] > 0]
 
-    diags = diags.drop("n_elem", axis=1)
+    # keep diags["n_elem"] for calculating count.avg
+    # diags = diags.drop("n_elem", axis=1) 
     return diags.astype(int)
 
 
@@ -944,7 +945,36 @@ def expected_cis(
     -------
     DataFrame with summary statistic of every diagonal of every symmetric
     or asymmetric block:
-    region1, region2, diag, n_valid, count.sum count.avg, etc
+
+    Notes
+    -----
+    dist: 
+        Distance in bins.
+    dist_bp:
+        Distance in basepairs.
+    contact_freq:
+        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; 
+        if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
+    n_elem:
+        Number of total pixels at a given distance.
+    n_valid: 
+        Number of valid pixels (with non-NaN values after balancing) at a given distance.
+    count.sum:
+        Sum up raw contact counts of all pixels at a given distance.
+    balanced.sum: 
+        Sum up balanced contact values of valid pixels at a given distance.
+    count.avg:
+        The average raw contact count of pixels at a given distance. count.sum / n_elem.
+    balanced.avg:
+        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
+    count.avg.smoothed:
+        Log-smoothed count.avg.
+    balanced.avg.smoothed:
+        Log-smoothed balanced.avg.
+    count.avg.smoothed.agg:
+        Aggregate Log-smoothed count.avg of all genome regions.
+    balanced.avg.smoothed.agg:
+        Aggregate Log-smoothed balanced.avg of all genome regions.
 
     """
 
@@ -978,7 +1008,6 @@ def expected_cis(
         weight1 = clr_weight_name + "1"
         weight2 = clr_weight_name + "2"
         transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
-
     else:
         raise ValueError(
             "cooler is not balanced, or"
@@ -1009,17 +1038,24 @@ def expected_cis(
 
     # calculate actual averages by dividing sum by n_valid:
     for key in chain(["count"], transforms):
-        result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
+        if key == "count":
+            result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_elem"]
+        else:    
+            result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
+
+    # add dist_bp column, which shows distance in basepairs
+    result.insert(3, 'dist_bp', result[_DIST]*clr.binsize)
+
 
     # additional smoothing and aggregating options would add columns only, not replace them
     if smooth:
         result_smooth = expected_smoothing.agg_smooth_cvd(
             result,
             sigma_log10=smooth_sigma,
         )
-        # add smoothed columns to the result (only balanced for now)
+        # add smoothed columns to the result (only balanced for now) (include count as well)
         result = result.merge(
-            result_smooth[["balanced.avg.smoothed", _DIST]],
+            result_smooth[["count.avg.smoothed", "balanced.avg.smoothed", _DIST]],
             on=[_REGION1, _REGION2, _DIST],
             how="left",
         )
@@ -1028,15 +1064,18 @@ def expected_cis(
                 result,
                 groupby=None,
                 sigma_log10=smooth_sigma,
-            ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"})
+            ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"})
             # add smoothed columns to the result
             result = result.merge(
-                result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]],
+                result_smooth_agg[["count.avg.smoothed.agg", "balanced.avg.smoothed.agg", _DIST]],
                 on=[
                     _DIST,
                 ],
                 how="left",
             )
+
+    # add contact_frequency columns to the result, which is the copy of the "most processing" contact values
+    result.insert(4, 'contact_frequency', result.iloc[:,-1])
 
     return result
 

diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py
@@ -3,13 +3,16 @@
 import numpy as np
 import numba
 
-
+# See notes in the docstring of agg_smooth_cvd()
 DEFAULT_CVD_COLS = {
     "dist": "dist",
     "n_pixels": "n_valid",
     "n_contacts": "balanced.sum",
     "contact_freq": "balanced.avg",
     "smooth_suffix": ".smoothed",
+    "n_pixels_tot": "n_elem",
+    "n_contacts_raw": "count.sum",
+    "contact_freq_raw": "count.avg",
 }
 
 
@@ -182,6 +185,8 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
             {
                 cols["n_pixels"]: "sum",
                 cols["n_contacts"]: "sum",
+                cols["n_pixels_tot"]: "sum",
+                cols["n_contacts_raw"]: "sum",
             }
         )
         .reset_index()
@@ -198,8 +203,24 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
         points_per_sigma=points_per_sigma,
     )
 
-    # cvd_smoothed[cols["contact_freq"]] = cvd_smoothed[cols["n_contacts"]] / cvd_smoothed[cols["n_pixels"]]
-
+    smoothed_raw_sum, smoothed_n_elem = log_smooth(
+        cvd_smoothed[cols["dist"]].values.astype(np.float64),
+        [
+            cvd_smoothed[cols["n_contacts_raw"]].values.astype(np.float64),
+            cvd_smoothed[cols["n_pixels_tot"]].values.astype(np.float64),
+        ],
+        sigma_log10=sigma_log10,
+        window_sigma=window_sigma,
+        points_per_sigma=points_per_sigma,
+    )
+
+    cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem
+    cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum
+    cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = (
+        cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]]
+        / cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]]
+    )
+
     cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid
     cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum
     cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = (
@@ -268,6 +289,37 @@ def agg_smooth_cvd(
     -------
     cvd_smoothed : pandas.DataFrame
         A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed").
+
+    Notes
+    -----
+    dist: 
+        Distance in bins.
+    dist_bp:
+        Distance in basepairs.
+    contact_freq:
+        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; 
+        if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
+    n_elem:
+        Number of total pixels at a given distance.
+    n_valid: 
+        Number of valid pixels (with non-NaN values after balancing) at a given distance.
+    count.sum:
+        Sum up raw contact counts of all pixels at a given distance.
+    balanced.sum: 
+        Sum up balanced contact values of valid pixels at a given distance.
+    count.avg:
+        The average raw contact count of pixels at a given distance. count.sum / n_elem.
+    balanced.avg:
+        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
+    count.avg.smoothed:
+        Log-smoothed count.avg.
+    balanced.avg.smoothed:
+        Log-smoothed balanced.avg.
+    count.avg.smoothed.agg:
+        Aggregate Log-smoothed count.avg of all genome regions.
+    balanced.avg.smoothed.agg:
+        Aggregate Log-smoothed balanced.avg of all genome regions.
+
     """
 
     cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols))
@@ -290,16 +342,4 @@ def agg_smooth_cvd(
         cols=cols,
     )
 
-    cvd_smoothed.drop(
-        [cols["n_pixels"], cols["n_contacts"]], axis="columns", inplace=True
-    )
-
-    # cvd = cvd.drop(cols["contact_freq"], axis='columns', errors='ignore')
-
-    # cvd = cvd.merge(
-    #     cvd_smoothed,
-    #     on=groupby + [cols["dist"]],
-    #     how="left",
-    # )
-
     return cvd_smoothed
diff --git a/tests/test_expected.py b/tests/test_expected.py
@@ -257,6 +257,10 @@ def test_expected_cis(request):
         chunksize=chunksize,
         ignore_diags=ignore_diags,
     )
+
+    # check column names
+    assert list(res_symm.columns) == ['region1', 'region2', 'dist', 'dist_bp', 'contact_frequency', 'n_elem', 'n_valid', 'count.sum',  'balanced.sum', 'count.avg', 'balanced.avg', 'count.avg.smoothed', 'balanced.avg.smoothed', 'count.avg.smoothed.agg', 'balanced.avg.smoothed.agg']
+
     # check results for every block
     grouped = res_symm.groupby(["region1", "region2"])
     for (name1, name2), group in grouped: