From 0a7c01f9d186e36bb37cda22583e394fd0611e29 Mon Sep 17 00:00:00 2001
From: Yaoyx <yxiaoy@umich.edu>
Date: Thu, 15 Feb 2024 12:25:14 -0800
Subject: [PATCH 1/6] 1. use n_elem for calculating count.avg 2. store
 count.avg.smoothed and count.avg.smoothed.agg in cvd table

---
 cooltools/api/expected.py               | 17 ++++++++++------
 cooltools/sandbox/expected_smoothing.py | 26 ++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py
index 8d2fbeac..eeacd598 100644
--- a/cooltools/api/expected.py
+++ b/cooltools/api/expected.py
@@ -256,7 +256,8 @@ def _make_diag_table(n_bins, bad_locs):
             )
         diags = diags[diags["n_elem"] > 0]
 
-    diags = diags.drop("n_elem", axis=1)
+    # keep diags["n_elem"] for calculating count.avg
+    # diags = diags.drop("n_elem", axis=1) 
     return diags.astype(int)
 
 
@@ -1009,7 +1010,11 @@ def expected_cis(
 
     # calculate actual averages by dividing sum by n_valid:
     for key in chain(["count"], transforms):
-        result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
+        if key == "count":
+            result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_elem"]
+        else:    
+            result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
+
 
     # additional smoothing and aggregating options would add columns only, not replace them
     if smooth:
@@ -1017,9 +1022,9 @@ def expected_cis(
             result,
             sigma_log10=smooth_sigma,
         )
-        # add smoothed columns to the result (only balanced for now)
+        # add smoothed columns to the result (only balanced for now) (include count as well)
         result = result.merge(
-            result_smooth[["balanced.avg.smoothed", _DIST]],
+            result_smooth[["balanced.avg.smoothed", "count.avg.smoothed", _DIST]],
             on=[_REGION1, _REGION2, _DIST],
             how="left",
         )
@@ -1028,10 +1033,10 @@ def expected_cis(
                 result,
                 groupby=None,
                 sigma_log10=smooth_sigma,
-            ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"})
+            ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"})
             # add smoothed columns to the result
             result = result.merge(
-                result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]],
+                result_smooth_agg[["balanced.avg.smoothed.agg", "count.avg.smoothed.agg", _DIST]],
                 on=[
                     _DIST,
                 ],
diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py
index 630675cc..18c354b9 100644
--- a/cooltools/sandbox/expected_smoothing.py
+++ b/cooltools/sandbox/expected_smoothing.py
@@ -10,6 +10,9 @@
     "n_contacts": "balanced.sum",
     "contact_freq": "balanced.avg",
     "smooth_suffix": ".smoothed",
+    "n_pixels_tot": "n_elem",
+    "n_contacts_raw": "count.sum",
+    "contact_freq_raw": "count.avg",
 }
 
 
@@ -182,6 +185,8 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
             {
                 cols["n_pixels"]: "sum",
                 cols["n_contacts"]: "sum",
+                cols["n_pixels_tot"]: "sum",
+                cols["n_contacts_raw"]: "sum",
             }
         )
         .reset_index()
@@ -198,6 +203,18 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
         points_per_sigma=points_per_sigma,
     )
 
+    smoothed_raw_sum, smoothed_n_elem = log_smooth(
+        cvd_smoothed[cols["dist"]].values.astype(np.float64),
+        [
+            cvd_smoothed[cols["n_contacts_raw"]].values.astype(np.float64),
+            cvd_smoothed[cols["n_pixels_tot"]].values.astype(np.float64),
+        ],
+        sigma_log10=sigma_log10,
+        window_sigma=window_sigma,
+        points_per_sigma=points_per_sigma,
+    )
+
+
     # cvd_smoothed[cols["contact_freq"]] = cvd_smoothed[cols["n_contacts"]] / cvd_smoothed[cols["n_pixels"]]
 
     cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid
@@ -207,6 +224,13 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
         / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]]
     )
 
+    cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem
+    cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum
+    cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = (
+        cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]]
+        / cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]]
+    )
+
     return cvd_smoothed
 
 
@@ -291,7 +315,7 @@ def agg_smooth_cvd(
     )
 
     cvd_smoothed.drop(
-        [cols["n_pixels"], cols["n_contacts"]], axis="columns", inplace=True
+        [cols["n_pixels"], cols["n_contacts"], cols["n_pixels_tot"], cols["n_contacts_raw"]], axis="columns", inplace=True
     )
 
     # cvd = cvd.drop(cols["contact_freq"], axis='columns', errors='ignore')

From 323d7681e8161d750cef54a3cb643711df9e1430 Mon Sep 17 00:00:00 2001
From: Yaoyx <yxiaoy@umich.edu>
Date: Wed, 6 Mar 2024 14:18:28 -0800
Subject: [PATCH 2/6] 1. Add a test to check output columns of expected_cis()
 2. Add notes for column names in cvd table to expected_cis() and
 agg_smooth_cvd() 3. Add dist_bp and contact_freq columns to cvd table and
 keep all other columns

---
 cooltools/api/expected.py               | 42 ++++++++++++++--
 cooltools/sandbox/expected_smoothing.py | 64 +++++++++++++++----------
 tests/test_expected.py                  |  4 ++
 3 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py
index eeacd598..ab4cde6e 100644
--- a/cooltools/api/expected.py
+++ b/cooltools/api/expected.py
@@ -945,7 +945,36 @@ def expected_cis(
     -------
     DataFrame with summary statistic of every diagonal of every symmetric
     or asymmetric block:
-    region1, region2, diag, n_valid, count.sum count.avg, etc
+    
+    Notes
+    -----
+    dist: 
+        Distance in bins.
+    dist_bp:
+        Distance in basepairs.
+    contact_freq:
+        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; 
+        if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
+    n_elem:
+        Number of total pixels at a given distance.
+    n_valid: 
+        Number of valid pixels (with non-NaN values after balancing) at a given distance.
+    count.sum:
+        Sum up raw contact counts of all pixels at a given distance.
+    balanced.sum: 
+        Sum up balanced contact values of valid pixels at a given distance.
+    count.avg:
+        The average raw contact count of pixels at a given distance. count.sum / n_elem.
+    balanced.avg:
+        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
+    count.avg.smoothed:
+        Log-smoothed count.avg.
+    balanced.avg.smoothed:
+        Log-smoothed balanced.avg.
+    count.avg.smoothed.agg:
+        Aggregate Log-smoothed count.avg of all genome regions.
+    balanced.avg.smoothed.agg:
+        Aggregate Log-smoothed balanced.avg of all genome regions.
 
     """
 
@@ -979,7 +1008,6 @@ def expected_cis(
         weight1 = clr_weight_name + "1"
         weight2 = clr_weight_name + "2"
         transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
-
     else:
         raise ValueError(
             "cooler is not balanced, or"
@@ -1015,6 +1043,9 @@ def expected_cis(
         else:    
             result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
 
+    # add dist_bp column, which shows distance in basepairs
+    result.insert(3, 'dist_bp', result[_DIST]*clr.binsize)
+
 
     # additional smoothing and aggregating options would add columns only, not replace them
     if smooth:
@@ -1024,7 +1055,7 @@ def expected_cis(
         )
         # add smoothed columns to the result (only balanced for now) (include count as well)
         result = result.merge(
-            result_smooth[["balanced.avg.smoothed", "count.avg.smoothed", _DIST]],
+            result_smooth[["count.avg.smoothed", "balanced.avg.smoothed", _DIST]],
             on=[_REGION1, _REGION2, _DIST],
             how="left",
         )
@@ -1036,12 +1067,15 @@ def expected_cis(
             ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"})
             # add smoothed columns to the result
             result = result.merge(
-                result_smooth_agg[["balanced.avg.smoothed.agg", "count.avg.smoothed.agg", _DIST]],
+                result_smooth_agg[["count.avg.smoothed.agg", "balanced.avg.smoothed.agg", _DIST]],
                 on=[
                     _DIST,
                 ],
                 how="left",
             )
+        
+    # add contact_frequency columns to the result, which is the copy of the "most processing" contact values
+    result.insert(4, 'contact_frequency', result.iloc[:,-1])
 
     return result
 
diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py
index 18c354b9..46f21b44 100644
--- a/cooltools/sandbox/expected_smoothing.py
+++ b/cooltools/sandbox/expected_smoothing.py
@@ -3,7 +3,7 @@
 import numpy as np
 import numba
 
-
+# See notes in the docstring of agg_smooth_cvd()
 DEFAULT_CVD_COLS = {
     "dist": "dist",
     "n_pixels": "n_valid",
@@ -213,23 +213,20 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
         window_sigma=window_sigma,
         points_per_sigma=points_per_sigma,
     )
-
-
-    # cvd_smoothed[cols["contact_freq"]] = cvd_smoothed[cols["n_contacts"]] / cvd_smoothed[cols["n_pixels"]]
-
-    cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid
-    cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum
-    cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = (
-        cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]]
-        / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]]
-    )
-
+    
     cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem
     cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum
     cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = (
         cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]]
         / cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]]
     )
+    
+    cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid
+    cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum
+    cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = (
+        cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]]
+        / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]]
+    )
 
     return cvd_smoothed
 
@@ -292,6 +289,37 @@ def agg_smooth_cvd(
     -------
     cvd_smoothed : pandas.DataFrame
         A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed").
+
+    Notes
+    -----
+    dist: 
+        Distance in bins.
+    dist_bp:
+        Distance in basepairs.
+    contact_freq:
+        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; 
+        if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
+    n_elem:
+        Number of total pixels at a given distance.
+    n_valid: 
+        Number of valid pixels (with non-NaN values after balancing) at a given distance.
+    count.sum:
+        Sum up raw contact counts of all pixels at a given distance.
+    balanced.sum: 
+        Sum up balanced contact values of valid pixels at a given distance.
+    count.avg:
+        The average raw contact count of pixels at a given distance. count.sum / n_elem.
+    balanced.avg:
+        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
+    count.avg.smoothed:
+        Log-smoothed count.avg.
+    balanced.avg.smoothed:
+        Log-smoothed balanced.avg.
+    count.avg.smoothed.agg:
+        Aggregate Log-smoothed count.avg of all genome regions.
+    balanced.avg.smoothed.agg:
+        Aggregate Log-smoothed balanced.avg of all genome regions.
+   
     """
 
     cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols))
@@ -314,16 +342,4 @@ def agg_smooth_cvd(
         cols=cols,
     )
 
-    cvd_smoothed.drop(
-        [cols["n_pixels"], cols["n_contacts"], cols["n_pixels_tot"], cols["n_contacts_raw"]], axis="columns", inplace=True
-    )
-
-    # cvd = cvd.drop(cols["contact_freq"], axis='columns', errors='ignore')
-
-    # cvd = cvd.merge(
-    #     cvd_smoothed,
-    #     on=groupby + [cols["dist"]],
-    #     how="left",
-    # )
-
     return cvd_smoothed
diff --git a/tests/test_expected.py b/tests/test_expected.py
index 31ee1fe7..adce9375 100644
--- a/tests/test_expected.py
+++ b/tests/test_expected.py
@@ -257,6 +257,10 @@ def test_expected_cis(request):
         chunksize=chunksize,
         ignore_diags=ignore_diags,
     )
+
+    # check column names
+    assert list(res_symm.columns) == ['region1', 'region2', 'dist', 'dist_bp', 'contact_frequency', 'n_elem', 'n_valid', 'count.sum',  'balanced.sum', 'count.avg', 'balanced.avg', 'count.avg.smoothed', 'balanced.avg.smoothed', 'count.avg.smoothed.agg', 'balanced.avg.smoothed.agg']
+
     # check results for every block
     grouped = res_symm.groupby(["region1", "region2"])
     for (name1, name2), group in grouped:

From c04f1005d57454065643238fe838f7e035791860 Mon Sep 17 00:00:00 2001
From: Yaoyx <yxiaoy@umich.edu>
Date: Thu, 7 Mar 2024 21:42:38 -0800
Subject: [PATCH 3/6] update the structure of agg_smooth_cvd()

---
 cooltools/api/expected.py               | 110 +++++++-----
 cooltools/sandbox/expected_smoothing.py | 219 +++++++++++-------------
 tests/test_expected.py                  |  26 ++-
 3 files changed, 184 insertions(+), 171 deletions(-)

diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py
index ab4cde6e..283e164c 100644
--- a/cooltools/api/expected.py
+++ b/cooltools/api/expected.py
@@ -25,6 +25,26 @@
 _DIST = list(diag_expected_dtypes)[2]
 _NUM_VALID = list(diag_expected_dtypes)[3]
 
+# See notes in the docstring of agg_smooth_cvd()
+DEFAULT_BALANCED_CVD_COLS = {
+    "region1": _REGION1,
+    "region2": _REGION2,
+    "dist": "dist",
+    "n_pixels": "n_valid",
+    "n_contacts": "balanced.sum",
+    "output_prefix": "balanced.avg",
+}
+DEFAULT_RAW_CVD_COLS = {
+    "region1": _REGION1,
+    "region2": _REGION2,
+    "dist": "dist",
+    "n_pixels": "n_total",
+    "n_contacts": "count.sum",
+    "output_prefix": "count.avg",
+}
+SMOOTH_SUFFIX = ".smoothed"
+SMOOTH_AGG_SUFFIX = ".smoothed.agg"
+
 
 def lattice_pdist_frequencies(n, points):
     """
@@ -199,7 +219,7 @@ def count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y):
 
 def make_diag_table(bad_mask, span1, span2):
     """
-    Compute the total number of elements ``n_elem`` and the number of bad
+    Compute the total number of elements ``n_total`` and the number of bad
     elements ``n_bad`` per diagonal for a single contact area encompassing
     ``span1`` and ``span2`` on the same genomic scaffold (cis matrix).
 
@@ -217,7 +237,7 @@ def make_diag_table(bad_mask, span1, span2):
     Returns
     -------
     diags : pandas.DataFrame
-        Table indexed by 'diag' with columns ['n_elem', 'n_bad'].
+        Table indexed by 'diag' with columns ['n_total', 'n_bad'].
 
     """
 
@@ -226,8 +246,8 @@ def make_diag_table(bad_mask, span1, span2):
 
     def _make_diag_table(n_bins, bad_locs):
         diags = pd.DataFrame(index=pd.Series(np.arange(n_bins), name=_DIST))
-        diags["n_elem"] = count_all_pixels_per_diag(n_bins)
-        diags[_NUM_VALID] = diags["n_elem"] - count_bad_pixels_per_diag(
+        diags["n_total"] = count_all_pixels_per_diag(n_bins)
+        diags[_NUM_VALID] = diags["n_total"] - count_bad_pixels_per_diag(
             n_bins, bad_locs
         )
         return diags
@@ -254,10 +274,10 @@ def _make_diag_table(n_bins, bad_locs):
             diags.add(
                 _make_diag_table(lo2 - hi1, where(bad_mask[hi1:lo2])), fill_value=0
             )
-        diags = diags[diags["n_elem"] > 0]
+        diags = diags[diags["n_total"] > 0]
 
-    # keep diags["n_elem"] for calculating count.avg
-    # diags = diags.drop("n_elem", axis=1) 
+    # keep diags["n_total"] for calculating count.avg
+    # diags = diags.drop("n_total", axis=1)
     return diags.astype(int)
 
 
@@ -303,7 +323,9 @@ def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"):
             regions2 = bioframe.make_viewframe(
                 regions2, check_bounds=clr.chromsizes
             ).to_numpy()
-    except ValueError:  # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
+    except (
+        ValueError
+    ):  # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
         regions = pd.concat(
             [
                 bioframe.make_viewframe([region], check_bounds=clr.chromsizes)
@@ -939,42 +961,44 @@ def expected_cis(
     map_functor : callable, optional
         Map function to dispatch the matrix chunks to workers.
         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
-        If nproc=1 this defaults the builtin map. 
+        If nproc=1 this defaults the builtin map.
 
     Returns
     -------
     DataFrame with summary statistic of every diagonal of every symmetric
     or asymmetric block:
-    
+
     Notes
     -----
-    dist: 
+    Output DataFrame columns correspond to the following quantities:
+
+    dist:
         Distance in bins.
     dist_bp:
         Distance in basepairs.
     contact_freq:
-        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; 
+        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg;
         if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
-    n_elem:
+    n_total:
         Number of total pixels at a given distance.
-    n_valid: 
+    n_valid:
         Number of valid pixels (with non-NaN values after balancing) at a given distance.
     count.sum:
         Sum up raw contact counts of all pixels at a given distance.
-    balanced.sum: 
-        Sum up balanced contact values of valid pixels at a given distance.
+    balanced.sum:
+        Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None.
     count.avg:
-        The average raw contact count of pixels at a given distance. count.sum / n_elem.
+        The average raw contact count of pixels at a given distance. count.sum / n_total.
     balanced.avg:
-        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
+        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None.
     count.avg.smoothed:
-        Log-smoothed count.avg.
+        Log-smoothed count.avg. Returned if smooth=True.
     balanced.avg.smoothed:
-        Log-smoothed balanced.avg.
+        Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None.
     count.avg.smoothed.agg:
-        Aggregate Log-smoothed count.avg of all genome regions.
+        Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True.
     balanced.avg.smoothed.agg:
-        Aggregate Log-smoothed balanced.avg of all genome regions.
+        Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None.
 
     """
 
@@ -1000,9 +1024,11 @@ def expected_cis(
             raise ValueError("view_df is not a valid viewframe or incompatible") from e
 
     # define transforms - balanced and raw ('count') for now
+    cols = DEFAULT_BALANCED_CVD_COLS
     if clr_weight_name is None:
         # no transforms
         transforms = {}
+        cols = DEFAULT_RAW_CVD_COLS
     elif is_cooler_balanced(clr, clr_weight_name):
         # define balanced data transform:
         weight1 = clr_weight_name + "1"
@@ -1012,6 +1038,7 @@ def expected_cis(
         raise ValueError(
             "cooler is not balanced, or"
             f"balancing weight {clr_weight_name} is not available in the cooler."
+            f"Pass clr_weight_name=None explicitly to calculate on raw counts"
         )
 
     # using try-clause to close mp.Pool properly
@@ -1039,43 +1066,32 @@ def expected_cis(
     # calculate actual averages by dividing sum by n_valid:
     for key in chain(["count"], transforms):
         if key == "count":
-            result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_elem"]
-        else:    
+            result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_total"]
+        else:
             result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
 
     # add dist_bp column, which shows distance in basepairs
-    result.insert(3, 'dist_bp', result[_DIST]*clr.binsize)
-
+    result.insert(3, "dist_bp", result[_DIST] * clr.binsize)
 
     # additional smoothing and aggregating options would add columns only, not replace them
     if smooth:
-        result_smooth = expected_smoothing.agg_smooth_cvd(
+        result = expected_smoothing.per_region_smooth_cvd(
             result,
             sigma_log10=smooth_sigma,
+            cols=cols,
+            suffix=SMOOTH_SUFFIX,
         )
-        # add smoothed columns to the result (only balanced for now) (include count as well)
-        result = result.merge(
-            result_smooth[["count.avg.smoothed", "balanced.avg.smoothed", _DIST]],
-            on=[_REGION1, _REGION2, _DIST],
-            how="left",
-        )
+
         if aggregate_smoothed:
-            result_smooth_agg = expected_smoothing.agg_smooth_cvd(
+            result = expected_smoothing.genome_wide_smooth_cvd(
                 result,
-                groupby=None,
                 sigma_log10=smooth_sigma,
-            ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"})
-            # add smoothed columns to the result
-            result = result.merge(
-                result_smooth_agg[["count.avg.smoothed.agg", "balanced.avg.smoothed.agg", _DIST]],
-                on=[
-                    _DIST,
-                ],
-                how="left",
+                cols=cols,
+                suffix=SMOOTH_AGG_SUFFIX,
             )
-        
+
     # add contact_frequency columns to the result, which is the copy of the "most processing" contact values
-    result.insert(4, 'contact_frequency', result.iloc[:,-1])
+    result.insert(4, "contact_frequency", result.iloc[:, -1])
 
     return result
 
@@ -1121,7 +1137,7 @@ def expected_trans(
     map_functor : callable, optional
         Map function to dispatch the matrix chunks to workers.
         If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
-        If nproc=1 this defaults the builtin map. 
+        If nproc=1 this defaults the builtin map.
 
     Returns
     -------
@@ -1156,7 +1172,7 @@ def expected_trans(
         weight1 = clr_weight_name + "1"
         weight2 = clr_weight_name + "2"
         transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
-        
+
     else:
         raise ValueError(
             "cooler is not balanced, or"
diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py
index 46f21b44..af7bd4f7 100644
--- a/cooltools/sandbox/expected_smoothing.py
+++ b/cooltools/sandbox/expected_smoothing.py
@@ -3,18 +3,6 @@
 import numpy as np
 import numba
 
-# See notes in the docstring of agg_smooth_cvd()
-DEFAULT_CVD_COLS = {
-    "dist": "dist",
-    "n_pixels": "n_valid",
-    "n_contacts": "balanced.sum",
-    "contact_freq": "balanced.avg",
-    "smooth_suffix": ".smoothed",
-    "n_pixels_tot": "n_elem",
-    "n_contacts_raw": "count.sum",
-    "contact_freq_raw": "count.avg",
-}
-
 
 def _log_interp(xs, xp, fp):
     """
@@ -66,7 +54,7 @@ def _log_thin(xs, min_log10_step=0.1):
     """
     xs_thinned = [xs[0]]
     prev = xs[0]
-    min_ratio = 10 ** min_log10_step
+    min_ratio = 10**min_log10_step
     for x in xs[1:]:
         if x > prev * min_ratio:
             xs_thinned.append(x)
@@ -103,9 +91,9 @@ def _log_smooth_numba(
         hi = np.searchsorted(log_xs, cur_log_x + sigma_log10 * window_sigma)
         smooth_weights = np.exp(
             -((cur_log_x - log_xs[lo:hi]) ** 2) / 2 / sigma_log10 / sigma_log10
-        ) 
+        )
         norm = smooth_weights.sum()
-        
+
         if norm > 0:
             smooth_weights /= norm
 
@@ -156,9 +144,7 @@ def log_smooth(
     if ys.ndim not in (1, 2):
         raise ValueError('ys must be either a 1D vector or a "tall" 2D matrix')
     if xs.shape[0] != ys.shape[-1]:
-        raise ValueError(
-            "xs and ys must have the same number of observations"
-        )
+        raise ValueError("xs and ys must have the same number of observations")
 
     ys = ys[np.newaxis, :] if ys.ndim == 1 else ys
 
@@ -176,17 +162,15 @@ def log_smooth(
     return ys_smoothed
 
 
-def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=None):
-    cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols))
-
+def _smooth_cvd_group(
+    cvd, sigma_log10, window_sigma, points_per_sigma, cols=None, suffix=""
+):
     cvd_smoothed = (
         cvd.groupby(cols["dist"])
         .agg(
             {
                 cols["n_pixels"]: "sum",
                 cols["n_contacts"]: "sum",
-                cols["n_pixels_tot"]: "sum",
-                cols["n_contacts_raw"]: "sum",
             }
         )
         .reset_index()
@@ -203,77 +187,31 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
         points_per_sigma=points_per_sigma,
     )
 
-    smoothed_raw_sum, smoothed_n_elem = log_smooth(
-        cvd_smoothed[cols["dist"]].values.astype(np.float64),
-        [
-            cvd_smoothed[cols["n_contacts_raw"]].values.astype(np.float64),
-            cvd_smoothed[cols["n_pixels_tot"]].values.astype(np.float64),
-        ],
-        sigma_log10=sigma_log10,
-        window_sigma=window_sigma,
-        points_per_sigma=points_per_sigma,
-    )
-    
-    cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem
-    cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum
-    cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = (
-        cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]]
-        / cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]]
-    )
-    
-    cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid
-    cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum
-    cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = (
-        cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]]
-        / cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]]
+    cvd_smoothed[cols["n_pixels"] + suffix] = smoothed_n_valid
+    cvd_smoothed[cols["n_contacts"] + suffix] = smoothed_balanced_sum
+    cvd_smoothed[cols["output_prefix"] + suffix] = (
+        cvd_smoothed[cols["n_contacts"] + suffix]
+        / cvd_smoothed[cols["n_pixels"] + suffix]
     )
 
     return cvd_smoothed
 
 
-def _agg_smooth_cvd(
-    cvd, groupby, sigma_log10, window_sigma, points_per_sigma, cols=None
-):
-    if groupby:
-        cvd = cvd.set_index(groupby).groupby(groupby).apply(
-            _smooth_cvd_group,
-            sigma_log10=sigma_log10,
-            window_sigma=window_sigma,
-            points_per_sigma=points_per_sigma,
-            cols=cols,
-        )
-    else:
-        cvd = _smooth_cvd_group(
-            cvd,
-            sigma_log10=sigma_log10,
-            window_sigma=window_sigma,
-            points_per_sigma=points_per_sigma,
-            cols=cols,
-        )
-
-    return cvd
-
-
-def agg_smooth_cvd(
+def genome_wide_smooth_cvd(
     cvd,
-    groupby=["region1", "region2"],
     sigma_log10=0.1,
     window_sigma=5,
     points_per_sigma=10,
     cols=None,
+    suffix="",
 ):
     """
-    Smooth the contact-vs-distance curve in the log-space.
+    Smooth the contact-vs-distance curve aggregated across all regions in log-space.
 
     Parameters
     ----------
     cvd : pandas.DataFrame
         A dataframe with the expected values in the cooltools.expected format.
-    groupby : list of str
-        The list of column names to split the input table before smoothing.
-        This argument can be used to calculate separate smoothed CvD curves for
-        each region, Hi-C read orientation, etc.
-        If None or empty, a single CvD curve is calculated for the whole table.
     sigma_log10 : float, optional
         The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
     window_sigma : int, optional
@@ -287,59 +225,102 @@ def agg_smooth_cvd(
 
     Returns
     -------
-    cvd_smoothed : pandas.DataFrame
+    cvd : pandas.DataFrame
         A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed").
 
     Notes
     -----
-    dist: 
-        Distance in bins.
-    dist_bp:
-        Distance in basepairs.
-    contact_freq:
-        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; 
-        if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
-    n_elem:
-        Number of total pixels at a given distance.
-    n_valid: 
-        Number of valid pixels (with non-NaN values after balancing) at a given distance.
-    count.sum:
-        Sum up raw contact counts of all pixels at a given distance.
-    balanced.sum: 
-        Sum up balanced contact values of valid pixels at a given distance.
-    count.avg:
-        The average raw contact count of pixels at a given distance. count.sum / n_elem.
-    balanced.avg:
-        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
-    count.avg.smoothed:
-        Log-smoothed count.avg.
-    balanced.avg.smoothed:
-        Log-smoothed balanced.avg.
-    count.avg.smoothed.agg:
-        Aggregate Log-smoothed count.avg of all genome regions.
-    balanced.avg.smoothed.agg:
-        Aggregate Log-smoothed balanced.avg of all genome regions.
-   
-    """
+    Parameters in "cols"
+
+    dist:
 
-    cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols))
+    n_pixels:
 
-    if groupby is None:
-        groupby = []
-    elif isinstance(groupby, str):
-        groupby = [groupby]
-    elif isinstance(groupby, Iterable):
-        groupby = list(groupby)
-    else:
-        raise ValueError("groupby must be a string, a list of strings, or None")
+    n_contacts:
 
-    cvd_smoothed = _agg_smooth_cvd(
+    contact_freq:
+
+    """
+    cvd_smoothed = _smooth_cvd_group(
         cvd,
-        groupby=groupby,
         sigma_log10=sigma_log10,
         window_sigma=window_sigma,
         points_per_sigma=points_per_sigma,
         cols=cols,
+        suffix=suffix,
     )
 
-    return cvd_smoothed
+    # add aggeragated smoothed columns to the result
+    cvd = cvd.merge(
+        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
+        on=[cols["dist"]],
+        how="left",
+    )
+
+    return cvd
+
+
+def per_region_smooth_cvd(
+    cvd,
+    sigma_log10=0.1,
+    window_sigma=5,
+    points_per_sigma=10,
+    cols=None,
+    suffix="",
+):
+    """
+    Smooth the contact-vs-distance curve for each region in log-space.
+
+    Parameters
+    ----------
+    cvd : pandas.DataFrame
+        A dataframe with the expected values in the cooltools.expected format.
+    sigma_log10 : float, optional
+        The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
+    window_sigma : int, optional
+        Width of the smoothing window, expressed in sigmas, by default 5
+    points_per_sigma : int, optional
+        If provided, smoothing is done only for `points_per_sigma` points per sigma and the
+        rest of the values are interpolated (this results in a major speed-up). By default 10
+    cols : dict, optional
+        If provided, use the specified column names instead of the standard ones.
+        See DEFAULT_CVD_COLS variable for the format of this argument.
+
+    Returns
+    -------
+    cvd : pandas.DataFrame
+        A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed").
+
+    Notes
+    -----
+    Parameters in "cols"
+
+    dist:
+
+    n_pixels:
+
+    n_contacts:
+
+    contact_freq:
+
+    """
+    cvd_smoothed = (
+        cvd.set_index([cols["region1"], cols["region2"]])
+        .groupby([cols["region1"], cols["region2"]])
+        .apply(
+            _smooth_cvd_group,
+            sigma_log10=sigma_log10,
+            window_sigma=window_sigma,
+            points_per_sigma=points_per_sigma,
+            cols=cols,
+            suffix=suffix,
+        )
+    )
+    # add smoothed columns to the result
+    cvd = cvd.merge(
+        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
+        on=[cols["region1"], cols["region2"], cols["dist"]],
+        how="left",
+    )
+
+    return cvd
diff --git a/tests/test_expected.py b/tests/test_expected.py
index adce9375..d831b9bd 100644
--- a/tests/test_expected.py
+++ b/tests/test_expected.py
@@ -259,7 +259,23 @@ def test_expected_cis(request):
     )
 
     # check column names
-    assert list(res_symm.columns) == ['region1', 'region2', 'dist', 'dist_bp', 'contact_frequency', 'n_elem', 'n_valid', 'count.sum',  'balanced.sum', 'count.avg', 'balanced.avg', 'count.avg.smoothed', 'balanced.avg.smoothed', 'count.avg.smoothed.agg', 'balanced.avg.smoothed.agg']
+    assert list(res_symm.columns) == [
+        "region1",
+        "region2",
+        "dist",
+        "dist_bp",
+        "contact_frequency",
+        "n_elem",
+        "n_valid",
+        "count.sum",
+        "balanced.sum",
+        "count.avg",
+        "balanced.avg",
+        "count.avg.smoothed",
+        "balanced.avg.smoothed",
+        "count.avg.smoothed.agg",
+        "balanced.avg.smoothed.agg",
+    ]
 
     # check results for every block
     grouped = res_symm.groupby(["region1", "region2"])
@@ -304,7 +320,7 @@ def test_expected_cis(request):
             desired=desired_expected,
             equal_nan=True,
         )
-    
+
     # check multiprocessed result
     res_all_pooled = cooltools.api.expected.expected_cis(
         clr,
@@ -313,7 +329,7 @@ def test_expected_cis(request):
         clr_weight_name=clr_weight_name,
         chunksize=chunksize,
         ignore_diags=ignore_diags,
-        nproc=3
+        nproc=3,
     )
     assert res_all.equals(res_all_pooled)
 
@@ -368,7 +384,7 @@ def test_expected_trans(request):
         view_df=view_df,
         clr_weight_name=clr_weight_name,
         chunksize=chunksize,
-        nproc=3
+        nproc=3,
     )
     assert res.equals(res_pooled)
 
@@ -696,4 +712,4 @@ def test_diagsum_from_array():
     exp = _diagsum_symm_dense(ar, bad_bins=list(range(3, 5)))
     exp1 = diagsum_from_array(ar, ignore_diags=0)
     exp1["balanced.avg"] = exp1["balanced.sum"] / exp1["n_valid"]
-    assert np.allclose(exp, exp1["balanced.avg"].values, equal_nan=True)
\ No newline at end of file
+    assert np.allclose(exp, exp1["balanced.avg"].values, equal_nan=True)

From 2d137259987fa83d2066c6121a8efdeade9aaeca Mon Sep 17 00:00:00 2001
From: Yaoyx <yxiaoy@umich.edu>
Date: Thu, 7 Mar 2024 21:54:11 -0800
Subject: [PATCH 4/6] update col names in test_expected.py

---
 tests/test_expected.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_expected.py b/tests/test_expected.py
index d831b9bd..e6e86ca6 100644
--- a/tests/test_expected.py
+++ b/tests/test_expected.py
@@ -265,15 +265,13 @@ def test_expected_cis(request):
         "dist",
         "dist_bp",
         "contact_frequency",
-        "n_elem",
+        "n_total",
         "n_valid",
         "count.sum",
         "balanced.sum",
         "count.avg",
         "balanced.avg",
-        "count.avg.smoothed",
         "balanced.avg.smoothed",
-        "count.avg.smoothed.agg",
         "balanced.avg.smoothed.agg",
     ]
 

From 2b4d03490ff62b2fc2ed7cc4dc32667e31952ea3 Mon Sep 17 00:00:00 2001
From: Yaoyx <yxiaoy@umich.edu>
Date: Fri, 8 Mar 2024 13:40:03 -0800
Subject: [PATCH 5/6] update docstring and migragate functinos to expected.py

---
 cooltools/api/expected.py               | 212 ++++++++++++++++++++----
 cooltools/sandbox/expected_smoothing.py | 131 +--------------
 tests/test_expected.py                  |  22 +++
 3 files changed, 202 insertions(+), 163 deletions(-)

diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py
index 283e164c..f20edca9 100644
--- a/cooltools/api/expected.py
+++ b/cooltools/api/expected.py
@@ -25,7 +25,7 @@
 _DIST = list(diag_expected_dtypes)[2]
 _NUM_VALID = list(diag_expected_dtypes)[3]
 
-# See notes in the docstring of agg_smooth_cvd()
+# See notes in the docstring of expected_cis() and per_region_smooth_cvd() 
 DEFAULT_BALANCED_CVD_COLS = {
     "region1": _REGION1,
     "region2": _REGION2,
@@ -34,6 +34,7 @@
     "n_contacts": "balanced.sum",
     "output_prefix": "balanced.avg",
 }
+# See notes in the docstring of expected_cis() and genomewide_smooth_cvd()
 DEFAULT_RAW_CVD_COLS = {
     "region1": _REGION1,
     "region2": _REGION2,
@@ -899,6 +900,141 @@ def blocksum_pairwise(
     )
 
 
+def genomewide_smooth_cvd(
+    cvd,
+    sigma_log10=0.1,
+    window_sigma=5,
+    points_per_sigma=10,
+    cols=None,
+    suffix=".smoothed",
+):
+    """
+    Smooth the contact-vs-distance curve aggregated across all regions in log-space.
+
+    Parameters
+    ----------
+    cvd : pandas.DataFrame
+        A dataframe with the expected values in the cooltools.expected format.
+    sigma_log10 : float, optional
+        The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
+    window_sigma : int, optional
+        Width of the smoothing window, expressed in sigmas, by default 5
+    points_per_sigma : int, optional
+        If provided, smoothing is done only for `points_per_sigma` points per sigma and the
+        rest of the values are interpolated (this results in a major speed-up). By default 10
+    cols : dict, optional
+        If provided, use the specified column names instead of the standard ones.
+        See DEFAULT_CVD_COLS variable for the format of this argument.
+    suffix : string, optional
+        If provided, use the specified string as the suffix of the output column's name
+
+    Returns
+    -------
+    cvd : pandas.DataFrame
+        A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed.agg" or "balanced.avg.smoothed.agg" depends on whether balanced or not).
+
+    Notes
+    -----
+    Parameters in "cols" will be used:
+
+    dist:
+        Indicate the name of the column that stores distance values (by default, "dist").
+    n_pixels:
+        Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not).
+    n_contacts:
+        Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not).
+    output_prefix:
+        Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not).
+    """
+    cvd_smoothed = expected_smoothing._smooth_cvd_group(
+        cvd,
+        sigma_log10=sigma_log10,
+        window_sigma=window_sigma,
+        points_per_sigma=points_per_sigma,
+        cols=cols,
+        suffix=suffix,
+    )
+
+    # add aggeragated smoothed columns to the result
+    cvd = cvd.merge(
+        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
+        on=[cols["dist"]],
+        how="left",
+    )
+
+    return cvd
+
+
+def per_region_smooth_cvd(
+    cvd,
+    sigma_log10=0.1,
+    window_sigma=5,
+    points_per_sigma=10,
+    cols=None,
+    suffix="",
+):
+    """
+    Smooth the contact-vs-distance curve for each region in log-space.
+
+    Parameters
+    ----------
+    cvd : pandas.DataFrame
+        A dataframe with the expected values in the cooltools.expected format.
+    sigma_log10 : float, optional
+        The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
+    window_sigma : int, optional
+        Width of the smoothing window, expressed in sigmas, by default 5
+    points_per_sigma : int, optional
+        If provided, smoothing is done only for `points_per_sigma` points per sigma and the
+        rest of the values are interpolated (this results in a major speed-up). By default 10
+    cols : dict, optional
+        If provided, use the specified column names instead of the standard ones.
+        See DEFAULT_CVD_COLS variable for the format of this argument.
+    suffix : string, optional
+        If provided, use the specified string as the suffix of the output column's name
+
+    Returns
+    -------
+    cvd : pandas.DataFrame
+        A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed" or "balanced.avg.smoothed" depends on whether balanced or not).
+
+    Notes
+    -----
+    Parameters in "cols" will be used:
+
+    region1:
+        Indicate the name of the column that stores region1's locations (by default, "region1").
+    region2:
+        Indicate the name of the column that stores region2's locations (by default, "region2").
+    dist:
+        Indicate the name of the column that stores distance values (by default, "dist").
+    n_pixels:
+        Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not).
+    n_contacts:
+        Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not).
+    output_prefix:
+        Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not).
+    """
+    cvd_smoothed = (
+        cvd.set_index([cols["region1"], cols["region2"]])
+        .groupby([cols["region1"], cols["region2"]])
+        .apply(
+            expected_smoothing._smooth_cvd_group,
+            sigma_log10=sigma_log10,
+            window_sigma=window_sigma,
+            points_per_sigma=points_per_sigma,
+            cols=cols,
+            suffix=suffix,
+        )
+    )
+    # add smoothed columns to the result
+    cvd = cvd.merge(
+        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
+        on=[cols["region1"], cols["region2"], cols["dist"]],
+        how="left",
+    )
+
+    return cvd
 # user-friendly wrapper for diagsum_symm and diagsum_pairwise - part of new "public" API
 @pool_decorator
 def expected_cis(
@@ -970,35 +1106,45 @@ def expected_cis(
 
     Notes
     -----
-    Output DataFrame columns correspond to the following quantities:
-
-    dist:
-        Distance in bins.
-    dist_bp:
-        Distance in basepairs.
-    contact_freq:
-        The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg;
-        if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
-    n_total:
-        Number of total pixels at a given distance.
-    n_valid:
-        Number of valid pixels (with non-NaN values after balancing) at a given distance.
-    count.sum:
-        Sum up raw contact counts of all pixels at a given distance.
-    balanced.sum:
-        Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None.
-    count.avg:
-        The average raw contact count of pixels at a given distance. count.sum / n_total.
-    balanced.avg:
-        The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None.
-    count.avg.smoothed:
-        Log-smoothed count.avg. Returned if smooth=True.
-    balanced.avg.smoothed:
-        Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None.
-    count.avg.smoothed.agg:
-        Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True.
-    balanced.avg.smoothed.agg:
-        Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None.
+    When clr_weight_name=None, smooth=False, aggregate_smoothed=False, the minimum output DataFrame includes the following quantities (columns):
+
+        dist:
+            Distance in bins.
+        dist_bp:
+            Distance in basepairs.
+        contact_freq:
+            The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg;
+            if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
+        n_total:
+            Number of total pixels at a given distance.
+        n_valid:
+            Number of valid pixels (with non-NaN values after balancing) at a given distance.
+        count.sum:
+            Sum up raw contact counts of all pixels at a given distance.
+        count.avg:
+            The average raw contact count of pixels at a given distance. count.sum / n_total. 
+    
+    When clr_weigh_name is provided (by default, clr_weigh_name="weight"), the following quantities (columns) will be added into the DataFrame:
+        
+        balanced.sum:
+            Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None.
+        balanced.avg:
+            The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None.
+    
+    When smooth=True, the following quantities (columns) will be added into the DataFrame:
+        
+        count.avg.smoothed:
+            Log-smoothed count.avg. Returned if smooth=True and clr_weight_name=None.
+        balanced.avg.smoothed:
+            Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None.
+    
+    When aggregate_smoothed=True, the following quantities (columns) will be added into the DataFrame:
+        count.avg.smoothed.agg:
+            Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name=None.
+        balanced.avg.smoothed.agg:
+            Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None.
+
+    By default, clr_weight_name="weight", smooth=True, aggregate_smoothed=True, the output DataFrame includes all quantities (columns).
 
     """
 
@@ -1038,7 +1184,7 @@ def expected_cis(
         raise ValueError(
             "cooler is not balanced, or"
             f"balancing weight {clr_weight_name} is not available in the cooler."
-            f"Pass clr_weight_name=None explicitly to calculate on raw counts"
+            f"Pass clr_weight_name=None explicitly to calculate expected on raw counts"
         )
 
     # using try-clause to close mp.Pool properly
@@ -1075,7 +1221,7 @@ def expected_cis(
 
     # additional smoothing and aggregating options would add columns only, not replace them
     if smooth:
-        result = expected_smoothing.per_region_smooth_cvd(
+        result = per_region_smooth_cvd(
             result,
             sigma_log10=smooth_sigma,
             cols=cols,
@@ -1083,7 +1229,7 @@ def expected_cis(
         )
 
         if aggregate_smoothed:
-            result = expected_smoothing.genome_wide_smooth_cvd(
+            result = genomewide_smooth_cvd(
                 result,
                 sigma_log10=smooth_sigma,
                 cols=cols,
diff --git a/cooltools/sandbox/expected_smoothing.py b/cooltools/sandbox/expected_smoothing.py
index af7bd4f7..385d22f9 100644
--- a/cooltools/sandbox/expected_smoothing.py
+++ b/cooltools/sandbox/expected_smoothing.py
@@ -194,133 +194,4 @@ def _smooth_cvd_group(
         / cvd_smoothed[cols["n_pixels"] + suffix]
     )
 
-    return cvd_smoothed
-
-
-def genome_wide_smooth_cvd(
-    cvd,
-    sigma_log10=0.1,
-    window_sigma=5,
-    points_per_sigma=10,
-    cols=None,
-    suffix="",
-):
-    """
-    Smooth the contact-vs-distance curve aggregated across all regions in log-space.
-
-    Parameters
-    ----------
-    cvd : pandas.DataFrame
-        A dataframe with the expected values in the cooltools.expected format.
-    sigma_log10 : float, optional
-        The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
-    window_sigma : int, optional
-        Width of the smoothing window, expressed in sigmas, by default 5
-    points_per_sigma : int, optional
-        If provided, smoothing is done only for `points_per_sigma` points per sigma and the
-        rest of the values are interpolated (this results in a major speed-up). By default 10
-    cols : dict, optional
-        If provided, use the specified column names instead of the standard ones.
-        See DEFAULT_CVD_COLS variable for the format of this argument.
-
-    Returns
-    -------
-    cvd : pandas.DataFrame
-        A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed").
-
-    Notes
-    -----
-    Parameters in "cols"
-
-    dist:
-
-    n_pixels:
-
-    n_contacts:
-
-    contact_freq:
-
-    """
-    cvd_smoothed = _smooth_cvd_group(
-        cvd,
-        sigma_log10=sigma_log10,
-        window_sigma=window_sigma,
-        points_per_sigma=points_per_sigma,
-        cols=cols,
-        suffix=suffix,
-    )
-
-    # add aggeragated smoothed columns to the result
-    cvd = cvd.merge(
-        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
-        on=[cols["dist"]],
-        how="left",
-    )
-
-    return cvd
-
-
-def per_region_smooth_cvd(
-    cvd,
-    sigma_log10=0.1,
-    window_sigma=5,
-    points_per_sigma=10,
-    cols=None,
-    suffix="",
-):
-    """
-    Smooth the contact-vs-distance curve for each region in log-space.
-
-    Parameters
-    ----------
-    cvd : pandas.DataFrame
-        A dataframe with the expected values in the cooltools.expected format.
-    sigma_log10 : float, optional
-        The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
-    window_sigma : int, optional
-        Width of the smoothing window, expressed in sigmas, by default 5
-    points_per_sigma : int, optional
-        If provided, smoothing is done only for `points_per_sigma` points per sigma and the
-        rest of the values are interpolated (this results in a major speed-up). By default 10
-    cols : dict, optional
-        If provided, use the specified column names instead of the standard ones.
-        See DEFAULT_CVD_COLS variable for the format of this argument.
-
-    Returns
-    -------
-    cvd : pandas.DataFrame
-        A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed").
-
-    Notes
-    -----
-    Parameters in "cols"
-
-    dist:
-
-    n_pixels:
-
-    n_contacts:
-
-    contact_freq:
-
-    """
-    cvd_smoothed = (
-        cvd.set_index([cols["region1"], cols["region2"]])
-        .groupby([cols["region1"], cols["region2"]])
-        .apply(
-            _smooth_cvd_group,
-            sigma_log10=sigma_log10,
-            window_sigma=window_sigma,
-            points_per_sigma=points_per_sigma,
-            cols=cols,
-            suffix=suffix,
-        )
-    )
-    # add smoothed columns to the result
-    cvd = cvd.merge(
-        cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
-        on=[cols["region1"], cols["region2"], cols["dist"]],
-        how="left",
-    )
-
-    return cvd
+    return cvd_smoothed
\ No newline at end of file
diff --git a/tests/test_expected.py b/tests/test_expected.py
index e6e86ca6..7804fd59 100644
--- a/tests/test_expected.py
+++ b/tests/test_expected.py
@@ -290,6 +290,28 @@ def test_expected_cis(request):
             desired=desired_expected,
             equal_nan=True,
         )
+    
+    # check column names, when clr_weight_name = None, which is the unbalanced case
+    res_symm = cooltools.api.expected.expected_cis(
+        clr,
+        view_df=view_df,
+        clr_weight_name=None,
+        chunksize=chunksize,
+        ignore_diags=ignore_diags,
+    )
+    assert list(res_symm.columns) == [
+        "region1",
+        "region2",
+        "dist",
+        "dist_bp",
+        "contact_frequency",
+        "n_total",
+        "n_valid",
+        "count.sum",
+        "count.avg",
+        "count.avg.smoothed",
+        "count.avg.smoothed.agg",
+    ]
 
     # asymm and symm result together - engaging diagsum_pairwise
     res_all = cooltools.api.expected.expected_cis(

From 1ec9eae1d4ee359ecde4c902f7e4891893e62bec Mon Sep 17 00:00:00 2001
From: Yaoyx <yxiaoy@umich.edu>
Date: Fri, 8 Mar 2024 14:57:27 -0800
Subject: [PATCH 6/6] Update docstring

---
 cooltools/api/expected.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py
index f20edca9..9aa60401 100644
--- a/cooltools/api/expected.py
+++ b/cooltools/api/expected.py
@@ -931,20 +931,20 @@ def genomewide_smooth_cvd(
     Returns
     -------
     cvd : pandas.DataFrame
-        A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed.agg" or "balanced.avg.smoothed.agg" depends on whether balanced or not).
+        A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed.agg" if balanced, or "count.avg.smoothed.agg" if raw).
 
     Notes
     -----
     Parameters in "cols" will be used:
 
     dist:
-        Indicate the name of the column that stores distance values (by default, "dist").
+        Name of the column that stores distance values (by default, "dist").
     n_pixels:
-        Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not).
+        Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw).
     n_contacts:
-        Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not).
+        Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw).
     output_prefix:
-        Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not).
+        Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw).
     """
     cvd_smoothed = expected_smoothing._smooth_cvd_group(
         cvd,
@@ -996,24 +996,24 @@ def per_region_smooth_cvd(
     Returns
     -------
     cvd : pandas.DataFrame
-        A cvd table with extra column for the log-smoothed contact frequencies (by default, "count.avg.smoothed" or "balanced.avg.smoothed" depends on whether balanced or not).
+        A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed" if balanced, or "count.avg.smoothed" if raw).
 
     Notes
     -----
     Parameters in "cols" will be used:
 
     region1:
-        Indicate the name of the column that stores region1's locations (by default, "region1").
+        Name of the column that stores region1's locations (by default, "region1").
     region2:
-        Indicate the name of the column that stores region2's locations (by default, "region2").
+        Name of the column that stores region2's locations (by default, "region2").
     dist:
-        Indicate the name of the column that stores distance values (by default, "dist").
+        Name of the column that stores distance values (by default, "dist").
     n_pixels:
-        Indicate the name of the column that stores number of pixels (by default can be either "n_total" or "n_valid" depends on whether balanced or not).
+        Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw).
     n_contacts:
-        Indicate the name of the column that stores the sum of contacts (by default can be either "count.sum" or "balanced.sum" depends on whether balanced or not).
+        Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw).
     output_prefix:
-        Indicate the name prefix of the column that will store output value (by default, can be either "count.avg" or "balanced.avg" depends on whether balanced or not).
+        Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw).
     """
     cvd_smoothed = (
         cvd.set_index([cols["region1"], cols["region2"]])