Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include smoothed count in cvd table #501

Merged
merged 6 commits into from
Mar 10, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 47 additions & 8 deletions cooltools/api/expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ def _make_diag_table(n_bins, bad_locs):
)
diags = diags[diags["n_elem"] > 0]

diags = diags.drop("n_elem", axis=1)
# keep diags["n_elem"] for calculating count.avg
gfudenberg marked this conversation as resolved.
Show resolved Hide resolved
# diags = diags.drop("n_elem", axis=1)
return diags.astype(int)


Expand Down Expand Up @@ -944,7 +945,36 @@ def expected_cis(
-------
DataFrame with summary statistic of every diagonal of every symmetric
or asymmetric block:
region1, region2, diag, n_valid, count.sum count.avg, etc

Notes
-----
dist:
Yaoyx marked this conversation as resolved.
Show resolved Hide resolved
Distance in bins.
dist_bp:
Distance in basepairs.
contact_freq:
The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg;
if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
n_elem:
Number of total pixels at a given distance.
n_valid:
Number of valid pixels (with non-NaN values after balancing) at a given distance.
count.sum:
Sum up raw contact counts of all pixels at a given distance.
balanced.sum:
Sum up balanced contact values of valid pixels at a given distance.
Yaoyx marked this conversation as resolved.
Show resolved Hide resolved
count.avg:
The average raw contact count of pixels at a given distance. count.sum / n_elem.
balanced.avg:
Yaoyx marked this conversation as resolved.
Show resolved Hide resolved
The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
count.avg.smoothed:
Yaoyx marked this conversation as resolved.
Show resolved Hide resolved
Yaoyx marked this conversation as resolved.
Show resolved Hide resolved
Log-smoothed count.avg.
balanced.avg.smoothed:
Log-smoothed balanced.avg.
Yaoyx marked this conversation as resolved.
Show resolved Hide resolved
count.avg.smoothed.agg:
Aggregate Log-smoothed count.avg of all genome regions.
Yaoyx marked this conversation as resolved.
Show resolved Hide resolved
balanced.avg.smoothed.agg:
Aggregate Log-smoothed balanced.avg of all genome regions.
gfudenberg marked this conversation as resolved.
Show resolved Hide resolved

"""

Expand Down Expand Up @@ -978,7 +1008,6 @@ def expected_cis(
weight1 = clr_weight_name + "1"
weight2 = clr_weight_name + "2"
transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}

else:
raise ValueError(
"cooler is not balanced, or"
Expand Down Expand Up @@ -1009,17 +1038,24 @@ def expected_cis(

# calculate actual averages by dividing sum by n_valid:
for key in chain(["count"], transforms):
result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
if key == "count":
result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_elem"]
else:
result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]

# add dist_bp column, which shows distance in basepairs
result.insert(3, 'dist_bp', result[_DIST]*clr.binsize)


# additional smoothing and aggregating options would add columns only, not replace them
if smooth:
result_smooth = expected_smoothing.agg_smooth_cvd(
result,
sigma_log10=smooth_sigma,
)
# add smoothed columns to the result (only balanced for now)
# add smoothed columns to the result (only balanced for now) (include count as well)
result = result.merge(
result_smooth[["balanced.avg.smoothed", _DIST]],
result_smooth[["count.avg.smoothed", "balanced.avg.smoothed", _DIST]],
gfudenberg marked this conversation as resolved.
Show resolved Hide resolved
on=[_REGION1, _REGION2, _DIST],
how="left",
)
Expand All @@ -1028,15 +1064,18 @@ def expected_cis(
result,
groupby=None,
sigma_log10=smooth_sigma,
).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"})
).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg", "count.avg.smoothed": "count.avg.smoothed.agg"})
# add smoothed columns to the result
result = result.merge(
result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]],
result_smooth_agg[["count.avg.smoothed.agg", "balanced.avg.smoothed.agg", _DIST]],
on=[
_DIST,
],
how="left",
)

# add contact_frequency columns to the result, which is the copy of the "most processing" contact values
result.insert(4, 'contact_frequency', result.iloc[:,-1])

return result

Expand Down
70 changes: 55 additions & 15 deletions cooltools/sandbox/expected_smoothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@
import numpy as np
import numba


# See notes in the docstring of agg_smooth_cvd()
DEFAULT_CVD_COLS = {
"dist": "dist",
"n_pixels": "n_valid",
"n_contacts": "balanced.sum",
"contact_freq": "balanced.avg",
"smooth_suffix": ".smoothed",
"n_pixels_tot": "n_elem",
"n_contacts_raw": "count.sum",
"contact_freq_raw": "count.avg",
}


Expand Down Expand Up @@ -182,6 +185,8 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
{
cols["n_pixels"]: "sum",
cols["n_contacts"]: "sum",
cols["n_pixels_tot"]: "sum",
cols["n_contacts_raw"]: "sum",
}
)
.reset_index()
Expand All @@ -198,8 +203,24 @@ def _smooth_cvd_group(cvd, sigma_log10, window_sigma, points_per_sigma, cols=Non
points_per_sigma=points_per_sigma,
)

# cvd_smoothed[cols["contact_freq"]] = cvd_smoothed[cols["n_contacts"]] / cvd_smoothed[cols["n_pixels"]]

smoothed_raw_sum, smoothed_n_elem = log_smooth(
cvd_smoothed[cols["dist"]].values.astype(np.float64),
[
cvd_smoothed[cols["n_contacts_raw"]].values.astype(np.float64),
cvd_smoothed[cols["n_pixels_tot"]].values.astype(np.float64),
],
sigma_log10=sigma_log10,
window_sigma=window_sigma,
points_per_sigma=points_per_sigma,
)

cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]] = smoothed_n_elem
cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]] = smoothed_raw_sum
cvd_smoothed[cols["contact_freq_raw"] + cols["smooth_suffix"]] = (
cvd_smoothed[cols["n_contacts_raw"] + cols["smooth_suffix"]]
/ cvd_smoothed[cols["n_pixels_tot"] + cols["smooth_suffix"]]
)

cvd_smoothed[cols["n_pixels"] + cols["smooth_suffix"]] = smoothed_n_valid
cvd_smoothed[cols["n_contacts"] + cols["smooth_suffix"]] = smoothed_balanced_sum
cvd_smoothed[cols["contact_freq"] + cols["smooth_suffix"]] = (
Expand Down Expand Up @@ -268,6 +289,37 @@ def agg_smooth_cvd(
-------
cvd_smoothed : pandas.DataFrame
A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed").

Notes
-----
gfudenberg marked this conversation as resolved.
Show resolved Hide resolved
dist:
Distance in bins.
dist_bp:
Distance in basepairs.
contact_freq:
The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg;
if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
n_elem:
Number of total pixels at a given distance.
n_valid:
Number of valid pixels (with non-NaN values after balancing) at a given distance.
count.sum:
Sum up raw contact counts of all pixels at a given distance.
balanced.sum:
Sum up balanced contact values of valid pixels at a given distance.
count.avg:
The average raw contact count of pixels at a given distance. count.sum / n_elem.
balanced.avg:
The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid.
count.avg.smoothed:
Log-smoothed count.avg.
balanced.avg.smoothed:
Log-smoothed balanced.avg.
count.avg.smoothed.agg:
Aggregate Log-smoothed count.avg of all genome regions.
balanced.avg.smoothed.agg:
Aggregate Log-smoothed balanced.avg of all genome regions.

"""

cols = dict(DEFAULT_CVD_COLS, **({} if cols is None else cols))
Expand All @@ -290,16 +342,4 @@ def agg_smooth_cvd(
cols=cols,
)

cvd_smoothed.drop(
[cols["n_pixels"], cols["n_contacts"]], axis="columns", inplace=True
)

# cvd = cvd.drop(cols["contact_freq"], axis='columns', errors='ignore')

# cvd = cvd.merge(
# cvd_smoothed,
# on=groupby + [cols["dist"]],
# how="left",
# )

return cvd_smoothed
4 changes: 4 additions & 0 deletions tests/test_expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ def test_expected_cis(request):
chunksize=chunksize,
ignore_diags=ignore_diags,
)

# check column names
assert list(res_symm.columns) == ['region1', 'region2', 'dist', 'dist_bp', 'contact_frequency', 'n_elem', 'n_valid', 'count.sum', 'balanced.sum', 'count.avg', 'balanced.avg', 'count.avg.smoothed', 'balanced.avg.smoothed', 'count.avg.smoothed.agg', 'balanced.avg.smoothed.agg']
gfudenberg marked this conversation as resolved.
Show resolved Hide resolved

# check results for every block
grouped = res_symm.groupby(["region1", "region2"])
for (name1, name2), group in grouped:
Expand Down
Loading