Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include smoothed count in cvd table #501

Merged
merged 6 commits into from
Mar 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 231 additions & 30 deletions cooltools/api/expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,27 @@
_DIST = list(diag_expected_dtypes)[2]
_NUM_VALID = list(diag_expected_dtypes)[3]

# See notes in the docstring of expected_cis() and per_region_smooth_cvd()
DEFAULT_BALANCED_CVD_COLS = {
"region1": _REGION1,
"region2": _REGION2,
"dist": "dist",
"n_pixels": "n_valid",
"n_contacts": "balanced.sum",
"output_prefix": "balanced.avg",
}
# See notes in the docstring of expected_cis() and genomewide_smooth_cvd()
DEFAULT_RAW_CVD_COLS = {
"region1": _REGION1,
"region2": _REGION2,
"dist": "dist",
"n_pixels": "n_total",
"n_contacts": "count.sum",
"output_prefix": "count.avg",
}
SMOOTH_SUFFIX = ".smoothed"
SMOOTH_AGG_SUFFIX = ".smoothed.agg"


def lattice_pdist_frequencies(n, points):
"""
Expand Down Expand Up @@ -199,7 +220,7 @@ def count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y):

def make_diag_table(bad_mask, span1, span2):
"""
Compute the total number of elements ``n_elem`` and the number of bad
Compute the total number of elements ``n_total`` and the number of bad
elements ``n_bad`` per diagonal for a single contact area encompassing
``span1`` and ``span2`` on the same genomic scaffold (cis matrix).

Expand All @@ -217,7 +238,7 @@ def make_diag_table(bad_mask, span1, span2):
Returns
-------
diags : pandas.DataFrame
Table indexed by 'diag' with columns ['n_elem', 'n_bad'].
Table indexed by 'diag' with columns ['n_total', 'n_bad'].

"""

Expand All @@ -226,8 +247,8 @@ def make_diag_table(bad_mask, span1, span2):

def _make_diag_table(n_bins, bad_locs):
diags = pd.DataFrame(index=pd.Series(np.arange(n_bins), name=_DIST))
diags["n_elem"] = count_all_pixels_per_diag(n_bins)
diags[_NUM_VALID] = diags["n_elem"] - count_bad_pixels_per_diag(
diags["n_total"] = count_all_pixels_per_diag(n_bins)
diags[_NUM_VALID] = diags["n_total"] - count_bad_pixels_per_diag(
n_bins, bad_locs
)
return diags
Expand All @@ -254,9 +275,10 @@ def _make_diag_table(n_bins, bad_locs):
diags.add(
_make_diag_table(lo2 - hi1, where(bad_mask[hi1:lo2])), fill_value=0
)
diags = diags[diags["n_elem"] > 0]
diags = diags[diags["n_total"] > 0]

diags = diags.drop("n_elem", axis=1)
# keep diags["n_total"] for calculating count.avg
# diags = diags.drop("n_total", axis=1)
return diags.astype(int)


Expand Down Expand Up @@ -302,7 +324,9 @@ def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"):
regions2 = bioframe.make_viewframe(
regions2, check_bounds=clr.chromsizes
).to_numpy()
except ValueError: # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
except (
ValueError
): # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
regions = pd.concat(
[
bioframe.make_viewframe([region], check_bounds=clr.chromsizes)
Expand Down Expand Up @@ -876,6 +900,141 @@ def blocksum_pairwise(
)


def genomewide_smooth_cvd(
cvd,
sigma_log10=0.1,
window_sigma=5,
points_per_sigma=10,
cols=None,
suffix=".smoothed",
):
"""
Smooth the contact-vs-distance curve aggregated across all regions in log-space.

Parameters
----------
cvd : pandas.DataFrame
A dataframe with the expected values in the cooltools.expected format.
sigma_log10 : float, optional
The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
window_sigma : int, optional
Width of the smoothing window, expressed in sigmas, by default 5
points_per_sigma : int, optional
If provided, smoothing is done only for `points_per_sigma` points per sigma and the
rest of the values are interpolated (this results in a major speed-up). By default 10
cols : dict, optional
If provided, use the specified column names instead of the standard ones.
See DEFAULT_CVD_COLS variable for the format of this argument.
suffix : string, optional
If provided, use the specified string as the suffix of the output column's name

Returns
-------
cvd : pandas.DataFrame
A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed.agg" if balanced, or "count.avg.smoothed.agg" if raw).

Notes
-----
Parameters in "cols" will be used:

dist:
Name of the column that stores distance values (by default, "dist").
n_pixels:
Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw).
n_contacts:
Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw).
output_prefix:
Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw).
"""
cvd_smoothed = expected_smoothing._smooth_cvd_group(
cvd,
sigma_log10=sigma_log10,
window_sigma=window_sigma,
points_per_sigma=points_per_sigma,
cols=cols,
suffix=suffix,
)

# add aggeragated smoothed columns to the result
cvd = cvd.merge(
cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
on=[cols["dist"]],
how="left",
)

return cvd


def per_region_smooth_cvd(
cvd,
sigma_log10=0.1,
window_sigma=5,
points_per_sigma=10,
cols=None,
suffix="",
):
"""
Smooth the contact-vs-distance curve for each region in log-space.

Parameters
----------
cvd : pandas.DataFrame
A dataframe with the expected values in the cooltools.expected format.
sigma_log10 : float, optional
The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1
window_sigma : int, optional
Width of the smoothing window, expressed in sigmas, by default 5
points_per_sigma : int, optional
If provided, smoothing is done only for `points_per_sigma` points per sigma and the
rest of the values are interpolated (this results in a major speed-up). By default 10
cols : dict, optional
If provided, use the specified column names instead of the standard ones.
See DEFAULT_CVD_COLS variable for the format of this argument.
suffix : string, optional
If provided, use the specified string as the suffix of the output column's name

Returns
-------
cvd : pandas.DataFrame
A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed" if balanced, or "count.avg.smoothed" if raw).

Notes
-----
Parameters in "cols" will be used:

region1:
Name of the column that stores region1's locations (by default, "region1").
region2:
Name of the column that stores region2's locations (by default, "region2").
dist:
Name of the column that stores distance values (by default, "dist").
n_pixels:
Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw).
n_contacts:
Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw).
output_prefix:
Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw).
"""
cvd_smoothed = (
cvd.set_index([cols["region1"], cols["region2"]])
.groupby([cols["region1"], cols["region2"]])
.apply(
expected_smoothing._smooth_cvd_group,
sigma_log10=sigma_log10,
window_sigma=window_sigma,
points_per_sigma=points_per_sigma,
cols=cols,
suffix=suffix,
)
)
# add smoothed columns to the result
cvd = cvd.merge(
cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]],
on=[cols["region1"], cols["region2"], cols["dist"]],
how="left",
)

return cvd
# user-friendly wrapper for diagsum_symm and diagsum_pairwise - part of new "public" API
@pool_decorator
def expected_cis(
Expand Down Expand Up @@ -938,13 +1097,54 @@ def expected_cis(
map_functor : callable, optional
Map function to dispatch the matrix chunks to workers.
If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
If nproc=1 this defaults the builtin map.
If nproc=1 this defaults the builtin map.

Returns
-------
DataFrame with summary statistic of every diagonal of every symmetric
or asymmetric block:
region1, region2, diag, n_valid, count.sum count.avg, etc

Notes
-----
When clr_weight_name=None, smooth=False, aggregate_smoothed=False, the minimum output DataFrame includes the following quantities (columns):

dist:
Distance in bins.
dist_bp:
Distance in basepairs.
contact_freq:
The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg;
if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg.
n_total:
Number of total pixels at a given distance.
n_valid:
Number of valid pixels (with non-NaN values after balancing) at a given distance.
count.sum:
Sum up raw contact counts of all pixels at a given distance.
count.avg:
The average raw contact count of pixels at a given distance. count.sum / n_total.

When clr_weigh_name is provided (by default, clr_weigh_name="weight"), the following quantities (columns) will be added into the DataFrame:

balanced.sum:
Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None.
balanced.avg:
The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. Returned if clr_weight_name is not None.

When smooth=True, the following quantities (columns) will be added into the DataFrame:

count.avg.smoothed:
Log-smoothed count.avg. Returned if smooth=True and clr_weight_name=None.
balanced.avg.smoothed:
Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None.

When aggregate_smoothed=True, the following quantities (columns) will be added into the DataFrame:
count.avg.smoothed.agg:
Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name=None.
balanced.avg.smoothed.agg:
Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None.

By default, clr_weight_name="weight", smooth=True, aggregate_smoothed=True, the output DataFrame includes all quantities (columns).

"""

Expand All @@ -970,19 +1170,21 @@ def expected_cis(
raise ValueError("view_df is not a valid viewframe or incompatible") from e

# define transforms - balanced and raw ('count') for now
cols = DEFAULT_BALANCED_CVD_COLS
if clr_weight_name is None:
# no transforms
transforms = {}
cols = DEFAULT_RAW_CVD_COLS
elif is_cooler_balanced(clr, clr_weight_name):
# define balanced data transform:
weight1 = clr_weight_name + "1"
weight2 = clr_weight_name + "2"
transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}

else:
raise ValueError(
"cooler is not balanced, or"
f"balancing weight {clr_weight_name} is not available in the cooler."
f"Pass clr_weight_name=None explicitly to calculate expected on raw counts"
)

# using try-clause to close mp.Pool properly
Expand All @@ -1009,35 +1211,34 @@ def expected_cis(

# calculate actual averages by dividing sum by n_valid:
for key in chain(["count"], transforms):
result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]
if key == "count":
result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_total"]
else:
result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID]

# add dist_bp column, which shows distance in basepairs
result.insert(3, "dist_bp", result[_DIST] * clr.binsize)

# additional smoothing and aggregating options would add columns only, not replace them
if smooth:
result_smooth = expected_smoothing.agg_smooth_cvd(
result = per_region_smooth_cvd(
result,
sigma_log10=smooth_sigma,
cols=cols,
suffix=SMOOTH_SUFFIX,
)
# add smoothed columns to the result (only balanced for now)
result = result.merge(
result_smooth[["balanced.avg.smoothed", _DIST]],
on=[_REGION1, _REGION2, _DIST],
how="left",
)

if aggregate_smoothed:
result_smooth_agg = expected_smoothing.agg_smooth_cvd(
result = genomewide_smooth_cvd(
result,
groupby=None,
sigma_log10=smooth_sigma,
).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"})
# add smoothed columns to the result
result = result.merge(
result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]],
on=[
_DIST,
],
how="left",
cols=cols,
suffix=SMOOTH_AGG_SUFFIX,
)

# add contact_frequency columns to the result, which is the copy of the "most processing" contact values
result.insert(4, "contact_frequency", result.iloc[:, -1])

return result


Expand Down Expand Up @@ -1082,7 +1283,7 @@ def expected_trans(
map_functor : callable, optional
Map function to dispatch the matrix chunks to workers.
If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool;
If nproc=1 this defaults the builtin map.
If nproc=1 this defaults the builtin map.

Returns
-------
Expand Down Expand Up @@ -1117,7 +1318,7 @@ def expected_trans(
weight1 = clr_weight_name + "1"
weight2 = clr_weight_name + "2"
transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}

else:
raise ValueError(
"cooler is not balanced, or"
Expand Down
Loading
Loading