Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mark_runs and compress_runs #164

Merged
merged 10 commits into from
Oct 18, 2024
162 changes: 162 additions & 0 deletions bioframe/extras.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@

from typing import Optional, Tuple

import numpy as np
import pandas as pd

Expand All @@ -14,6 +16,8 @@
"seq_gc",
"frac_gene_coverage",
"pair_by_distance",
"mark_runs",
"merge_runs"
]


Expand Down Expand Up @@ -499,3 +503,161 @@ def pair_by_distance(
)

return pd.concat([left_ivals, right_ivals], axis=1)


def mark_runs(
    df: pd.DataFrame,
    col: str,
    *,
    allow_overlaps: bool = False,
    reset_counter: bool = True,
    run_col: str = 'run',
    cols: Optional[Tuple[str, str, str]] = None,
) -> pd.DataFrame:
    """
    Mark runs of immediately consecutive intervals sharing the same value of
    ``col``.

    Intervals are grouped by chromosome (in order of first appearance) and
    sorted by start and end within each chromosome. Every maximal stretch of
    neighbouring intervals with the same value of ``col`` receives one
    integer run number.

    Parameters
    ----------
    df : DataFrame
        A bioframe dataframe.
    col : str
        The column to mark runs of values for. Floating-point values are
        compared with ``numpy.isclose`` (NaNs compare equal to each other);
        all other dtypes, including integers, are compared exactly.
    allow_overlaps : bool, optional [default: False]
        If True, allow intervals in ``df`` to overlap. This may cause
        unexpected results.
    reset_counter : bool, optional [default: True]
        If True, reset the run counter for each chromosome.
    run_col : str, optional [default: 'run']
        The name of the column to store the run numbers in.
    cols : (str, str, str) or None, optional [default: None]
        The names of the columns containing the chromosome, start and end of
        the intervals. If None, the package default column names are used.

    Returns
    -------
    pandas.DataFrame
        A reordered copy of the input dataframe with an additional integer
        column named ``run_col`` marking runs of values in the input column.
        An input without any intervals yields an empty frame that still
        carries the ``run_col`` column.

    Raises
    ------
    ValueError
        If ``allow_overlaps`` is False and overlapping intervals are found.

    See Also
    --------
    merge_runs
    """
    ck, sk, ek = _get_default_colnames() if cols is None else cols
    interval_cols = (ck, sk, ek)

    # In a self-overlap every interval pairs with itself, so any surplus rows
    # indicate that two distinct intervals overlap. The column names are
    # forwarded so that non-default ``cols`` are honoured by the check too.
    if not allow_overlaps and not df.empty:
        n_pairs = len(
            ops.overlap(df, df, cols1=interval_cols, cols2=interval_cols)
        )
        if n_pairs > len(df):
            raise ValueError(
                "Not a proper bedGraph: found overlapping intervals."
            )

    result = []
    n_runs = 0

    for _, group in df.groupby(ck, sort=False):
        group = group.sort_values([sk, ek])
        values = group[col].to_numpy()

        # Flag every position whose value differs from its predecessor.
        # A tolerance is only meaningful for inexact (float/complex) data;
        # applying a relative tolerance to integers would wrongly merge
        # distinct large values such as 100000 and 100001.
        if np.issubdtype(values.dtype, np.inexact):
            changed = ~np.isclose(values[1:], values[:-1], equal_nan=True)
        else:
            changed = values[1:] != values[:-1]

        # The first interval of a chromosome always opens a new run.
        is_run_start = np.concatenate(
            ([True], np.asarray(changed, dtype=bool))
        )

        if reset_counter:
            n_runs = 0

        # A cumulative count of run starts numbers the runs 0, 1, 2, ...;
        # ``n_runs`` offsets them when the counter carries across chromosomes.
        run_ids = n_runs + np.cumsum(is_run_start, dtype=np.int64) - 1
        group[run_col] = run_ids
        n_runs = int(run_ids[-1]) + 1

        result.append(group)

    if not result:
        # ``pd.concat`` refuses an empty list; return an empty frame that
        # still has the run column so callers see a consistent schema.
        out = df.iloc[:0].copy()
        out[run_col] = np.empty(0, dtype=np.int64)
        return out

    return pd.concat(result)


def merge_runs(
    df: pd.DataFrame,
    col: str,
    *,
    allow_overlaps: bool = False,
    agg: Optional[dict] = None,
    cols: Optional[Tuple[str, str, str]] = None,
) -> pd.DataFrame:
    """
    Merge runs of immediately consecutive intervals sharing the same value of
    ``col``.

    Each run is collapsed into a single interval spanning from the smallest
    start to the largest end of its members.

    Parameters
    ----------
    df : DataFrame
        A bioframe dataframe.
    col : str
        The column to compress runs of values for.
    allow_overlaps : bool, optional [default: False]
        If True, allow intervals in ``df`` to overlap. This may cause
        unexpected results.
    agg : dict, optional [default: None]
        A dictionary of additional column names and aggregation functions to
        apply to each run. Takes the format:
            {'agg_name': ('column_name', 'agg_func')}
    cols : (str, str, str) or None, optional [default: None]
        The names of the columns containing the chromosome, start and end of
        the intervals. If None, the package default column names are used.

    Returns
    -------
    pandas.DataFrame
        Dataframe with consecutive intervals in the same run merged. It
        contains the chromosome, start, end and ``col`` columns followed by
        any columns requested through ``agg``.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
    ...     'start': [0, 100, 200, 300, 400, 500],
    ...     'end': [100, 200, 300, 400, 500, 600],
    ...     'value': [1, 1, 1, 2, 2, 2],
    ... })

    >>> merge_runs(df, 'value')
        chrom  start  end  value
    0   chr1      0  300      1
    1   chr1    300  600      2

    >>> merge_runs(df, 'value', agg={'sum': ('value', 'sum')})
        chrom  start  end  value  sum
    0   chr1      0  300      1    3
    1   chr1    300  600      2    6

    See Also
    --------
    mark_runs
    """
    ck, sk, ek = _get_default_colnames() if cols is None else cols

    if agg is None:
        agg = {}

    # Pick a scratch column name that cannot clobber a column the caller
    # already has (and may want to aggregate).
    run_col = '_run'
    while run_col in df.columns:
        run_col = '_' + run_col

    # Run numbers must be unique across chromosomes so that grouping on them
    # never merges runs from different chromosomes, hence no counter reset.
    # The resolved column names are forwarded so custom ``cols`` are honoured.
    df_runs = mark_runs(
        df,
        col,
        allow_overlaps=allow_overlaps,
        reset_counter=False,
        run_col=run_col,
        cols=(ck, sk, ek),
    )
    df_merged = (
        df_runs
        .groupby(run_col)
        .agg(**{
            ck: (ck, 'first'),
            sk: (sk, 'min'),
            ek: (ek, 'max'),
            col: (col, 'first'),
            **agg
        })
    )
    return df_merged.reset_index(drop=True)
78 changes: 78 additions & 0 deletions tests/test_extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,3 +292,81 @@ def test_pair_by_distance():
bioframe.pair_by_distance(
df, min_sep=0, max_sep=9, min_intervening=10, max_intervening=9
)


def test_mark_merge_runs():
    """Check run marking and run merging on non-overlapping intervals."""
    records = [
        ["chr1", 85563, 129897, "c", 0.1],
        ["chr1", 434858, 508340, "c", 0.8],
        ["chr1", 586303, 620904, "c", 0.5],
        ["chr1", 652861, 688020, "c", 0.7],
        ["chr1", 818801, 858415, "b", 0.8],

        ["chr2", 548402, 639680, "a", 0.6],
        ["chr2", 970541, 1026586, "b", 0.8],

        ["chr3", 260538, 272930, "c", 0.5],
        ["chr3", 460071, 470969, "c", 0.5],
        ["chr3", 487568, 502336, "c", 0.5],
    ]
    df1 = pd.DataFrame(
        records, columns=["chrom", "start", "end", "name", "score"]
    )

    # Run numbers shared by the two calls that keep counting across
    # chromosomes.
    global_run_ids = np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4])

    # Default behaviour: the counter restarts on every chromosome.
    marked = bioframe.mark_runs(df1, "name")
    expected_names = np.array(
        ["c", "c", "c", "c", "b", "a", "b", "c", "c", "c"]
    )
    assert np.array_equal(marked["name"].to_numpy(), expected_names)
    assert np.array_equal(
        marked["run"].to_numpy(),
        np.array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0]),
    )

    # Without a reset the run numbers keep increasing across chromosomes.
    marked = bioframe.mark_runs(df1, "name", reset_counter=False)
    assert np.array_equal(marked["run"].to_numpy(), global_run_ids)

    # The output column name is configurable.
    marked = bioframe.mark_runs(
        df1, "name", run_col="foo", reset_counter=False
    )
    assert np.array_equal(marked["foo"].to_numpy(), global_run_ids)

    # Merging collapses each run to one row and applies the aggregation.
    merged = bioframe.merge_runs(
        df1, "name", agg={"score_mean": ("score", "mean")}
    )
    assert np.array_equal(
        merged["name"].to_numpy(),
        np.array(["c", "b", "a", "b", "c"]),
    )
    expected_means = np.array([0.525, 0.8, 0.6, 0.8, 0.5])
    assert np.allclose(merged["score_mean"].to_numpy(), expected_means)


def test_mark_merge_runs__with_overlaps():
    """Check that overlapping intervals are rejected unless allowed."""
    records = [
        ["chr1", 85563, 129897, "c", 0.1],
        ["chr1", 434858, 508340, "c", 0.8],
        ["chr1", 586303, 620904, "c", 0.5],
        ["chr1", 652861, 688020, "c", 0.7],
        ["chr1", 818801, 858415, "b", 0.8],
        ["chr1", 800000, 900000, "b", 0.8],

        ["chr2", 548402, 639680, "a", 0.6],
        ["chr2", 970541, 1026586, "b", 0.8],

        ["chr3", 260538, 272930, "c", 0.5],
        ["chr3", 460071, 470969, "c", 0.5],
        ["chr3", 487568, 502336, "c", 0.5],
    ]
    df1 = pd.DataFrame(
        records, columns=["chrom", "start", "end", "name", "score"]
    )

    # The last two chr1 intervals overlap, so the default call must fail.
    with pytest.raises(ValueError):
        bioframe.mark_runs(df1, "name")

    # Opting in to overlaps returns the intervals in sorted order.
    marked = bioframe.mark_runs(df1, "name", allow_overlaps=True)
    expected_names = np.array(
        ["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"]
    )
    assert np.array_equal(marked["name"].to_numpy(), expected_names)