open2c · nvictus · Oct 18, 2024 · Jun 20, 2023 · Jun 20, 2023 · Jun 21, 2023
diff --git a/bioframe/extras.py b/bioframe/extras.py
@@ -1,4 +1,6 @@
 
+from typing import Optional, Tuple
+
 import numpy as np
 import pandas as pd
 
@@ -14,6 +16,8 @@
     "seq_gc",
     "frac_gene_coverage",
     "pair_by_distance",
+    "mark_runs",
+    "merge_runs"
 ]
 
 
@@ -499,3 +503,161 @@ def pair_by_distance(
     )
 
     return pd.concat([left_ivals, right_ivals], axis=1)
+
+
+def mark_runs(
+    df: pd.DataFrame,
+    col: str,
+    *,
+    allow_overlaps: bool = False,
+    reset_counter: bool = True,
+    run_col: str = 'run',
+    cols: Optional[Tuple[str, str, str]] = None,
+) -> pd.DataFrame:
+    """
+    Mark runs of immediately consecutive intervals sharing the same value of
+    ``col``.
+
+    Intervals are processed one chromosome at a time, sorted by start and
+    end. Within a chromosome, a new run begins whenever the value of ``col``
+    differs from the value of the preceding interval. Numeric values are
+    compared with ``numpy.isclose`` (NaNs compare equal to each other); all
+    other values are compared with ``!=``.
+
+    Parameters
+    ----------
+    df : DataFrame
+        A bioframe dataframe.
+    col : str
+        The column to mark runs of values for.
+    allow_overlaps : bool, optional [default: False]
+        If True, allow intervals in ``df`` to overlap. This may cause
+        unexpected results.
+    reset_counter : bool, optional [default: True]
+        If True, reset the run counter for each chromosome. If False, run
+        numbers keep increasing across chromosomes.
+    run_col : str, optional [default: 'run']
+        The name of the column to store the run numbers in.
+    cols : (str, str, str) or None, optional [default: None]
+        Names of the chromosome, start and end columns. When None, the
+        package default column names are used.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A reordered copy of the input dataframe with an additional column
+        ``run_col`` holding the integer run number of each interval.
+        Chromosomes appear in order of first occurrence in ``df``; rows are
+        sorted by start and end within each chromosome.
+
+    Raises
+    ------
+    ValueError
+        If ``allow_overlaps`` is False and overlapping intervals are found.
+
+    See Also
+    --------
+    merge_runs
+    """
+    ck, sk, ek = _get_default_colnames() if cols is None else cols
+
+    # A self-overlap that yields more rows than the input is taken as the
+    # sign of overlapping intervals. ``cols`` is forwarded so that custom
+    # column names are honoured by the check as well (None -> defaults).
+    if not allow_overlaps and len(
+        ops.overlap(df, df, cols1=cols, cols2=cols)
+    ) > len(df):
+        raise ValueError("Not a proper bedGraph: found overlapping intervals.")
+
+    result = []
+    n_runs = 0
+
+    for _, group in df.groupby(ck, sort=False):
+        group = group.sort_values([sk, ek])
+
+        # Flag every position whose value differs from its predecessor.
+        values = group[col].to_numpy()
+        if np.issubdtype(values.dtype, np.number):
+            changed = ~np.isclose(values[1:], values[:-1], equal_nan=True)
+        else:
+            changed = values[1:] != values[:-1]
+
+        # The cumulative count of value changes is the 0-based run index of
+        # each interval within this chromosome.
+        run_ids = np.cumsum(np.r_[False, changed], dtype=np.int64)
+
+        # Assign run numbers to intervals
+        if reset_counter:
+            n_runs = 0
+        group[run_col] = n_runs + run_ids
+        n_runs += int(run_ids[-1]) + 1
+
+        result.append(group)
+
+    if not result:
+        # No chromosome groups at all (e.g. empty input): pd.concat would
+        # raise on an empty list, so return an empty frame with the run
+        # column present instead.
+        empty = df.iloc[:0].copy()
+        empty[run_col] = np.array([], dtype=np.int64)
+        return empty
+
+    return pd.concat(result)
+
+
+def merge_runs(
+    df: pd.DataFrame,
+    col: str,
+    *,
+    allow_overlaps: bool = False,
+    agg: Optional[dict] = None,
+    cols: Optional[Tuple[str, str, str]] = None,
+) -> pd.DataFrame:
+    """
+    Merge runs of immediately consecutive intervals sharing the same value of
+    ``col``.
+
+    Runs are identified with :func:`mark_runs`. Each run is collapsed into a
+    single row spanning from the smallest start to the largest end of its
+    intervals.
+
+    Parameters
+    ----------
+    df : DataFrame
+        A bioframe dataframe.
+    col : str
+        The column to compress runs of values for.
+    allow_overlaps : bool, optional [default: False]
+        If True, allow intervals in ``df`` to overlap. This may cause
+        unexpected results.
+    agg : dict, optional [default: None]
+        A dictionary of additional column names and aggregation functions to
+        apply to each run. Takes the format:
+            {'agg_name': ('column_name', 'agg_func')}
+    cols : (str, str, str) or None, optional [default: None]
+        Names of the chromosome, start and end columns. When None, the
+        package default column names are used.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Dataframe with consecutive intervals in the same run merged. It
+        contains the chromosome, start and end columns, ``col`` (first value
+        of each run) and one column per entry in ``agg``; other input columns
+        are not carried over.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({
+    ...     'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
+    ...     'start': [0, 100, 200, 300, 400, 500],
+    ...     'end': [100, 200, 300, 400, 500, 600],
+    ...     'value': [1, 1, 1, 2, 2, 2],
+    ... })
+
+    >>> merge_runs(df, 'value')
+        chrom  start  end  value
+    0   chr1      0  300      1
+    1   chr1    300  600      2
+
+    >>> merge_runs(df, 'value', agg={'sum': ('value', 'sum')})
+        chrom  start  end  value  sum
+    0   chr1      0  300      1    3
+    1   chr1    300  600      2    6
+
+    See Also
+    --------
+    mark_runs
+    """
+    ck, sk, ek = _get_default_colnames() if cols is None else cols
+
+    if agg is None:
+        agg = {}
+
+    # reset_counter=False keeps run numbers unique across chromosomes, so
+    # grouping on the temporary '_run' column never merges runs that live on
+    # different chromosomes. ``cols`` must be forwarded, otherwise runs would
+    # be marked using the default column names.
+    df_runs = mark_runs(
+        df,
+        col,
+        allow_overlaps=allow_overlaps,
+        reset_counter=False,
+        run_col='_run',
+        cols=cols,
+    )
+    df_merged = (
+        df_runs
+        .groupby('_run')
+        .agg(**{
+            ck: (ck, 'first'),
+            sk: (sk, 'min'),
+            ek: (ek, 'max'),
+            col: (col, 'first'),
+            **agg,
+        })
+    )
+    return df_merged.reset_index(drop=True)
diff --git a/tests/test_extras.py b/tests/test_extras.py
@@ -292,3 +292,81 @@ def test_pair_by_distance():
         bioframe.pair_by_distance(
             df, min_sep=0, max_sep=9, min_intervening=10, max_intervening=9
         )
+
+
+def test_mark_merge_runs():
+    """Check run marking and run merging on non-overlapping intervals."""
+    records = [
+        # chr1: four "c" intervals followed by one "b"
+        ("chr1", 85563, 129897, "c", 0.1),
+        ("chr1", 434858, 508340, "c", 0.8),
+        ("chr1", 586303, 620904, "c", 0.5),
+        ("chr1", 652861, 688020, "c", 0.7),
+        ("chr1", 818801, 858415, "b", 0.8),
+        # chr2: two singleton runs
+        ("chr2", 548402, 639680, "a", 0.6),
+        ("chr2", 970541, 1026586, "b", 0.8),
+        # chr3: a single run of three "c" intervals
+        ("chr3", 260538, 272930, "c", 0.5),
+        ("chr3", 460071, 470969, "c", 0.5),
+        ("chr3", 487568, 502336, "c", 0.5),
+    ]
+    df1 = pd.DataFrame(
+        records, columns=["chrom", "start", "end", "name", "score"]
+    )
+    global_run_ids = [0, 0, 0, 0, 1, 2, 3, 4, 4, 4]
+
+    # Default behaviour: the counter restarts on every chromosome.
+    marked = bioframe.mark_runs(df1, "name")
+    expected_names = ["c", "c", "c", "c", "b", "a", "b", "c", "c", "c"]
+    assert marked["name"].tolist() == expected_names
+    assert marked["run"].tolist() == [0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
+
+    # Without resetting, run numbers keep growing across chromosomes.
+    marked = bioframe.mark_runs(df1, "name", reset_counter=False)
+    assert marked["run"].tolist() == global_run_ids
+
+    # The output column name is configurable.
+    marked = bioframe.mark_runs(
+        df1, "name", run_col="foo", reset_counter=False
+    )
+    assert marked["foo"].tolist() == global_run_ids
+
+    # Merging collapses each run and applies the extra aggregation.
+    merged = bioframe.merge_runs(
+        df1, "name", agg={"score_mean": ("score", "mean")}
+    )
+    assert merged["name"].tolist() == ["c", "b", "a", "b", "c"]
+    expected_means = np.array([0.525, 0.8, 0.6, 0.8, 0.5])
+    assert np.allclose(merged["score_mean"].to_numpy(), expected_means)
+
+
+def test_mark_merge_runs__with_overlaps():
+    """Overlapping input is rejected unless ``allow_overlaps`` is set."""
+    records = [
+        ("chr1", 85563, 129897, "c", 0.1),
+        ("chr1", 434858, 508340, "c", 0.8),
+        ("chr1", 586303, 620904, "c", 0.5),
+        ("chr1", 652861, 688020, "c", 0.7),
+        ("chr1", 818801, 858415, "b", 0.8),
+        # This interval overlaps the previous one.
+        ("chr1", 800000, 900000, "b", 0.8),
+        ("chr2", 548402, 639680, "a", 0.6),
+        ("chr2", 970541, 1026586, "b", 0.8),
+        ("chr3", 260538, 272930, "c", 0.5),
+        ("chr3", 460071, 470969, "c", 0.5),
+        ("chr3", 487568, 502336, "c", 0.5),
+    ]
+    df1 = pd.DataFrame(
+        records, columns=["chrom", "start", "end", "name", "score"]
+    )
+
+    # Overlaps are an error by default.
+    with pytest.raises(ValueError):
+        bioframe.mark_runs(df1, "name")
+
+    # Opting in lets the call through.
+    marked = bioframe.mark_runs(df1, "name", allow_overlaps=True)
+    expected_names = ["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"]
+    assert marked["name"].tolist() == expected_names