From 1fb78e3cce0eef798cbc8c0e1752bd9091705f88 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 20 Jun 2023 19:42:58 -0400 Subject: [PATCH 1/9] Add mark_runs and compress_runs --- bioframe/extras.py | 134 +++++++++++++++++++++++++++++++++++++++++++ tests/test_extras.py | 78 +++++++++++++++++++++++++ 2 files changed, 212 insertions(+) diff --git a/bioframe/extras.py b/bioframe/extras.py index 5569c3c7..efb6fd15 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -1,4 +1,6 @@ +from typing import Optional, Tuple + import numpy as np import pandas as pd @@ -14,6 +16,8 @@ "seq_gc", "frac_gene_coverage", "pair_by_distance", + "mark_runs", + "compress_runs" ] @@ -499,3 +503,133 @@ def pair_by_distance( ) return pd.concat([left_ivals, right_ivals], axis=1) + + +def mark_runs( + df: pd.DataFrame, + col: str, + *, + allow_overlaps: bool = False, + reset_counter: bool = True, + run_col: str = 'run', + cols: Optional[Tuple[str, str, str]] = None, +) -> pd.DataFrame: + """ + Mark runs of consecutive intervals sharing the same value of ``col``. + + Parameters + ---------- + df : DataFrame + A bioframe dataframe. + col : str + The column to mark runs of values for. + allow_overlaps : bool, optional [default: False] + If True, allow intervals in ``df`` to overlap. This may cause + unexpected results. + reset_counter : bool, optional [default: True] + If True, reset the run counter for each chromosome. + run_col : str, optional [default: 'run'] + The name of the column to store the run numbers in. + + Returns + ------- + DataFrame + A sorted copy the input dataframe with an additional column 'run' + marking runs of values in the input column. + """ + ck, _, _ = _get_default_colnames() if cols is None else cols + + if not allow_overlaps and len(ops.overlap(df, df)) > len(df): + raise ValueError("Not a proper bedGraph: found overlapping intervals.") + + result = [] + where = np.flatnonzero + n_runs = 0 + + for _, group in df.groupby(ck, sort=False): + group = ops.sort_bedframe(group, reset_index=False) + + # Find runs of values + values = group[col].to_numpy() + isnumeric = np.issubdtype(values.dtype, np.number) + + if isnumeric: + run_starts = np.r_[ + 0, + where(~np.isclose(values[1:], values[:-1], equal_nan=True)) + 1 + ] + else: + run_starts = np.r_[0, where(values[1:] != values[:-1]) + 1] + + run_lengths = np.diff(np.r_[run_starts, len(values)]) + run_ends = run_starts + run_lengths + + # Assign run numbers to intervals + if reset_counter: + n_runs = 0 + group[run_col] = pd.NA + j = group.columns.get_loc(run_col) + for lo, hi in zip(run_starts, run_ends): + group.iloc[lo : hi + 1, j] = n_runs + n_runs += 1 + + result.append(group) + + return pd.concat(result) + + +def compress_runs( + df: pd.DataFrame, + col: str, + *, + allow_overlaps: bool = False, + agg: Optional[dict] = None, + cols: Optional[Tuple[str, str, str]] = None, +) -> pd.DataFrame: + """ + Compress runs of consecutive intervals sharing the same value of ``col``. + + Parameters + ---------- + df : DataFrame + A bioframe dataframe. + col : str + The column to compress runs of values for. + allow_overlaps : bool, optional [default: False] + If True, allow intervals in ``df`` to overlap. This may cause + unexpected results. + agg : dict, optional [default: None] + A dictionary of additional column names and aggregation functions to + apply to each run. 
Takes the format: + {'agg_name': ('column_name', 'agg_func')} + + Returns + ------- + DataFrame + A sorted copy the input dataframe with runs of values in the input + column compressed. + """ + ck, sk, ek = _get_default_colnames() if cols is None else cols + + if agg is None: + agg = {} + + df_runs = mark_runs( + df, + col, + allow_overlaps=allow_overlaps, + reset_counter=False, + run_col='_run', + ) + df_compressed = ( + df_runs + .groupby('_run') + .agg(**{ + ck: (ck, 'first'), + sk: (sk, 'min'), + ek: (ek, 'max'), + col: (col, 'first'), + **agg + }) + ) + return df_compressed.reset_index(drop=True) diff --git a/tests/test_extras.py b/tests/test_extras.py index c7e47a19..041a415d 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -292,3 +292,81 @@ def test_pair_by_distance(): bioframe.pair_by_distance( df, min_sep=0, max_sep=9, min_intervening=10, max_intervening=9 ) + + +def test_mark_compress_runs(): + df1 = pd.DataFrame([ + ["chr1", 85563, 129897, "c", 0.1], + ["chr1", 434858, 508340, "c", 0.8], + ["chr1", 586303, 620904, "c", 0.5], + ["chr1", 652861, 688020, "c", 0.7], + ["chr1", 818801, 858415, "b", 0.8], + + ["chr2", 548402, 639680, "a", 0.6], + ["chr2", 970541, 1026586, "b", 0.8], + + ["chr3", 260538, 272930, "c", 0.5], + ["chr3", 460071, 470969, "c", 0.5], + ["chr3", 487568, 502336, "c", 0.5], + ], columns=["chrom", "start", "end", "name", "score"]) + + runs = bioframe.mark_runs(df1, "name") + assert ( + runs["name"].to_numpy() + == np.array(["c", "c", "c", "c", "b", "a", "b", "c", "c", "c"]) + ).all() + assert ( + runs["run"].to_numpy() + == np.array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0]) + ).all() + + runs = bioframe.mark_runs(df1, "name", reset_counter=False) + assert ( + runs["run"].to_numpy() + == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + ).all() + + runs = bioframe.mark_runs(df1, "name", run_col="foo", reset_counter=False) + assert ( + runs["foo"].to_numpy() + == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + ).all() + + compressed = bioframe.compress_runs( + df1, "name", agg={"score_mean": ("score", "mean")} + ) + assert ( + compressed["name"].to_numpy() + == np.array(["c", "b", "a", "b", "c"]) + ).all() + assert np.allclose( + compressed["score_mean"].to_numpy(), + np.array([0.525, 0.8, 0.6, 0.8, 0.5]), + ) + + +def test_mark_compress_runs__with_overlaps(): + df1 = pd.DataFrame([ + ["chr1", 85563, 129897, "c", 0.1], + ["chr1", 434858, 508340, "c", 0.8], + ["chr1", 586303, 620904, "c", 0.5], + ["chr1", 652861, 688020, "c", 0.7], + ["chr1", 818801, 858415, "b", 0.8], + ["chr1", 800000, 900000, "b", 0.8], + + ["chr2", 548402, 639680, "a", 0.6], + ["chr2", 970541, 1026586, "b", 0.8], + + ["chr3", 260538, 272930, "c", 0.5], + ["chr3", 460071, 470969, "c", 0.5], + ["chr3", 487568, 502336, "c", 0.5], + ], columns=["chrom", "start", "end", "name", "score"]) + + with pytest.raises(ValueError): + bioframe.mark_runs(df1, "name") + + runs = bioframe.mark_runs(df1, "name", allow_overlaps=True) + assert ( + runs["name"].to_numpy() + == np.array(["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"]) + ).all() From 023f2ad3b5b0b1b2fb55678b07ca23ab0aab2569 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 20 Jun 2023 19:56:31 -0400 Subject: [PATCH 2/9] Update docstring with examples --- bioframe/extras.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index efb6fd15..4f6210b7 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -515,7 +515,8 @@ def mark_runs( cols: 
Optional[Tuple[str, str, str]] = None, ) -> pd.DataFrame: """ - Mark runs of consecutive intervals sharing the same value of ``col``. + Mark runs of immediately consecutive intervals sharing the same value of + ``col``. Parameters ---------- @@ -533,8 +534,8 @@ def mark_runs( Returns ------- - DataFrame - A sorted copy the input dataframe with an additional column 'run' + pandas.DataFrame + A reordered copy the input dataframe with an additional column 'run' marking runs of values in the input column. """ ck, _, _ = _get_default_colnames() if cols is None else cols @@ -587,7 +588,8 @@ def compress_runs( cols: Optional[Tuple[str, str, str]] = None, ) -> pd.DataFrame: """ - Compress runs of consecutive intervals sharing the same value of ``col``. + Merge runs of immediately consecutive intervals sharing the same value of + ``col``. Parameters ---------- @@ -605,9 +607,27 @@ def compress_runs( Returns ------- - DataFrame - A sorted copy the input dataframe with runs of values in the input - column compressed. + pandas.DataFrame + Dataframe with consecutive intervals in the same run merged. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'start': [0, 100, 200, 300, 400, 500], + ... 'end': [100, 200, 300, 400, 500, 600], + ... 'value': [1, 1, 1, 2, 2, 2], + ... }) + + >>> compress_runs(df, 'value') + chrom start end value + 0 chr1 0 300 1 + 1 chr1 300 600 2 + + >>> compress_runs(df, 'value', agg={'sum': ('value', 'sum')}) + chrom start end value sum + 0 chr1 0 300 1 3 + 1 chr1 300 600 2 6 """ ck, sk, ek = _get_default_colnames() if cols is None else cols From 074828d28fdcaf7126ba5814279ba00c10838b23 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 20 Jun 2023 22:16:51 -0400 Subject: [PATCH 3/9] refactor: Make sort more explicit --- bioframe/extras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 4f6210b7..503180a4 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -538,7 +538,7 @@ def mark_runs( A reordered copy the input dataframe with an additional column 'run' marking runs of values in the input column. """ - ck, _, _ = _get_default_colnames() if cols is None else cols + ck, sk, ek = _get_default_colnames() if cols is None else cols if not allow_overlaps and len(ops.overlap(df, df)) > len(df): raise ValueError("Not a proper bedGraph: found overlapping intervals.") @@ -548,7 +548,7 @@ def mark_runs( n_runs = 0 for _, group in df.groupby(ck, sort=False): - group = ops.sort_bedframe(group, reset_index=False) + group = group.sort_values([sk, ek]) # Find runs of values values = group[col].to_numpy() From 10d8384e64bef596793d231cc01c0fe982cc3238 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 21 Jun 2023 19:09:00 -0400 Subject: [PATCH 4/9] Rename compress_runs to merge_runs --- bioframe/extras.py | 20 ++++++++++++++------ tests/test_extras.py | 10 +++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 503180a4..5126cab4 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -17,7 +17,7 @@ "frac_gene_coverage", "pair_by_distance", "mark_runs", - "compress_runs" + "merge_runs" ] @@ -537,6 +537,10 @@ def mark_runs( pandas.DataFrame A reordered copy the input dataframe with an additional column 'run' marking runs of values in the input column. 
+ + See Also + -------- + merge_runs """ ck, sk, ek = _get_default_colnames() if cols is None else cols @@ -579,7 +583,7 @@ def mark_runs( return pd.concat(result) -def compress_runs( +def merge_runs( df: pd.DataFrame, col: str, *, @@ -619,15 +623,19 @@ def compress_runs( ... 'value': [1, 1, 1, 2, 2, 2], ... }) - >>> compress_runs(df, 'value') + >>> merge_runs(df, 'value') chrom start end value 0 chr1 0 300 1 1 chr1 300 600 2 - >>> compress_runs(df, 'value', agg={'sum': ('value', 'sum')}) + >>> merge_runs(df, 'value', agg={'sum': ('value', 'sum')}) chrom start end value sum 0 chr1 0 300 1 3 1 chr1 300 600 2 6 + + See Also + -------- + mark_runs """ ck, sk, ek = _get_default_colnames() if cols is None else cols @@ -641,7 +649,7 @@ def compress_runs( reset_counter=False, run_col='_run', ) - df_compressed = ( + df_merged = ( df_runs .groupby('_run') .agg(**{ @@ -652,4 +660,4 @@ def compress_runs( **agg }) ) - return df_compressed.reset_index(drop=True) + return df_merged.reset_index(drop=True) diff --git a/tests/test_extras.py b/tests/test_extras.py index 041a415d..35c370c5 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -294,7 +294,7 @@ def test_pair_by_distance(): ) -def test_mark_compress_runs(): +def test_mark_merge_runs(): df1 = pd.DataFrame([ ["chr1", 85563, 129897, "c", 0.1], ["chr1", 434858, 508340, "c", 0.8], @@ -332,20 +332,20 @@ def test_mark_compress_runs(): == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) ).all() - compressed = bioframe.compress_runs( + merged = bioframe.merge_runs( df1, "name", agg={"score_mean": ("score", "mean")} ) assert ( - compressed["name"].to_numpy() + merged["name"].to_numpy() == np.array(["c", "b", "a", "b", "c"]) ).all() assert np.allclose( - compressed["score_mean"].to_numpy(), + merged["score_mean"].to_numpy(), np.array([0.525, 0.8, 0.6, 0.8, 0.5]), ) -def test_mark_compress_runs__with_overlaps(): +def test_mark_merge_runs__with_overlaps(): df1 = pd.DataFrame([ ["chr1", 85563, 129897, "c", 0.1], ["chr1", 434858, 508340, "c", 0.8], From 60d5e8d6afc8c40f34ecc25b59f7898f2b3c1c93 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 22 Jun 2023 10:03:46 -0400 Subject: [PATCH 5/9] fix: Account for interval contiguity --- bioframe/extras.py | 20 ++++++++------ tests/test_extras.py | 66 +++++++++++++++++++++++++++++--------------- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 5126cab4..648cf8ea 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -554,18 +554,22 @@ def mark_runs( for _, group in df.groupby(ck, sort=False): group = group.sort_values([sk, ek]) - # Find runs of values - values = group[col].to_numpy() - isnumeric = np.issubdtype(values.dtype, np.number) + # Find boundaries of consecutive bookended intervals + starts = group[sk].to_numpy() + ends = group[ek].to_numpy() + is_next_run_break = np.r_[starts[1:] != ends[:-1], False] - if isnumeric: - run_starts = np.r_[ - 0, - where(~np.isclose(values[1:], values[:-1], equal_nan=True)) + 1 + # Find boundaries of consecutive equal values + values = group[col].to_numpy() + if values.dtype.kind == 'f': + is_next_val_break = np.r_[ + ~np.isclose(values[1:], values[:-1], equal_nan=True), False ] else: - run_starts = np.r_[0, where(values[1:] != values[:-1]) + 1] + is_next_val_break = np.r_[values[1:] != values[:-1], False] + # Find run index extents + run_starts = np.r_[0, where(is_next_val_break | is_next_run_break) + 1] run_lengths = np.diff(np.r_[run_starts, len(values)]) run_ends = run_starts + run_lengths diff --git 
a/tests/test_extras.py b/tests/test_extras.py index 35c370c5..e3181288 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -296,18 +296,26 @@ def test_pair_by_distance(): def test_mark_merge_runs(): df1 = pd.DataFrame([ - ["chr1", 85563, 129897, "c", 0.1], - ["chr1", 434858, 508340, "c", 0.8], - ["chr1", 586303, 620904, "c", 0.5], - ["chr1", 652861, 688020, "c", 0.7], - ["chr1", 818801, 858415, "b", 0.8], + # chr1 + # consecutive run of "c" + ["chr1", 85563, 129897, "c", 0.2], + ["chr1", 129897, 508340, "c", 0.8], + ["chr1", 508340, 620903, "c", 0.5], + # singleton run of "c" separated by 1bp from previous run + ["chr1", 620904, 688020, "c", 0.7], + + # consecutive with previous interval but different value of "name" + ["chr1", 688020, 858415, "b", 0.8], + + # chr2 ["chr2", 548402, 639680, "a", 0.6], - ["chr2", 970541, 1026586, "b", 0.8], + ["chr2", 639680, 1026586, "b", 0.8], + # chr3 ["chr3", 260538, 272930, "c", 0.5], - ["chr3", 460071, 470969, "c", 0.5], - ["chr3", 487568, 502336, "c", 0.5], + ["chr3", 272930, 470969, "c", 0.5], + ["chr3", 470969, 502336, "c", 0.5], ], columns=["chrom", "start", "end", "name", "score"]) runs = bioframe.mark_runs(df1, "name") @@ -317,19 +325,19 @@ def test_mark_merge_runs(): ).all() assert ( runs["run"].to_numpy() - == np.array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0]) + == np.array([0, 0, 0, 1, 2, 0, 1, 0, 0, 0]) ).all() runs = bioframe.mark_runs(df1, "name", reset_counter=False) assert ( runs["run"].to_numpy() - == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5]) ).all() runs = bioframe.mark_runs(df1, "name", run_col="foo", reset_counter=False) assert ( runs["foo"].to_numpy() - == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5]) ).all() merged = bioframe.merge_runs( @@ -337,29 +345,39 @@ def test_mark_merge_runs(): ) assert ( merged["name"].to_numpy() - == np.array(["c", "b", "a", "b", "c"]) + == np.array(["c", "c", "b", "a", "b", "c"]) ).all() assert np.allclose( merged["score_mean"].to_numpy(), - np.array([0.525, 0.8, 0.6, 0.8, 0.5]), + np.array([0.5, 0.7, 0.8, 0.6, 0.8, 0.5]), ) def test_mark_merge_runs__with_overlaps(): df1 = pd.DataFrame([ - ["chr1", 85563, 129897, "c", 0.1], - ["chr1", 434858, 508340, "c", 0.8], - ["chr1", 586303, 620904, "c", 0.5], - ["chr1", 652861, 688020, "c", 0.7], - ["chr1", 818801, 858415, "b", 0.8], - ["chr1", 800000, 900000, "b", 0.8], + # chr1 + # consecutive run of "c" + ["chr1", 85563, 129897, "c", 0.2], + ["chr1", 129897, 508340, "c", 0.8], + ["chr1", 508340, 620903, "c", 0.5], + + # singleton run of "c" separated by 1bp from previous run + ["chr1", 620904, 688020, "c", 0.7], + + # consecutive with previous interval but different value of "name" + ["chr1", 688020, 858415, "b", 0.8], + # overlapping with previous interval + ["chr1", 700000, 900000, "b", 0.8], + + # chr2 ["chr2", 548402, 639680, "a", 0.6], - ["chr2", 970541, 1026586, "b", 0.8], + ["chr2", 639680, 1026586, "b", 0.8], + # chr3 ["chr3", 260538, 272930, "c", 0.5], - ["chr3", 460071, 470969, "c", 0.5], - ["chr3", 487568, 502336, "c", 0.5], + ["chr3", 272930, 470969, "c", 0.5], + ["chr3", 470969, 502336, "c", 0.5], ], columns=["chrom", "start", "end", "name", "score"]) with pytest.raises(ValueError): @@ -370,3 +388,7 @@ def test_mark_merge_runs__with_overlaps(): runs["name"].to_numpy() == np.array(["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"]) ).all() + assert ( + runs["run"].to_numpy() + == np.array([0, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0]) + ).all() From 
8b19a9d409468cbafaa595ec94982e4d619b17c8 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 22 Jun 2023 12:25:37 -0400 Subject: [PATCH 6/9] Algorithm generalized to find clusters in case allow_overlaps is True --- bioframe/extras.py | 34 ++++++++++++++++++---------------- tests/test_extras.py | 3 +-- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 648cf8ea..4860cdac 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -553,34 +553,36 @@ def mark_runs( for _, group in df.groupby(ck, sort=False): group = group.sort_values([sk, ek]) - - # Find boundaries of consecutive bookended intervals starts = group[sk].to_numpy() ends = group[ek].to_numpy() - is_next_run_break = np.r_[starts[1:] != ends[:-1], False] - # Find boundaries of consecutive equal values + # Extend ends by running max + ends = np.maximum.accumulate(ends) + + # Find borders of interval clusters and assign cluster ids + is_cluster_border = np.r_[True, starts[1:] > ends[:-1] + 0, False] + + # Find borders of consecutive equal values values = group[col].to_numpy() if values.dtype.kind == 'f': - is_next_val_break = np.r_[ - ~np.isclose(values[1:], values[:-1], equal_nan=True), False + is_value_border = np.r_[ + True, + ~np.isclose(values[1:], values[:-1], equal_nan=True), + False ] else: - is_next_val_break = np.r_[values[1:] != values[:-1], False] + is_value_border = np.r_[True, values[1:] != values[:-1], False] - # Find run index extents - run_starts = np.r_[0, where(is_next_val_break | is_next_run_break) + 1] - run_lengths = np.diff(np.r_[run_starts, len(values)]) - run_ends = run_starts + run_lengths + # Find index extents of runs + is_border = is_cluster_border | is_value_border + sum_borders = np.cumsum(is_border) + run_ids = sum_borders[:-1] - 1 # Assign run numbers to intervals if reset_counter: n_runs = 0 - group[run_col] = pd.NA - j = group.columns.get_loc(run_col) - for lo, hi in zip(run_starts, run_ends): - group.iloc[lo : hi + 1, j] = n_runs - n_runs += 1 + group[run_col] = n_runs + run_ids + n_runs += sum_borders[-1] result.append(group) diff --git a/tests/test_extras.py b/tests/test_extras.py index e3181288..1ca68a8c 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -366,7 +366,6 @@ def test_mark_merge_runs__with_overlaps(): # consecutive with previous interval but different value of "name" ["chr1", 688020, 858415, "b", 0.8], - # overlapping with previous interval ["chr1", 700000, 900000, "b", 0.8], @@ -390,5 +389,5 @@ def test_mark_merge_runs__with_overlaps(): ).all() assert ( runs["run"].to_numpy() - == np.array([0, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0]) + == np.array([0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0]) ).all() From e6a1d2b1b8bd17ce2bd6f013fe87bd50608b7b43 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 22 Jun 2023 13:41:23 -0400 Subject: [PATCH 7/9] Fix linting issue --- bioframe/extras.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 4860cdac..a60e3c5a 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -548,7 +548,6 @@ def mark_runs( raise ValueError("Not a proper bedGraph: found overlapping intervals.") result = [] - where = np.flatnonzero n_runs = 0 for _, group in df.groupby(ck, sort=False): @@ -560,7 +559,7 @@ def mark_runs( ends = np.maximum.accumulate(ends) # Find borders of interval clusters and assign cluster ids - is_cluster_border = np.r_[True, starts[1:] > ends[:-1] + 0, False] + is_cluster_border = np.r_[True, starts[1:] > ends[:-1], False] 
# Find borders of consecutive equal values values = group[col].to_numpy() From 7f89ea14dc09ea2dc06edc9dbcaec6eb1f732369 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 18 Oct 2024 06:23:02 -0400 Subject: [PATCH 8/9] maint: Add python-requires to pyproject and pre-commit to dev extra --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 0ab16d0d..f1e7dcca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] readme = "README.md" +requires-python = ">=3.8" dependencies = [ "matplotlib", "numpy>=1.10, <2", @@ -49,6 +50,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "biopython", + "pre-commit", "pysam", "pybbi", "pytest", From 942b46ada71d64d82c9e9d9458ac9d213a30b964 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 18 Oct 2024 06:44:56 -0400 Subject: [PATCH 9/9] ci: Remove unnecessary lint step from test action --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 251b1be1..78449d27 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,6 +27,4 @@ jobs: - run: | python -m pip install --upgrade pip hatch pip install -e .[dev] - # Stop the build if there are Python syntax errors or undefined names - ruff . --select=E9,F63,F7,F82 --show-source hatch run test
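
For reference, here is a minimal usage sketch of the API these patches converge on (mark_runs plus the merge_runs name adopted in PATCH 4/9). The input frame and the expected outputs are taken from the docstring example added in PATCH 2/9; the printed results are what the fully patched code should produce, not output captured from a run.

import pandas as pd
import bioframe

df = pd.DataFrame({
    "chrom": ["chr1"] * 6,
    "start": [0, 100, 200, 300, 400, 500],
    "end":   [100, 200, 300, 400, 500, 600],
    "value": [1, 1, 1, 2, 2, 2],
})

# Label each interval with the run it belongs to; by default the run
# counter restarts on each chromosome (reset_counter=True).
marked = bioframe.mark_runs(df, "value")
print(marked["run"].tolist())   # [0, 0, 0, 1, 1, 1]

# The counter can also span chromosomes, and the output column can be
# renamed, as exercised in the tests:
#   bioframe.mark_runs(df, "value", reset_counter=False, run_col="foo")

# Collapse each run into a single interval, optionally adding named
# aggregations of the form {'out_col': ('in_col', 'agg_func')}.
merged = bioframe.merge_runs(df, "value", agg={"sum": ("value", "sum")})
print(merged)
#   chrom  start  end  value  sum
# 0  chr1      0  300      1    3
# 1  chr1    300  600      2    6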
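
The boundary logic introduced in PATCH 5/9 and generalized in PATCH 6/9 can also be followed in isolation with plain NumPy. The sketch below is an illustration only, not part of the patches: it reproduces the chr1 block of test_mark_merge_runs__with_overlaps and derives the same run ids the test expects for those six intervals.

import numpy as np

# chr1 intervals from test_mark_merge_runs__with_overlaps (already sorted).
starts = np.array([85563, 129897, 508340, 620904, 688020, 700000])
ends   = np.array([129897, 508340, 620903, 688020, 858415, 900000])
names  = np.array(["c", "c", "c", "c", "b", "b"])

# A new cluster begins wherever a start exceeds the running maximum of all
# preceding ends (this is what allows overlapping intervals to share a run).
ends_cummax = np.maximum.accumulate(ends)
is_cluster_border = np.r_[True, starts[1:] > ends_cummax[:-1], False]

# A new run also begins wherever the tracked value changes; the patch uses
# np.isclose(..., equal_nan=True) instead when the column is float-typed.
is_value_border = np.r_[True, names[1:] != names[:-1], False]

# Run ids are the cumulative count of borders seen so far, zero-based.
is_border = is_cluster_border | is_value_border
run_ids = np.cumsum(is_border)[:-1] - 1
print(run_ids)  # [0 0 0 1 2 2] -- the chr1 portion of the expected runs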