From 1fb78e3cce0eef798cbc8c0e1752bd9091705f88 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 20 Jun 2023 19:42:58 -0400 Subject: [PATCH 1/9] Add mark_runs and compress_runs --- bioframe/extras.py | 134 +++++++++++++++++++++++++++++++++++++++++++ tests/test_extras.py | 78 +++++++++++++++++++++++++ 2 files changed, 212 insertions(+) diff --git a/bioframe/extras.py b/bioframe/extras.py index 5569c3c7..efb6fd15 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -1,4 +1,6 @@ +from typing import Optional, Tuple + import numpy as np import pandas as pd @@ -14,6 +16,8 @@ "seq_gc", "frac_gene_coverage", "pair_by_distance", + "mark_runs", + "compress_runs" ] @@ -499,3 +503,133 @@ def pair_by_distance( ) return pd.concat([left_ivals, right_ivals], axis=1) + + +def mark_runs( + df: pd.DataFrame, + col: str, + *, + allow_overlaps: bool = False, + reset_counter: bool = True, + run_col: str = 'run', + cols: Optional[Tuple[str, str, str]] = None, +) -> pd.DataFrame: + """ + Mark runs of consecutive intervals sharing the same value of ``col``. + + Parameters + ---------- + df : DataFrame + A bioframe dataframe. + col : str + The column to mark runs of values for. + allow_overlaps : bool, optional [default: False] + If True, allow intervals in ``df`` to overlap. This may cause + unexpected results. + reset_counter : bool, optional [default: True] + If True, reset the run counter for each chromosome. + run_col : str, optional [default: 'run'] + The name of the column to store the run numbers in. + + Returns + ------- + DataFrame + A sorted copy the input dataframe with an additional column 'run' + marking runs of values in the input column. + """ + ck, _, _ = _get_default_colnames() if cols is None else cols + + if not allow_overlaps and len(ops.overlap(df, df)) > len(df): + raise ValueError("Not a proper bedGraph: found overlapping intervals.") + + result = [] + where = np.flatnonzero + n_runs = 0 + + for _, group in df.groupby(ck, sort=False): + group = ops.sort_bedframe(group, reset_index=False) + + # Find runs of values + values = group[col].to_numpy() + isnumeric = np.issubdtype(values.dtype, np.number) + + if isnumeric: + run_starts = np.r_[ + 0, + where(~np.isclose(values[1:], values[:-1], equal_nan=True)) + 1 + ] + else: + run_starts = np.r_[0, where(values[1:] != values[:-1]) + 1] + + run_lengths = np.diff(np.r_[run_starts, len(values)]) + run_ends = run_starts + run_lengths + + # Assign run numbers to intervals + if reset_counter: + n_runs = 0 + group[run_col] = pd.NA + j = group.columns.get_loc(run_col) + for lo, hi in zip(run_starts, run_ends): + group.iloc[lo : hi + 1, j] = n_runs + n_runs += 1 + + result.append(group) + + return pd.concat(result) + + +def compress_runs( + df: pd.DataFrame, + col: str, + *, + allow_overlaps: bool = False, + agg: Optional[dict] = None, + cols: Optional[Tuple[str, str, str]] = None, +) -> pd.DataFrame: + """ + Compress runs of consecutive intervals sharing the same value of ``col``. + + Parameters + ---------- + df : DataFrame + A bioframe dataframe. + col : str + The column to compress runs of values for. + allow_overlaps : bool, optional [default: False] + If True, allow intervals in ``df`` to overlap. This may cause + unexpected results. + agg : dict, optional [default: None] + A dictionary of additional column names and aggregation functions to + apply to each run. 
Takes the format: + {'agg_name': ('column_name', 'agg_func')} + + Returns + ------- + DataFrame + A sorted copy the input dataframe with runs of values in the input + column compressed. + """ + ck, sk, ek = _get_default_colnames() if cols is None else cols + + if agg is None: + agg = {} + + df_runs = mark_runs( + df, + col, + allow_overlaps=allow_overlaps, + reset_counter=False, + run_col='_run', + ) + df_compressed = ( + df_runs + .groupby('_run') + .agg(**{ + ck: (ck, 'first'), + sk: (sk, 'min'), + ek: (ek, 'max'), + col: (col, 'first'), + **agg + }) + ) + return df_compressed.reset_index(drop=True) diff --git a/tests/test_extras.py b/tests/test_extras.py index c7e47a19..041a415d 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -292,3 +292,81 @@ def test_pair_by_distance(): bioframe.pair_by_distance( df, min_sep=0, max_sep=9, min_intervening=10, max_intervening=9 ) + + +def test_mark_compress_runs(): + df1 = pd.DataFrame([ + ["chr1", 85563, 129897, "c", 0.1], + ["chr1", 434858, 508340, "c", 0.8], + ["chr1", 586303, 620904, "c", 0.5], + ["chr1", 652861, 688020, "c", 0.7], + ["chr1", 818801, 858415, "b", 0.8], + + ["chr2", 548402, 639680, "a", 0.6], + ["chr2", 970541, 1026586, "b", 0.8], + + ["chr3", 260538, 272930, "c", 0.5], + ["chr3", 460071, 470969, "c", 0.5], + ["chr3", 487568, 502336, "c", 0.5], + ], columns=["chrom", "start", "end", "name", "score"]) + + runs = bioframe.mark_runs(df1, "name") + assert ( + runs["name"].to_numpy() + == np.array(["c", "c", "c", "c", "b", "a", "b", "c", "c", "c"]) + ).all() + assert ( + runs["run"].to_numpy() + == np.array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0]) + ).all() + + runs = bioframe.mark_runs(df1, "name", reset_counter=False) + assert ( + runs["run"].to_numpy() + == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + ).all() + + runs = bioframe.mark_runs(df1, "name", run_col="foo", reset_counter=False) + assert ( + runs["foo"].to_numpy() + == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + ).all() + + compressed = bioframe.compress_runs( + df1, "name", agg={"score_mean": ("score", "mean")} + ) + assert ( + compressed["name"].to_numpy() + == np.array(["c", "b", "a", "b", "c"]) + ).all() + assert np.allclose( + compressed["score_mean"].to_numpy(), + np.array([0.525, 0.8, 0.6, 0.8, 0.5]), + ) + + +def test_mark_compress_runs__with_overlaps(): + df1 = pd.DataFrame([ + ["chr1", 85563, 129897, "c", 0.1], + ["chr1", 434858, 508340, "c", 0.8], + ["chr1", 586303, 620904, "c", 0.5], + ["chr1", 652861, 688020, "c", 0.7], + ["chr1", 818801, 858415, "b", 0.8], + ["chr1", 800000, 900000, "b", 0.8], + + ["chr2", 548402, 639680, "a", 0.6], + ["chr2", 970541, 1026586, "b", 0.8], + + ["chr3", 260538, 272930, "c", 0.5], + ["chr3", 460071, 470969, "c", 0.5], + ["chr3", 487568, 502336, "c", 0.5], + ], columns=["chrom", "start", "end", "name", "score"]) + + with pytest.raises(ValueError): + bioframe.mark_runs(df1, "name") + + runs = bioframe.mark_runs(df1, "name", allow_overlaps=True) + assert ( + runs["name"].to_numpy() + == np.array(["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"]) + ).all() From 023f2ad3b5b0b1b2fb55678b07ca23ab0aab2569 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 20 Jun 2023 19:56:31 -0400 Subject: [PATCH 2/9] Update docstring with examples --- bioframe/extras.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index efb6fd15..4f6210b7 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -515,7 +515,8 @@ def mark_runs( cols: 
Optional[Tuple[str, str, str]] = None, ) -> pd.DataFrame: """ - Mark runs of consecutive intervals sharing the same value of ``col``. + Mark runs of immediately consecutive intervals sharing the same value of + ``col``. Parameters ---------- @@ -533,8 +534,8 @@ def mark_runs( Returns ------- - DataFrame - A sorted copy the input dataframe with an additional column 'run' + pandas.DataFrame + A reordered copy the input dataframe with an additional column 'run' marking runs of values in the input column. """ ck, _, _ = _get_default_colnames() if cols is None else cols @@ -587,7 +588,8 @@ def compress_runs( cols: Optional[Tuple[str, str, str]] = None, ) -> pd.DataFrame: """ - Compress runs of consecutive intervals sharing the same value of ``col``. + Merge runs of immediately consecutive intervals sharing the same value of + ``col``. Parameters ---------- @@ -605,9 +607,27 @@ def compress_runs( Returns ------- - DataFrame - A sorted copy the input dataframe with runs of values in the input - column compressed. + pandas.DataFrame + Dataframe with consecutive intervals in the same run merged. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'chrom': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'start': [0, 100, 200, 300, 400, 500], + ... 'end': [100, 200, 300, 400, 500, 600], + ... 'value': [1, 1, 1, 2, 2, 2], + ... }) + + >>> compress_runs(df, 'value') + chrom start end value + 0 chr1 0 300 1 + 1 chr1 300 600 2 + + >>> compress_runs(df, 'value', agg={'sum': ('value', 'sum')}) + chrom start end value sum + 0 chr1 0 300 1 3 + 1 chr1 300 600 2 6 """ ck, sk, ek = _get_default_colnames() if cols is None else cols From 074828d28fdcaf7126ba5814279ba00c10838b23 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 20 Jun 2023 22:16:51 -0400 Subject: [PATCH 3/9] refactor: Make sort more explicit --- bioframe/extras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 4f6210b7..503180a4 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -538,7 +538,7 @@ def mark_runs( A reordered copy the input dataframe with an additional column 'run' marking runs of values in the input column. """ - ck, _, _ = _get_default_colnames() if cols is None else cols + ck, sk, ek = _get_default_colnames() if cols is None else cols if not allow_overlaps and len(ops.overlap(df, df)) > len(df): raise ValueError("Not a proper bedGraph: found overlapping intervals.") @@ -548,7 +548,7 @@ def mark_runs( n_runs = 0 for _, group in df.groupby(ck, sort=False): - group = ops.sort_bedframe(group, reset_index=False) + group = group.sort_values([sk, ek]) # Find runs of values values = group[col].to_numpy() From 10d8384e64bef596793d231cc01c0fe982cc3238 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 21 Jun 2023 19:09:00 -0400 Subject: [PATCH 4/9] Rename compress_runs to merge_runs --- bioframe/extras.py | 20 ++++++++++++++------ tests/test_extras.py | 10 +++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 503180a4..5126cab4 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -17,7 +17,7 @@ "frac_gene_coverage", "pair_by_distance", "mark_runs", - "compress_runs" + "merge_runs" ] @@ -537,6 +537,10 @@ def mark_runs( pandas.DataFrame A reordered copy the input dataframe with an additional column 'run' marking runs of values in the input column. 
+ + See Also + -------- + merge_runs """ ck, sk, ek = _get_default_colnames() if cols is None else cols @@ -579,7 +583,7 @@ def mark_runs( return pd.concat(result) -def compress_runs( +def merge_runs( df: pd.DataFrame, col: str, *, @@ -619,15 +623,19 @@ def compress_runs( ... 'value': [1, 1, 1, 2, 2, 2], ... }) - >>> compress_runs(df, 'value') + >>> merge_runs(df, 'value') chrom start end value 0 chr1 0 300 1 1 chr1 300 600 2 - >>> compress_runs(df, 'value', agg={'sum': ('value', 'sum')}) + >>> merge_runs(df, 'value', agg={'sum': ('value', 'sum')}) chrom start end value sum 0 chr1 0 300 1 3 1 chr1 300 600 2 6 + + See Also + -------- + mark_runs """ ck, sk, ek = _get_default_colnames() if cols is None else cols @@ -641,7 +649,7 @@ def compress_runs( reset_counter=False, run_col='_run', ) - df_compressed = ( + df_merged = ( df_runs .groupby('_run') .agg(**{ @@ -652,4 +660,4 @@ def compress_runs( **agg }) ) - return df_compressed.reset_index(drop=True) + return df_merged.reset_index(drop=True) diff --git a/tests/test_extras.py b/tests/test_extras.py index 041a415d..35c370c5 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -294,7 +294,7 @@ def test_pair_by_distance(): ) -def test_mark_compress_runs(): +def test_mark_merge_runs(): df1 = pd.DataFrame([ ["chr1", 85563, 129897, "c", 0.1], ["chr1", 434858, 508340, "c", 0.8], @@ -332,20 +332,20 @@ def test_mark_compress_runs(): == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) ).all() - compressed = bioframe.compress_runs( + merged = bioframe.merge_runs( df1, "name", agg={"score_mean": ("score", "mean")} ) assert ( - compressed["name"].to_numpy() + merged["name"].to_numpy() == np.array(["c", "b", "a", "b", "c"]) ).all() assert np.allclose( - compressed["score_mean"].to_numpy(), + merged["score_mean"].to_numpy(), np.array([0.525, 0.8, 0.6, 0.8, 0.5]), ) -def test_mark_compress_runs__with_overlaps(): +def test_mark_merge_runs__with_overlaps(): df1 = pd.DataFrame([ ["chr1", 85563, 129897, "c", 0.1], ["chr1", 434858, 508340, "c", 0.8], From 60d5e8d6afc8c40f34ecc25b59f7898f2b3c1c93 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 22 Jun 2023 10:03:46 -0400 Subject: [PATCH 5/9] fix: Account for interval contiguity --- bioframe/extras.py | 20 ++++++++------ tests/test_extras.py | 66 +++++++++++++++++++++++++++++--------------- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 5126cab4..648cf8ea 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -554,18 +554,22 @@ def mark_runs( for _, group in df.groupby(ck, sort=False): group = group.sort_values([sk, ek]) - # Find runs of values - values = group[col].to_numpy() - isnumeric = np.issubdtype(values.dtype, np.number) + # Find boundaries of consecutive bookended intervals + starts = group[sk].to_numpy() + ends = group[ek].to_numpy() + is_next_run_break = np.r_[starts[1:] != ends[:-1], False] - if isnumeric: - run_starts = np.r_[ - 0, - where(~np.isclose(values[1:], values[:-1], equal_nan=True)) + 1 + # Find boundaries of consecutive equal values + values = group[col].to_numpy() + if values.dtype.kind == 'f': + is_next_val_break = np.r_[ + ~np.isclose(values[1:], values[:-1], equal_nan=True), False ] else: - run_starts = np.r_[0, where(values[1:] != values[:-1]) + 1] + is_next_val_break = np.r_[values[1:] != values[:-1], False] + # Find run index extents + run_starts = np.r_[0, where(is_next_val_break | is_next_run_break) + 1] run_lengths = np.diff(np.r_[run_starts, len(values)]) run_ends = run_starts + run_lengths diff --git 
a/tests/test_extras.py b/tests/test_extras.py index 35c370c5..e3181288 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -296,18 +296,26 @@ def test_pair_by_distance(): def test_mark_merge_runs(): df1 = pd.DataFrame([ - ["chr1", 85563, 129897, "c", 0.1], - ["chr1", 434858, 508340, "c", 0.8], - ["chr1", 586303, 620904, "c", 0.5], - ["chr1", 652861, 688020, "c", 0.7], - ["chr1", 818801, 858415, "b", 0.8], + # chr1 + # consecutive run of "c" + ["chr1", 85563, 129897, "c", 0.2], + ["chr1", 129897, 508340, "c", 0.8], + ["chr1", 508340, 620903, "c", 0.5], + # singleton run of "c" separated by 1bp from previous run + ["chr1", 620904, 688020, "c", 0.7], + + # consecutive with previous interval but different value of "name" + ["chr1", 688020, 858415, "b", 0.8], + + # chr2 ["chr2", 548402, 639680, "a", 0.6], - ["chr2", 970541, 1026586, "b", 0.8], + ["chr2", 639680, 1026586, "b", 0.8], + # chr3 ["chr3", 260538, 272930, "c", 0.5], - ["chr3", 460071, 470969, "c", 0.5], - ["chr3", 487568, 502336, "c", 0.5], + ["chr3", 272930, 470969, "c", 0.5], + ["chr3", 470969, 502336, "c", 0.5], ], columns=["chrom", "start", "end", "name", "score"]) runs = bioframe.mark_runs(df1, "name") @@ -317,19 +325,19 @@ def test_mark_merge_runs(): ).all() assert ( runs["run"].to_numpy() - == np.array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0]) + == np.array([0, 0, 0, 1, 2, 0, 1, 0, 0, 0]) ).all() runs = bioframe.mark_runs(df1, "name", reset_counter=False) assert ( runs["run"].to_numpy() - == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5]) ).all() runs = bioframe.mark_runs(df1, "name", run_col="foo", reset_counter=False) assert ( runs["foo"].to_numpy() - == np.array([0, 0, 0, 0, 1, 2, 3, 4, 4, 4]) + == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5]) ).all() merged = bioframe.merge_runs( @@ -337,29 +345,39 @@ def test_mark_merge_runs(): ) assert ( merged["name"].to_numpy() - == np.array(["c", "b", "a", "b", "c"]) + == np.array(["c", "c", "b", "a", "b", "c"]) ).all() assert np.allclose( merged["score_mean"].to_numpy(), - np.array([0.525, 0.8, 0.6, 0.8, 0.5]), + np.array([0.5, 0.7, 0.8, 0.6, 0.8, 0.5]), ) def test_mark_merge_runs__with_overlaps(): df1 = pd.DataFrame([ - ["chr1", 85563, 129897, "c", 0.1], - ["chr1", 434858, 508340, "c", 0.8], - ["chr1", 586303, 620904, "c", 0.5], - ["chr1", 652861, 688020, "c", 0.7], - ["chr1", 818801, 858415, "b", 0.8], - ["chr1", 800000, 900000, "b", 0.8], + # chr1 + # consecutive run of "c" + ["chr1", 85563, 129897, "c", 0.2], + ["chr1", 129897, 508340, "c", 0.8], + ["chr1", 508340, 620903, "c", 0.5], + + # singleton run of "c" separated by 1bp from previous run + ["chr1", 620904, 688020, "c", 0.7], + + # consecutive with previous interval but different value of "name" + ["chr1", 688020, 858415, "b", 0.8], + # overlapping with previous interval + ["chr1", 700000, 900000, "b", 0.8], + + # chr2 ["chr2", 548402, 639680, "a", 0.6], - ["chr2", 970541, 1026586, "b", 0.8], + ["chr2", 639680, 1026586, "b", 0.8], + # chr3 ["chr3", 260538, 272930, "c", 0.5], - ["chr3", 460071, 470969, "c", 0.5], - ["chr3", 487568, 502336, "c", 0.5], + ["chr3", 272930, 470969, "c", 0.5], + ["chr3", 470969, 502336, "c", 0.5], ], columns=["chrom", "start", "end", "name", "score"]) with pytest.raises(ValueError): @@ -370,3 +388,7 @@ def test_mark_merge_runs__with_overlaps(): runs["name"].to_numpy() == np.array(["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"]) ).all() + assert ( + runs["run"].to_numpy() + == np.array([0, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0]) + ).all() From 
8b19a9d409468cbafaa595ec94982e4d619b17c8 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 22 Jun 2023 12:25:37 -0400 Subject: [PATCH 6/9] Algorithm generalized to find clusters in case allow_overlaps is True --- bioframe/extras.py | 34 ++++++++++++++++++---------------- tests/test_extras.py | 3 +-- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 648cf8ea..4860cdac 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -553,34 +553,36 @@ def mark_runs( for _, group in df.groupby(ck, sort=False): group = group.sort_values([sk, ek]) - - # Find boundaries of consecutive bookended intervals starts = group[sk].to_numpy() ends = group[ek].to_numpy() - is_next_run_break = np.r_[starts[1:] != ends[:-1], False] - # Find boundaries of consecutive equal values + # Extend ends by running max + ends = np.maximum.accumulate(ends) + + # Find borders of interval clusters and assign cluster ids + is_cluster_border = np.r_[True, starts[1:] > ends[:-1] + 0, False] + + # Find borders of consecutive equal values values = group[col].to_numpy() if values.dtype.kind == 'f': - is_next_val_break = np.r_[ - ~np.isclose(values[1:], values[:-1], equal_nan=True), False + is_value_border = np.r_[ + True, + ~np.isclose(values[1:], values[:-1], equal_nan=True), + False ] else: - is_next_val_break = np.r_[values[1:] != values[:-1], False] + is_value_border = np.r_[True, values[1:] != values[:-1], False] - # Find run index extents - run_starts = np.r_[0, where(is_next_val_break | is_next_run_break) + 1] - run_lengths = np.diff(np.r_[run_starts, len(values)]) - run_ends = run_starts + run_lengths + # Find index extents of runs + is_border = is_cluster_border | is_value_border + sum_borders = np.cumsum(is_border) + run_ids = sum_borders[:-1] - 1 # Assign run numbers to intervals if reset_counter: n_runs = 0 - group[run_col] = pd.NA - j = group.columns.get_loc(run_col) - for lo, hi in zip(run_starts, run_ends): - group.iloc[lo : hi + 1, j] = n_runs - n_runs += 1 + group[run_col] = n_runs + run_ids + n_runs += sum_borders[-1] result.append(group) diff --git a/tests/test_extras.py b/tests/test_extras.py index e3181288..1ca68a8c 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -366,7 +366,6 @@ def test_mark_merge_runs__with_overlaps(): # consecutive with previous interval but different value of "name" ["chr1", 688020, 858415, "b", 0.8], - # overlapping with previous interval ["chr1", 700000, 900000, "b", 0.8], @@ -390,5 +389,5 @@ def test_mark_merge_runs__with_overlaps(): ).all() assert ( runs["run"].to_numpy() - == np.array([0, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0]) + == np.array([0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0]) ).all() From e6a1d2b1b8bd17ce2bd6f013fe87bd50608b7b43 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 22 Jun 2023 13:41:23 -0400 Subject: [PATCH 7/9] Fix linting issue --- bioframe/extras.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bioframe/extras.py b/bioframe/extras.py index 4860cdac..a60e3c5a 100644 --- a/bioframe/extras.py +++ b/bioframe/extras.py @@ -548,7 +548,6 @@ def mark_runs( raise ValueError("Not a proper bedGraph: found overlapping intervals.") result = [] - where = np.flatnonzero n_runs = 0 for _, group in df.groupby(ck, sort=False): @@ -560,7 +559,7 @@ def mark_runs( ends = np.maximum.accumulate(ends) # Find borders of interval clusters and assign cluster ids - is_cluster_border = np.r_[True, starts[1:] > ends[:-1] + 0, False] + is_cluster_border = np.r_[True, starts[1:] > ends[:-1], False] 
# Find borders of consecutive equal values values = group[col].to_numpy() From 7f89ea14dc09ea2dc06edc9dbcaec6eb1f732369 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 18 Oct 2024 06:23:02 -0400 Subject: [PATCH 8/9] maint: Add python-requires to pyproject and pre-commit to dev extra --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 0ab16d0d..f1e7dcca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] readme = "README.md" +requires-python = ">=3.8" dependencies = [ "matplotlib", "numpy>=1.10, <2", @@ -49,6 +50,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "biopython", + "pre-commit", "pysam", "pybbi", "pytest", From 942b46ada71d64d82c9e9d9458ac9d213a30b964 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 18 Oct 2024 06:44:56 -0400 Subject: [PATCH 9/9] ci: Remove unnecessary lint step from test action --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 251b1be1..78449d27 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,6 +27,4 @@ jobs: - run: | python -m pip install --upgrade pip hatch pip install -e .[dev] - # Stop the build if there are Python syntax errors or undefined names - ruff . --select=E9,F63,F7,F82 --show-source hatch run test
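
For reference, here is a minimal usage sketch of the API these patches converge on (mark_runs plus the merge_runs name adopted in PATCH 4/9). The input frame and the expected outputs are taken from the docstring example added in PATCH 2/9; the printed results are what the fully patched code should produce, not output captured from a run.

import pandas as pd
import bioframe

df = pd.DataFrame({
    "chrom": ["chr1"] * 6,
    "start": [0, 100, 200, 300, 400, 500],
    "end":   [100, 200, 300, 400, 500, 600],
    "value": [1, 1, 1, 2, 2, 2],
})

# Label each interval with the run it belongs to; by default the run
# counter restarts on each chromosome (reset_counter=True).
marked = bioframe.mark_runs(df, "value")
print(marked["run"].tolist())   # [0, 0, 0, 1, 1, 1]

# The counter can also span chromosomes, and the output column can be
# renamed, as exercised in the tests:
#   bioframe.mark_runs(df, "value", reset_counter=False, run_col="foo")

# Collapse each run into a single interval, optionally adding named
# aggregations of the form {'out_col': ('in_col', 'agg_func')}.
merged = bioframe.merge_runs(df, "value", agg={"sum": ("value", "sum")})
print(merged)
#   chrom  start  end  value  sum
# 0  chr1      0  300      1    3
# 1  chr1    300  600      2    6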
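
The boundary logic introduced in PATCH 5/9 and generalized in PATCH 6/9 can also be followed in isolation with plain NumPy. The sketch below is an illustration only, not part of the patches: it reproduces the chr1 block of test_mark_merge_runs__with_overlaps and derives the same run ids the test expects for those six intervals.

import numpy as np

# chr1 intervals from test_mark_merge_runs__with_overlaps (already sorted).
starts = np.array([85563, 129897, 508340, 620904, 688020, 700000])
ends   = np.array([129897, 508340, 620903, 688020, 858415, 900000])
names  = np.array(["c", "c", "c", "c", "b", "b"])

# A new cluster begins wherever a start exceeds the running maximum of all
# preceding ends (this is what allows overlapping intervals to share a run).
ends_cummax = np.maximum.accumulate(ends)
is_cluster_border = np.r_[True, starts[1:] > ends_cummax[:-1], False]

# A new run also begins wherever the tracked value changes; the patch uses
# np.isclose(..., equal_nan=True) instead when the column is float-typed.
is_value_border = np.r_[True, names[1:] != names[:-1], False]

# Run ids are the cumulative count of borders seen so far, zero-based.
is_border = is_cluster_border | is_value_border
run_ids = np.cumsum(is_border)[:-1] - 1
print(run_ids)  # [0 0 0 1 2 2] -- the chr1 portion of the expected runs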