From bfac0e1dbe13deaeb5bfd99139357994780b9d2e Mon Sep 17 00:00:00 2001 From: agalitsyna Date: Tue, 6 Feb 2024 15:02:17 -0500 Subject: [PATCH] Fix of the bug of closest not returning intervals that do not have closest hits and are not separate chromosomes. Problem: If the segment does not have closest (e.g., in ignore_upstream=True setting), it will simply drop out of output table. Solution: Added checking the indexes of df1 that are absent form the final returned hits Why it was not observed before: No detiled testing for options ignore_upstream/ignore_downstream/ignore_overlap and their combinations. If df1 has a segment from a chromosome X and df2 has a segment from it, but they are in an arrangements that won't produce a hit, this segment will be dropped out of df1 output. --- bioframe/core/arrops.py | 3 +++ bioframe/ops.py | 30 +++++++++++++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py index dc191f12..c6e33350 100644 --- a/bioframe/core/arrops.py +++ b/bioframe/core/arrops.py @@ -709,6 +709,9 @@ def closest_intervals( [left_dists, right_dists, np.zeros(overlap_ids.shape[0])] ) + if len(closest_ids)==0: + return np.empty((0,2), dtype=int) + # Sort by distance to set 1 intervals and, if present, by the tie-breaking # data array. if tie_arr is None: diff --git a/bioframe/ops.py b/bioframe/ops.py index 2f567139..d7725f5f 100644 --- a/bioframe/ops.py +++ b/bioframe/ops.py @@ -1066,14 +1066,25 @@ def _closest_intidxs( direction=direction_arr, ) - # Convert local per-chromosome indices into the - # indices of the original table. - closest_idxs_group = np.vstack( - [ - df1_group_idxs.values[closest_idxs_group[:, 0]], - df2_group_idxs.values[closest_idxs_group[:, 1]], - ] - ).T + na_idxs = np.isin(np.arange(len(df1_group_idxs)), closest_idxs_group[:, 0], invert=True) + + # 1) Convert local per-chromosome indices into the + # indices of the original table, + # 2) Fill in the intervals that do not have closest values. + closest_idxs_group = np.concatenate([ + np.vstack( + [ + df1_group_idxs.values[closest_idxs_group[:, 0]], + df2_group_idxs.values[closest_idxs_group[:, 1]], + ] + ).T, + np.vstack( + [ + df1_group_idxs.values[na_idxs], + -1 * np.ones_like(df1_group_idxs.values[na_idxs]), + ] + ).T + ]) closest_intidxs.append(closest_idxs_group) @@ -1220,9 +1231,6 @@ def closest( ) na_mask = closest_df_idxs[:, 1] == -1 - if len(closest_df_idxs) == 0: - return # case of no closest intervals - # Generate output tables. df_index_1 = None df_index_2 = None