Fix of the bug of closest not returning intervals that do not have cl…

…osest hits and are not separate chromosomes. Problem: If the segment does not have closest (e.g., in ignore_upstream=True setting), it will simply drop out of output table. Solution: Added checking the indexes of df1 that are absent form the final returned hits Why it was not observed before: No detiled testing for options ignore_upstream/ignore_downstream/ignore_overlap and their combinations. If df1 has a segment from a chromosome X and df2 has a segment from it, but they are in an arrangements that won't produce a hit, this segment will be dropped out of df1 output.
open2c · Feb 6, 2024 · bfac0e1 · bfac0e1
1 parent bf0416a
commit bfac0e1
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 11 deletions.
diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py
@@ -709,6 +709,9 @@ def closest_intervals(
         [left_dists, right_dists, np.zeros(overlap_ids.shape[0])]
     )
 
+    if len(closest_ids)==0:
+        return np.empty((0,2), dtype=int)
+
     # Sort by distance to set 1 intervals and, if present, by the tie-breaking
     # data array.
     if tie_arr is None:

diff --git a/bioframe/ops.py b/bioframe/ops.py
@@ -1066,14 +1066,25 @@ def _closest_intidxs(
             direction=direction_arr,
         )
 
-        # Convert local per-chromosome indices into the
-        # indices of the original table.
-        closest_idxs_group = np.vstack(
-            [
-                df1_group_idxs.values[closest_idxs_group[:, 0]],
-                df2_group_idxs.values[closest_idxs_group[:, 1]],
-            ]
-        ).T
+        na_idxs = np.isin(np.arange(len(df1_group_idxs)), closest_idxs_group[:, 0], invert=True)
+
+        # 1) Convert local per-chromosome indices into the
+        # indices of the original table,
+        # 2) Fill in the intervals that do not have closest values.
+        closest_idxs_group = np.concatenate([
+            np.vstack(
+                [
+                    df1_group_idxs.values[closest_idxs_group[:, 0]],
+                    df2_group_idxs.values[closest_idxs_group[:, 1]],
+                ]
+            ).T,
+            np.vstack(
+                [
+                    df1_group_idxs.values[na_idxs],
+                    -1 * np.ones_like(df1_group_idxs.values[na_idxs]),
+                ]
+            ).T
+        ])
 
         closest_intidxs.append(closest_idxs_group)
 
@@ -1220,9 +1231,6 @@ def closest(
     )
     na_mask = closest_df_idxs[:, 1] == -1
 
-    if len(closest_df_idxs) == 0:
-        return  # case of no closest intervals
-
     # Generate output tables.
     df_index_1 = None
     df_index_2 = None