From bfac0e1dbe13deaeb5bfd99139357994780b9d2e Mon Sep 17 00:00:00 2001
From: agalitsyna <agalitzina@gmail.com>
Date: Tue, 6 Feb 2024 15:02:17 -0500
Subject: [PATCH] Fix of the bug of closest not returning intervals that do not
 have closest hits and are not separate chromosomes.

Problem: If the segment does not have closest (e.g., in ignore_upstream=True setting), it will simply drop out of output table.

Solution: Added checking the indexes of df1 that are absent form the final returned hits

Why it was not observed before: No detiled testing for options ignore_upstream/ignore_downstream/ignore_overlap and their combinations.
If df1 has a segment from a chromosome X and df2 has a segment from it, but they are in an arrangements that won't produce a hit, this segment will be dropped out of df1 output.
---
 bioframe/core/arrops.py |  3 +++
 bioframe/ops.py         | 30 +++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/bioframe/core/arrops.py b/bioframe/core/arrops.py
index dc191f12..c6e33350 100644
--- a/bioframe/core/arrops.py
+++ b/bioframe/core/arrops.py
@@ -709,6 +709,9 @@ def closest_intervals(
         [left_dists, right_dists, np.zeros(overlap_ids.shape[0])]
     )
 
+    if len(closest_ids)==0:
+        return np.empty((0,2), dtype=int)
+
     # Sort by distance to set 1 intervals and, if present, by the tie-breaking
     # data array.
     if tie_arr is None:
diff --git a/bioframe/ops.py b/bioframe/ops.py
index 2f567139..d7725f5f 100644
--- a/bioframe/ops.py
+++ b/bioframe/ops.py
@@ -1066,14 +1066,25 @@ def _closest_intidxs(
             direction=direction_arr,
         )
 
-        # Convert local per-chromosome indices into the
-        # indices of the original table.
-        closest_idxs_group = np.vstack(
-            [
-                df1_group_idxs.values[closest_idxs_group[:, 0]],
-                df2_group_idxs.values[closest_idxs_group[:, 1]],
-            ]
-        ).T
+        na_idxs = np.isin(np.arange(len(df1_group_idxs)), closest_idxs_group[:, 0], invert=True)
+
+        # 1) Convert local per-chromosome indices into the
+        # indices of the original table,
+        # 2) Fill in the intervals that do not have closest values.
+        closest_idxs_group = np.concatenate([
+            np.vstack(
+                [
+                    df1_group_idxs.values[closest_idxs_group[:, 0]],
+                    df2_group_idxs.values[closest_idxs_group[:, 1]],
+                ]
+            ).T,
+            np.vstack(
+                [
+                    df1_group_idxs.values[na_idxs],
+                    -1 * np.ones_like(df1_group_idxs.values[na_idxs]),
+                ]
+            ).T
+        ])
 
         closest_intidxs.append(closest_idxs_group)
 
@@ -1220,9 +1231,6 @@ def closest(
     )
     na_mask = closest_df_idxs[:, 1] == -1
 
-    if len(closest_df_idxs) == 0:
-        return  # case of no closest intervals
-
     # Generate output tables.
     df_index_1 = None
     df_index_2 = None