From ef9e8d8b9d586fb5a3aa2c1b8c6bdd4db34b477c Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Wed, 11 Sep 2024 18:53:44 -0400
Subject: [PATCH 1/5] Clean up the clustering code
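With the scaling parameters removed, DBSCAN operates directly on pixel
coordinates, so ``eps`` is a distance in pixels (hence the new default of
20.0 rather than 0.03). A minimal sketch of the cluster-then-keep-one-
representative behavior, using plain scikit-learn and made-up coordinates
(the real logic lives in DBSCANFilter.keep_indices):

```python
import numpy as np
from sklearn.cluster import DBSCAN

# Three made-up starting positions: the first two are ~2 pixels apart,
# the third is far away, so eps=5.0 (pixels) yields two clusters.
x = np.array([10.0, 11.5, 40.0])
y = np.array([11.0, 12.0, 5.0])
coords = np.array([x, y]).T

labels = DBSCAN(eps=5.0, min_samples=1, n_jobs=-1).fit(coords).labels_

# Keep a single representative (the first index) per cluster, as the
# kbmod filters do.
keep = [int(np.where(labels == k)[0][0]) for k in np.unique(labels)]
print(keep)  # [0, 2]
```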
---
benchmarks/bench_filter_cluster.py | 123 ------------------------
src/kbmod/configuration.py | 2 +-
src/kbmod/filters/clustering_filters.py | 117 +++++-----------------
src/kbmod/run_search.py | 21 ----
tests/test_clustering_filters.py | 98 ++-----------------
tests/test_regression_test.py | 2 +-
6 files changed, 35 insertions(+), 328 deletions(-)
delete mode 100644 benchmarks/bench_filter_cluster.py
diff --git a/benchmarks/bench_filter_cluster.py b/benchmarks/bench_filter_cluster.py
deleted file mode 100644
index 2d0a6fe09..000000000
--- a/benchmarks/bench_filter_cluster.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import timeit
-import numpy as np
-
-from kbmod.filters.clustering_filters import *
-from kbmod.result_list import ResultList, ResultRow
-from kbmod.search import *
-
-
-def _make_data(objs, times):
- """Create a ResultList for the given objects.
-
- Parameters
- ----------
- obj : list of lists
- A list where each element specifies a Trajectory
- as [x, y, xv, yv].
-
- Returns
- -------
- ResultList
- """
- num_times = len(times)
- rs = ResultList(times, track_filtered=True)
- for x in objs:
- t = Trajectory()
- t.x = x[0]
- t.y = x[1]
- t.vx = x[2]
- t.vy = x[3]
- t.lh = 100.0
-
- row = ResultRow(t, num_times)
- rs.append_result(row)
- return rs
-
-
-def run_index_benchmark(filter, rs):
- tmr = timeit.Timer(stmt="filter.keep_indices(rs)", globals=locals())
- res_time = np.mean(tmr.repeat(repeat=10, number=20))
- return res_time
-
-
-def run_all_benchmarks():
- times = [(10.0 + 0.1 * float(i)) for i in range(20)]
- rs1 = _make_data(
- [
- [10, 11, 1, 2],
- [10, 11, 1000, -1000],
- [10, 11, 0.0, 0.0],
- [25, 24, 1.0, 1.0],
- [25, 26, 10.0, 10.0],
- [10, 12, 5, 5],
- ],
- times,
- )
-
- print("Method | Time")
- print("-" * 40)
-
- f1 = DBSCANFilter("position", 0.025, 100, 100, [0, 50], [0, 1.5], times)
-
- res_time = run_index_benchmark(f1, rs1)
- print(f"position - 2 clusters | {res_time:10.7f}")
-
- f2 = DBSCANFilter("position", 0.025, 100, 100, [0, 50], [0, 1.5], times)
-
- res_time = run_index_benchmark(f2, rs1)
- print(f"position - 4 clusters | {res_time:10.7f}")
-
- f3 = DBSCANFilter("position", 0.025, 1000, 1000, [0, 50], [0, 1.5], times)
-
- res_time = run_index_benchmark(f3, rs1)
- print(f"position - 1 cluster | {res_time:10.7f}")
-
- rs2 = _make_data(
- [
- [10, 11, 1, 2],
- [10, 11, 1000, -1000],
- [10, 11, 1.0, 2.1],
- [55, 54, 1.0, 1.0],
- [55, 56, 10.0, 10.0],
- [10, 12, 4.1, 8],
- ],
- times,
- )
-
- f4 = DBSCANFilter("all", 0.025, 100, 100, [0, 100], [0, 1.5], times)
-
- res_time = run_index_benchmark(f4, rs2)
- print(f"all - 5 clusters | {res_time:10.7f}")
-
- # Larger eps is 3 clusters.
- f5 = DBSCANFilter("all", 0.25, 100, 100, [0, 100], [0, 1.5], times)
-
- res_time = run_index_benchmark(f5, rs2)
- print(f"all - 3 clusters | {res_time:10.7f}")
-
- # Larger scale is 3 clusters.
- f6 = DBSCANFilter("all", 0.025, 100, 100, [0, 5000], [0, 1.5], times)
-
- res_time = run_index_benchmark(f6, rs2)
- print(f"all (larger) - 3 clusters | {res_time:10.7f}")
-
- rs3 = _make_data(
- [
- [10, 11, 1, 2],
- [10, 11, 2, 5],
- [10, 11, 1.01, 1.99],
- [21, 23, 1, 2],
- [21, 23, -10, -10],
- [5, 10, 6, 1],
- [5, 10, 1, 2],
- ],
- times,
- )
-
- f7 = DBSCANFilter("mid_position", 0.1, 20, 20, [0, 100], [0, 1.5], times)
- res_time = run_index_benchmark(f7, rs3)
- print(f"mid_position | {res_time:10.7f}")
-
-
-if __name__ == "__main__":
- run_all_benchmarks()
diff --git a/src/kbmod/configuration.py b/src/kbmod/configuration.py
index e5755eccc..cecf9546a 100644
--- a/src/kbmod/configuration.py
+++ b/src/kbmod/configuration.py
@@ -30,7 +30,7 @@ def __init__(self):
"do_clustering": True,
"do_mask": True,
"do_stamp_filter": True,
- "eps": 0.03,
+ "eps": 20.0,
"encode_num_bytes": -1,
"generator_config": None,
"gpu_filter": False,
diff --git a/src/kbmod/filters/clustering_filters.py b/src/kbmod/filters/clustering_filters.py
index 450c34f00..2412a536b 100644
--- a/src/kbmod/filters/clustering_filters.py
+++ b/src/kbmod/filters/clustering_filters.py
@@ -82,7 +82,7 @@ def keep_indices(self, result_data):
class ClusterPredictionFilter(DBSCANFilter):
"""Cluster the candidates using their positions at specific times."""
- def __init__(self, eps, pred_times=[0.0], height=1.0, width=1.0, scaled=True, **kwargs):
+ def __init__(self, eps, pred_times=[0.0], **kwargs):
"""Create a DBSCANFilter.
Parameters
@@ -92,27 +92,8 @@ def __init__(self, eps, pred_times=[0.0], height=1.0, width=1.0, scaled=True, **
pred_times : `list`
The times at which to predict the positions.
Default = [0.0] (starting position only)
- height : `int`
- The size of the image height (in pixels) for scaling.
- Default: 1 (no scaling)
- width : `int`
- The size of the image width (in pixels) for scaling.
- Default: 1 (no scaling)
- scaled : `bool`
- Scale the positions to [0, 1] based on ``width`` and ``height``. This impacts
- how ``eps`` is interpreted by DBSCAN. If scaling is turned on ``eps``
- approximates the percentage of each dimension between points. If scaling is
- turned off ``eps`` is a distance in pixels.
"""
super().__init__(eps, **kwargs)
- if scaled:
- if height <= 0.0 or width <= 0:
- raise ValueError(f"Invalid scaling parameters y={height} by x={width}")
- self.height = height
- self.width = width
- else:
- self.height = 1.0
- self.width = 1.0
# Confirm we have at least one prediction time.
if len(pred_times) == 0:
@@ -120,7 +101,7 @@ def __init__(self, eps, pred_times=[0.0], height=1.0, width=1.0, scaled=True, **
self.times = pred_times
# Set up the clustering algorithm's name.
- self.cluster_type = f"position (scaled={scaled}) t={self.times}"
+ self.cluster_type = f"position t={self.times}"
def _build_clustering_data(self, result_data):
"""Build the specific data set for this clustering approach.
@@ -145,49 +126,23 @@ def _build_clustering_data(self, result_data):
- # the division by height and width will be no-ops.
+ # Compute the predicted x and y positions at each prediction time.
coords = []
for t in self.times:
- coords.append((x_arr + t * vx_arr) / self.width)
- coords.append((y_arr + t * vy_arr) / self.height)
+ coords.append(x_arr + t * vx_arr)
+ coords.append(y_arr + t * vy_arr)
return np.array(coords)
-class ClusterPosAngVelFilter(DBSCANFilter):
- """Cluster the candidates using their scaled starting position, trajectory
- angles, and trajectory velocity magnitude.
- """
+class ClusterPosVelFilter(DBSCANFilter):
+ """Cluster the candidates using their starting position and velocities."""
- def __init__(self, eps, height, width, vel_lims, ang_lims, **kwargs):
+ def __init__(self, eps, **kwargs):
"""Create a DBSCANFilter.
Parameters
----------
eps : `float`
- The clustering threshold.
- height : `int`
- The size of the image height (in pixels) for scaling.
- width : `int`
- The size of the image width (in pixels) for scaling.
- vel_lims : `list`
- The velocity limits of the search such that v_lim[1] - v_lim[0]
- is the range of velocities searched. Used for scaling.
- ang_lims : `list`
- The angle limits of the search such that ang_lim[1] - ang_lim[0]
- is the range of velocities searched.
+ The clustering threshold in pixels.
"""
super().__init__(eps, **kwargs)
- if height <= 0.0 or width <= 0:
- raise ValueError(f"Invalid scaling parameters y={height} by x={width}")
- if len(vel_lims) < 2:
- raise ValueError(f"Invalid velocity magnitude scaling parameters {vel_lims}")
- if len(ang_lims) < 2:
- raise ValueError(f"Invalid velocity angle scaling parameters {ang_lims}")
- self.height = height
- self.width = width
-
- self.v_scale = (vel_lims[1] - vel_lims[0]) if vel_lims[1] != vel_lims[0] else 1.0
- self.a_scale = (ang_lims[1] - ang_lims[0]) if ang_lims[1] != ang_lims[0] else 1.0
- self.min_v = vel_lims[0]
- self.min_a = ang_lims[0]
-
self.cluster_type = "all"
def _build_clustering_data(self, result_data):
@@ -209,17 +164,7 @@ def _build_clustering_data(self, result_data):
y_arr = np.array(result_data["y"])
vx_arr = np.array(result_data["vx"])
vy_arr = np.array(result_data["vy"])
- vel_arr = np.sqrt(np.square(vx_arr) + np.square(vy_arr))
- ang_arr = np.arctan2(vy_arr, vx_arr)
-
- # Scale the values. We always do this because distances and velocities
- # are not directly comparable.
- scaled_x = x_arr / self.width
- scaled_y = y_arr / self.height
- scaled_vel = (vel_arr - self.min_v) / self.v_scale
- scaled_ang = (ang_arr - self.min_a) / self.a_scale
-
- return np.array([scaled_x, scaled_y, scaled_vel, scaled_ang])
+ return np.array([x_arr, y_arr, vx_arr, vy_arr])
def apply_clustering(result_data, cluster_params):
@@ -232,7 +177,7 @@ def apply_clustering(result_data, cluster_params):
the filtering.
cluster_params : dict
Contains values concerning the image and search settings including:
- cluster_type, eps, height, width, scaled, vel_lims, ang_lims, and times.
+ cluster_type, eps, and times.
Raises
------
@@ -240,45 +185,31 @@ def apply_clustering(result_data, cluster_params):
Raises a TypeError if ``result_data`` is of an unsupported type.
"""
if "cluster_type" not in cluster_params:
- raise ValueError("Missing cluster_type parameter")
+ raise KeyError("Missing cluster_type parameter")
cluster_type = cluster_params["cluster_type"]
+ if "eps" not in cluster_params:
+ raise KeyError("Missing eps parameter")
+ eps = cluster_params["eps"]
+
# Skip clustering if there is nothing to cluster.
if len(result_data) == 0:
logger.info("Clustering : skipping, no results.")
return
- # Get the times used for prediction clustering.
- all_times = np.sort(cluster_params["times"])
- zeroed_times = np.array(all_times) - all_times[0]
-
# Do the clustering and the filtering.
if cluster_type == "all" or cluster_type == "pos_vel":
- filt = ClusterPosAngVelFilter(**cluster_params)
+ filt = ClusterPosVelFilter(eps)
elif cluster_type == "position" or cluster_type == "start_position":
- cluster_params["pred_times"] = [0.0]
- cluster_params["scaled"] = True
- filt = ClusterPredictionFilter(**cluster_params)
- elif cluster_type == "position_unscaled" or cluster_type == "start_position_unscaled":
- cluster_params["pred_times"] = [0.0]
- cluster_params["scaled"] = False
- filt = ClusterPredictionFilter(**cluster_params)
+ filt = ClusterPredictionFilter(eps, pred_times=[0.0])
elif cluster_type == "mid_position":
- cluster_params["pred_times"] = [np.median(zeroed_times)]
- cluster_params["scaled"] = True
- filt = ClusterPredictionFilter(**cluster_params)
- elif cluster_type == "mid_position_unscaled":
- cluster_params["pred_times"] = [np.median(zeroed_times)]
- cluster_params["scaled"] = False
- filt = ClusterPredictionFilter(**cluster_params)
- elif cluster_type == "start_end_position":
- cluster_params["pred_times"] = [0.0, zeroed_times[-1]]
- filt = ClusterPredictionFilter(**cluster_params)
- cluster_params["scaled"] = True
- elif cluster_type == "start_end_position_unscaled":
- cluster_params["pred_times"] = [0.0, zeroed_times[-1]]
- filt = ClusterPredictionFilter(**cluster_params)
- cluster_params["scaled"] = False
+ if not "times" in cluster_params:
+ raise KeyError("Missing cluster_type parameter")
+ all_times = np.sort(cluster_params["times"])
+ zeroed_times = np.array(all_times) - all_times[0]
+ median_time = np.median(zeroed_times)
+
+ filt = ClusterPredictionFilter(eps, pred_times=[median_time])
else:
raise ValueError(f"Unknown clustering type: {cluster_type}")
logger.info(f"Clustering {len(result_data)} results using {filt.get_filter_name()}")
diff --git a/src/kbmod/run_search.py b/src/kbmod/run_search.py
index 31478eb62..18678ed6e 100644
--- a/src/kbmod/run_search.py
+++ b/src/kbmod/run_search.py
@@ -26,23 +26,6 @@ class SearchRunner:
def __init__(self):
pass
- def get_angle_limits(self, config):
- """Compute the angle limits based on the configuration information.
-
- Parameters
- ----------
- config : `SearchConfiguration`
- The configuration parameters
-
- Returns
- -------
- res : `list`
- A list with the minimum and maximum angle to search (in pixel space).
- """
- ang_min = config["average_angle"] - config["ang_arr"][0]
- ang_max = config["average_angle"] + config["ang_arr"][1]
- return [ang_min, ang_max]
-
def load_and_filter_results(self, search, config):
"""This function loads results that are output by the gpu grid search.
Results are loaded in chunks and evaluated to see if the minimum
@@ -249,13 +232,9 @@ def run_search(self, config, stack, trj_generator=None):
cluster_timer = kb.DebugTimer("clustering", logger)
mjds = [stack.get_obstime(t) for t in range(stack.img_count())]
cluster_params = {
- "ang_lims": self.get_angle_limits(config),
"cluster_type": config["cluster_type"],
"eps": config["eps"],
"times": np.array(mjds),
- "vel_lims": config["v_arr"],
- "width": stack.get_width(),
- "height": stack.get_height(),
}
apply_clustering(keep, cluster_params)
cluster_timer.stop()
diff --git a/tests/test_clustering_filters.py b/tests/test_clustering_filters.py
index 2a77efa29..96a6762fd 100644
--- a/tests/test_clustering_filters.py
+++ b/tests/test_clustering_filters.py
@@ -40,29 +40,18 @@ def test_dbscan_position_results(self):
)
# Standard-ish params collapse to 2 clusters.
- f1 = ClusterPredictionFilter(eps=0.025, pred_times=[0.0], height=100, width=100)
+ f1 = ClusterPredictionFilter(eps=5.0, pred_times=[0.0])
self.assertEqual(f1.keep_indices(rs), [0, 3])
# Small eps is 4 clusters.
- f2 = ClusterPredictionFilter(eps=0.000015, pred_times=[0.0], height=100, width=100)
+ f2 = ClusterPredictionFilter(eps=0.000015, pred_times=[0.0])
self.assertEqual(f2.keep_indices(rs), [0, 3, 4, 5])
- # Large scale means 1 cluster
+ # Large eps means 1 cluster
- f3 = ClusterPredictionFilter(eps=0.025, pred_times=[0.0], height=1000, width=1000)
+ f3 = ClusterPredictionFilter(eps=5000.0, pred_times=[0.0])
self.assertEqual(f3.keep_indices(rs), [0])
- # If we do not scale everything at the same starting point is marked its own cluster at low eps
- # (0.025 pixels max distance).
- f4 = ClusterPredictionFilter(eps=0.025, pred_times=[0.0], height=100, width=100, scaled=False)
- self.assertEqual(f4.keep_indices(rs), [0, 3, 4, 5])
-
- # We regain the clustering power with a higher eps (3 pixels max distance).
- f5 = ClusterPredictionFilter(eps=3.0, pred_times=[0.0], height=100, width=100, scaled=False)
- self.assertEqual(f5.keep_indices(rs), [0, 3])
-
# Catch invalid parameters
- self.assertRaises(ValueError, ClusterPredictionFilter, eps=0.025, width=0)
- self.assertRaises(ValueError, ClusterPredictionFilter, eps=0.025, height=0)
self.assertRaises(ValueError, ClusterPredictionFilter, eps=0.025, pred_times=[])
def test_dbscan_all_results(self):
@@ -79,37 +68,13 @@ def test_dbscan_all_results(self):
)
# Start with 5 clusters
- f1 = ClusterPosAngVelFilter(eps=0.025, height=100, width=100, vel_lims=[0, 100], ang_lims=[0, 1.5])
+ f1 = ClusterPosVelFilter(eps=5.0)
self.assertEqual(f1.keep_indices(rs), [0, 1, 3, 4, 5])
# Larger eps is 3 clusters.
- f2 = ClusterPosAngVelFilter(eps=0.25, height=100, width=100, vel_lims=[0, 100], ang_lims=[0, 1.5])
+ f2 = ClusterPosVelFilter(eps=20.0)
self.assertEqual(f2.keep_indices(rs), [0, 1, 3])
- # Larger scale is 3 clusters.
- f3 = ClusterPosAngVelFilter(eps=0.025, height=100, width=100, vel_lims=[0, 5000], ang_lims=[0, 1.5])
- self.assertEqual(f3.keep_indices(rs), [0, 1, 3])
-
- # Catch invalid parameters
- self.assertRaises(
- ValueError,
- ClusterPosAngVelFilter,
- eps=0.025,
- height=100,
- width=100,
- vel_lims=[100],
- ang_lims=[0, 1.5],
- )
- self.assertRaises(
- ValueError,
- ClusterPosAngVelFilter,
- eps=0.025,
- height=100,
- width=100,
- vel_lims=[0, 5000],
- ang_lims=[1.5],
- )
-
def test_dbscan_mid_pos(self):
rs = self._make_result_data(
[
@@ -124,50 +89,22 @@ def test_dbscan_mid_pos(self):
)
# Use the median time.
- f1 = ClusterPredictionFilter(eps=0.1, pred_times=[0.95], height=20, width=20)
+ f1 = ClusterPredictionFilter(eps=2.0, pred_times=[0.95])
self.assertEqual(f1.keep_indices(rs), [0, 1, 3, 6])
# Use different times.
- f2 = ClusterPredictionFilter(eps=0.1, pred_times=[10.0], height=20, width=20)
+ f2 = ClusterPredictionFilter(eps=2.0, pred_times=[10.0])
self.assertEqual(f2.keep_indices(rs), [0, 1, 3, 4, 5, 6])
# Use different times again.
- f3 = ClusterPredictionFilter(eps=0.1, pred_times=[0.001], height=20, width=20)
+ f3 = ClusterPredictionFilter(eps=2.0, pred_times=[0.001])
self.assertEqual(f3.keep_indices(rs), [0, 3, 5])
- # Try without scaling
- f4 = ClusterPredictionFilter(eps=0.1, pred_times=[10.95], height=20, width=20, scaled=False)
- self.assertEqual(f4.keep_indices(rs), [0, 1, 2, 3, 4, 5, 6])
-
- def test_dbscan_start_end_pos(self):
- rs = self._make_result_data(
- [
- [10, 11, 1, 2],
- [10, 11, 2, 5],
- [10, 11, 1.01, 1.99],
- [10, 11, 0.99, 2.01],
- [21, 23, 1, 2],
- [21, 23, -10, -10],
- [21, 23, -10, -10.01],
- [21, 23, -10.01, -10],
- [5, 10, 1, 2.1],
- [5, 10, 1, 2],
- [5, 10, 1, 1.9],
- ]
- )
-
- f1 = ClusterPredictionFilter(eps=3.0, pred_times=[10, 11.9], scaled=False)
- self.assertEqual(f1.keep_indices(rs), [0, 1, 4, 5, 8])
-
def test_clustering_results(self):
cluster_params = {
- "ang_lims": [0.0, 1.5],
"cluster_type": "all",
- "eps": 0.03,
+ "eps": 5.0,
"times": self.times,
- "width": 100,
- "height": 100,
- "vel_lims": [5.0, 40.0],
}
results = self._make_result_data(
@@ -196,27 +133,10 @@ def test_clustering_results(self):
apply_clustering(results2, cluster_params)
self.assertEqual(len(results2), 3)
- # Try clustering with start and end positions
- results3 = self._make_result_data(
- [
- [10, 11, 1, 2],
- [10, 11, 10, 20],
- [40, 5, -1, 2],
- [5, 0, 1, 2],
- [5, 1, 1, 2],
- ]
- )
- cluster_params["cluster_type"] = "start_end_position"
- apply_clustering(results3, cluster_params)
- self.assertEqual(len(results3), 4)
-
# Try invalid or missing cluster_type.
cluster_params["cluster_type"] = "invalid"
self.assertRaises(ValueError, apply_clustering, results2, cluster_params)
- del cluster_params["cluster_type"]
- self.assertRaises(ValueError, apply_clustering, results2, cluster_params)
-
if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_regression_test.py b/tests/test_regression_test.py
index e8ac76de1..47981fe79 100644
--- a/tests/test_regression_test.py
+++ b/tests/test_regression_test.py
@@ -271,7 +271,7 @@ def perform_search(im_stack, res_filename, default_psf):
"peak_offset": [3.0, 3.0],
"chunk_size": 1000000,
"stamp_type": "cpp_median",
- "eps": 0.03,
+ "eps": 20.0,
"gpu_filter": True,
"clip_negative": True,
"x_pixel_buffer": 10,
From 394c1b0887fc746475800958508db84deb8c1a25 Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Wed, 11 Sep 2024 19:15:03 -0400
Subject: [PATCH 2/5] Update documentation
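For reference, a small worked example of the ``mid_position`` encoding
described in the updated docs (all numbers are made up): each trajectory
is reduced to its predicted pixel position at the median zero-shifted
observation time.

```python
import numpy as np

# A trajectory (x0, y0, vx, vy) becomes its predicted pixel position
# at the median zero-shifted time tm.
times = np.array([57130.0, 57131.0, 57132.0])  # placeholder MJDs
tm = np.median(times - times[0])               # -> 1.0

x0, y0, vx, vy = 10.0, 11.0, 1.0, 2.0
point = (x0 + tm * vx, y0 + tm * vy)
print(point)  # (11.0, 13.0)
```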
---
docs/source/user_manual/results_filtering.rst | 16 +++++-----------
notebooks/region_search/discrete_piles_e2e.ipynb | 2 +-
2 files changed, 6 insertions(+), 12 deletions(-)
diff --git a/docs/source/user_manual/results_filtering.rst b/docs/source/user_manual/results_filtering.rst
index b4ff2bba0..c20befea1 100644
--- a/docs/source/user_manual/results_filtering.rst
+++ b/docs/source/user_manual/results_filtering.rst
@@ -54,19 +54,13 @@ In the extreme case imagine a bright object centered at (10, 15) and moving at (
But how do we tell which trajectories are "close"? If we only look at the pixel locations at a given time point (e.g. t=0), we might combine two trajectories with very different velocities that happen to pass near the same pixel at that time. Even if this is not likely for real objects, we might merge a real object with a noisy false detection.
-The `scikit-learn `_ ``DBSCAN`` algorithm performs clustering the trajectories. The algorithm can cluster the results based on a combination of position, velocity, and angle as specified by the parameter cluster_type, which can take on the values of:
+The `scikit-learn `_ ``DBSCAN`` algorithm clusters the trajectories. The algorithm can cluster the results based on a combination of position and velocity as specified by the parameter ``cluster_type``, which can take on the values of:
-* ``all`` - Use scaled x position, scaled y position, scaled velocity, and scaled angle as coordinates for clustering.
-* ``position`` - Use only the trajectory's scaled (x, y) position at the first timestep for clustering.
-* ``position_unscaled`` - Use only trajctory's (x, y) position at the first timestep for clustering.
-* ``mid_position`` - Use the (scaled) predicted position at the median time as coordinates for clustering.
-* ``mid_position_unscaled`` - Use the predicted position at the median time as coordinates for clustering.
-* ``start_end_position`` - Use the (scaled) predicted positions at the start and end times as coordinates for clustering.
-* ``start_end_position_unscaled`` - Use the predicted positions at the start and end times as coordinates for clustering.
+* ``all`` or ``pos_vel`` - Use the trajectory's (x, y) position at the first timestep and its velocity (vx, vy) for clustering.
+* ``position`` or ``start_position`` - Use only the trajectory's (x, y) position at the first timestep for clustering.
+* ``mid_position`` - Use the predicted position at the median time as coordinates for clustering.
-Most of the clustering approaches rely on predicted positions at different times. For example midpoint-based clustering will encode each trajectory `(x0, y0, xv, yv)` as a 2-dimensional point `(x0 + tm * xv, y0 + tm + yv)` where `tm` is the median time. Thus trajectories only need to be close at time=`tm` to be merged into a single trajectory. In contrast the start and eng based clustering will encode the same trajectory as a 4-dimensional point (x0, y0, x0 + te * xv, y0 + te + yv)` where `te` is the last time. Thus the points will need to be close at both time=0.0 and time=`te` to be merged into a single result.
-
-Each of the positional based clusterings have both a scaled and unscaled version. This impacts how DBSCAN interprets distances. In the scaled version all values are divided by the width of the corresponding dimension to normalize the values. This maps points within the image to [0, 1], so an `eps` value of 0.01 might make sense. In contrast the unscaled versions do not perform normalization. The distances between two trajectories is measured in pixels. Here an `eps` value of 10 (for 10 pixels) might be better.
+The way DBSCAN computes distances between the trajectories depends on the encoding used. For positional encodings, such as ``position`` and ``mid_position``, the distance is measured directly in pixels. The ``all`` encoding behaves somewhat similarly, but since it combines positions (in pixels) and velocities (in pixels per day), its coordinates are not all in the same units.
Relevant clustering parameters include:
diff --git a/notebooks/region_search/discrete_piles_e2e.ipynb b/notebooks/region_search/discrete_piles_e2e.ipynb
index fdc1a0acd..9f71552b0 100644
--- a/notebooks/region_search/discrete_piles_e2e.ipynb
+++ b/notebooks/region_search/discrete_piles_e2e.ipynb
@@ -213,7 +213,7 @@
" \"peak_offset\": [3.0, 3.0],\n",
" \"chunk_size\": 1000000,\n",
" \"stamp_type\": \"cpp_median\",\n",
- " \"eps\": 0.03,\n",
+ " \"eps\": 20.0,\n",
" \"clip_negative\": True,\n",
" \"mask_num_images\": 0,\n",
" \"cluster_type\": \"position\",\n",
From 69cf1050a1dbc5e6ebaf5cb6d2a5d873ccfb0738 Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Thu, 12 Sep 2024 09:11:27 -0400
Subject: [PATCH 3/5] Rename eps and add a scaling parameter
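A sketch of the idea behind ``cluster_v_scale`` (illustrative only; the
``encode`` helper below is not part of kbmod): velocity components are
weighted relative to the pixel positions before DBSCAN runs, so a small
scale makes ``all`` clustering behave like position-only clustering.

```python
import numpy as np
from sklearn.cluster import DBSCAN

# Weight the velocity terms relative to the positions before clustering.
def encode(x, y, vx, vy, v_scale):
    return np.array([x, y, vx * v_scale, vy * v_scale]).T

x = np.array([10.0, 10.0])
y = np.array([11.0, 11.0])
vx = np.array([1.0, 20.0])   # same start, very different velocities
vy = np.array([2.0, -5.0])

# v_scale=1.0: the velocity gap keeps the two results apart ...
print(DBSCAN(eps=5.0, min_samples=1).fit(encode(x, y, vx, vy, 1.0)).labels_)   # [0 1]
# ... while a tiny v_scale makes clustering effectively position-only.
print(DBSCAN(eps=5.0, min_samples=1).fit(encode(x, y, vx, vy, 1e-9)).labels_)  # [0 0]
```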
---
docs/source/user_manual/results_filtering.rst | 8 ++-
docs/source/user_manual/search_params.rst | 21 ++++----
.../region_search/discrete_piles_e2e.ipynb | 2 +-
src/kbmod/configuration.py | 3 +-
src/kbmod/run_search.py | 3 +-
tests/test_clustering_filters.py | 53 +++++++++++++++----
tests/test_regression_test.py | 2 +-
7 files changed, 66 insertions(+), 26 deletions(-)
diff --git a/docs/source/user_manual/results_filtering.rst b/docs/source/user_manual/results_filtering.rst
index c20befea1..f8ad39952 100644
--- a/docs/source/user_manual/results_filtering.rst
+++ b/docs/source/user_manual/results_filtering.rst
@@ -59,14 +59,18 @@ The `scikit-learn `_ ``DBSCAN`` algorithm perf
* ``all`` or ``pos_vel`` - Use the trajectory's (x, y) position at the first timestep and its velocity (vx, vy) for clustering.
* ``position`` or ``start_position`` - Use only the trajectory's (x, y) position at the first timestep for clustering.
* ``mid_position`` - Use the predicted position at the median time as coordinates for clustering.
+* ``start_end_position`` - Use the predicted positions at the start and end times as coordinates for clustering.
-The way DBSCAN computes distances between the trajectories depends on the encoding used. For positional encodings, such as ``position`` and ``mid_position``, the distance is measured directly in pixels. The ``all`` encoding behaves somewhat similarly, but since it combines positions (in pixels) and velocities (in pixels per day), its coordinates are not all in the same units.
+Most of the clustering approaches rely on predicted positions at different times. For example midpoint-based clustering will encode each trajectory `(x0, y0, xv, yv)` as a 2-dimensional point `(x0 + tm * xv, y0 + tm * yv)` where `tm` is the median time. Thus trajectories only need to be close at time=`tm` to be merged into a single trajectory. In contrast, the start and end based clustering will encode the same trajectory as a 4-dimensional point `(x0, y0, x0 + te * xv, y0 + te * yv)` where `te` is the last time. Thus the points will need to be close at both time=0.0 and time=`te` to be merged into a single result.
+
+The way DBSCAN computes distances between the trajectories depends on the encoding used. For positional encodings, such as ``position``, ``mid_position``, and ``start_end_position``, the distance is measured directly in pixels. The ``all`` encoding behaves somewhat similarly, but since it combines positions (in pixels) and velocities (in pixels per day), its coordinates are not all in the same units.
Relevant clustering parameters include:
* ``cluster_type`` - The types of predicted values to use when determining which trajectories should be clustered together (if ``do_clustering = True``). Must be one of ``all``, ``position``, ``mid_position``, or ``start_end_position``.
* ``do_clustering`` - Cluster the resulting trajectories to remove duplicates.
-* ``eps`` - The distance threshold used by DBSCAN.
+* ``cluster_eps`` - The distance threshold used by DBSCAN.
+* ``cluster_v_scale`` - The relative scale between velocity differences and positional differences in ``all`` clustering.
See Also
________
diff --git a/docs/source/user_manual/search_params.rst b/docs/source/user_manual/search_params.rst
index df0ad09d4..5559fdffc 100644
--- a/docs/source/user_manual/search_params.rst
+++ b/docs/source/user_manual/search_params.rst
@@ -28,15 +28,20 @@ This document serves to provide a quick overview of the existing parameters and
| | | remove all negative values prior to |
| | | computing the percentiles. |
+------------------------+-----------------------------+----------------------------------------+
+| ``cluster_eps`` | 20.0 | The threshold to use for clustering |
+| | | similar results. |
++------------------------+-----------------------------+----------------------------------------+
| ``cluster_type`` | all | Types of predicted values to use when |
| | | determining trajectories to cluster |
-| | | together, including position, velocity,|
-| | | and angles (if do_clustering = True). |
+| | | together, including position and |
+| | | velocities (if do_clustering = True). |
| | | Options include: ``all``, ``position``,|
-| | | ``position_unscaled``, ``mid_position``|
-| | | ``mid_position_unscaled``, |
-| | | ``start_end_position``, or |
-| | | ``start_end_position_unscaled``. |
+| | | ``mid_position``, and |
+| | | ``start_end_position``. |
++------------------------+-----------------------------+----------------------------------------+
+| ``cluster_v_scale`` | 1.0 | The weight of differences in velocity |
+| | | relative to differences in position |
+| | | during clustering. |
+------------------------+-----------------------------+----------------------------------------+
| ``coadds`` | [] | A list of additional coadds to create. |
| | | These are not used in filtering, but |
@@ -56,10 +61,6 @@ This document serves to provide a quick overview of the existing parameters and
| ``do_stamp_filter`` | True | Apply post-search filtering on the |
| | | image stamps. |
+------------------------+-----------------------------+----------------------------------------+
-| ``eps`` | 0.03 | The epsilon value to use in DBSCAN |
-| | | clustering (if ``cluster_type=DBSCAN`` |
-| | | and ``do_clustering=True``). |
-+------------------------+-----------------------------+----------------------------------------+
| ``encode_num_bytes`` | -1 | The number of bytes to use to encode |
| | | ``psi`` and ``phi`` images on GPU. By |
| | | default a ``float`` encoding is used. |
diff --git a/notebooks/region_search/discrete_piles_e2e.ipynb b/notebooks/region_search/discrete_piles_e2e.ipynb
index 9f71552b0..29dc3f134 100644
--- a/notebooks/region_search/discrete_piles_e2e.ipynb
+++ b/notebooks/region_search/discrete_piles_e2e.ipynb
@@ -213,7 +213,7 @@
" \"peak_offset\": [3.0, 3.0],\n",
" \"chunk_size\": 1000000,\n",
" \"stamp_type\": \"cpp_median\",\n",
- " \"eps\": 20.0,\n",
+ " \"cluster_eps\": 20.0,\n",
" \"clip_negative\": True,\n",
" \"mask_num_images\": 0,\n",
" \"cluster_type\": \"position\",\n",
diff --git a/src/kbmod/configuration.py b/src/kbmod/configuration.py
index cecf9546a..fed0a87de 100644
--- a/src/kbmod/configuration.py
+++ b/src/kbmod/configuration.py
@@ -24,13 +24,14 @@ def __init__(self):
"center_thresh": 0.00,
"chunk_size": 500000,
"clip_negative": False,
+ "cluster_eps": 20.0,
"cluster_type": "all",
+ "cluster_v_scale": 1.0,
"coadds": [],
"debug": False,
"do_clustering": True,
"do_mask": True,
"do_stamp_filter": True,
- "eps": 20.0,
"encode_num_bytes": -1,
"generator_config": None,
"gpu_filter": False,
diff --git a/src/kbmod/run_search.py b/src/kbmod/run_search.py
index 18678ed6e..9f262c5ca 100644
--- a/src/kbmod/run_search.py
+++ b/src/kbmod/run_search.py
@@ -233,7 +233,8 @@ def run_search(self, config, stack, trj_generator=None):
mjds = [stack.get_obstime(t) for t in range(stack.img_count())]
cluster_params = {
"cluster_type": config["cluster_type"],
- "eps": config["eps"],
+ "cluster_eps": config["cluster_eps"],
+ "cluster_v_scale": config["cluster_v_scale"],
"times": np.array(mjds),
}
apply_clustering(keep, cluster_params)
diff --git a/tests/test_clustering_filters.py b/tests/test_clustering_filters.py
index 96a6762fd..dfdce7ae1 100644
--- a/tests/test_clustering_filters.py
+++ b/tests/test_clustering_filters.py
@@ -40,19 +40,19 @@ def test_dbscan_position_results(self):
)
# Standard-ish params collapse to 2 clusters.
- f1 = ClusterPredictionFilter(eps=5.0, pred_times=[0.0])
+ f1 = ClusterPredictionFilter(cluster_eps=5.0, pred_times=[0.0])
self.assertEqual(f1.keep_indices(rs), [0, 3])
# Small eps is 4 clusters.
- f2 = ClusterPredictionFilter(eps=0.000015, pred_times=[0.0])
+ f2 = ClusterPredictionFilter(cluster_eps=0.000015, pred_times=[0.0])
self.assertEqual(f2.keep_indices(rs), [0, 3, 4, 5])
# Large eps means 1 cluster
- f3 = ClusterPredictionFilter(eps=5000.0, pred_times=[0.0])
+ f3 = ClusterPredictionFilter(cluster_eps=5000.0, pred_times=[0.0])
self.assertEqual(f3.keep_indices(rs), [0])
# Catch invalid parameters
- self.assertRaises(ValueError, ClusterPredictionFilter, eps=0.025, pred_times=[])
+ self.assertRaises(ValueError, ClusterPredictionFilter, cluster_eps=0.025, pred_times=[])
def test_dbscan_all_results(self):
"""Test clustering based on the median position of the trajectory."""
@@ -68,13 +68,20 @@ def test_dbscan_all_results(self):
)
# Start with 5 clusters
- f1 = ClusterPosVelFilter(eps=5.0)
+ f1 = ClusterPosVelFilter(cluster_eps=5.0)
self.assertEqual(f1.keep_indices(rs), [0, 1, 3, 4, 5])
# Larger eps is 3 clusters.
- f2 = ClusterPosVelFilter(eps=20.0)
+ f2 = ClusterPosVelFilter(cluster_eps=20.0)
self.assertEqual(f2.keep_indices(rs), [0, 1, 3])
+ # Adding the scaling factor increases or reduces the impact of velocity.
+ f3 = ClusterPosVelFilter(cluster_eps=5.0, cluster_v_scale=5.0)
+ self.assertEqual(f3.keep_indices(rs), [0, 1, 3, 4, 5])
+
+ f4 = ClusterPosVelFilter(cluster_eps=5.0, cluster_v_scale=1e-9)
+ self.assertEqual(f4.keep_indices(rs), [0, 3])
+
def test_dbscan_mid_pos(self):
rs = self._make_result_data(
[
@@ -89,21 +96,42 @@ def test_dbscan_mid_pos(self):
)
# Use the median time.
- f1 = ClusterPredictionFilter(eps=2.0, pred_times=[0.95])
+ f1 = ClusterPredictionFilter(cluster_eps=2.0, pred_times=[0.95])
self.assertEqual(f1.keep_indices(rs), [0, 1, 3, 6])
# Use different times.
- f2 = ClusterPredictionFilter(eps=2.0, pred_times=[10.0])
+ f2 = ClusterPredictionFilter(cluster_eps=2.0, pred_times=[10.0])
self.assertEqual(f2.keep_indices(rs), [0, 1, 3, 4, 5, 6])
# Use different times again.
- f3 = ClusterPredictionFilter(eps=2.0, pred_times=[0.001])
+ f3 = ClusterPredictionFilter(cluster_eps=2.0, pred_times=[0.001])
self.assertEqual(f3.keep_indices(rs), [0, 3, 5])
+ def test_dbscan_start_end_pos(self):
+ rs = self._make_result_data(
+ [
+ [10, 11, 1, 2],
+ [10, 11, 2, 5],
+ [10, 11, 1.01, 1.99],
+ [10, 11, 0.99, 2.01],
+ [21, 23, 1, 2],
+ [21, 23, -10, -10],
+ [21, 23, -10, -10.01],
+ [21, 23, -10.01, -10],
+ [5, 10, 1, 2.1],
+ [5, 10, 1, 2],
+ [5, 10, 1, 1.9],
+ ]
+ )
+
+ f1 = ClusterPredictionFilter(cluster_eps=3.0, pred_times=[10, 11.9])
+ self.assertEqual(f1.keep_indices(rs), [0, 1, 4, 5, 8])
+
def test_clustering_results(self):
cluster_params = {
"cluster_type": "all",
- "eps": 5.0,
+ "cluster_eps": 5.0,
+ "cluster_v_scale": 1.0,
"times": self.times,
}
@@ -119,6 +147,11 @@ def test_clustering_results(self):
apply_clustering(results, cluster_params)
self.assertEqual(len(results), 4)
+ # If we remove the weighting on velocity, then we drop to three clusters.
+ cluster_params["cluster_v_scale"] = 1e-16
+ apply_clustering(results, cluster_params)
+ self.assertEqual(len(results), 3)
+
# Try clustering with only positions.
results2 = self._make_result_data(
[
diff --git a/tests/test_regression_test.py b/tests/test_regression_test.py
index 47981fe79..521a54e8c 100644
--- a/tests/test_regression_test.py
+++ b/tests/test_regression_test.py
@@ -271,7 +271,7 @@ def perform_search(im_stack, res_filename, default_psf):
"peak_offset": [3.0, 3.0],
"chunk_size": 1000000,
"stamp_type": "cpp_median",
- "eps": 20.0,
+ "cluster_eps": 20.0,
"gpu_filter": True,
"clip_negative": True,
"x_pixel_buffer": 10,
From 61a654798404c821546f1c4a09d30f4bbe77af51 Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Thu, 12 Sep 2024 09:18:48 -0400
Subject: [PATCH 4/5] Rename eps to cluster_eps
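The rename also lets ``apply_clustering`` forward the whole parameter
dictionary with ``**cluster_params``. A trimmed-down, self-contained sketch
of that forwarding pattern (class bodies shortened; names mirror the patch
but this is not the library code):

```python
import numpy as np

# Unused keys such as cluster_type fall into **kwargs and are ignored,
# so each filter only reads the parameters it needs.
class DBSCANFilter:
    def __init__(self, cluster_eps, **kwargs):
        self.cluster_eps = cluster_eps

class ClusterPredictionFilter(DBSCANFilter):
    def __init__(self, cluster_eps, pred_times=[0.0], **kwargs):
        super().__init__(cluster_eps, **kwargs)
        self.times = pred_times

cluster_params = {
    "cluster_type": "mid_position",
    "cluster_eps": 20.0,
    "times": np.array([0.0, 0.1, 0.2]),  # placeholder zero-based times
}
zeroed = np.sort(cluster_params["times"]) - np.min(cluster_params["times"])
cluster_params["pred_times"] = [np.median(zeroed)]

filt = ClusterPredictionFilter(**cluster_params)
print(filt.cluster_eps, filt.times)  # 20.0 [0.1]
```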
---
src/kbmod/filters/clustering_filters.py | 62 ++++++++++++++-----------
1 file changed, 35 insertions(+), 27 deletions(-)
diff --git a/src/kbmod/filters/clustering_filters.py b/src/kbmod/filters/clustering_filters.py
index 2412a536b..cc6f5360e 100644
--- a/src/kbmod/filters/clustering_filters.py
+++ b/src/kbmod/filters/clustering_filters.py
@@ -11,17 +11,17 @@ class DBSCANFilter:
"""Cluster the candidates using DBSCAN and only keep a
single representative trajectory from each cluster."""
- def __init__(self, eps, **kwargs):
+ def __init__(self, cluster_eps, **kwargs):
"""Create a DBSCANFilter.
Parameters
----------
- eps : `float`
+ cluster_eps : `float`
The clustering threshold.
"""
- self.eps = eps
+ self.cluster_eps = cluster_eps
self.cluster_type = ""
- self.cluster_args = dict(eps=self.eps, min_samples=1, n_jobs=-1)
+ self.cluster_args = dict(eps=self.cluster_eps, min_samples=1, n_jobs=-1)
def get_filter_name(self):
"""Get the name of the filter.
@@ -31,7 +31,7 @@ def get_filter_name(self):
str
The filter name.
"""
- return f"DBSCAN_{self.cluster_type} eps={self.eps}"
+ return f"DBSCAN_{self.cluster_type} eps={self.cluster_eps}"
def _build_clustering_data(self, result_data):
"""Build the specific data set for this clustering approach.
@@ -82,18 +82,18 @@ def keep_indices(self, result_data):
class ClusterPredictionFilter(DBSCANFilter):
"""Cluster the candidates using their positions at specific times."""
- def __init__(self, eps, pred_times=[0.0], **kwargs):
+ def __init__(self, cluster_eps, pred_times=[0.0], **kwargs):
"""Create a DBSCANFilter.
Parameters
----------
- eps : `float`
+ cluster_eps : `float`
The clustering threshold.
pred_times : `list`
The times at which to predict the positions.
Default = [0.0] (starting position only)
"""
- super().__init__(eps, **kwargs)
+ super().__init__(cluster_eps, **kwargs)
# Confirm we have at least one prediction time.
if len(pred_times) == 0:
@@ -134,15 +134,22 @@ def _build_clustering_data(self, result_data):
class ClusterPosVelFilter(DBSCANFilter):
"""Cluster the candidates using their starting position and velocities."""
- def __init__(self, eps, **kwargs):
+ def __init__(self, cluster_eps, cluster_v_scale=1.0, **kwargs):
"""Create a DBSCANFilter.
Parameters
----------
- eps : `float`
+ cluster_eps : `float`
The clustering threshold in pixels.
+ cluster_v_scale : `float`
+ The relative scaling of velocity differences compared to position
+ differences. Default: 1.0 (no difference).
"""
- super().__init__(eps, **kwargs)
+ super().__init__(cluster_eps, **kwargs)
+ if cluster_v_scale <= 0.0:
+ # Avoid divide by zero.
+ cluster_v_scale = 1e-12
+ self.cluster_v_scale = cluster_v_scale
self.cluster_type = "all"
def _build_clustering_data(self, result_data):
@@ -162,8 +169,8 @@ def _build_clustering_data(self, result_data):
# Create arrays of each the trajectories information.
x_arr = np.array(result_data["x"])
y_arr = np.array(result_data["y"])
- vx_arr = np.array(result_data["vx"])
- vy_arr = np.array(result_data["vy"])
+ vx_arr = np.array(result_data["vx"]) * self.cluster_v_scale
+ vy_arr = np.array(result_data["vy"]) * self.cluster_v_scale
return np.array([x_arr, y_arr, vx_arr, vy_arr])
@@ -177,7 +184,7 @@ def apply_clustering(result_data, cluster_params):
the filtering.
cluster_params : dict
Contains values concerning the image and search settings including:
- cluster_type, eps, and times.
+ cluster_type, cluster_eps, times, and cluster_v_scale (optional).
Raises
------
@@ -188,28 +195,29 @@ def apply_clustering(result_data, cluster_params):
raise KeyError("Missing cluster_type parameter")
cluster_type = cluster_params["cluster_type"]
- if "eps" not in cluster_params:
- raise KeyError("Missing eps parameter")
- eps = cluster_params["eps"]
-
# Skip clustering if there is nothing to cluster.
if len(result_data) == 0:
logger.info("Clustering : skipping, no results.")
return
+ # Get the times used for prediction clustering.
+ if not "times" in cluster_params:
+ raise KeyError("Missing times parameter in the clustering parameters.")
+ all_times = np.sort(cluster_params["times"])
+ zeroed_times = np.array(all_times) - all_times[0]
+
# Do the clustering and the filtering.
if cluster_type == "all" or cluster_type == "pos_vel":
- filt = ClusterPosVelFilter(eps)
+ filt = ClusterPosVelFilter(**cluster_params)
elif cluster_type == "position" or cluster_type == "start_position":
- filt = ClusterPredictionFilter(eps, pred_times=[0.0])
+ cluster_params["pred_times"] = [0.0]
+ filt = ClusterPredictionFilter(**cluster_params)
elif cluster_type == "mid_position":
- if not "times" in cluster_params:
- raise KeyError("Missing cluster_type parameter")
- all_times = np.sort(cluster_params["times"])
- zeroed_times = np.array(all_times) - all_times[0]
- median_time = np.median(zeroed_times)
-
- filt = ClusterPredictionFilter(eps, pred_times=[median_time])
+ cluster_params["pred_times"] = [np.median(zeroed_times)]
+ filt = ClusterPredictionFilter(**cluster_params)
+ elif cluster_type == "start_end_position":
+ cluster_params["pred_times"] = [0.0, zeroed_times[-1]]
+ filt = ClusterPredictionFilter(**cluster_params)
else:
raise ValueError(f"Unknown clustering type: {cluster_type}")
logger.info(f"Clustering {len(result_data)} results using {filt.get_filter_name()}")
From 1e98a1b23825dc1cdc64a10e26d257b32014414f Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Fri, 13 Sep 2024 12:36:32 -0400
Subject: [PATCH 5/5] Update clustering_filters.py
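Why zero is now acceptable: ``cluster_v_scale`` only ever multiplies the
velocity terms (nothing divides by it), so a scale of 0.0 simply reduces
the ``all`` encoding to positions, while a negative scale is rejected.
A tiny illustration with made-up velocities:

```python
import numpy as np

# cluster_v_scale multiplies the velocity columns; 0.0 zeroes them out,
# turning position+velocity clustering into position-only clustering.
vx = np.array([1.0, 20.0])
vy = np.array([2.0, -5.0])
for v_scale in (1.0, 0.0):
    print(v_scale, vx * v_scale, vy * v_scale)
```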
---
src/kbmod/filters/clustering_filters.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/src/kbmod/filters/clustering_filters.py b/src/kbmod/filters/clustering_filters.py
index cc6f5360e..db7015889 100644
--- a/src/kbmod/filters/clustering_filters.py
+++ b/src/kbmod/filters/clustering_filters.py
@@ -146,9 +146,8 @@ def __init__(self, cluster_eps, cluster_v_scale=1.0, **kwargs):
differences. Default: 1.0 (no difference).
"""
super().__init__(cluster_eps, **kwargs)
- if cluster_v_scale <= 0.0:
- # Avoid divide by zero.
- cluster_v_scale = 1e-12
+ if cluster_v_scale < 0.0:
+ raise ValueError("cluster_v_scale cannot be negative.")
self.cluster_v_scale = cluster_v_scale
self.cluster_type = "all"