From b59fc2fe90a9783fb310ac78cb0213383330232e Mon Sep 17 00:00:00 2001 From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:15:31 -0500 Subject: [PATCH] Add a notebook for joining results and known objects --- notebooks/join_known_objects_example.ipynb | 166 +++++++++++++++++++++ src/kbmod/filters/known_object_filters.py | 30 ++-- src/kbmod/results.py | 32 ++++ tests/test_known_object_filters.py | 32 +++- tests/test_results.py | 27 ++++ 5 files changed, 272 insertions(+), 15 deletions(-) create mode 100644 notebooks/join_known_objects_example.ipynb diff --git a/notebooks/join_known_objects_example.ipynb b/notebooks/join_known_objects_example.ipynb new file mode 100644 index 000000000..bd1170beb --- /dev/null +++ b/notebooks/join_known_objects_example.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a89ad362-6ed6-489f-806c-2fd94fc9356d", + "metadata": {}, + "source": [ + "# Example Known Object Labeling\n", + "\n", + "This notebook serves as an example (and usable tool) for labeling objects in the results file as corresponding to a known object. It assumes the user has run KBMOD to produce a results .ecsv and has access to a table of known results.\n", + "\n", + "This notebook uses specific files and parameters from the DEEP reprocessing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b1d47da-db6f-4530-93a2-9cb83a6af145", + "metadata": {}, + "outputs": [], + "source": [ + "from astropy.table import Table\n", + "import numpy as np\n", + "\n", + "from kbmod.filters.known_object_filters import KnownObjsMatcher\n", + "from kbmod.results import Results" + ] + }, + { + "cell_type": "markdown", + "id": "43b6aaa7-9bf3-4ed7-8659-8daea4ed4765", + "metadata": {}, + "source": [ + "We start by loading the results data and known object data. The results data is the ecsv file produced by a KBMOD run and contains information on each trajectory found. The known object table is a given file with information on the location (RA, dec) of each observation at different time steps.\n", + "\n", + "We also extract the required metadata (global WCS and a list of all observation times) from the results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ce8dcad-fa91-4267-b2cf-7c787ac0d665", + "metadata": {}, + "outputs": [], + "source": [ + "# These two files are specific to the UW DEEP reprocessing runs and should be replaced\n", + "# by the user's files of interest.\n", + "res_file = \"/epyc/projects/kbmod/runs/DEEP/results/20190402_A0b_001.results.ecsv\"\n", + "known_file = \"/epyc/projects/kbmod/data/fakes_detections_joined.fits\"\n", + "\n", + "in_results = Results.read_table(res_file)\n", + "print(f\"Loaded a results table with {len(res_file)} entries and columns:\\n{in_results.colnames}\")\n", + "\n", + "wcs = in_results.wcs\n", + "if wcs is None:\n", + " raise ValueError(\"WCS missing from results file.\")\n", + "\n", + "if \"mjd_mid\" in in_results.table.meta:\n", + " obstimes = np.array(in_results.table.meta[\"mjd_mid\"])\n", + "else:\n", + " raise ValueError(\"Metadata 'mjd_mid' missing from results file.\")\n", + "print(f\"Loaded {len(obstimes)} timestamps.\")\n", + "\n", + "known_table = Table.read(known_file)\n", + "print(f\"\\n\\nLoaded a known objects table with {len(known_table)} entries and columns:\\n{known_table.colnames}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c06cc1e2-2186-4418-a746-1bbe5e7bc971", + "metadata": {}, + "source": [ + "We use the `KnownObjsMatcher` to determine which of the found results correspond to previously known objects. `KnownObjsMatcher` provides the ability to match by either the number or ratio of observations that are in close proximity to the known object. Here we use a minimum number with reasonable proximity thresholds in space and time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19eeeb00-b7f2-4969-8737-185f9161b34a", + "metadata": {}, + "outputs": [], + "source": [ + "min_obs = 10\n", + "fake_matcher = KnownObjsMatcher(\n", + " known_table,\n", + " obstimes,\n", + " matcher_name=\"known_matcher\",\n", + " sep_thresh=2.0, # Obs must be within 2 arcsecs.\n", + " time_thresh_s=600.0, # Obs must be within 10 minutes.\n", + " name_col=\"ORBITID\", # For the DEEP-data known objects only.\n", + ")\n", + "\n", + "# First create the matches column.\n", + "fake_matcher.match(in_results, wcs)\n", + "\n", + "# Second filter the matches.\n", + "fake_matcher.match_on_min_obs(in_results, min_obs)\n", + "\n", + "matched_col_name = fake_matcher.match_min_obs_col(min_obs)\n", + "print(f\"Matches stored in column '{matched_col_name}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "2d8faadb-a496-4798-a0df-27712e4db8cd", + "metadata": {}, + "source": [ + "Iterate over the matched column computing a Boolean of whether there was any match (True if the match list is not empty). Add the resulting list as a new \"is_known\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6490988-242e-4a0f-b355-fcb1804d118d", + "metadata": {}, + "outputs": [], + "source": [ + "is_known = ~in_results.is_empty_value(matched_col_name)\n", + "in_results.table[\"is_known\"] = is_known\n", + "matched_count = np.count_nonzero(is_known)\n", + "\n", + "print(f\"Found {matched_count} of the {len(in_results)} results matched known objects.\")" + ] + }, + { + "cell_type": "markdown", + "id": "c4c79978-ed82-4261-abd4-14e745efb09a", + "metadata": {}, + "source": [ + "We could save the resulting joined table using:\n", + "```\n", + "in_results.write_table(output_filename)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56aadd08-aaed-4de5-a9eb-77ecafef1e55", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Jeremy's KBMOD", + "language": "python", + "name": "kbmod_jk" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/kbmod/filters/known_object_filters.py b/src/kbmod/filters/known_object_filters.py index 6c6cdc6ec..a3fa20f8a 100644 --- a/src/kbmod/filters/known_object_filters.py +++ b/src/kbmod/filters/known_object_filters.py @@ -22,9 +22,9 @@ class KnownObjsMatcher: In addition to modifying a KBMOD `Results` table to include columns for matched known objects, it also provides methods for filtering the results based on the matches. This includes - marking observations that matched to known objects as invalid, and filtering out results that matched to known objects by - either the minimum number of observations that matched to that known object or the proportion - of observations from the catalog for that known object that were matched to a given result. + marking observations that matched to known objects as invalid, and filtering out results that matched to + known objects by either the minimum number of observations that matched to that known object or the + proportion of observations from the catalog for that known object that were matched to a given result. """ def __init__( @@ -152,12 +152,13 @@ def to_skycoords(self): def match(self, result_data, wcs): """This function takes a list of results and matches them to known objects. - This modifies the `Results` table by adding a column with name `self.matcher_name` that provides for each result a dictionary mapping the names of known - objects (as defined by the catalog's `name_col`) to a boolean array indicating which observations - in the result matched to that known object. Note that depending on the matching parameters, a result - can match to multiple known objects from the catalog even at the same observation time. + This modifies the `Results` table by adding a column with name `self.matcher_name` that provides for each + result a dictionary mapping the names of known objects (as defined by the catalog's `name_col`) to a boolean + array indicating which observations in the result matched to that known object. Note that depending on the + matching parameters, a result can match to multiple known objects from the catalog even at the same observation time. - So for a dataset with 5 observations a result matching to 2 known objects, A and B, might have an entry in the column `self.matcher_name` like: + So for a dataset with 5 observations a result matching to 2 known objects, A and B, might have an entry in + the column `self.matcher_name` like: ```{ "A": [True, True, False, False, False], "B": [False, False, False, True, True], @@ -192,7 +193,8 @@ def match(self, result_data, wcs): # can use this to later map back to the original index of all observations in the stack. trj_idx_to_obs_idx = np.where(result_data[result_idx]["obs_valid"])[0] - # Now we can compare the SkyCoords of the known objects to the SkyCoords of the result trajectories using search_around_sky + # Now we can compare the SkyCoords of the known objects to the SkyCoords of the result trajectories + # using search_around_sky. # This will return a list of indices of known objects that are within sep_thresh of a trajectory # Note that subsequent calls by default will use the same underlying KD-Tree iin coords2.cache. trjs_idx, known_objs_idx, _, _ = search_around_sky( @@ -297,6 +299,11 @@ def match_on_min_obs( `Results` The modified `Results` object returned for chaining. """ + if self.matcher_name not in result_data.table.colnames: + raise ValueError( + f"Column {self.matcher_name} not found in results table. Please run match() first." + ) + matched_objs = [] for idx in range(len(result_data)): matched_objs.append(set([])) @@ -341,6 +348,11 @@ def match_on_obs_ratio( if obs_ratio < 0 or obs_ratio > 1: raise ValueError("obs_ratio must be within the range [0, 1].") + if self.matcher_name not in result_data.table.colnames: + raise ValueError( + f"Column {self.matcher_name} not found in results table. Please run match() first." + ) + # Create a dictionary of how many observations we have for each known object # in our catalog known_obj_cnts = dict(Counter(self.data[self.name_col])) diff --git a/src/kbmod/results.py b/src/kbmod/results.py index 9cd79c57f..15b6ebac6 100644 --- a/src/kbmod/results.py +++ b/src/kbmod/results.py @@ -480,6 +480,38 @@ def mask_based_on_invalid_obs(self, input_mat, mask_value): masked_mat[~self.table["obs_valid"]] = mask_value return masked_mat + def is_empty_value(self, colname): + """Create a Boolean vector indicating whether the entry in each row + is an 'empty' value (None or anything of length 0). Used to mark or + filter missing values. + + Parameter + --------- + colname : str + The name of the column to check. + + Returns + ------- + result : `numpy.ndarray` + An array of Boolean values indicating whether the result is + one of the empty values. + """ + if colname not in self.table.colnames: + raise KeyError(f"Querying unknown column {colname}") + + # Skip numeric types (integers, floats, etc.) + result = np.full(len(self.table), False) + if np.issubdtype(self.table[colname].dtype, np.number): + return result + + # Go through each entry and check whether it is None or something of length=0. + for idx, val in enumerate(self.table[colname]): + if val is None: + result[idx] = True + elif hasattr(val, "__len__") and len(val) == 0: + result[idx] = True + return result + def filter_rows(self, rows, label=""): """Filter the rows in the `Results` to only include those indices that are provided in a list of row indices (integers) or marked diff --git a/tests/test_known_object_filters.py b/tests/test_known_object_filters.py index a6d2e7b11..c19ad8029 100644 --- a/tests/test_known_object_filters.py +++ b/tests/test_known_object_filters.py @@ -595,15 +595,23 @@ def test_match_min_obs(self): def test_empty_filter_matches(self): # Test that we can filter matches with an empty Results table + empty_res = Results() matcher = KnownObjsMatcher( self.known_objs, self.obstimes, self.matcher_name, ) - # Adds a matching column to our empty table. - empty_res = matcher.match_on_obs_ratio(Results(), 0.5) + + # No matcher_name column in the data. + with self.assertRaises(ValueError): + _ = matcher.match_on_obs_ratio(empty_res, 0.5) + + # Do the match to add the columns. + matcher.match(empty_res, self.wcs) + empty_res = matcher.match_on_obs_ratio(empty_res, 0.5) + + # Test an invalid matching column with self.assertRaises(ValueError): - # Test an inavlid matching column matcher.filter_matches(empty_res, "empty") empty_res = matcher.filter_matches(empty_res, matcher.match_obs_ratio_col(0.5)) @@ -611,17 +619,29 @@ def test_empty_filter_matches(self): def test_empty_get_recovered_objects(self): # Test that we can get recovered objects with an empty Results table + empty_res = Results() matcher = KnownObjsMatcher( self.known_objs, self.obstimes, self.matcher_name, ) - # Adds a matching column to our empty table. - empty_res = matcher.match_on_min_obs(Results(), 5) + + # No matcher_name column in the data. + with self.assertRaises(ValueError): + _ = matcher.match_on_min_obs(empty_res, 5) + + # Do the match to add the columns. + matcher.match(empty_res, self.wcs) + empty_res = matcher.match_on_min_obs(empty_res, 5) + + # Test an invalid matching column with self.assertRaises(ValueError): - # Test an inavlid matching column matcher.get_recovered_objects(empty_res, "empty") recovered, missed = matcher.get_recovered_objects(empty_res, matcher.match_min_obs_col(5)) self.assertEqual(0, len(recovered)) self.assertEqual(0, len(missed)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_results.py b/tests/test_results.py index 84664e277..56bc73d73 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -251,6 +251,33 @@ def test_compute_likelihood_curves(self): ) self.assertTrue(np.array_equal(np.isfinite(lh_mat3), expected)) + def test_is_empty_value(self): + table = Results.from_trajectories(self.trj_list) + + # Create a two new columns: one with integers and the other with meaningless + # index pairs (three of which are empty) + nums_col = [i for i in range(len(table))] + table.table["nums"] = nums_col + + pairs_col = [(i, i + 1) for i in range(len(table))] + pairs_col[1] = None + pairs_col[3] = () + pairs_col[7] = () + table.table["pairs"] = pairs_col + + expected = [False] * len(table) + expected[1] = True + expected[3] = True + expected[7] = True + + # Check that we can tell which entries are empty. + nums_is_empty = table.is_empty_value("nums") + self.assertFalse(np.any(nums_is_empty)) + + pairs_is_empty = table.is_empty_value("pairs") + print(pairs_is_empty) + self.assertTrue(np.array_equal(pairs_is_empty, expected)) + def test_filter_by_index(self): table = Results.from_trajectories(self.trj_list) self.assertEqual(len(table), self.num_entries)