From 45f628171abb9268d9410cc7f50939603ca84109 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 11 Sep 2024 16:59:51 -0400 Subject: [PATCH 01/34] draft implementation using the TempArrayHandler --- .../tests/test_outlier_detection.py | 12 +- jwst/outlier_detection/utils.py | 251 +++++++++--------- 2 files changed, 138 insertions(+), 125 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index ebd732437b..c2e695eab4 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -600,8 +600,18 @@ def test_create_median(three_sci_as_asn, tmp_cwd): lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) + ## make this test meaningful w.r.t. handling of weights + with (lib_on_disk, lib_in_memory): + for lib in [lib_on_disk, lib_in_memory]: + for model in lib: + model.wht = np.ones_like(model.data) + model.wht[4,9] = 0.5 + lib.shelve(model, modify=True) + median_on_disk = create_median(lib_on_disk, 0.7) median_in_memory = create_median(lib_in_memory, 0.7) + assert np.isnan(median_in_memory[4,9]) + # Make sure the median library is the same for on-disk and in-memory - assert np.allclose(median_on_disk, median_in_memory) + assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 4abdfbad1b..03548b16d3 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -1,9 +1,10 @@ """ The ever-present utils sub-module. A home for all... """ -import warnings import numpy as np +import tempfile +from pathlib import Path from jwst.lib.pipe_utils import match_nans_and_flags from jwst.resample.resample import compute_image_pixel_area @@ -20,6 +21,112 @@ _ONE_MB = 1 << 20 +# not inheriting from MutableSequence here as insert is complicated +class TempArrayHandler: + def __init__(self, tempdir=""): + self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) + self._temp_path = Path(self._temp_dir.name) + self._filenames = [] + self._data_shape = None + self._data_dtype = None + + @property + def closed(self): + return not hasattr(self, "_temp_dir") + + def close(self): + if self.closed: + return + self._temp_dir.cleanup() + del self._temp_dir + + def __del__(self): + self.close() + + def __len__(self): + if self.closed: + raise Exception("use after close") + return len(self._filenames) + + def __getitem__(self, index): + if self.closed: + raise Exception("use after close") + fn = self._filenames[index] + return np.load(fn) + + def _validate_input(self, arr): + if arr.ndim != 2: + raise Exception(f"Only 2D arrays are supported: {arr.ndim}") + if self._data_shape is None: + self._data_shape = arr.shape + else: + if arr.shape != self._data_shape: + raise Exception( + f"Input shape mismatch: {arr.shape} != {self._data_shape}" + ) + if self._data_dtype is None: + self._data_dtype = arr.dtype + else: + if arr.dtype != self._data_dtype: + raise Exception( + f"Input dtype mismatch: {arr.dtype} != {self._data_dtype}" + ) + + def __setitem__(self, index, value): + self._validate_input(value) + if self.closed: + raise Exception("use after close") + fn = self._filenames[index] + if fn is None: + fn = self._temp_path / f"{index}.npy" + np.save(fn, value, False) + self._filenames[index] = fn + + def append(self, value): + if self.closed: + raise Exception("use after close") + index = 
len(self) + self._filenames.append(None) + self.__setitem__(index, value) + + def median(self, buffer_size=100 << 20): + if self.closed: + raise Exception("use after close") + if not len(self): + raise Exception("can't take median of empty list") + + # figure out how big the buffer can be + n_arrays = len(self) + allowed_memory_per_array = buffer_size // n_arrays + + n_dim_1 = allowed_memory_per_array // ( + self._data_dtype.itemsize * self._data_shape[0] + ) + if n_dim_1 < 1: + # TODO more useful error message + raise Exception("Not enough memory") + if n_dim_1 >= self._data_shape[1]: + return np.nanmedian(self, axis=0) + + buffer = np.empty( + (n_arrays, self._data_shape[0], n_dim_1), dtype=self._data_dtype + ) + median = np.empty(self._data_shape, dtype=self._data_dtype) + + e = n_dim_1 + slices = [slice(0, e)] + while e <= self._data_shape[1]: + s = e + e += n_dim_1 + slices.append(slice(s, min(e, self._data_shape[1]))) + + for s in slices: + for i, arr in enumerate(self): + buffer[i, :, : (s.stop - s.start)] = arr[:, s] + median[:, s] = np.nanmedian(buffer[:, :, : (s.stop - s.start)], axis=0) + return median + + def create_cube_median(cube_model, maskpt): log.info("Computing median") @@ -55,136 +162,32 @@ def create_median(resampled_models, maskpt, on_disk=True, buffer_size=10.0): median_image : ndarray The median image. """ + # initialize storage for median computation + if on_disk: + # harder case: need special on-disk numpy array handling + model_list = TempArrayHandler() + else: + # easier case: all models in library can be loaded into memory at once + model_list = [] + # Compute the weight threshold for each input model - weight_thresholds = [] with resampled_models: for resampled in resampled_models: weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - weight_thresholds.append(weight_threshold) - resampled_models.shelve(resampled, modify=False) - - # compute median over all models + mask = np.less(resampled.wht, weight_threshold) + resampled.data[mask] = np.nan + model_list.append(resampled.data) + # this is still modified if on_disk is False, but doesn't matter because never used again + resampled_models.shelve(resampled, modify=False) + del resampled + if not on_disk: - # easier case: all models in library can be loaded into memory at once - model_list = [] - with resampled_models: - for resampled in resampled_models: - model_list.append(resampled.data) - resampled_models.shelve(resampled, modify=False) - del resampled return np.nanmedian(np.array(model_list), axis=0) else: - # set up buffered access to all input models - with resampled_models: - example_model = resampled_models.borrow(0) - shp = example_model.data.shape - dtype = example_model.data.dtype - nsections, section_nrows = _compute_buffer_indices(example_model, buffer_size) - resampled_models.shelve(example_model, modify=False) - del example_model - - # get spatial sections of library and compute timewise median, one by one - resampled_sections = _get_sections(resampled_models, nsections, section_nrows, shp[0]) - median_image_empty = np.empty(shp, dtype) * np.nan - return _create_median(resampled_sections, resampled_models, weight_thresholds, median_image_empty) - - -def _get_sections(library, nsections, section_nrows, imrows): - """Iterator to return sections from a ModelLibrary. - - Parameters - ---------- - library : ModelLibrary - The input data models. 
- - nsections : int - The number of spatial sections in each model - - section_nrows : int - The number of rows in each section - - imrows : int - The total number of rows in the image - - Yields - ------ - data_subset : ndarray - array of shape (len(library), section_nrows, ncols) representing a spatial - subset of all the data arrays in the library - - weight_subset : ndarray - weights corresponding to data_list - - row_range : tuple - The range of rows in the image covered by the data arrays - """ - with library: - example_model = library.borrow(0) - dtype = example_model.data.dtype - dtype_wht = example_model.wht.dtype - shp = example_model.data.shape - library.shelve(example_model, 0, modify=False) - del example_model - for i in range(nsections): - row1 = i * section_nrows - row2 = min(row1 + section_nrows, imrows) - - data_subset = np.empty((len(library), row2 - row1, shp[1]), dtype) - weight_subset = np.empty((len(library), row2 - row1, shp[1]), dtype_wht) - with library: - for j, model in enumerate(library): - data_subset[j] = model.data[row1:row2] - weight_subset[j] = model.wht[row1:row2] - library.shelve(model, j, modify=False) - del model - yield (data_subset, weight_subset, (row1, row2)) - - -def _compute_buffer_indices(model, buffer_size=None): - - imrows, imcols = model.data.shape - data_item_size = model.data.itemsize - min_buffer_size = imcols * data_item_size - buffer_size = min_buffer_size if buffer_size is None else (buffer_size * _ONE_MB) - section_nrows = min(imrows, int(buffer_size // min_buffer_size)) - if section_nrows == 0: - buffer_size = min_buffer_size - log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB") - section_nrows = 1 - nsections = int(np.ceil(imrows / section_nrows)) - return nsections, section_nrows - - -def _create_median(resampled_sections, resampled_models, weight_thresholds, median_image_empty): - median_image = median_image_empty - for (resampled_sci, resampled_weight, (row1, row2)) in resampled_sections: - # Create a mask for each input image, masking out areas where there is - # no data or the data has very low weight - badmasks = [] - for weight, weight_threshold in zip(resampled_weight, weight_thresholds): - badmask = np.less(weight, weight_threshold) - log.debug("Percentage of pixels with low weight: {}".format( - np.sum(badmask) / len(weight.flat) * 100)) - badmasks.append(badmask) - - # Fill resampled_sci array with nan's where mask values are True - for f1, f2 in zip(resampled_sci, badmasks): - for elem1, elem2 in zip(f1, f2): - elem1[elem2] = np.nan - - del badmasks - - # For a stack of images with "bad" data replaced with Nan - # use np.nanmedian to compute the median. 
- with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - median_image[row1:row2] = np.nanmedian(resampled_sci, axis=0) - del resampled_sci, resampled_weight - - return median_image + # compute median using on-disk arrays + median_data = model_list.median(buffer_size=buffer_size*_ONE_MB) + model_list.close() + return median_data def flag_crs_in_models( From a08277f3cd4e5ef64f530281af7a29a93637fb18 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 12:35:20 -0400 Subject: [PATCH 02/34] added nicer errors and type hints to TempArrayHandler --- jwst/outlier_detection/utils.py | 52 +++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 03548b16d3..a67a33ce08 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -21,9 +21,19 @@ _ONE_MB = 1 << 20 +class TempArrayHandlerError(Exception): + """Generic error for TempArrayHandler.""" + pass + +class UseAfterCloseError(TempArrayHandlerError): + def __init__(self, msg="Attempted use after close", *args, **kwargs): + super().__init__(msg, *args, **kwargs) + # not inheriting from MutableSequence here as insert is complicated class TempArrayHandler: - def __init__(self, tempdir=""): + """Handler for operations on a list of 2-D numpy arrays that are too large + to all fit in memory at once.""" + def __init__(self, tempdir: str=""): self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) self._filenames = [] @@ -43,57 +53,57 @@ def close(self): def __del__(self): self.close() - def __len__(self): + def __len__(self) -> int: if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() return len(self._filenames) - def __getitem__(self, index): + def __getitem__(self, index: int) -> np.ndarray: if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() fn = self._filenames[index] return np.load(fn) - def _validate_input(self, arr): + def _validate_input(self, arr: np.ndarray): if arr.ndim != 2: - raise Exception(f"Only 2D arrays are supported: {arr.ndim}") + raise TempArrayHandlerError(f"Only 2D arrays are supported: {arr.ndim}") if self._data_shape is None: self._data_shape = arr.shape else: if arr.shape != self._data_shape: - raise Exception( + raise TempArrayHandlerError( f"Input shape mismatch: {arr.shape} != {self._data_shape}" ) if self._data_dtype is None: self._data_dtype = arr.dtype else: if arr.dtype != self._data_dtype: - raise Exception( + raise TempArrayHandlerError( f"Input dtype mismatch: {arr.dtype} != {self._data_dtype}" ) - def __setitem__(self, index, value): + def __setitem__(self, index: int, value: np.ndarray): self._validate_input(value) if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() fn = self._filenames[index] if fn is None: fn = self._temp_path / f"{index}.npy" np.save(fn, value, False) self._filenames[index] = fn - def append(self, value): + def append(self, value: np.ndarray): if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() index = len(self) self._filenames.append(None) self.__setitem__(index, value) - def median(self, buffer_size=100 << 20): + def median(self, buffer_size: int=100 << 20) -> np.ndarray: if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() if not len(self): - raise Exception("can't take 
median of empty list") + raise TempArrayHandlerError("Can't take median of empty list") # figure out how big the buffer can be n_arrays = len(self) @@ -104,9 +114,13 @@ def median(self, buffer_size=100 << 20): ) if n_dim_1 < 1: # TODO more useful error message - raise Exception("Not enough memory") + msg = f"Memory buffer {allowed_memory_per_array} is too small to hold "+ \ + f"a single row of length {self._data_shape[0]}" + raise TempArrayHandlerError(msg) if n_dim_1 >= self._data_shape[1]: return np.nanmedian(self, axis=0) + log.info(f"Computing median in {int(self._data_shape[1]/n_dim_1)} chunks"+ \ + f"of size {(n_dim_1, self._data_shape[0])}") buffer = np.empty( (n_arrays, self._data_shape[0], n_dim_1), dtype=self._data_dtype @@ -139,7 +153,7 @@ def create_cube_median(cube_model, maskpt): return median -def create_median(resampled_models, maskpt, on_disk=True, buffer_size=10.0): +def create_median(resampled_models, maskpt, on_disk=True, buffer_size=1.0): """Create a median image from the singly resampled images. Parameters @@ -185,7 +199,7 @@ def create_median(resampled_models, maskpt, on_disk=True, buffer_size=10.0): return np.nanmedian(np.array(model_list), axis=0) else: # compute median using on-disk arrays - median_data = model_list.median(buffer_size=buffer_size*_ONE_MB) + median_data = model_list.median(buffer_size=int(buffer_size*_ONE_MB)) model_list.close() return median_data From d11b6f93344ff7fb43b42c626ebe6e8b15aacd73 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 16:20:22 -0400 Subject: [PATCH 03/34] first draft of appending to disk --- .../tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/utils.py | 310 ++++++++++-------- 2 files changed, 173 insertions(+), 139 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index c2e695eab4..18270aefc0 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -109,7 +109,7 @@ def we_many_sci( ): """Provide numsci science images with different noise but identical source and same background level""" - shape = (20, 20) + shape = (21, 20) sci1 = datamodels.ImageModel(shape) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index a67a33ce08..d47f43847f 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -2,6 +2,7 @@ The ever-present utils sub-module. A home for all... 
""" +import warnings import numpy as np import tempfile from pathlib import Path @@ -21,124 +22,34 @@ _ONE_MB = 1 << 20 -class TempArrayHandlerError(Exception): - """Generic error for TempArrayHandler.""" - pass - -class UseAfterCloseError(TempArrayHandlerError): - def __init__(self, msg="Attempted use after close", *args, **kwargs): - super().__init__(msg, *args, **kwargs) - -# not inheriting from MutableSequence here as insert is complicated -class TempArrayHandler: - """Handler for operations on a list of 2-D numpy arrays that are too large - to all fit in memory at once.""" - def __init__(self, tempdir: str=""): +class DiskAppendableArray: + + def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) - self._filenames = [] - self._data_shape = None - self._data_dtype = None + self._slice_shape = slice_shape + self._dtype = dtype + self._filename = self._temp_path / Path(filestem + ".bits") + self._append_count = 0 + with open(self._filename, 'wb') as f: + pass @property - def closed(self): - return not hasattr(self, "_temp_dir") - - def close(self): - if self.closed: - return - self._temp_dir.cleanup() - del self._temp_dir - - def __del__(self): - self.close() - - def __len__(self) -> int: - if self.closed: - raise UseAfterCloseError() - return len(self._filenames) - - def __getitem__(self, index: int) -> np.ndarray: - if self.closed: - raise UseAfterCloseError() - fn = self._filenames[index] - return np.load(fn) - - def _validate_input(self, arr: np.ndarray): - if arr.ndim != 2: - raise TempArrayHandlerError(f"Only 2D arrays are supported: {arr.ndim}") - if self._data_shape is None: - self._data_shape = arr.shape - else: - if arr.shape != self._data_shape: - raise TempArrayHandlerError( - f"Input shape mismatch: {arr.shape} != {self._data_shape}" - ) - if self._data_dtype is None: - self._data_dtype = arr.dtype - else: - if arr.dtype != self._data_dtype: - raise TempArrayHandlerError( - f"Input dtype mismatch: {arr.dtype} != {self._data_dtype}" - ) - - def __setitem__(self, index: int, value: np.ndarray): - self._validate_input(value) - if self.closed: - raise UseAfterCloseError() - fn = self._filenames[index] - if fn is None: - fn = self._temp_path / f"{index}.npy" - np.save(fn, value, False) - self._filenames[index] = fn - - def append(self, value: np.ndarray): - if self.closed: - raise UseAfterCloseError() - index = len(self) - self._filenames.append(None) - self.__setitem__(index, value) - - def median(self, buffer_size: int=100 << 20) -> np.ndarray: - if self.closed: - raise UseAfterCloseError() - if not len(self): - raise TempArrayHandlerError("Can't take median of empty list") - - # figure out how big the buffer can be - n_arrays = len(self) - allowed_memory_per_array = buffer_size // n_arrays - - n_dim_1 = allowed_memory_per_array // ( - self._data_dtype.itemsize * self._data_shape[0] - ) - if n_dim_1 < 1: - # TODO more useful error message - msg = f"Memory buffer {allowed_memory_per_array} is too small to hold "+ \ - f"a single row of length {self._data_shape[0]}" - raise TempArrayHandlerError(msg) - if n_dim_1 >= self._data_shape[1]: - return np.nanmedian(self, axis=0) - log.info(f"Computing median in {int(self._data_shape[1]/n_dim_1)} chunks"+ \ - f"of size {(n_dim_1, self._data_shape[0])}") - - buffer = np.empty( - (n_arrays, self._data_shape[0], n_dim_1), dtype=self._data_dtype - ) - median = np.empty(self._data_shape, dtype=self._data_dtype) - - e = 
n_dim_1 - slices = [slice(0, e)] - while e <= self._data_shape[1]: - s = e - e += n_dim_1 - slices.append(slice(s, min(e, self._data_shape[1]))) - - for s in slices: - for i, arr in enumerate(self): - buffer[i, :, : (s.stop - s.start)] = arr[:, s] - median[:, s] = np.nanmedian(buffer[:, :, : (s.stop - s.start)], axis=0) - return median + def shape(self): + return (self._append_count,) + self._slice_shape + + def append(self, data): + if data.shape != self._slice_shape: + raise ValueError(f"Data shape {data.shape} does not match slice shape {self._slice_shape}") + if data.dtype != self._dtype: + raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") + with open(self._filename, 'ab') as f: + f.write(data.tobytes()) + self._append_count += 1 + + def read(self): + shp = (self._append_count,) + self._slice_shape + return np.fromfile(self._filename, dtype=self._dtype).reshape(shp) def create_cube_median(cube_model, maskpt): @@ -153,7 +64,7 @@ def create_cube_median(cube_model, maskpt): return median -def create_median(resampled_models, maskpt, on_disk=True, buffer_size=1.0): +def create_median(resampled_models, maskpt, on_disk=True): """Create a median image from the singly resampled images. Parameters @@ -167,41 +78,164 @@ def create_median(resampled_models, maskpt, on_disk=True, buffer_size=1.0): on_disk : bool If True, the input models are on disk and will be read in chunks. - buffer_size : float - The size of chunk in MB, per input model, that will be read into memory. - This parameter has no effect if on_disk is False. - Returns ------- median_image : ndarray The median image. """ - # initialize storage for median computation - if on_disk: - # harder case: need special on-disk numpy array handling - model_list = TempArrayHandler() - else: - # easier case: all models in library can be loaded into memory at once - model_list = [] - # Compute the weight threshold for each input model + weight_thresholds = [] with resampled_models: for resampled in resampled_models: weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - mask = np.less(resampled.wht, weight_threshold) - resampled.data[mask] = np.nan - model_list.append(resampled.data) - # this is still modified if on_disk is False, but doesn't matter because never used again - resampled_models.shelve(resampled, modify=False) - del resampled - + weight_thresholds.append(weight_threshold) + resampled_models.shelve(resampled, modify=False) + + # compute median over all models if not on_disk: + # easier case: all models in library can be loaded into memory at once + model_list = [] + with resampled_models: + for resampled in resampled_models: + model_list.append(resampled.data) + resampled_models.shelve(resampled, modify=False) + del resampled return np.nanmedian(np.array(model_list), axis=0) else: - # compute median using on-disk arrays - median_data = model_list.median(buffer_size=int(buffer_size*_ONE_MB)) - model_list.close() - return median_data + # set up buffered access to all input models + # get spatial sections of library and compute timewise median, one by one + resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds) + return _create_median(resampled_sections, row_indices) + + +def _write_sections(library, weight_thresholds): + """Iterator to return sections from a ModelLibrary. + + Parameters + ---------- + library : ModelLibrary + The input data models. + + weight_thresholds : list + The weight thresholds for masking out low weight pixels. 
+ + Returns + ------- + temp_arrays : list + A list of DiskAppendableArray objects, each holding a spatial section + of every input model, stacked along the time axis. + + row_indices : list + A list of tuples, each containing the start and end row indices of the + spatial section in the original data model. + """ + + with library: + + # get an example model to determine dtype, shape, buffer indices + example_model = library.borrow(0) + dtype = example_model.data.dtype + shp = example_model.data.shape + itemsize = example_model.data.itemsize + imrows = shp[0] + + # reasonable total buffer size is the size of each input datamodel + # since that is the bare minimum that must be loaded into memory in the first place + # for now just use the size of model.data + total_buffer_size = shp[0] * shp[1] * itemsize + per_model_buffer_size = total_buffer_size / len(library) + nsections, section_nrows = _compute_buffer_indices(shp, itemsize, per_model_buffer_size) + log.info(f"Computing median in {nsections} sections each of size: {total_buffer_size / _ONE_MB} MB") + library.shelve(example_model, 0, modify=False) + del example_model + + # set up temp file handlers for each section + # handle zeroth section separately just to get the tempdir for the rest + slice_shape = (section_nrows, shp[1]) + arr0 = DiskAppendableArray(slice_shape, filestem="section0", dtype=dtype) + tempdir = arr0._temp_dir.name + temp_arrays = [arr0,] + for i in range(1, nsections-1): + arr = DiskAppendableArray(slice_shape, tempdir=tempdir, filestem=f"section{i}", dtype=dtype) + temp_arrays.append(arr) + + # handle the last section separately because it has a different shape + slice_shape_last = (imrows - (nsections-1) * section_nrows, shp[1]) + arrn = DiskAppendableArray(slice_shape_last, tempdir=tempdir, filestem=f"section{nsections-1}", dtype=dtype) + temp_arrays.append(arrn) + + # now append data from each model to all the sections + row_indices = [] + for j, model in enumerate(library): + for i in range(nsections): + row1 = i * section_nrows + row2 = min(row1 + section_nrows, imrows) + if j == 0: + row_indices.append((row1, row2)) + arr = temp_arrays[i] + sci = model.data[row1:row2] + + # handle weight thresholding right here, while array is open + thresh = weight_thresholds[j] + sci[model.wht[row1:row2] < thresh] = np.nan + arr.append(sci) + + library.shelve(model, j, modify=False) + del model + + return temp_arrays, row_indices + + +def _compute_buffer_indices(shape, itemsize, buffer_size): + + imrows, imcols = shape + min_buffer_size = imcols * itemsize + section_nrows = min(imrows, int(buffer_size // min_buffer_size)) + + if section_nrows == 0: + buffer_size = min_buffer_size + log.warning("WARNING: Buffer size is too small to hold a single row." + f"Increasing buffer size to {buffer_size / _ONE_MB}MB") + section_nrows = 1 + + nsections = int(np.ceil(imrows / section_nrows)) + return nsections, section_nrows + + +def _create_median(resampled_sections, row_indices): + """ + Parameters + ---------- + resampled_sections : list + List of DiskAppendableArray objects, each holding a spatial section + of every input model, stacked along the time axis. + + row_indices : list + List of tuples, each containing the start and end row indices of the + spatial section in the original data model. + + Returns + ------- + median_image : ndarray + The median image. 
+ """ + + dtype = resampled_sections[0]._dtype + output_rows = row_indices[-1][1] + output_cols = resampled_sections[0].shape[2] + median_image = np.empty((output_rows, output_cols), dtype) * np.nan + + for i, disk_arr in enumerate(resampled_sections): + row1, row2 = row_indices[i] + arr = disk_arr.read() + with warnings.catch_warnings(): + warnings.filterwarnings(action="ignore", + message="All-NaN slice encountered", + category=RuntimeWarning) + median_image[row1:row2] = np.nanmedian(arr, axis=0) + del arr, disk_arr + + return median_image def flag_crs_in_models( From 131cfecdd86d905078c732b739d93dacc2eb41fb Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 18:12:17 -0400 Subject: [PATCH 04/34] add weight handling, change the way buffer compute works --- jwst/outlier_detection/imaging.py | 13 ++++- jwst/outlier_detection/spec.py | 10 +++- .../tests/test_outlier_detection.py | 7 ++- jwst/outlier_detection/utils.py | 55 ++++++++++++++----- 4 files changed, 66 insertions(+), 19 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 5678b45802..b7817797e2 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -100,7 +100,18 @@ def detect_outliers( input_models.shelve(model, modify=True) # Perform median combination on set of drizzled mosaics - median_data = create_median(drizzled_models, maskpt) + if in_memory: + buffer_size = None + else: + # reasonable buffer size is the size of an input model, since that is + # the smallest theoretical memory footprint + with input_models: + example_model = input_models.borrow(0) + buffer_size = example_model.data.size * example_model.data.itemsize + input_models.shelve(example_model, modify=False) + del example_model + + median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size) if save_intermediate_results: # make a median model diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index c482aad8b9..205c8576bb 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -93,7 +93,15 @@ def detect_outliers( # Perform median combination on set of drizzled mosaics # create_median should be called as a method from parent class - median_data = create_median(drizzled_models, maskpt) + if in_memory: + buffer_size = None + else: + # reasonable buffer size is the size of an input model, since that is + # the smallest theoretical memory footprint + example_model = input_models[0] + buffer_size = example_model.data.size * example_model.data.itemsize + del example_model + median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size) if save_intermediate_results: # Initialize intermediate products used in the outlier detection diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 22c75d1213..61f863029d 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -33,7 +33,7 @@ @pytest.fixture def sci_blot_image_pair(): """Provide a science and blotted ImageModel pair.""" - shape = (20, 20) + shape = (23, 20) sigma = 0.02 background = 3 @@ -608,7 +608,10 @@ def test_create_median(three_sci_as_asn, tmp_cwd): model.wht[4,9] = 0.5 lib.shelve(model, modify=True) - median_on_disk = create_median(lib_on_disk, 0.7) + # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels + # arbitrarily use 5 times that + buffer_size = 4 * 20 * 5 + 
median_on_disk = create_median(lib_on_disk, 0.7, buffer_size=buffer_size) median_in_memory = create_median(lib_in_memory, 0.7) assert np.isnan(median_in_memory[4,9]) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index e546e85116..fa4ee44385 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -31,7 +31,7 @@ def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): self._dtype = dtype self._filename = self._temp_path / Path(filestem + ".bits") self._append_count = 0 - with open(self._filename, 'wb') as f: + with open(self._filename, 'wb') as f: # Noqa: F841 pass @property @@ -64,7 +64,7 @@ def create_cube_median(cube_model, maskpt): return median -def create_median(resampled_models, maskpt): +def create_median(resampled_models, maskpt, buffer_size=None): """Create a median image from the singly resampled images. Parameters @@ -75,12 +75,17 @@ def create_median(resampled_models, maskpt): maskpt : float The weight threshold for masking out low weight pixels. + buffer_size : int + The buffer size for the median computation, units of bytes. + Returns ------- median_image : ndarray The median image. """ on_disk = resampled_models._on_disk + if on_disk and buffer_size is None: + raise ValueError("Buffer size must be provided when resampled models are on disk") # Compute the weight threshold for each input model weight_thresholds = [] @@ -109,11 +114,11 @@ def create_median(resampled_models, maskpt): else: # set up buffered access to all input models # get spatial sections of library and compute timewise median, one by one - resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds) + resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds, buffer_size) return _create_median(resampled_sections, row_indices) -def _write_sections(library, weight_thresholds): +def _write_sections(library, weight_thresholds, buffer_size): """Iterator to return sections from a ModelLibrary. Parameters @@ -124,6 +129,9 @@ def _write_sections(library, weight_thresholds): weight_thresholds : list The weight thresholds for masking out low weight pixels. + buffer_size : int + The buffer size for the median computation, units of bytes. + Returns ------- temp_arrays : list @@ -144,13 +152,8 @@ def _write_sections(library, weight_thresholds): itemsize = example_model.data.itemsize imrows = shp[0] - # reasonable total buffer size is the size of each input datamodel - # since that is the bare minimum that must be loaded into memory in the first place - # for now just use the size of model.data - total_buffer_size = shp[0] * shp[1] * itemsize - per_model_buffer_size = total_buffer_size / len(library) - nsections, section_nrows = _compute_buffer_indices(shp, itemsize, per_model_buffer_size) - log.info(f"Computing median in {nsections} sections each of size: {total_buffer_size / _ONE_MB} MB") + # compute buffer indices + nsections, section_nrows = _compute_buffer_indices((len(library),)+shp, itemsize, buffer_size) library.shelve(example_model, 0, modify=False) del example_model @@ -192,18 +195,40 @@ def _write_sections(library, weight_thresholds): def _compute_buffer_indices(shape, itemsize, buffer_size): + """ + Parameters + ---------- + shape : tuple + The shape of the full input, ie, (n_images, imrows, imcols). + + itemsize : int + The size of a single array element in bytes. + + buffer_size : int + The buffer size for the median computation, units of bytes. 
+ + Returns + ------- + nsections : int + The number of sections to divide the input data into. + + section_nrows : int + The number of rows in each section (except the last one). + """ - imrows, imcols = shape + nimages, imrows, imcols = shape + per_model_buffer_size = buffer_size / nimages min_buffer_size = imcols * itemsize - section_nrows = min(imrows, int(buffer_size // min_buffer_size)) + section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) if section_nrows == 0: - buffer_size = min_buffer_size + buffer_size = min_buffer_size * nimages log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB") + f"Increasing buffer size to {buffer_size / _ONE_MB}MB per model") section_nrows = 1 nsections = int(np.ceil(imrows / section_nrows)) + log.info(f"Computing median over {nimages} images in {nsections} sections with total memory buffer {buffer_size / _ONE_MB} MB") return nsections, section_nrows From a8c4c12d0fbb7d953eee792233cebc13b88a4e04 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 18:18:57 -0400 Subject: [PATCH 05/34] small cleanups from code self-review --- jwst/outlier_detection/tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/utils.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 61f863029d..0806dd42e7 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -33,7 +33,7 @@ @pytest.fixture def sci_blot_image_pair(): """Provide a science and blotted ImageModel pair.""" - shape = (23, 20) + shape = (20, 20) sigma = 0.02 background = 3 diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index fa4ee44385..8928c9b453 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -119,7 +119,8 @@ def create_median(resampled_models, maskpt, buffer_size=None): def _write_sections(library, weight_thresholds, buffer_size): - """Iterator to return sections from a ModelLibrary. + """Write spatial sections from a ModelLibrary into temporary files + grouped along the time axis. Parameters ---------- @@ -224,7 +225,7 @@ def _compute_buffer_indices(shape, itemsize, buffer_size): if section_nrows == 0: buffer_size = min_buffer_size * nimages log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB per model") + f"Increasing buffer size to {buffer_size / _ONE_MB}MB") section_nrows = 1 nsections = int(np.ceil(imrows / section_nrows)) From ac1a932aeea898f92298537d76dbeb8e0f57107f Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 13 Sep 2024 10:10:40 -0400 Subject: [PATCH 06/34] bugfix for failing regtest and additional docstrings --- jwst/outlier_detection/utils.py | 74 +++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 8928c9b453..7ff32642eb 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -23,8 +23,39 @@ class DiskAppendableArray: + """ + Creates a temporary file to which to append data, in order to perform + timewise operations on a stack of input images without holding all of them + in memory. 
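+
+    A minimal usage sketch (the shapes, the temp directory, and the `images`
+    iterable here are illustrative only):
+
+        arr = DiskAppendableArray((10, 20), dtype="float32", tempdir="/tmp")
+        for image in images:       # each a (10, 20) float32 array
+            arr.append(image)
+        cube = arr.read()          # ndarray of shape (len(images), 10, 20)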
+ + This class is purpose-built for the median computation during outlier detection + and is not very flexible. It is assumed that each data array passed to `append` + represents the same spatial segment of the full dataset. It is also assumed that + each data array passed to `append` represents only a single instant in time; + the append operation will stack them along a new axis. + + The `read` operation is only capable of loading the full array back into memory. + When working with large datasets that do not fit in memory, the + required workflow is to create many DiskAppendableArray objects, each holding + a small spatial segment of the full dataset. + """ def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): + """ + Parameters + ---------- + slice_shape : tuple + The shape of the spatial section of input data to be appended to the array. + + dtype : str + The data type of the array. + + tempdir : str + The directory in which to create the temporary files. + + filestem : str + The stem of the temporary file name. The extension ".bits" will be added. + """ self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) self._slice_shape = slice_shape @@ -84,28 +115,24 @@ def create_median(resampled_models, maskpt, buffer_size=None): The median image. """ on_disk = resampled_models._on_disk - if on_disk and buffer_size is None: - raise ValueError("Buffer size must be provided when resampled models are on disk") + if (on_disk) and (buffer_size is None): + raise ValueError("Buffer size for median calculation must be provided when models are on disk") - # Compute the weight threshold for each input model - weight_thresholds = [] + # Compute weight threshold for each input model and set NaN in data where weight is below threshold model_list = [] with resampled_models: for resampled in resampled_models: weight_threshold = compute_weight_threshold(resampled.wht, maskpt) + resampled.data[resampled.wht < weight_threshold] = np.nan if not on_disk: - # handle weights right away for in-memory case - data = resampled.data.copy() - data[resampled.wht < weight_threshold] = np.nan - model_list.append(data) - del data + model_list.append(resampled.data) + resampled_models.shelve(resampled, modify=False) else: - weight_thresholds.append(weight_threshold) - resampled_models.shelve(resampled, modify=False) + resampled_models.shelve(resampled, modify=True) del resampled - # easier case: all models in library can be loaded into memory at once if not on_disk: + # easier case: all models in library can be loaded into memory at once with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", @@ -114,11 +141,11 @@ def create_median(resampled_models, maskpt, buffer_size=None): else: # set up buffered access to all input models # get spatial sections of library and compute timewise median, one by one - resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds, buffer_size) + resampled_sections, row_indices = _write_sections(resampled_models, buffer_size) return _create_median(resampled_sections, row_indices) -def _write_sections(library, weight_thresholds, buffer_size): +def _write_sections(library, buffer_size): """Write spatial sections from a ModelLibrary into temporary files grouped along the time axis. @@ -127,9 +154,6 @@ def _write_sections(library, weight_thresholds, buffer_size): library : ModelLibrary The input data models. 
- weight_thresholds : list - The weight thresholds for masking out low weight pixels. - buffer_size : int The buffer size for the median computation, units of bytes. @@ -168,10 +192,11 @@ def _write_sections(library, weight_thresholds, buffer_size): arr = DiskAppendableArray(slice_shape, tempdir=tempdir, filestem=f"section{i}", dtype=dtype) temp_arrays.append(arr) - # handle the last section separately because it has a different shape - slice_shape_last = (imrows - (nsections-1) * section_nrows, shp[1]) - arrn = DiskAppendableArray(slice_shape_last, tempdir=tempdir, filestem=f"section{nsections-1}", dtype=dtype) - temp_arrays.append(arrn) + if nsections > 1: + # handle the last section separately because it has a different shape + slice_shape_last = (imrows - (nsections-1) * section_nrows, shp[1]) + arrn = DiskAppendableArray(slice_shape_last, tempdir=tempdir, filestem=f"section{nsections-1}", dtype=dtype) + temp_arrays.append(arrn) # now append data from each model to all the sections row_indices = [] @@ -182,12 +207,7 @@ def _write_sections(library, weight_thresholds, buffer_size): if j == 0: row_indices.append((row1, row2)) arr = temp_arrays[i] - sci = model.data[row1:row2] - - # handle weight thresholding right here, while array is open - thresh = weight_thresholds[j] - sci[model.wht[row1:row2] < thresh] = np.nan - arr.append(sci) + arr.append(model.data[row1:row2]) library.shelve(model, j, modify=False) del model From 177ab426740065bc1552fd98b79dce4d66de4a5d Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 13 Sep 2024 10:36:05 -0400 Subject: [PATCH 07/34] added changelog entry --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 4108daff15..044b600f6a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -528,6 +528,9 @@ outlier_detection - Fixed a bug that caused different results from the median calculation when the `in_memory` parameter was set to `True` vs `False`. [#8777] +- Decrease the amount of file I/O required to compute the median when `in_memory` + is set to `False`. 
[#8782]
+
 pathloss
 --------
 
From 51f126e163e11fb8d0810471f3261b120099b85d Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Fri, 13 Sep 2024 16:51:44 -0400
Subject: [PATCH 08/34] attempt to decrease memory usage by allocating the cube
 first

---
 jwst/outlier_detection/utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py
index 7ff32642eb..49bdec5618 100644
--- a/jwst/outlier_detection/utils.py
+++ b/jwst/outlier_detection/utils.py
@@ -119,13 +119,15 @@ def create_median(resampled_models, maskpt, buffer_size=None):
         raise ValueError("Buffer size for median calculation must be provided when models are on disk")
 
     # Compute weight threshold for each input model and set NaN in data where weight is below threshold
-    model_list = []
     with resampled_models:
-        for resampled in resampled_models:
+        for i, resampled in enumerate(resampled_models):
             weight_threshold = compute_weight_threshold(resampled.wht, maskpt)
             resampled.data[resampled.wht < weight_threshold] = np.nan
             if not on_disk:
-                model_list.append(resampled.data)
+                if i == 0:
+                    model_cube = np.empty((len(resampled_models),) + resampled.data.shape,
+                                          dtype=resampled.data.dtype)
+                model_cube[i] = resampled.data
                 resampled_models.shelve(resampled, modify=False)
             else:
                 resampled_models.shelve(resampled, modify=True)
@@ -137,7 +139,7 @@ def create_median(resampled_models, maskpt, buffer_size=None):
             warnings.filterwarnings(action="ignore",
                                     message="All-NaN slice encountered",
                                     category=RuntimeWarning)
-            return np.nanmedian(np.array(model_list), axis=0)
+            return np.nanmedian(model_cube, axis=0)

From 1fdfee4deada2664110351794d40277e13941ce8 Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Mon, 16 Sep 2024 13:34:50 -0400
Subject: [PATCH 09/34] refactor to allow adding models to median computer one
 by one

---
 jwst/outlier_detection/imaging.py          |  13 +----
 jwst/outlier_detection/spec.py             |  11 +---
 jwst/outlier_detection/tests/test_utils.py | 106 +++++++
 jwst/outlier_detection/utils.py            | 305 ++++++++++--------
 4 files changed, 259 insertions(+), 176 deletions(-)
 create mode 100644 jwst/outlier_detection/tests/test_utils.py

diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py
index b7817797e2..5678b45802 100644
--- a/jwst/outlier_detection/imaging.py
+++ b/jwst/outlier_detection/imaging.py
@@ -100,18 +100,7 @@ def detect_outliers(
         input_models.shelve(model, modify=True)
 
     # Perform median combination on set of drizzled mosaics
-    if in_memory:
-        buffer_size = None
-    else:
-        # reasonable buffer size is the size of an input model, since that is
-        # the smallest theoretical memory footprint
-        with input_models:
-            example_model = input_models.borrow(0)
-            buffer_size = example_model.data.size * example_model.data.itemsize
-            input_models.shelve(example_model, modify=False)
-            del example_model
-
-    median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size)
+    median_data = create_median(drizzled_models, maskpt)
 
     if save_intermediate_results:
         # make a median model
diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py
index 205c8576bb..ad8e712d4c 100644
--- a/jwst/outlier_detection/spec.py
+++ b/jwst/outlier_detection/spec.py
@@ -92,16 +92,7 @@ def detect_outliers(
     median_wcs = copy.deepcopy(input_models[0].meta.wcs)
 
     # Perform median combination on set of drizzled mosaics
-    # create_median should be 
called as a method from parent class - if in_memory: - buffer_size = None - else: - # reasonable buffer size is the size of an input model, since that is - # the smallest theoretical memory footprint - example_model = input_models[0] - buffer_size = example_model.data.size * example_model.data.itemsize - del example_model - median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size) + median_data = create_median(drizzled_models, maskpt) if save_intermediate_results: # Initialize intermediate products used in the outlier detection diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py new file mode 100644 index 0000000000..318384a2c8 --- /dev/null +++ b/jwst/outlier_detection/tests/test_utils.py @@ -0,0 +1,106 @@ +import pytest +from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskTimewiseOperation +from pathlib import Path +import numpy as np +import os + + +def test_disk_appendable_array(tmp_cwd): + + slice_shape = (8,7) + dtype = 'float32' + tempdir = tmp_cwd / Path("tmptest") + os.mkdir(tempdir) + + arr = DiskAppendableArray(slice_shape, dtype=dtype, tempdir=tempdir) + + # check temporary file setup + assert arr._filename.split("/")[-1] in os.listdir(tempdir) + assert len(os.listdir(tempdir)) == 1 + assert arr.shape == (0,) + slice_shape + + # check cwd contains no files + assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)]) + + # check expected append failures + with pytest.raises(ValueError): + candidate = np.zeros((7,7), dtype=dtype) + arr.append(candidate) + with pytest.raises(ValueError): + candidate = np.zeros((8,7), dtype='float64') + arr.append(candidate) + + # check append and read + candidate0 = np.zeros(slice_shape, dtype=dtype) + candidate1 = np.full(slice_shape, 2, dtype=dtype) + candidate2 = np.full(slice_shape, np.nan, dtype=dtype) + for candidate in [candidate0, candidate1, candidate2]: + arr.append(candidate) + + arr_in_memory = arr.read() + + assert arr_in_memory.shape == (3,) + slice_shape + assert np.all(arr_in_memory[0] == candidate0) + assert np.all(arr_in_memory[1] == candidate1) + assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True) + + # check cleanup occurs after reassigning arr + arr = None + assert len(os.listdir(tempdir)) == 0 + + + +def test_on_disk_median(tmp_cwd): + + library_length = 3 + frame_shape = (21,20) + dtype = 'float32' + tempdir = tmp_cwd / Path("tmptest") + os.mkdir(tempdir) + shape = (library_length,) + frame_shape + + median_computer = OnDiskTimewiseOperation(shape, dtype=dtype, tempdir=tempdir) + + # test compute buffer indices + # buffer size equals size of single input model by default + # which means we expect same number of sections as library length + # in reality there is often one more section than that because + # of necessity of integer number of rows per section, but math is exact in this case + expected_buffer_size = frame_shape[0] * frame_shape[1] * np.dtype(dtype).itemsize + expected_section_nrows = frame_shape[0] // library_length + assert median_computer.nsections == library_length + assert median_computer.section_nrows == expected_section_nrows + assert median_computer.buffer_size == expected_buffer_size + + # test temp file setup + assert len(os.listdir(tempdir)) == 1 + assert str(median_computer._temp_path).startswith(str(tempdir)) + assert len(os.listdir(median_computer._temp_path)) == library_length + # check cwd and parent tempdir contain no files + assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)]) + 
assert all([not os.path.isfile(f) for f in os.listdir(tempdir)]) + + # test validate data + with pytest.raises(ValueError): + candidate = np.zeros((20,20), dtype=dtype) + median_computer.add_image(candidate) + with pytest.raises(ValueError): + candidate = np.zeros((21,20), dtype='float64') + median_computer.add_image(candidate) + + # test add and compute + candidate0 = np.full(frame_shape, 3, dtype=dtype) + candidate1 = np.full(frame_shape, 2, dtype=dtype) + candidate2 = np.full(frame_shape, np.nan, dtype=dtype) + for candidate in [candidate0, candidate1, candidate2]: + median_computer.add_image(candidate) + median = median_computer.compute_median() + assert median.shape == frame_shape + assert np.all(median == 2.5) + + # test expected error trying to add too many frames + # for loop to ensure always happens, not just the first time + candidate3 = np.zeros_like(candidate0) + for _ in range(2): + with pytest.raises(IndexError): + median_computer.add_image(candidate3) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 49bdec5618..309ecf1451 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -22,6 +22,27 @@ _ONE_MB = 1 << 20 +def _nanmedian3D(cube, overwrite_input=True): + """Convenience function to compute the median of a cube ignoring warnings + and setting memory-efficient flag""" + with warnings.catch_warnings(): + warnings.filterwarnings(action="ignore", + message="All-NaN slice encountered", + category=RuntimeWarning) + return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input) + + +def create_cube_median(cube_model, maskpt): + log.info("Computing median") + + weight_threshold = compute_weight_threshold(cube_model.wht, maskpt) + + # not safe to use overwrite_input=True here because we are operating on model.data directly + return _nanmedian3D( + np.ma.masked_array(cube_model.data, np.less(cube_model.wht, weight_threshold), fill_value=np.nan), + overwrite_input=False) + + class DiskAppendableArray: """ Creates a temporary file to which to append data, in order to perform @@ -40,7 +61,7 @@ class DiskAppendableArray: a small spatial segment of the full dataset. """ - def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): + def __init__(self, slice_shape, dtype='float32', tempdir=""): """ Parameters ---------- @@ -52,15 +73,12 @@ def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): tempdir : str The directory in which to create the temporary files. - - filestem : str - The stem of the temporary file name. The extension ".bits" will be added. + Default is the current working directory. 
""" - self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) - self._temp_path = Path(self._temp_dir.name) + self._temp_file = tempfile.NamedTemporaryFile(dir=tempdir) + self._filename = self._temp_file.name self._slice_shape = slice_shape self._dtype = dtype - self._filename = self._temp_path / Path(filestem + ".bits") self._append_count = 0 with open(self._filename, 'wb') as f: # Noqa: F841 pass @@ -83,18 +101,6 @@ def read(self): return np.fromfile(self._filename, dtype=self._dtype).reshape(shp) -def create_cube_median(cube_model, maskpt): - log.info("Computing median") - - weight_threshold = compute_weight_threshold(cube_model.wht, maskpt) - - median = np.nanmedian( - np.ma.masked_array(cube_model.data, np.less(cube_model.wht, weight_threshold), fill_value=np.nan), - axis=0, - ) - return median - - def create_median(resampled_models, maskpt, buffer_size=None): """Create a median image from the singly resampled images. @@ -106,17 +112,12 @@ def create_median(resampled_models, maskpt, buffer_size=None): maskpt : float The weight threshold for masking out low weight pixels. - buffer_size : int - The buffer size for the median computation, units of bytes. - Returns ------- median_image : ndarray The median image. """ on_disk = resampled_models._on_disk - if (on_disk) and (buffer_size is None): - raise ValueError("Buffer size for median calculation must be provided when models are on disk") # Compute weight threshold for each input model and set NaN in data where weight is below threshold with resampled_models: @@ -128,9 +129,16 @@ def create_median(resampled_models, maskpt, buffer_size=None): model_cube = np.empty((len(resampled_models),) + resampled.data.shape, dtype=resampled.data.dtype) model_cube[i] = resampled.data - resampled_models.shelve(resampled, modify=False) else: - resampled_models.shelve(resampled, modify=True) + if i == 0: + # set up temporary storage for spatial sections of all input models + shp = (len(resampled_models),) + resampled.data.shape + median_computer = OnDiskTimewiseOperation(shp, + dtype=resampled.data.dtype, + buffer_size=buffer_size) + # write spatial sections to disk + median_computer.add_image(resampled.data) + resampled_models.shelve(resampled, i, modify=False) del resampled if not on_disk: @@ -139,156 +147,145 @@ def create_median(resampled_models, maskpt, buffer_size=None): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(model_cube, axis=0) + return _nanmedian3D(model_cube) else: - # set up buffered access to all input models - # get spatial sections of library and compute timewise median, one by one - resampled_sections, row_indices = _write_sections(resampled_models, buffer_size) - return _create_median(resampled_sections, row_indices) + # retrieve spatial sections from disk one by one and compute timewise median + return median_computer.compute_median() -def _write_sections(library, buffer_size): - """Write spatial sections from a ModelLibrary into temporary files - grouped along the time axis. +class OnDiskTimewiseOperation: - Parameters - ---------- - library : ModelLibrary - The input data models. - - buffer_size : int - The buffer size for the median computation, units of bytes. + def __init__(self, shape, tempdir="", dtype='float32', buffer_size=None): + """ + Class to compute timewise operations on a stack of input images without + holding all of them in memory. At present, only the median operation is + supported. 
To extend, create a function definition for the desired
+        operation modeled after the `compute_median` method.
+
+        Parameters
+        ----------
+        shape : tuple
+            The shape of the entire input, (n_images, imrows, imcols).
+
+        tempdir : str
+            The parent directory in which to create the temporary directory,
+            which itself holds all the DiskAppendableArray tempfiles.
+            Default is the current working directory.
+
+        dtype : str
+            The data type of the input data.
+
+        buffer_size : int, optional
+            The buffer size for the median computation, units of bytes.
+            If None, the buffer size will be set equal to the size of one input
+            resampled image. This is the largest it can be without increasing
+            the memory footprint of the entire step: the drizzled model is
+            written to disk just before the median calculation handled by this
+            class, so the resampled image can be discarded from memory before
+            calling compute_median(), freeing that memory for the median
+            calculation.
+        """
+        self._expected_nframes = shape[0]
+        self.frame_shape = shape[1:]
+        self.dtype = dtype
+        self.itemsize = np.dtype(dtype).itemsize
+        self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir)
+        self._temp_path = Path(self._temp_dir.name)
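+
+        # Worked example of the bookkeeping below (an illustration that
+        # mirrors the unit test in this commit): for shape=(3, 21, 20) and
+        # float32 data, the default buffer size is one full frame,
+        # 21 * 20 * 4 = 1680 bytes; each of the 3 stacked frames then gets
+        # 1680 / 3 = 560 bytes of buffer, i.e. 560 // (20 * 4) = 7 rows per
+        # section, so each frame is split into ceil(21 / 7) = 3 sections.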
+ # figure out number of sections and rows per section that are needed + self.nsections, self.section_nrows = self._get_buffer_indices(buffer_size=buffer_size) + self.slice_shape = (self.section_nrows, shape[2]) + self._n_adds = 0 - itemsize : int - The size of a single array element in bytes. + # instantiate a temporary DiskAppendableArray for each section + self._temp_arrays = self._temparray_setup(dtype) - buffer_size : int - The buffer size for the median computation, units of bytes. - Returns - ------- - nsections : int - The number of sections to divide the input data into. + def _get_buffer_indices(self, buffer_size=None): + """ + Parameters + ---------- + buffer_size : int, optional + The buffer size for the median computation, units of bytes. - section_nrows : int - The number of rows in each section (except the last one). - """ + Returns + ------- + nsections : int + The number of sections to divide the input data into. - nimages, imrows, imcols = shape - per_model_buffer_size = buffer_size / nimages - min_buffer_size = imcols * itemsize - section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) + section_nrows : int + The number of rows in each section (except the last one). + """ + imrows, imcols = self.frame_shape + if buffer_size is None: + buffer_size = imrows * imcols * self.itemsize + self.buffer_size = buffer_size + per_model_buffer_size = buffer_size / self._expected_nframes + min_buffer_size = imcols * self.itemsize + section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) + + if section_nrows == 0: + buffer_size = min_buffer_size * self._expected_nframes + log.warning("Buffer size is too small to hold a single row." + f"Increasing buffer size to {buffer_size / _ONE_MB}MB") + section_nrows = 1 + + nsections = int(np.ceil(imrows / section_nrows)) + log.info(f"Computing median over {self._expected_nframes} images in {nsections}" + f"sections with total memory buffer {buffer_size / _ONE_MB} MB") + return nsections, section_nrows + + + def _temparray_setup(self, dtype): + """Set up temp file handlers for each spatial section""" + temp_arrays = [] + for i in range(0, self.nsections): + shp = self.slice_shape + if i == self.nsections - 1: + # last section has whatever shape is left over + shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1]) + arr = DiskAppendableArray(shp, + tempdir=self._temp_path, + dtype=dtype) + temp_arrays.append(arr) + return temp_arrays - if section_nrows == 0: - buffer_size = min_buffer_size * nimages - log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB") - section_nrows = 1 - nsections = int(np.ceil(imrows / section_nrows)) - log.info(f"Computing median over {nimages} images in {nsections} sections with total memory buffer {buffer_size / _ONE_MB} MB") - return nsections, section_nrows + def add_image(self, data): + """Split resampled model data into spatial sections and write to disk.""" + if self._n_adds >= self.nsections: + raise IndexError(f"Too many calls to add_image. 
Expected at most {self.nsections} input models.") + self._validate_data(data) + self._n_adds += 1 + for i in range(self.nsections): + row1 = i * self.section_nrows + row2 = min(row1 + self.section_nrows, self.frame_shape[0]) + arr = self._temp_arrays[i] + arr.append(data[row1:row2]) -def _create_median(resampled_sections, row_indices): - """ - Parameters - ---------- - resampled_sections : list - List of DiskAppendableArray objects, each holding a spatial section - of every input model, stacked along the time axis. + def _validate_data(self, data): + if data.shape != self.frame_shape: + raise ValueError(f"Data shape {data.shape} does not match expected shape {self.frame_shape}") + if data.dtype != self.dtype: + raise ValueError(f"Data dtype {data.dtype} does not match expected dtype {self.dtype}") - row_indices : list - List of tuples, each containing the start and end row indices of the - spatial section in the original data model. - Returns - ------- - median_image : ndarray - The median image. - """ + def compute_median(self): + """Read spatial sections from disk and compute the median.""" + row_indices = [(i * self.section_nrows, min((i+1) * self.section_nrows, self.frame_shape[0])) + for i in range(self.nsections)] - dtype = resampled_sections[0]._dtype - output_rows = row_indices[-1][1] - output_cols = resampled_sections[0].shape[2] - median_image = np.empty((output_rows, output_cols), dtype) * np.nan + output_rows = row_indices[-1][1] + output_cols = self._temp_arrays[0].shape[2] + median_image = np.empty((output_rows, output_cols), self.dtype) * np.nan - for i, disk_arr in enumerate(resampled_sections): - row1, row2 = row_indices[i] - arr = disk_arr.read() - with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - median_image[row1:row2] = np.nanmedian(arr, axis=0) - del arr, disk_arr + for i, disk_arr in enumerate(self._temp_arrays): + row1, row2 = row_indices[i] + arr = disk_arr.read() + median_image[row1:row2] = _nanmedian3D(arr) + del arr, disk_arr - return median_image + return median_image def flag_crs_in_models( From fef97b817575c0e85824b5d29cf51c96b9c94ee4 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 17 Sep 2024 18:20:32 -0400 Subject: [PATCH 10/34] handle temp file open, close, and delete very manually --- jwst/outlier_detection/tests/test_utils.py | 44 ++++- jwst/outlier_detection/utils.py | 182 +++++++++++---------- 2 files changed, 137 insertions(+), 89 deletions(-) diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index 318384a2c8..0b2cf9e679 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -1,5 +1,5 @@ import pytest -from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskTimewiseOperation +from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskMedian from pathlib import Path import numpy as np import os @@ -8,11 +8,11 @@ def test_disk_appendable_array(tmp_cwd): slice_shape = (8,7) - dtype = 'float32' + dtype = "float32" tempdir = tmp_cwd / Path("tmptest") os.mkdir(tempdir) - arr = DiskAppendableArray(slice_shape, dtype=dtype, tempdir=tempdir) + arr = DiskAppendableArray(slice_shape, dtype, tempdir) # check temporary file setup assert arr._filename.split("/")[-1] in os.listdir(tempdir) @@ -44,11 +44,32 @@ def test_disk_appendable_array(tmp_cwd): assert np.all(arr_in_memory[1] == candidate1) assert np.allclose(arr_in_memory[2], candidate2, 
equal_nan=True) - # check cleanup occurs after reassigning arr - arr = None + # test cleanup + arr.cleanup() assert len(os.listdir(tempdir)) == 0 +def test_disk_appendable_array_bad_inputs(tmp_cwd): + + slice_shape = (8,7) + dtype = "float32" + tempdir = tmp_cwd / Path("tmptest") + + # test input directory does not exist + with pytest.raises(FileNotFoundError): + arr = DiskAppendableArray(slice_shape, dtype, tempdir) + + # make the input directory + os.mkdir(tempdir) + + # test slice_shape is not 2-D + with pytest.raises(ValueError): + arr = DiskAppendableArray((3,5,7), dtype, tempdir) + + # test dtype is not valid + with pytest.raises(TypeError): + arr = DiskAppendableArray(slice_shape, "float3", tempdir) + def test_on_disk_median(tmp_cwd): @@ -59,7 +80,7 @@ def test_on_disk_median(tmp_cwd): os.mkdir(tempdir) shape = (library_length,) + frame_shape - median_computer = OnDiskTimewiseOperation(shape, dtype=dtype, tempdir=tempdir) + median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir) # test compute buffer indices # buffer size equals size of single input model by default @@ -95,6 +116,7 @@ def test_on_disk_median(tmp_cwd): for candidate in [candidate0, candidate1, candidate2]: median_computer.add_image(candidate) median = median_computer.compute_median() + median_computer.cleanup() assert median.shape == frame_shape assert np.all(median == 2.5) @@ -104,3 +126,13 @@ def test_on_disk_median(tmp_cwd): for _ in range(2): with pytest.raises(IndexError): median_computer.add_image(candidate3) + + # test cleanup of tmpdir and everything else + assert not os.path.exists(median_computer._temp_path) + assert len(os.listdir(tempdir)) == 0 + assert len(os.listdir(os.getcwd())) == 1 # this is the "tmptest" directory + + +def test_on_disk_median_bad_inputs(tmp_cwd): + # FIXME + pass \ No newline at end of file diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 309ecf1451..a733ee724f 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -43,6 +43,60 @@ def create_cube_median(cube_model, maskpt): overwrite_input=False) +def create_median(resampled_models, maskpt, buffer_size=None): + """Create a median image from the singly resampled images. + + Parameters + ---------- + resampled_models : ModelLibrary + The singly resampled images. + + maskpt : float + The weight threshold for masking out low weight pixels. + + Returns + ------- + median_image : ndarray + The median image. 
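Before any pixels reach the median, each resampled model is masked by its own weight threshold: low-weight pixels become NaN so the nanmedian ignores them. A small sketch of that masking step, using a stand-in threshold value rather than compute_weight_threshold():

    import numpy as np

    data = np.arange(12, dtype=np.float32).reshape(3, 4)
    wht = np.ones_like(data)
    wht[1, 2] = 0.1                         # one low-weight pixel
    weight_threshold = 0.5                  # stand-in for compute_weight_threshold(wht, maskpt)
    data[wht < weight_threshold] = np.nan   # excluded from the median downstream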
+ """ + on_disk = resampled_models._on_disk + + # Compute weight threshold for each input model and set NaN in data where weight is below threshold + with resampled_models: + for i, resampled in enumerate(resampled_models): + weight_threshold = compute_weight_threshold(resampled.wht, maskpt) + resampled.data[resampled.wht < weight_threshold] = np.nan + if not on_disk: + if i == 0: + model_cube = np.empty((len(resampled_models),) + resampled.data.shape, + dtype=resampled.data.dtype) + model_cube[i] = resampled.data + else: + if i == 0: + # set up temporary storage for spatial sections of all input models + shp = (len(resampled_models),) + resampled.data.shape + median_computer = OnDiskMedian(shp, + dtype=resampled.data.dtype, + buffer_size=buffer_size) + # write spatial sections to disk + median_computer.add_image(resampled.data) + resampled_models.shelve(resampled, i, modify=False) + del resampled + + if not on_disk: + # easier case: all models in library can be loaded into memory at once + with warnings.catch_warnings(): + warnings.filterwarnings(action="ignore", + message="All-NaN slice encountered", + category=RuntimeWarning) + return _nanmedian3D(model_cube) + else: + # retrieve spatial sections from disk one by one and compute timewise median + med = median_computer.compute_median() + median_computer.cleanup() + return med + + class DiskAppendableArray: """ Creates a temporary file to which to append data, in order to perform @@ -61,7 +115,7 @@ class DiskAppendableArray: a small spatial segment of the full dataset. """ - def __init__(self, slice_shape, dtype='float32', tempdir=""): + def __init__(self, slice_shape, dtype, tempdir): """ Parameters ---------- @@ -69,125 +123,81 @@ def __init__(self, slice_shape, dtype='float32', tempdir=""): The shape of the spatial section of input data to be appended to the array. dtype : str - The data type of the array. + The data type of the array. Must be a valid numpy array datatype. tempdir : str The directory in which to create the temporary files. Default is the current working directory. """ - self._temp_file = tempfile.NamedTemporaryFile(dir=tempdir) - self._filename = self._temp_file.name + if len(slice_shape) != 2: + raise ValueError(f"Invalid slice_shape {slice_shape}. Only 2-D arrays are supported.") + self._temp_file, self._filename = tempfile.mkstemp(dir=tempdir) self._slice_shape = slice_shape - self._dtype = dtype + self._dtype = np.dtype(dtype) self._append_count = 0 - with open(self._filename, 'wb') as f: # Noqa: F841 - pass + @property def shape(self): return (self._append_count,) + self._slice_shape - + + def append(self, data): + """Add a new slice to the temporary file""" if data.shape != self._slice_shape: raise ValueError(f"Data shape {data.shape} does not match slice shape {self._slice_shape}") if data.dtype != self._dtype: raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") - with open(self._filename, 'ab') as f: + with open(self._filename, "ab") as f: f.write(data.tobytes()) self._append_count += 1 + def read(self): + """Read the 3-D array into memory, then delete the tempfile""" shp = (self._append_count,) + self._slice_shape - return np.fromfile(self._filename, dtype=self._dtype).reshape(shp) - - -def create_median(resampled_models, maskpt, buffer_size=None): - """Create a median image from the singly resampled images. - - Parameters - ---------- - resampled_models : ModelLibrary - The singly resampled images. - - maskpt : float - The weight threshold for masking out low weight pixels. 
- - Returns - ------- - median_image : ndarray - The median image. - """ - on_disk = resampled_models._on_disk + with open(self._filename, "rb") as f: + output = np.fromfile(f, dtype=self._dtype).reshape(shp) + return output - # Compute weight threshold for each input model and set NaN in data where weight is below threshold - with resampled_models: - for i, resampled in enumerate(resampled_models): - weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - resampled.data[resampled.wht < weight_threshold] = np.nan - if not on_disk: - if i == 0: - model_cube = np.empty((len(resampled_models),) + resampled.data.shape, - dtype=resampled.data.dtype) - model_cube[i] = resampled.data - else: - if i == 0: - # set up temporary storage for spatial sections of all input models - shp = (len(resampled_models),) + resampled.data.shape - median_computer = OnDiskTimewiseOperation(shp, - dtype=resampled.data.dtype, - buffer_size=buffer_size) - # write spatial sections to disk - median_computer.add_image(resampled.data) - resampled_models.shelve(resampled, i, modify=False) - del resampled - - if not on_disk: - # easier case: all models in library can be loaded into memory at once - with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - return _nanmedian3D(model_cube) - else: - # retrieve spatial sections from disk one by one and compute timewise median - return median_computer.compute_median() + + def cleanup(self): + Path.unlink(self._filename) + return -class OnDiskTimewiseOperation: +class OnDiskMedian: - def __init__(self, shape, tempdir="", dtype='float32', buffer_size=None): + def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None): """ - Class to compute timewise operations on a stack of input images without - holding all of them in memory. At present, only the median operation is - supported. To extend, create a function definition for the desired - operation modeled after the `compute_median` method. + Set up temporary files to perform operations on a stack of 2-D input arrays + along the stacking axis (e.g., a time axis) without + holding all of them in memory. Currently the only supported operation + is the median. Parameters ---------- shape: tuple The shape of the entire input, (n_images, imrows, imcols). + dtype : str + The data type of the input data. + tempdir : str The parent directory in which to create the temporary directory, which itself holds all the DiskAppendableArray tempfiles. Default is the current working directory. - dtype : str - The data type of the input data. - buffer_size : int, optional - The buffer size for the median computation, units of bytes. - If None, the buffer size will be set equal to the size of one input - resampled image. This is the largest it can be without increasing - the memory footprint of the entire step: we write the drizzled - model to disk just before the median calculation handled by this class, - meaning we can discard the resampled image from memory before calling - compute_median() such that the memory is freed up for the median calculation. + The buffer size, units of bytes. + Default is the size of one input image. 
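A short usage sketch of this class, mirroring the unit test earlier in the patch (three constant frames with values 0, 1, and 2, so every output pixel is 1):

    import numpy as np

    shape = (3, 21, 20)                        # (n_images, imrows, imcols)
    median_computer = OnDiskMedian(shape, dtype="float32")
    for k in range(3):
        median_computer.add_image(np.full(shape[1:], k, dtype=np.float32))
    median = median_computer.compute_median()  # per-pixel median across images
    median_computer.cleanup()                  # remove the section tempfiles
    assert np.all(median == 1.0)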
""" + if len(shape) != 3: + raise ValueError(f"Invalid input shape {shape}; only three-dimensional data are supported.") self._expected_nframes = shape[0] self.frame_shape = shape[1:] - self.dtype = dtype - self.itemsize = np.dtype(dtype).itemsize + self.dtype = np.dtype(dtype) + self.itemsize = self.dtype.itemsize self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) @@ -243,9 +253,7 @@ def _temparray_setup(self, dtype): if i == self.nsections - 1: # last section has whatever shape is left over shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1]) - arr = DiskAppendableArray(shp, - tempdir=self._temp_path, - dtype=dtype) + arr = DiskAppendableArray(shp, dtype, self._temp_path) temp_arrays.append(arr) return temp_arrays @@ -268,10 +276,18 @@ def _validate_data(self, data): raise ValueError(f"Data shape {data.shape} does not match expected shape {self.frame_shape}") if data.dtype != self.dtype: raise ValueError(f"Data dtype {data.dtype} does not match expected dtype {self.dtype}") + + + def cleanup(self): + """Remove the temporary files and directory when finished""" + [arr.cleanup() for arr in self._temp_arrays] + Path.rmdir(self._temp_path) + return def compute_median(self): - """Read spatial sections from disk and compute the median.""" + """Read spatial sections from disk and compute the median across groups + (median over number of exposures on a per-pixel basis)""" row_indices = [(i * self.section_nrows, min((i+1) * self.section_nrows, self.frame_shape[0])) for i in range(self.nsections)] From ff10987304f07292f5e61f592261b90ec56e8444 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 13:46:23 -0400 Subject: [PATCH 11/34] refactor resample_many_to_many to allow discard drizzled models after use, imaging only --- jwst/outlier_detection/imaging.py | 122 +++++++++---- .../tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/tests/test_utils.py | 6 +- jwst/outlier_detection/utils.py | 31 ++-- jwst/resample/resample.py | 166 ++++++++++-------- 5 files changed, 208 insertions(+), 119 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 5678b45802..964c0e3c3b 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -6,6 +6,7 @@ import logging import os +import numpy as np from stdatamodels.jwst import datamodels from jwst.datamodels import ModelLibrary @@ -13,8 +14,10 @@ from jwst.resample.resample_utils import build_driz_weight from jwst.stpipe.utilities import record_step_status -from .utils import create_median, flag_model_crs, flag_resampled_model_crs -from ._fileio import remove_file, save_median +from stcal.outlier_detection.utils import compute_weight_threshold + +from .utils import nanmedian3D, flag_model_crs, flag_resampled_model_crs, OnDiskMedian +from ._fileio import save_median log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -58,17 +61,18 @@ def detect_outliers( log.warning("Outlier detection will be skipped") record_step_status(input_models, "outlier_detection", False) return input_models + + with input_models: + example_model = input_models.borrow(0) + output_path = make_output_path(basepath=example_model.meta.filename, + suffix='') + input_models.shelve(example_model, modify=False) + del example_model + output_path = os.path.dirname(output_path) if resample_data: # Start by creating resampled/mosaic images for # each group of exposures - with input_models: - example_model = 
input_models.borrow(0) - output_path = make_output_path(basepath=example_model.meta.filename, - suffix='') - input_models.shelve(example_model, modify=False) - del example_model - output_path = os.path.dirname(output_path) resamp = resample.ResampleData( input_models, output=output_path, @@ -84,42 +88,96 @@ def detect_outliers( allowed_memory=allowed_memory, ) median_wcs = resamp.output_wcs - drizzled_models = resamp.do_drizzle(input_models) + + j = 0 + for group_id, indices in input_models.group_indices.items(): + + # single is hardcoded to True above so it's ok to just call resample_group directly + drizzled_model = resamp.resample_group(input_models, indices) + if j == 0: + ngroups = len(input_models.group_names) + full_shape = (ngroups,) + drizzled_model.data.shape + if in_memory: + data_frames = np.empty(full_shape, dtype=np.float32) + else: + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=None) + + if save_intermediate_results: + output_name = drizzled_model.meta.filename + + if resamp.output_dir is not None: + output_name = os.path.join(resamp.output_dir, output_name) + drizzled_model.save(output_name) + log.info(f"Saved model in {output_name}") + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + if in_memory: + data_frames[j] = drizzled_model.data + else: + # write spatial sections to disk + median_computer.add_image(drizzled_model.data) + j += 1 + else: # for non-dithered data, the resampled image is just the original image - drizzled_models = input_models with input_models: - for i, model in enumerate(input_models): - model.wht = build_driz_weight( - model, + for i, drizzled_model in enumerate(input_models): + drizzled_model.wht = build_driz_weight( + drizzled_model, weight_type=weight_type, good_bits=good_bits) - # copy for when saving median and input is a filename? + input_models.shelve(drizzled_model, modify=True) + if i == 0: - median_wcs = copy.deepcopy(model.meta.wcs) - input_models.shelve(model, modify=True) + # copy for when saving median and input is a filename? 
+ median_wcs = copy.deepcopy(drizzled_model.meta.wcs) + full_shape = (len(input_models),) + drizzled_model.data.shape + if in_memory: + data_frames = np.empty(full_shape, dtype=np.float32) + else: + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=None) + + if save_intermediate_results: + output_name = drizzled_model.meta.filename + if output_path is not None: + output_name = os.path.join(output_path, output_name) + drizzled_model.save(output_name) + log.info(f"Saved model in {output_name}") + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + if in_memory: + data_frames[i] = drizzled_model.data + else: + # write spatial sections to disk + median_computer.add_image(drizzled_model.data) + # Perform median combination on set of drizzled mosaics - median_data = create_median(drizzled_models, maskpt) + if in_memory: + median_data = nanmedian3D(data_frames) + del data_frames + else: + median_data = median_computer.compute_median() + median_computer.cleanup() if save_intermediate_results: # make a median model - with drizzled_models: - example_model = drizzled_models.borrow(0) - drizzled_models.shelve(example_model, modify=False) - median_model = datamodels.ImageModel(median_data) - median_model.update(example_model) - median_model.meta.wcs = median_wcs - del example_model - + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs save_median(median_model, make_output_path, asn_id) del median_model - else: - # since we're not saving intermediate results if the drizzled models - # were written to disk, remove them - if not in_memory: - for fn in drizzled_models.asn["products"][0]["members"]: - remove_file(fn["expname"]) + del drizzled_model # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 0806dd42e7..4aea9f9748 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -231,7 +231,7 @@ def test_outlier_step_base(we_three_sci, tmp_cwd): lambda model, index: model.data.copy(), modify=False)) result = OutlierDetectionStep.call( - container, save_results=True, save_intermediate_results=True + container, save_results=True, save_intermediate_results=True, ) with result: diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index 0b2cf9e679..6e5cbb21bd 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -57,18 +57,18 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd): # test input directory does not exist with pytest.raises(FileNotFoundError): - arr = DiskAppendableArray(slice_shape, dtype, tempdir) + DiskAppendableArray(slice_shape, dtype, tempdir) # make the input directory os.mkdir(tempdir) # test slice_shape is not 2-D with pytest.raises(ValueError): - arr = DiskAppendableArray((3,5,7), dtype, tempdir) + DiskAppendableArray((3,5,7), dtype, tempdir) # test dtype is not valid with pytest.raises(TypeError): - arr = DiskAppendableArray(slice_shape, "float3", tempdir) + DiskAppendableArray(slice_shape, "float3", tempdir) def 
test_on_disk_median(tmp_cwd): diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index a733ee724f..990d0dd90a 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -22,14 +22,27 @@ _ONE_MB = 1 << 20 -def _nanmedian3D(cube, overwrite_input=True): - """Convenience function to compute the median of a cube ignoring warnings - and setting memory-efficient flag""" +def nanmedian3D(cube, overwrite_input=True): + """Convenience function to compute the median of a cube ignoring warnings, + setting memory-efficient flag, and specifying output type + + Notes + ----- + np.nanmedian returns float64 unless "out" parameter is specified, and specifying + that only changes the dtype of the output but not the internal precision of nanmedian. + It appears to be impossible to stop nanmedian from computing with + at least 64-bit precision internally. + Pre-allocating the output array, i.e., out=np.empty(cube.shape[1:], dtype=np.float32) + actually increases memory usage a bit, so it's better to just convert after the fact. + Returning float64 here causes gwcs_blot to segfault down the line + because that function is hard-coded to expect float32. + """ with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input) + + return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input).astype(np.float32) def create_cube_median(cube_model, maskpt): @@ -38,7 +51,7 @@ def create_cube_median(cube_model, maskpt): weight_threshold = compute_weight_threshold(cube_model.wht, maskpt) # not safe to use overwrite_input=True here because we are operating on model.data directly - return _nanmedian3D( + return nanmedian3D( np.ma.masked_array(cube_model.data, np.less(cube_model.wht, weight_threshold), fill_value=np.nan), overwrite_input=False) @@ -85,11 +98,7 @@ def create_median(resampled_models, maskpt, buffer_size=None): if not on_disk: # easier case: all models in library can be loaded into memory at once - with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - return _nanmedian3D(model_cube) + return nanmedian3D(model_cube) else: # retrieve spatial sections from disk one by one and compute timewise median med = median_computer.compute_median() @@ -298,7 +307,7 @@ def compute_median(self): for i, disk_arr in enumerate(self._temp_arrays): row1, row2 = row_indices[i] arr = disk_arr.read() - median_image[row1:row2] = _nanmedian3D(arr) + median_image[row1:row2] = nanmedian3D(arr) del arr, disk_arr return median_image diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index 6350c3496e..e2297db25d 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -251,6 +251,90 @@ def _get_intensity_scale(self, img): else: iscale = 1.0 return iscale + + def resample_group(self, input_models, indices): + """Apply resample_many_to_many for one group + + Parameters + ---------- + input_models : ModelLibrary + + indices : list + """ + output_model = self.blank_output + + copy_asn_info_from_library(input_models, output_model) + + with input_models: + example_image = input_models.borrow(indices[0]) + + # Determine output file type from input exposure filenames + # Use this for defining the output filename + indx = example_image.meta.filename.rfind('.') + output_type = example_image.meta.filename[indx:] + output_root = 
'_'.join(example_image.meta.filename.replace( + output_type, '').split('_')[:-1]) + if self.asn_id is not None: + output_model.meta.filename = ( + f'{output_root}_{self.asn_id}_' + f'{self.intermediate_suffix}{output_type}') + else: + output_model.meta.filename = ( + f'{output_root}_' + f'{self.intermediate_suffix}{output_type}') + input_models.shelve(example_image, indices[0], modify=False) + del example_image + + # Initialize the output with the wcs + driz = gwcs_drizzle.GWCSDrizzle(output_model, pixfrac=self.pixfrac, + kernel=self.kernel, fillval=self.fillval) + + log.info(f"{len(indices)} exposures to drizzle together") + for index in indices: + img = input_models.borrow(index) + if isinstance(img, datamodels.SlitModel): + # must call this explicitly to populate area extension + # although the existence of this extension may not be necessary + img.area = img.area + iscale = self._get_intensity_scale(img) + log.debug(f'Using intensity scale iscale={iscale}') + + inwht = resample_utils.build_driz_weight( + img, + weight_type=self.weight_type, + good_bits=self.good_bits + ) + + # apply sky subtraction + blevel = img.meta.background.level + if not img.meta.background.subtracted and blevel is not None: + data = img.data - blevel + else: + data = img.data + + xmin, xmax, ymin, ymax = resample_utils._resample_range( + data.shape, + img.meta.wcs.bounding_box + ) + + driz.add_image( + data, + img.meta.wcs, + iscale=iscale, + inwht=inwht, + xmin=xmin, + xmax=xmax, + ymin=ymin, + ymax=ymax + ) + del data + input_models.shelve(img, index, modify=False) + del img + + out = output_model.copy() + output_model.data *= 0. + output_model.wht *= 0. + return out def resample_many_to_many(self, input_models): """Resample many inputs to many outputs where outputs have a common frame. 
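resample_group leans on the ModelLibrary borrow/shelve protocol: every borrow() must be matched by a shelve() before the with-block exits. A minimal sketch of that protocol, where process() is a hypothetical stand-in for the per-image work:

    with library:
        img = library.borrow(index)
        try:
            process(img)                              # hypothetical read-only work
        finally:
            library.shelve(img, index, modify=False)  # modify=False discards changes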
@@ -263,78 +347,18 @@ def resample_many_to_many(self, input_models): """ output_models = [] for group_id, indices in input_models.group_indices.items(): - output_model = self.blank_output - - copy_asn_info_from_library(input_models, output_model) - - with input_models: - example_image = input_models.borrow(indices[0]) - - # Determine output file type from input exposure filenames - # Use this for defining the output filename - indx = example_image.meta.filename.rfind('.') - output_type = example_image.meta.filename[indx:] - output_root = '_'.join(example_image.meta.filename.replace( - output_type, '').split('_')[:-1]) - if self.asn_id is not None: - output_model.meta.filename = ( - f'{output_root}_{self.asn_id}_' - f'{self.intermediate_suffix}{output_type}') - else: - output_model.meta.filename = ( - f'{output_root}_' - f'{self.intermediate_suffix}{output_type}') - input_models.shelve(example_image, indices[0], modify=False) - del example_image - - # Initialize the output with the wcs - driz = gwcs_drizzle.GWCSDrizzle(output_model, pixfrac=self.pixfrac, - kernel=self.kernel, fillval=self.fillval) - - log.info(f"{len(indices)} exposures to drizzle together") - for index in indices: - img = input_models.borrow(index) - if isinstance(img, datamodels.SlitModel): - # must call this explicitly to populate area extension - # although the existence of this extension may not be necessary - img.area = img.area - iscale = self._get_intensity_scale(img) - log.debug(f'Using intensity scale iscale={iscale}') - - inwht = resample_utils.build_driz_weight( - img, - weight_type=self.weight_type, - good_bits=self.good_bits - ) - - # apply sky subtraction - blevel = img.meta.background.level - if not img.meta.background.subtracted and blevel is not None: - data = img.data - blevel - else: - data = img.data - - xmin, xmax, ymin, ymax = resample_utils._resample_range( - data.shape, - img.meta.wcs.bounding_box - ) - - driz.add_image( - data, - img.meta.wcs, - iscale=iscale, - inwht=inwht, - xmin=xmin, - xmax=xmax, - ymin=ymin, - ymax=ymax - ) - del data - input_models.shelve(img, index, modify=False) - del img + + output_model = self.resample_group(input_models, indices) if not self.in_memory: + # Here, write out model to disk in a way that is immediately useful to create_median + # using the outlier detection utils I just wrote. Have this return the median computer + # back to outlier detection + # This does mean weights need to be handled here, though... + # Write out model to disk, then return filename + # only do this write to disk if save_intermediate_results is True + # this write should occur before handling weights output_name = output_model.meta.filename if self.output_dir is not None: output_name = os.path.join(self.output_dir, output_name) @@ -342,9 +366,7 @@ def resample_many_to_many(self, input_models): log.info(f"Saved model in {output_name}") output_models.append(output_name) else: - output_models.append(output_model.copy()) - output_model.data *= 0. - output_model.wht *= 0. 
+ output_models.append(output_model.data) if not self.in_memory: # build ModelLibrary as an association from the output files From 50ece826419193bbdb939a7238c72f0175845699 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 13:53:14 -0400 Subject: [PATCH 12/34] typo fix --- jwst/outlier_detection/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 990d0dd90a..c215337b7f 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -249,7 +249,7 @@ def _get_buffer_indices(self, buffer_size=None): section_nrows = 1 nsections = int(np.ceil(imrows / section_nrows)) - log.info(f"Computing median over {self._expected_nframes} images in {nsections}" + log.info(f"Computing median over {self._expected_nframes} images in {nsections} " f"sections with total memory buffer {buffer_size / _ONE_MB} MB") return nsections, section_nrows From 5400560dd0c37e26de8aace713e3cd076efe4c21 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 16:57:41 -0400 Subject: [PATCH 13/34] first draft extending refactor to spectroscopic mode --- jwst/outlier_detection/imaging.py | 143 ++++-------------- jwst/outlier_detection/spec.py | 101 ++++--------- .../tests/test_outlier_detection.py | 41 ++--- jwst/outlier_detection/utils.py | 125 ++++++++++----- 4 files changed, 166 insertions(+), 244 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 964c0e3c3b..e8f7be0d19 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -2,22 +2,14 @@ Submodule for performing outlier detection on imaging data. """ -import copy import logging import os -import numpy as np -from stdatamodels.jwst import datamodels - from jwst.datamodels import ModelLibrary from jwst.resample import resample -from jwst.resample.resample_utils import build_driz_weight from jwst.stpipe.utilities import record_step_status -from stcal.outlier_detection.utils import compute_weight_threshold - -from .utils import nanmedian3D, flag_model_crs, flag_resampled_model_crs, OnDiskMedian -from ._fileio import save_median +from .utils import flag_model_crs, flag_resampled_model_crs, drizzle_and_median log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -69,115 +61,32 @@ def detect_outliers( input_models.shelve(example_model, modify=False) del example_model output_path = os.path.dirname(output_path) + + # Start by creating resampled/mosaic images for + # each group of exposures + resamp = resample.ResampleData( + input_models, + output=output_path, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + allowed_memory=allowed_memory, + ) + + median_data, median_wcs = drizzle_and_median(input_models, + resamp, + maskpt, + make_output_path, + resample_data=resample_data, + in_memory=in_memory, + save_intermediate_results=save_intermediate_results) - if resample_data: - # Start by creating resampled/mosaic images for - # each group of exposures - resamp = resample.ResampleData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - allowed_memory=allowed_memory, - ) - median_wcs = resamp.output_wcs - - j = 0 - for group_id, indices in 
input_models.group_indices.items(): - - # single is hardcoded to True above so it's ok to just call resample_group directly - drizzled_model = resamp.resample_group(input_models, indices) - if j == 0: - ngroups = len(input_models.group_names) - full_shape = (ngroups,) + drizzled_model.data.shape - if in_memory: - data_frames = np.empty(full_shape, dtype=np.float32) - else: - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=None) - - if save_intermediate_results: - output_name = drizzled_model.meta.filename - - if resamp.output_dir is not None: - output_name = os.path.join(resamp.output_dir, output_name) - drizzled_model.save(output_name) - log.info(f"Saved model in {output_name}") - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - - if in_memory: - data_frames[j] = drizzled_model.data - else: - # write spatial sections to disk - median_computer.add_image(drizzled_model.data) - j += 1 - - else: - # for non-dithered data, the resampled image is just the original image - with input_models: - for i, drizzled_model in enumerate(input_models): - drizzled_model.wht = build_driz_weight( - drizzled_model, - weight_type=weight_type, - good_bits=good_bits) - input_models.shelve(drizzled_model, modify=True) - - if i == 0: - # copy for when saving median and input is a filename? - median_wcs = copy.deepcopy(drizzled_model.meta.wcs) - full_shape = (len(input_models),) + drizzled_model.data.shape - if in_memory: - data_frames = np.empty(full_shape, dtype=np.float32) - else: - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=None) - - if save_intermediate_results: - output_name = drizzled_model.meta.filename - if output_path is not None: - output_name = os.path.join(output_path, output_name) - drizzled_model.save(output_name) - log.info(f"Saved model in {output_name}") - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - - if in_memory: - data_frames[i] = drizzled_model.data - else: - # write spatial sections to disk - median_computer.add_image(drizzled_model.data) - - - # Perform median combination on set of drizzled mosaics - if in_memory: - median_data = nanmedian3D(data_frames) - del data_frames - else: - median_data = median_computer.compute_median() - median_computer.cleanup() - - if save_intermediate_results: - # make a median model - median_model = datamodels.ImageModel(median_data) - median_model.update(drizzled_model) - median_model.meta.wcs = median_wcs - save_median(median_model, make_output_path, asn_id) - del median_model - del drizzled_model # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index ad8e712d4c..0a234f37ba 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -1,16 +1,13 @@ """ Submodule for performing outlier detection on spectra. 
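Imaging above and spectroscopy below now funnel through the same helper; the shared call pattern, as it stands in this patch (resamp must have been constructed with single=True), is:

    median_data, median_wcs = drizzle_and_median(
        input_models,                    # ModelLibrary of input exposures
        resamp,                          # ResampleData or ResampleSpecData
        maskpt,                          # weight-threshold fraction
        resample_data=resample_data,     # False: weight the originals, skip drizzling
        save_intermediate_results=save_intermediate_results,
        make_output_path=make_output_path,
    )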
""" -import copy import os -from stdatamodels.jwst import datamodels from jwst.datamodels import ModelContainer, ModelLibrary from jwst.stpipe.utilities import record_step_status -from ..resample import resample_spec, resample_utils -from .utils import create_median, flag_crs_in_models, flag_crs_in_models_with_resampling -from ._fileio import remove_file +from ..resample import resample_spec +from .utils import flag_crs_in_models, flag_crs_in_models_with_resampling, drizzle_and_median import logging log = logging.getLogger(__name__) @@ -53,69 +50,37 @@ def detect_outliers( record_step_status(input_models, "outlier_detection", False) return input_models - if resample_data is True: - # Start by creating resampled/mosaic images for - # each group of exposures - output_path = make_output_path( - basepath=input_models[0].meta.filename, suffix='') - output_path = os.path.dirname(output_path) - resamp = resample_spec.ResampleSpecData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - ) - median_wcs = resamp.output_wcs - - # convert to library for resample - # for compatibility with image3 pipeline - library = ModelLibrary(input_models, on_disk=False) - drizzled_models = resamp.do_drizzle(library) - - else: - drizzled_models = ModelLibrary(input_models) - with drizzled_models: - for i, model in enumerate(drizzled_models): - model.wht = resample_utils.build_driz_weight( - input_models[i], - weight_type=weight_type, - good_bits=good_bits) - drizzled_models.shelve(model) - # copy for when saving median and input is a filename? - median_wcs = copy.deepcopy(input_models[0].meta.wcs) - - # Perform median combination on set of drizzled mosaics - median_data = create_median(drizzled_models, maskpt) - - if save_intermediate_results: - # Initialize intermediate products used in the outlier detection - median_model = datamodels.ImageModel(median_data) - with drizzled_models: - example_model = drizzled_models.borrow(0) - drizzled_models.shelve(example_model, 0, modify=False) - median_model.meta = example_model.meta - median_model.meta.filename = make_output_path( - basepath=input_models[0].meta.filename, - suffix='median' - ) - - log.info("Writing out MEDIAN image to: {}".format( - median_model.meta.filename)) - median_model.save(median_model.meta.filename) - del median_model - else: - # since we're not saving intermediate results if the drizzled models - # were written to disk, remove them - if not in_memory: - for fn in drizzled_models._members: - remove_file(fn["expname"]) + # Start by creating resampled/mosaic images for + # each group of exposures + output_path = make_output_path( + basepath=input_models[0].meta.filename, suffix='') + output_path = os.path.dirname(output_path) + resamp = resample_spec.ResampleSpecData( + input_models, + output=output_path, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + ) + median_wcs = resamp.output_wcs + + # convert to library for resample + # for compatibility with image3 pipeline + library = ModelLibrary(input_models, on_disk=False) + + median_data, median_wcs = drizzle_and_median(library, + resamp, + maskpt, + make_output_path, + resample_data=resample_data, + in_memory=False, + save_intermediate_results=save_intermediate_results) # Perform outlier detection using 
statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 4aea9f9748..eca74212e4 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -10,7 +10,7 @@ from jwst.assign_wcs import AssignWcsStep from jwst.assign_wcs.pointing import create_fitswcs from jwst.outlier_detection import OutlierDetectionStep -from jwst.outlier_detection.utils import _flag_resampled_model_crs, create_median +from jwst.outlier_detection.utils import _flag_resampled_model_crs from jwst.outlier_detection.outlier_detection_step import ( IMAGE_MODES, TSO_SPEC_MODES, @@ -595,26 +595,27 @@ def test_outlier_step_weak_cr_tso(exptype, tsovisit): assert result.dq[cr_timestep, 12, 12] == OUTLIER_DO_NOT_USE -def test_create_median(three_sci_as_asn, tmp_cwd): - """Test creation of median on disk vs in memory""" - lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) - lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) +# def test_drizzle_and_median(three_sci_as_asn, tmp_cwd): +# """Test creation of median on disk vs in memory""" +# # FIXME: update this test +# lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) +# lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) - # make this test meaningful w.r.t. handling of weights - with (lib_on_disk, lib_in_memory): - for lib in [lib_on_disk, lib_in_memory]: - for model in lib: - model.wht = np.ones_like(model.data) - model.wht[4,9] = 0.5 - lib.shelve(model, modify=True) +# # make this test meaningful w.r.t. handling of weights +# with (lib_on_disk, lib_in_memory): +# for lib in [lib_on_disk, lib_in_memory]: +# for model in lib: +# model.wht = np.ones_like(model.data) +# model.wht[4,9] = 0.5 +# lib.shelve(model, modify=True) - # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels - # arbitrarily use 5 times that - buffer_size = 4 * 20 * 5 - median_on_disk = create_median(lib_on_disk, 0.7, buffer_size=buffer_size) - median_in_memory = create_median(lib_in_memory, 0.7) +# # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels +# # arbitrarily use 5 times that +# buffer_size = 4 * 20 * 5 +# median_on_disk = drizzle_and_median(lib_on_disk, 0.7, buffer_size=buffer_size) +# median_in_memory = drizzle_and_median(lib_in_memory, 0.7) - assert np.isnan(median_in_memory[4,9]) +# assert np.isnan(median_in_memory[4,9]) - # Make sure the median library is the same for on-disk and in-memory - assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) +# # Make sure the median library is the same for on-disk and in-memory +# assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index c215337b7f..5f3393233e 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -2,6 +2,8 @@ The ever-present utils sub-module. A home for all... 
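For reference, the timewise median computed throughout these helpers is a per-pixel median across the exposure axis with NaNs excluded; a tiny self-contained demonstration:

    import warnings
    import numpy as np

    cube = np.array([[[1.0, np.nan], [2.0, 5.0]],
                     [[3.0, np.nan], [4.0, 5.0]],
                     [[5.0, np.nan], [6.0, 5.0]]], dtype=np.float32)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="All-NaN slice encountered",
                                category=RuntimeWarning)
        median = np.nanmedian(cube, axis=0)
    # median == [[3., nan], [4., 5.]]; the all-NaN pixel stays NaN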
""" +import os +import copy import warnings import numpy as np import tempfile @@ -9,8 +11,10 @@ from jwst.lib.pipe_utils import match_nans_and_flags from jwst.resample.resample import compute_image_pixel_area +from jwst.resample.resample_utils import build_driz_weight from stcal.outlier_detection.utils import compute_weight_threshold, gwcs_blot, flag_crs, flag_resampled_crs from stdatamodels.jwst import datamodels +from ._fileio import save_median import logging log = logging.getLogger(__name__) @@ -56,54 +60,97 @@ def create_cube_median(cube_model, maskpt): overwrite_input=False) -def create_median(resampled_models, maskpt, buffer_size=None): - """Create a median image from the singly resampled images. - +def drizzle_and_median(input_models, + resamp, + maskpt, + make_output_path, + resample_data=False, + in_memory=True, + save_intermediate_results=False): + """ Parameters ---------- - resampled_models : ModelLibrary - The singly resampled images. + input_models : ModelLibrary + The input datamodels + + resamp : ResampleData object maskpt : float - The weight threshold for masking out low weight pixels. - Returns - ------- - median_image : ndarray - The median image. + make_output_path : ? """ - on_disk = resampled_models._on_disk - - # Compute weight threshold for each input model and set NaN in data where weight is below threshold - with resampled_models: - for i, resampled in enumerate(resampled_models): - weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - resampled.data[resampled.wht < weight_threshold] = np.nan - if not on_disk: - if i == 0: - model_cube = np.empty((len(resampled_models),) + resampled.data.shape, - dtype=resampled.data.dtype) - model_cube[i] = resampled.data + if not resamp.single: + raise ValueError("drizzle_and_median should only be used for resample_many_to_many") + if resample_data: + indices_by_group = list(input_models.group_indices.values()) + else: + indices_by_group = [[i] for i in range(len(input_models))] + + with input_models: + for i, indices in enumerate(indices_by_group): + + if resample_data: + median_wcs = resamp.output_wcs + + drizzled_model = resamp.resample_group(input_models, indices) + else: - if i == 0: - # set up temporary storage for spatial sections of all input models - shp = (len(resampled_models),) + resampled.data.shape - median_computer = OnDiskMedian(shp, - dtype=resampled.data.dtype, - buffer_size=buffer_size) - # write spatial sections to disk - median_computer.add_image(resampled.data) - resampled_models.shelve(resampled, i, modify=False) - del resampled - - if not on_disk: - # easier case: all models in library can be loaded into memory at once - return nanmedian3D(model_cube) + # for non-dithered data, the resampled image is just the original image + drizzled_model = input_models.borrow(i) + drizzled_model.wht = build_driz_weight( + drizzled_model, + weight_type=resamp.weight_type, + good_bits=resamp.good_bits) + input_models.shelve(drizzled_model, i, modify=True) + median_wcs = copy.deepcopy(drizzled_model.meta.wcs) + + if i == 0: + # allocate either temporary storage or memory for data arrays that go into median + ngroups = len(indices_by_group) + full_shape = (ngroups,) + drizzled_model.data.shape + if in_memory: + data_frames = np.empty(full_shape, dtype=np.float32) + else: + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=None) + + if save_intermediate_results: + # write the drizzled model to file + output_name = drizzled_model.meta.filename + if resamp.output_dir 
is not None: + output_name = os.path.join(resamp.output_dir, output_name) + drizzled_model.save(output_name) + log.info(f"Saved model in {output_name}") + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + # put drizzled models into either storage or memory for median + if in_memory: + data_frames[i] = drizzled_model.data + else: + median_computer.add_image(drizzled_model.data) + + # Perform median combination on set of drizzled mosaics + if in_memory: + median_data = nanmedian3D(data_frames) + del data_frames else: - # retrieve spatial sections from disk one by one and compute timewise median - med = median_computer.compute_median() + median_data = median_computer.compute_median() median_computer.cleanup() - return med + + if save_intermediate_results: + # make a median model + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs + save_median(median_model, make_output_path, resamp.asn_id) + del median_model + del drizzled_model + + return median_data, median_wcs class DiskAppendableArray: From f4af4324dd223ea33693f254830db7c1cc846eda Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 17:44:19 -0400 Subject: [PATCH 14/34] added unit tests for drizzle_and_median --- .../tests/test_outlier_detection.py | 108 ++++++++++++++---- jwst/outlier_detection/utils.py | 9 +- 2 files changed, 90 insertions(+), 27 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index eca74212e4..584425e952 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -4,6 +4,7 @@ from glob import glob import os +from gwcs.wcs import WCS from stdatamodels.jwst import datamodels from jwst.datamodels import ModelContainer, ModelLibrary @@ -18,6 +19,8 @@ CORON_IMAGE_MODES, ) from jwst.resample.tests.test_resample_step import miri_rate_model +from jwst.outlier_detection.utils import drizzle_and_median +from jwst.resample.resample import ResampleData OUTLIER_DO_NOT_USE = np.bitwise_or( datamodels.dqflags.pixel["DO_NOT_USE"], datamodels.dqflags.pixel["OUTLIER"] @@ -595,27 +598,84 @@ def test_outlier_step_weak_cr_tso(exptype, tsovisit): assert result.dq[cr_timestep, 12, 12] == OUTLIER_DO_NOT_USE -# def test_drizzle_and_median(three_sci_as_asn, tmp_cwd): -# """Test creation of median on disk vs in memory""" -# # FIXME: update this test -# lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) -# lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) - -# # make this test meaningful w.r.t. 
handling of weights -# with (lib_on_disk, lib_in_memory): -# for lib in [lib_on_disk, lib_in_memory]: -# for model in lib: -# model.wht = np.ones_like(model.data) -# model.wht[4,9] = 0.5 -# lib.shelve(model, modify=True) - -# # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels -# # arbitrarily use 5 times that -# buffer_size = 4 * 20 * 5 -# median_on_disk = drizzle_and_median(lib_on_disk, 0.7, buffer_size=buffer_size) -# median_in_memory = drizzle_and_median(lib_in_memory, 0.7) - -# assert np.isnan(median_in_memory[4,9]) - -# # Make sure the median library is the same for on-disk and in-memory -# assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) +def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): + """Test creation of median on disk vs in memory""" + lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) + lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) + + # make this test meaningful w.r.t. handling of weights + with (lib_on_disk, lib_in_memory): + for lib in [lib_on_disk, lib_in_memory]: + for model in lib: + model.var_rnoise = np.ones_like(model.data) + model.var_rnoise[4,9] = 2.0 + lib.shelve(model, modify=True) + + + # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels + # arbitrarily use 5 times that + buffer_size = 4 * 20 * 5 + median_on_disk, _ = drizzle_and_median(lib_on_disk, + make_resamp(lib_on_disk, False), + 0.7, + buffer_size=buffer_size, + resample_data=False) + median_in_memory, _ = drizzle_and_median(lib_in_memory, + make_resamp(lib_in_memory, False), + 0.7, + buffer_size=buffer_size, + resample_data=False) + + assert np.isnan(median_in_memory[4,9]) + + # Make sure the median library is the same for on-disk and in-memory + assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) + + +def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): + lib = ModelLibrary(three_sci_as_asn, on_disk=False) + + resamp = make_resamp(lib, True) + median, wcs = drizzle_and_median(lib, + resamp, + 0.7, + resample_data=True) + + assert isinstance(wcs, WCS) + assert median.shape == (21,20) + + with pytest.raises(ValueError): + # should fail if save_intermediate results but no output path + drizzle_and_median(lib, + resamp, + 0.7, + make_output_path=None, + save_intermediate_results=True) + + resamp.single = False + with pytest.raises(ValueError): + # should fail if try to call when resamp.single is False + drizzle_and_median(lib, + resamp, + 0.7, + make_output_path=None, + save_intermediate_results=True) + + +def make_resamp(input_models, in_memory): + """All defaults are same as what is run by default by outlier detection""" + resamp = ResampleData( + input_models, + output="", + single=True, + blendheaders=False, + wht_type="ivm", + pixfrac=1.0, + kernel="square", + fillval="INDEF", + good_bits="~DO_NOT_USE", + in_memory=in_memory, + asn_id="test", + allowed_memory=None, + ) + return resamp \ No newline at end of file diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 5f3393233e..042f46d1e6 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -63,10 +63,11 @@ def create_cube_median(cube_model, maskpt): def drizzle_and_median(input_models, resamp, maskpt, - make_output_path, + make_output_path=None, resample_data=False, in_memory=True, - save_intermediate_results=False): + save_intermediate_results=False, + buffer_size=None): """ Parameters ---------- @@ -79,6 +80,8 @@ def drizzle_and_median(input_models, 
make_output_path : ? """ + if save_intermediate_results and (make_output_path is None): + raise ValueError("make_output_path is required if save_intermediate_results is True") if not resamp.single: raise ValueError("drizzle_and_median should only be used for resample_many_to_many") if resample_data: @@ -113,7 +116,7 @@ def drizzle_and_median(input_models, else: median_computer = OnDiskMedian(full_shape, dtype=drizzled_model.data.dtype, - buffer_size=None) + buffer_size=buffer_size) if save_intermediate_results: # write the drizzled model to file From 13a116154cbe9985e65bac1a87172d672b629530 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 19 Sep 2024 11:28:47 -0400 Subject: [PATCH 15/34] cleanup from self-review plus more unit tests --- CHANGES.rst | 2 + jwst/outlier_detection/_fileio.py | 8 --- jwst/outlier_detection/imaging.py | 5 +- jwst/outlier_detection/spec.py | 6 +- .../tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/tests/test_utils.py | 30 +++++++++- jwst/outlier_detection/utils.py | 55 +++++++++++++------ jwst/resample/resample.py | 7 --- 8 files changed, 73 insertions(+), 42 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 043480cea5..3c580c3f62 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -204,6 +204,8 @@ resample - Ensure that NaNs and DO_NOT_USE flags match up in all input data before resampling. [#8557] +- Permit creating drizzled models one at a time in many-to-many mode. [#8782] + resample_spec ------------- diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 6975ac1717..53dbb8bc6f 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,16 +1,8 @@ -import os - import logging log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) -def remove_file(fn): - if isinstance(fn, str) and os.path.isfile(fn): - os.remove(fn) - log.info(f"Removing file {fn}") - - def save_median(median_model, make_output_path, asn_id=None): ''' Save median if requested by user diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index e8f7be0d19..89cf232648 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -82,10 +82,9 @@ def detect_outliers( median_data, median_wcs = drizzle_and_median(input_models, resamp, maskpt, - make_output_path, resample_data=resample_data, - in_memory=in_memory, - save_intermediate_results=save_intermediate_results) + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) # Perform outlier detection using statistical comparisons between diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index 0a234f37ba..05f7fd204f 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -68,7 +68,6 @@ def detect_outliers( in_memory=in_memory, asn_id=asn_id, ) - median_wcs = resamp.output_wcs # convert to library for resample # for compatibility with image3 pipeline @@ -77,10 +76,9 @@ def detect_outliers( median_data, median_wcs = drizzle_and_median(library, resamp, maskpt, - make_output_path, resample_data=resample_data, - in_memory=False, - save_intermediate_results=save_intermediate_results) + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path) # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py 
b/jwst/outlier_detection/tests/test_outlier_detection.py
index 584425e952..7f6cf421c4 100644
--- a/jwst/outlier_detection/tests/test_outlier_detection.py
+++ b/jwst/outlier_detection/tests/test_outlier_detection.py
@@ -234,7 +234,7 @@ def test_outlier_step_base(we_three_sci, tmp_cwd):
             lambda model, index: model.data.copy(), modify=False))
 
     result = OutlierDetectionStep.call(
-        container, save_results=True, save_intermediate_results=True,
+        container, save_results=True, save_intermediate_results=True
     )
 
     with result:
diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py
index 6e5cbb21bd..b64f6f91f7 100644
--- a/jwst/outlier_detection/tests/test_utils.py
+++ b/jwst/outlier_detection/tests/test_utils.py
@@ -5,6 +5,9 @@
 import os
 
 
+# filter warnings such that "ResourceWarning: Implicitly cleaning up" is raised as error
+# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array(tmp_cwd):
 
     slice_shape = (8,7)
@@ -49,6 +52,7 @@ def test_disk_appendable_array(tmp_cwd):
     assert len(os.listdir(tempdir)) == 0
 
 
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array_bad_inputs(tmp_cwd):
 
     slice_shape = (8,7)
@@ -71,6 +75,7 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd):
         DiskAppendableArray(slice_shape, "float3", tempdir)
 
 
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_on_disk_median(tmp_cwd):
 
     library_length = 3
@@ -133,6 +138,27 @@ def test_on_disk_median(tmp_cwd):
     assert len(os.listdir(os.getcwd())) == 1 # this is the "tmptest" directory
 
 
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_on_disk_median_bad_inputs(tmp_cwd):
-    # FIXME
-    pass
\ No newline at end of file
+
+    library_length = 3
+    frame_shape = (21,20)
+    dtype = 'float32'
+    tempdir = tmp_cwd / Path("tmptest")
+    os.mkdir(tempdir)
+    shape = (library_length,) + frame_shape
+
+    with pytest.raises(ValueError):
+        OnDiskMedian(frame_shape, dtype=dtype, tempdir=tempdir)
+
+    with pytest.raises(TypeError):
+        OnDiskMedian(shape, dtype="float3", tempdir=tempdir)
+
+    with pytest.raises(FileNotFoundError):
+        OnDiskMedian(shape, dtype="float32", tempdir="dne")
+
+    # unreasonable buffer size will get set to minimum
+    min_buffer = np.dtype(dtype).itemsize*frame_shape[1]*library_length
+    median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir, buffer_size=-1)
+    assert median_computer.buffer_size == min_buffer
+    median_computer.cleanup()
\ No newline at end of file
diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py
index 042f46d1e6..58c553d984 100644
--- a/jwst/outlier_detection/utils.py
+++ b/jwst/outlier_detection/utils.py
@@ -36,16 +36,15 @@ def nanmedian3D(cube, overwrite_input=True):
     that only changes the dtype of the output but not the internal precision of nanmedian.
     It appears to be impossible to stop nanmedian from computing with
     at least 64-bit precision internally.
-    Pre-allocating the output array, i.e., out=np.empty(cube.shape[1:], dtype=np.float32)
+    Pre-allocating the output array with `out=np.empty(cube.shape[1:], dtype=np.float32)`
     actually increases memory usage a bit, so it's better to just convert after the fact.
- Returning float64 here causes gwcs_blot to segfault down the line + Failing to return float32 here causes gwcs_blot to segfault down the line because that function is hard-coded to expect float32. """ with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input).astype(np.float32) @@ -63,30 +62,51 @@ def create_cube_median(cube_model, maskpt): def drizzle_and_median(input_models, resamp, maskpt, - make_output_path=None, resample_data=False, - in_memory=True, save_intermediate_results=False, + make_output_path=None, buffer_size=None): """ + Shared code between imaging and spec modes for resampling and median computation + Parameters ---------- input_models : ModelLibrary - The input datamodels + The input datamodels. - resamp : ResampleData object + resamp : resample.resample.ResampleData object + The controlling object for the resampling process. maskpt : float + The weight threshold for masking out low weight pixels. + + resample_data : bool + Whether or not to do resampling. + + save_intermediate_results : bool + if True, save the drizzled models and median model to fits. + + make_output_path : function + The functools.partial instance to pass to save_median. Must be + specified if save_intermediate_results is True. Default None. - make_output_path : ? + buffer_size : int + The size of chunk in bytes that will be read into memory when computing the median. + This parameter has no effect if the input library has its on_disk attribute + set to False. """ + + # validate inputs if save_intermediate_results and (make_output_path is None): raise ValueError("make_output_path is required if save_intermediate_results is True") if not resamp.single: raise ValueError("drizzle_and_median should only be used for resample_many_to_many") + in_memory = not input_models._on_disk + if resample_data: indices_by_group = list(input_models.group_indices.values()) else: + # treat each input model as if it is the only member of its group indices_by_group = [[i] for i in range(len(input_models))] with input_models: @@ -94,9 +114,7 @@ def drizzle_and_median(input_models, if resample_data: median_wcs = resamp.output_wcs - drizzled_model = resamp.resample_group(input_models, indices) - else: # for non-dithered data, the resampled image is just the original image drizzled_model = input_models.borrow(i) @@ -105,15 +123,17 @@ def drizzle_and_median(input_models, weight_type=resamp.weight_type, good_bits=resamp.good_bits) input_models.shelve(drizzled_model, i, modify=True) + # copy for when saving median and input is a filename? 
median_wcs = copy.deepcopy(drizzled_model.meta.wcs) if i == 0: - # allocate either temporary storage or memory for data arrays that go into median ngroups = len(indices_by_group) full_shape = (ngroups,) + drizzled_model.data.shape if in_memory: + # allocate memory for data arrays that go into median data_frames = np.empty(full_shape, dtype=np.float32) else: + # set up temporary storage for data arrays that go into median median_computer = OnDiskMedian(full_shape, dtype=drizzled_model.data.dtype, buffer_size=buffer_size) @@ -130,10 +150,11 @@ def drizzle_and_median(input_models, weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - # put drizzled models into either storage or memory for median if in_memory: + # populate pre-allocated memory with the drizzled data data_frames[i] = drizzled_model.data else: + # distribute the drizzled data into the temporary storage median_computer.add_image(drizzled_model.data) # Perform median combination on set of drizzled mosaics @@ -145,7 +166,7 @@ def drizzle_and_median(input_models, median_computer.cleanup() if save_intermediate_results: - # make a median model + # Save median model to fits median_model = datamodels.ImageModel(median_data) median_model.update(drizzled_model) median_model.meta.wcs = median_wcs @@ -221,7 +242,7 @@ def read(self): def cleanup(self): - Path.unlink(self._filename) + Path.unlink(Path(self._filename)) return @@ -287,16 +308,16 @@ def _get_buffer_indices(self, buffer_size=None): imrows, imcols = self.frame_shape if buffer_size is None: buffer_size = imrows * imcols * self.itemsize - self.buffer_size = buffer_size per_model_buffer_size = buffer_size / self._expected_nframes min_buffer_size = imcols * self.itemsize section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) - if section_nrows == 0: + if section_nrows <= 0: buffer_size = min_buffer_size * self._expected_nframes log.warning("Buffer size is too small to hold a single row." f"Increasing buffer size to {buffer_size / _ONE_MB}MB") section_nrows = 1 + self.buffer_size = buffer_size nsections = int(np.ceil(imrows / section_nrows)) log.info(f"Computing median over {self._expected_nframes} images in {nsections} " @@ -340,7 +361,7 @@ def _validate_data(self, data): def cleanup(self): """Remove the temporary files and directory when finished""" [arr.cleanup() for arr in self._temp_arrays] - Path.rmdir(self._temp_path) + self._temp_dir.cleanup() return diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index e2297db25d..393d6f55d9 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -351,14 +351,7 @@ def resample_many_to_many(self, input_models): output_model = self.resample_group(input_models, indices) if not self.in_memory: - # Here, write out model to disk in a way that is immediately useful to create_median - # using the outlier detection utils I just wrote. Have this return the median computer - # back to outlier detection - # This does mean weights need to be handled here, though... 
- # Write out model to disk, then return filename - # only do this write to disk if save_intermediate_results is True - # this write should occur before handling weights output_name = output_model.meta.filename if self.output_dir is not None: output_name = os.path.join(self.output_dir, output_name) From 4a19c7c90444a11b04fd360536c6fcc1beb4639f Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 19 Sep 2024 15:01:47 -0400 Subject: [PATCH 16/34] small fix to test --- .../tests/test_outlier_detection.py | 14 ++++++++------ jwst/outlier_detection/tests/test_utils.py | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 7f6cf421c4..141b3bce1b 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -616,16 +616,17 @@ def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): # arbitrarily use 5 times that buffer_size = 4 * 20 * 5 median_on_disk, _ = drizzle_and_median(lib_on_disk, - make_resamp(lib_on_disk, False), + make_resamp(lib_on_disk), 0.7, buffer_size=buffer_size, resample_data=False) median_in_memory, _ = drizzle_and_median(lib_in_memory, - make_resamp(lib_in_memory, False), + make_resamp(lib_in_memory), 0.7, buffer_size=buffer_size, resample_data=False) + # Make sure the high-variance (low-weight) pixel is set to NaN assert np.isnan(median_in_memory[4,9]) # Make sure the median library is the same for on-disk and in-memory @@ -635,7 +636,7 @@ def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): lib = ModelLibrary(three_sci_as_asn, on_disk=False) - resamp = make_resamp(lib, True) + resamp = make_resamp(lib) median, wcs = drizzle_and_median(lib, resamp, 0.7, @@ -645,7 +646,7 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): assert median.shape == (21,20) with pytest.raises(ValueError): - # should fail if save_intermediate results but no output path + # ensure failure if save_intermediate results but no output path drizzle_and_median(lib, resamp, 0.7, @@ -654,7 +655,7 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): resamp.single = False with pytest.raises(ValueError): - # should fail if try to call when resamp.single is False + # ensure failure if try to call when resamp.single is False drizzle_and_median(lib, resamp, 0.7, @@ -662,8 +663,9 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): save_intermediate_results=True) -def make_resamp(input_models, in_memory): +def make_resamp(input_models): """All defaults are same as what is run by default by outlier detection""" + in_memory = not input_models._on_disk resamp = ResampleData( input_models, output="", diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index b64f6f91f7..e20c181a03 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -66,11 +66,11 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd): # make the input directory os.mkdir(tempdir) - # test slice_shape is not 2-D + # ensure failure if slice_shape is not 2-D with pytest.raises(ValueError): DiskAppendableArray((3,5,7), dtype, tempdir) - # test dtype is not valid + # ensure failure if dtype is not valid with pytest.raises(TypeError): DiskAppendableArray(slice_shape, "float3", tempdir) @@ -157,7 +157,7 @@ def 
test_on_disk_median_bad_inputs(tmp_cwd): with pytest.raises(FileNotFoundError): OnDiskMedian(shape, dtype="float32", tempdir="dne") - # unreasonable buffer size will get set to minimum + # ensure unreasonable buffer size will get set to minimum reasonable buffer min_buffer = np.dtype(dtype).itemsize*frame_shape[1]*library_length median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir, buffer_size=-1) assert median_computer.buffer_size == min_buffer From 04ac058817ac1140d0c1297e64f5550812ea6870 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 20 Sep 2024 15:51:23 -0400 Subject: [PATCH 17/34] save memory with nanmedian loop --- .../tests/test_algorithms.py | 10 +++++++ jwst/outlier_detection/utils.py | 30 +++++++++---------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/jwst/outlier_detection/tests/test_algorithms.py b/jwst/outlier_detection/tests/test_algorithms.py index b3b9255bb4..ba9af2661d 100644 --- a/jwst/outlier_detection/tests/test_algorithms.py +++ b/jwst/outlier_detection/tests/test_algorithms.py @@ -1,6 +1,7 @@ import numpy as np from jwst.outlier_detection.tso import moving_median_over_zeroth_axis +from jwst.outlier_detection.utils import nanmedian3D def test_rolling_median(): @@ -14,3 +15,12 @@ def test_rolling_median(): result = moving_median_over_zeroth_axis(arr, w) expected = expected_time_axis[:, np.newaxis, np.newaxis] * spatial_axis[np.newaxis, :, :] assert np.allclose(result, expected) + + +def test_nanmedian3D(): + + shp = (11, 50, 60) + cube = np.random.normal(shp) + med = nanmedian3D(cube) + assert np.allclose(med, np.nanmedian(cube)) + assert med.dtype == np.float32 \ No newline at end of file diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 58c553d984..31a8ba500e 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -27,25 +27,22 @@ def nanmedian3D(cube, overwrite_input=True): - """Convenience function to compute the median of a cube ignoring warnings, - setting memory-efficient flag, and specifying output type - - Notes - ----- - np.nanmedian returns float64 unless "out" parameter is specified, and specifying - that only changes the dtype of the output but not the internal precision of nanmedian. - It appears to be impossible to stop nanmedian from computing with - at least 64-bit precision internally. - Pre-allocating the output array with `out=np.empty(cube.shape[1:], dtype=np.float32)` - actually increases memory usage a bit, so it's better to just convert after the fact. - Failing to return float32 here causes gwcs_blot to segfault down the line - because that function is hard-coded to expect float32. + """Compute the median of a cube ignoring warnings and with memory efficiency. + np.nanmedian always uses at least 64-bit precision internally, and this is too + memory-intensive. Instead, loop over the median calculation to avoid the + memory usage of the internal upcasting and temporary array allocation. + The additional runtime of this loop is indistinguishable from zero, + but this loop decreases overall memory usage of the step by as much as half. 
""" with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input).astype(np.float32) + output_arr = np.empty(cube.shape[1:], dtype=np.float32) + for i in range(output_arr.shape[0]): + # this for loop looks silly, but see docstring above + output_arr[i,:] = np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input).astype(np.float32) + return output_arr def create_cube_median(cube_model, maskpt): @@ -248,6 +245,9 @@ def cleanup(self): class OnDiskMedian: + # TODO: can/should this class inherit directly from tempfile.TemporaryDirectory? + # probably not, because it "has a" tempdir, it is not itself a tempdir + def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None): """ Set up temporary files to perform operations on a stack of 2-D input arrays @@ -320,7 +320,7 @@ def _get_buffer_indices(self, buffer_size=None): self.buffer_size = buffer_size nsections = int(np.ceil(imrows / section_nrows)) - log.info(f"Computing median over {self._expected_nframes} images in {nsections} " + log.info(f"Computing median over {self._expected_nframes} groups in {nsections} " f"sections with total memory buffer {buffer_size / _ONE_MB} MB") return nsections, section_nrows From 87b850407a3a31666e8f3ddb07f152d67e9aefc2 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 20 Sep 2024 16:41:39 -0400 Subject: [PATCH 18/34] changes in response to Brett's review --- .../tests/test_algorithms.py | 8 ++++--- jwst/outlier_detection/tests/test_utils.py | 14 +++++++++---- jwst/outlier_detection/utils.py | 21 +++++++++++-------- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/jwst/outlier_detection/tests/test_algorithms.py b/jwst/outlier_detection/tests/test_algorithms.py index ba9af2661d..d1c8ffd85d 100644 --- a/jwst/outlier_detection/tests/test_algorithms.py +++ b/jwst/outlier_detection/tests/test_algorithms.py @@ -20,7 +20,9 @@ def test_rolling_median(): def test_nanmedian3D(): shp = (11, 50, 60) - cube = np.random.normal(shp) + cube = np.random.normal(size=shp) + cube[5, 5:7, 5:8] = np.nan med = nanmedian3D(cube) - assert np.allclose(med, np.nanmedian(cube)) - assert med.dtype == np.float32 \ No newline at end of file + + assert med.dtype == np.float32 + assert np.allclose(med, np.nanmedian(cube, axis=0), equal_nan=True) \ No newline at end of file diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index e20c181a03..875b5060d7 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -14,11 +14,12 @@ def test_disk_appendable_array(tmp_cwd): dtype = "float32" tempdir = tmp_cwd / Path("tmptest") os.mkdir(tempdir) + fname = tempdir / "test.bin" - arr = DiskAppendableArray(slice_shape, dtype, tempdir) + arr = DiskAppendableArray(slice_shape, dtype, fname) # check temporary file setup - assert arr._filename.split("/")[-1] in os.listdir(tempdir) + assert str(arr._filename).split("/")[-1] in os.listdir(tempdir) assert len(os.listdir(tempdir)) == 1 assert arr.shape == (0,) + slice_shape @@ -58,20 +59,25 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd): slice_shape = (8,7) dtype = "float32" tempdir = tmp_cwd / Path("tmptest") + fname = "test.bin" # test input directory does not exist with pytest.raises(FileNotFoundError): - DiskAppendableArray(slice_shape, dtype, tempdir) + DiskAppendableArray(slice_shape, dtype, tempdir / fname) # make the input 
directory os.mkdir(tempdir) # ensure failure if slice_shape is not 2-D with pytest.raises(ValueError): - DiskAppendableArray((3,5,7), dtype, tempdir) + DiskAppendableArray((3,5,7), dtype, tempdir / fname) # ensure failure if dtype is not valid with pytest.raises(TypeError): + DiskAppendableArray(slice_shape, "float3", tempdir / fname) + + # ensure failure if pass directory instead of filename + with pytest.raises(IsADirectoryError): DiskAppendableArray(slice_shape, "float3", tempdir) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 31a8ba500e..258d2c8663 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -32,7 +32,8 @@ def nanmedian3D(cube, overwrite_input=True): memory-intensive. Instead, loop over the median calculation to avoid the memory usage of the internal upcasting and temporary array allocation. The additional runtime of this loop is indistinguishable from zero, - but this loop decreases overall memory usage of the step by as much as half. + but this loop cuts overall step memory usage roughly in half for at least one + test association. """ with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", @@ -192,7 +193,7 @@ class DiskAppendableArray: a small spatial segment of the full dataset. """ - def __init__(self, slice_shape, dtype, tempdir): + def __init__(self, slice_shape, dtype, filename): """ Parameters ---------- @@ -202,13 +203,14 @@ def __init__(self, slice_shape, dtype, tempdir): dtype : str The data type of the array. Must be a valid numpy array datatype. - tempdir : str - The directory in which to create the temporary files. - Default is the current working directory. + filename : str + The full file path in which to store the array """ if len(slice_shape) != 2: raise ValueError(f"Invalid slice_shape {slice_shape}. 
Only 2-D arrays are supported.") - self._temp_file, self._filename = tempfile.mkstemp(dir=tempdir) + self._filename = Path(filename) + with open(filename, "wb") as f: # noqa: F841 + pass self._slice_shape = slice_shape self._dtype = np.dtype(dtype) self._append_count = 0 @@ -226,7 +228,8 @@ def append(self, data): if data.dtype != self._dtype: raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") with open(self._filename, "ab") as f: - f.write(data.tobytes()) + data.tofile(f, sep="") + #f.write(data.tobytes()) self._append_count += 1 @@ -333,7 +336,7 @@ def _temparray_setup(self, dtype): if i == self.nsections - 1: # last section has whatever shape is left over shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1]) - arr = DiskAppendableArray(shp, dtype, self._temp_path) + arr = DiskAppendableArray(shp, dtype, self._temp_path / f"{i}.bin") temp_arrays.append(arr) return temp_arrays @@ -360,7 +363,7 @@ def _validate_data(self, data): def cleanup(self): """Remove the temporary files and directory when finished""" - [arr.cleanup() for arr in self._temp_arrays] + #[arr.cleanup() for arr in self._temp_arrays] self._temp_dir.cleanup() return From 5b2d71a86ccf8844cefae9fdbf45f1e688f3a6a4 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Mon, 23 Sep 2024 14:41:01 -0400 Subject: [PATCH 19/34] towncrier changelog --- CHANGES.rst | 14 -------------- changes/8782.outlier_detection.rst | 1 + changes/8782.resample.rst | 1 + 3 files changed, 2 insertions(+), 14 deletions(-) create mode 100644 changes/8782.outlier_detection.rst create mode 100644 changes/8782.resample.rst diff --git a/CHANGES.rst b/CHANGES.rst index 3cd847985d..7500ad652f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,17 +1,3 @@ -1.16.1 (unreleased) -=================== - -outlier_detection ------------------ - -- Decrease the amount of file I/O required to compute the median when `in_memory` - is set to `False`. [#8782] - -resample --------- - -- Permit creating drizzled models one at a time in many-to-many mode. [#8782] - 1.16.0 (2024-09-20) =================== diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst new file mode 100644 index 0000000000..e63c0cf4f3 --- /dev/null +++ b/changes/8782.outlier_detection.rst @@ -0,0 +1 @@ +Decrease the amount of file I/O required to compute the median when in_memory is set to False. diff --git a/changes/8782.resample.rst b/changes/8782.resample.rst new file mode 100644 index 0000000000..05b3e87052 --- /dev/null +++ b/changes/8782.resample.rst @@ -0,0 +1 @@ +Permit creating drizzled models one at a time in many-to-many mode. 
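For orientation between the patches: after the refactor above, the append-then-read contract of DiskAppendableArray amounts to the following sketch, as exercised by the unit tests (the file name here is invented for illustration):

    import numpy as np
    from jwst.outlier_detection.utils import DiskAppendableArray

    arr = DiskAppendableArray((8, 7), "float32", "example.bin")
    for _ in range(3):
        # one 2-D slice per call; shape and dtype are checked on append
        arr.append(np.ones((8, 7), dtype=np.float32))
    stack = arr.read()  # load the whole stack back into memory
    assert stack.shape == (3, 8, 7)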
From 1e6f8dfecfd5f462302c55da223433ea0b24102d Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 09:10:29 -0400 Subject: [PATCH 20/34] suggestions from Melanie --- jwst/outlier_detection/utils.py | 2 +- jwst/resample/resample.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 258d2c8663..71a55c758c 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -376,7 +376,7 @@ def compute_median(self): output_rows = row_indices[-1][1] output_cols = self._temp_arrays[0].shape[2] - median_image = np.empty((output_rows, output_cols), self.dtype) * np.nan + median_image = np.full((output_rows, output_cols), np.nan, dtype=self.dtype) for i, disk_arr in enumerate(self._temp_arrays): row1, row2 = row_indices[i] diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index 393d6f55d9..f961b3f1e0 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -261,7 +261,7 @@ def resample_group(self, input_models, indices): indices : list """ - output_model = self.blank_output + output_model = self.blank_output.copy() copy_asn_info_from_library(input_models, output_model) @@ -331,10 +331,7 @@ def resample_group(self, input_models, indices): input_models.shelve(img, index, modify=False) del img - out = output_model.copy() - output_model.data *= 0. - output_model.wht *= 0. - return out + return output_model def resample_many_to_many(self, input_models): """Resample many inputs to many outputs where outputs have a common frame. From 4b0bac4ad02c417ae85da64ba76f1df0c41f9063 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 12:15:56 -0400 Subject: [PATCH 21/34] fixed file names for intermediate files, added regtest to cover it --- jwst/outlier_detection/_fileio.py | 16 ++++++++++------ jwst/outlier_detection/utils.py | 6 ++++++ jwst/regtest/test_nirspec_fs_spec3.py | 12 ++++++++++-- jwst/resample/resample.py | 2 +- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 53dbb8bc6f..828d21bece 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,3 +1,4 @@ +import re import logging log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -12,13 +13,16 @@ def save_median(median_model, make_output_path, asn_id=None): median_model : ~jwst.datamodels.ImageModel The median ImageModel or CubeModel to save ''' - default_suffix = "_outlier_i2d.fits" - if asn_id is None: - suffix_to_remove = default_suffix - else: - suffix_to_remove = f"_{asn_id}{default_suffix}" + # need unique slit ID for MultiSlitModel inputs to step + slit_id = getattr(median_model, "name", "").lower() + if slit_id: + slit_id = "_"+slit_id + # regex to handle i2d and s2d as though they are the same string + default_suffix = r"_outlier_.2d\.fits" + suffix_to_remove = default_suffix if asn_id is None else fr"_{asn_id}{default_suffix}" + basepath = re.sub(suffix_to_remove, ".fits", median_model.meta.filename) median_model_output_path = make_output_path( - basepath=median_model.meta.filename.replace(suffix_to_remove, '.fits'), + basepath=basepath, suffix='median') median_model.save(median_model_output_path) log.info(f"Saved model in {median_model_output_path}") diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 71a55c758c..2342186aec 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ 
-139,6 +139,9 @@ def drizzle_and_median(input_models, if save_intermediate_results: # write the drizzled model to file output_name = drizzled_model.meta.filename + # need unique intermediate filenames for slits in MultiSlitModels + if hasattr(drizzled_model, "name") and drizzled_model.name is not None: + output_name = output_name.replace("_outlier_", f"_{drizzled_model.name.lower()}_outlier_") if resamp.output_dir is not None: output_name = os.path.join(resamp.output_dir, output_name) drizzled_model.save(output_name) @@ -423,6 +426,9 @@ def flag_resampled_model_crs( if make_output_path is None: raise ValueError("make_output_path must be provided if save_blot is True") model_path = make_output_path(input_model.meta.filename, suffix='blot') + # for MultiSlitModels, need to insert the slit ID into the filename + if hasattr(input_model, "name") and input_model.name is not None: + model_path = model_path.replace("_blot", f"_{input_model.name.lower()}_blot") blot_model = _make_blot_model(input_model, blot) blot_model.meta.filename = model_path blot_model.save(model_path) diff --git a/jwst/regtest/test_nirspec_fs_spec3.py b/jwst/regtest/test_nirspec_fs_spec3.py index 4f3bca7edd..4fd2d62cf5 100644 --- a/jwst/regtest/test_nirspec_fs_spec3.py +++ b/jwst/regtest/test_nirspec_fs_spec3.py @@ -1,6 +1,7 @@ from astropy.io.fits.diff import FITSDiff import pytest import numpy as np +import os from gwcs import wcstools from jwst.stpipe import Step @@ -20,20 +21,27 @@ def run_pipeline(rtdata_module): # Run the calwebb_spec3 pipeline; save results from intermediate steps args = ["calwebb_spec3", rtdata.input, "--steps.outlier_detection.save_results=true", + "--steps.outlier_detection.save_intermediate_results=true", "--steps.resample_spec.save_results=true", "--steps.extract_1d.save_results=true"] Step.from_cmdline(args) @pytest.mark.bigdata -@pytest.mark.parametrize("suffix", ["cal", "crf", "s2d", "x1d"]) +@pytest.mark.parametrize("suffix", ["cal", "crf", "s2d", "x1d", "median"]) @pytest.mark.parametrize("source_id,slit_name", [("s000000001","s200a2"), ("s000000021","s200a1"), ("s000000023","s400a1"), ("s000000024","s1600a1"), ("s000000025","s200b1")]) def test_nirspec_fs_spec3(run_pipeline, rtdata_module, fitsdiff_default_kwargs, suffix, source_id, slit_name): """Test spec3 pipeline on a set of NIRSpec FS exposures.""" rtdata = rtdata_module - output = f"jw01309-o022_{source_id}_nirspec_f290lp-g395h-{slit_name}-allslits_{suffix}.fits" + if suffix == "median": + output = f"jw01309022001_04102_00002_nrs2_{slit_name}_{suffix}.fits" + # also ensure drizzled and blot models were created with the correct names + assert os.path.isfile(output.replace("median", "outlier_s2d")) + assert os.path.isfile(output.replace("median", "blot")) + else: + output = f"jw01309-o022_{source_id}_nirspec_f290lp-g395h-{slit_name}-allslits_{suffix}.fits" rtdata.output = output rtdata.get_truth(f"truth/test_nirspec_fs_spec3/{output}") diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index f961b3f1e0..795ecef9a1 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -356,7 +356,7 @@ def resample_many_to_many(self, input_models): log.info(f"Saved model in {output_name}") output_models.append(output_name) else: - output_models.append(output_model.data) + output_models.append(output_model) if not self.in_memory: # build ModelLibrary as an association from the output files From f2c7b5be757f5edf47cf1bb43ae4fd638e6950b9 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 15:46:34 -0400 Subject: 
[PATCH 22/34] refactor drizzle_and_median in response to @braingram --- jwst/outlier_detection/_fileio.py | 66 +++-- jwst/outlier_detection/imaging.py | 62 ++--- jwst/outlier_detection/spec.py | 66 ++--- .../tests/test_outlier_detection.py | 50 ++-- jwst/outlier_detection/tests/test_utils.py | 4 - jwst/outlier_detection/utils.py | 235 ++++++++++-------- 6 files changed, 269 insertions(+), 214 deletions(-) diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 828d21bece..6544fcdc70 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,10 +1,11 @@ import re import logging +from jwst.datamodels import ImageModel log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) -def save_median(median_model, make_output_path, asn_id=None): +def save_median(median_data, median_wcs, example_model, make_output_path=None): ''' Save median if requested by user @@ -13,16 +14,53 @@ def save_median(median_model, make_output_path, asn_id=None): median_model : ~jwst.datamodels.ImageModel The median ImageModel or CubeModel to save ''' - # need unique slit ID for MultiSlitModel inputs to step - slit_id = getattr(median_model, "name", "").lower() - if slit_id: - slit_id = "_"+slit_id - # regex to handle i2d and s2d as though they are the same string - default_suffix = r"_outlier_.2d\.fits" - suffix_to_remove = default_suffix if asn_id is None else fr"_{asn_id}{default_suffix}" - basepath = re.sub(suffix_to_remove, ".fits", median_model.meta.filename) - median_model_output_path = make_output_path( - basepath=basepath, - suffix='median') - median_model.save(median_model_output_path) - log.info(f"Saved model in {median_model_output_path}") + median_model = ImageModel(median_data) + median_model.update(example_model) + median_model.meta.wcs = median_wcs + output_path = intermediate_output_path(example_model, "median", make_output_path) + _save_to_path(median_model, output_path, suffix="median") + + +def save_drizzled(drizzled_model, make_output_path): + expected_tail = "outlier_?2d.fits" + suffix = drizzled_model.meta.filename[-len(expected_tail):-5] + output_path = intermediate_output_path(drizzled_model, suffix, make_output_path) + _save_to_path(drizzled_model, output_path, suffix=suffix) + + +def save_blot(input_model, blot, make_output_path): + + blot_model = _make_blot_model(input_model, blot) + output_path = intermediate_output_path(input_model, "blot", make_output_path, suffix_to_remove=r"_cal.*") + _save_to_path(blot_model, output_path, suffix="blot") + + +def _make_blot_model(input_model, blot): + blot_model = type(input_model)() + blot_model.data = blot + blot_model.update(input_model) + return blot_model + + +def intermediate_output_path(input_model, suffix, make_output_path, suffix_to_remove=r"_outlier.*"): + """Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions""" + if hasattr(input_model, "name") and input_model.name is not None: + if "_"+input_model.name.lower()+"_" not in input_model.meta.filename: + replacement = f"_{input_model.name.lower()}" + else: + replacement = "" + else: + replacement = "" + basepath = re.sub(suffix_to_remove, replacement, input_model.meta.filename) + + if make_output_path is None: + def make_output_path(basepath, suffix): + return basepath.replace(".fits", f"_{suffix}.fits") + + return make_output_path(basepath, suffix=suffix) + + +def _save_to_path(model, output_path, suffix=""): + model.meta.filename = output_path + model.save(output_path) + 
log.info(f"Saved {suffix} model in {output_path}") \ No newline at end of file diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 89cf232648..37f807b8bf 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -3,13 +3,15 @@ """ import logging -import os from jwst.datamodels import ModelLibrary from jwst.resample import resample from jwst.stpipe.utilities import record_step_status -from .utils import flag_model_crs, flag_resampled_model_crs, drizzle_and_median +from .utils import (flag_model_crs, + flag_resampled_model_crs, + median_without_resampling, + median_with_resampling) log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -53,38 +55,36 @@ def detect_outliers( log.warning("Outlier detection will be skipped") record_step_status(input_models, "outlier_detection", False) return input_models - - with input_models: - example_model = input_models.borrow(0) - output_path = make_output_path(basepath=example_model.meta.filename, - suffix='') - input_models.shelve(example_model, modify=False) - del example_model - output_path = os.path.dirname(output_path) # Start by creating resampled/mosaic images for # each group of exposures - resamp = resample.ResampleData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - allowed_memory=allowed_memory, - ) - - median_data, median_wcs = drizzle_and_median(input_models, - resamp, - maskpt, - resample_data=resample_data, - save_intermediate_results=save_intermediate_results, - make_output_path=make_output_path,) + if resample_data: + resamp = resample.ResampleData( + input_models, + output=None, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + allowed_memory=allowed_memory, + ) + median_data, median_wcs = median_with_resampling(input_models, + resamp, + maskpt, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) + else: + median_data, median_wcs = median_without_resampling(input_models, + maskpt, + weight_type, + good_bits, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) # Perform outlier detection using statistical comparisons between diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index 05f7fd204f..23d06ac88c 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -1,13 +1,14 @@ """ Submodule for performing outlier detection on spectra. 
""" -import os - from jwst.datamodels import ModelContainer, ModelLibrary from jwst.stpipe.utilities import record_step_status from ..resample import resample_spec -from .utils import flag_crs_in_models, flag_crs_in_models_with_resampling, drizzle_and_median +from .utils import (flag_crs_in_models, + flag_crs_in_models_with_resampling, + median_with_resampling, + median_without_resampling) import logging log = logging.getLogger(__name__) @@ -50,35 +51,42 @@ def detect_outliers( record_step_status(input_models, "outlier_detection", False) return input_models - # Start by creating resampled/mosaic images for - # each group of exposures - output_path = make_output_path( - basepath=input_models[0].meta.filename, suffix='') - output_path = os.path.dirname(output_path) - resamp = resample_spec.ResampleSpecData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - ) - # convert to library for resample # for compatibility with image3 pipeline library = ModelLibrary(input_models, on_disk=False) - - median_data, median_wcs = drizzle_and_median(library, - resamp, - maskpt, - resample_data=resample_data, - save_intermediate_results=save_intermediate_results, - make_output_path=make_output_path) + + if resample_data is True: + # Start by creating resampled/mosaic images for + # each group of exposures + resamp = resample_spec.ResampleSpecData( + input_models, + output=None, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + ) + + median_data, median_wcs = median_with_resampling( + library, + resamp, + maskpt, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) + else: + median_data, median_wcs = median_without_resampling( + library, + maskpt, + weight_type, + good_bits, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path, + ) # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 141b3bce1b..3155824fc8 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -19,7 +19,7 @@ CORON_IMAGE_MODES, ) from jwst.resample.tests.test_resample_step import miri_rate_model -from jwst.outlier_detection.utils import drizzle_and_median +from jwst.outlier_detection.utils import median_with_resampling, median_without_resampling from jwst.resample.resample import ResampleData OUTLIER_DO_NOT_USE = np.bitwise_or( @@ -615,16 +615,18 @@ def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels # arbitrarily use 5 times that buffer_size = 4 * 20 * 5 - median_on_disk, _ = drizzle_and_median(lib_on_disk, - make_resamp(lib_on_disk), - 0.7, - buffer_size=buffer_size, - resample_data=False) - median_in_memory, _ = drizzle_and_median(lib_in_memory, - make_resamp(lib_in_memory), - 0.7, - buffer_size=buffer_size, - resample_data=False) + median_on_disk, _ = median_without_resampling( + lib_on_disk, + 0.7, + "ivm", + "~DO_NOT_USE", + buffer_size=buffer_size,) + median_in_memory, _ = median_without_resampling( + 
lib_in_memory,
+        0.7,
+        "ivm",
+        "~DO_NOT_USE",
+        buffer_size=buffer_size,)
 
     # Make sure the high-variance (low-weight) pixel is set to NaN
     assert np.isnan(median_in_memory[4,9])
@@ -637,30 +639,22 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd):
     lib = ModelLibrary(three_sci_as_asn, on_disk=False)
 
     resamp = make_resamp(lib)
-    median, wcs = drizzle_and_median(lib,
-                                     resamp,
-                                     0.7,
-                                     resample_data=True)
+    median, wcs = median_with_resampling(
+        lib,
+        resamp,
+        0.7)
 
     assert isinstance(wcs, WCS)
     assert median.shape == (21,20)
-
-    with pytest.raises(ValueError):
-        # ensure failure if save_intermediate results but no output path
-        drizzle_and_median(lib,
-                           resamp,
-                           0.7,
-                           make_output_path=None,
-                           save_intermediate_results=True)
 
     resamp.single = False
     with pytest.raises(ValueError):
         # ensure failure if try to call when resamp.single is False
-        drizzle_and_median(lib,
-                           resamp,
-                           0.7,
-                           make_output_path=None,
-                           save_intermediate_results=True)
+        median_with_resampling(
+            lib,
+            resamp,
+            0.7,
+            save_intermediate_results=True)
 
 
 def make_resamp(input_models):
diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py
index 875b5060d7..ff62440890 100644
--- a/jwst/outlier_detection/tests/test_utils.py
+++ b/jwst/outlier_detection/tests/test_utils.py
@@ -48,10 +48,6 @@ def test_disk_appendable_array(tmp_cwd):
     assert np.all(arr_in_memory[1] == candidate1)
     assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True)
 
-    # test cleanup
-    arr.cleanup()
-    assert len(os.listdir(tempdir)) == 0
-
 
 @pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array_bad_inputs(tmp_cwd):
diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py
index 2342186aec..5eaddeba5b 100644
--- a/jwst/outlier_detection/utils.py
+++ b/jwst/outlier_detection/utils.py
@@ -2,7 +2,6 @@
 The ever-present utils sub-module. A home for all...
 """
 
-import os
 import copy
 import warnings
 import numpy as np
@@ -14,7 +13,7 @@
 from jwst.resample.resample_utils import build_driz_weight
 from stcal.outlier_detection.utils import compute_weight_threshold, gwcs_blot, flag_crs, flag_resampled_crs
 from stdatamodels.jwst import datamodels
-from ._fileio import save_median
+from . import _fileio
 
 import logging
 log = logging.getLogger(__name__)
@@ -42,7 +41,7 @@ def nanmedian3D(cube, overwrite_input=True):
     output_arr = np.empty(cube.shape[1:], dtype=np.float32)
     for i in range(output_arr.shape[0]):
         # this for loop looks silly, but see docstring above
-        output_arr[i,:] = np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input).astype(np.float32)
+        np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input, out=output_arr[i,:])
     return output_arr
 
 
@@ -57,13 +56,80 @@ def create_cube_median(cube_model, maskpt):
                        overwrite_input=False)
 
 
-def drizzle_and_median(input_models,
-                       resamp,
-                       maskpt,
-                       resample_data=False,
-                       save_intermediate_results=False,
-                       make_output_path=None,
-                       buffer_size=None):
+def median_without_resampling(input_models,
+                              maskpt,
+                              weight_type,
+                              good_bits,
+                              save_intermediate_results=False,
+                              make_output_path=None,
+                              buffer_size=None):
+    """
+    Shared code between imaging and spec modes for median computation without resampling
+
+    Parameters
+    ----------
+    input_models : ModelLibrary
+        The input datamodels.
+
+    maskpt : float
+        The weight threshold for masking out low weight pixels.
+
+    weight_type : str
+        The type of weighting to use when combining images.
Options are: + 'ivm' (inverse variance) or 'exptime' (exposure time). + + good_bits : int + The bit values that are considered good when determining the + data quality of the input. + + save_intermediate_results : bool + if True, save the drizzled models and median model to fits. + + make_output_path : function + The functools.partial instance to pass to save_median. Must be + specified if save_intermediate_results is True. Default None. + + buffer_size : int + The size of chunk in bytes that will be read into memory when computing the median. + This parameter has no effect if the input library has its on_disk attribute + set to False. + """ + in_memory = not input_models._on_disk + ngroups = len(input_models) + + with input_models: + for i in range(len(input_models)): + + drizzled_model = input_models.borrow(i) + drizzled_model.wht = build_driz_weight(drizzled_model, + weight_type=weight_type, + good_bits=good_bits) + median_wcs = copy.deepcopy(drizzled_model.meta.wcs) + input_models.shelve(drizzled_model, i, modify=True) + + if save_intermediate_results: + # write the drizzled model to file + _fileio.save_drizzled(drizzled_model, make_output_path) + + median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) + + # Perform median combination on set of drizzled mosaics + median_data = _evaluate_median_computer(median_computer, in_memory) + + if save_intermediate_results: + # Save median model to fits + _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) + del drizzled_model + + return median_data, median_wcs + + +def median_with_resampling(input_models, + resamp, + maskpt, + save_intermediate_results=False, + make_output_path=None, + buffer_size=None): """ Shared code between imaging and spec modes for resampling and median computation @@ -78,9 +144,6 @@ def drizzle_and_median(input_models, maskpt : float The weight threshold for masking out low weight pixels. - resample_data : bool - Whether or not to do resampling. - save_intermediate_results : bool if True, save the drizzled models and median model to fits. @@ -93,91 +156,74 @@ def drizzle_and_median(input_models, This parameter has no effect if the input library has its on_disk attribute set to False. """ - - # validate inputs - if save_intermediate_results and (make_output_path is None): - raise ValueError("make_output_path is required if save_intermediate_results is True") if not resamp.single: - raise ValueError("drizzle_and_median should only be used for resample_many_to_many") + raise ValueError("median_with_resampling should only be used for resample_many_to_many") + in_memory = not input_models._on_disk - - if resample_data: - indices_by_group = list(input_models.group_indices.values()) - else: - # treat each input model as if it is the only member of its group - indices_by_group = [[i] for i in range(len(input_models))] + indices_by_group = list(input_models.group_indices.values()) + ngroups = len(indices_by_group) with input_models: for i, indices in enumerate(indices_by_group): - if resample_data: - median_wcs = resamp.output_wcs - drizzled_model = resamp.resample_group(input_models, indices) - else: - # for non-dithered data, the resampled image is just the original image - drizzled_model = input_models.borrow(i) - drizzled_model.wht = build_driz_weight( - drizzled_model, - weight_type=resamp.weight_type, - good_bits=resamp.good_bits) - input_models.shelve(drizzled_model, i, modify=True) - # copy for when saving median and input is a filename? 
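The weight handling shared by both median paths (mask low-weight pixels to NaN so they are excluded from the median) can be sketched standalone; weight_threshold below is a simplified stand-in invented for illustration, not stcal's actual compute_weight_threshold implementation:

    import numpy as np

    def weight_threshold(wht, maskpt):
        # simplified stand-in: maskpt times the mean of the nonzero weights
        good = wht[wht > 0]
        return maskpt * good.mean() if good.size else 0.0

    data = np.random.rand(4, 4).astype(np.float32)
    wht = np.ones_like(data)
    wht[0, 0] = 1e-6  # a low-weight pixel
    data[wht < weight_threshold(wht, 0.7)] = np.nan
    assert np.isnan(data[0, 0])  # excluded from the median combination

Because the masking happens as each drizzled group arrives, only the data array (not the weight array) needs to be kept for the median.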
- median_wcs = copy.deepcopy(drizzled_model.meta.wcs) - - if i == 0: - ngroups = len(indices_by_group) - full_shape = (ngroups,) + drizzled_model.data.shape - if in_memory: - # allocate memory for data arrays that go into median - data_frames = np.empty(full_shape, dtype=np.float32) - else: - # set up temporary storage for data arrays that go into median - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=buffer_size) + median_wcs = resamp.output_wcs + drizzled_model = resamp.resample_group(input_models, indices) if save_intermediate_results: # write the drizzled model to file - output_name = drizzled_model.meta.filename - # need unique intermediate filenames for slits in MultiSlitModels - if hasattr(drizzled_model, "name") and drizzled_model.name is not None: - output_name = output_name.replace("_outlier_", f"_{drizzled_model.name.lower()}_outlier_") - if resamp.output_dir is not None: - output_name = os.path.join(resamp.output_dir, output_name) - drizzled_model.save(output_name) - log.info(f"Saved model in {output_name}") - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - - if in_memory: - # populate pre-allocated memory with the drizzled data - data_frames[i] = drizzled_model.data - else: - # distribute the drizzled data into the temporary storage - median_computer.add_image(drizzled_model.data) + _fileio.save_drizzled(drizzled_model, make_output_path) + + median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) # Perform median combination on set of drizzled mosaics - if in_memory: - median_data = nanmedian3D(data_frames) - del data_frames - else: - median_data = median_computer.compute_median() - median_computer.cleanup() + median_data = _evaluate_median_computer(median_computer, in_memory) if save_intermediate_results: # Save median model to fits - median_model = datamodels.ImageModel(median_data) - median_model.update(drizzled_model) - median_model.meta.wcs = median_wcs - save_median(median_model, make_output_path, resamp.asn_id) - del median_model + _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) del drizzled_model return median_data, median_wcs +def _append_to_median_computer(idx, drizzled_model, ngroups, maskpt, in_memory, buffer_size): + + if idx == 0: + full_shape = (ngroups,) + drizzled_model.data.shape + global median_computer + if in_memory: + # allocate memory for data arrays that go into median + median_computer = np.empty(full_shape, dtype=np.float32) + else: + # set up temporary storage for data arrays that go into median + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=buffer_size) + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + if in_memory: + # populate pre-allocated memory with the drizzled data + median_computer[idx] = drizzled_model.data + else: + # distribute the drizzled data into the temporary storage + median_computer.add_image(drizzled_model.data) + + return median_computer + + +def _evaluate_median_computer(median_computer, in_memory): + if in_memory: + median_data = nanmedian3D(median_computer) + del median_computer + else: + median_data = 
median_computer.compute_median() + median_computer.cleanup() + return median_data + + class DiskAppendableArray: """ Creates a temporary file to which to append data, in order to perform @@ -232,28 +278,19 @@ def append(self, data): raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") with open(self._filename, "ab") as f: data.tofile(f, sep="") - #f.write(data.tobytes()) self._append_count += 1 def read(self): - """Read the 3-D array into memory, then delete the tempfile""" + """Read the 3-D array into memory""" shp = (self._append_count,) + self._slice_shape with open(self._filename, "rb") as f: output = np.fromfile(f, dtype=self._dtype).reshape(shp) return output - - - def cleanup(self): - Path.unlink(Path(self._filename)) - return class OnDiskMedian: - # TODO: can/should this class inherit directly from tempfile.TemporaryDirectory? - # probably not, because it "has a" tempdir, it is not itself a tempdir - def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None): """ Set up temporary files to perform operations on a stack of 2-D input arrays @@ -366,7 +403,6 @@ def _validate_data(self, data): def cleanup(self): """Remove the temporary files and directory when finished""" - #[arr.cleanup() for arr in self._temp_arrays] self._temp_dir.cleanup() return @@ -423,17 +459,7 @@ def flag_resampled_model_crs( blot = gwcs_blot(median_data, median_wcs, input_model.data.shape, input_model.meta.wcs, pix_ratio) if save_blot: - if make_output_path is None: - raise ValueError("make_output_path must be provided if save_blot is True") - model_path = make_output_path(input_model.meta.filename, suffix='blot') - # for MultiSlitModels, need to insert the slit ID into the filename - if hasattr(input_model, "name") and input_model.name is not None: - model_path = model_path.replace("_blot", f"_{input_model.name.lower()}_blot") - blot_model = _make_blot_model(input_model, blot) - blot_model.meta.filename = model_path - blot_model.save(model_path) - log.info(f"Saved model in {model_path}") - del blot_model + _fileio.save_blot(input_model, blot, make_output_path) # dq flags will be updated in-place _flag_resampled_model_crs(input_model, blot, snr1, snr2, scale1, scale2, backg) @@ -504,10 +530,3 @@ def flag_model_crs(image, blot, snr): match_nans_and_flags(image) log.info(f"{np.count_nonzero(cr_mask)} pixels marked as outliers") - - -def _make_blot_model(input_model, blot): - blot_model = type(input_model)() - blot_model.data = blot - blot_model.update(input_model) - return blot_model \ No newline at end of file From b51c27e88a21323613b5f636fcdd4b7dd6846a99 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 17:37:03 -0400 Subject: [PATCH 23/34] simplified _save_intermediate_output --- changes/8782.outlier_detection.rst | 1 + jwst/outlier_detection/_fileio.py | 36 ++++++++++++------------------ 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst index e63c0cf4f3..1b1e683d66 100644 --- a/changes/8782.outlier_detection.rst +++ b/changes/8782.outlier_detection.rst @@ -1 +1,2 @@ Decrease the amount of file I/O required to compute the median when in_memory is set to False. +Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed. 
\ No newline at end of file diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 6544fcdc70..165b43943b 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -17,22 +17,19 @@ def save_median(median_data, median_wcs, example_model, make_output_path=None): median_model = ImageModel(median_data) median_model.update(example_model) median_model.meta.wcs = median_wcs - output_path = intermediate_output_path(example_model, "median", make_output_path) - _save_to_path(median_model, output_path, suffix="median") + _save_intermediate_output(median_model, "median", make_output_path) def save_drizzled(drizzled_model, make_output_path): expected_tail = "outlier_?2d.fits" suffix = drizzled_model.meta.filename[-len(expected_tail):-5] - output_path = intermediate_output_path(drizzled_model, suffix, make_output_path) - _save_to_path(drizzled_model, output_path, suffix=suffix) + _save_intermediate_output(drizzled_model, suffix, make_output_path) def save_blot(input_model, blot, make_output_path): blot_model = _make_blot_model(input_model, blot) - output_path = intermediate_output_path(input_model, "blot", make_output_path, suffix_to_remove=r"_cal.*") - _save_to_path(blot_model, output_path, suffix="blot") + _save_intermediate_output(blot_model, "blot", make_output_path) def _make_blot_model(input_model, blot): @@ -42,25 +39,20 @@ def _make_blot_model(input_model, blot): return blot_model -def intermediate_output_path(input_model, suffix, make_output_path, suffix_to_remove=r"_outlier.*"): +def _save_intermediate_output(model, suffix, make_output_path): """Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions""" - if hasattr(input_model, "name") and input_model.name is not None: - if "_"+input_model.name.lower()+"_" not in input_model.meta.filename: - replacement = f"_{input_model.name.lower()}" + input_path = model.meta.filename.replace("_outlier_", "_") + + # Add a slit name to the output path for MultiSlitModel input data if it isn't already there + if hasattr(model, "name") and model.name is not None: + if "_"+model.name.lower() not in model.meta.filename: + slit_name = f"{model.name.lower()}" else: - replacement = "" + slit_name = None else: - replacement = "" - basepath = re.sub(suffix_to_remove, replacement, input_model.meta.filename) - - if make_output_path is None: - def make_output_path(basepath, suffix): - return basepath.replace(".fits", f"_{suffix}.fits") - - return make_output_path(basepath, suffix=suffix) - + slit_name = None -def _save_to_path(model, output_path, suffix=""): + output_path = make_output_path(input_path, suffix=suffix, components=slit_name) model.meta.filename = output_path model.save(output_path) - log.info(f"Saved {suffix} model in {output_path}") \ No newline at end of file + log.info(f"Saved {suffix} model in {output_path}") From d1efbe9905f8614821fdd52362cae467d917c52f Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 19:50:28 -0400 Subject: [PATCH 24/34] make file naming easier to understand and hopefully fix bug --- jwst/outlier_detection/_fileio.py | 35 ++++++++++--------- jwst/outlier_detection/coron.py | 3 +- .../outlier_detection_step.py | 2 -- jwst/outlier_detection/tso.py | 3 +- jwst/outlier_detection/utils.py | 13 +++++-- jwst/resample/resample.py | 11 ++---- 6 files changed, 34 insertions(+), 33 deletions(-) diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 165b43943b..e146cc1932 100644 --- 
a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,11 +1,9 @@ -import re import logging -from jwst.datamodels import ImageModel log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) -def save_median(median_data, median_wcs, example_model, make_output_path=None): +def save_median(median_model, make_output_path): ''' Save median if requested by user @@ -14,9 +12,6 @@ def save_median(median_data, median_wcs, example_model, make_output_path=None): median_model : ~jwst.datamodels.ImageModel The median ImageModel or CubeModel to save ''' - median_model = ImageModel(median_data) - median_model.update(example_model) - median_model.meta.wcs = median_wcs _save_intermediate_output(median_model, "median", make_output_path) @@ -27,7 +22,6 @@ def save_drizzled(drizzled_model, make_output_path): def save_blot(input_model, blot, make_output_path): - blot_model = _make_blot_model(input_model, blot) _save_intermediate_output(blot_model, "blot", make_output_path) @@ -40,19 +34,26 @@ def _make_blot_model(input_model, blot): def _save_intermediate_output(model, suffix, make_output_path): - """Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions""" + """ + Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions + + Notes + ----- + self.make_output_path() is updated globally for the step in the main pipeline + to include the asn_id in the output path, so no need to handle it here. + """ + + # make_output_path cannot handle suffix with an underscore inside it, + # e.g. _outlier_s2d.fits, so do a manual string replacement input_path = model.meta.filename.replace("_outlier_", "_") - # Add a slit name to the output path for MultiSlitModel input data if it isn't already there + # Add a slit name to the output path for MultiSlitModel data if not present + components = None if hasattr(model, "name") and model.name is not None: - if "_"+model.name.lower() not in model.meta.filename: - slit_name = f"{model.name.lower()}" - else: - slit_name = None - else: - slit_name = None - - output_path = make_output_path(input_path, suffix=suffix, components=slit_name) + if "_"+model.name.lower() not in input_path: + components = f"{model.name.lower()}" + + output_path = make_output_path(input_path, suffix=suffix, components=components) model.meta.filename = output_path model.save(output_path) log.info(f"Saved {suffix} model in {output_path}") diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index 500ad49602..86d5cef71d 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -27,7 +27,6 @@ def detect_outliers( good_bits, maskpt, snr, - asn_id, make_output_path, ): """ @@ -56,7 +55,7 @@ def detect_outliers( median_model.update(input_model) median_model.meta.wcs = input_model.meta.wcs - save_median(median_model, make_output_path, asn_id) + save_median(median_model, make_output_path) del median_model # Perform outlier detection using statistical comparisons between diff --git a/jwst/outlier_detection/outlier_detection_step.py b/jwst/outlier_detection/outlier_detection_step.py index f18fb054e3..d11334fd0e 100644 --- a/jwst/outlier_detection/outlier_detection_step.py +++ b/jwst/outlier_detection/outlier_detection_step.py @@ -94,7 +94,6 @@ def process(self, input_data): self.maskpt, self.rolling_window_width, snr1, - asn_id, self.make_output_path, ) elif mode == 'coron': @@ -104,7 +103,6 @@ def process(self, input_data): self.good_bits, self.maskpt, snr1, - 
asn_id, self.make_output_path, ) elif mode == 'imaging': diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index 58aedd19b0..d0161dffe7 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -22,7 +22,6 @@ def detect_outliers( maskpt, rolling_window_width, snr, - asn_id, make_output_path, ): """ @@ -53,7 +52,7 @@ def detect_outliers( median_model = dm.CubeModel(data=medians) with dm.open(weighted_cube) as dm0: median_model.update(dm0) - save_median(median_model, make_output_path, asn_id) + save_median(median_model, make_output_path) del median_model # no need for blotting, resample is turned off for TSO diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 5eaddeba5b..7ffc497ee9 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -3,6 +3,7 @@ """ import copy +from functools import partial import warnings import numpy as np import tempfile @@ -118,7 +119,10 @@ def median_without_resampling(input_models, if save_intermediate_results: # Save median model to fits - _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs + _fileio.save_median(median_model, make_output_path) del drizzled_model return median_data, median_wcs @@ -180,7 +184,12 @@ def median_with_resampling(input_models, if save_intermediate_results: # Save median model to fits - _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs + # drizzled model already contains asn_id + make_output_path = partial(make_output_path, asn_id=None) + _fileio.save_median(median_model, make_output_path) del drizzled_model return median_data, median_wcs diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index 795ecef9a1..7d70ae840d 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -274,14 +274,9 @@ def resample_group(self, input_models, indices): output_type = example_image.meta.filename[indx:] output_root = '_'.join(example_image.meta.filename.replace( output_type, '').split('_')[:-1]) - if self.asn_id is not None: - output_model.meta.filename = ( - f'{output_root}_{self.asn_id}_' - f'{self.intermediate_suffix}{output_type}') - else: - output_model.meta.filename = ( - f'{output_root}_' - f'{self.intermediate_suffix}{output_type}') + output_model.meta.filename = ( + f'{output_root}_' + f'{self.intermediate_suffix}{output_type}') input_models.shelve(example_image, indices[0], modify=False) del example_image From b329a00b20fcdfb0933c5c996a7adde10b23a038 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 09:14:27 -0400 Subject: [PATCH 25/34] remove need to pass asn_id around the step --- jwst/outlier_detection/coron.py | 2 ++ jwst/outlier_detection/imaging.py | 2 -- jwst/outlier_detection/outlier_detection_step.py | 8 +++----- jwst/outlier_detection/spec.py | 2 -- jwst/outlier_detection/tso.py | 2 ++ 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index 86d5cef71d..e1676de58f 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -3,6 +3,7 @@ """ +from functools import partial import logging import numpy as np @@ -55,6 +56,7 @@ def detect_outliers( 
median_model.update(input_model) median_model.meta.wcs = input_model.meta.wcs + make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 37f807b8bf..1a8511b29c 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -37,7 +37,6 @@ def detect_outliers( fillval, allowed_memory, in_memory, - asn_id, make_output_path, ): """ @@ -70,7 +69,6 @@ def detect_outliers( fillval=fillval, good_bits=good_bits, in_memory=in_memory, - asn_id=asn_id, allowed_memory=allowed_memory, ) median_data, median_wcs = median_with_resampling(input_models, diff --git a/jwst/outlier_detection/outlier_detection_step.py b/jwst/outlier_detection/outlier_detection_step.py index d11334fd0e..1ffdbf6af4 100644 --- a/jwst/outlier_detection/outlier_detection_step.py +++ b/jwst/outlier_detection/outlier_detection_step.py @@ -80,8 +80,7 @@ def process(self, input_data): self.log.info(f"Outlier Detection mode: {mode}") # determine the asn_id (if not set by the pipeline) - asn_id = self._get_asn_id(input_data) - self.log.info(f"Outlier Detection asn_id: {asn_id}") + self._get_asn_id(input_data) snr1, snr2 = [float(v) for v in self.snr.split()] scale1, scale2 = [float(v) for v in self.scale.split()] @@ -123,7 +122,6 @@ def process(self, input_data): self.fillval, self.allowed_memory, self.in_memory, - asn_id, self.make_output_path, ) elif mode == 'spec': @@ -143,7 +141,6 @@ def process(self, input_data): self.kernel, self.fillval, self.in_memory, - asn_id, self.make_output_path, ) elif mode == 'ifu': @@ -225,7 +222,8 @@ def _get_asn_id(self, input_models): _make_output_path, asn_id=asn_id ) - return asn_id + self.log.info(f"Outlier Detection asn_id: {asn_id}") + return def _set_status(self, input_models, status): # this might be called with the input which might be a filename or path diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index 23d06ac88c..ad8e3db1c1 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -34,7 +34,6 @@ def detect_outliers( kernel, fillval, in_memory, - asn_id, make_output_path, ): """ @@ -69,7 +68,6 @@ def detect_outliers( fillval=fillval, good_bits=good_bits, in_memory=in_memory, - asn_id=asn_id, ) median_data, median_wcs = median_with_resampling( diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index d0161dffe7..95de86f82f 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -1,3 +1,4 @@ +from functools import partial import numpy as np from jwst.resample.resample_utils import build_mask @@ -52,6 +53,7 @@ def detect_outliers( median_model = dm.CubeModel(data=medians) with dm.open(weighted_cube) as dm0: median_model.update(dm0) + make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model From 6b0ade72f3680dc39e305312f07ef2b123460882 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 09:57:13 -0400 Subject: [PATCH 26/34] fix indeterminate ordering of asn_id and slit_id --- jwst/outlier_detection/_fileio.py | 5 +-- jwst/outlier_detection/tests/test_fileio.py | 41 +++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 jwst/outlier_detection/tests/test_fileio.py diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index e146cc1932..958e8a8473 100644 --- 
a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -48,12 +48,11 @@ def _save_intermediate_output(model, suffix, make_output_path): input_path = model.meta.filename.replace("_outlier_", "_") # Add a slit name to the output path for MultiSlitModel data if not present - components = None if hasattr(model, "name") and model.name is not None: if "_"+model.name.lower() not in input_path: - components = f"{model.name.lower()}" + suffix = f"{model.name.lower()}_{suffix}" - output_path = make_output_path(input_path, suffix=suffix, components=components) + output_path = make_output_path(input_path, suffix=suffix) model.meta.filename = output_path model.save(output_path) log.info(f"Saved {suffix} model in {output_path}") diff --git a/jwst/outlier_detection/tests/test_fileio.py b/jwst/outlier_detection/tests/test_fileio.py new file mode 100644 index 0000000000..8a114bb257 --- /dev/null +++ b/jwst/outlier_detection/tests/test_fileio.py @@ -0,0 +1,41 @@ +import pytest +from jwst.outlier_detection._fileio import _save_intermediate_output +from jwst.datamodels import ImageModel +from jwst.step import OutlierDetectionStep +import os +import numpy as np +from functools import partial + + +SLIT_ID = "slit007" +ASN_ID = "a008" +INPUT_FILENAME = "foo_outlier_s2d.fits" +SUFFIX = "test" + +@pytest.fixture(scope="module") +def model(): + model = ImageModel(data=np.zeros((10,10))) + model.meta.filename = INPUT_FILENAME + return model + + +@pytest.fixture(scope="module") +def step(): + return OutlierDetectionStep() + + +@pytest.mark.parametrize("asn_id", [None, ASN_ID]) +@pytest.mark.parametrize("slit_id", [None, SLIT_ID]) +def test_save(tmp_cwd, model, step, asn_id, slit_id): + + this_model = model.copy() + if slit_id is not None: + this_model.name = slit_id + make_output_path = step.make_output_path + make_output_path = partial(make_output_path, asn_id=asn_id) + _save_intermediate_output(this_model, SUFFIX, make_output_path) + + stem = model.meta.filename.split("_")[0] + inputs = [val for val in [stem, asn_id, slit_id, SUFFIX] if val is not None] + expected_filename = "_".join(inputs)+".fits" + assert os.path.isfile(expected_filename) \ No newline at end of file From 4db72e374feea7bbdaf5f1769a4fcd0fe831d40c Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 10:08:57 -0400 Subject: [PATCH 27/34] missed one suggested change, updated changelog --- changes/8782.outlier_detection.rst | 3 ++- jwst/outlier_detection/_fileio.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst index 1b1e683d66..55df086d02 100644 --- a/changes/8782.outlier_detection.rst +++ b/changes/8782.outlier_detection.rst @@ -1,2 +1,3 @@ Decrease the amount of file I/O required to compute the median when in_memory is set to False. -Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed. \ No newline at end of file +Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed. +Add association IDs to intermediate filenames. 
\ No newline at end of file diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 958e8a8473..732f9b34fc 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -53,6 +53,5 @@ def _save_intermediate_output(model, suffix, make_output_path): suffix = f"{model.name.lower()}_{suffix}" output_path = make_output_path(input_path, suffix=suffix) - model.meta.filename = output_path model.save(output_path) log.info(f"Saved {suffix} model in {output_path}") From 8e4ae1684f5a65aeaea2e403370795e459f729f3 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 10:31:46 -0400 Subject: [PATCH 28/34] fix median filenames for tso and coron modes --- jwst/outlier_detection/coron.py | 1 - jwst/outlier_detection/tso.py | 1 - 2 files changed, 2 deletions(-) diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index e1676de58f..8d24d50df4 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -56,7 +56,6 @@ def detect_outliers( median_model.update(input_model) median_model.meta.wcs = input_model.meta.wcs - make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index 95de86f82f..9eedb8ea4b 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -53,7 +53,6 @@ def detect_outliers( median_model = dm.CubeModel(data=medians) with dm.open(weighted_cube) as dm0: median_model.update(dm0) - make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model From 71cca05c64010ecf5d7ccb6aa60d596da5088ead Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 10:32:52 -0400 Subject: [PATCH 29/34] remove unused functools imports --- jwst/outlier_detection/coron.py | 2 -- jwst/outlier_detection/tso.py | 1 - 2 files changed, 3 deletions(-) diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index 8d24d50df4..9a3bf9e984 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -2,8 +2,6 @@ Submodule for performing outlier detection on coronagraphy data. 
""" - -from functools import partial import logging import numpy as np diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index 9eedb8ea4b..d0161dffe7 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -1,4 +1,3 @@ -from functools import partial import numpy as np from jwst.resample.resample_utils import build_mask From bb78cf7421922b5f3fac4a7dfef64319dd0fd0d4 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 12:42:14 -0400 Subject: [PATCH 30/34] small changes from self review --- jwst/outlier_detection/imaging.py | 3 --- jwst/outlier_detection/outlier_detection_step.py | 3 +++ jwst/outlier_detection/spec.py | 1 - jwst/outlier_detection/tests/test_fileio.py | 7 +++---- jwst/outlier_detection/tests/test_utils.py | 6 ++---- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 1a8511b29c..0dacd5ec43 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -55,12 +55,9 @@ def detect_outliers( record_step_status(input_models, "outlier_detection", False) return input_models - # Start by creating resampled/mosaic images for - # each group of exposures if resample_data: resamp = resample.ResampleData( input_models, - output=None, single=True, blendheaders=False, wht_type=weight_type, diff --git a/jwst/outlier_detection/outlier_detection_step.py b/jwst/outlier_detection/outlier_detection_step.py index 1ffdbf6af4..5ecc90a231 100644 --- a/jwst/outlier_detection/outlier_detection_step.py +++ b/jwst/outlier_detection/outlier_detection_step.py @@ -198,6 +198,9 @@ def _guess_mode(self, input_models): return None def _get_asn_id(self, input_models): + """Find association ID for any allowed input model type, + and update make_output_path such that the association ID + is included in intermediate and output file names.""" # handle if input_models isn't open if isinstance(input_models, (str, dict)): input_models = datamodels.open(input_models, asn_n_members=1) diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index ad8e3db1c1..ad94551976 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -59,7 +59,6 @@ def detect_outliers( # each group of exposures resamp = resample_spec.ResampleSpecData( input_models, - output=None, single=True, blendheaders=False, wht_type=weight_type, diff --git a/jwst/outlier_detection/tests/test_fileio.py b/jwst/outlier_detection/tests/test_fileio.py index 8a114bb257..6e01557c8e 100644 --- a/jwst/outlier_detection/tests/test_fileio.py +++ b/jwst/outlier_detection/tests/test_fileio.py @@ -20,18 +20,17 @@ def model(): @pytest.fixture(scope="module") -def step(): - return OutlierDetectionStep() +def make_output_path(): + return OutlierDetectionStep().make_output_path @pytest.mark.parametrize("asn_id", [None, ASN_ID]) @pytest.mark.parametrize("slit_id", [None, SLIT_ID]) -def test_save(tmp_cwd, model, step, asn_id, slit_id): +def test_save(tmp_cwd, model, make_output_path, asn_id, slit_id): this_model = model.copy() if slit_id is not None: this_model.name = slit_id - make_output_path = step.make_output_path make_output_path = partial(make_output_path, asn_id=asn_id) _save_intermediate_output(this_model, SUFFIX, make_output_path) diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index ff62440890..d1fc0bd576 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ 
b/jwst/outlier_detection/tests/test_utils.py
@@ -5,9 +5,6 @@
 import os
 
 
-# filter warnings such that "ResourceWarning: Implictly cleaning up" is raised as error
-# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException
-@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array(tmp_cwd):
 
     slice_shape = (8,7)
@@ -49,7 +46,6 @@ def test_disk_appendable_array(tmp_cwd):
     assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True)
 
 
-@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array_bad_inputs(tmp_cwd):
 
     slice_shape = (8,7)
@@ -77,6 +73,8 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd):
         DiskAppendableArray(slice_shape, "float3", tempdir)
 
 
+# filter warnings such that "ResourceWarning: Implicitly cleaning up" is raised as error
+# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException
 @pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_on_disk_median(tmp_cwd):
 
From adb1ae1d73af05a04d1979f343a100d7cd2af67c Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Thu, 26 Sep 2024 10:19:17 -0400
Subject: [PATCH 31/34] fixed comment and updated changelog entry in response
 to @braingram final review

---
 changes/8782.outlier_detection.0.rst | 1 +
 changes/8782.outlier_detection.1.rst | 1 +
 changes/8782.outlier_detection.rst   | 3 ---
 jwst/outlier_detection/_fileio.py    | 4 ++--
 4 files changed, 4 insertions(+), 5 deletions(-)
 create mode 100644 changes/8782.outlier_detection.0.rst
 create mode 100644 changes/8782.outlier_detection.1.rst
 delete mode 100644 changes/8782.outlier_detection.rst

diff --git a/changes/8782.outlier_detection.0.rst b/changes/8782.outlier_detection.0.rst
new file mode 100644
index 0000000000..469e927e44
--- /dev/null
+++ b/changes/8782.outlier_detection.0.rst
@@ -0,0 +1 @@
+Decrease the amount of file I/O required to compute the median when in_memory is set to False.
\ No newline at end of file
diff --git a/changes/8782.outlier_detection.1.rst b/changes/8782.outlier_detection.1.rst
new file mode 100644
index 0000000000..a1f9bd8e55
--- /dev/null
+++ b/changes/8782.outlier_detection.1.rst
@@ -0,0 +1 @@
+Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed.
\ No newline at end of file
diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst
deleted file mode 100644
index 55df086d02..0000000000
--- a/changes/8782.outlier_detection.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-Decrease the amount of file I/O required to compute the median when in_memory is set to False.
-Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed.
-Add association IDs to intermediate filenames.
\ No newline at end of file
diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py
index 732f9b34fc..25dc3a26bd 100644
--- a/jwst/outlier_detection/_fileio.py
+++ b/jwst/outlier_detection/_fileio.py
@@ -43,8 +43,8 @@ def _save_intermediate_output(model, suffix, make_output_path):
     to include the asn_id in the output path, so no need to handle it here.
     """
 
-    # make_output_path cannot handle suffix with an underscore inside it,
-    # e.g. 
_outlier_s2d.fits, so do a manual string replacement + # outlier_?2d is not a known suffix, and make_output_path cannot handle an + # underscore in an unknown suffix, so do a manual string replacement input_path = model.meta.filename.replace("_outlier_", "_") # Add a slit name to the output path for MultiSlitModel data if not present From 7c6492051f5601f1ae43391b0cc88bf7455167bc Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 26 Sep 2024 13:18:56 -0400 Subject: [PATCH 32/34] remove unnecessary use of global --- jwst/outlier_detection/utils.py | 38 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 7ffc497ee9..a87a863bd3 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -112,7 +112,9 @@ def median_without_resampling(input_models, # write the drizzled model to file _fileio.save_drizzled(drizzled_model, make_output_path) - median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) + if i == 0: + median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) + _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -177,7 +179,9 @@ def median_with_resampling(input_models, # write the drizzled model to file _fileio.save_drizzled(drizzled_model, make_output_path) - median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) + if i == 0: + median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) + _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -195,20 +199,22 @@ def median_with_resampling(input_models, return median_data, median_wcs -def _append_to_median_computer(idx, drizzled_model, ngroups, maskpt, in_memory, buffer_size): - - if idx == 0: - full_shape = (ngroups,) + drizzled_model.data.shape - global median_computer - if in_memory: - # allocate memory for data arrays that go into median - median_computer = np.empty(full_shape, dtype=np.float32) - else: - # set up temporary storage for data arrays that go into median - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=buffer_size) +def _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size): + + full_shape = (ngroups,) + drizzled_model.data.shape + if in_memory: + # allocate memory for data arrays that go into median + median_computer = np.empty(full_shape, dtype=np.float32) + else: + # set up temporary storage for data arrays that go into median + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=buffer_size) + return median_computer + +def _append_to_median_computer(median_computer, idx, drizzled_model, maskpt, in_memory): + # handle the weights right away, so only data array needs to be saved weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan @@ -220,8 +226,6 @@ def _append_to_median_computer(idx, drizzled_model, ngroups, maskpt, in_memory, # distribute the drizzled data into the temporary storage median_computer.add_image(drizzled_model.data) - 
return median_computer - def _evaluate_median_computer(median_computer, in_memory): if in_memory: From 9c0f0afb810cd719b96fbc96955ebb11af720e40 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 26 Sep 2024 15:31:32 -0400 Subject: [PATCH 33/34] make median computer helper functions agnostic to input data --- jwst/outlier_detection/utils.py | 35 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index a87a863bd3..03446aac5e 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -113,8 +113,13 @@ def median_without_resampling(input_models, _fileio.save_drizzled(drizzled_model, make_output_path) if i == 0: - median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) - _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) + input_shape = (ngroups,)+drizzled_model.data.shape + dtype = drizzled_model.data.dtype + median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype) + + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory) # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -180,8 +185,14 @@ def median_with_resampling(input_models, _fileio.save_drizzled(drizzled_model, make_output_path) if i == 0: - median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) - _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) + input_shape = (ngroups,)+drizzled_model.data.shape + dtype = drizzled_model.data.dtype + median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype) + + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory) + # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -199,32 +210,26 @@ def median_with_resampling(input_models, return median_data, median_wcs -def _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size): +def _make_median_computer(full_shape, in_memory, buffer_size, dtype): - full_shape = (ngroups,) + drizzled_model.data.shape if in_memory: # allocate memory for data arrays that go into median median_computer = np.empty(full_shape, dtype=np.float32) else: # set up temporary storage for data arrays that go into median median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, + dtype=dtype, buffer_size=buffer_size) return median_computer -def _append_to_median_computer(median_computer, idx, drizzled_model, maskpt, in_memory): - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - +def _append_to_median_computer(median_computer, idx, data, in_memory): if in_memory: # populate pre-allocated memory with the drizzled data - median_computer[idx] = drizzled_model.data + median_computer[idx] = data else: # distribute the drizzled data into the temporary storage - 
median_computer.add_image(data)

From 2d5281c911ef61de02f0a3706672c5b90f35f6f2 Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Fri, 27 Sep 2024 09:30:20 -0400
Subject: [PATCH 34/34] updated memory model section of outlier detection imaging docs

---
 .../outlier_detection/outlier_detection_imaging.rst | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/jwst/outlier_detection/outlier_detection_imaging.rst b/docs/jwst/outlier_detection/outlier_detection_imaging.rst
index c2cd6318b9..a6a6ec78e7 100644
--- a/docs/jwst/outlier_detection/outlier_detection_imaging.rst
+++ b/docs/jwst/outlier_detection/outlier_detection_imaging.rst
@@ -58,7 +58,8 @@ Specifically, this routine performs the following operations:
   should be used when resampling to create the output mosaic.
   Any pixel with a DQ value not included in this value (or list of values)
   will be ignored when resampling.
-  * Resampled images will be written out to disk with the suffix ``_<asn_id>_outlier_i2d.fits``
+  * When the ``save_intermediate_results`` parameter is set to True,
+    resampled images will be written out to disk with the suffix ``_<asn_id>_outlier_i2d.fits``
     if the input model container has an <asn_id>, otherwise the suffix will be
     ``_outlier_i2d.fits`` by default.
 * **If resampling is turned off** through the use of the ``resample_data`` parameter,
@@ -162,9 +163,12 @@ during processing includes:
   :py:class:`~jwst.resample.ResampleStep` as well, to set whether or not
   to keep the resampled images in memory or not.
 
-#. Computing the median image works section-by-section by only keeping 1Mb of each input
-   in memory at a time.  As a result, only the final output product array for the final
-   median image along with a stack of 1Mb image sections are kept in memory.
+#. Computing the median image works by writing the resampled data frames to appendable files
+   on disk that are split into sections spatially but contain the entire ngroups (i.e., time)
+   axis. The section size is set to use roughly the same amount of memory as a single resampled
+   model, and since the resampled models are discarded from memory after this write operation,
+   this choice avoids increasing the memory usage beyond that of a single resampled model.
+   Those sections are then read in one at a time to compute the median image.
 
 These changes result in a minimum amount of memory usage during processing at
 the obvious expense of reading and writing the products from disk.
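To make the section-based median described in the docs above concrete, here is a minimal, self-contained sketch of the idea. This is an illustration only, not the jwst DiskAppendableArray/OnDiskMedian API; all names below are invented for the example. Each 2-D frame is split into spatial sections, each section is appended to its own on-disk binary file, and the median is then computed one section stack at a time, so at most one full frame (or one section stack) is held in memory at once:

    import tempfile
    from pathlib import Path

    import numpy as np


    def append_sections(frame, section_files, rows_per_section):
        """Append each horizontal section of a 2-D frame to its own binary file."""
        for path, start in zip(section_files, range(0, frame.shape[0], rows_per_section)):
            with open(path, "ab") as f:
                frame[start:start + rows_per_section].tofile(f)


    def median_from_sections(section_files, nframes, ncols, dtype):
        """Read back one (nframes, rows, ncols) section stack at a time and median it."""
        medians = []
        for path in section_files:
            stack = np.fromfile(path, dtype=dtype).reshape(nframes, -1, ncols)
            medians.append(np.nanmedian(stack, axis=0))
        return np.concatenate(medians, axis=0)


    rng = np.random.default_rng(0)
    frames = rng.normal(size=(5, 8, 6)).astype(np.float32)
    frames[0, 2, 3] = np.nan  # low-weight pixels are NaN-masked before the median

    with tempfile.TemporaryDirectory() as tmpdir:
        rows_per_section = 4  # in the real step this is sized from a memory budget
        files = [Path(tmpdir) / f"section{i}.bin"
                 for i in range(frames.shape[1] // rows_per_section)]
        for frame in frames:  # only one frame is in memory at a time
            append_sections(frame, files, rows_per_section)
        median = median_from_sections(files, len(frames), frames.shape[2], np.float32)

    assert np.allclose(median, np.nanmedian(frames, axis=0), equal_nan=True)

The real implementation adds buffered writes, shape/dtype validation, and cleanup of the temporary files, but the memory behavior is the same: disk usage grows with ngroups while memory stays near the size of a single resampled frame.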