From 45f628171abb9268d9410cc7f50939603ca84109 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 11 Sep 2024 16:59:51 -0400 Subject: [PATCH 01/34] draft implementation using the TempArrayHandler --- .../tests/test_outlier_detection.py | 12 +- jwst/outlier_detection/utils.py | 251 +++++++++--------- 2 files changed, 138 insertions(+), 125 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index ebd732437b..c2e695eab4 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -600,8 +600,18 @@ def test_create_median(three_sci_as_asn, tmp_cwd): lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) + ## make this test meaningful w.r.t. handling of weights + with (lib_on_disk, lib_in_memory): + for lib in [lib_on_disk, lib_in_memory]: + for model in lib: + model.wht = np.ones_like(model.data) + model.wht[4,9] = 0.5 + lib.shelve(model, modify=True) + median_on_disk = create_median(lib_on_disk, 0.7) median_in_memory = create_median(lib_in_memory, 0.7) + assert np.isnan(median_in_memory[4,9]) + # Make sure the median library is the same for on-disk and in-memory - assert np.allclose(median_on_disk, median_in_memory) + assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 4abdfbad1b..03548b16d3 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -1,9 +1,10 @@ """ The ever-present utils sub-module. A home for all... """ -import warnings import numpy as np +import tempfile +from pathlib import Path from jwst.lib.pipe_utils import match_nans_and_flags from jwst.resample.resample import compute_image_pixel_area @@ -20,6 +21,112 @@ _ONE_MB = 1 << 20 +# not inheriting from MutableSequence here as insert is complicated +class TempArrayHandler: + def __init__(self, tempdir=""): + self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) + self._temp_path = Path(self._temp_dir.name) + self._filenames = [] + self._data_shape = None + self._data_dtype = None + + @property + def closed(self): + return not hasattr(self, "_temp_dir") + + def close(self): + if self.closed: + return + self._temp_dir.cleanup() + del self._temp_dir + + def __del__(self): + self.close() + + def __len__(self): + if self.closed: + raise Exception("use after close") + return len(self._filenames) + + def __getitem__(self, index): + if self.closed: + raise Exception("use after close") + fn = self._filenames[index] + return np.load(fn) + + def _validate_input(self, arr): + if arr.ndim != 2: + raise Exception(f"Only 2D arrays are supported: {arr.ndim}") + if self._data_shape is None: + self._data_shape = arr.shape + else: + if arr.shape != self._data_shape: + raise Exception( + f"Input shape mismatch: {arr.shape} != {self._data_shape}" + ) + if self._data_dtype is None: + self._data_dtype = arr.dtype + else: + if arr.dtype != self._data_dtype: + raise Exception( + f"Input dtype mismatch: {arr.dtype} != {self._data_dtype}" + ) + + def __setitem__(self, index, value): + self._validate_input(value) + if self.closed: + raise Exception("use after close") + fn = self._filenames[index] + if fn is None: + fn = self._temp_path / f"{index}.npy" + np.save(fn, value, False) + self._filenames[index] = fn + + def append(self, value): + if self.closed: + raise Exception("use after close") + index = 
len(self) + self._filenames.append(None) + self.__setitem__(index, value) + + def median(self, buffer_size=100 << 20): + if self.closed: + raise Exception("use after close") + if not len(self): + raise Exception("can't take median of empty list") + + # figure out how big the buffer can be + n_arrays = len(self) + allowed_memory_per_array = buffer_size // n_arrays + + n_dim_1 = allowed_memory_per_array // ( + self._data_dtype.itemsize * self._data_shape[0] + ) + if n_dim_1 < 1: + # TODO more useful error message + raise Exception("Not enough memory") + if n_dim_1 >= self._data_shape[1]: + return np.nanmedian(self, axis=0) + + buffer = np.empty( + (n_arrays, self._data_shape[0], n_dim_1), dtype=self._data_dtype + ) + median = np.empty(self._data_shape, dtype=self._data_dtype) + + e = n_dim_1 + slices = [slice(0, e)] + while e <= self._data_shape[1]: + s = e + e += n_dim_1 + slices.append(slice(s, min(e, self._data_shape[1]))) + + for s in slices: + for i, arr in enumerate(self): + buffer[i, :, : (s.stop - s.start)] = arr[:, s] + median[:, s] = np.nanmedian(buffer[:, :, : (s.stop - s.start)], axis=0) + return median + + def create_cube_median(cube_model, maskpt): log.info("Computing median") @@ -55,136 +162,32 @@ def create_median(resampled_models, maskpt, on_disk=True, buffer_size=10.0): median_image : ndarray The median image. """ + # initialize storage for median computation + if on_disk: + # harder case: need special on-disk numpy array handling + model_list = TempArrayHandler() + else: + # easier case: all models in library can be loaded into memory at once + model_list = [] + # Compute the weight threshold for each input model - weight_thresholds = [] with resampled_models: for resampled in resampled_models: weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - weight_thresholds.append(weight_threshold) - resampled_models.shelve(resampled, modify=False) - - # compute median over all models + mask = np.less(resampled.wht, weight_threshold) + resampled.data[mask] = np.nan + model_list.append(resampled.data) + # this is still modified if on_disk is False, but doesn't matter because never used again + resampled_models.shelve(resampled, modify=False) + del resampled + if not on_disk: - # easier case: all models in library can be loaded into memory at once - model_list = [] - with resampled_models: - for resampled in resampled_models: - model_list.append(resampled.data) - resampled_models.shelve(resampled, modify=False) - del resampled return np.nanmedian(np.array(model_list), axis=0) else: - # set up buffered access to all input models - with resampled_models: - example_model = resampled_models.borrow(0) - shp = example_model.data.shape - dtype = example_model.data.dtype - nsections, section_nrows = _compute_buffer_indices(example_model, buffer_size) - resampled_models.shelve(example_model, modify=False) - del example_model - - # get spatial sections of library and compute timewise median, one by one - resampled_sections = _get_sections(resampled_models, nsections, section_nrows, shp[0]) - median_image_empty = np.empty(shp, dtype) * np.nan - return _create_median(resampled_sections, resampled_models, weight_thresholds, median_image_empty) - - -def _get_sections(library, nsections, section_nrows, imrows): - """Iterator to return sections from a ModelLibrary. - - Parameters - ---------- - library : ModelLibrary - The input data models. 
- - nsections : int - The number of spatial sections in each model - - section_nrows : int - The number of rows in each section - - imrows : int - The total number of rows in the image - - Yields - ------ - data_subset : ndarray - array of shape (len(library), section_nrows, ncols) representing a spatial - subset of all the data arrays in the library - - weight_subset : ndarray - weights corresponding to data_list - - row_range : tuple - The range of rows in the image covered by the data arrays - """ - with library: - example_model = library.borrow(0) - dtype = example_model.data.dtype - dtype_wht = example_model.wht.dtype - shp = example_model.data.shape - library.shelve(example_model, 0, modify=False) - del example_model - for i in range(nsections): - row1 = i * section_nrows - row2 = min(row1 + section_nrows, imrows) - - data_subset = np.empty((len(library), row2 - row1, shp[1]), dtype) - weight_subset = np.empty((len(library), row2 - row1, shp[1]), dtype_wht) - with library: - for j, model in enumerate(library): - data_subset[j] = model.data[row1:row2] - weight_subset[j] = model.wht[row1:row2] - library.shelve(model, j, modify=False) - del model - yield (data_subset, weight_subset, (row1, row2)) - - -def _compute_buffer_indices(model, buffer_size=None): - - imrows, imcols = model.data.shape - data_item_size = model.data.itemsize - min_buffer_size = imcols * data_item_size - buffer_size = min_buffer_size if buffer_size is None else (buffer_size * _ONE_MB) - section_nrows = min(imrows, int(buffer_size // min_buffer_size)) - if section_nrows == 0: - buffer_size = min_buffer_size - log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB") - section_nrows = 1 - nsections = int(np.ceil(imrows / section_nrows)) - return nsections, section_nrows - - -def _create_median(resampled_sections, resampled_models, weight_thresholds, median_image_empty): - median_image = median_image_empty - for (resampled_sci, resampled_weight, (row1, row2)) in resampled_sections: - # Create a mask for each input image, masking out areas where there is - # no data or the data has very low weight - badmasks = [] - for weight, weight_threshold in zip(resampled_weight, weight_thresholds): - badmask = np.less(weight, weight_threshold) - log.debug("Percentage of pixels with low weight: {}".format( - np.sum(badmask) / len(weight.flat) * 100)) - badmasks.append(badmask) - - # Fill resampled_sci array with nan's where mask values are True - for f1, f2 in zip(resampled_sci, badmasks): - for elem1, elem2 in zip(f1, f2): - elem1[elem2] = np.nan - - del badmasks - - # For a stack of images with "bad" data replaced with Nan - # use np.nanmedian to compute the median. 
- with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - median_image[row1:row2] = np.nanmedian(resampled_sci, axis=0) - del resampled_sci, resampled_weight - - return median_image + # compute median using on-disk arrays + median_data = model_list.median(buffer_size=buffer_size*_ONE_MB) + model_list.close() + return median_data def flag_crs_in_models( From a08277f3cd4e5ef64f530281af7a29a93637fb18 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 12:35:20 -0400 Subject: [PATCH 02/34] added nicer errors and type hints to TempArrayHandler --- jwst/outlier_detection/utils.py | 52 +++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 03548b16d3..a67a33ce08 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -21,9 +21,19 @@ _ONE_MB = 1 << 20 +class TempArrayHandlerError(Exception): + """Generic error for TempArrayHandler.""" + pass + +class UseAfterCloseError(TempArrayHandlerError): + def __init__(self, msg="Attempted use after close", *args, **kwargs): + super().__init__(msg, *args, **kwargs) + # not inheriting from MutableSequence here as insert is complicated class TempArrayHandler: - def __init__(self, tempdir=""): + """Handler for operations on a list of 2-D numpy arrays that are too large + to all fit in memory at once.""" + def __init__(self, tempdir: str=""): self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) self._filenames = [] @@ -43,57 +53,57 @@ def close(self): def __del__(self): self.close() - def __len__(self): + def __len__(self) -> int: if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() return len(self._filenames) - def __getitem__(self, index): + def __getitem__(self, index: int) -> np.ndarray: if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() fn = self._filenames[index] return np.load(fn) - def _validate_input(self, arr): + def _validate_input(self, arr: np.ndarray): if arr.ndim != 2: - raise Exception(f"Only 2D arrays are supported: {arr.ndim}") + raise TempArrayHandlerError(f"Only 2D arrays are supported: {arr.ndim}") if self._data_shape is None: self._data_shape = arr.shape else: if arr.shape != self._data_shape: - raise Exception( + raise TempArrayHandlerError( f"Input shape mismatch: {arr.shape} != {self._data_shape}" ) if self._data_dtype is None: self._data_dtype = arr.dtype else: if arr.dtype != self._data_dtype: - raise Exception( + raise TempArrayHandlerError( f"Input dtype mismatch: {arr.dtype} != {self._data_dtype}" ) - def __setitem__(self, index, value): + def __setitem__(self, index: int, value: np.ndarray): self._validate_input(value) if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() fn = self._filenames[index] if fn is None: fn = self._temp_path / f"{index}.npy" np.save(fn, value, False) self._filenames[index] = fn - def append(self, value): + def append(self, value: np.ndarray): if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() index = len(self) self._filenames.append(None) self.__setitem__(index, value) - def median(self, buffer_size=100 << 20): + def median(self, buffer_size: int=100 << 20) -> np.ndarray: if self.closed: - raise Exception("use after close") + raise UseAfterCloseError() if not len(self): - raise Exception("can't take 
median of empty list") + raise TempArrayHandlerError("Can't take median of empty list") # figure out how big the buffer can be n_arrays = len(self) @@ -104,9 +114,13 @@ def median(self, buffer_size=100 << 20): ) if n_dim_1 < 1: # TODO more useful error message - raise Exception("Not enough memory") + msg = f"Memory buffer {allowed_memory_per_array} is too small to hold "+ \ + f"a single row of length {self._data_shape[0]}" + raise TempArrayHandlerError(msg) if n_dim_1 >= self._data_shape[1]: return np.nanmedian(self, axis=0) + log.info(f"Computing median in {int(self._data_shape[1]/n_dim_1)} chunks"+ \ + f"of size {(n_dim_1, self._data_shape[0])}") buffer = np.empty( (n_arrays, self._data_shape[0], n_dim_1), dtype=self._data_dtype @@ -139,7 +153,7 @@ def create_cube_median(cube_model, maskpt): return median -def create_median(resampled_models, maskpt, on_disk=True, buffer_size=10.0): +def create_median(resampled_models, maskpt, on_disk=True, buffer_size=1.0): """Create a median image from the singly resampled images. Parameters @@ -185,7 +199,7 @@ def create_median(resampled_models, maskpt, on_disk=True, buffer_size=10.0): return np.nanmedian(np.array(model_list), axis=0) else: # compute median using on-disk arrays - median_data = model_list.median(buffer_size=buffer_size*_ONE_MB) + median_data = model_list.median(buffer_size=int(buffer_size*_ONE_MB)) model_list.close() return median_data From d11b6f93344ff7fb43b42c626ebe6e8b15aacd73 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 16:20:22 -0400 Subject: [PATCH 03/34] first draft of appending to disk --- .../tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/utils.py | 310 ++++++++++-------- 2 files changed, 173 insertions(+), 139 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index c2e695eab4..18270aefc0 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -109,7 +109,7 @@ def we_many_sci( ): """Provide numsci science images with different noise but identical source and same background level""" - shape = (20, 20) + shape = (21, 20) sci1 = datamodels.ImageModel(shape) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index a67a33ce08..d47f43847f 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -2,6 +2,7 @@ The ever-present utils sub-module. A home for all... 
""" +import warnings import numpy as np import tempfile from pathlib import Path @@ -21,124 +22,34 @@ _ONE_MB = 1 << 20 -class TempArrayHandlerError(Exception): - """Generic error for TempArrayHandler.""" - pass - -class UseAfterCloseError(TempArrayHandlerError): - def __init__(self, msg="Attempted use after close", *args, **kwargs): - super().__init__(msg, *args, **kwargs) - -# not inheriting from MutableSequence here as insert is complicated -class TempArrayHandler: - """Handler for operations on a list of 2-D numpy arrays that are too large - to all fit in memory at once.""" - def __init__(self, tempdir: str=""): +class DiskAppendableArray: + + def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) - self._filenames = [] - self._data_shape = None - self._data_dtype = None + self._slice_shape = slice_shape + self._dtype = dtype + self._filename = self._temp_path / Path(filestem + ".bits") + self._append_count = 0 + with open(self._filename, 'wb') as f: + pass @property - def closed(self): - return not hasattr(self, "_temp_dir") - - def close(self): - if self.closed: - return - self._temp_dir.cleanup() - del self._temp_dir - - def __del__(self): - self.close() - - def __len__(self) -> int: - if self.closed: - raise UseAfterCloseError() - return len(self._filenames) - - def __getitem__(self, index: int) -> np.ndarray: - if self.closed: - raise UseAfterCloseError() - fn = self._filenames[index] - return np.load(fn) - - def _validate_input(self, arr: np.ndarray): - if arr.ndim != 2: - raise TempArrayHandlerError(f"Only 2D arrays are supported: {arr.ndim}") - if self._data_shape is None: - self._data_shape = arr.shape - else: - if arr.shape != self._data_shape: - raise TempArrayHandlerError( - f"Input shape mismatch: {arr.shape} != {self._data_shape}" - ) - if self._data_dtype is None: - self._data_dtype = arr.dtype - else: - if arr.dtype != self._data_dtype: - raise TempArrayHandlerError( - f"Input dtype mismatch: {arr.dtype} != {self._data_dtype}" - ) - - def __setitem__(self, index: int, value: np.ndarray): - self._validate_input(value) - if self.closed: - raise UseAfterCloseError() - fn = self._filenames[index] - if fn is None: - fn = self._temp_path / f"{index}.npy" - np.save(fn, value, False) - self._filenames[index] = fn - - def append(self, value: np.ndarray): - if self.closed: - raise UseAfterCloseError() - index = len(self) - self._filenames.append(None) - self.__setitem__(index, value) - - def median(self, buffer_size: int=100 << 20) -> np.ndarray: - if self.closed: - raise UseAfterCloseError() - if not len(self): - raise TempArrayHandlerError("Can't take median of empty list") - - # figure out how big the buffer can be - n_arrays = len(self) - allowed_memory_per_array = buffer_size // n_arrays - - n_dim_1 = allowed_memory_per_array // ( - self._data_dtype.itemsize * self._data_shape[0] - ) - if n_dim_1 < 1: - # TODO more useful error message - msg = f"Memory buffer {allowed_memory_per_array} is too small to hold "+ \ - f"a single row of length {self._data_shape[0]}" - raise TempArrayHandlerError(msg) - if n_dim_1 >= self._data_shape[1]: - return np.nanmedian(self, axis=0) - log.info(f"Computing median in {int(self._data_shape[1]/n_dim_1)} chunks"+ \ - f"of size {(n_dim_1, self._data_shape[0])}") - - buffer = np.empty( - (n_arrays, self._data_shape[0], n_dim_1), dtype=self._data_dtype - ) - median = np.empty(self._data_shape, dtype=self._data_dtype) - - e = 
n_dim_1 - slices = [slice(0, e)] - while e <= self._data_shape[1]: - s = e - e += n_dim_1 - slices.append(slice(s, min(e, self._data_shape[1]))) - - for s in slices: - for i, arr in enumerate(self): - buffer[i, :, : (s.stop - s.start)] = arr[:, s] - median[:, s] = np.nanmedian(buffer[:, :, : (s.stop - s.start)], axis=0) - return median + def shape(self): + return (self._append_count,) + self._slice_shape + + def append(self, data): + if data.shape != self._slice_shape: + raise ValueError(f"Data shape {data.shape} does not match slice shape {self._slice_shape}") + if data.dtype != self._dtype: + raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") + with open(self._filename, 'ab') as f: + f.write(data.tobytes()) + self._append_count += 1 + + def read(self): + shp = (self._append_count,) + self._slice_shape + return np.fromfile(self._filename, dtype=self._dtype).reshape(shp) def create_cube_median(cube_model, maskpt): @@ -153,7 +64,7 @@ def create_cube_median(cube_model, maskpt): return median -def create_median(resampled_models, maskpt, on_disk=True, buffer_size=1.0): +def create_median(resampled_models, maskpt, on_disk=True): """Create a median image from the singly resampled images. Parameters @@ -167,41 +78,164 @@ def create_median(resampled_models, maskpt, on_disk=True, buffer_size=1.0): on_disk : bool If True, the input models are on disk and will be read in chunks. - buffer_size : float - The size of chunk in MB, per input model, that will be read into memory. - This parameter has no effect if on_disk is False. - Returns ------- median_image : ndarray The median image. """ - # initialize storage for median computation - if on_disk: - # harder case: need special on-disk numpy array handling - model_list = TempArrayHandler() - else: - # easier case: all models in library can be loaded into memory at once - model_list = [] - # Compute the weight threshold for each input model + weight_thresholds = [] with resampled_models: for resampled in resampled_models: weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - mask = np.less(resampled.wht, weight_threshold) - resampled.data[mask] = np.nan - model_list.append(resampled.data) - # this is still modified if on_disk is False, but doesn't matter because never used again - resampled_models.shelve(resampled, modify=False) - del resampled - + weight_thresholds.append(weight_threshold) + resampled_models.shelve(resampled, modify=False) + + # compute median over all models if not on_disk: + # easier case: all models in library can be loaded into memory at once + model_list = [] + with resampled_models: + for resampled in resampled_models: + model_list.append(resampled.data) + resampled_models.shelve(resampled, modify=False) + del resampled return np.nanmedian(np.array(model_list), axis=0) else: - # compute median using on-disk arrays - median_data = model_list.median(buffer_size=int(buffer_size*_ONE_MB)) - model_list.close() - return median_data + # set up buffered access to all input models + # get spatial sections of library and compute timewise median, one by one + resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds) + return _create_median(resampled_sections, row_indices) + + +def _write_sections(library, weight_thresholds): + """Iterator to return sections from a ModelLibrary. + + Parameters + ---------- + library : ModelLibrary + The input data models. + + weight_thresholds : list + The weight thresholds for masking out low weight pixels. 
+ + Returns + ------- + temp_arrays : list + A list of DiskAppendableArray objects, each holding a spatial section + of every input model, stacked along the time axis. + + row_indices : list + A list of tuples, each containing the start and end row indices of the + spatial section in the original data model. + """ + + with library: + + # get an example model to determine dtype, shape, buffer indices + example_model = library.borrow(0) + dtype = example_model.data.dtype + shp = example_model.data.shape + itemsize = example_model.data.itemsize + imrows = shp[0] + + # reasonable total buffer size is the size of each input datamodel + # since that is the bare minimum that must be loaded into memory in the first place + # for now just use the size of model.data + total_buffer_size = shp[0] * shp[1] * itemsize + per_model_buffer_size = total_buffer_size / len(library) + nsections, section_nrows = _compute_buffer_indices(shp, itemsize, per_model_buffer_size) + log.info(f"Computing median in {nsections} sections each of size: {total_buffer_size / _ONE_MB} MB") + library.shelve(example_model, 0, modify=False) + del example_model + + # set up temp file handlers for each section + # handle zeroth section separately just to get the tempdir for the rest + slice_shape = (section_nrows, shp[1]) + arr0 = DiskAppendableArray(slice_shape, filestem="section0", dtype=dtype) + tempdir = arr0._temp_dir.name + temp_arrays = [arr0,] + for i in range(1, nsections-1): + arr = DiskAppendableArray(slice_shape, tempdir=tempdir, filestem=f"section{i}", dtype=dtype) + temp_arrays.append(arr) + + # handle the last section separately because it has a different shape + slice_shape_last = (imrows - (nsections-1) * section_nrows, shp[1]) + arrn = DiskAppendableArray(slice_shape_last, tempdir=tempdir, filestem=f"section{nsections-1}", dtype=dtype) + temp_arrays.append(arrn) + + # now append data from each model to all the sections + row_indices = [] + for j, model in enumerate(library): + for i in range(nsections): + row1 = i * section_nrows + row2 = min(row1 + section_nrows, imrows) + if j == 0: + row_indices.append((row1, row2)) + arr = temp_arrays[i] + sci = model.data[row1:row2] + + # handle weight thresholding right here, while array is open + thresh = weight_thresholds[j] + sci[model.wht[row1:row2] < thresh] = np.nan + arr.append(sci) + + library.shelve(model, j, modify=False) + del model + + return temp_arrays, row_indices + + +def _compute_buffer_indices(shape, itemsize, buffer_size): + + imrows, imcols = shape + min_buffer_size = imcols * itemsize + section_nrows = min(imrows, int(buffer_size // min_buffer_size)) + + if section_nrows == 0: + buffer_size = min_buffer_size + log.warning("WARNING: Buffer size is too small to hold a single row." + f"Increasing buffer size to {buffer_size / _ONE_MB}MB") + section_nrows = 1 + + nsections = int(np.ceil(imrows / section_nrows)) + return nsections, section_nrows + + +def _create_median(resampled_sections, row_indices): + """ + Parameters + ---------- + resampled_sections : list + List of DiskAppendableArray objects, each holding a spatial section + of every input model, stacked along the time axis. + + row_indices : list + List of tuples, each containing the start and end row indices of the + spatial section in the original data model. + + Returns + ------- + median_image : ndarray + The median image. 
+ """ + + dtype = resampled_sections[0]._dtype + output_rows = row_indices[-1][1] + output_cols = resampled_sections[0].shape[2] + median_image = np.empty((output_rows, output_cols), dtype) * np.nan + + for i, disk_arr in enumerate(resampled_sections): + row1, row2 = row_indices[i] + arr = disk_arr.read() + with warnings.catch_warnings(): + warnings.filterwarnings(action="ignore", + message="All-NaN slice encountered", + category=RuntimeWarning) + median_image[row1:row2] = np.nanmedian(arr, axis=0) + del arr, disk_arr + + return median_image def flag_crs_in_models( From 131cfecdd86d905078c732b739d93dacc2eb41fb Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 18:12:17 -0400 Subject: [PATCH 04/34] add weight handling, change the way buffer compute works --- jwst/outlier_detection/imaging.py | 13 ++++- jwst/outlier_detection/spec.py | 10 +++- .../tests/test_outlier_detection.py | 7 ++- jwst/outlier_detection/utils.py | 55 ++++++++++++++----- 4 files changed, 66 insertions(+), 19 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 5678b45802..b7817797e2 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -100,7 +100,18 @@ def detect_outliers( input_models.shelve(model, modify=True) # Perform median combination on set of drizzled mosaics - median_data = create_median(drizzled_models, maskpt) + if in_memory: + buffer_size = None + else: + # reasonable buffer size is the size of an input model, since that is + # the smallest theoretical memory footprint + with input_models: + example_model = input_models.borrow(0) + buffer_size = example_model.data.size * example_model.data.itemsize + input_models.shelve(example_model, modify=False) + del example_model + + median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size) if save_intermediate_results: # make a median model diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index c482aad8b9..205c8576bb 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -93,7 +93,15 @@ def detect_outliers( # Perform median combination on set of drizzled mosaics # create_median should be called as a method from parent class - median_data = create_median(drizzled_models, maskpt) + if in_memory: + buffer_size = None + else: + # reasonable buffer size is the size of an input model, since that is + # the smallest theoretical memory footprint + example_model = input_models[0] + buffer_size = example_model.data.size * example_model.data.itemsize + del example_model + median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size) if save_intermediate_results: # Initialize intermediate products used in the outlier detection diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 22c75d1213..61f863029d 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -33,7 +33,7 @@ @pytest.fixture def sci_blot_image_pair(): """Provide a science and blotted ImageModel pair.""" - shape = (20, 20) + shape = (23, 20) sigma = 0.02 background = 3 @@ -608,7 +608,10 @@ def test_create_median(three_sci_as_asn, tmp_cwd): model.wht[4,9] = 0.5 lib.shelve(model, modify=True) - median_on_disk = create_median(lib_on_disk, 0.7) + # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels + # arbitrarily use 5 times that + buffer_size = 4 * 20 * 5 + 
median_on_disk = create_median(lib_on_disk, 0.7, buffer_size=buffer_size) median_in_memory = create_median(lib_in_memory, 0.7) assert np.isnan(median_in_memory[4,9]) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index e546e85116..fa4ee44385 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -31,7 +31,7 @@ def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): self._dtype = dtype self._filename = self._temp_path / Path(filestem + ".bits") self._append_count = 0 - with open(self._filename, 'wb') as f: + with open(self._filename, 'wb') as f: # Noqa: F841 pass @property @@ -64,7 +64,7 @@ def create_cube_median(cube_model, maskpt): return median -def create_median(resampled_models, maskpt): +def create_median(resampled_models, maskpt, buffer_size=None): """Create a median image from the singly resampled images. Parameters @@ -75,12 +75,17 @@ def create_median(resampled_models, maskpt): maskpt : float The weight threshold for masking out low weight pixels. + buffer_size : int + The buffer size for the median computation, units of bytes. + Returns ------- median_image : ndarray The median image. """ on_disk = resampled_models._on_disk + if on_disk and buffer_size is None: + raise ValueError("Buffer size must be provided when resampled models are on disk") # Compute the weight threshold for each input model weight_thresholds = [] @@ -109,11 +114,11 @@ def create_median(resampled_models, maskpt): else: # set up buffered access to all input models # get spatial sections of library and compute timewise median, one by one - resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds) + resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds, buffer_size) return _create_median(resampled_sections, row_indices) -def _write_sections(library, weight_thresholds): +def _write_sections(library, weight_thresholds, buffer_size): """Iterator to return sections from a ModelLibrary. Parameters @@ -124,6 +129,9 @@ def _write_sections(library, weight_thresholds): weight_thresholds : list The weight thresholds for masking out low weight pixels. + buffer_size : int + The buffer size for the median computation, units of bytes. + Returns ------- temp_arrays : list @@ -144,13 +152,8 @@ def _write_sections(library, weight_thresholds): itemsize = example_model.data.itemsize imrows = shp[0] - # reasonable total buffer size is the size of each input datamodel - # since that is the bare minimum that must be loaded into memory in the first place - # for now just use the size of model.data - total_buffer_size = shp[0] * shp[1] * itemsize - per_model_buffer_size = total_buffer_size / len(library) - nsections, section_nrows = _compute_buffer_indices(shp, itemsize, per_model_buffer_size) - log.info(f"Computing median in {nsections} sections each of size: {total_buffer_size / _ONE_MB} MB") + # compute buffer indices + nsections, section_nrows = _compute_buffer_indices((len(library),)+shp, itemsize, buffer_size) library.shelve(example_model, 0, modify=False) del example_model @@ -192,18 +195,40 @@ def _write_sections(library, weight_thresholds): def _compute_buffer_indices(shape, itemsize, buffer_size): + """ + Parameters + ---------- + shape : tuple + The shape of the full input, ie, (n_images, imrows, imcols). + + itemsize : int + The size of a single array element in bytes. + + buffer_size : int + The buffer size for the median computation, units of bytes. 
+ + Returns + ------- + nsections : int + The number of sections to divide the input data into. + + section_nrows : int + The number of rows in each section (except the last one). + """ - imrows, imcols = shape + nimages, imrows, imcols = shape + per_model_buffer_size = buffer_size / nimages min_buffer_size = imcols * itemsize - section_nrows = min(imrows, int(buffer_size // min_buffer_size)) + section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) if section_nrows == 0: - buffer_size = min_buffer_size + buffer_size = min_buffer_size * nimages log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB") + f"Increasing buffer size to {buffer_size / _ONE_MB}MB per model") section_nrows = 1 nsections = int(np.ceil(imrows / section_nrows)) + log.info(f"Computing median over {nimages} images in {nsections} sections with total memory buffer {buffer_size / _ONE_MB} MB") return nsections, section_nrows From a8c4c12d0fbb7d953eee792233cebc13b88a4e04 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 12 Sep 2024 18:18:57 -0400 Subject: [PATCH 05/34] small cleanups from code self-review --- jwst/outlier_detection/tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/utils.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 61f863029d..0806dd42e7 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -33,7 +33,7 @@ @pytest.fixture def sci_blot_image_pair(): """Provide a science and blotted ImageModel pair.""" - shape = (23, 20) + shape = (20, 20) sigma = 0.02 background = 3 diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index fa4ee44385..8928c9b453 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -119,7 +119,8 @@ def create_median(resampled_models, maskpt, buffer_size=None): def _write_sections(library, weight_thresholds, buffer_size): - """Iterator to return sections from a ModelLibrary. + """Write spatial sections from a ModelLibrary into temporary files + grouped along the time axis. Parameters ---------- @@ -224,7 +225,7 @@ def _compute_buffer_indices(shape, itemsize, buffer_size): if section_nrows == 0: buffer_size = min_buffer_size * nimages log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB per model") + f"Increasing buffer size to {buffer_size / _ONE_MB}MB") section_nrows = 1 nsections = int(np.ceil(imrows / section_nrows)) From ac1a932aeea898f92298537d76dbeb8e0f57107f Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 13 Sep 2024 10:10:40 -0400 Subject: [PATCH 06/34] bugfix for failing regtest and additional docstrings --- jwst/outlier_detection/utils.py | 74 +++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 8928c9b453..7ff32642eb 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -23,8 +23,39 @@ class DiskAppendableArray: + """ + Creates a temporary file to which to append data, in order to perform + timewise operations on a stack of input images without holding all of them + in memory. 
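+
+    A minimal usage sketch (the shapes, the temp directory, and the `images`
+    iterable here are illustrative only):
+
+        arr = DiskAppendableArray((10, 20), dtype="float32", tempdir="/tmp")
+        for image in images:       # each a (10, 20) float32 array
+            arr.append(image)
+        cube = arr.read()          # ndarray of shape (len(images), 10, 20)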
+ + This class is purpose-built for the median computation during outlier detection + and is not very flexible. It is assumed that each data array passed to `append` + represents the same spatial segment of the full dataset. It is also assumed that + each data array passed to `append` represents only a single instant in time; + the append operation will stack them along a new axis. + + The `read` operation is only capable of loading the full array back into memory. + When working with large datasets that do not fit in memory, the + required workflow is to create many DiskAppendableArray objects, each holding + a small spatial segment of the full dataset. + """ def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): + """ + Parameters + ---------- + slice_shape : tuple + The shape of the spatial section of input data to be appended to the array. + + dtype : str + The data type of the array. + + tempdir : str + The directory in which to create the temporary files. + + filestem : str + The stem of the temporary file name. The extension ".bits" will be added. + """ self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) self._slice_shape = slice_shape @@ -84,28 +115,24 @@ def create_median(resampled_models, maskpt, buffer_size=None): The median image. """ on_disk = resampled_models._on_disk - if on_disk and buffer_size is None: - raise ValueError("Buffer size must be provided when resampled models are on disk") + if (on_disk) and (buffer_size is None): + raise ValueError("Buffer size for median calculation must be provided when models are on disk") - # Compute the weight threshold for each input model - weight_thresholds = [] + # Compute weight threshold for each input model and set NaN in data where weight is below threshold model_list = [] with resampled_models: for resampled in resampled_models: weight_threshold = compute_weight_threshold(resampled.wht, maskpt) + resampled.data[resampled.wht < weight_threshold] = np.nan if not on_disk: - # handle weights right away for in-memory case - data = resampled.data.copy() - data[resampled.wht < weight_threshold] = np.nan - model_list.append(data) - del data + model_list.append(resampled.data) + resampled_models.shelve(resampled, modify=False) else: - weight_thresholds.append(weight_threshold) - resampled_models.shelve(resampled, modify=False) + resampled_models.shelve(resampled, modify=True) del resampled - # easier case: all models in library can be loaded into memory at once if not on_disk: + # easier case: all models in library can be loaded into memory at once with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", @@ -114,11 +141,11 @@ def create_median(resampled_models, maskpt, buffer_size=None): else: # set up buffered access to all input models # get spatial sections of library and compute timewise median, one by one - resampled_sections, row_indices = _write_sections(resampled_models, weight_thresholds, buffer_size) + resampled_sections, row_indices = _write_sections(resampled_models, buffer_size) return _create_median(resampled_sections, row_indices) -def _write_sections(library, weight_thresholds, buffer_size): +def _write_sections(library, buffer_size): """Write spatial sections from a ModelLibrary into temporary files grouped along the time axis. @@ -127,9 +154,6 @@ def _write_sections(library, weight_thresholds, buffer_size): library : ModelLibrary The input data models. 
- weight_thresholds : list - The weight thresholds for masking out low weight pixels. - buffer_size : int The buffer size for the median computation, units of bytes. @@ -168,10 +192,11 @@ def _write_sections(library, weight_thresholds, buffer_size): arr = DiskAppendableArray(slice_shape, tempdir=tempdir, filestem=f"section{i}", dtype=dtype) temp_arrays.append(arr) - # handle the last section separately because it has a different shape - slice_shape_last = (imrows - (nsections-1) * section_nrows, shp[1]) - arrn = DiskAppendableArray(slice_shape_last, tempdir=tempdir, filestem=f"section{nsections-1}", dtype=dtype) - temp_arrays.append(arrn) + if nsections > 1: + # handle the last section separately because it has a different shape + slice_shape_last = (imrows - (nsections-1) * section_nrows, shp[1]) + arrn = DiskAppendableArray(slice_shape_last, tempdir=tempdir, filestem=f"section{nsections-1}", dtype=dtype) + temp_arrays.append(arrn) # now append data from each model to all the sections row_indices = [] @@ -182,12 +207,7 @@ def _write_sections(library, weight_thresholds, buffer_size): if j == 0: row_indices.append((row1, row2)) arr = temp_arrays[i] - sci = model.data[row1:row2] - - # handle weight thresholding right here, while array is open - thresh = weight_thresholds[j] - sci[model.wht[row1:row2] < thresh] = np.nan - arr.append(sci) + arr.append(model.data[row1:row2]) library.shelve(model, j, modify=False) del model From 177ab426740065bc1552fd98b79dce4d66de4a5d Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 13 Sep 2024 10:36:05 -0400 Subject: [PATCH 07/34] added changelog entry --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 4108daff15..044b600f6a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -528,6 +528,9 @@ outlier_detection - Fixed a bug that caused different results from the median calculation when the `in_memory` parameter was set to `True` vs `False`. [#8777] +- Decrease the amount of file I/O required to compute the median when `in_memory` + is set to `False`. 
[#8782]
+
 pathloss
 --------
 
From 51f126e163e11fb8d0810471f3261b120099b85d Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Fri, 13 Sep 2024 16:51:44 -0400
Subject: [PATCH 08/34] attempt to decrease memory usage by allocating the cube
 first

---
 jwst/outlier_detection/utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py
index 7ff32642eb..49bdec5618 100644
--- a/jwst/outlier_detection/utils.py
+++ b/jwst/outlier_detection/utils.py
@@ -119,13 +119,15 @@ def create_median(resampled_models, maskpt, buffer_size=None):
         raise ValueError("Buffer size for median calculation must be provided when models are on disk")
 
     # Compute weight threshold for each input model and set NaN in data where weight is below threshold
-    model_list = []
     with resampled_models:
-        for resampled in resampled_models:
+        for i, resampled in enumerate(resampled_models):
             weight_threshold = compute_weight_threshold(resampled.wht, maskpt)
             resampled.data[resampled.wht < weight_threshold] = np.nan
             if not on_disk:
-                model_list.append(resampled.data)
+                if i == 0:
+                    model_cube = np.empty((len(resampled_models),) + resampled.data.shape,
+                                          dtype=resampled.data.dtype)
+                model_cube[i] = resampled.data
                 resampled_models.shelve(resampled, modify=False)
             else:
                 resampled_models.shelve(resampled, modify=True)
@@ -137,7 +139,7 @@ def create_median(resampled_models, maskpt, buffer_size=None):
             warnings.filterwarnings(action="ignore",
                                     message="All-NaN slice encountered",
                                     category=RuntimeWarning)
-            return np.nanmedian(np.array(model_list), axis=0)
+            return np.nanmedian(model_cube, axis=0)

From 1fdfee4deada2664110351794d40277e13941ce8 Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Mon, 16 Sep 2024 13:34:50 -0400
Subject: [PATCH 09/34] refactor to allow adding models to median computer one
 by one

---
 jwst/outlier_detection/imaging.py          |  13 +----
 jwst/outlier_detection/spec.py             |  11 +---
 jwst/outlier_detection/tests/test_utils.py | 106 +++++++
 jwst/outlier_detection/utils.py            | 305 ++++++++++--------
 4 files changed, 259 insertions(+), 176 deletions(-)
 create mode 100644 jwst/outlier_detection/tests/test_utils.py

diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py
index b7817797e2..5678b45802 100644
--- a/jwst/outlier_detection/imaging.py
+++ b/jwst/outlier_detection/imaging.py
@@ -100,18 +100,7 @@ def detect_outliers(
         input_models.shelve(model, modify=True)
 
     # Perform median combination on set of drizzled mosaics
-    if in_memory:
-        buffer_size = None
-    else:
-        # reasonable buffer size is the size of an input model, since that is
-        # the smallest theoretical memory footprint
-        with input_models:
-            example_model = input_models.borrow(0)
-            buffer_size = example_model.data.size * example_model.data.itemsize
-            input_models.shelve(example_model, modify=False)
-            del example_model
-
-    median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size)
+    median_data = create_median(drizzled_models, maskpt)
 
     if save_intermediate_results:
         # make a median model
diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py
index 205c8576bb..ad8e712d4c 100644
--- a/jwst/outlier_detection/spec.py
+++ b/jwst/outlier_detection/spec.py
@@ -92,16 +92,7 @@ def detect_outliers(
     median_wcs = copy.deepcopy(input_models[0].meta.wcs)
 
     # Perform median combination on set of drizzled mosaics
-    # create_median should be 
called as a method from parent class - if in_memory: - buffer_size = None - else: - # reasonable buffer size is the size of an input model, since that is - # the smallest theoretical memory footprint - example_model = input_models[0] - buffer_size = example_model.data.size * example_model.data.itemsize - del example_model - median_data = create_median(drizzled_models, maskpt, buffer_size=buffer_size) + median_data = create_median(drizzled_models, maskpt) if save_intermediate_results: # Initialize intermediate products used in the outlier detection diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py new file mode 100644 index 0000000000..318384a2c8 --- /dev/null +++ b/jwst/outlier_detection/tests/test_utils.py @@ -0,0 +1,106 @@ +import pytest +from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskTimewiseOperation +from pathlib import Path +import numpy as np +import os + + +def test_disk_appendable_array(tmp_cwd): + + slice_shape = (8,7) + dtype = 'float32' + tempdir = tmp_cwd / Path("tmptest") + os.mkdir(tempdir) + + arr = DiskAppendableArray(slice_shape, dtype=dtype, tempdir=tempdir) + + # check temporary file setup + assert arr._filename.split("/")[-1] in os.listdir(tempdir) + assert len(os.listdir(tempdir)) == 1 + assert arr.shape == (0,) + slice_shape + + # check cwd contains no files + assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)]) + + # check expected append failures + with pytest.raises(ValueError): + candidate = np.zeros((7,7), dtype=dtype) + arr.append(candidate) + with pytest.raises(ValueError): + candidate = np.zeros((8,7), dtype='float64') + arr.append(candidate) + + # check append and read + candidate0 = np.zeros(slice_shape, dtype=dtype) + candidate1 = np.full(slice_shape, 2, dtype=dtype) + candidate2 = np.full(slice_shape, np.nan, dtype=dtype) + for candidate in [candidate0, candidate1, candidate2]: + arr.append(candidate) + + arr_in_memory = arr.read() + + assert arr_in_memory.shape == (3,) + slice_shape + assert np.all(arr_in_memory[0] == candidate0) + assert np.all(arr_in_memory[1] == candidate1) + assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True) + + # check cleanup occurs after reassigning arr + arr = None + assert len(os.listdir(tempdir)) == 0 + + + +def test_on_disk_median(tmp_cwd): + + library_length = 3 + frame_shape = (21,20) + dtype = 'float32' + tempdir = tmp_cwd / Path("tmptest") + os.mkdir(tempdir) + shape = (library_length,) + frame_shape + + median_computer = OnDiskTimewiseOperation(shape, dtype=dtype, tempdir=tempdir) + + # test compute buffer indices + # buffer size equals size of single input model by default + # which means we expect same number of sections as library length + # in reality there is often one more section than that because + # of necessity of integer number of rows per section, but math is exact in this case + expected_buffer_size = frame_shape[0] * frame_shape[1] * np.dtype(dtype).itemsize + expected_section_nrows = frame_shape[0] // library_length + assert median_computer.nsections == library_length + assert median_computer.section_nrows == expected_section_nrows + assert median_computer.buffer_size == expected_buffer_size + + # test temp file setup + assert len(os.listdir(tempdir)) == 1 + assert str(median_computer._temp_path).startswith(str(tempdir)) + assert len(os.listdir(median_computer._temp_path)) == library_length + # check cwd and parent tempdir contain no files + assert all([not os.path.isfile(f) for f in os.listdir(tmp_cwd)]) + 
assert all([not os.path.isfile(f) for f in os.listdir(tempdir)]) + + # test validate data + with pytest.raises(ValueError): + candidate = np.zeros((20,20), dtype=dtype) + median_computer.add_image(candidate) + with pytest.raises(ValueError): + candidate = np.zeros((21,20), dtype='float64') + median_computer.add_image(candidate) + + # test add and compute + candidate0 = np.full(frame_shape, 3, dtype=dtype) + candidate1 = np.full(frame_shape, 2, dtype=dtype) + candidate2 = np.full(frame_shape, np.nan, dtype=dtype) + for candidate in [candidate0, candidate1, candidate2]: + median_computer.add_image(candidate) + median = median_computer.compute_median() + assert median.shape == frame_shape + assert np.all(median == 2.5) + + # test expected error trying to add too many frames + # for loop to ensure always happens, not just the first time + candidate3 = np.zeros_like(candidate0) + for _ in range(2): + with pytest.raises(IndexError): + median_computer.add_image(candidate3) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 49bdec5618..309ecf1451 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -22,6 +22,27 @@ _ONE_MB = 1 << 20 +def _nanmedian3D(cube, overwrite_input=True): + """Convenience function to compute the median of a cube ignoring warnings + and setting memory-efficient flag""" + with warnings.catch_warnings(): + warnings.filterwarnings(action="ignore", + message="All-NaN slice encountered", + category=RuntimeWarning) + return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input) + + +def create_cube_median(cube_model, maskpt): + log.info("Computing median") + + weight_threshold = compute_weight_threshold(cube_model.wht, maskpt) + + # not safe to use overwrite_input=True here because we are operating on model.data directly + return _nanmedian3D( + np.ma.masked_array(cube_model.data, np.less(cube_model.wht, weight_threshold), fill_value=np.nan), + overwrite_input=False) + + class DiskAppendableArray: """ Creates a temporary file to which to append data, in order to perform @@ -40,7 +61,7 @@ class DiskAppendableArray: a small spatial segment of the full dataset. """ - def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): + def __init__(self, slice_shape, dtype='float32', tempdir=""): """ Parameters ---------- @@ -52,15 +73,12 @@ def __init__(self, slice_shape, dtype='float32', tempdir="", filestem="data"): tempdir : str The directory in which to create the temporary files. - - filestem : str - The stem of the temporary file name. The extension ".bits" will be added. + Default is the current working directory. 
""" - self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) - self._temp_path = Path(self._temp_dir.name) + self._temp_file = tempfile.NamedTemporaryFile(dir=tempdir) + self._filename = self._temp_file.name self._slice_shape = slice_shape self._dtype = dtype - self._filename = self._temp_path / Path(filestem + ".bits") self._append_count = 0 with open(self._filename, 'wb') as f: # Noqa: F841 pass @@ -83,18 +101,6 @@ def read(self): return np.fromfile(self._filename, dtype=self._dtype).reshape(shp) -def create_cube_median(cube_model, maskpt): - log.info("Computing median") - - weight_threshold = compute_weight_threshold(cube_model.wht, maskpt) - - median = np.nanmedian( - np.ma.masked_array(cube_model.data, np.less(cube_model.wht, weight_threshold), fill_value=np.nan), - axis=0, - ) - return median - - def create_median(resampled_models, maskpt, buffer_size=None): """Create a median image from the singly resampled images. @@ -106,17 +112,12 @@ def create_median(resampled_models, maskpt, buffer_size=None): maskpt : float The weight threshold for masking out low weight pixels. - buffer_size : int - The buffer size for the median computation, units of bytes. - Returns ------- median_image : ndarray The median image. """ on_disk = resampled_models._on_disk - if (on_disk) and (buffer_size is None): - raise ValueError("Buffer size for median calculation must be provided when models are on disk") # Compute weight threshold for each input model and set NaN in data where weight is below threshold with resampled_models: @@ -128,9 +129,16 @@ def create_median(resampled_models, maskpt, buffer_size=None): model_cube = np.empty((len(resampled_models),) + resampled.data.shape, dtype=resampled.data.dtype) model_cube[i] = resampled.data - resampled_models.shelve(resampled, modify=False) else: - resampled_models.shelve(resampled, modify=True) + if i == 0: + # set up temporary storage for spatial sections of all input models + shp = (len(resampled_models),) + resampled.data.shape + median_computer = OnDiskTimewiseOperation(shp, + dtype=resampled.data.dtype, + buffer_size=buffer_size) + # write spatial sections to disk + median_computer.add_image(resampled.data) + resampled_models.shelve(resampled, i, modify=False) del resampled if not on_disk: @@ -139,156 +147,145 @@ def create_median(resampled_models, maskpt, buffer_size=None): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(model_cube, axis=0) + return _nanmedian3D(model_cube) else: - # set up buffered access to all input models - # get spatial sections of library and compute timewise median, one by one - resampled_sections, row_indices = _write_sections(resampled_models, buffer_size) - return _create_median(resampled_sections, row_indices) + # retrieve spatial sections from disk one by one and compute timewise median + return median_computer.compute_median() -def _write_sections(library, buffer_size): - """Write spatial sections from a ModelLibrary into temporary files - grouped along the time axis. +class OnDiskTimewiseOperation: - Parameters - ---------- - library : ModelLibrary - The input data models. - - buffer_size : int - The buffer size for the median computation, units of bytes. + def __init__(self, shape, tempdir="", dtype='float32', buffer_size=None): + """ + Class to compute timewise operations on a stack of input images without + holding all of them in memory. At present, only the median operation is + supported. 
To extend, create a function definition for the desired
+        operation modeled after the `compute_median` method.
+
+        Parameters
+        ----------
+        shape : tuple
+            The shape of the entire input, (n_images, imrows, imcols).
+
+        tempdir : str
+            The parent directory in which to create the temporary directory,
+            which itself holds all the DiskAppendableArray tempfiles.
+            Default is the current working directory.
+
+        dtype : str
+            The data type of the input data.
+
+        buffer_size : int, optional
+            The buffer size for the median computation, units of bytes.
+            If None, the buffer size will be set equal to the size of one input
+            resampled image. This is the largest it can be without increasing
+            the memory footprint of the entire step: the drizzled model is
+            written to disk just before the median calculation handled by this
+            class, so the resampled image can be discarded from memory before
+            calling compute_median(), freeing that memory for the median
+            calculation.
+        """
+        self._expected_nframes = shape[0]
+        self.frame_shape = shape[1:]
+        self.dtype = dtype
+        self.itemsize = np.dtype(dtype).itemsize
+        self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir)
+        self._temp_path = Path(self._temp_dir.name)
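+
+        # Worked example of the bookkeeping below (an illustration that
+        # mirrors the unit test in this commit): for shape=(3, 21, 20) and
+        # float32 data, the default buffer size is one full frame,
+        # 21 * 20 * 4 = 1680 bytes; each of the 3 stacked frames then gets
+        # 1680 / 3 = 560 bytes of buffer, i.e. 560 // (20 * 4) = 7 rows per
+        # section, so each frame is split into ceil(21 / 7) = 3 sections.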
+ # figure out number of sections and rows per section that are needed + self.nsections, self.section_nrows = self._get_buffer_indices(buffer_size=buffer_size) + self.slice_shape = (self.section_nrows, shape[2]) + self._n_adds = 0 - itemsize : int - The size of a single array element in bytes. + # instantiate a temporary DiskAppendableArray for each section + self._temp_arrays = self._temparray_setup(dtype) - buffer_size : int - The buffer size for the median computation, units of bytes. - Returns - ------- - nsections : int - The number of sections to divide the input data into. + def _get_buffer_indices(self, buffer_size=None): + """ + Parameters + ---------- + buffer_size : int, optional + The buffer size for the median computation, units of bytes. - section_nrows : int - The number of rows in each section (except the last one). - """ + Returns + ------- + nsections : int + The number of sections to divide the input data into. - nimages, imrows, imcols = shape - per_model_buffer_size = buffer_size / nimages - min_buffer_size = imcols * itemsize - section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) + section_nrows : int + The number of rows in each section (except the last one). + """ + imrows, imcols = self.frame_shape + if buffer_size is None: + buffer_size = imrows * imcols * self.itemsize + self.buffer_size = buffer_size + per_model_buffer_size = buffer_size / self._expected_nframes + min_buffer_size = imcols * self.itemsize + section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) + + if section_nrows == 0: + buffer_size = min_buffer_size * self._expected_nframes + log.warning("Buffer size is too small to hold a single row." + f"Increasing buffer size to {buffer_size / _ONE_MB}MB") + section_nrows = 1 + + nsections = int(np.ceil(imrows / section_nrows)) + log.info(f"Computing median over {self._expected_nframes} images in {nsections}" + f"sections with total memory buffer {buffer_size / _ONE_MB} MB") + return nsections, section_nrows + + + def _temparray_setup(self, dtype): + """Set up temp file handlers for each spatial section""" + temp_arrays = [] + for i in range(0, self.nsections): + shp = self.slice_shape + if i == self.nsections - 1: + # last section has whatever shape is left over + shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1]) + arr = DiskAppendableArray(shp, + tempdir=self._temp_path, + dtype=dtype) + temp_arrays.append(arr) + return temp_arrays - if section_nrows == 0: - buffer_size = min_buffer_size * nimages - log.warning("WARNING: Buffer size is too small to hold a single row." - f"Increasing buffer size to {buffer_size / _ONE_MB}MB") - section_nrows = 1 - nsections = int(np.ceil(imrows / section_nrows)) - log.info(f"Computing median over {nimages} images in {nsections} sections with total memory buffer {buffer_size / _ONE_MB} MB") - return nsections, section_nrows + def add_image(self, data): + """Split resampled model data into spatial sections and write to disk.""" + if self._n_adds >= self.nsections: + raise IndexError(f"Too many calls to add_image. 
Expected at most {self.nsections} input models.") + self._validate_data(data) + self._n_adds += 1 + for i in range(self.nsections): + row1 = i * self.section_nrows + row2 = min(row1 + self.section_nrows, self.frame_shape[0]) + arr = self._temp_arrays[i] + arr.append(data[row1:row2]) -def _create_median(resampled_sections, row_indices): - """ - Parameters - ---------- - resampled_sections : list - List of DiskAppendableArray objects, each holding a spatial section - of every input model, stacked along the time axis. + def _validate_data(self, data): + if data.shape != self.frame_shape: + raise ValueError(f"Data shape {data.shape} does not match expected shape {self.frame_shape}") + if data.dtype != self.dtype: + raise ValueError(f"Data dtype {data.dtype} does not match expected dtype {self.dtype}") - row_indices : list - List of tuples, each containing the start and end row indices of the - spatial section in the original data model. - Returns - ------- - median_image : ndarray - The median image. - """ + def compute_median(self): + """Read spatial sections from disk and compute the median.""" + row_indices = [(i * self.section_nrows, min((i+1) * self.section_nrows, self.frame_shape[0])) + for i in range(self.nsections)] - dtype = resampled_sections[0]._dtype - output_rows = row_indices[-1][1] - output_cols = resampled_sections[0].shape[2] - median_image = np.empty((output_rows, output_cols), dtype) * np.nan + output_rows = row_indices[-1][1] + output_cols = self._temp_arrays[0].shape[2] + median_image = np.empty((output_rows, output_cols), self.dtype) * np.nan - for i, disk_arr in enumerate(resampled_sections): - row1, row2 = row_indices[i] - arr = disk_arr.read() - with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - median_image[row1:row2] = np.nanmedian(arr, axis=0) - del arr, disk_arr + for i, disk_arr in enumerate(self._temp_arrays): + row1, row2 = row_indices[i] + arr = disk_arr.read() + median_image[row1:row2] = _nanmedian3D(arr) + del arr, disk_arr - return median_image + return median_image def flag_crs_in_models( From fef97b817575c0e85824b5d29cf51c96b9c94ee4 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 17 Sep 2024 18:20:32 -0400 Subject: [PATCH 10/34] handle temp file open, close, and delete very manually --- jwst/outlier_detection/tests/test_utils.py | 44 ++++- jwst/outlier_detection/utils.py | 182 +++++++++++---------- 2 files changed, 137 insertions(+), 89 deletions(-) diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index 318384a2c8..0b2cf9e679 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -1,5 +1,5 @@ import pytest -from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskTimewiseOperation +from jwst.outlier_detection.utils import DiskAppendableArray, OnDiskMedian from pathlib import Path import numpy as np import os @@ -8,11 +8,11 @@ def test_disk_appendable_array(tmp_cwd): slice_shape = (8,7) - dtype = 'float32' + dtype = "float32" tempdir = tmp_cwd / Path("tmptest") os.mkdir(tempdir) - arr = DiskAppendableArray(slice_shape, dtype=dtype, tempdir=tempdir) + arr = DiskAppendableArray(slice_shape, dtype, tempdir) # check temporary file setup assert arr._filename.split("/")[-1] in os.listdir(tempdir) @@ -44,11 +44,32 @@ def test_disk_appendable_array(tmp_cwd): assert np.all(arr_in_memory[1] == candidate1) assert np.allclose(arr_in_memory[2], candidate2, 
equal_nan=True) - # check cleanup occurs after reassigning arr - arr = None + # test cleanup + arr.cleanup() assert len(os.listdir(tempdir)) == 0 +def test_disk_appendable_array_bad_inputs(tmp_cwd): + + slice_shape = (8,7) + dtype = "float32" + tempdir = tmp_cwd / Path("tmptest") + + # test input directory does not exist + with pytest.raises(FileNotFoundError): + arr = DiskAppendableArray(slice_shape, dtype, tempdir) + + # make the input directory + os.mkdir(tempdir) + + # test slice_shape is not 2-D + with pytest.raises(ValueError): + arr = DiskAppendableArray((3,5,7), dtype, tempdir) + + # test dtype is not valid + with pytest.raises(TypeError): + arr = DiskAppendableArray(slice_shape, "float3", tempdir) + def test_on_disk_median(tmp_cwd): @@ -59,7 +80,7 @@ def test_on_disk_median(tmp_cwd): os.mkdir(tempdir) shape = (library_length,) + frame_shape - median_computer = OnDiskTimewiseOperation(shape, dtype=dtype, tempdir=tempdir) + median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir) # test compute buffer indices # buffer size equals size of single input model by default @@ -95,6 +116,7 @@ def test_on_disk_median(tmp_cwd): for candidate in [candidate0, candidate1, candidate2]: median_computer.add_image(candidate) median = median_computer.compute_median() + median_computer.cleanup() assert median.shape == frame_shape assert np.all(median == 2.5) @@ -104,3 +126,13 @@ def test_on_disk_median(tmp_cwd): for _ in range(2): with pytest.raises(IndexError): median_computer.add_image(candidate3) + + # test cleanup of tmpdir and everything else + assert not os.path.exists(median_computer._temp_path) + assert len(os.listdir(tempdir)) == 0 + assert len(os.listdir(os.getcwd())) == 1 # this is the "tmptest" directory + + +def test_on_disk_median_bad_inputs(tmp_cwd): + # FIXME + pass \ No newline at end of file diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 309ecf1451..a733ee724f 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -43,6 +43,60 @@ def create_cube_median(cube_model, maskpt): overwrite_input=False) +def create_median(resampled_models, maskpt, buffer_size=None): + """Create a median image from the singly resampled images. + + Parameters + ---------- + resampled_models : ModelLibrary + The singly resampled images. + + maskpt : float + The weight threshold for masking out low weight pixels. + + Returns + ------- + median_image : ndarray + The median image. 
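Before any pixels reach the median, each resampled model is masked by its own weight threshold: low-weight pixels become NaN so the nanmedian ignores them. A small sketch of that masking step, using a stand-in threshold value rather than compute_weight_threshold():

    import numpy as np

    data = np.arange(12, dtype=np.float32).reshape(3, 4)
    wht = np.ones_like(data)
    wht[1, 2] = 0.1                         # one low-weight pixel
    weight_threshold = 0.5                  # stand-in for compute_weight_threshold(wht, maskpt)
    data[wht < weight_threshold] = np.nan   # excluded from the median downstream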
+ """ + on_disk = resampled_models._on_disk + + # Compute weight threshold for each input model and set NaN in data where weight is below threshold + with resampled_models: + for i, resampled in enumerate(resampled_models): + weight_threshold = compute_weight_threshold(resampled.wht, maskpt) + resampled.data[resampled.wht < weight_threshold] = np.nan + if not on_disk: + if i == 0: + model_cube = np.empty((len(resampled_models),) + resampled.data.shape, + dtype=resampled.data.dtype) + model_cube[i] = resampled.data + else: + if i == 0: + # set up temporary storage for spatial sections of all input models + shp = (len(resampled_models),) + resampled.data.shape + median_computer = OnDiskMedian(shp, + dtype=resampled.data.dtype, + buffer_size=buffer_size) + # write spatial sections to disk + median_computer.add_image(resampled.data) + resampled_models.shelve(resampled, i, modify=False) + del resampled + + if not on_disk: + # easier case: all models in library can be loaded into memory at once + with warnings.catch_warnings(): + warnings.filterwarnings(action="ignore", + message="All-NaN slice encountered", + category=RuntimeWarning) + return _nanmedian3D(model_cube) + else: + # retrieve spatial sections from disk one by one and compute timewise median + med = median_computer.compute_median() + median_computer.cleanup() + return med + + class DiskAppendableArray: """ Creates a temporary file to which to append data, in order to perform @@ -61,7 +115,7 @@ class DiskAppendableArray: a small spatial segment of the full dataset. """ - def __init__(self, slice_shape, dtype='float32', tempdir=""): + def __init__(self, slice_shape, dtype, tempdir): """ Parameters ---------- @@ -69,125 +123,81 @@ def __init__(self, slice_shape, dtype='float32', tempdir=""): The shape of the spatial section of input data to be appended to the array. dtype : str - The data type of the array. + The data type of the array. Must be a valid numpy array datatype. tempdir : str The directory in which to create the temporary files. Default is the current working directory. """ - self._temp_file = tempfile.NamedTemporaryFile(dir=tempdir) - self._filename = self._temp_file.name + if len(slice_shape) != 2: + raise ValueError(f"Invalid slice_shape {slice_shape}. Only 2-D arrays are supported.") + self._temp_file, self._filename = tempfile.mkstemp(dir=tempdir) self._slice_shape = slice_shape - self._dtype = dtype + self._dtype = np.dtype(dtype) self._append_count = 0 - with open(self._filename, 'wb') as f: # Noqa: F841 - pass + @property def shape(self): return (self._append_count,) + self._slice_shape - + + def append(self, data): + """Add a new slice to the temporary file""" if data.shape != self._slice_shape: raise ValueError(f"Data shape {data.shape} does not match slice shape {self._slice_shape}") if data.dtype != self._dtype: raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") - with open(self._filename, 'ab') as f: + with open(self._filename, "ab") as f: f.write(data.tobytes()) self._append_count += 1 + def read(self): + """Read the 3-D array into memory, then delete the tempfile""" shp = (self._append_count,) + self._slice_shape - return np.fromfile(self._filename, dtype=self._dtype).reshape(shp) - - -def create_median(resampled_models, maskpt, buffer_size=None): - """Create a median image from the singly resampled images. - - Parameters - ---------- - resampled_models : ModelLibrary - The singly resampled images. - - maskpt : float - The weight threshold for masking out low weight pixels. 
- - Returns - ------- - median_image : ndarray - The median image. - """ - on_disk = resampled_models._on_disk + with open(self._filename, "rb") as f: + output = np.fromfile(f, dtype=self._dtype).reshape(shp) + return output - # Compute weight threshold for each input model and set NaN in data where weight is below threshold - with resampled_models: - for i, resampled in enumerate(resampled_models): - weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - resampled.data[resampled.wht < weight_threshold] = np.nan - if not on_disk: - if i == 0: - model_cube = np.empty((len(resampled_models),) + resampled.data.shape, - dtype=resampled.data.dtype) - model_cube[i] = resampled.data - else: - if i == 0: - # set up temporary storage for spatial sections of all input models - shp = (len(resampled_models),) + resampled.data.shape - median_computer = OnDiskTimewiseOperation(shp, - dtype=resampled.data.dtype, - buffer_size=buffer_size) - # write spatial sections to disk - median_computer.add_image(resampled.data) - resampled_models.shelve(resampled, i, modify=False) - del resampled - - if not on_disk: - # easier case: all models in library can be loaded into memory at once - with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - return _nanmedian3D(model_cube) - else: - # retrieve spatial sections from disk one by one and compute timewise median - return median_computer.compute_median() + + def cleanup(self): + Path.unlink(self._filename) + return -class OnDiskTimewiseOperation: +class OnDiskMedian: - def __init__(self, shape, tempdir="", dtype='float32', buffer_size=None): + def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None): """ - Class to compute timewise operations on a stack of input images without - holding all of them in memory. At present, only the median operation is - supported. To extend, create a function definition for the desired - operation modeled after the `compute_median` method. + Set up temporary files to perform operations on a stack of 2-D input arrays + along the stacking axis (e.g., a time axis) without + holding all of them in memory. Currently the only supported operation + is the median. Parameters ---------- shape: tuple The shape of the entire input, (n_images, imrows, imcols). + dtype : str + The data type of the input data. + tempdir : str The parent directory in which to create the temporary directory, which itself holds all the DiskAppendableArray tempfiles. Default is the current working directory. - dtype : str - The data type of the input data. - buffer_size : int, optional - The buffer size for the median computation, units of bytes. - If None, the buffer size will be set equal to the size of one input - resampled image. This is the largest it can be without increasing - the memory footprint of the entire step: we write the drizzled - model to disk just before the median calculation handled by this class, - meaning we can discard the resampled image from memory before calling - compute_median() such that the memory is freed up for the median calculation. + The buffer size, units of bytes. + Default is the size of one input image. 
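A short usage sketch of this class, mirroring the unit test earlier in the patch (three constant frames with values 0, 1, and 2, so every output pixel is 1):

    import numpy as np

    shape = (3, 21, 20)                        # (n_images, imrows, imcols)
    median_computer = OnDiskMedian(shape, dtype="float32")
    for k in range(3):
        median_computer.add_image(np.full(shape[1:], k, dtype=np.float32))
    median = median_computer.compute_median()  # per-pixel median across images
    median_computer.cleanup()                  # remove the section tempfiles
    assert np.all(median == 1.0)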
""" + if len(shape) != 3: + raise ValueError(f"Invalid input shape {shape}; only three-dimensional data are supported.") self._expected_nframes = shape[0] self.frame_shape = shape[1:] - self.dtype = dtype - self.itemsize = np.dtype(dtype).itemsize + self.dtype = np.dtype(dtype) + self.itemsize = self.dtype.itemsize self._temp_dir = tempfile.TemporaryDirectory(dir=tempdir) self._temp_path = Path(self._temp_dir.name) @@ -243,9 +253,7 @@ def _temparray_setup(self, dtype): if i == self.nsections - 1: # last section has whatever shape is left over shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1]) - arr = DiskAppendableArray(shp, - tempdir=self._temp_path, - dtype=dtype) + arr = DiskAppendableArray(shp, dtype, self._temp_path) temp_arrays.append(arr) return temp_arrays @@ -268,10 +276,18 @@ def _validate_data(self, data): raise ValueError(f"Data shape {data.shape} does not match expected shape {self.frame_shape}") if data.dtype != self.dtype: raise ValueError(f"Data dtype {data.dtype} does not match expected dtype {self.dtype}") + + + def cleanup(self): + """Remove the temporary files and directory when finished""" + [arr.cleanup() for arr in self._temp_arrays] + Path.rmdir(self._temp_path) + return def compute_median(self): - """Read spatial sections from disk and compute the median.""" + """Read spatial sections from disk and compute the median across groups + (median over number of exposures on a per-pixel basis)""" row_indices = [(i * self.section_nrows, min((i+1) * self.section_nrows, self.frame_shape[0])) for i in range(self.nsections)] From ff10987304f07292f5e61f592261b90ec56e8444 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 13:46:23 -0400 Subject: [PATCH 11/34] refactor resample_many_to_many to allow discard drizzled models after use, imaging only --- jwst/outlier_detection/imaging.py | 122 +++++++++---- .../tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/tests/test_utils.py | 6 +- jwst/outlier_detection/utils.py | 31 ++-- jwst/resample/resample.py | 166 ++++++++++-------- 5 files changed, 208 insertions(+), 119 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 5678b45802..964c0e3c3b 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -6,6 +6,7 @@ import logging import os +import numpy as np from stdatamodels.jwst import datamodels from jwst.datamodels import ModelLibrary @@ -13,8 +14,10 @@ from jwst.resample.resample_utils import build_driz_weight from jwst.stpipe.utilities import record_step_status -from .utils import create_median, flag_model_crs, flag_resampled_model_crs -from ._fileio import remove_file, save_median +from stcal.outlier_detection.utils import compute_weight_threshold + +from .utils import nanmedian3D, flag_model_crs, flag_resampled_model_crs, OnDiskMedian +from ._fileio import save_median log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -58,17 +61,18 @@ def detect_outliers( log.warning("Outlier detection will be skipped") record_step_status(input_models, "outlier_detection", False) return input_models + + with input_models: + example_model = input_models.borrow(0) + output_path = make_output_path(basepath=example_model.meta.filename, + suffix='') + input_models.shelve(example_model, modify=False) + del example_model + output_path = os.path.dirname(output_path) if resample_data: # Start by creating resampled/mosaic images for # each group of exposures - with input_models: - example_model = 
input_models.borrow(0) - output_path = make_output_path(basepath=example_model.meta.filename, - suffix='') - input_models.shelve(example_model, modify=False) - del example_model - output_path = os.path.dirname(output_path) resamp = resample.ResampleData( input_models, output=output_path, @@ -84,42 +88,96 @@ def detect_outliers( allowed_memory=allowed_memory, ) median_wcs = resamp.output_wcs - drizzled_models = resamp.do_drizzle(input_models) + + j = 0 + for group_id, indices in input_models.group_indices.items(): + + # single is hardcoded to True above so it's ok to just call resample_group directly + drizzled_model = resamp.resample_group(input_models, indices) + if j == 0: + ngroups = len(input_models.group_names) + full_shape = (ngroups,) + drizzled_model.data.shape + if in_memory: + data_frames = np.empty(full_shape, dtype=np.float32) + else: + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=None) + + if save_intermediate_results: + output_name = drizzled_model.meta.filename + + if resamp.output_dir is not None: + output_name = os.path.join(resamp.output_dir, output_name) + drizzled_model.save(output_name) + log.info(f"Saved model in {output_name}") + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + if in_memory: + data_frames[j] = drizzled_model.data + else: + # write spatial sections to disk + median_computer.add_image(drizzled_model.data) + j += 1 + else: # for non-dithered data, the resampled image is just the original image - drizzled_models = input_models with input_models: - for i, model in enumerate(input_models): - model.wht = build_driz_weight( - model, + for i, drizzled_model in enumerate(input_models): + drizzled_model.wht = build_driz_weight( + drizzled_model, weight_type=weight_type, good_bits=good_bits) - # copy for when saving median and input is a filename? + input_models.shelve(drizzled_model, modify=True) + if i == 0: - median_wcs = copy.deepcopy(model.meta.wcs) - input_models.shelve(model, modify=True) + # copy for when saving median and input is a filename? 
+ median_wcs = copy.deepcopy(drizzled_model.meta.wcs) + full_shape = (len(input_models),) + drizzled_model.data.shape + if in_memory: + data_frames = np.empty(full_shape, dtype=np.float32) + else: + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=None) + + if save_intermediate_results: + output_name = drizzled_model.meta.filename + if output_path is not None: + output_name = os.path.join(output_path, output_name) + drizzled_model.save(output_name) + log.info(f"Saved model in {output_name}") + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + if in_memory: + data_frames[i] = drizzled_model.data + else: + # write spatial sections to disk + median_computer.add_image(drizzled_model.data) + # Perform median combination on set of drizzled mosaics - median_data = create_median(drizzled_models, maskpt) + if in_memory: + median_data = nanmedian3D(data_frames) + del data_frames + else: + median_data = median_computer.compute_median() + median_computer.cleanup() if save_intermediate_results: # make a median model - with drizzled_models: - example_model = drizzled_models.borrow(0) - drizzled_models.shelve(example_model, modify=False) - median_model = datamodels.ImageModel(median_data) - median_model.update(example_model) - median_model.meta.wcs = median_wcs - del example_model - + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs save_median(median_model, make_output_path, asn_id) del median_model - else: - # since we're not saving intermediate results if the drizzled models - # were written to disk, remove them - if not in_memory: - for fn in drizzled_models.asn["products"][0]["members"]: - remove_file(fn["expname"]) + del drizzled_model # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 0806dd42e7..4aea9f9748 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -231,7 +231,7 @@ def test_outlier_step_base(we_three_sci, tmp_cwd): lambda model, index: model.data.copy(), modify=False)) result = OutlierDetectionStep.call( - container, save_results=True, save_intermediate_results=True + container, save_results=True, save_intermediate_results=True, ) with result: diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index 0b2cf9e679..6e5cbb21bd 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -57,18 +57,18 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd): # test input directory does not exist with pytest.raises(FileNotFoundError): - arr = DiskAppendableArray(slice_shape, dtype, tempdir) + DiskAppendableArray(slice_shape, dtype, tempdir) # make the input directory os.mkdir(tempdir) # test slice_shape is not 2-D with pytest.raises(ValueError): - arr = DiskAppendableArray((3,5,7), dtype, tempdir) + DiskAppendableArray((3,5,7), dtype, tempdir) # test dtype is not valid with pytest.raises(TypeError): - arr = DiskAppendableArray(slice_shape, "float3", tempdir) + DiskAppendableArray(slice_shape, "float3", tempdir) def 
test_on_disk_median(tmp_cwd): diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index a733ee724f..990d0dd90a 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -22,14 +22,27 @@ _ONE_MB = 1 << 20 -def _nanmedian3D(cube, overwrite_input=True): - """Convenience function to compute the median of a cube ignoring warnings - and setting memory-efficient flag""" +def nanmedian3D(cube, overwrite_input=True): + """Convenience function to compute the median of a cube ignoring warnings, + setting memory-efficient flag, and specifying output type + + Notes + ----- + np.nanmedian returns float64 unless "out" parameter is specified, and specifying + that only changes the dtype of the output but not the internal precision of nanmedian. + It appears to be impossible to stop nanmedian from computing with + at least 64-bit precision internally. + Pre-allocating the output array, i.e., out=np.empty(cube.shape[1:], dtype=np.float32) + actually increases memory usage a bit, so it's better to just convert after the fact. + Returning float64 here causes gwcs_blot to segfault down the line + because that function is hard-coded to expect float32. + """ with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input) + + return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input).astype(np.float32) def create_cube_median(cube_model, maskpt): @@ -38,7 +51,7 @@ def create_cube_median(cube_model, maskpt): weight_threshold = compute_weight_threshold(cube_model.wht, maskpt) # not safe to use overwrite_input=True here because we are operating on model.data directly - return _nanmedian3D( + return nanmedian3D( np.ma.masked_array(cube_model.data, np.less(cube_model.wht, weight_threshold), fill_value=np.nan), overwrite_input=False) @@ -85,11 +98,7 @@ def create_median(resampled_models, maskpt, buffer_size=None): if not on_disk: # easier case: all models in library can be loaded into memory at once - with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", - message="All-NaN slice encountered", - category=RuntimeWarning) - return _nanmedian3D(model_cube) + return nanmedian3D(model_cube) else: # retrieve spatial sections from disk one by one and compute timewise median med = median_computer.compute_median() @@ -298,7 +307,7 @@ def compute_median(self): for i, disk_arr in enumerate(self._temp_arrays): row1, row2 = row_indices[i] arr = disk_arr.read() - median_image[row1:row2] = _nanmedian3D(arr) + median_image[row1:row2] = nanmedian3D(arr) del arr, disk_arr return median_image diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index 6350c3496e..e2297db25d 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -251,6 +251,90 @@ def _get_intensity_scale(self, img): else: iscale = 1.0 return iscale + + def resample_group(self, input_models, indices): + """Apply resample_many_to_many for one group + + Parameters + ---------- + input_models : ModelLibrary + + indices : list + """ + output_model = self.blank_output + + copy_asn_info_from_library(input_models, output_model) + + with input_models: + example_image = input_models.borrow(indices[0]) + + # Determine output file type from input exposure filenames + # Use this for defining the output filename + indx = example_image.meta.filename.rfind('.') + output_type = example_image.meta.filename[indx:] + output_root = 
'_'.join(example_image.meta.filename.replace( + output_type, '').split('_')[:-1]) + if self.asn_id is not None: + output_model.meta.filename = ( + f'{output_root}_{self.asn_id}_' + f'{self.intermediate_suffix}{output_type}') + else: + output_model.meta.filename = ( + f'{output_root}_' + f'{self.intermediate_suffix}{output_type}') + input_models.shelve(example_image, indices[0], modify=False) + del example_image + + # Initialize the output with the wcs + driz = gwcs_drizzle.GWCSDrizzle(output_model, pixfrac=self.pixfrac, + kernel=self.kernel, fillval=self.fillval) + + log.info(f"{len(indices)} exposures to drizzle together") + for index in indices: + img = input_models.borrow(index) + if isinstance(img, datamodels.SlitModel): + # must call this explicitly to populate area extension + # although the existence of this extension may not be necessary + img.area = img.area + iscale = self._get_intensity_scale(img) + log.debug(f'Using intensity scale iscale={iscale}') + + inwht = resample_utils.build_driz_weight( + img, + weight_type=self.weight_type, + good_bits=self.good_bits + ) + + # apply sky subtraction + blevel = img.meta.background.level + if not img.meta.background.subtracted and blevel is not None: + data = img.data - blevel + else: + data = img.data + + xmin, xmax, ymin, ymax = resample_utils._resample_range( + data.shape, + img.meta.wcs.bounding_box + ) + + driz.add_image( + data, + img.meta.wcs, + iscale=iscale, + inwht=inwht, + xmin=xmin, + xmax=xmax, + ymin=ymin, + ymax=ymax + ) + del data + input_models.shelve(img, index, modify=False) + del img + + out = output_model.copy() + output_model.data *= 0. + output_model.wht *= 0. + return out def resample_many_to_many(self, input_models): """Resample many inputs to many outputs where outputs have a common frame. 
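resample_group leans on the ModelLibrary borrow/shelve protocol: every borrow() must be matched by a shelve() before the with-block exits. A minimal sketch of that protocol, where process() is a hypothetical stand-in for the per-image work:

    with library:
        img = library.borrow(index)
        try:
            process(img)                              # hypothetical read-only work
        finally:
            library.shelve(img, index, modify=False)  # modify=False discards changes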
@@ -263,78 +347,18 @@ def resample_many_to_many(self, input_models): """ output_models = [] for group_id, indices in input_models.group_indices.items(): - output_model = self.blank_output - - copy_asn_info_from_library(input_models, output_model) - - with input_models: - example_image = input_models.borrow(indices[0]) - - # Determine output file type from input exposure filenames - # Use this for defining the output filename - indx = example_image.meta.filename.rfind('.') - output_type = example_image.meta.filename[indx:] - output_root = '_'.join(example_image.meta.filename.replace( - output_type, '').split('_')[:-1]) - if self.asn_id is not None: - output_model.meta.filename = ( - f'{output_root}_{self.asn_id}_' - f'{self.intermediate_suffix}{output_type}') - else: - output_model.meta.filename = ( - f'{output_root}_' - f'{self.intermediate_suffix}{output_type}') - input_models.shelve(example_image, indices[0], modify=False) - del example_image - - # Initialize the output with the wcs - driz = gwcs_drizzle.GWCSDrizzle(output_model, pixfrac=self.pixfrac, - kernel=self.kernel, fillval=self.fillval) - - log.info(f"{len(indices)} exposures to drizzle together") - for index in indices: - img = input_models.borrow(index) - if isinstance(img, datamodels.SlitModel): - # must call this explicitly to populate area extension - # although the existence of this extension may not be necessary - img.area = img.area - iscale = self._get_intensity_scale(img) - log.debug(f'Using intensity scale iscale={iscale}') - - inwht = resample_utils.build_driz_weight( - img, - weight_type=self.weight_type, - good_bits=self.good_bits - ) - - # apply sky subtraction - blevel = img.meta.background.level - if not img.meta.background.subtracted and blevel is not None: - data = img.data - blevel - else: - data = img.data - - xmin, xmax, ymin, ymax = resample_utils._resample_range( - data.shape, - img.meta.wcs.bounding_box - ) - - driz.add_image( - data, - img.meta.wcs, - iscale=iscale, - inwht=inwht, - xmin=xmin, - xmax=xmax, - ymin=ymin, - ymax=ymax - ) - del data - input_models.shelve(img, index, modify=False) - del img + + output_model = self.resample_group(input_models, indices) if not self.in_memory: + # Here, write out model to disk in a way that is immediately useful to create_median + # using the outlier detection utils I just wrote. Have this return the median computer + # back to outlier detection + # This does mean weights need to be handled here, though... + # Write out model to disk, then return filename + # only do this write to disk if save_intermediate_results is True + # this write should occur before handling weights output_name = output_model.meta.filename if self.output_dir is not None: output_name = os.path.join(self.output_dir, output_name) @@ -342,9 +366,7 @@ def resample_many_to_many(self, input_models): log.info(f"Saved model in {output_name}") output_models.append(output_name) else: - output_models.append(output_model.copy()) - output_model.data *= 0. - output_model.wht *= 0. 
+ output_models.append(output_model.data) if not self.in_memory: # build ModelLibrary as an association from the output files From 50ece826419193bbdb939a7238c72f0175845699 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 13:53:14 -0400 Subject: [PATCH 12/34] typo fix --- jwst/outlier_detection/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 990d0dd90a..c215337b7f 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -249,7 +249,7 @@ def _get_buffer_indices(self, buffer_size=None): section_nrows = 1 nsections = int(np.ceil(imrows / section_nrows)) - log.info(f"Computing median over {self._expected_nframes} images in {nsections}" + log.info(f"Computing median over {self._expected_nframes} images in {nsections} " f"sections with total memory buffer {buffer_size / _ONE_MB} MB") return nsections, section_nrows From 5400560dd0c37e26de8aace713e3cd076efe4c21 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 16:57:41 -0400 Subject: [PATCH 13/34] first draft extending refactor to spectroscopic mode --- jwst/outlier_detection/imaging.py | 143 ++++-------------- jwst/outlier_detection/spec.py | 101 ++++--------- .../tests/test_outlier_detection.py | 41 ++--- jwst/outlier_detection/utils.py | 125 ++++++++++----- 4 files changed, 166 insertions(+), 244 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 964c0e3c3b..e8f7be0d19 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -2,22 +2,14 @@ Submodule for performing outlier detection on imaging data. """ -import copy import logging import os -import numpy as np -from stdatamodels.jwst import datamodels - from jwst.datamodels import ModelLibrary from jwst.resample import resample -from jwst.resample.resample_utils import build_driz_weight from jwst.stpipe.utilities import record_step_status -from stcal.outlier_detection.utils import compute_weight_threshold - -from .utils import nanmedian3D, flag_model_crs, flag_resampled_model_crs, OnDiskMedian -from ._fileio import save_median +from .utils import flag_model_crs, flag_resampled_model_crs, drizzle_and_median log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -69,115 +61,32 @@ def detect_outliers( input_models.shelve(example_model, modify=False) del example_model output_path = os.path.dirname(output_path) + + # Start by creating resampled/mosaic images for + # each group of exposures + resamp = resample.ResampleData( + input_models, + output=output_path, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + allowed_memory=allowed_memory, + ) + + median_data, median_wcs = drizzle_and_median(input_models, + resamp, + maskpt, + make_output_path, + resample_data=resample_data, + in_memory=in_memory, + save_intermediate_results=save_intermediate_results) - if resample_data: - # Start by creating resampled/mosaic images for - # each group of exposures - resamp = resample.ResampleData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - allowed_memory=allowed_memory, - ) - median_wcs = resamp.output_wcs - - j = 0 - for group_id, indices in 
input_models.group_indices.items(): - - # single is hardcoded to True above so it's ok to just call resample_group directly - drizzled_model = resamp.resample_group(input_models, indices) - if j == 0: - ngroups = len(input_models.group_names) - full_shape = (ngroups,) + drizzled_model.data.shape - if in_memory: - data_frames = np.empty(full_shape, dtype=np.float32) - else: - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=None) - - if save_intermediate_results: - output_name = drizzled_model.meta.filename - - if resamp.output_dir is not None: - output_name = os.path.join(resamp.output_dir, output_name) - drizzled_model.save(output_name) - log.info(f"Saved model in {output_name}") - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - - if in_memory: - data_frames[j] = drizzled_model.data - else: - # write spatial sections to disk - median_computer.add_image(drizzled_model.data) - j += 1 - - else: - # for non-dithered data, the resampled image is just the original image - with input_models: - for i, drizzled_model in enumerate(input_models): - drizzled_model.wht = build_driz_weight( - drizzled_model, - weight_type=weight_type, - good_bits=good_bits) - input_models.shelve(drizzled_model, modify=True) - - if i == 0: - # copy for when saving median and input is a filename? - median_wcs = copy.deepcopy(drizzled_model.meta.wcs) - full_shape = (len(input_models),) + drizzled_model.data.shape - if in_memory: - data_frames = np.empty(full_shape, dtype=np.float32) - else: - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=None) - - if save_intermediate_results: - output_name = drizzled_model.meta.filename - if output_path is not None: - output_name = os.path.join(output_path, output_name) - drizzled_model.save(output_name) - log.info(f"Saved model in {output_name}") - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - - if in_memory: - data_frames[i] = drizzled_model.data - else: - # write spatial sections to disk - median_computer.add_image(drizzled_model.data) - - - # Perform median combination on set of drizzled mosaics - if in_memory: - median_data = nanmedian3D(data_frames) - del data_frames - else: - median_data = median_computer.compute_median() - median_computer.cleanup() - - if save_intermediate_results: - # make a median model - median_model = datamodels.ImageModel(median_data) - median_model.update(drizzled_model) - median_model.meta.wcs = median_wcs - save_median(median_model, make_output_path, asn_id) - del median_model - del drizzled_model # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index ad8e712d4c..0a234f37ba 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -1,16 +1,13 @@ """ Submodule for performing outlier detection on spectra. 
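Imaging above and spectroscopy below now funnel through the same helper; the shared call pattern, as it stands in this patch (resamp must have been constructed with single=True), is:

    median_data, median_wcs = drizzle_and_median(
        input_models,                    # ModelLibrary of input exposures
        resamp,                          # ResampleData or ResampleSpecData
        maskpt,                          # weight-threshold fraction
        resample_data=resample_data,     # False: weight the originals, skip drizzling
        save_intermediate_results=save_intermediate_results,
        make_output_path=make_output_path,
    )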
""" -import copy import os -from stdatamodels.jwst import datamodels from jwst.datamodels import ModelContainer, ModelLibrary from jwst.stpipe.utilities import record_step_status -from ..resample import resample_spec, resample_utils -from .utils import create_median, flag_crs_in_models, flag_crs_in_models_with_resampling -from ._fileio import remove_file +from ..resample import resample_spec +from .utils import flag_crs_in_models, flag_crs_in_models_with_resampling, drizzle_and_median import logging log = logging.getLogger(__name__) @@ -53,69 +50,37 @@ def detect_outliers( record_step_status(input_models, "outlier_detection", False) return input_models - if resample_data is True: - # Start by creating resampled/mosaic images for - # each group of exposures - output_path = make_output_path( - basepath=input_models[0].meta.filename, suffix='') - output_path = os.path.dirname(output_path) - resamp = resample_spec.ResampleSpecData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - ) - median_wcs = resamp.output_wcs - - # convert to library for resample - # for compatibility with image3 pipeline - library = ModelLibrary(input_models, on_disk=False) - drizzled_models = resamp.do_drizzle(library) - - else: - drizzled_models = ModelLibrary(input_models) - with drizzled_models: - for i, model in enumerate(drizzled_models): - model.wht = resample_utils.build_driz_weight( - input_models[i], - weight_type=weight_type, - good_bits=good_bits) - drizzled_models.shelve(model) - # copy for when saving median and input is a filename? - median_wcs = copy.deepcopy(input_models[0].meta.wcs) - - # Perform median combination on set of drizzled mosaics - median_data = create_median(drizzled_models, maskpt) - - if save_intermediate_results: - # Initialize intermediate products used in the outlier detection - median_model = datamodels.ImageModel(median_data) - with drizzled_models: - example_model = drizzled_models.borrow(0) - drizzled_models.shelve(example_model, 0, modify=False) - median_model.meta = example_model.meta - median_model.meta.filename = make_output_path( - basepath=input_models[0].meta.filename, - suffix='median' - ) - - log.info("Writing out MEDIAN image to: {}".format( - median_model.meta.filename)) - median_model.save(median_model.meta.filename) - del median_model - else: - # since we're not saving intermediate results if the drizzled models - # were written to disk, remove them - if not in_memory: - for fn in drizzled_models._members: - remove_file(fn["expname"]) + # Start by creating resampled/mosaic images for + # each group of exposures + output_path = make_output_path( + basepath=input_models[0].meta.filename, suffix='') + output_path = os.path.dirname(output_path) + resamp = resample_spec.ResampleSpecData( + input_models, + output=output_path, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + ) + median_wcs = resamp.output_wcs + + # convert to library for resample + # for compatibility with image3 pipeline + library = ModelLibrary(input_models, on_disk=False) + + median_data, median_wcs = drizzle_and_median(library, + resamp, + maskpt, + make_output_path, + resample_data=resample_data, + in_memory=False, + save_intermediate_results=save_intermediate_results) # Perform outlier detection using 
statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 4aea9f9748..eca74212e4 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -10,7 +10,7 @@ from jwst.assign_wcs import AssignWcsStep from jwst.assign_wcs.pointing import create_fitswcs from jwst.outlier_detection import OutlierDetectionStep -from jwst.outlier_detection.utils import _flag_resampled_model_crs, create_median +from jwst.outlier_detection.utils import _flag_resampled_model_crs from jwst.outlier_detection.outlier_detection_step import ( IMAGE_MODES, TSO_SPEC_MODES, @@ -595,26 +595,27 @@ def test_outlier_step_weak_cr_tso(exptype, tsovisit): assert result.dq[cr_timestep, 12, 12] == OUTLIER_DO_NOT_USE -def test_create_median(three_sci_as_asn, tmp_cwd): - """Test creation of median on disk vs in memory""" - lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) - lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) +# def test_drizzle_and_median(three_sci_as_asn, tmp_cwd): +# """Test creation of median on disk vs in memory""" +# # FIXME: update this test +# lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) +# lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) - # make this test meaningful w.r.t. handling of weights - with (lib_on_disk, lib_in_memory): - for lib in [lib_on_disk, lib_in_memory]: - for model in lib: - model.wht = np.ones_like(model.data) - model.wht[4,9] = 0.5 - lib.shelve(model, modify=True) +# # make this test meaningful w.r.t. handling of weights +# with (lib_on_disk, lib_in_memory): +# for lib in [lib_on_disk, lib_in_memory]: +# for model in lib: +# model.wht = np.ones_like(model.data) +# model.wht[4,9] = 0.5 +# lib.shelve(model, modify=True) - # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels - # arbitrarily use 5 times that - buffer_size = 4 * 20 * 5 - median_on_disk = create_median(lib_on_disk, 0.7, buffer_size=buffer_size) - median_in_memory = create_median(lib_in_memory, 0.7) +# # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels +# # arbitrarily use 5 times that +# buffer_size = 4 * 20 * 5 +# median_on_disk = drizzle_and_median(lib_on_disk, 0.7, buffer_size=buffer_size) +# median_in_memory = drizzle_and_median(lib_in_memory, 0.7) - assert np.isnan(median_in_memory[4,9]) +# assert np.isnan(median_in_memory[4,9]) - # Make sure the median library is the same for on-disk and in-memory - assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) +# # Make sure the median library is the same for on-disk and in-memory +# assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index c215337b7f..5f3393233e 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -2,6 +2,8 @@ The ever-present utils sub-module. A home for all... 
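For reference, the timewise median computed throughout these helpers is a per-pixel median across the exposure axis with NaNs excluded; a tiny self-contained demonstration:

    import warnings
    import numpy as np

    cube = np.array([[[1.0, np.nan], [2.0, 5.0]],
                     [[3.0, np.nan], [4.0, 5.0]],
                     [[5.0, np.nan], [6.0, 5.0]]], dtype=np.float32)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="All-NaN slice encountered",
                                category=RuntimeWarning)
        median = np.nanmedian(cube, axis=0)
    # median == [[3., nan], [4., 5.]]; the all-NaN pixel stays NaN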
""" +import os +import copy import warnings import numpy as np import tempfile @@ -9,8 +11,10 @@ from jwst.lib.pipe_utils import match_nans_and_flags from jwst.resample.resample import compute_image_pixel_area +from jwst.resample.resample_utils import build_driz_weight from stcal.outlier_detection.utils import compute_weight_threshold, gwcs_blot, flag_crs, flag_resampled_crs from stdatamodels.jwst import datamodels +from ._fileio import save_median import logging log = logging.getLogger(__name__) @@ -56,54 +60,97 @@ def create_cube_median(cube_model, maskpt): overwrite_input=False) -def create_median(resampled_models, maskpt, buffer_size=None): - """Create a median image from the singly resampled images. - +def drizzle_and_median(input_models, + resamp, + maskpt, + make_output_path, + resample_data=False, + in_memory=True, + save_intermediate_results=False): + """ Parameters ---------- - resampled_models : ModelLibrary - The singly resampled images. + input_models : ModelLibrary + The input datamodels + + resamp : ResampleData object maskpt : float - The weight threshold for masking out low weight pixels. - Returns - ------- - median_image : ndarray - The median image. + make_output_path : ? """ - on_disk = resampled_models._on_disk - - # Compute weight threshold for each input model and set NaN in data where weight is below threshold - with resampled_models: - for i, resampled in enumerate(resampled_models): - weight_threshold = compute_weight_threshold(resampled.wht, maskpt) - resampled.data[resampled.wht < weight_threshold] = np.nan - if not on_disk: - if i == 0: - model_cube = np.empty((len(resampled_models),) + resampled.data.shape, - dtype=resampled.data.dtype) - model_cube[i] = resampled.data + if not resamp.single: + raise ValueError("drizzle_and_median should only be used for resample_many_to_many") + if resample_data: + indices_by_group = list(input_models.group_indices.values()) + else: + indices_by_group = [[i] for i in range(len(input_models))] + + with input_models: + for i, indices in enumerate(indices_by_group): + + if resample_data: + median_wcs = resamp.output_wcs + + drizzled_model = resamp.resample_group(input_models, indices) + else: - if i == 0: - # set up temporary storage for spatial sections of all input models - shp = (len(resampled_models),) + resampled.data.shape - median_computer = OnDiskMedian(shp, - dtype=resampled.data.dtype, - buffer_size=buffer_size) - # write spatial sections to disk - median_computer.add_image(resampled.data) - resampled_models.shelve(resampled, i, modify=False) - del resampled - - if not on_disk: - # easier case: all models in library can be loaded into memory at once - return nanmedian3D(model_cube) + # for non-dithered data, the resampled image is just the original image + drizzled_model = input_models.borrow(i) + drizzled_model.wht = build_driz_weight( + drizzled_model, + weight_type=resamp.weight_type, + good_bits=resamp.good_bits) + input_models.shelve(drizzled_model, i, modify=True) + median_wcs = copy.deepcopy(drizzled_model.meta.wcs) + + if i == 0: + # allocate either temporary storage or memory for data arrays that go into median + ngroups = len(indices_by_group) + full_shape = (ngroups,) + drizzled_model.data.shape + if in_memory: + data_frames = np.empty(full_shape, dtype=np.float32) + else: + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=None) + + if save_intermediate_results: + # write the drizzled model to file + output_name = drizzled_model.meta.filename + if resamp.output_dir 
is not None: + output_name = os.path.join(resamp.output_dir, output_name) + drizzled_model.save(output_name) + log.info(f"Saved model in {output_name}") + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + # put drizzled models into either storage or memory for median + if in_memory: + data_frames[i] = drizzled_model.data + else: + median_computer.add_image(drizzled_model.data) + + # Perform median combination on set of drizzled mosaics + if in_memory: + median_data = nanmedian3D(data_frames) + del data_frames else: - # retrieve spatial sections from disk one by one and compute timewise median - med = median_computer.compute_median() + median_data = median_computer.compute_median() median_computer.cleanup() - return med + + if save_intermediate_results: + # make a median model + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs + save_median(median_model, make_output_path, resamp.asn_id) + del median_model + del drizzled_model + + return median_data, median_wcs class DiskAppendableArray: From f4af4324dd223ea33693f254830db7c1cc846eda Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 18 Sep 2024 17:44:19 -0400 Subject: [PATCH 14/34] added unit tests for drizzle_and_median --- .../tests/test_outlier_detection.py | 108 ++++++++++++++---- jwst/outlier_detection/utils.py | 9 +- 2 files changed, 90 insertions(+), 27 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index eca74212e4..584425e952 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -4,6 +4,7 @@ from glob import glob import os +from gwcs.wcs import WCS from stdatamodels.jwst import datamodels from jwst.datamodels import ModelContainer, ModelLibrary @@ -18,6 +19,8 @@ CORON_IMAGE_MODES, ) from jwst.resample.tests.test_resample_step import miri_rate_model +from jwst.outlier_detection.utils import drizzle_and_median +from jwst.resample.resample import ResampleData OUTLIER_DO_NOT_USE = np.bitwise_or( datamodels.dqflags.pixel["DO_NOT_USE"], datamodels.dqflags.pixel["OUTLIER"] @@ -595,27 +598,84 @@ def test_outlier_step_weak_cr_tso(exptype, tsovisit): assert result.dq[cr_timestep, 12, 12] == OUTLIER_DO_NOT_USE -# def test_drizzle_and_median(three_sci_as_asn, tmp_cwd): -# """Test creation of median on disk vs in memory""" -# # FIXME: update this test -# lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) -# lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) - -# # make this test meaningful w.r.t. 
handling of weights -# with (lib_on_disk, lib_in_memory): -# for lib in [lib_on_disk, lib_in_memory]: -# for model in lib: -# model.wht = np.ones_like(model.data) -# model.wht[4,9] = 0.5 -# lib.shelve(model, modify=True) - -# # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels -# # arbitrarily use 5 times that -# buffer_size = 4 * 20 * 5 -# median_on_disk = drizzle_and_median(lib_on_disk, 0.7, buffer_size=buffer_size) -# median_in_memory = drizzle_and_median(lib_in_memory, 0.7) - -# assert np.isnan(median_in_memory[4,9]) - -# # Make sure the median library is the same for on-disk and in-memory -# assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) +def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): + """Test creation of median on disk vs in memory""" + lib_on_disk = ModelLibrary(three_sci_as_asn, on_disk=True) + lib_in_memory = ModelLibrary(three_sci_as_asn, on_disk=False) + + # make this test meaningful w.r.t. handling of weights + with (lib_on_disk, lib_in_memory): + for lib in [lib_on_disk, lib_in_memory]: + for model in lib: + model.var_rnoise = np.ones_like(model.data) + model.var_rnoise[4,9] = 2.0 + lib.shelve(model, modify=True) + + + # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels + # arbitrarily use 5 times that + buffer_size = 4 * 20 * 5 + median_on_disk, _ = drizzle_and_median(lib_on_disk, + make_resamp(lib_on_disk, False), + 0.7, + buffer_size=buffer_size, + resample_data=False) + median_in_memory, _ = drizzle_and_median(lib_in_memory, + make_resamp(lib_in_memory, False), + 0.7, + buffer_size=buffer_size, + resample_data=False) + + assert np.isnan(median_in_memory[4,9]) + + # Make sure the median library is the same for on-disk and in-memory + assert np.allclose(median_on_disk, median_in_memory, equal_nan=True) + + +def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): + lib = ModelLibrary(three_sci_as_asn, on_disk=False) + + resamp = make_resamp(lib, True) + median, wcs = drizzle_and_median(lib, + resamp, + 0.7, + resample_data=True) + + assert isinstance(wcs, WCS) + assert median.shape == (21,20) + + with pytest.raises(ValueError): + # should fail if save_intermediate results but no output path + drizzle_and_median(lib, + resamp, + 0.7, + make_output_path=None, + save_intermediate_results=True) + + resamp.single = False + with pytest.raises(ValueError): + # should fail if try to call when resamp.single is False + drizzle_and_median(lib, + resamp, + 0.7, + make_output_path=None, + save_intermediate_results=True) + + +def make_resamp(input_models, in_memory): + """All defaults are same as what is run by default by outlier detection""" + resamp = ResampleData( + input_models, + output="", + single=True, + blendheaders=False, + wht_type="ivm", + pixfrac=1.0, + kernel="square", + fillval="INDEF", + good_bits="~DO_NOT_USE", + in_memory=in_memory, + asn_id="test", + allowed_memory=None, + ) + return resamp \ No newline at end of file diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 5f3393233e..042f46d1e6 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -63,10 +63,11 @@ def create_cube_median(cube_model, maskpt): def drizzle_and_median(input_models, resamp, maskpt, - make_output_path, + make_output_path=None, resample_data=False, in_memory=True, - save_intermediate_results=False): + save_intermediate_results=False, + buffer_size=None): """ Parameters ---------- @@ -79,6 +80,8 @@ def drizzle_and_median(input_models, 
make_output_path : ? """ + if save_intermediate_results and (make_output_path is None): + raise ValueError("make_output_path is required if save_intermediate_results is True") if not resamp.single: raise ValueError("drizzle_and_median should only be used for resample_many_to_many") if resample_data: @@ -113,7 +116,7 @@ def drizzle_and_median(input_models, else: median_computer = OnDiskMedian(full_shape, dtype=drizzled_model.data.dtype, - buffer_size=None) + buffer_size=buffer_size) if save_intermediate_results: # write the drizzled model to file From 13a116154cbe9985e65bac1a87172d672b629530 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 19 Sep 2024 11:28:47 -0400 Subject: [PATCH 15/34] cleanup from self-review plus more unit tests --- CHANGES.rst | 2 + jwst/outlier_detection/_fileio.py | 8 --- jwst/outlier_detection/imaging.py | 5 +- jwst/outlier_detection/spec.py | 6 +- .../tests/test_outlier_detection.py | 2 +- jwst/outlier_detection/tests/test_utils.py | 30 +++++++++- jwst/outlier_detection/utils.py | 55 +++++++++++++------ jwst/resample/resample.py | 7 --- 8 files changed, 73 insertions(+), 42 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 043480cea5..3c580c3f62 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -204,6 +204,8 @@ resample - Ensure that NaNs and DO_NOT_USE flags match up in all input data before resampling. [#8557] +- Permit creating drizzled models one at a time in many-to-many mode. [#8782] + resample_spec ------------- diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 6975ac1717..53dbb8bc6f 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,16 +1,8 @@ -import os - import logging log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) -def remove_file(fn): - if isinstance(fn, str) and os.path.isfile(fn): - os.remove(fn) - log.info(f"Removing file {fn}") - - def save_median(median_model, make_output_path, asn_id=None): ''' Save median if requested by user diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index e8f7be0d19..89cf232648 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -82,10 +82,9 @@ def detect_outliers( median_data, median_wcs = drizzle_and_median(input_models, resamp, maskpt, - make_output_path, resample_data=resample_data, - in_memory=in_memory, - save_intermediate_results=save_intermediate_results) + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) # Perform outlier detection using statistical comparisons between diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index 0a234f37ba..05f7fd204f 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -68,7 +68,6 @@ def detect_outliers( in_memory=in_memory, asn_id=asn_id, ) - median_wcs = resamp.output_wcs # convert to library for resample # for compatibility with image3 pipeline @@ -77,10 +76,9 @@ def detect_outliers( median_data, median_wcs = drizzle_and_median(library, resamp, maskpt, - make_output_path, resample_data=resample_data, - in_memory=False, - save_intermediate_results=save_intermediate_results) + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path) # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py 
b/jwst/outlier_detection/tests/test_outlier_detection.py
index 584425e952..7f6cf421c4 100644
--- a/jwst/outlier_detection/tests/test_outlier_detection.py
+++ b/jwst/outlier_detection/tests/test_outlier_detection.py
@@ -234,7 +234,7 @@ def test_outlier_step_base(we_three_sci, tmp_cwd):
             lambda model, index: model.data.copy(), modify=False))
 
     result = OutlierDetectionStep.call(
-        container, save_results=True, save_intermediate_results=True,
+        container, save_results=True, save_intermediate_results=True
     )
 
     with result:
diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py
index 6e5cbb21bd..b64f6f91f7 100644
--- a/jwst/outlier_detection/tests/test_utils.py
+++ b/jwst/outlier_detection/tests/test_utils.py
@@ -5,6 +5,9 @@
 import os
 
 
+# filter warnings such that "ResourceWarning: Implicitly cleaning up" is raised as error
+# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array(tmp_cwd):
 
     slice_shape = (8,7)
@@ -49,6 +52,7 @@ def test_disk_appendable_array(tmp_cwd):
     assert len(os.listdir(tempdir)) == 0
 
 
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array_bad_inputs(tmp_cwd):
 
     slice_shape = (8,7)
@@ -71,6 +75,7 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd):
         DiskAppendableArray(slice_shape, "float3", tempdir)
 
 
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_on_disk_median(tmp_cwd):
 
     library_length = 3
@@ -133,6 +138,27 @@ def test_on_disk_median(tmp_cwd):
     assert len(os.listdir(os.getcwd())) == 1 # this is the "tmptest" directory
 
 
+@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_on_disk_median_bad_inputs(tmp_cwd):
-    # FIXME
-    pass
\ No newline at end of file
+
+    library_length = 3
+    frame_shape = (21,20)
+    dtype = 'float32'
+    tempdir = tmp_cwd / Path("tmptest")
+    os.mkdir(tempdir)
+    shape = (library_length,) + frame_shape
+
+    with pytest.raises(ValueError):
+        OnDiskMedian(frame_shape, dtype=dtype, tempdir=tempdir)
+
+    with pytest.raises(TypeError):
+        OnDiskMedian(shape, dtype="float3", tempdir=tempdir)
+
+    with pytest.raises(FileNotFoundError):
+        OnDiskMedian(shape, dtype="float32", tempdir="dne")
+
+    # unreasonable buffer size will get set to minimum
+    min_buffer = np.dtype(dtype).itemsize*frame_shape[1]*library_length
+    median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir, buffer_size=-1)
+    assert median_computer.buffer_size == min_buffer
+    median_computer.cleanup()
\ No newline at end of file
diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py
index 042f46d1e6..58c553d984 100644
--- a/jwst/outlier_detection/utils.py
+++ b/jwst/outlier_detection/utils.py
@@ -36,16 +36,15 @@ def nanmedian3D(cube, overwrite_input=True):
     that only changes the dtype of the output but not the internal precision of nanmedian.
     It appears to be impossible to stop nanmedian from computing with
     at least 64-bit precision internally.
-    Pre-allocating the output array, i.e., out=np.empty(cube.shape[1:], dtype=np.float32)
+    Pre-allocating the output array with `out=np.empty(cube.shape[1:], dtype=np.float32)`
     actually increases memory usage a bit, so it's better to just convert after the fact.
- Returning float64 here causes gwcs_blot to segfault down the line + Failing to return float32 here causes gwcs_blot to segfault down the line because that function is hard-coded to expect float32. """ with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input).astype(np.float32) @@ -63,30 +62,51 @@ def create_cube_median(cube_model, maskpt): def drizzle_and_median(input_models, resamp, maskpt, - make_output_path=None, resample_data=False, - in_memory=True, save_intermediate_results=False, + make_output_path=None, buffer_size=None): """ + Shared code between imaging and spec modes for resampling and median computation + Parameters ---------- input_models : ModelLibrary - The input datamodels + The input datamodels. - resamp : ResampleData object + resamp : resample.resample.ResampleData object + The controlling object for the resampling process. maskpt : float + The weight threshold for masking out low weight pixels. + + resample_data : bool + Whether or not to do resampling. + + save_intermediate_results : bool + if True, save the drizzled models and median model to fits. + + make_output_path : function + The functools.partial instance to pass to save_median. Must be + specified if save_intermediate_results is True. Default None. - make_output_path : ? + buffer_size : int + The size of chunk in bytes that will be read into memory when computing the median. + This parameter has no effect if the input library has its on_disk attribute + set to False. """ + + # validate inputs if save_intermediate_results and (make_output_path is None): raise ValueError("make_output_path is required if save_intermediate_results is True") if not resamp.single: raise ValueError("drizzle_and_median should only be used for resample_many_to_many") + in_memory = not input_models._on_disk + if resample_data: indices_by_group = list(input_models.group_indices.values()) else: + # treat each input model as if it is the only member of its group indices_by_group = [[i] for i in range(len(input_models))] with input_models: @@ -94,9 +114,7 @@ def drizzle_and_median(input_models, if resample_data: median_wcs = resamp.output_wcs - drizzled_model = resamp.resample_group(input_models, indices) - else: # for non-dithered data, the resampled image is just the original image drizzled_model = input_models.borrow(i) @@ -105,15 +123,17 @@ def drizzle_and_median(input_models, weight_type=resamp.weight_type, good_bits=resamp.good_bits) input_models.shelve(drizzled_model, i, modify=True) + # copy for when saving median and input is a filename? 
median_wcs = copy.deepcopy(drizzled_model.meta.wcs) if i == 0: - # allocate either temporary storage or memory for data arrays that go into median ngroups = len(indices_by_group) full_shape = (ngroups,) + drizzled_model.data.shape if in_memory: + # allocate memory for data arrays that go into median data_frames = np.empty(full_shape, dtype=np.float32) else: + # set up temporary storage for data arrays that go into median median_computer = OnDiskMedian(full_shape, dtype=drizzled_model.data.dtype, buffer_size=buffer_size) @@ -130,10 +150,11 @@ def drizzle_and_median(input_models, weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - # put drizzled models into either storage or memory for median if in_memory: + # populate pre-allocated memory with the drizzled data data_frames[i] = drizzled_model.data else: + # distribute the drizzled data into the temporary storage median_computer.add_image(drizzled_model.data) # Perform median combination on set of drizzled mosaics @@ -145,7 +166,7 @@ def drizzle_and_median(input_models, median_computer.cleanup() if save_intermediate_results: - # make a median model + # Save median model to fits median_model = datamodels.ImageModel(median_data) median_model.update(drizzled_model) median_model.meta.wcs = median_wcs @@ -221,7 +242,7 @@ def read(self): def cleanup(self): - Path.unlink(self._filename) + Path.unlink(Path(self._filename)) return @@ -287,16 +308,16 @@ def _get_buffer_indices(self, buffer_size=None): imrows, imcols = self.frame_shape if buffer_size is None: buffer_size = imrows * imcols * self.itemsize - self.buffer_size = buffer_size per_model_buffer_size = buffer_size / self._expected_nframes min_buffer_size = imcols * self.itemsize section_nrows = min(imrows, int(per_model_buffer_size // min_buffer_size)) - if section_nrows == 0: + if section_nrows <= 0: buffer_size = min_buffer_size * self._expected_nframes log.warning("Buffer size is too small to hold a single row." f"Increasing buffer size to {buffer_size / _ONE_MB}MB") section_nrows = 1 + self.buffer_size = buffer_size nsections = int(np.ceil(imrows / section_nrows)) log.info(f"Computing median over {self._expected_nframes} images in {nsections} " @@ -340,7 +361,7 @@ def _validate_data(self, data): def cleanup(self): """Remove the temporary files and directory when finished""" [arr.cleanup() for arr in self._temp_arrays] - Path.rmdir(self._temp_path) + self._temp_dir.cleanup() return diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index e2297db25d..393d6f55d9 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -351,14 +351,7 @@ def resample_many_to_many(self, input_models): output_model = self.resample_group(input_models, indices) if not self.in_memory: - # Here, write out model to disk in a way that is immediately useful to create_median - # using the outlier detection utils I just wrote. Have this return the median computer - # back to outlier detection - # This does mean weights need to be handled here, though... 
- # Write out model to disk, then return filename - # only do this write to disk if save_intermediate_results is True - # this write should occur before handling weights output_name = output_model.meta.filename if self.output_dir is not None: output_name = os.path.join(self.output_dir, output_name) From 4a19c7c90444a11b04fd360536c6fcc1beb4639f Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 19 Sep 2024 15:01:47 -0400 Subject: [PATCH 16/34] small fix to test --- .../tests/test_outlier_detection.py | 14 ++++++++------ jwst/outlier_detection/tests/test_utils.py | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 7f6cf421c4..141b3bce1b 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -616,16 +616,17 @@ def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): # arbitrarily use 5 times that buffer_size = 4 * 20 * 5 median_on_disk, _ = drizzle_and_median(lib_on_disk, - make_resamp(lib_on_disk, False), + make_resamp(lib_on_disk), 0.7, buffer_size=buffer_size, resample_data=False) median_in_memory, _ = drizzle_and_median(lib_in_memory, - make_resamp(lib_in_memory, False), + make_resamp(lib_in_memory), 0.7, buffer_size=buffer_size, resample_data=False) + # Make sure the high-variance (low-weight) pixel is set to NaN assert np.isnan(median_in_memory[4,9]) # Make sure the median library is the same for on-disk and in-memory @@ -635,7 +636,7 @@ def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): lib = ModelLibrary(three_sci_as_asn, on_disk=False) - resamp = make_resamp(lib, True) + resamp = make_resamp(lib) median, wcs = drizzle_and_median(lib, resamp, 0.7, @@ -645,7 +646,7 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): assert median.shape == (21,20) with pytest.raises(ValueError): - # should fail if save_intermediate results but no output path + # ensure failure if save_intermediate results but no output path drizzle_and_median(lib, resamp, 0.7, @@ -654,7 +655,7 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): resamp.single = False with pytest.raises(ValueError): - # should fail if try to call when resamp.single is False + # ensure failure if try to call when resamp.single is False drizzle_and_median(lib, resamp, 0.7, @@ -662,8 +663,9 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd): save_intermediate_results=True) -def make_resamp(input_models, in_memory): +def make_resamp(input_models): """All defaults are same as what is run by default by outlier detection""" + in_memory = not input_models._on_disk resamp = ResampleData( input_models, output="", diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index b64f6f91f7..e20c181a03 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -66,11 +66,11 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd): # make the input directory os.mkdir(tempdir) - # test slice_shape is not 2-D + # ensure failure if slice_shape is not 2-D with pytest.raises(ValueError): DiskAppendableArray((3,5,7), dtype, tempdir) - # test dtype is not valid + # ensure failure if dtype is not valid with pytest.raises(TypeError): DiskAppendableArray(slice_shape, "float3", tempdir) @@ -157,7 +157,7 @@ def 
test_on_disk_median_bad_inputs(tmp_cwd): with pytest.raises(FileNotFoundError): OnDiskMedian(shape, dtype="float32", tempdir="dne") - # unreasonable buffer size will get set to minimum + # ensure unreasonable buffer size will get set to minimum reasonable buffer min_buffer = np.dtype(dtype).itemsize*frame_shape[1]*library_length median_computer = OnDiskMedian(shape, dtype=dtype, tempdir=tempdir, buffer_size=-1) assert median_computer.buffer_size == min_buffer From 04ac058817ac1140d0c1297e64f5550812ea6870 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 20 Sep 2024 15:51:23 -0400 Subject: [PATCH 17/34] save memory with nanmedian loop --- .../tests/test_algorithms.py | 10 +++++++ jwst/outlier_detection/utils.py | 30 +++++++++---------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/jwst/outlier_detection/tests/test_algorithms.py b/jwst/outlier_detection/tests/test_algorithms.py index b3b9255bb4..ba9af2661d 100644 --- a/jwst/outlier_detection/tests/test_algorithms.py +++ b/jwst/outlier_detection/tests/test_algorithms.py @@ -1,6 +1,7 @@ import numpy as np from jwst.outlier_detection.tso import moving_median_over_zeroth_axis +from jwst.outlier_detection.utils import nanmedian3D def test_rolling_median(): @@ -14,3 +15,12 @@ def test_rolling_median(): result = moving_median_over_zeroth_axis(arr, w) expected = expected_time_axis[:, np.newaxis, np.newaxis] * spatial_axis[np.newaxis, :, :] assert np.allclose(result, expected) + + +def test_nanmedian3D(): + + shp = (11, 50, 60) + cube = np.random.normal(shp) + med = nanmedian3D(cube) + assert np.allclose(med, np.nanmedian(cube)) + assert med.dtype == np.float32 \ No newline at end of file diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 58c553d984..31a8ba500e 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -27,25 +27,22 @@ def nanmedian3D(cube, overwrite_input=True): - """Convenience function to compute the median of a cube ignoring warnings, - setting memory-efficient flag, and specifying output type - - Notes - ----- - np.nanmedian returns float64 unless "out" parameter is specified, and specifying - that only changes the dtype of the output but not the internal precision of nanmedian. - It appears to be impossible to stop nanmedian from computing with - at least 64-bit precision internally. - Pre-allocating the output array with `out=np.empty(cube.shape[1:], dtype=np.float32)` - actually increases memory usage a bit, so it's better to just convert after the fact. - Failing to return float32 here causes gwcs_blot to segfault down the line - because that function is hard-coded to expect float32. + """Compute the median of a cube ignoring warnings and with memory efficiency. + np.nanmedian always uses at least 64-bit precision internally, and this is too + memory-intensive. Instead, loop over the median calculation to avoid the + memory usage of the internal upcasting and temporary array allocation. + The additional runtime of this loop is indistinguishable from zero, + but this loop decreases overall memory usage of the step by as much as half. 
""" with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", message="All-NaN slice encountered", category=RuntimeWarning) - return np.nanmedian(cube, axis=0, overwrite_input=overwrite_input).astype(np.float32) + output_arr = np.empty(cube.shape[1:], dtype=np.float32) + for i in range(output_arr.shape[0]): + # this for loop looks silly, but see docstring above + output_arr[i,:] = np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input).astype(np.float32) + return output_arr def create_cube_median(cube_model, maskpt): @@ -248,6 +245,9 @@ def cleanup(self): class OnDiskMedian: + # TODO: can/should this class inherit directly from tempfile.TemporaryDirectory? + # probably not, because it "has a" tempdir, it is not itself a tempdir + def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None): """ Set up temporary files to perform operations on a stack of 2-D input arrays @@ -320,7 +320,7 @@ def _get_buffer_indices(self, buffer_size=None): self.buffer_size = buffer_size nsections = int(np.ceil(imrows / section_nrows)) - log.info(f"Computing median over {self._expected_nframes} images in {nsections} " + log.info(f"Computing median over {self._expected_nframes} groups in {nsections} " f"sections with total memory buffer {buffer_size / _ONE_MB} MB") return nsections, section_nrows From 87b850407a3a31666e8f3ddb07f152d67e9aefc2 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Fri, 20 Sep 2024 16:41:39 -0400 Subject: [PATCH 18/34] changes in response to Brett's review --- .../tests/test_algorithms.py | 8 ++++--- jwst/outlier_detection/tests/test_utils.py | 14 +++++++++---- jwst/outlier_detection/utils.py | 21 +++++++++++-------- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/jwst/outlier_detection/tests/test_algorithms.py b/jwst/outlier_detection/tests/test_algorithms.py index ba9af2661d..d1c8ffd85d 100644 --- a/jwst/outlier_detection/tests/test_algorithms.py +++ b/jwst/outlier_detection/tests/test_algorithms.py @@ -20,7 +20,9 @@ def test_rolling_median(): def test_nanmedian3D(): shp = (11, 50, 60) - cube = np.random.normal(shp) + cube = np.random.normal(size=shp) + cube[5, 5:7, 5:8] = np.nan med = nanmedian3D(cube) - assert np.allclose(med, np.nanmedian(cube)) - assert med.dtype == np.float32 \ No newline at end of file + + assert med.dtype == np.float32 + assert np.allclose(med, np.nanmedian(cube, axis=0), equal_nan=True) \ No newline at end of file diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index e20c181a03..875b5060d7 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ b/jwst/outlier_detection/tests/test_utils.py @@ -14,11 +14,12 @@ def test_disk_appendable_array(tmp_cwd): dtype = "float32" tempdir = tmp_cwd / Path("tmptest") os.mkdir(tempdir) + fname = tempdir / "test.bin" - arr = DiskAppendableArray(slice_shape, dtype, tempdir) + arr = DiskAppendableArray(slice_shape, dtype, fname) # check temporary file setup - assert arr._filename.split("/")[-1] in os.listdir(tempdir) + assert str(arr._filename).split("/")[-1] in os.listdir(tempdir) assert len(os.listdir(tempdir)) == 1 assert arr.shape == (0,) + slice_shape @@ -58,20 +59,25 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd): slice_shape = (8,7) dtype = "float32" tempdir = tmp_cwd / Path("tmptest") + fname = "test.bin" # test input directory does not exist with pytest.raises(FileNotFoundError): - DiskAppendableArray(slice_shape, dtype, tempdir) + DiskAppendableArray(slice_shape, dtype, tempdir / fname) # make the input 
directory os.mkdir(tempdir) # ensure failure if slice_shape is not 2-D with pytest.raises(ValueError): - DiskAppendableArray((3,5,7), dtype, tempdir) + DiskAppendableArray((3,5,7), dtype, tempdir / fname) # ensure failure if dtype is not valid with pytest.raises(TypeError): + DiskAppendableArray(slice_shape, "float3", tempdir / fname) + + # ensure failure if pass directory instead of filename + with pytest.raises(IsADirectoryError): DiskAppendableArray(slice_shape, "float3", tempdir) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 31a8ba500e..258d2c8663 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -32,7 +32,8 @@ def nanmedian3D(cube, overwrite_input=True): memory-intensive. Instead, loop over the median calculation to avoid the memory usage of the internal upcasting and temporary array allocation. The additional runtime of this loop is indistinguishable from zero, - but this loop decreases overall memory usage of the step by as much as half. + but this loop cuts overall step memory usage roughly in half for at least one + test association. """ with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", @@ -192,7 +193,7 @@ class DiskAppendableArray: a small spatial segment of the full dataset. """ - def __init__(self, slice_shape, dtype, tempdir): + def __init__(self, slice_shape, dtype, filename): """ Parameters ---------- @@ -202,13 +203,14 @@ def __init__(self, slice_shape, dtype, tempdir): dtype : str The data type of the array. Must be a valid numpy array datatype. - tempdir : str - The directory in which to create the temporary files. - Default is the current working directory. + filename : str + The full file path in which to store the array """ if len(slice_shape) != 2: raise ValueError(f"Invalid slice_shape {slice_shape}. 
Only 2-D arrays are supported.") - self._temp_file, self._filename = tempfile.mkstemp(dir=tempdir) + self._filename = Path(filename) + with open(filename, "wb") as f: # noqa: F841 + pass self._slice_shape = slice_shape self._dtype = np.dtype(dtype) self._append_count = 0 @@ -226,7 +228,8 @@ def append(self, data): if data.dtype != self._dtype: raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") with open(self._filename, "ab") as f: - f.write(data.tobytes()) + data.tofile(f, sep="") + #f.write(data.tobytes()) self._append_count += 1 @@ -333,7 +336,7 @@ def _temparray_setup(self, dtype): if i == self.nsections - 1: # last section has whatever shape is left over shp = (self.frame_shape[0] - (self.nsections-1) * self.section_nrows, self.frame_shape[1]) - arr = DiskAppendableArray(shp, dtype, self._temp_path) + arr = DiskAppendableArray(shp, dtype, self._temp_path / f"{i}.bin") temp_arrays.append(arr) return temp_arrays @@ -360,7 +363,7 @@ def _validate_data(self, data): def cleanup(self): """Remove the temporary files and directory when finished""" - [arr.cleanup() for arr in self._temp_arrays] + #[arr.cleanup() for arr in self._temp_arrays] self._temp_dir.cleanup() return From 5b2d71a86ccf8844cefae9fdbf45f1e688f3a6a4 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Mon, 23 Sep 2024 14:41:01 -0400 Subject: [PATCH 19/34] towncrier changelog --- CHANGES.rst | 14 -------------- changes/8782.outlier_detection.rst | 1 + changes/8782.resample.rst | 1 + 3 files changed, 2 insertions(+), 14 deletions(-) create mode 100644 changes/8782.outlier_detection.rst create mode 100644 changes/8782.resample.rst diff --git a/CHANGES.rst b/CHANGES.rst index 3cd847985d..7500ad652f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,17 +1,3 @@ -1.16.1 (unreleased) -=================== - -outlier_detection ------------------ - -- Decrease the amount of file I/O required to compute the median when `in_memory` - is set to `False`. [#8782] - -resample --------- - -- Permit creating drizzled models one at a time in many-to-many mode. [#8782] - 1.16.0 (2024-09-20) =================== diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst new file mode 100644 index 0000000000..e63c0cf4f3 --- /dev/null +++ b/changes/8782.outlier_detection.rst @@ -0,0 +1 @@ +Decrease the amount of file I/O required to compute the median when in_memory is set to False. diff --git a/changes/8782.resample.rst b/changes/8782.resample.rst new file mode 100644 index 0000000000..05b3e87052 --- /dev/null +++ b/changes/8782.resample.rst @@ -0,0 +1 @@ +Permit creating drizzled models one at a time in many-to-many mode. 
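For orientation between the patches: after the refactor above, the append-then-read contract of DiskAppendableArray amounts to the following sketch, as exercised by the unit tests (the file name here is invented for illustration):

    import numpy as np
    from jwst.outlier_detection.utils import DiskAppendableArray

    arr = DiskAppendableArray((8, 7), "float32", "example.bin")
    for _ in range(3):
        # one 2-D slice per call; shape and dtype are checked on append
        arr.append(np.ones((8, 7), dtype=np.float32))
    stack = arr.read()  # load the whole stack back into memory
    assert stack.shape == (3, 8, 7)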
From 1e6f8dfecfd5f462302c55da223433ea0b24102d Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 09:10:29 -0400 Subject: [PATCH 20/34] suggestions from Melanie --- jwst/outlier_detection/utils.py | 2 +- jwst/resample/resample.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 258d2c8663..71a55c758c 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -376,7 +376,7 @@ def compute_median(self): output_rows = row_indices[-1][1] output_cols = self._temp_arrays[0].shape[2] - median_image = np.empty((output_rows, output_cols), self.dtype) * np.nan + median_image = np.full((output_rows, output_cols), np.nan, dtype=self.dtype) for i, disk_arr in enumerate(self._temp_arrays): row1, row2 = row_indices[i] diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index 393d6f55d9..f961b3f1e0 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -261,7 +261,7 @@ def resample_group(self, input_models, indices): indices : list """ - output_model = self.blank_output + output_model = self.blank_output.copy() copy_asn_info_from_library(input_models, output_model) @@ -331,10 +331,7 @@ def resample_group(self, input_models, indices): input_models.shelve(img, index, modify=False) del img - out = output_model.copy() - output_model.data *= 0. - output_model.wht *= 0. - return out + return output_model def resample_many_to_many(self, input_models): """Resample many inputs to many outputs where outputs have a common frame. From 4b0bac4ad02c417ae85da64ba76f1df0c41f9063 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 12:15:56 -0400 Subject: [PATCH 21/34] fixed file names for intermediate files, added regtest to cover it --- jwst/outlier_detection/_fileio.py | 16 ++++++++++------ jwst/outlier_detection/utils.py | 6 ++++++ jwst/regtest/test_nirspec_fs_spec3.py | 12 ++++++++++-- jwst/resample/resample.py | 2 +- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 53dbb8bc6f..828d21bece 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,3 +1,4 @@ +import re import logging log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -12,13 +13,16 @@ def save_median(median_model, make_output_path, asn_id=None): median_model : ~jwst.datamodels.ImageModel The median ImageModel or CubeModel to save ''' - default_suffix = "_outlier_i2d.fits" - if asn_id is None: - suffix_to_remove = default_suffix - else: - suffix_to_remove = f"_{asn_id}{default_suffix}" + # need unique slit ID for MultiSlitModel inputs to step + slit_id = getattr(median_model, "name", "").lower() + if slit_id: + slit_id = "_"+slit_id + # regex to handle i2d and s2d as though they are the same string + default_suffix = r"_outlier_.2d\.fits" + suffix_to_remove = default_suffix if asn_id is None else fr"_{asn_id}{default_suffix}" + basepath = re.sub(suffix_to_remove, ".fits", median_model.meta.filename) median_model_output_path = make_output_path( - basepath=median_model.meta.filename.replace(suffix_to_remove, '.fits'), + basepath=basepath, suffix='median') median_model.save(median_model_output_path) log.info(f"Saved model in {median_model_output_path}") diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 71a55c758c..2342186aec 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ 
-139,6 +139,9 @@ def drizzle_and_median(input_models, if save_intermediate_results: # write the drizzled model to file output_name = drizzled_model.meta.filename + # need unique intermediate filenames for slits in MultiSlitModels + if hasattr(drizzled_model, "name") and drizzled_model.name is not None: + output_name = output_name.replace("_outlier_", f"_{drizzled_model.name.lower()}_outlier_") if resamp.output_dir is not None: output_name = os.path.join(resamp.output_dir, output_name) drizzled_model.save(output_name) @@ -423,6 +426,9 @@ def flag_resampled_model_crs( if make_output_path is None: raise ValueError("make_output_path must be provided if save_blot is True") model_path = make_output_path(input_model.meta.filename, suffix='blot') + # for MultiSlitModels, need to insert the slit ID into the filename + if hasattr(input_model, "name") and input_model.name is not None: + model_path = model_path.replace("_blot", f"_{input_model.name.lower()}_blot") blot_model = _make_blot_model(input_model, blot) blot_model.meta.filename = model_path blot_model.save(model_path) diff --git a/jwst/regtest/test_nirspec_fs_spec3.py b/jwst/regtest/test_nirspec_fs_spec3.py index 4f3bca7edd..4fd2d62cf5 100644 --- a/jwst/regtest/test_nirspec_fs_spec3.py +++ b/jwst/regtest/test_nirspec_fs_spec3.py @@ -1,6 +1,7 @@ from astropy.io.fits.diff import FITSDiff import pytest import numpy as np +import os from gwcs import wcstools from jwst.stpipe import Step @@ -20,20 +21,27 @@ def run_pipeline(rtdata_module): # Run the calwebb_spec3 pipeline; save results from intermediate steps args = ["calwebb_spec3", rtdata.input, "--steps.outlier_detection.save_results=true", + "--steps.outlier_detection.save_intermediate_results=true", "--steps.resample_spec.save_results=true", "--steps.extract_1d.save_results=true"] Step.from_cmdline(args) @pytest.mark.bigdata -@pytest.mark.parametrize("suffix", ["cal", "crf", "s2d", "x1d"]) +@pytest.mark.parametrize("suffix", ["cal", "crf", "s2d", "x1d", "median"]) @pytest.mark.parametrize("source_id,slit_name", [("s000000001","s200a2"), ("s000000021","s200a1"), ("s000000023","s400a1"), ("s000000024","s1600a1"), ("s000000025","s200b1")]) def test_nirspec_fs_spec3(run_pipeline, rtdata_module, fitsdiff_default_kwargs, suffix, source_id, slit_name): """Test spec3 pipeline on a set of NIRSpec FS exposures.""" rtdata = rtdata_module - output = f"jw01309-o022_{source_id}_nirspec_f290lp-g395h-{slit_name}-allslits_{suffix}.fits" + if suffix == "median": + output = f"jw01309022001_04102_00002_nrs2_{slit_name}_{suffix}.fits" + # also ensure drizzled and blot models were created with the correct names + assert os.path.isfile(output.replace("median", "outlier_s2d")) + assert os.path.isfile(output.replace("median", "blot")) + else: + output = f"jw01309-o022_{source_id}_nirspec_f290lp-g395h-{slit_name}-allslits_{suffix}.fits" rtdata.output = output rtdata.get_truth(f"truth/test_nirspec_fs_spec3/{output}") diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index f961b3f1e0..795ecef9a1 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -356,7 +356,7 @@ def resample_many_to_many(self, input_models): log.info(f"Saved model in {output_name}") output_models.append(output_name) else: - output_models.append(output_model.data) + output_models.append(output_model) if not self.in_memory: # build ModelLibrary as an association from the output files From f2c7b5be757f5edf47cf1bb43ae4fd638e6950b9 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 15:46:34 -0400 Subject: 
[PATCH 22/34] refactor drizzle_and_median in response to @braingram --- jwst/outlier_detection/_fileio.py | 66 +++-- jwst/outlier_detection/imaging.py | 62 ++--- jwst/outlier_detection/spec.py | 66 ++--- .../tests/test_outlier_detection.py | 50 ++-- jwst/outlier_detection/tests/test_utils.py | 4 - jwst/outlier_detection/utils.py | 235 ++++++++++-------- 6 files changed, 269 insertions(+), 214 deletions(-) diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 828d21bece..6544fcdc70 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,10 +1,11 @@ import re import logging +from jwst.datamodels import ImageModel log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) -def save_median(median_model, make_output_path, asn_id=None): +def save_median(median_data, median_wcs, example_model, make_output_path=None): ''' Save median if requested by user @@ -13,16 +14,53 @@ def save_median(median_model, make_output_path, asn_id=None): median_model : ~jwst.datamodels.ImageModel The median ImageModel or CubeModel to save ''' - # need unique slit ID for MultiSlitModel inputs to step - slit_id = getattr(median_model, "name", "").lower() - if slit_id: - slit_id = "_"+slit_id - # regex to handle i2d and s2d as though they are the same string - default_suffix = r"_outlier_.2d\.fits" - suffix_to_remove = default_suffix if asn_id is None else fr"_{asn_id}{default_suffix}" - basepath = re.sub(suffix_to_remove, ".fits", median_model.meta.filename) - median_model_output_path = make_output_path( - basepath=basepath, - suffix='median') - median_model.save(median_model_output_path) - log.info(f"Saved model in {median_model_output_path}") + median_model = ImageModel(median_data) + median_model.update(example_model) + median_model.meta.wcs = median_wcs + output_path = intermediate_output_path(example_model, "median", make_output_path) + _save_to_path(median_model, output_path, suffix="median") + + +def save_drizzled(drizzled_model, make_output_path): + expected_tail = "outlier_?2d.fits" + suffix = drizzled_model.meta.filename[-len(expected_tail):-5] + output_path = intermediate_output_path(drizzled_model, suffix, make_output_path) + _save_to_path(drizzled_model, output_path, suffix=suffix) + + +def save_blot(input_model, blot, make_output_path): + + blot_model = _make_blot_model(input_model, blot) + output_path = intermediate_output_path(input_model, "blot", make_output_path, suffix_to_remove=r"_cal.*") + _save_to_path(blot_model, output_path, suffix="blot") + + +def _make_blot_model(input_model, blot): + blot_model = type(input_model)() + blot_model.data = blot + blot_model.update(input_model) + return blot_model + + +def intermediate_output_path(input_model, suffix, make_output_path, suffix_to_remove=r"_outlier.*"): + """Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions""" + if hasattr(input_model, "name") and input_model.name is not None: + if "_"+input_model.name.lower()+"_" not in input_model.meta.filename: + replacement = f"_{input_model.name.lower()}" + else: + replacement = "" + else: + replacement = "" + basepath = re.sub(suffix_to_remove, replacement, input_model.meta.filename) + + if make_output_path is None: + def make_output_path(basepath, suffix): + return basepath.replace(".fits", f"_{suffix}.fits") + + return make_output_path(basepath, suffix=suffix) + + +def _save_to_path(model, output_path, suffix=""): + model.meta.filename = output_path + model.save(output_path) + 
log.info(f"Saved {suffix} model in {output_path}") \ No newline at end of file diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 89cf232648..37f807b8bf 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -3,13 +3,15 @@ """ import logging -import os from jwst.datamodels import ModelLibrary from jwst.resample import resample from jwst.stpipe.utilities import record_step_status -from .utils import flag_model_crs, flag_resampled_model_crs, drizzle_and_median +from .utils import (flag_model_crs, + flag_resampled_model_crs, + median_without_resampling, + median_with_resampling) log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) @@ -53,38 +55,36 @@ def detect_outliers( log.warning("Outlier detection will be skipped") record_step_status(input_models, "outlier_detection", False) return input_models - - with input_models: - example_model = input_models.borrow(0) - output_path = make_output_path(basepath=example_model.meta.filename, - suffix='') - input_models.shelve(example_model, modify=False) - del example_model - output_path = os.path.dirname(output_path) # Start by creating resampled/mosaic images for # each group of exposures - resamp = resample.ResampleData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - allowed_memory=allowed_memory, - ) - - median_data, median_wcs = drizzle_and_median(input_models, - resamp, - maskpt, - resample_data=resample_data, - save_intermediate_results=save_intermediate_results, - make_output_path=make_output_path,) + if resample_data: + resamp = resample.ResampleData( + input_models, + output=None, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + allowed_memory=allowed_memory, + ) + median_data, median_wcs = median_with_resampling(input_models, + resamp, + maskpt, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) + else: + median_data, median_wcs = median_without_resampling(input_models, + maskpt, + weight_type, + good_bits, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) # Perform outlier detection using statistical comparisons between diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index 05f7fd204f..23d06ac88c 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -1,13 +1,14 @@ """ Submodule for performing outlier detection on spectra. 
""" -import os - from jwst.datamodels import ModelContainer, ModelLibrary from jwst.stpipe.utilities import record_step_status from ..resample import resample_spec -from .utils import flag_crs_in_models, flag_crs_in_models_with_resampling, drizzle_and_median +from .utils import (flag_crs_in_models, + flag_crs_in_models_with_resampling, + median_with_resampling, + median_without_resampling) import logging log = logging.getLogger(__name__) @@ -50,35 +51,42 @@ def detect_outliers( record_step_status(input_models, "outlier_detection", False) return input_models - # Start by creating resampled/mosaic images for - # each group of exposures - output_path = make_output_path( - basepath=input_models[0].meta.filename, suffix='') - output_path = os.path.dirname(output_path) - resamp = resample_spec.ResampleSpecData( - input_models, - output=output_path, - single=True, - blendheaders=False, - wht_type=weight_type, - pixfrac=pixfrac, - kernel=kernel, - fillval=fillval, - good_bits=good_bits, - in_memory=in_memory, - asn_id=asn_id, - ) - # convert to library for resample # for compatibility with image3 pipeline library = ModelLibrary(input_models, on_disk=False) - - median_data, median_wcs = drizzle_and_median(library, - resamp, - maskpt, - resample_data=resample_data, - save_intermediate_results=save_intermediate_results, - make_output_path=make_output_path) + + if resample_data is True: + # Start by creating resampled/mosaic images for + # each group of exposures + resamp = resample_spec.ResampleSpecData( + input_models, + output=None, + single=True, + blendheaders=False, + wht_type=weight_type, + pixfrac=pixfrac, + kernel=kernel, + fillval=fillval, + good_bits=good_bits, + in_memory=in_memory, + asn_id=asn_id, + ) + + median_data, median_wcs = median_with_resampling( + library, + resamp, + maskpt, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path,) + else: + median_data, median_wcs = median_without_resampling( + library, + maskpt, + weight_type, + good_bits, + save_intermediate_results=save_intermediate_results, + make_output_path=make_output_path, + ) # Perform outlier detection using statistical comparisons between # each original input image and its blotted version of the median image diff --git a/jwst/outlier_detection/tests/test_outlier_detection.py b/jwst/outlier_detection/tests/test_outlier_detection.py index 141b3bce1b..3155824fc8 100644 --- a/jwst/outlier_detection/tests/test_outlier_detection.py +++ b/jwst/outlier_detection/tests/test_outlier_detection.py @@ -19,7 +19,7 @@ CORON_IMAGE_MODES, ) from jwst.resample.tests.test_resample_step import miri_rate_model -from jwst.outlier_detection.utils import drizzle_and_median +from jwst.outlier_detection.utils import median_with_resampling, median_without_resampling from jwst.resample.resample import ResampleData OUTLIER_DO_NOT_USE = np.bitwise_or( @@ -615,16 +615,18 @@ def test_same_median_on_disk(three_sci_as_asn, tmp_cwd): # 32-bit floats are 4 bytes each, min buffer size is one row of 20 pixels # arbitrarily use 5 times that buffer_size = 4 * 20 * 5 - median_on_disk, _ = drizzle_and_median(lib_on_disk, - make_resamp(lib_on_disk), - 0.7, - buffer_size=buffer_size, - resample_data=False) - median_in_memory, _ = drizzle_and_median(lib_in_memory, - make_resamp(lib_in_memory), - 0.7, - buffer_size=buffer_size, - resample_data=False) + median_on_disk, _ = median_without_resampling( + lib_on_disk, + 0.7, + "ivm", + "~DO_NOT_USE", + buffer_size=buffer_size,) + median_in_memory, _ = median_without_resampling( + 
lib_in_memory,
+        0.7,
+        "ivm",
+        "~DO_NOT_USE",
+        buffer_size=buffer_size,)
 
     # Make sure the high-variance (low-weight) pixel is set to NaN
     assert np.isnan(median_in_memory[4,9])
@@ -637,30 +639,22 @@ def test_drizzle_and_median_with_resample(three_sci_as_asn, tmp_cwd):
     lib = ModelLibrary(three_sci_as_asn, on_disk=False)
 
     resamp = make_resamp(lib)
-    median, wcs = drizzle_and_median(lib,
-                                     resamp,
-                                     0.7,
-                                     resample_data=True)
+    median, wcs = median_with_resampling(
+        lib,
+        resamp,
+        0.7)
 
     assert isinstance(wcs, WCS)
     assert median.shape == (21,20)
-
-    with pytest.raises(ValueError):
-        # ensure failure if save_intermediate results but no output path
-        drizzle_and_median(lib,
-                           resamp,
-                           0.7,
-                           make_output_path=None,
-                           save_intermediate_results=True)
 
     resamp.single = False
     with pytest.raises(ValueError):
         # ensure failure if try to call when resamp.single is False
-        drizzle_and_median(lib,
-                           resamp,
-                           0.7,
-                           make_output_path=None,
-                           save_intermediate_results=True)
+        median_with_resampling(
+            lib,
+            resamp,
+            0.7,
+            save_intermediate_results=True)
 
 
 def make_resamp(input_models):
diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py
index 875b5060d7..ff62440890 100644
--- a/jwst/outlier_detection/tests/test_utils.py
+++ b/jwst/outlier_detection/tests/test_utils.py
@@ -48,10 +48,6 @@ def test_disk_appendable_array(tmp_cwd):
     assert np.all(arr_in_memory[1] == candidate1)
     assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True)
 
-    # test cleanup
-    arr.cleanup()
-    assert len(os.listdir(tempdir)) == 0
-
 
 @pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array_bad_inputs(tmp_cwd):
diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py
index 2342186aec..5eaddeba5b 100644
--- a/jwst/outlier_detection/utils.py
+++ b/jwst/outlier_detection/utils.py
@@ -2,7 +2,6 @@
 The ever-present utils sub-module. A home for all...
 """
 
-import os
 import copy
 import warnings
 import numpy as np
@@ -14,7 +13,7 @@
 from jwst.resample.resample_utils import build_driz_weight
 from stcal.outlier_detection.utils import compute_weight_threshold, gwcs_blot, flag_crs, flag_resampled_crs
 from stdatamodels.jwst import datamodels
-from ._fileio import save_median
+from . import _fileio
 
 import logging
 log = logging.getLogger(__name__)
@@ -42,7 +41,7 @@ def nanmedian3D(cube, overwrite_input=True):
     output_arr = np.empty(cube.shape[1:], dtype=np.float32)
     for i in range(output_arr.shape[0]):
         # this for loop looks silly, but see docstring above
-        output_arr[i,:] = np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input).astype(np.float32)
+        np.nanmedian(cube[:,i,:], axis=0, overwrite_input=overwrite_input, out=output_arr[i,:])
     return output_arr
 
 
@@ -57,13 +56,80 @@ def create_cube_median(cube_model, maskpt):
                        overwrite_input=False)
 
 
-def drizzle_and_median(input_models,
-                       resamp,
-                       maskpt,
-                       resample_data=False,
-                       save_intermediate_results=False,
-                       make_output_path=None,
-                       buffer_size=None):
+def median_without_resampling(input_models,
+                              maskpt,
+                              weight_type,
+                              good_bits,
+                              save_intermediate_results=False,
+                              make_output_path=None,
+                              buffer_size=None):
+    """
+    Shared code between imaging and spec modes for median computation without resampling
+
+    Parameters
+    ----------
+    input_models : ModelLibrary
+        The input datamodels.
+
+    maskpt : float
+        The weight threshold for masking out low weight pixels.
+
+    weight_type : str
+        The type of weighting to use when combining images.
Options are: + 'ivm' (inverse variance) or 'exptime' (exposure time). + + good_bits : int + The bit values that are considered good when determining the + data quality of the input. + + save_intermediate_results : bool + if True, save the drizzled models and median model to fits. + + make_output_path : function + The functools.partial instance to pass to save_median. Must be + specified if save_intermediate_results is True. Default None. + + buffer_size : int + The size of chunk in bytes that will be read into memory when computing the median. + This parameter has no effect if the input library has its on_disk attribute + set to False. + """ + in_memory = not input_models._on_disk + ngroups = len(input_models) + + with input_models: + for i in range(len(input_models)): + + drizzled_model = input_models.borrow(i) + drizzled_model.wht = build_driz_weight(drizzled_model, + weight_type=weight_type, + good_bits=good_bits) + median_wcs = copy.deepcopy(drizzled_model.meta.wcs) + input_models.shelve(drizzled_model, i, modify=True) + + if save_intermediate_results: + # write the drizzled model to file + _fileio.save_drizzled(drizzled_model, make_output_path) + + median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) + + # Perform median combination on set of drizzled mosaics + median_data = _evaluate_median_computer(median_computer, in_memory) + + if save_intermediate_results: + # Save median model to fits + _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) + del drizzled_model + + return median_data, median_wcs + + +def median_with_resampling(input_models, + resamp, + maskpt, + save_intermediate_results=False, + make_output_path=None, + buffer_size=None): """ Shared code between imaging and spec modes for resampling and median computation @@ -78,9 +144,6 @@ def drizzle_and_median(input_models, maskpt : float The weight threshold for masking out low weight pixels. - resample_data : bool - Whether or not to do resampling. - save_intermediate_results : bool if True, save the drizzled models and median model to fits. @@ -93,91 +156,74 @@ def drizzle_and_median(input_models, This parameter has no effect if the input library has its on_disk attribute set to False. """ - - # validate inputs - if save_intermediate_results and (make_output_path is None): - raise ValueError("make_output_path is required if save_intermediate_results is True") if not resamp.single: - raise ValueError("drizzle_and_median should only be used for resample_many_to_many") + raise ValueError("median_with_resampling should only be used for resample_many_to_many") + in_memory = not input_models._on_disk - - if resample_data: - indices_by_group = list(input_models.group_indices.values()) - else: - # treat each input model as if it is the only member of its group - indices_by_group = [[i] for i in range(len(input_models))] + indices_by_group = list(input_models.group_indices.values()) + ngroups = len(indices_by_group) with input_models: for i, indices in enumerate(indices_by_group): - if resample_data: - median_wcs = resamp.output_wcs - drizzled_model = resamp.resample_group(input_models, indices) - else: - # for non-dithered data, the resampled image is just the original image - drizzled_model = input_models.borrow(i) - drizzled_model.wht = build_driz_weight( - drizzled_model, - weight_type=resamp.weight_type, - good_bits=resamp.good_bits) - input_models.shelve(drizzled_model, i, modify=True) - # copy for when saving median and input is a filename? 
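The weight handling shared by both median paths (mask low-weight pixels to NaN so they are excluded from the median) can be sketched standalone; weight_threshold below is a simplified stand-in invented for illustration, not stcal's actual compute_weight_threshold implementation:

    import numpy as np

    def weight_threshold(wht, maskpt):
        # simplified stand-in: maskpt times the mean of the nonzero weights
        good = wht[wht > 0]
        return maskpt * good.mean() if good.size else 0.0

    data = np.random.rand(4, 4).astype(np.float32)
    wht = np.ones_like(data)
    wht[0, 0] = 1e-6  # a low-weight pixel
    data[wht < weight_threshold(wht, 0.7)] = np.nan
    assert np.isnan(data[0, 0])  # excluded from the median combination

Because the masking happens as each drizzled group arrives, only the data array (not the weight array) needs to be kept for the median.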
- median_wcs = copy.deepcopy(drizzled_model.meta.wcs) - - if i == 0: - ngroups = len(indices_by_group) - full_shape = (ngroups,) + drizzled_model.data.shape - if in_memory: - # allocate memory for data arrays that go into median - data_frames = np.empty(full_shape, dtype=np.float32) - else: - # set up temporary storage for data arrays that go into median - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=buffer_size) + median_wcs = resamp.output_wcs + drizzled_model = resamp.resample_group(input_models, indices) if save_intermediate_results: # write the drizzled model to file - output_name = drizzled_model.meta.filename - # need unique intermediate filenames for slits in MultiSlitModels - if hasattr(drizzled_model, "name") and drizzled_model.name is not None: - output_name = output_name.replace("_outlier_", f"_{drizzled_model.name.lower()}_outlier_") - if resamp.output_dir is not None: - output_name = os.path.join(resamp.output_dir, output_name) - drizzled_model.save(output_name) - log.info(f"Saved model in {output_name}") - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - - if in_memory: - # populate pre-allocated memory with the drizzled data - data_frames[i] = drizzled_model.data - else: - # distribute the drizzled data into the temporary storage - median_computer.add_image(drizzled_model.data) + _fileio.save_drizzled(drizzled_model, make_output_path) + + median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) # Perform median combination on set of drizzled mosaics - if in_memory: - median_data = nanmedian3D(data_frames) - del data_frames - else: - median_data = median_computer.compute_median() - median_computer.cleanup() + median_data = _evaluate_median_computer(median_computer, in_memory) if save_intermediate_results: # Save median model to fits - median_model = datamodels.ImageModel(median_data) - median_model.update(drizzled_model) - median_model.meta.wcs = median_wcs - save_median(median_model, make_output_path, resamp.asn_id) - del median_model + _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) del drizzled_model return median_data, median_wcs +def _append_to_median_computer(idx, drizzled_model, ngroups, maskpt, in_memory, buffer_size): + + if idx == 0: + full_shape = (ngroups,) + drizzled_model.data.shape + global median_computer + if in_memory: + # allocate memory for data arrays that go into median + median_computer = np.empty(full_shape, dtype=np.float32) + else: + # set up temporary storage for data arrays that go into median + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=buffer_size) + + # handle the weights right away, so only data array needs to be saved + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + + if in_memory: + # populate pre-allocated memory with the drizzled data + median_computer[idx] = drizzled_model.data + else: + # distribute the drizzled data into the temporary storage + median_computer.add_image(drizzled_model.data) + + return median_computer + + +def _evaluate_median_computer(median_computer, in_memory): + if in_memory: + median_data = nanmedian3D(median_computer) + del median_computer + else: + median_data = 
median_computer.compute_median() + median_computer.cleanup() + return median_data + + class DiskAppendableArray: """ Creates a temporary file to which to append data, in order to perform @@ -232,28 +278,19 @@ def append(self, data): raise ValueError(f"Data dtype {data.dtype} does not match array dtype {self._dtype}") with open(self._filename, "ab") as f: data.tofile(f, sep="") - #f.write(data.tobytes()) self._append_count += 1 def read(self): - """Read the 3-D array into memory, then delete the tempfile""" + """Read the 3-D array into memory""" shp = (self._append_count,) + self._slice_shape with open(self._filename, "rb") as f: output = np.fromfile(f, dtype=self._dtype).reshape(shp) return output - - - def cleanup(self): - Path.unlink(Path(self._filename)) - return class OnDiskMedian: - # TODO: can/should this class inherit directly from tempfile.TemporaryDirectory? - # probably not, because it "has a" tempdir, it is not itself a tempdir - def __init__(self, shape, dtype='float32', tempdir="", buffer_size=None): """ Set up temporary files to perform operations on a stack of 2-D input arrays @@ -366,7 +403,6 @@ def _validate_data(self, data): def cleanup(self): """Remove the temporary files and directory when finished""" - #[arr.cleanup() for arr in self._temp_arrays] self._temp_dir.cleanup() return @@ -423,17 +459,7 @@ def flag_resampled_model_crs( blot = gwcs_blot(median_data, median_wcs, input_model.data.shape, input_model.meta.wcs, pix_ratio) if save_blot: - if make_output_path is None: - raise ValueError("make_output_path must be provided if save_blot is True") - model_path = make_output_path(input_model.meta.filename, suffix='blot') - # for MultiSlitModels, need to insert the slit ID into the filename - if hasattr(input_model, "name") and input_model.name is not None: - model_path = model_path.replace("_blot", f"_{input_model.name.lower()}_blot") - blot_model = _make_blot_model(input_model, blot) - blot_model.meta.filename = model_path - blot_model.save(model_path) - log.info(f"Saved model in {model_path}") - del blot_model + _fileio.save_blot(input_model, blot, make_output_path) # dq flags will be updated in-place _flag_resampled_model_crs(input_model, blot, snr1, snr2, scale1, scale2, backg) @@ -504,10 +530,3 @@ def flag_model_crs(image, blot, snr): match_nans_and_flags(image) log.info(f"{np.count_nonzero(cr_mask)} pixels marked as outliers") - - -def _make_blot_model(input_model, blot): - blot_model = type(input_model)() - blot_model.data = blot - blot_model.update(input_model) - return blot_model \ No newline at end of file From b51c27e88a21323613b5f636fcdd4b7dd6846a99 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 17:37:03 -0400 Subject: [PATCH 23/34] simplified _save_intermediate_output --- changes/8782.outlier_detection.rst | 1 + jwst/outlier_detection/_fileio.py | 36 ++++++++++++------------------ 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst index e63c0cf4f3..1b1e683d66 100644 --- a/changes/8782.outlier_detection.rst +++ b/changes/8782.outlier_detection.rst @@ -1 +1,2 @@ Decrease the amount of file I/O required to compute the median when in_memory is set to False. +Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed. 
\ No newline at end of file diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 6544fcdc70..165b43943b 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -17,22 +17,19 @@ def save_median(median_data, median_wcs, example_model, make_output_path=None): median_model = ImageModel(median_data) median_model.update(example_model) median_model.meta.wcs = median_wcs - output_path = intermediate_output_path(example_model, "median", make_output_path) - _save_to_path(median_model, output_path, suffix="median") + _save_intermediate_output(median_model, "median", make_output_path) def save_drizzled(drizzled_model, make_output_path): expected_tail = "outlier_?2d.fits" suffix = drizzled_model.meta.filename[-len(expected_tail):-5] - output_path = intermediate_output_path(drizzled_model, suffix, make_output_path) - _save_to_path(drizzled_model, output_path, suffix=suffix) + _save_intermediate_output(drizzled_model, suffix, make_output_path) def save_blot(input_model, blot, make_output_path): blot_model = _make_blot_model(input_model, blot) - output_path = intermediate_output_path(input_model, "blot", make_output_path, suffix_to_remove=r"_cal.*") - _save_to_path(blot_model, output_path, suffix="blot") + _save_intermediate_output(blot_model, "blot", make_output_path) def _make_blot_model(input_model, blot): @@ -42,25 +39,20 @@ def _make_blot_model(input_model, blot): return blot_model -def intermediate_output_path(input_model, suffix, make_output_path, suffix_to_remove=r"_outlier.*"): +def _save_intermediate_output(model, suffix, make_output_path): """Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions""" - if hasattr(input_model, "name") and input_model.name is not None: - if "_"+input_model.name.lower()+"_" not in input_model.meta.filename: - replacement = f"_{input_model.name.lower()}" + input_path = model.meta.filename.replace("_outlier_", "_") + + # Add a slit name to the output path for MultiSlitModel input data if it isn't already there + if hasattr(model, "name") and model.name is not None: + if "_"+model.name.lower() not in model.meta.filename: + slit_name = f"{model.name.lower()}" else: - replacement = "" + slit_name = None else: - replacement = "" - basepath = re.sub(suffix_to_remove, replacement, input_model.meta.filename) - - if make_output_path is None: - def make_output_path(basepath, suffix): - return basepath.replace(".fits", f"_{suffix}.fits") - - return make_output_path(basepath, suffix=suffix) - + slit_name = None -def _save_to_path(model, output_path, suffix=""): + output_path = make_output_path(input_path, suffix=suffix, components=slit_name) model.meta.filename = output_path model.save(output_path) - log.info(f"Saved {suffix} model in {output_path}") \ No newline at end of file + log.info(f"Saved {suffix} model in {output_path}") From d1efbe9905f8614821fdd52362cae467d917c52f Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Tue, 24 Sep 2024 19:50:28 -0400 Subject: [PATCH 24/34] make file naming easier to understand and hopefully fix bug --- jwst/outlier_detection/_fileio.py | 35 ++++++++++--------- jwst/outlier_detection/coron.py | 3 +- .../outlier_detection_step.py | 2 -- jwst/outlier_detection/tso.py | 3 +- jwst/outlier_detection/utils.py | 13 +++++-- jwst/resample/resample.py | 11 ++---- 6 files changed, 34 insertions(+), 33 deletions(-) diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 165b43943b..e146cc1932 100644 --- 
a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -1,11 +1,9 @@ -import re import logging -from jwst.datamodels import ImageModel log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) -def save_median(median_data, median_wcs, example_model, make_output_path=None): +def save_median(median_model, make_output_path): ''' Save median if requested by user @@ -14,9 +12,6 @@ def save_median(median_data, median_wcs, example_model, make_output_path=None): median_model : ~jwst.datamodels.ImageModel The median ImageModel or CubeModel to save ''' - median_model = ImageModel(median_data) - median_model.update(example_model) - median_model.meta.wcs = median_wcs _save_intermediate_output(median_model, "median", make_output_path) @@ -27,7 +22,6 @@ def save_drizzled(drizzled_model, make_output_path): def save_blot(input_model, blot, make_output_path): - blot_model = _make_blot_model(input_model, blot) _save_intermediate_output(blot_model, "blot", make_output_path) @@ -40,19 +34,26 @@ def _make_blot_model(input_model, blot): def _save_intermediate_output(model, suffix, make_output_path): - """Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions""" + """ + Ensure all intermediate outputs from OutlierDetectionStep have consistent file naming conventions + + Notes + ----- + self.make_output_path() is updated globally for the step in the main pipeline + to include the asn_id in the output path, so no need to handle it here. + """ + + # make_output_path cannot handle suffix with an underscore inside it, + # e.g. _outlier_s2d.fits, so do a manual string replacement input_path = model.meta.filename.replace("_outlier_", "_") - # Add a slit name to the output path for MultiSlitModel input data if it isn't already there + # Add a slit name to the output path for MultiSlitModel data if not present + components = None if hasattr(model, "name") and model.name is not None: - if "_"+model.name.lower() not in model.meta.filename: - slit_name = f"{model.name.lower()}" - else: - slit_name = None - else: - slit_name = None - - output_path = make_output_path(input_path, suffix=suffix, components=slit_name) + if "_"+model.name.lower() not in input_path: + components = f"{model.name.lower()}" + + output_path = make_output_path(input_path, suffix=suffix, components=components) model.meta.filename = output_path model.save(output_path) log.info(f"Saved {suffix} model in {output_path}") diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index 500ad49602..86d5cef71d 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -27,7 +27,6 @@ def detect_outliers( good_bits, maskpt, snr, - asn_id, make_output_path, ): """ @@ -56,7 +55,7 @@ def detect_outliers( median_model.update(input_model) median_model.meta.wcs = input_model.meta.wcs - save_median(median_model, make_output_path, asn_id) + save_median(median_model, make_output_path) del median_model # Perform outlier detection using statistical comparisons between diff --git a/jwst/outlier_detection/outlier_detection_step.py b/jwst/outlier_detection/outlier_detection_step.py index f18fb054e3..d11334fd0e 100644 --- a/jwst/outlier_detection/outlier_detection_step.py +++ b/jwst/outlier_detection/outlier_detection_step.py @@ -94,7 +94,6 @@ def process(self, input_data): self.maskpt, self.rolling_window_width, snr1, - asn_id, self.make_output_path, ) elif mode == 'coron': @@ -104,7 +103,6 @@ def process(self, input_data): self.good_bits, self.maskpt, snr1, - 
asn_id, self.make_output_path, ) elif mode == 'imaging': diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index 58aedd19b0..d0161dffe7 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -22,7 +22,6 @@ def detect_outliers( maskpt, rolling_window_width, snr, - asn_id, make_output_path, ): """ @@ -53,7 +52,7 @@ def detect_outliers( median_model = dm.CubeModel(data=medians) with dm.open(weighted_cube) as dm0: median_model.update(dm0) - save_median(median_model, make_output_path, asn_id) + save_median(median_model, make_output_path) del median_model # no need for blotting, resample is turned off for TSO diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 5eaddeba5b..7ffc497ee9 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -3,6 +3,7 @@ """ import copy +from functools import partial import warnings import numpy as np import tempfile @@ -118,7 +119,10 @@ def median_without_resampling(input_models, if save_intermediate_results: # Save median model to fits - _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs + _fileio.save_median(median_model, make_output_path) del drizzled_model return median_data, median_wcs @@ -180,7 +184,12 @@ def median_with_resampling(input_models, if save_intermediate_results: # Save median model to fits - _fileio.save_median(median_data, median_wcs, drizzled_model, make_output_path) + median_model = datamodels.ImageModel(median_data) + median_model.update(drizzled_model) + median_model.meta.wcs = median_wcs + # drizzled model already contains asn_id + make_output_path = partial(make_output_path, asn_id=None) + _fileio.save_median(median_model, make_output_path) del drizzled_model return median_data, median_wcs diff --git a/jwst/resample/resample.py b/jwst/resample/resample.py index 795ecef9a1..7d70ae840d 100644 --- a/jwst/resample/resample.py +++ b/jwst/resample/resample.py @@ -274,14 +274,9 @@ def resample_group(self, input_models, indices): output_type = example_image.meta.filename[indx:] output_root = '_'.join(example_image.meta.filename.replace( output_type, '').split('_')[:-1]) - if self.asn_id is not None: - output_model.meta.filename = ( - f'{output_root}_{self.asn_id}_' - f'{self.intermediate_suffix}{output_type}') - else: - output_model.meta.filename = ( - f'{output_root}_' - f'{self.intermediate_suffix}{output_type}') + output_model.meta.filename = ( + f'{output_root}_' + f'{self.intermediate_suffix}{output_type}') input_models.shelve(example_image, indices[0], modify=False) del example_image From b329a00b20fcdfb0933c5c996a7adde10b23a038 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 09:14:27 -0400 Subject: [PATCH 25/34] remove need to pass asn_id around the step --- jwst/outlier_detection/coron.py | 2 ++ jwst/outlier_detection/imaging.py | 2 -- jwst/outlier_detection/outlier_detection_step.py | 8 +++----- jwst/outlier_detection/spec.py | 2 -- jwst/outlier_detection/tso.py | 2 ++ 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index 86d5cef71d..e1676de58f 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -3,6 +3,7 @@ """ +from functools import partial import logging import numpy as np @@ -55,6 +56,7 @@ def detect_outliers( 
median_model.update(input_model) median_model.meta.wcs = input_model.meta.wcs + make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 37f807b8bf..1a8511b29c 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -37,7 +37,6 @@ def detect_outliers( fillval, allowed_memory, in_memory, - asn_id, make_output_path, ): """ @@ -70,7 +69,6 @@ def detect_outliers( fillval=fillval, good_bits=good_bits, in_memory=in_memory, - asn_id=asn_id, allowed_memory=allowed_memory, ) median_data, median_wcs = median_with_resampling(input_models, diff --git a/jwst/outlier_detection/outlier_detection_step.py b/jwst/outlier_detection/outlier_detection_step.py index d11334fd0e..1ffdbf6af4 100644 --- a/jwst/outlier_detection/outlier_detection_step.py +++ b/jwst/outlier_detection/outlier_detection_step.py @@ -80,8 +80,7 @@ def process(self, input_data): self.log.info(f"Outlier Detection mode: {mode}") # determine the asn_id (if not set by the pipeline) - asn_id = self._get_asn_id(input_data) - self.log.info(f"Outlier Detection asn_id: {asn_id}") + self._get_asn_id(input_data) snr1, snr2 = [float(v) for v in self.snr.split()] scale1, scale2 = [float(v) for v in self.scale.split()] @@ -123,7 +122,6 @@ def process(self, input_data): self.fillval, self.allowed_memory, self.in_memory, - asn_id, self.make_output_path, ) elif mode == 'spec': @@ -143,7 +141,6 @@ def process(self, input_data): self.kernel, self.fillval, self.in_memory, - asn_id, self.make_output_path, ) elif mode == 'ifu': @@ -225,7 +222,8 @@ def _get_asn_id(self, input_models): _make_output_path, asn_id=asn_id ) - return asn_id + self.log.info(f"Outlier Detection asn_id: {asn_id}") + return def _set_status(self, input_models, status): # this might be called with the input which might be a filename or path diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index 23d06ac88c..ad8e3db1c1 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -34,7 +34,6 @@ def detect_outliers( kernel, fillval, in_memory, - asn_id, make_output_path, ): """ @@ -69,7 +68,6 @@ def detect_outliers( fillval=fillval, good_bits=good_bits, in_memory=in_memory, - asn_id=asn_id, ) median_data, median_wcs = median_with_resampling( diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index d0161dffe7..95de86f82f 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -1,3 +1,4 @@ +from functools import partial import numpy as np from jwst.resample.resample_utils import build_mask @@ -52,6 +53,7 @@ def detect_outliers( median_model = dm.CubeModel(data=medians) with dm.open(weighted_cube) as dm0: median_model.update(dm0) + make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model From 6b0ade72f3680dc39e305312f07ef2b123460882 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 09:57:13 -0400 Subject: [PATCH 26/34] fix indeterminate ordering of asn_id and slit_id --- jwst/outlier_detection/_fileio.py | 5 +-- jwst/outlier_detection/tests/test_fileio.py | 41 +++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 jwst/outlier_detection/tests/test_fileio.py diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index e146cc1932..958e8a8473 100644 --- 
a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -48,12 +48,11 @@ def _save_intermediate_output(model, suffix, make_output_path): input_path = model.meta.filename.replace("_outlier_", "_") # Add a slit name to the output path for MultiSlitModel data if not present - components = None if hasattr(model, "name") and model.name is not None: if "_"+model.name.lower() not in input_path: - components = f"{model.name.lower()}" + suffix = f"{model.name.lower()}_{suffix}" - output_path = make_output_path(input_path, suffix=suffix, components=components) + output_path = make_output_path(input_path, suffix=suffix) model.meta.filename = output_path model.save(output_path) log.info(f"Saved {suffix} model in {output_path}") diff --git a/jwst/outlier_detection/tests/test_fileio.py b/jwst/outlier_detection/tests/test_fileio.py new file mode 100644 index 0000000000..8a114bb257 --- /dev/null +++ b/jwst/outlier_detection/tests/test_fileio.py @@ -0,0 +1,41 @@ +import pytest +from jwst.outlier_detection._fileio import _save_intermediate_output +from jwst.datamodels import ImageModel +from jwst.step import OutlierDetectionStep +import os +import numpy as np +from functools import partial + + +SLIT_ID = "slit007" +ASN_ID = "a008" +INPUT_FILENAME = "foo_outlier_s2d.fits" +SUFFIX = "test" + +@pytest.fixture(scope="module") +def model(): + model = ImageModel(data=np.zeros((10,10))) + model.meta.filename = INPUT_FILENAME + return model + + +@pytest.fixture(scope="module") +def step(): + return OutlierDetectionStep() + + +@pytest.mark.parametrize("asn_id", [None, ASN_ID]) +@pytest.mark.parametrize("slit_id", [None, SLIT_ID]) +def test_save(tmp_cwd, model, step, asn_id, slit_id): + + this_model = model.copy() + if slit_id is not None: + this_model.name = slit_id + make_output_path = step.make_output_path + make_output_path = partial(make_output_path, asn_id=asn_id) + _save_intermediate_output(this_model, SUFFIX, make_output_path) + + stem = model.meta.filename.split("_")[0] + inputs = [val for val in [stem, asn_id, slit_id, SUFFIX] if val is not None] + expected_filename = "_".join(inputs)+".fits" + assert os.path.isfile(expected_filename) \ No newline at end of file From 4db72e374feea7bbdaf5f1769a4fcd0fe831d40c Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 10:08:57 -0400 Subject: [PATCH 27/34] missed one suggested change, updated changelog --- changes/8782.outlier_detection.rst | 3 ++- jwst/outlier_detection/_fileio.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst index 1b1e683d66..55df086d02 100644 --- a/changes/8782.outlier_detection.rst +++ b/changes/8782.outlier_detection.rst @@ -1,2 +1,3 @@ Decrease the amount of file I/O required to compute the median when in_memory is set to False. -Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed. \ No newline at end of file +Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed. +Add association IDs to intermediate filenames. 
\ No newline at end of file diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py index 958e8a8473..732f9b34fc 100644 --- a/jwst/outlier_detection/_fileio.py +++ b/jwst/outlier_detection/_fileio.py @@ -53,6 +53,5 @@ def _save_intermediate_output(model, suffix, make_output_path): suffix = f"{model.name.lower()}_{suffix}" output_path = make_output_path(input_path, suffix=suffix) - model.meta.filename = output_path model.save(output_path) log.info(f"Saved {suffix} model in {output_path}") From 8e4ae1684f5a65aeaea2e403370795e459f729f3 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 10:31:46 -0400 Subject: [PATCH 28/34] fix median filenames for tso and coron modes --- jwst/outlier_detection/coron.py | 1 - jwst/outlier_detection/tso.py | 1 - 2 files changed, 2 deletions(-) diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index e1676de58f..8d24d50df4 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -56,7 +56,6 @@ def detect_outliers( median_model.update(input_model) median_model.meta.wcs = input_model.meta.wcs - make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index 95de86f82f..9eedb8ea4b 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -53,7 +53,6 @@ def detect_outliers( median_model = dm.CubeModel(data=medians) with dm.open(weighted_cube) as dm0: median_model.update(dm0) - make_output_path = partial(make_output_path, asn_id=None) save_median(median_model, make_output_path) del median_model From 71cca05c64010ecf5d7ccb6aa60d596da5088ead Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 10:32:52 -0400 Subject: [PATCH 29/34] remove unused functools imports --- jwst/outlier_detection/coron.py | 2 -- jwst/outlier_detection/tso.py | 1 - 2 files changed, 3 deletions(-) diff --git a/jwst/outlier_detection/coron.py b/jwst/outlier_detection/coron.py index 8d24d50df4..9a3bf9e984 100644 --- a/jwst/outlier_detection/coron.py +++ b/jwst/outlier_detection/coron.py @@ -2,8 +2,6 @@ Submodule for performing outlier detection on coronagraphy data. 
""" - -from functools import partial import logging import numpy as np diff --git a/jwst/outlier_detection/tso.py b/jwst/outlier_detection/tso.py index 9eedb8ea4b..d0161dffe7 100644 --- a/jwst/outlier_detection/tso.py +++ b/jwst/outlier_detection/tso.py @@ -1,4 +1,3 @@ -from functools import partial import numpy as np from jwst.resample.resample_utils import build_mask From bb78cf7421922b5f3fac4a7dfef64319dd0fd0d4 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Wed, 25 Sep 2024 12:42:14 -0400 Subject: [PATCH 30/34] small changes from self review --- jwst/outlier_detection/imaging.py | 3 --- jwst/outlier_detection/outlier_detection_step.py | 3 +++ jwst/outlier_detection/spec.py | 1 - jwst/outlier_detection/tests/test_fileio.py | 7 +++---- jwst/outlier_detection/tests/test_utils.py | 6 ++---- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/jwst/outlier_detection/imaging.py b/jwst/outlier_detection/imaging.py index 1a8511b29c..0dacd5ec43 100644 --- a/jwst/outlier_detection/imaging.py +++ b/jwst/outlier_detection/imaging.py @@ -55,12 +55,9 @@ def detect_outliers( record_step_status(input_models, "outlier_detection", False) return input_models - # Start by creating resampled/mosaic images for - # each group of exposures if resample_data: resamp = resample.ResampleData( input_models, - output=None, single=True, blendheaders=False, wht_type=weight_type, diff --git a/jwst/outlier_detection/outlier_detection_step.py b/jwst/outlier_detection/outlier_detection_step.py index 1ffdbf6af4..5ecc90a231 100644 --- a/jwst/outlier_detection/outlier_detection_step.py +++ b/jwst/outlier_detection/outlier_detection_step.py @@ -198,6 +198,9 @@ def _guess_mode(self, input_models): return None def _get_asn_id(self, input_models): + """Find association ID for any allowed input model type, + and update make_output_path such that the association ID + is included in intermediate and output file names.""" # handle if input_models isn't open if isinstance(input_models, (str, dict)): input_models = datamodels.open(input_models, asn_n_members=1) diff --git a/jwst/outlier_detection/spec.py b/jwst/outlier_detection/spec.py index ad8e3db1c1..ad94551976 100644 --- a/jwst/outlier_detection/spec.py +++ b/jwst/outlier_detection/spec.py @@ -59,7 +59,6 @@ def detect_outliers( # each group of exposures resamp = resample_spec.ResampleSpecData( input_models, - output=None, single=True, blendheaders=False, wht_type=weight_type, diff --git a/jwst/outlier_detection/tests/test_fileio.py b/jwst/outlier_detection/tests/test_fileio.py index 8a114bb257..6e01557c8e 100644 --- a/jwst/outlier_detection/tests/test_fileio.py +++ b/jwst/outlier_detection/tests/test_fileio.py @@ -20,18 +20,17 @@ def model(): @pytest.fixture(scope="module") -def step(): - return OutlierDetectionStep() +def make_output_path(): + return OutlierDetectionStep().make_output_path @pytest.mark.parametrize("asn_id", [None, ASN_ID]) @pytest.mark.parametrize("slit_id", [None, SLIT_ID]) -def test_save(tmp_cwd, model, step, asn_id, slit_id): +def test_save(tmp_cwd, model, make_output_path, asn_id, slit_id): this_model = model.copy() if slit_id is not None: this_model.name = slit_id - make_output_path = step.make_output_path make_output_path = partial(make_output_path, asn_id=asn_id) _save_intermediate_output(this_model, SUFFIX, make_output_path) diff --git a/jwst/outlier_detection/tests/test_utils.py b/jwst/outlier_detection/tests/test_utils.py index ff62440890..d1fc0bd576 100644 --- a/jwst/outlier_detection/tests/test_utils.py +++ 
b/jwst/outlier_detection/tests/test_utils.py
@@ -5,9 +5,6 @@
 import os
 
 
-# filter warnings such that "ResourceWarning: Implictly cleaning up" is raised as error
-# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException
-@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array(tmp_cwd):
 
     slice_shape = (8,7)
@@ -49,7 +46,6 @@ def test_disk_appendable_array(tmp_cwd):
     assert np.allclose(arr_in_memory[2], candidate2, equal_nan=True)
 
 
-@pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_disk_appendable_array_bad_inputs(tmp_cwd):
 
     slice_shape = (8,7)
@@ -77,6 +73,8 @@ def test_disk_appendable_array_bad_inputs(tmp_cwd):
         DiskAppendableArray(slice_shape, "float3", tempdir)
 
 
+# filter warnings such that "ResourceWarning: Implicitly cleaning up" is raised as error
+# unfortunately need to do this indirectly by finding "finalize" string in PytestUnraisableException
 @pytest.mark.filterwarnings("error:.*finalize.*:pytest.PytestUnraisableExceptionWarning")
 def test_on_disk_median(tmp_cwd):
 
From adb1ae1d73af05a04d1979f343a100d7cd2af67c Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Thu, 26 Sep 2024 10:19:17 -0400
Subject: [PATCH 31/34] fixed comment and updated changelog entry in response
 to @braingram final review

---
 changes/8782.outlier_detection.0.rst | 1 +
 changes/8782.outlier_detection.1.rst | 1 +
 changes/8782.outlier_detection.rst   | 3 ---
 jwst/outlier_detection/_fileio.py    | 4 ++--
 4 files changed, 4 insertions(+), 5 deletions(-)
 create mode 100644 changes/8782.outlier_detection.0.rst
 create mode 100644 changes/8782.outlier_detection.1.rst
 delete mode 100644 changes/8782.outlier_detection.rst

diff --git a/changes/8782.outlier_detection.0.rst b/changes/8782.outlier_detection.0.rst
new file mode 100644
index 0000000000..469e927e44
--- /dev/null
+++ b/changes/8782.outlier_detection.0.rst
@@ -0,0 +1 @@
+Decrease the amount of file I/O required to compute the median when in_memory is set to False.
\ No newline at end of file
diff --git a/changes/8782.outlier_detection.1.rst b/changes/8782.outlier_detection.1.rst
new file mode 100644
index 0000000000..a1f9bd8e55
--- /dev/null
+++ b/changes/8782.outlier_detection.1.rst
@@ -0,0 +1 @@
+Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed.
\ No newline at end of file
diff --git a/changes/8782.outlier_detection.rst b/changes/8782.outlier_detection.rst
deleted file mode 100644
index 55df086d02..0000000000
--- a/changes/8782.outlier_detection.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-Decrease the amount of file I/O required to compute the median when in_memory is set to False.
-Fix a bug that caused intermediate files to conflict for different slits when a MultiSlitModel was processed.
-Add association IDs to intermediate filenames.
\ No newline at end of file
diff --git a/jwst/outlier_detection/_fileio.py b/jwst/outlier_detection/_fileio.py
index 732f9b34fc..25dc3a26bd 100644
--- a/jwst/outlier_detection/_fileio.py
+++ b/jwst/outlier_detection/_fileio.py
@@ -43,8 +43,8 @@ def _save_intermediate_output(model, suffix, make_output_path):
     to include the asn_id in the output path, so no need to handle it here.
     """
 
-    # make_output_path cannot handle suffix with an underscore inside it,
-    # e.g. 
_outlier_s2d.fits, so do a manual string replacement + # outlier_?2d is not a known suffix, and make_output_path cannot handle an + # underscore in an unknown suffix, so do a manual string replacement input_path = model.meta.filename.replace("_outlier_", "_") # Add a slit name to the output path for MultiSlitModel data if not present From 7c6492051f5601f1ae43391b0cc88bf7455167bc Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 26 Sep 2024 13:18:56 -0400 Subject: [PATCH 32/34] remove unnecessary use of global --- jwst/outlier_detection/utils.py | 38 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index 7ffc497ee9..a87a863bd3 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -112,7 +112,9 @@ def median_without_resampling(input_models, # write the drizzled model to file _fileio.save_drizzled(drizzled_model, make_output_path) - median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) + if i == 0: + median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) + _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -177,7 +179,9 @@ def median_with_resampling(input_models, # write the drizzled model to file _fileio.save_drizzled(drizzled_model, make_output_path) - median_computer = _append_to_median_computer(i, drizzled_model, ngroups, maskpt, in_memory, buffer_size) + if i == 0: + median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) + _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -195,20 +199,22 @@ def median_with_resampling(input_models, return median_data, median_wcs -def _append_to_median_computer(idx, drizzled_model, ngroups, maskpt, in_memory, buffer_size): - - if idx == 0: - full_shape = (ngroups,) + drizzled_model.data.shape - global median_computer - if in_memory: - # allocate memory for data arrays that go into median - median_computer = np.empty(full_shape, dtype=np.float32) - else: - # set up temporary storage for data arrays that go into median - median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, - buffer_size=buffer_size) +def _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size): + + full_shape = (ngroups,) + drizzled_model.data.shape + if in_memory: + # allocate memory for data arrays that go into median + median_computer = np.empty(full_shape, dtype=np.float32) + else: + # set up temporary storage for data arrays that go into median + median_computer = OnDiskMedian(full_shape, + dtype=drizzled_model.data.dtype, + buffer_size=buffer_size) + return median_computer + +def _append_to_median_computer(median_computer, idx, drizzled_model, maskpt, in_memory): + # handle the weights right away, so only data array needs to be saved weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan @@ -220,8 +226,6 @@ def _append_to_median_computer(idx, drizzled_model, ngroups, maskpt, in_memory, # distribute the drizzled data into the temporary storage median_computer.add_image(drizzled_model.data) - 
return median_computer - def _evaluate_median_computer(median_computer, in_memory): if in_memory: From 9c0f0afb810cd719b96fbc96955ebb11af720e40 Mon Sep 17 00:00:00 2001 From: Ned Molter Date: Thu, 26 Sep 2024 15:31:32 -0400 Subject: [PATCH 33/34] make median computer helper functions agnostic to input data --- jwst/outlier_detection/utils.py | 35 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/jwst/outlier_detection/utils.py b/jwst/outlier_detection/utils.py index a87a863bd3..03446aac5e 100644 --- a/jwst/outlier_detection/utils.py +++ b/jwst/outlier_detection/utils.py @@ -113,8 +113,13 @@ def median_without_resampling(input_models, _fileio.save_drizzled(drizzled_model, make_output_path) if i == 0: - median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) - _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) + input_shape = (ngroups,)+drizzled_model.data.shape + dtype = drizzled_model.data.dtype + median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype) + + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory) # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -180,8 +185,14 @@ def median_with_resampling(input_models, _fileio.save_drizzled(drizzled_model, make_output_path) if i == 0: - median_computer = _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size) - _append_to_median_computer(median_computer, i, drizzled_model, maskpt, in_memory) + input_shape = (ngroups,)+drizzled_model.data.shape + dtype = drizzled_model.data.dtype + median_computer = _make_median_computer(input_shape, in_memory, buffer_size, dtype) + + weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) + drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan + _append_to_median_computer(median_computer, i, drizzled_model.data, in_memory) + # Perform median combination on set of drizzled mosaics median_data = _evaluate_median_computer(median_computer, in_memory) @@ -199,32 +210,26 @@ def median_with_resampling(input_models, return median_data, median_wcs -def _make_median_computer(drizzled_model, ngroups, in_memory, buffer_size): +def _make_median_computer(full_shape, in_memory, buffer_size, dtype): - full_shape = (ngroups,) + drizzled_model.data.shape if in_memory: # allocate memory for data arrays that go into median median_computer = np.empty(full_shape, dtype=np.float32) else: # set up temporary storage for data arrays that go into median median_computer = OnDiskMedian(full_shape, - dtype=drizzled_model.data.dtype, + dtype=dtype, buffer_size=buffer_size) return median_computer -def _append_to_median_computer(median_computer, idx, drizzled_model, maskpt, in_memory): - - # handle the weights right away, so only data array needs to be saved - weight_threshold = compute_weight_threshold(drizzled_model.wht, maskpt) - drizzled_model.data[drizzled_model.wht < weight_threshold] = np.nan - +def _append_to_median_computer(median_computer, idx, data, in_memory): if in_memory: # populate pre-allocated memory with the drizzled data - median_computer[idx] = drizzled_model.data + median_computer[idx] = data else: # distribute the drizzled data into the temporary storage - 
median_computer.add_image(data)

From 2d5281c911ef61de02f0a3706672c5b90f35f6f2 Mon Sep 17 00:00:00 2001
From: Ned Molter
Date: Fri, 27 Sep 2024 09:30:20 -0400
Subject: [PATCH 34/34] updated memory model section of outlier detection imaging docs

---
 .../outlier_detection/outlier_detection_imaging.rst | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/jwst/outlier_detection/outlier_detection_imaging.rst b/docs/jwst/outlier_detection/outlier_detection_imaging.rst
index c2cd6318b9..a6a6ec78e7 100644
--- a/docs/jwst/outlier_detection/outlier_detection_imaging.rst
+++ b/docs/jwst/outlier_detection/outlier_detection_imaging.rst
@@ -58,7 +58,8 @@ Specifically, this routine performs the following operations:
   should be used when resampling to create the output mosaic.
   Any pixel with a DQ value not included in this value (or list of values)
   will be ignored when resampling.
-  * Resampled images will be written out to disk with the suffix ``_<asn_id>_outlier_i2d.fits``
+  * When the ``save_intermediate_results`` parameter is set to True,
+    resampled images will be written out to disk with the suffix ``_<asn_id>_outlier_i2d.fits``
     if the input model container has an <asn_id>, otherwise the suffix will be
     ``_outlier_i2d.fits`` by default.
 * **If resampling is turned off** through the use of the ``resample_data`` parameter,
@@ -162,9 +163,12 @@ during processing includes:
   :py:class:`~jwst.resample.ResampleStep` as well, to set whether or not
   to keep the resampled images in memory or not.
 
-#. Computing the median image works section-by-section by only keeping 1Mb of each input
-   in memory at a time.  As a result, only the final output product array for the final
-   median image along with a stack of 1Mb image sections are kept in memory.
+#. Computing the median image works by writing the resampled data frames to appendable files
+   on disk that are split into sections spatially but contain the entire ngroups (i.e., time)
+   axis. The section size is set to use roughly the same amount of memory as a single resampled
+   model, and since the resampled models are discarded from memory after this write operation,
+   this choice avoids increasing the memory usage beyond that of a single resampled model.
+   Those sections are then read in one at a time to compute the median image.
 
 These changes result in a minimum amount of memory usage during processing at
 the obvious expense of reading and writing the products from disk.
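To make the section-based median described in the docs above concrete, here is a minimal, self-contained sketch of the idea. This is an illustration only, not the jwst DiskAppendableArray/OnDiskMedian API; all names below are invented for the example. Each 2-D frame is split into spatial sections, each section is appended to its own on-disk binary file, and the median is then computed one section stack at a time, so at most one full frame (or one section stack) is held in memory at once:

    import tempfile
    from pathlib import Path

    import numpy as np


    def append_sections(frame, section_files, rows_per_section):
        """Append each horizontal section of a 2-D frame to its own binary file."""
        for path, start in zip(section_files, range(0, frame.shape[0], rows_per_section)):
            with open(path, "ab") as f:
                frame[start:start + rows_per_section].tofile(f)


    def median_from_sections(section_files, nframes, ncols, dtype):
        """Read back one (nframes, rows, ncols) section stack at a time and median it."""
        medians = []
        for path in section_files:
            stack = np.fromfile(path, dtype=dtype).reshape(nframes, -1, ncols)
            medians.append(np.nanmedian(stack, axis=0))
        return np.concatenate(medians, axis=0)


    rng = np.random.default_rng(0)
    frames = rng.normal(size=(5, 8, 6)).astype(np.float32)
    frames[0, 2, 3] = np.nan  # low-weight pixels are NaN-masked before the median

    with tempfile.TemporaryDirectory() as tmpdir:
        rows_per_section = 4  # in the real step this is sized from a memory budget
        files = [Path(tmpdir) / f"section{i}.bin"
                 for i in range(frames.shape[1] // rows_per_section)]
        for frame in frames:  # only one frame is in memory at a time
            append_sections(frame, files, rows_per_section)
        median = median_from_sections(files, len(frames), frames.shape[2], np.float32)

    assert np.allclose(median, np.nanmedian(frames, axis=0), equal_nan=True)

The real implementation adds buffered writes, shape/dtype validation, and cleanup of the temporary files, but the memory behavior is the same: disk usage grows with ngroups while memory stays near the size of a single resampled frame.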