From 2b3a755b9d323237bf5a9481ce2a1358260dfb47 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Fri, 11 Aug 2023 09:34:10 -0700 Subject: [PATCH 01/81] feat: add inpaint-mode for lst binner --- hera_cal/datacontainer.py | 37 +++ hera_cal/io.py | 24 +- hera_cal/lstbin.py | 14 +- hera_cal/lstbin_simple.py | 386 ++++++++++++++++++++++----- hera_cal/tests/mock_uvdata.py | 13 + hera_cal/tests/test_lstbin_simple.py | 206 +++++++++++--- scripts/lstbin_simple.py | 12 +- 7 files changed, 565 insertions(+), 127 deletions(-) diff --git a/hera_cal/datacontainer.py b/hera_cal/datacontainer.py index cb00240ff..eb70a8147 100644 --- a/hera_cal/datacontainer.py +++ b/hera_cal/datacontainer.py @@ -515,6 +515,43 @@ def select_or_expand_times(self, new_times, in_place=True, skip_bda_check=False) if not in_place: return dc + def select_or_expand_freqs( + self, + freqs: np.ndarray | None = None, + channels: np.ndarray | slice | None = None, + in_place: bool=True + ): + """Update the object with a subset of frequencies (which may be repeated). + + Parameters + ---------- + freqs : np.ndarray, optional + Frequencies to select. If given, all frequencies must be in the datacontainer. + channels : np.ndarray, slice, optional + Channels to select. If given, all channels must be in the datacontainer. + Only one of freqs or channels can be given. + in_place : bool, optional + If True, modify the object in place. Otherwise, return a modified copy. + """ + obj = self if in_place else copy.deepcopy(self) + if freqs is None and channels is None: + return obj + elif freqs is not None and channels is not None: + raise ValueError('Cannot specify both freqs and channels.') + + if freqs is not None: + if not np.all([fq in dc.freqs for fq in freqs]): + raise ValueError('All freqs must be in self.freqs.') + channels = np.searchsorted(obj.freqs, freqs) + + axis = obj[next(iter(dc.keys()))].shape.index(len(obj.freqs)) + for bl in obj: + obj[bl] = obj[bl].take(channels, axis=axis) + + # update metadata + obj.freqs = obj.freqs[channels] + + return obj class RedDataContainer(DataContainer): '''Structure for containing redundant visibilities that can be accessed by any diff --git a/hera_cal/io.py b/hera_cal/io.py index 155996e12..850fbdc13 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -108,21 +108,31 @@ def build_calcontainers(self): total_qual: dict mapping polarization to (Nint, Nfreq) float total quality array ''' self._extract_metadata() - gains, flags, quals, total_qual = odict(), odict(), odict(), odict() + gains, flags = odict(), odict() + + if self.total_quality_array is not None: + total_qual = odict() + else: + total_qual = None + + if self.quality_array is not None: + quals = odict() + else: + quals = None # build dict of gains, flags, and quals for (ant, pol) in self.ants: i, ip = self._antnum_indices[ant], self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] gains[(ant, pol)] = np.array(self.gain_array[i, :, :, ip].T) flags[(ant, pol)] = np.array(self.flag_array[i, :, :, ip].T) - quals[(ant, pol)] = np.array(self.quality_array[i, :, :, ip].T) + if quals is not None: + quals[(ant, pol)] = np.array(self.quality_array[i, :, :, ip].T) + # build dict of total_qual if available - for pol in self.pols: - ip = self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] - if self.total_quality_array is not None: + if total_qual is not None: + for pol in self.pols: + ip = self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] total_qual[pol] = np.array(self.total_quality_array[:, :, ip].T) - 
else: - total_qual = None return gains, flags, quals, total_qual diff --git a/hera_cal/lstbin.py b/hera_cal/lstbin.py index 2bb036b10..df855fc7f 100644 --- a/hera_cal/lstbin.py +++ b/hera_cal/lstbin.py @@ -1045,7 +1045,7 @@ def make_lst_grid( return lst_grid -def sigma_clip(array: np.ndarray, sigma: float=4.0, min_N: int=4, axis: int = 0): +def sigma_clip(array: np.ndarray | np.ma.MaskedArray, sigma: float=4.0, min_N: int=4, axis: int = 0): """ One-iteration robust sigma clipping algorithm. @@ -1075,11 +1075,15 @@ def sigma_clip(array: np.ndarray, sigma: float=4.0, min_N: int=4, axis: int = 0) if array.shape[axis] < min_N: return np.zeros_like(array, dtype=bool) - # get robust location - location = np.expand_dims(np.nanmedian(array, axis=axis), axis=axis) + if isinstance(array, np.ma.MaskedArray): + location = np.expand_dims(np.median(array, axis=axis), axis=axis) + scale = np.expand_dims(np.median(np.abs(array - location), axis=axis) * 1.482579, axis=axis) + else: + # get robust location + location = np.expand_dims(np.nanmedian(array, axis=axis), axis=axis) - # get MAD * 1.482579 for consistency with Gaussian white noise - scale = np.expand_dims(np.nanmedian(np.abs(array - location), axis=axis) * 1.482579, axis=axis) + # get MAD * 1.482579 for consistency with Gaussian white noise + scale = np.expand_dims(np.nanmedian(np.abs(array - location), axis=axis) * 1.482579, axis=axis) # get clipped data clip_flags = np.abs(array - location) / scale > sigma diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index bb02c1218..21a7e3914 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -33,7 +33,8 @@ import h5py from functools import partial import yaml -from .types import Antpair, Pol, Baseline +from .types import Antpair +from .datacontainer import DataContainer try: profile @@ -52,6 +53,7 @@ def lst_align( freq_array: np.ndarray, flags: np.ndarray | None = None, nsamples: np.ndarray | None = None, + where_inpainted: np.ndarray | None = None, rephase: bool = True, antpos: dict[int, np.ndarray] | None = None, lat: float = -30.72152, @@ -180,6 +182,8 @@ def lst_align( nsamples = nsamples[lst_mask] data_lsts = data_lsts[lst_mask] grid_indices = grid_indices[lst_mask] + if where_inpainted is not None: + where_inpainted = where_inpainted[lst_mask] if freq_array is None or antpos is None: raise ValueError("freq_array and antpos is needed for rephase") @@ -206,7 +210,7 @@ def lst_align( # in scope, and is therefore removed when the function returns). # shortcut -- just return all the data, re-organized. - _data, _flags, _nsamples = [], [], [] + _data, _flags, _nsamples, _where_inpainted = [], [], [], [] empty_shape = (0, len(antpairs), len(freq_array), npols) for lstbin in range(len(lst_bin_centres)): mask = grid_indices == lstbin @@ -214,12 +218,16 @@ def lst_align( _data.append(data[mask]) _flags.append(flags[mask]) _nsamples.append(nsamples[mask]) + if where_inpainted is not None: + _where_inpainted.append(where_inpainted[mask]) else: _data.append(np.zeros(empty_shape, complex)) _flags.append(np.zeros(empty_shape, bool)) _nsamples.append(np.zeros(empty_shape, int)) + if where_inpainted is not None: + _where_inpainted.append(np.zeros(empty_shape, bool)) - return lst_bin_centres, _data, _flags, _nsamples + return lst_bin_centres, _data, _flags, _nsamples, _where_inpainted or None def get_lst_bins(lsts: np.ndarray, edges: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Get the LST bin indices for a set of LSTs. 
@@ -252,6 +260,8 @@ def get_lst_bins(lsts: np.ndarray, edges: np.ndarray) -> tuple[np.ndarray, np.nd def reduce_lst_bins( data: list[np.ndarray], flags: list[np.ndarray], nsamples: list[np.ndarray], + where_inpainted: list[np.ndarray] | None = None, + inpainted_mode: bool = False, mutable: bool = False, sigma_clip_thresh: float | None = None, sigma_clip_min_N: int = 4, @@ -277,6 +287,13 @@ def reduce_lst_bins( nsamples A list, the same length/shape as ``data``, containing the number of samples for each measurement. + where_inpainted + A list, the same length/shape as ``data``, containing a boolean mask indicating + which samples have been inpainted. + inpainted_mode + Whether to use the inpainted samples when calculating the statistics. If False, + the inpainted samples are ignored. If True, the inpainted samples are used, and + the statistics are calculated over the un-inpainted samples. mutable Whether the input data (and flags and nsamples) can be modified in place within the algorithm. Setting to true saves memory, and is safe for a one-shot script. @@ -312,8 +329,12 @@ def reduce_lst_bins( out_flags = np.zeros(out_data.shape, dtype=bool) out_std = np.ones(out_data.shape, dtype=complex) out_nsamples = np.zeros(out_data.shape, dtype=float) + days_binned = np.zeros(out_nsamples.shape, dtype=int) - for lstbin, (d,n,f) in enumerate(zip(data, nsamples, flags)): + if where_inpainted is None: + where_inpainted = [None] * nlst_bins + + for lstbin, (d,n,f, inpf) in enumerate(zip(data, nsamples, flags, where_inpainted)): logger.info(f"Computing LST bin {lstbin+1} / {nlst_bins}") # TODO: check that this doesn't make yet another copy... @@ -323,9 +344,13 @@ def reduce_lst_bins( out_data[:, lstbin], out_flags[:, lstbin], out_std[:, lstbin], - out_nsamples[:, lstbin] + out_nsamples[:, lstbin], + days_binned[:, lstbin] ) = lst_average( - d, n, f, mutable=mutable, + d, n, f, + inpainted=inpf, + inpainted_mode=inpainted_mode, + mutable=mutable, flag_thresh=flag_thresh, sigma_clip_thresh=sigma_clip_thresh, sigma_clip_min_N=sigma_clip_min_N, @@ -338,7 +363,7 @@ def reduce_lst_bins( out_nsamples[:, lstbin] = 0.0 - return out_data, out_flags, out_std, out_nsamples + return out_data, out_flags, out_std, out_nsamples, days_binned def _allocate_dfn(shape: tuple[int], d=0.0, f=0, n=0): data = np.full(shape, d, dtype=complex) @@ -349,6 +374,8 @@ def _allocate_dfn(shape: tuple[int], d=0.0, f=0, n=0): @profile def lst_average( data: np.ndarray, nsamples: np.ndarray, flags: np.ndarray, + inpainted: np.ndarray | None = None, + inpainted_mode: bool = False, flag_thresh: float = 0.7, mutable: bool = False, sigma_clip_thresh: float | None= None, @@ -396,12 +423,23 @@ def lst_average( # all data is assumed to be in the same LST bin. assert data.shape == nsamples.shape == flags.shape - + if not mutable: flags = flags.copy() nsamples = nsamples.copy() data = data.copy() + if not inpainted_mode: + # Act like nothing is inpainted. + inpainted = np.zeros(flags.shape, dtype=bool) + elif inpainted is None: + # Flag if a whole blt is flagged: + allf = np.all(flags, axis=2)[:,:, None, :] + + # Assume everything else that's flagged is inpainted. + inpainted = flags.copy() * (~allf) + + # These potential extra flags are certainly not inpainted. 
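+    # For intuition, a sketch of the default above (shapes are
+    # (nights, bls, freqs, pols); the values are illustrative only):
+    # if night 0 of some bl/pol is flagged at every channel, `allf` marks that
+    # blt and it is treated as NOT inpainted, while an isolated flagged
+    # channel on another night is assumed to have been inpainted.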
flags[np.isnan(data) | np.isinf(data) | (nsamples == 0)] = True # Flag entire LST bins if there are too many flags over time @@ -409,11 +447,18 @@ def lst_average( nflags = np.sum(flags) logger.info(f"Percent of data flagged before thresholding: {100*nflags/flags.size:.2f}%") flags |= flag_frac > flag_thresh - data[flags] *= np.nan # do this so that we can do nansum later. multiply to get both real/imag as nan logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to flag_frac > {flag_thresh}") + non_inpainted = flags & ~inpainted + if np.any(non_inpainted): + # Use a masked array instead of a nan-filled array. This is very slightly + # faster than nanmean, and allows us more flexibility. + data = np.ma.masked_array(data, mask=non_inpainted) + # Now do sigma-clipping. if sigma_clip_thresh is not None: + if inpainted_mode: + warnings.warn("Sigma-clipping in in-painted mode is a bad idea.") nflags = np.sum(flags) clip_flags = sigma_clip(data.real, sigma=sigma_clip_thresh, min_N = sigma_clip_min_N) @@ -424,36 +469,59 @@ def lst_average( clip_flags[:, sc_min_N] = False flags |= clip_flags - data[flags] *= np.nan + + if isinstance(data, np.ma.MaskedArray): + data.mask |= clip_flags + else: + data = np.ma.masked_array(data, mask=clip_flags) + logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to sigma clipping") - nsamples[flags] = 0 - norm = np.sum(nsamples, axis=0) # missing a "clip" between 1e-99 and inf here... - ndays_binned = np.sum(nsamples > 0, axis=0) + # Update the "non_inpainted" mask + non_inpainted = data.mask + + nsamples[non_inpainted] = 0 + + # Here we do a check to make sure Nsamples is uniform across frequency. + ndiff = np.diff(nsamples, axis=2) + if np.any(ndiff != 0): + warnings.warn("Nsamples is not uniform across frequency. This will result in spectral structure.") + + norm = np.sum(nsamples, axis=0) + ndays_binned = np.sum((~flags).astype(int), axis=0) logger.info("Calculating mean") - meandata = np.nansum(data * nsamples, axis=0) + meandata = np.sum(data * nsamples, axis=0) - f_min = np.all(flags, axis=0) + lstbin_flagged = np.all(flags, axis=0) if flag_below_min_N: - f_min[ndays_binned < sigma_clip_min_N] = True + lstbin_flagged[ndays_binned < sigma_clip_min_N] = True - meandata[~f_min] /= norm[~f_min] - meandata[f_min] *= np.nan + normalizable = norm > 0 + meandata[normalizable] /= norm[normalizable] + meandata[~normalizable] *= np.nan # get other stats logger.info("Calculating std") with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="Degrees of freedom <= 0 for slice.") std = np.square(data.real - meandata.real) + 1j*np.square(data.imag - meandata.imag) - std = np.nansum(std * nsamples, axis=0) - std[~f_min] /= norm[~f_min] + std = np.sum(std * nsamples, axis=0) + std[normalizable] /= norm[normalizable] std = np.sqrt(std.real) + 1j*np.sqrt(std.imag) - std[f_min] = np.inf - norm[f_min] = 0 + std[~normalizable] = np.inf - return meandata, f_min, std, norm + + # While the previous norm is correct for normalizing the mean, we now + # calculate nsamples as the unflagged samples in each LST bin. + nsamples = np.sum(nsamples * (~flags).astype(int), axis=0) + + # Get the standard array out of the masked array. 
+ if isinstance(meandata, np.ma.MaskedArray): + meandata = meandata.data + + return meandata, lstbin_flagged, std, nsamples, ndays_binned def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: """ @@ -483,6 +551,7 @@ def lst_bin_files_for_baselines( reds: RedundantGroups | None = None, freq_min: float | None = None, freq_max: float | None = None, + where_inpainted_files: list[list[str | Path | None]] | None = None, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list[np.ndarray]]: """Produce a set of LST-binned (but not averaged) data for a set of baselines. @@ -547,6 +616,10 @@ def lst_bin_files_for_baselines( freq_min, freq_max Minimum and maximum frequencies to include in the data. If not provided, all frequencies will be included. + where_inpainted_files + A list of lists of strings, one for each file, where each file is a UVFlag file + specifying which data are in-painted. If not provided, no inpainting will be + assumed. Returns ------- @@ -560,6 +633,8 @@ def lst_bin_files_for_baselines( Same as ``data``, but boolean flags. nsamples Same as ``data``, but sample counts. + where_inpainted + Same as ``data``, but boolean flags indicating where inpainting has been done. times_in_bins The JDs that are in each LST bin -- a list of arrays. """ @@ -621,6 +696,12 @@ def lst_bin_files_for_baselines( f=True ) + if where_inpainted_files is None or all(w is None for w in where_inpainted_files): + where_inpainted_files = [None] * len(metas) + where_inpainted = None + else: + where_inpainted = np.zeros_like(flags) + if cal_files is None: cal_files = [None] * len(metas) @@ -631,7 +712,7 @@ def lst_bin_files_for_baselines( # This loop actually reads the associated data in this LST bin. ntimes_so_far = 0 - for meta, calfl, tind, tarr in zip(metas, cal_files, time_idx, time_arrays): + for meta, calfl, tind, tarr, inpfile in zip(metas, cal_files, time_idx, time_arrays, where_inpainted_files): logger.info(f"Reading {meta.path}") slc = slice(ntimes_so_far,ntimes_so_far+len(tarr)) ntimes_so_far += len(tarr) @@ -657,6 +738,20 @@ def lst_bin_files_for_baselines( _data, _flags, _nsamples = io.HERAData(meta.path).read( bls=bls_to_load, times=tarr, freq_chans=freq_chans, polarizations=pols, ) + if inpfile is not None: + # This returns a DataContainer (unless something went wrong) since it should + # always be a 'baseline' type of UVFlag.s + inpainted = io.load_flags(inpfile) + if not isinstance(inpainted, DataContainer): + raise ValueError(f"Expected {inpfile} to be a DataContainer") + + # We need to down-selecton times/freqs (bls and pols will be sub-selected + # based on those in the data through the next loop) + inpainted.select_or_expand_times(times=tarr, skip_bda_check=True) + inpainted.select_or_expand_freqs(channels=freq_chans) + else: + inpainted = None + if redundantly_averaged: keyed = reds.keyed_on_bls(_data.antpairs()) @@ -686,6 +781,8 @@ def lst_bin_files_for_baselines( data[slc, i, :, j] = _data[blpol] flags[slc, i, :, j] = _flags[blpol] nsamples[slc, i, :, j] = _nsamples[blpol] + if inpainted is not None: + where_inpainted[slc, i, :, j] = inpainted[blpol] else: # This baseline+pol doesn't exist in this file. That's # OK, we don't assume all baselines are in every file. @@ -697,12 +794,12 @@ def lst_bin_files_for_baselines( # LST bin edges are the actual edges of the bins, so should have length # +1 of the LST centres. We use +dlst instead of +dlst/2 on the top edge # so that np.arange definitely gets the last edge. 
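    # (Illustrative numbers: with dlst = 0.01 and bin centres [0.10, 0.11],
    # the edges are [0.095, 0.105, 0.115] -- one more edge than centres.)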
- # lst_edges = np.arange(outfile_lsts[0] - dlst/2, outfile_lsts[-1] + dlst, dlst) - bin_lst, data, flags, nsamples = lst_align( + bin_lst, data, flags, nsamples, where_inpainted = lst_align( data=data, flags=None if ignore_flags else flags, nsamples=nsamples, data_lsts=lsts, + where_inpainted=where_inpainted, antpairs=antpairs, lst_bin_edges=lst_bin_edges, freq_array = freqs, @@ -717,7 +814,7 @@ def lst_bin_files_for_baselines( mask = bins == i times_in_bins.append(times[mask]) - return bin_lst, data, flags, nsamples, times_in_bins + return bin_lst, data, flags, nsamples, where_inpainted, times_in_bins def apply_calfile_rules( data_files: list[list[str]], @@ -772,7 +869,8 @@ def filter_required_files_by_times( lst_range: tuple[float, float], data_metas: list[list[FastUVH5Meta]], cal_files: list[list[str]] | None = None, -) -> tuple[list, list, list, list, list]: + where_inpainted_files: list[list[str]] | None = None, +) -> tuple[list, list, list, list, list, list]: lstmin, lstmax = lst_range lstmin %= (2 * np.pi) @@ -783,19 +881,23 @@ def filter_required_files_by_times( if not cal_files: cal_files = [[None for _ in dm] for dm in data_metas] + if not where_inpainted_files: + where_inpainted_files = [[None for _ in dm] for dm in data_metas] + tinds = [] all_lsts = [] file_list = [] time_arrays = [] cals = [] + where_inpainted = [] # This loop just gets the number of times that we'll be reading. # Even though the input files should already have been configured to be those # that fall within the output LST range, we still need to read them in to # check exactly which time integrations we require. - for night, callist in zip(data_metas, cal_files): - for meta, cal in zip(night, callist): + for night, callist, inplist in zip(data_metas, cal_files, where_inpainted_files): + for meta, cal, inp in zip(night, callist, inplist): tarr = meta.times lsts = meta.lsts % (2*np.pi) lsts[lsts < lstmin] += 2 * np.pi @@ -808,8 +910,40 @@ def filter_required_files_by_times( all_lsts.append(lsts[tind]) file_list.append(meta) cals.append(cal) - return tinds, time_arrays, all_lsts, file_list, cals + where_inpainted.append(inp) + + return tinds, time_arrays, all_lsts, file_list, cals, where_inpainted + +def _get_where_inpainted_files( + data_files: list[list[str | Path]], + where_inpainted_file_rules: list[tuple[str, str]] | None +) -> list[list[str | Path]] | None: + if where_inpainted_file_rules is None: + return None + + where_inpainted_files = [] + for dflist in enumerate(data_files): + this = [] + where_inpainted_files.append(this) + for df in dflist: + wif = df + for rule in where_inpainted_file_rules: + wif = wif.replace(rule[0], rule[1]) + if os.path.exists(wif): + this.append(wif) + else: + raise IOError(f"Where inpainted file {wif} does not exist") + return where_inpainted_files + +def _configure_inpainted_mode(output_flagged, output_inpainted, where_inpainted_files): + # Sort out defaults for inpaint/flagging mode + if output_inpainted is None: + output_inpainted = bool(where_inpainted_files) + + if not output_inpainted and not output_flagged: + raise ValueError("Both output_inpainted and output_flagged are False. 
One must be True.") + return output_flagged, output_inpainted def lst_bin_files_single_outfile( config_opts: dict[str, Any], @@ -840,6 +974,9 @@ def lst_bin_files_single_outfile( flag_thresh: float = 0.7, freq_min: float | None = None, freq_max: float | None = None, + output_inpainted: bool | None = None, + output_flagged: bool = True, + where_inpainted_file_rules: list[tuple[str, str]] | None = None, ) -> dict[str, Path]: """ Bin data files into LST bins, and write all bins to disk in a single file. @@ -847,6 +984,21 @@ def lst_bin_files_single_outfile( Note that this is generally not meant to be called directly, but rather through the :func:`lst_bin_files` function. + The mode(s) in which the function does the averaging can be specified via the + `output_inpainted`, `output_flagged` and `where_inpainted_file_rules` options. + Algorithmically, there are two modes of averaging: either flagged data is *ignored* + in the average (and in Nsamples) or the flagged data is included in the average but + ignored in Nsamples. These are called "flagged" and "inpainted" modes respectively. + The latter only makes sense if the data in the flagged regions has been inpainted, + rather than left as raw data. For delay-filtered data, either mode is equivalent, + since the flagged data itself is set to zero. By default, this function *only* + uses the "flagged" mode, *unless* the `where_inpainted_file_rules` option is set, + which indicates that the files are definitely in-painted, and in this case it will + use *both* modes by default. + + .. note:: Both "flagged" and "inpainted" mode as implemented in this function are + *not* spectrally smooth if the input Nsamples are not spectrally uniform. + Parameters ---------- config_opts @@ -947,12 +1099,78 @@ def lst_bin_files_single_outfile( freq_max The maximum frequency to include in the output files. If not provided, this is set to the maximum frequency in the first data file on the first night. + output_inpainted + If True, output data LST-binned in in-painted mode. This mode does *not* flag + data for the averaging, assuming that data that has flags has been in-painted + to improve spectral smoothness. It does take the flags into account for the + LST-binned Nsamples, however. + output_flagged + If True, output data LST-binned in flagged mode. This mode *does* apply flags + to the data before averaging. It will yield the same Nsamples as the in-painted + mode, but simply ignores flagged data for the average, which can yield less + spectrally-smooth LST-binned results. + where_inpainted_file_rules + Rules to transform the input data file names into the corresponding "where + inpainted" files (which should be in UVFlag format). If provided, this indicates + that the data itself is in-painted, and the `output_inpainted` mode will be + switched on by default. These files should specify which data is in-painted + in the associated data file (which may be different than the in-situ flags + of the data object). If not provided, but `output_inpainted` is set to True, + all data-flags will be considered in-painted except for baseline-times that are + fully flagged, which will be completely ignored. Returns ------- out_files A dict of output files, keyed by the type of file (e.g. 'LST', 'STD', 'GOLDEN', 'REDUCEDCHAN'). + + Notes + ----- + It is worth describing in a bit more detail what is actually _in_ the output files. + The "LST" files contain the LST-binned data, with the data averaged over each LST + bin. 
+    There are two possible modes for this: inpainted and flagged. In inpainted mode,
+    the data in flagged bl-channel-pols is used for the averaging, as it is considered
+    to be in-painted. This gives the most spectrally-smooth results. In order to ignore
+    a particular bl-channel-pol while still using inpaint mode, supply a "where inpainted"
+    file, which should be a UVFlag object that specifies which bl-channel-pols are
+    inpainted in the associated data file. Anything that's flagged but not inpainted is
+    ignored for the averaging. In this inpainted mode, the Nsamples are the number of
+    un-flagged integrations (whether they were in-painted or not). The LST-binned flags
+    are only set to True if ALL of the nights for a given bl-channel-pol are flagged
+    (again, whether they were in-painted or not). In flagged mode, both the Nsamples
+    and Flags are the same as in inpainted mode. The averaged data, however, ignores
+    any flagged data, which can yield less spectrally-smooth results. The "STD" files
+    contain LST-binned "data" that is the standard deviation of the data in each LST bin.
+    This differs between inpainted and flagged modes in the same way as the "LST" files:
+    in inpainted mode, the flagged-but-inpainted data is used for calculating the sample
+    variance, while in flagged mode, only the unflagged data is used. The final flags
+    in the "STD" files are equivalent to those in the "LST" files. The Nsamples in the
+    "STD" files is actually the number of unflagged nights in the LST bin (so, not the
+    sum of Nsamples), where "unflagged" really does mean unflagged -- whether inpainted
+    or not.
+
+    One note here about what is considered "flagged" vs. "flagged and inpainted" vs.
+    "flagged and not inpainted". In flagged mode, there are input flags that exist in
+    the input files. These are potentially *augmented* by sigma clipping within the LST
+    binner, and by flagging whole LST bins if they have too few unflagged integrations.
+    In inpainted mode, input flags are considered as normal flags; however, only
+    "non-inpainted" flags are *ignored* for the averaging. By default, all flagged data
+    is considered to be in-painted UNLESS it is a blt-pol that is fully flagged (i.e.
+    all channels are flagged for an integration for a single bl and pol). However, you
+    can tell the routine that other data is NOT in-painted by supplying a "where
+    inpainted" file. Integrations in LST bins that end up having too few unflagged
+    integrations will be flagged inside the binner; however, in inpainted mode, if
+    these are considered "inpainted", they will still be used in the averaging (this
+    means they will have "valid" data for the average, but their average will be
+    flagged). On the other hand, flags that are applied by sigma-clipping will be
+    considered NOT inpainted, i.e. those data will be ignored in the average, just as
+    in flagged mode. In this case, either choice is bad: including them in the average
+    is bad because, even if they were actually in-painted, whatever value they have is
+    clearly triggering the sigma-clipper and is therefore an outlier; ignoring them is
+    also bad because the point of in-painting mode is to get smoother spectra, and
+    this negates that. So, it's best simply not to do sigma-clipping
+    in inpainted mode.
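+
+    As a toy illustration of the two averaging modes (the numbers here are
+    hypothetical): suppose an LST bin contains two nights with values [2, 10]
+    for some bl-channel-pol, each with Nsamples of 1, and the second night is
+    flagged but in-painted. Flagged mode averages only the unflagged night,
+    giving 2; in-painted mode averages both nights, giving 6. In both modes
+    the output Nsamples is 1 (only the unflagged night counts) and the output
+    flag is False (not ALL of the nights are flagged).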
""" write_kwargs = write_kwargs or {} @@ -964,6 +1182,14 @@ def lst_bin_files_single_outfile( data_files, calfile_rules, ignore_missing=ignore_missing_calfiles ) + where_inpainted_files = _get_where_inpainted_files( + data_files, where_inpainted_file_rules + ) + output_inpainted, output_flagged = _configure_inpainted_mode( + output_inpainted, output_flagged, where_inpainted_files + ) + + # Prune empty nights (some nights start with files, but have files removed because # they have no associated calibration) data_files = [df for df in data_files if df] @@ -987,9 +1213,7 @@ def lst_bin_files_single_outfile( if outdir is None: outdir = os.path.dirname(os.path.commonprefix(abscal.flatten(data_files))) - x_orientation = metadata['x_orientation'] start_jd = metadata['start_jd'] - integration_time = metadata['integration_time'] # get metadata logger.info("Getting metadata from first file...") @@ -1012,7 +1236,7 @@ def lst_bin_files_single_outfile( for fl_list in data_metas: meta = fl_list[len(fl_list)//2] antpairs = meta.get_transactional("antpairs") - ubls = set(reds.get_ubl_key(ap) for ap in antpairs) + ubls = {reds.get_ubl_key(ap) for ap in antpairs} if len(ubls) != len(antpairs): # At least two of the antpairs are in the same redundant group. redundantly_averaged = False @@ -1057,10 +1281,11 @@ def lst_bin_files_single_outfile( [x - dlst/2 for x in lst_bins] + [lst_bins[-1] + dlst/2] ) - tinds, time_arrays, all_lsts, file_list, cals = filter_required_files_by_times( + tinds, time_arrays, all_lsts, file_list, cals, where_inpainted_files = filter_required_files_by_times( (lst_bin_edges[0], lst_bin_edges[-1]), data_metas, input_cals, + where_inpainted_files ) # If we have no times at all for this file, just return if len(all_lsts) == 0: @@ -1068,7 +1293,6 @@ def lst_bin_files_single_outfile( all_lsts = np.concatenate(all_lsts) - # The "golden" data is the full data over all days for a small subset of LST # bins. This works best if the LST bins are small (similar to the size of the # raw integrations). Usually, the length of "bins" will be zero. 
@@ -1098,20 +1322,27 @@ def lst_bin_files_single_outfile( **write_kwargs, ) out_files = {} - for kind in ['LST', 'STD']: - # Create the files we'll write to - out_files[kind] = create_outfile( - kind = kind, - lst=lst_bin_edges[0], - lsts=lst_bins, - ) + for inpaint_mode in [True, False]: + if inpaint_mode and not output_inpainted: + continue + if not inpaint_mode and not output_flagged: + continue + + for kind in ['LST', 'STD']: + # Create the files we'll write to + out_files[(kind, inpaint_mode)] = create_outfile( + kind = kind, + lst=lst_bin_edges[0], + lsts=lst_bins, + inpaint_mode = inpaint_mode, + ) nbls_so_far = 0 for bi, bl_chunk in enumerate(bl_chunks): logger.info(f"Baseline Chunk {bi+1} / {len(bl_chunks)}") # data/flags/nsamples are *lists*, with nlst_bins entries, each being an # array, with shape (times, bls, freqs, npols) - bin_lst, data, flags, nsamples, binned_times = lst_bin_files_for_baselines( + bin_lst, data, flags, nsamples, where_inpainted, binned_times = lst_bin_files_for_baselines( data_files = file_list, lst_bin_edges=lst_bin_edges, antpairs=bl_chunk, @@ -1128,12 +1359,16 @@ def lst_bin_files_single_outfile( reds=reds, freq_min=freq_min, freq_max=freq_max, + where_inpainted_files=where_inpainted_files, ) slc = slice(nbls_so_far, nbls_so_far + len(bl_chunk)) if bi == 0: # On the first baseline chunk, create the output file + # TODO: we're not writing out the where_inpainted data for the GOLDEN + # or REDUCEDCHAN files yet -- it looks like we'd have to write out + # completely new UVFlag files for this. out_files['GOLDEN'] = [] for nbin in golden_bins: out_files["GOLDEN"].append( @@ -1171,28 +1406,36 @@ def lst_bin_files_single_outfile( nsamples=nsamples[0][:, :, save_channels].transpose((1, 0, 2, 3)), ) - data, flags, std, nsamples = reduce_lst_bins( - data, flags, nsamples, - flag_thresh=flag_thresh, - sigma_clip_thresh = sigma_clip_thresh, - sigma_clip_min_N = sigma_clip_min_N, - flag_below_min_N=flag_below_min_N, - ) + for inpainted in [True, False]: + if inpainted and not output_inpainted: + continue + if not inpainted and not output_flagged: + continue - write_baseline_slc_to_file( - fl=out_files['LST'], - slc=slc, - data=data, - flags=flags, - nsamples=nsamples, - ) - write_baseline_slc_to_file( - fl=out_files['STD'], - slc=slc, - data=std, - flags=flags, - nsamples=nsamples, - ) + _data, _flags, std, _nsamples, days_binned = reduce_lst_bins( + data, flags, nsamples, + where_inpainted=where_inpainted, + inpainted_mode=inpainted, + flag_thresh=flag_thresh, + sigma_clip_thresh = sigma_clip_thresh, + sigma_clip_min_N = sigma_clip_min_N, + flag_below_min_N=flag_below_min_N, + ) + + write_baseline_slc_to_file( + fl=out_files[('LST', inpainted)], + slc=slc, + data=_data, + flags=_flags, + nsamples=_nsamples, + ) + write_baseline_slc_to_file( + fl=out_files[('STD', inpainted)], + slc=slc, + data=std, + flags=_flags, + nsamples=days_binned, + ) nbls_so_far += len(bl_chunk) @@ -1427,22 +1670,26 @@ def create_lstbin_output_file( times: np.ndarray | None = None, lsts: np.ndarray | None = None, history: str = "", - fname_format: str="zen.{kind}.{lst:7.5f}.uvh5", + fname_format: str="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5", overwrite: bool = False, antpairs: list[tuple[int, int]] | None = None, freq_min: float | None = None, freq_max: float | None = None, channels: np.ndarray | list[int] | None = None, vis_units: str = "Jy", + inpaint_mode: bool | None = None, ) -> Path: outdir = Path(outdir) - # pols = set(pols) + # update history file_list_str = "-".join(ff.path.name 
for ff in file_list) file_history = f"{history} Input files: {file_list_str}" _history = file_history + utils.history_string() - fname = outdir / fname_format.format(kind=kind, lst=lst, pol=''.join(pols)) + fname = outdir / fname_format.format( + kind=kind, lst=lst, pol=''.join(pols), + inpaint_mode='.inpaint' if inpaint_mode else ('.flagged' if inpaint_mode is False else "") + ) logger.info(f"Initializing {fname}") @@ -1787,4 +2034,7 @@ def lst_bin_arg_parser(): a.add_argument("--only-last-file-per-night", action='store_true', default=False, help="if true, only use the first and last file every night to obtain antpairs") a.add_argument("--freq-min", type=float, default=None, help="minimum frequency to include in lstbinning") a.add_argument("--freq-max", type=float, default=None, help="maximum frequency to include in lstbinning") + a.add_argument("--no-flagged-mode", action='store_true', help="turn off output of flagged mode LST-binning") + a.add_argument("--do-inpaint-mode", action='store_true', default=None, help="turn on inpainting mode LST-binning") + a.add_argument("--where-inpainted-file-rules", nargs='*', type=str, help="rules to convert datafile names to where-inpainted-file names. A series of two strings where the first will be replaced by the latter") return a diff --git a/hera_cal/tests/mock_uvdata.py b/hera_cal/tests/mock_uvdata.py index b96f80504..29822aa85 100644 --- a/hera_cal/tests/mock_uvdata.py +++ b/hera_cal/tests/mock_uvdata.py @@ -130,6 +130,7 @@ def identifiable_data_from_uvd( def create_uvd_identifiable( with_noise: bool = False, autos_noise: bool = False, + flag_frac: float = 0.0, **kwargs) -> UVData: """Make a UVData object with identifiable data. @@ -144,6 +145,8 @@ def create_uvd_identifiable( if with_noise: add_noise_to_uvd(uvd, autos = autos_noise) + if flag_frac > 0: + add_flags_to_uvd(uvd, flag_frac=flag_frac) return uvd def add_noise_to_uvd(uvd, autos: bool = False): @@ -167,6 +170,16 @@ def add_noise_to_uvd(uvd, autos: bool = False): ) hd.update(data=data) +def add_flags_to_uvd(uvd, flag_frac: float = 0.1): + hd = io.to_HERAData(uvd) + data, flags, nsamples = hd.read() + + for bl in data.bls(): + if bl[0] == bl[1] and bl[2][0] == bl[2][1]: + continue + flags[bl] = np.random.uniform(size=flags[bl].shape) < flag_frac + hd.update(flags=flags) + def write_files_in_hera_format( uvds: list[UVData] | list[list[UVData]], tmpdir: Path, diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py index bfd60ef5b..e1ea39b4f 100644 --- a/hera_cal/tests/test_lstbin_simple.py +++ b/hera_cal/tests/test_lstbin_simple.py @@ -106,7 +106,7 @@ def lst_align_with(**kw): def test_increasing_lsts_one_per_bin(self, benchmark): kwargs = self.get_lst_align_data(ntimes=6) - bins, d, f, n = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) + bins, d, f, n, inp = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) # We should not be changing the data at all. d = np.squeeze(np.asarray(d)) @@ -120,7 +120,7 @@ def test_increasing_lsts_one_per_bin(self, benchmark): def test_multi_days_one_per_bin(self, benchmark): kwargs = self.get_lst_align_data(ndays=2, ntimes=7) - bins, d, f, n = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) + bins, d, f, n, inp = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) # We should not be changing the data at all. 
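        # (With one integration per LST bin and rephase=False, binning merely
        # re-groups the input, so the outputs should match the inputs exactly.)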
d = np.squeeze(np.asarray(d)) @@ -140,7 +140,7 @@ def test_multi_days_with_flagging(self, benchmark): flags[7:] = True kwargs['data'][7:] = 1000.0 - bins, d, f, n = benchmark(lstbin_simple.lst_align, rephase=False, flags=flags, **kwargs) + bins, d, f, n, inp = benchmark(lstbin_simple.lst_align, rephase=False, flags=flags, **kwargs) d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) @@ -159,7 +159,7 @@ def test_multi_days_with_nsamples_zero(self, benchmark): nsamples[7:] = 0.0 kwargs['data'][7:] = 1000.0 - bins, d, f, n = lstbin_simple.lst_align(rephase=False, nsamples=nsamples, **kwargs) + bins, d, f, n, inp = lstbin_simple.lst_align(rephase=False, nsamples=nsamples, **kwargs) d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) @@ -176,8 +176,8 @@ def test_rephase(self, benchmark): """Test that rephasing where each bin is already at center does nothing.""" kwargs = self.get_lst_align_data(ntimes=7) - bins0, d0, f0, n0 = benchmark(lstbin_simple.lst_align, rephase=True, **kwargs) - bins, d, f, n = lstbin_simple.lst_align(rephase=False, **kwargs) + bins0, d0, f0, n0, inp = benchmark(lstbin_simple.lst_align, rephase=True, **kwargs) + bins, d, f, n, inp = lstbin_simple.lst_align(rephase=False, **kwargs) np.testing.assert_allclose(d, d0, rtol=1e-6) np.testing.assert_allclose(f, f0, rtol=1e-6) np.testing.assert_allclose(n, n0, rtol=1e-6) @@ -191,12 +191,12 @@ def test_lstbinedges_modulus(self, benchmark): lst_bin_edges = edges.copy() lst_bin_edges -= 4*np.pi - bins, d0, f0, n0 = benchmark(lstbin_simple.lst_align, lst_bin_edges=lst_bin_edges, **kwargs) + bins, d0, f0, n0, inp = benchmark(lstbin_simple.lst_align, lst_bin_edges=lst_bin_edges, **kwargs) lst_bin_edges = edges.copy() lst_bin_edges += 4*np.pi - bins, d, f, n = lstbin_simple.lst_align(lst_bin_edges=lst_bin_edges, **kwargs) + bins, d, f, n, inp = lstbin_simple.lst_align(lst_bin_edges=lst_bin_edges, **kwargs) np.testing.assert_allclose(d, d0) np.testing.assert_allclose(f, f0) @@ -227,7 +227,7 @@ def get_input_data( def test_one_point_per_bin(self, benchmark): d, f, n = self.get_input_data(ntimes=(1,)) - dd, ff, std, nn = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) + dd, ff, std, nn, db = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) assert dd.shape == ff.shape == std.shape == nn.shape @@ -243,7 +243,7 @@ def test_one_point_per_bin(self, benchmark): def test_zerosize_bin(self): d, f, n = self.get_input_data(ntimes=(0, 1)) print(d[0].shape, len(d)) - dd, ff, std, nn = lstbin_simple.reduce_lst_bins(d, f, n) + dd, ff, std, nn, db = lstbin_simple.reduce_lst_bins(d, f, n) assert dd.shape[1] == 2 # 2 LST bins assert np.all(np.isnan(dd[:, 0])) @@ -253,7 +253,7 @@ def test_zerosize_bin(self): @pytest.mark.parametrize("ntimes", [(4, ), (5, 4)]) def test_multi_points_per_bin(self, ntimes, benchmark): d, f, n = self.get_input_data(ntimes=ntimes) - dd, ff, std, nn = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) + dd, ff, std, nn, db = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) assert dd.shape == ff.shape == std.shape == nn.shape @@ -271,7 +271,7 @@ def test_multi_points_per_bin_flagged(self): d, f, n = self.get_input_data(ntimes=(4,)) f[0][2:] = True d[0][2:] = 1000.0 - dd, ff, std, nn = lstbin_simple.reduce_lst_bins(d,f,n) + dd, ff, std, nn, db = lstbin_simple.reduce_lst_bins(d,f,n) assert dd.shape == ff.shape == std.shape == nn.shape @@ -329,14 +329,14 @@ def test_sigma_clip_without_outliers(self, benchmark): nsamples = np.ones_like(data) flags = np.zeros_like(data, dtype=bool) - data_n, flg_n, std_n, norm_n 
=lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, daysbinned =lstbin_simple.lst_average( data=data, nsamples=nsamples, flags=flags, sigma_clip_thresh=None, ) - data, flg, std, norm = benchmark( + data, flg, std, norm, daysbinned = benchmark( lstbin_simple.lst_average, data=data, nsamples=nsamples, @@ -355,7 +355,7 @@ def test_average_repeated(self): nsamples = np.ones_like(data) flags = np.zeros_like(data, dtype=bool) - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( data=data, nsamples=nsamples, flags=flags, ) @@ -366,7 +366,7 @@ def test_average_repeated(self): # Now flag the last "night" flags[-1] = True - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( data=data, nsamples=nsamples, flags=flags, ) @@ -376,14 +376,14 @@ def test_average_repeated(self): assert np.allclose(norm_n, 2.0) def test_std_simple(self): - shape = (5000, 2) # 1000 nights, doesn't matter what the other axis is. + shape = (5000, 1, 2, 2) # 1000 nights, doesn't matter what the other axis is. std = 2.0 data = np.random.normal(scale=std, size=shape) + np.random.normal(scale=std, size=shape)*1j nsamples = np.ones_like(data, dtype=float) flags = np.zeros_like(data, dtype=bool) - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( data=data, nsamples=nsamples, flags=flags, ) @@ -398,7 +398,7 @@ def test_std_simple(self): @pytest.mark.parametrize("nsamples", ("ones", "random")) @pytest.mark.parametrize("flags", ("zeros", "random")) def test_std(self, nsamples, flags): - shape = (5000, 2) # 1000 nights, doesn't matter what the other axis is. + shape = (5000, 1, 2, 2) # 1000 nights, doesn't matter what the other axis is. std = 2.0 if nsamples == "ones": @@ -417,7 +417,7 @@ def test_std(self, nsamples, flags): flags = np.zeros(data.shape, dtype=bool) - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( data=data, nsamples=nsamples, flags=flags, ) @@ -434,26 +434,111 @@ def test_std(self, nsamples, flags): assert not np.any(flg_n) + def test_inpaint_mode(self): + shape = (7,8,9, 1) # nights, bls, freqs, pols + _data = np.random.random(shape) + np.random.random(shape)*1j + nsamples = np.ones(_data.shape, dtype=float) + flags = np.zeros(_data.shape, dtype=bool) + + # First test -- no flags should mean inpainted_mode does nothing. + df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False + ) + + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True + ) + + assert np.allclose(df, di) + assert np.allclose(ff, fi) + assert np.allclose(stdf, stdi) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + + + # Now test with a whole LST bin flagged for a single bl-chan-pol (but inpainted) + flags[:, 0, 0, 0] = True + df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False + ) + + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True + ) + + # The data and std in the fully-flagged bin should be different, but Nsamples + # and Flags should be the same. 
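+        # (In flagged mode the fully-flagged bin has no usable samples, so its
+        # mean is NaN; in-painted mode averages the in-painted values instead.)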
+ assert not np.allclose(df[0,0,0], di[0,0,0]) + assert np.allclose(df[1:], di[1:]) + assert np.allclose(ff, fi) + assert not np.allclose(stdf[0,0,0], stdi[0,0,0]) + assert np.allclose(stdf[1:], stdi[1:]) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + + # Now test with a whole spectrum flagged for one night + flags[:] = False + flags[0, 0, :, 0] = True + df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False + ) + + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True + ) + + # This should give exactly the same results either way, because the full-flagged + # blt is considered to NOT be inpainted by default. + assert np.allclose(df, di) + assert np.allclose(ff, fi) + assert np.allclose(stdf, stdi) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + + # However, if we had explicitly told the routine that the blt was inpainted, + # we'd get a different result... + df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False, + inpainted=np.ones_like(flags) # Everything in-painted. + ) + + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True, + inpainted=np.ones_like(flags) # Everything in-painted. + ) + + # The LST-binned data will be different for blt=0, pol=0: + assert not np.allclose(df[0,:,0], di[0,:,0]) + assert np.allclose(df[1:], di[1:]) + assert np.allclose(ff, fi) + assert not np.allclose(stdf[0,:,0], stdi[0,:,0]) + assert np.allclose(stdf[1:], stdi[1:]) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + + + + def test_flag_below_min_N(self): - shape = (7,8,9) + shape = (7,8,9, 2) _data = np.random.random(shape) + np.random.random(shape)*1j nsamples = np.ones(_data.shape, dtype=float) flags = np.zeros(_data.shape, dtype=bool) # No samples have more than min_N, so they should all be flagged. - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=8, flag_below_min_N=True ) assert np.all(flg_n) - assert np.all(norm_n==0) - assert np.all(np.isinf(std_n)) - assert np.all(np.isnan(data_n)) + assert np.all(norm_n==7) # Even though they're flagged, we track the nsamples + assert not np.all(np.isinf(std_n)) + assert not np.all(np.isnan(data_n)) # this time, there's enough samples, but too many are flagged... flags[:5] = True - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=5, flag_below_min_N=True ) @@ -461,13 +546,12 @@ def test_flag_below_min_N(self): assert np.all(flg_n) assert np.all(norm_n==0) assert np.all(np.isinf(std_n)) - assert np.all(np.isnan(data_n)) - + # this time, only one column is flagged too much... # this time, there's enough samples, but too many are flagged... 
flags[:] = False flags[:5, 0] = True - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=5, flag_below_min_N=True ) @@ -475,7 +559,6 @@ def test_flag_below_min_N(self): assert np.all(flg_n[0]) assert np.all(norm_n[0]==0) assert np.all(np.isinf(std_n[0])) - assert np.all(np.isnan(data_n[0])) assert not np.any(flg_n[1:]) assert np.all(norm_n[1:]>0) @@ -541,14 +624,14 @@ def uvc_file(uvc, uvd_file: Path) -> Path: class Test_LSTBinFilesForBaselines: def test_defaults(self, uvd, uvd_file): - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()+0.01], antpairs=uvd.get_antpairs(), rephase=False ) - lstbins, d, f, n, times = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d, f, n, inpflg, times = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()+0.01], antpairs=uvd.get_antpairs(), @@ -566,7 +649,7 @@ def test_defaults(self, uvd, uvd_file): np.testing.assert_allclose(times0, times) def test_empty(self, uvd, uvd_file): - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], antpairs=[(127, 128)], @@ -577,7 +660,7 @@ def test_empty(self, uvd, uvd_file): def test_extra(self, uvd, uvd_file): # Providing baselines that don't exist in the file is fine, they're just ignored. - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], antpairs=uvd.get_antpairs() + [(127, 128)], @@ -588,7 +671,7 @@ def test_extra(self, uvd, uvd_file): def test_freqrange(self, uvd, uvd_file, uvc_file): """Test that providing freq_range works.""" - bins, data, flags, nsamples, times = lstbin_simple.lst_bin_files_for_baselines( + bins, data, flags, nsamples, inpflg, times = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], cal_files = [uvc_file], @@ -631,7 +714,7 @@ def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): ) def test_simple_redundant_averaged_file(self, uvd_redavg, uvd_redavg_file): - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_redavg_file], lst_bin_edges=[uvd_redavg.lst_array.min()-0.1, uvd_redavg.lst_array.max()+0.1], redundantly_averaged=True, @@ -842,12 +925,12 @@ def test_baseline_chunking(self, tmp_path_factory): ) for flset, flsetc in zip(out_files, out_files_chunked): - assert flset['LST'] != flsetc['LST'] + assert flset[('LST', False)] != flsetc[('LST', False)] uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[('LST', False)]) uvdlstc = UVData() - uvdlstc.read(flsetc['LST']) + uvdlstc.read(flsetc[('LST', False)]) assert uvdlst == uvdlstc @@ -922,12 +1005,12 @@ def test_compare_nontrivial_cal( ) for flset, flsetc in zip(out_files, out_files_recal): - assert flset['LST'] != flsetc['LST'] + assert flset[('LST', False)] != flsetc[('LST', False)] 
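+            # (Output files are now keyed by (kind, inpaint_mode) tuples, so
+            # False here selects the flagged-mode file.)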
uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[('LST', False)]) uvdlstc = UVData() - uvdlstc.read(flsetc['LST']) + uvdlstc.read(flsetc[('LST', False)]) print(np.unique(uvdlstc.lst_array)) expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False) @@ -1010,7 +1093,7 @@ def test_nontrivial_cal( assert len(out_files) == 2 for flset in out_files: uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[('LST', False)]) # Don't worry about history here, because we know they use different inputs expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False) @@ -1068,7 +1151,7 @@ def test_redundantly_averaged( for flset in out_files: uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[('LST', False)]) # Don't worry about history here, because we know they use different inputs expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False) @@ -1123,3 +1206,38 @@ def test_output_file_select(self, tmp_path_factory): with pytest.raises(ValueError, match='output_file_select must be less than the number of output files'): lstbf(output_file_select=100) + + def test_inpaint_mode_no_flags(self, tmp_path_factory): + """Test that using inpaint mode does nothing when there's no flags.""" + tmp = tmp_path_factory.mktemp("inpaint_no_flags") + uvds = mockuvd.make_dataset( + ndays=3, nfiles=2, ntimes=2, + creator=mockuvd.create_uvd_identifiable, + antpairs = [(i,j) for i in range(7) for j in range(i, 7)], # 55 antpairs + pols = ('xx', 'yx'), + freqs=np.linspace(140e6, 180e6, 3), + redundantly_averaged=True, + ) + + data_files = mockuvd.write_files_in_hera_format(uvds, tmp) + + cfl = tmp / "lstbin_config_file.yaml" + config_info = lstbin_simple.make_lst_bin_config_file( + cfl, data_files, ntimes_per_file=2, clobber=True, + ) + out_files = lstbin_simple.lst_bin_files( + config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5", + Nbls_to_load=11, rephase=False, + sigma_clip_thresh=0.0, + sigma_clip_min_N=2, + output_flagged=True, + output_inpainted=True + ) + + assert len(out_files) == 2 + + for flset in out_files: + flagged = UVData.from_file(flset[('LST', False)]) + inpainted = UVData.from_file(flset[('LST', True)]) + + assert flagged == inpainted diff --git a/scripts/lstbin_simple.py b/scripts/lstbin_simple.py index d809635c5..3cb2b526a 100644 --- a/scripts/lstbin_simple.py +++ b/scripts/lstbin_simple.py @@ -7,11 +7,8 @@ from hera_cal import lstbin_simple as lstbin import sys -import glob import json from hera_cal._cli_tools import setup_logger, parse_args, filter_kwargs, run_with_profiling -import logging -import importlib from pathlib import Path a = lstbin.lst_bin_arg_parser() @@ -52,15 +49,24 @@ else: calfile_rules = None +inprules = kwargs.pop("calfile_rules") +if inprules is not None: + inpaint_rules = [(inprules[i], inprules[i+1]) for i in range(len(inprules)//2)] +else: + inpaint_rules = None + kwargs['save_channels'] = tuple(int(ch) for ch in kwargs['save_channels'].split(',')) kwargs['golden_lsts'] = tuple(float(lst) for lst in kwargs['golden_lsts'].split(',')) +kwargs['output_flagged'] = not kwargs.pop('no_flagged_mode') +kwargs['output_inpainted'] = kwargs.pop("do_inpaint_mode") run_with_profiling( lstbin.lst_bin_files, args, config_file=kwargs.pop('configfile'), calfile_rules=calfile_rules, + where_inpainted_file_rules = inpaint_rules, write_kwargs=write_kwargs, **kwargs ) From b24b706463df0efec01010ea888d5a146f218907 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Thu, 17 Aug 2023 09:00:37 -0700 Subject: [PATCH 02/81] 
fix: adress Josh's comments --- hera_cal/datacontainer.py | 20 +++++++++++++++++--- hera_cal/lstbin.py | 4 ++-- hera_cal/lstbin_simple.py | 30 ++++++++++++++++++++++++------ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/hera_cal/datacontainer.py b/hera_cal/datacontainer.py index eb70a8147..9902c7293 100644 --- a/hera_cal/datacontainer.py +++ b/hera_cal/datacontainer.py @@ -6,6 +6,7 @@ import numpy as np from collections import OrderedDict as odict import copy +import warnings from typing import Sequence from .utils import conj_pol, comply_pol, make_bl, comply_bl, reverse_bl @@ -515,7 +516,7 @@ def select_or_expand_times(self, new_times, in_place=True, skip_bda_check=False) if not in_place: return dc - def select_or_expand_freqs( + def select_freqs( self, freqs: np.ndarray | None = None, channels: np.ndarray | slice | None = None, @@ -523,6 +524,9 @@ def select_or_expand_freqs( ): """Update the object with a subset of frequencies (which may be repeated). + While typically this will be used to down-select frequencies, one can + 'expand' the frequencies by duplicating channels. + Parameters ---------- freqs : np.ndarray, optional @@ -532,6 +536,12 @@ def select_or_expand_freqs( Only one of freqs or channels can be given. in_place : bool, optional If True, modify the object in place. Otherwise, return a modified copy. + Even if `in_place` is True, the object is still returned for convenience. + + Returns + ------- + DataContainer + The modified object. If `in_place` is True, this is the same object. """ obj = self if in_place else copy.deepcopy(self) if freqs is None and channels is None: @@ -540,11 +550,15 @@ def select_or_expand_freqs( raise ValueError('Cannot specify both freqs and channels.') if freqs is not None: - if not np.all([fq in dc.freqs for fq in freqs]): + if not np.all([fq in obj.freqs for fq in freqs]): raise ValueError('All freqs must be in self.freqs.') channels = np.searchsorted(obj.freqs, freqs) - axis = obj[next(iter(dc.keys()))].shape.index(len(obj.freqs)) + if obj.freqs is None: + warnings.warn("It is impossible to automatically detect which axis is frequency. Trying last axis.") + axis = -1 + else: + axis = obj[next(iter(obj.keys()))].shape.index(len(obj.freqs)) for bl in obj: obj[bl] = obj[bl].take(channels, axis=axis) diff --git a/hera_cal/lstbin.py b/hera_cal/lstbin.py index df855fc7f..b4e99e69c 100644 --- a/hera_cal/lstbin.py +++ b/hera_cal/lstbin.py @@ -1076,8 +1076,8 @@ def sigma_clip(array: np.ndarray | np.ma.MaskedArray, sigma: float=4.0, min_N: i return np.zeros_like(array, dtype=bool) if isinstance(array, np.ma.MaskedArray): - location = np.expand_dims(np.median(array, axis=axis), axis=axis) - scale = np.expand_dims(np.median(np.abs(array - location), axis=axis) * 1.482579, axis=axis) + location = np.expand_dims(np.ma.median(array, axis=axis), axis=axis) + scale = np.expand_dims(np.ma.median(np.abs(array - location), axis=axis) * 1.482579, axis=axis) else: # get robust location location = np.expand_dims(np.nanmedian(array, axis=axis), axis=axis) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 21a7e3914..9e1782db2 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -458,7 +458,11 @@ def lst_average( # Now do sigma-clipping. 
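    # (sigma_clip, as updated above to handle masked arrays, flags points where
    # |x - median| exceeds sigma * 1.482579 * MAD, i.e. the MAD rescaled to a
    # Gaussian standard deviation.)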
if sigma_clip_thresh is not None: if inpainted_mode: - warnings.warn("Sigma-clipping in in-painted mode is a bad idea.") + warnings.warn( + "Sigma-clipping in in-painted mode is a bad idea, because it creates " + "non-uniform flags over frequency, which can cause artificial spectral " + "structure. In-painted mode specifically attempts to avoid this." + ) nflags = np.sum(flags) clip_flags = sigma_clip(data.real, sigma=sigma_clip_thresh, min_N = sigma_clip_min_N) @@ -480,17 +484,19 @@ def lst_average( # Update the "non_inpainted" mask non_inpainted = data.mask - nsamples[non_inpainted] = 0 - # Here we do a check to make sure Nsamples is uniform across frequency. + # Do this before setting non_inpainted to zero nsamples. ndiff = np.diff(nsamples, axis=2) if np.any(ndiff != 0): warnings.warn("Nsamples is not uniform across frequency. This will result in spectral structure.") + nsamples[non_inpainted] = 0 + norm = np.sum(nsamples, axis=0) ndays_binned = np.sum((~flags).astype(int), axis=0) logger.info("Calculating mean") + # np.sum works the same for both masked and non-masked arrays. meandata = np.sum(data * nsamples, axis=0) lstbin_flagged = np.all(flags, axis=0) @@ -499,6 +505,7 @@ def lst_average( normalizable = norm > 0 meandata[normalizable] /= norm[normalizable] + # Multiply by nan instead of just setting as nan, so both real and imag parts are nan meandata[~normalizable] *= np.nan # get other stats @@ -748,7 +755,7 @@ def lst_bin_files_for_baselines( # We need to down-selecton times/freqs (bls and pols will be sub-selected # based on those in the data through the next loop) inpainted.select_or_expand_times(times=tarr, skip_bda_check=True) - inpainted.select_or_expand_freqs(channels=freq_chans) + inpainted.select_freqs(channels=freq_chans) else: inpainted = None @@ -977,6 +984,7 @@ def lst_bin_files_single_outfile( output_inpainted: bool | None = None, output_flagged: bool = True, where_inpainted_file_rules: list[tuple[str, str]] | None = None, + sigma_clip_in_inpainted_mode: bool = False, ) -> dict[str, Path]: """ Bin data files into LST bins, and write all bins to disk in a single file. @@ -1118,7 +1126,13 @@ def lst_bin_files_single_outfile( of the data object). If not provided, but `output_inpainted` is set to True, all data-flags will be considered in-painted except for baseline-times that are fully flagged, which will be completely ignored. - + sigma_clip_in_inpainted_mode + If True, sigma clip the data in inpainted mode (if sigma-clipping is turned on). + This is generally not a good idea, since the point of inpainting is to get + smoother spectra, and sigma-clipping creates non-uniform Nsamples, which can + lead to less smooth spectra. This option is only here to enable sigma-clipping + to be turned on for flagged mode, and off for inpainted mode. 
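+        As an illustrative (not prescriptive) setting: passing
+        `sigma_clip_thresh=4.0` while leaving this False clips the
+        flagged-mode output but leaves the in-painted output un-clipped.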
+ Returns ------- out_files @@ -1417,7 +1431,10 @@ def lst_bin_files_single_outfile( where_inpainted=where_inpainted, inpainted_mode=inpainted, flag_thresh=flag_thresh, - sigma_clip_thresh = sigma_clip_thresh, + sigma_clip_thresh = ( + None if inpainted and not sigma_clip_in_inpainted_mode + else sigma_clip_thresh + ), sigma_clip_min_N = sigma_clip_min_N, flag_below_min_N=flag_below_min_N, ) @@ -2037,4 +2054,5 @@ def lst_bin_arg_parser(): a.add_argument("--no-flagged-mode", action='store_true', help="turn off output of flagged mode LST-binning") a.add_argument("--do-inpaint-mode", action='store_true', default=None, help="turn on inpainting mode LST-binning") a.add_argument("--where-inpainted-file-rules", nargs='*', type=str, help="rules to convert datafile names to where-inpainted-file names. A series of two strings where the first will be replaced by the latter") + a.add_argument('--sigma-clip-in-inpainted-mode', action='store_true', default=False, help='allow sigma-clipping in inpainted mode') return a From fe935d2d5ae6da642c015c3cd4e651712617d867 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Thu, 17 Aug 2023 13:18:53 -0700 Subject: [PATCH 03/81] feat: ability to write out median/mad files --- hera_cal/lstbin_simple.py | 191 ++++++++++++++++++--------- hera_cal/tests/test_lstbin_simple.py | 97 ++++++++------ 2 files changed, 187 insertions(+), 101 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 9e1782db2..97ad93f68 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -267,6 +267,7 @@ def reduce_lst_bins( sigma_clip_min_N: int = 4, flag_below_min_N: bool = False, flag_thresh: float = 0.7, + get_mad: bool = False, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ From a list of LST-binned data, produce reduced statistics. @@ -331,6 +332,10 @@ def reduce_lst_bins( out_nsamples = np.zeros(out_data.shape, dtype=float) days_binned = np.zeros(out_nsamples.shape, dtype=int) + if get_mad: + mad = np.zeros(out_data.shape, dtype=complex) + med = np.zeros(out_data.shape, dtype=complex) + if where_inpainted is None: where_inpainted = [None] * nlst_bins @@ -339,7 +344,12 @@ def reduce_lst_bins( # TODO: check that this doesn't make yet another copy... # This is just the data in this particular lst-bin. 
+            if d.size:
+                d, f = get_masked_data(d, n, f, inpainted=inpf, inpainted_mode=inpainted_mode)
+                f = threshold_flags(f, inplace=True, flag_thresh=flag_thresh)
+                d.mask |= f
+
             (
                 out_data[:, lstbin],
                 out_flags[:, lstbin],
@@ -348,22 +358,36 @@ def reduce_lst_bins(
                 days_binned[:, lstbin]
             ) = lst_average(
                 d, n, f,
-                inpainted=inpf,
                 inpainted_mode=inpainted_mode,
-                mutable=mutable,
-                flag_thresh=flag_thresh,
                 sigma_clip_thresh=sigma_clip_thresh,
                 sigma_clip_min_N=sigma_clip_min_N,
                 flag_below_min_N=flag_below_min_N,
             )
+
+            if get_mad:
+                med[:, lstbin], mad[:, lstbin] = get_lst_mad(d)
         else:
             out_data[:, lstbin] *= np.nan
             out_flags[:, lstbin] = True
             out_std[:, lstbin] *= np.inf
             out_nsamples[:, lstbin] = 0.0
+            if get_mad:
+                mad[:, lstbin] *= np.inf
+                med[:, lstbin] *= np.nan
 
-    return out_data, out_flags, out_std, out_nsamples, days_binned
+    out = {
+        "data": out_data,
+        "flags": out_flags,
+        "std": out_std,
+        "nsamples": out_nsamples,
+        "days_binned": days_binned,
+    }
+    if get_mad:
+        out["mad"] = mad
+        out["median"] = med
+
+    return out
 
 def _allocate_dfn(shape: tuple[int], d=0.0, f=0, n=0):
     data = np.full(shape, d, dtype=complex)
@@ -371,13 +395,65 @@ def _allocate_dfn(shape: tuple[int], d=0.0, f=0, n=0):
     nsamples = np.full(shape, n, dtype=float)
     return data, flags, nsamples
 
-@profile
-def lst_average(
+def get_masked_data(
     data: np.ndarray,
     nsamples: np.ndarray,
     flags: np.ndarray,
     inpainted: np.ndarray | None = None,
     inpainted_mode: bool = False,
+) -> tuple[np.ma.MaskedArray, np.ndarray]:
+    if not inpainted_mode:
+        # Act like nothing is inpainted.
+        inpainted = np.zeros(flags.shape, dtype=bool)
+    elif inpainted is None:
+        # Flag if a whole blt is flagged:
+        allf = np.all(flags, axis=2)[:,:, None, :]
+
+        # Assume everything else that's flagged is inpainted.
+        inpainted = flags.copy() * (~allf)
+
+    flags = flags | np.isnan(data) | np.isinf(data) | (nsamples == 0)
+    data = np.ma.masked_array(
+        data,
+        mask=(flags & ~inpainted)
+    )
+    return data, flags
+
+
+def get_lst_mad(
+    data: np.ndarray | np.ma.MaskedArray,
+):
+    """Compute the median and median absolute deviation of a set of data over its zeroth axis.
+
+    Flagged data will be ignored in flagged mode, but included in inpainted mode.
+    Nsamples is not taken into account at all, unless Nsamples=0.
+    """
+    fnc = np.ma.median if isinstance(data, np.ma.MaskedArray) else np.median
+    med = fnc(data, axis=0)
+    madrl = fnc(np.abs(data.real - med.real), axis=0)*1.482579
+    madim = fnc(np.abs(data.imag - med.imag), axis=0)*1.482579
+    return med, madrl + madim*1j
+
+
+def threshold_flags(
+    flags: np.ndarray,
+    inplace: bool = False,
     flag_thresh: float = 0.7,
-    mutable: bool = False,
+):
+    """Flag whole (blt, pol, channel) cells whose flagged fraction over nights exceeds flag_thresh."""
+    if not inplace:
+        flags = flags.copy()
+
+    # Flag entire LST bins if there are too many flags over time
+    flag_frac = np.sum(flags, axis=0) / flags.shape[0]
+    nflags = np.sum(flags)
+    logger.info(f"Percent of data flagged before thresholding: {100*nflags/flags.size:.2f}%")
+    flags |= flag_frac > flag_thresh
+    logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to flag_frac > {flag_thresh}")
+
+    return flags
+
+@profile
+def lst_average(
+    data: np.ndarray | np.ma.MaskedArray, nsamples: np.ndarray, flags: np.ndarray,
+    inpainted_mode: bool = False,
     sigma_clip_thresh: float | None= None,
     sigma_clip_min_N: int=4,
     flag_below_min_N: bool = False,
@@ -422,38 +498,12 @@ def lst_average(
 
     # data has shape (nnights, nbl, npols, nfreqs)
    # all data is assumed to be in the same LST bin.
- assert data.shape == nsamples.shape == flags.shape - - if not mutable: - flags = flags.copy() - nsamples = nsamples.copy() - data = data.copy() - - if not inpainted_mode: - # Act like nothing is inpainted. - inpainted = np.zeros(flags.shape, dtype=bool) - elif inpainted is None: - # Flag if a whole blt is flagged: - allf = np.all(flags, axis=2)[:,:, None, :] - - # Assume everything else that's flagged is inpainted. - inpainted = flags.copy() * (~allf) - - # These potential extra flags are certainly not inpainted. - flags[np.isnan(data) | np.isinf(data) | (nsamples == 0)] = True - - # Flag entire LST bins if there are too many flags over time - flag_frac = np.sum(flags, axis=0) / flags.shape[0] - nflags = np.sum(flags) - logger.info(f"Percent of data flagged before thresholding: {100*nflags/flags.size:.2f}%") - flags |= flag_frac > flag_thresh - logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to flag_frac > {flag_thresh}") - - non_inpainted = flags & ~inpainted - if np.any(non_inpainted): - # Use a masked array instead of a nan-filled array. This is very slightly - # faster than nanmean, and allows us more flexibility. - data = np.ma.masked_array(data, mask=non_inpainted) + if not isinstance(data, np.ma.MaskedArray): + # Generally, we want a MaskedArray for 'data', where the mask is *either* + # the flags or the 'non-inpainted flags', as obtained by `threshold_flags`. + # However, if this hasn't been called, and we just have an array, apply flags + # appropriately here. + data, flags = get_masked_data(data, nsamples, flags, inpainted_mode=inpainted_mode) # Now do sigma-clipping. if sigma_clip_thresh is not None: @@ -474,23 +524,17 @@ def lst_average( flags |= clip_flags - if isinstance(data, np.ma.MaskedArray): - data.mask |= clip_flags - else: - data = np.ma.masked_array(data, mask=clip_flags) + data.mask |= clip_flags logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to sigma clipping") - # Update the "non_inpainted" mask - non_inpainted = data.mask - # Here we do a check to make sure Nsamples is uniform across frequency. # Do this before setting non_inpainted to zero nsamples. ndiff = np.diff(nsamples, axis=2) if np.any(ndiff != 0): warnings.warn("Nsamples is not uniform across frequency. This will result in spectral structure.") - nsamples[non_inpainted] = 0 + nsamples = np.ma.masked_array(nsamples, mask=data.mask) norm = np.sum(nsamples, axis=0) ndays_binned = np.sum((~flags).astype(int), axis=0) @@ -522,13 +566,10 @@ def lst_average( # While the previous norm is correct for normalizing the mean, we now # calculate nsamples as the unflagged samples in each LST bin. - nsamples = np.sum(nsamples * (~flags).astype(int), axis=0) - - # Get the standard array out of the masked array. - if isinstance(meandata, np.ma.MaskedArray): - meandata = meandata.data - - return meandata, lstbin_flagged, std, nsamples, ndays_binned + nsamples.mask = flags + nsamples = np.sum(nsamples, axis=0) + + return meandata.data, lstbin_flagged, std.data, nsamples.data, ndays_binned def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: """ @@ -985,6 +1026,7 @@ def lst_bin_files_single_outfile( output_flagged: bool = True, where_inpainted_file_rules: list[tuple[str, str]] | None = None, sigma_clip_in_inpainted_mode: bool = False, + write_med_mad: bool = False, ) -> dict[str, Path]: """ Bin data files into LST bins, and write all bins to disk in a single file. 
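As a minimal sketch of the masked-array averaging pattern that the reworked lst_average above relies on (the shapes, seed and variable names here are illustrative assumptions, not the patch's exact code): flagged-but-inpainted samples still contribute to the mean, while non-inpainted flags are masked out entirely.

import numpy as np

rng = np.random.default_rng(0)
nnights, nbl, nfreq, npol = 4, 2, 8, 1          # assumed axis order
data = rng.normal(size=(nnights, nbl, nfreq, npol)) + 0j
nsamples = np.ones(data.shape)
flags = np.zeros(data.shape, dtype=bool)
flags[0, 0, 3, 0] = True          # flagged but in-painted: kept in the mean
flags[1, 0, 5, 0] = True          # flagged and NOT in-painted: masked out
inpainted = flags.copy()
inpainted[1, 0, 5, 0] = False

# Only flags that were NOT in-painted are masked out of the average.
mask = flags & ~inpainted
mdata = np.ma.masked_array(data, mask=mask)

# Weighted mean over nights; masked samples contribute nothing.
norm = np.sum(np.where(mask, 0.0, nsamples), axis=0)
mean = np.sum(mdata * nsamples, axis=0).filled(0)
good = norm > 0
mean[good] /= norm[good]
mean[~good] *= np.nan    # NaN (not zero) where no sample survived

# A bin is only flagged if every night was flagged.
lst_flags = np.all(flags, axis=0)

The design choice this illustrates is that in-painted mode keeps the per-frequency sample count uniform, which is what keeps the binned spectra smooth.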
@@ -1132,7 +1174,9 @@ def lst_bin_files_single_outfile(
         smoother spectra, and sigma-clipping creates non-uniform Nsamples, which can
         lead to less smooth spectra. This option is only here to enable sigma-clipping
         to be turned on for flagged mode, and off for inpainted mode.
-
+    write_med_mad
+        If True, write out the median and MAD of the data in each LST bin.
+
     Returns
     -------
     out_files
@@ -1342,7 +1386,10 @@ def lst_bin_files_single_outfile(
         if not inpaint_mode and not output_flagged:
             continue
 
-        for kind in ['LST', 'STD']:
+        kinds = ['LST', 'STD']
+        if write_med_mad:
+            kinds += ['MED', 'MAD']
+        for kind in kinds:
             # Create the files we'll write to
             out_files[(kind, inpaint_mode)] = create_outfile(
                 kind = kind,
@@ -1426,7 +1473,7 @@ def lst_bin_files_single_outfile(
             if not inpainted and not output_flagged:
                 continue
 
-            _data, _flags, std, _nsamples, days_binned = reduce_lst_bins(
+            rdc = reduce_lst_bins(
                 data, flags, nsamples,
                 where_inpainted=where_inpainted,
                 inpainted_mode=inpainted,
@@ -1437,23 +1484,40 @@ def lst_bin_files_single_outfile(
                 ),
                 sigma_clip_min_N = sigma_clip_min_N,
                 flag_below_min_N=flag_below_min_N,
+                get_mad=write_med_mad,
             )
 
             write_baseline_slc_to_file(
                 fl=out_files[('LST', inpainted)],
                 slc=slc,
-                data=_data,
-                flags=_flags,
-                nsamples=_nsamples,
+                data=rdc['data'],
+                flags=rdc['flags'],
+                nsamples=rdc['nsamples'],
             )
             write_baseline_slc_to_file(
                 fl=out_files[('STD', inpainted)],
                 slc=slc,
-                data=std,
-                flags=_flags,
-                nsamples=days_binned,
+                data=rdc['std'],
+                flags=rdc['flags'],
+                nsamples=rdc['days_binned'],
             )
+            if write_med_mad:
+                write_baseline_slc_to_file(
+                    fl=out_files[('MED', inpainted)],
+                    slc=slc,
+                    data=rdc['median'],
+                    flags=rdc['flags'],
+                    nsamples=rdc['nsamples'],
+                )
+                write_baseline_slc_to_file(
+                    fl=out_files[('MAD', inpainted)],
+                    slc=slc,
+                    data=rdc['mad'],
+                    flags=rdc['flags'],
+                    nsamples=rdc['days_binned'],
+                )
+
             nbls_so_far += len(bl_chunk)
 
     return out_files
@@ -1958,6 +2022,7 @@ def make_lst_bin_config_file(
         blts_are_rectangular=blts_are_rectangular,
         time_axis_faster_than_bls=time_axis_faster_than_bls,
         jd_regex=jd_regex,
+        ntimes_per_file=ntimes_per_file,
     )
 
     dlst = lst_grid[0][1] - lst_grid[0][0]
diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py
index e1ea39b4f..218e412af 100644
--- a/hera_cal/tests/test_lstbin_simple.py
+++ b/hera_cal/tests/test_lstbin_simple.py
@@ -227,14 +227,14 @@ def get_input_data(
 
     def test_one_point_per_bin(self, benchmark):
         d, f, n = self.get_input_data(ntimes=(1,))
-        dd, ff, std, nn, db = benchmark(lstbin_simple.reduce_lst_bins, d, f, n)
+        rdc = benchmark(lstbin_simple.reduce_lst_bins, d, f, n)
 
-        assert dd.shape == ff.shape == std.shape == nn.shape
+        assert rdc['data'].shape == rdc['flags'].shape == rdc['std'].shape == rdc['nsamples'].shape
 
         # reduce_data swaps the order of bls/times
-        dd = dd.swapaxes(0, 1)
-        ff = ff.swapaxes(0, 1)
-        nn = nn.swapaxes(0, 1)
+        dd = rdc['data'].swapaxes(0, 1)
+        ff = rdc['flags'].swapaxes(0, 1)
+        nn = rdc['nsamples'].swapaxes(0, 1)
 
         np.testing.assert_allclose(dd[0], d[0][0])
         assert not np.any(ff)
@@ -243,24 +243,24 @@ def test_one_point_per_bin(self, benchmark):
     def test_zerosize_bin(self):
         d, f, n = self.get_input_data(ntimes=(0, 1))
         print(d[0].shape, len(d))
-        dd, ff, std, nn, db = lstbin_simple.reduce_lst_bins(d, f, n)
+        rdc = lstbin_simple.reduce_lst_bins(d, f, n)
 
-        assert dd.shape[1] == 2  # 2 LST bins
-        assert np.all(np.isnan(dd[:, 0]))
-        assert np.all(ff[:, 0])
-        assert np.all(nn[:, 0] == 0.0)
+        assert rdc['data'].shape[1] == 2  # 2 LST bins
+        assert 
np.all(np.isnan(rdc['data'][:, 0])) + assert np.all(rdc['flags'][:, 0]) + assert np.all(rdc['nsamples'][:, 0] == 0.0) @pytest.mark.parametrize("ntimes", [(4, ), (5, 4)]) def test_multi_points_per_bin(self, ntimes, benchmark): d, f, n = self.get_input_data(ntimes=ntimes) - dd, ff, std, nn, db = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) + rdc = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) - assert dd.shape == ff.shape == std.shape == nn.shape + assert rdc['data'].shape == rdc['flags'].shape == rdc['std'].shape == rdc['nsamples'].shape - # reduce_lst_bins swaps the order of bls/times - dd = dd.swapaxes(0, 1) - ff = ff.swapaxes(0, 1) - nn = nn.swapaxes(0, 1) + # reduce_data swaps the order of bls/times + dd = rdc['data'].swapaxes(0, 1) + ff = rdc['flags'].swapaxes(0, 1) + nn = rdc['nsamples'].swapaxes(0, 1) assert not np.any(ff) for lst in range(len(ntimes)): @@ -271,19 +271,26 @@ def test_multi_points_per_bin_flagged(self): d, f, n = self.get_input_data(ntimes=(4,)) f[0][2:] = True d[0][2:] = 1000.0 - dd, ff, std, nn, db = lstbin_simple.reduce_lst_bins(d,f,n) + rdc = lstbin_simple.reduce_lst_bins(d,f,n) - assert dd.shape == ff.shape == std.shape == nn.shape + assert rdc['data'].shape == rdc['flags'].shape == rdc['std'].shape == rdc['nsamples'].shape # reduce_data swaps the order of bls/times - dd = dd.swapaxes(0, 1) - ff = ff.swapaxes(0, 1) - nn = nn.swapaxes(0, 1) + dd = rdc['data'].swapaxes(0, 1) + ff = rdc['flags'].swapaxes(0, 1) + nn = rdc['nsamples'].swapaxes(0, 1) + np.testing.assert_allclose(dd[0], d[0][0]) assert not np.any(ff) np.testing.assert_allclose(nn, 2.0) + def test_get_med_mad(self): + d, f, n = self.get_input_data(ntimes=(4,)) + rdc = lstbin_simple.reduce_lst_bins(d, f, n, get_mad=True) + + assert np.all(rdc['median'] == rdc['data']) + def test_apply_calfile_rules(tmpdir_factory): direc = tmpdir_factory.mktemp("test_apply_calfile_rules") @@ -435,7 +442,7 @@ def test_std(self, nsamples, flags): assert not np.any(flg_n) def test_inpaint_mode(self): - shape = (7,8,9, 1) # nights, bls, freqs, pols + shape = (3,2,4, 1) # nights, bls, freqs, pols _data = np.random.random(shape) + np.random.random(shape)*1j nsamples = np.ones(_data.shape, dtype=float) flags = np.zeros(_data.shape, dtype=bool) @@ -451,7 +458,7 @@ def test_inpaint_mode(self): assert np.allclose(df, di) assert np.allclose(ff, fi) - assert np.allclose(stdf, stdi) + np.testing.assert_allclose(stdf, stdi) assert np.allclose(nf, ni) assert np.allclose(dbf, dbi) @@ -491,20 +498,20 @@ def test_inpaint_mode(self): # blt is considered to NOT be inpainted by default. assert np.allclose(df, di) assert np.allclose(ff, fi) - assert np.allclose(stdf, stdi) + np.testing.assert_allclose(stdf, stdi) assert np.allclose(nf, ni) assert np.allclose(dbf, dbi) # However, if we had explicitly told the routine that the blt was inpainted, # we'd get a different result... + _d, _f = lstbin_simple.get_masked_data(_data, nsamples, flags, inpainted=np.ones_like(flags),inpainted_mode=False) df, ff, stdf, nf, dbf = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False, - inpainted=np.ones_like(flags) # Everything in-painted. + data=_d, nsamples=nsamples, flags=_f, inpainted_mode=False, ) + _d, _f = lstbin_simple.get_masked_data(_data, nsamples, flags, inpainted=np.ones_like(flags),inpainted_mode=True) di, fi, stdi, ni, dbi = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True, - inpainted=np.ones_like(flags) # Everything in-painted. 
+            data=_d, nsamples=nsamples, flags=_f, inpainted_mode=True,
         )
 
         # The LST-binned data will be different for blt=0, pol=0:
@@ -544,8 +551,10 @@ def test_flag_below_min_N(self):
         )
 
         assert np.all(flg_n)
-        assert np.all(norm_n==0)
-        assert np.all(np.isinf(std_n))
+        # just because we're flagging it, doesn't mean we need to set nsamples=0
+        # or the std to inf. We have info there, we're just choosing not to use it.
+        assert np.all(norm_n==2)
+        assert not np.any(np.isinf(std_n))
 
         # this time, only one column is flagged too much...
         # this time, there's enough samples, but too many are flagged...
@@ -557,11 +566,12 @@ def test_flag_below_min_N(self):
         )
 
         assert np.all(flg_n[0])
-        assert np.all(norm_n[0]==0)
-        assert np.all(np.isinf(std_n[0]))
+        assert np.all(norm_n[0]==2)
+        assert not np.any(np.isinf(std_n[0]))
 
+        print(np.sum(flg_n[1:]), flg_n[1:].size)
         assert not np.any(flg_n[1:])
-        assert np.all(norm_n[1:]>0)
+        assert np.all(norm_n[1:]==7)
         assert not np.any(np.isinf(std_n[1:]))
         assert not np.any(np.isnan(data_n[1:]))
 
@@ -918,10 +928,12 @@ def test_baseline_chunking(self, tmp_path_factory):
         out_files = lstbin_simple.lst_bin_files(
             config_file=cfl,
             fname_format="zen.{kind}.{lst:7.5f}.uvh5",
+            write_med_mad=True
         )
         out_files_chunked = lstbin_simple.lst_bin_files(
             config_file=cfl,
             fname_format="zen.{kind}.{lst:7.5f}.chunked.uvh5",
             Nbls_to_load=10,
+            write_med_mad=True
         )
 
         for flset, flsetc in zip(out_files, out_files_chunked):
@@ -934,6 +946,15 @@ def test_baseline_chunking(self, tmp_path_factory):
 
             assert uvdlst == uvdlstc
 
+            assert flset[('MED', False)] != flsetc[('MED', False)]
+            uvdlst = UVData()
+            uvdlst.read(flset[('MED', False)])
+
+            uvdlstc = UVData()
+            uvdlstc.read(flsetc[('MED', False)])
+
+            assert uvdlst == uvdlstc
+
     def test_compare_nontrivial_cal(
         self, tmp_path_factory
     ):
@@ -1211,9 +1232,9 @@ def test_inpaint_mode_no_flags(self, tmp_path_factory):
         """Test that using inpaint mode does nothing when there's no flags."""
         tmp = tmp_path_factory.mktemp("inpaint_no_flags")
         uvds = mockuvd.make_dataset(
-            ndays=3, nfiles=2, ntimes=2,
+            ndays=3, nfiles=1, ntimes=2,
             creator=mockuvd.create_uvd_identifiable,
-            antpairs = [(i,j) for i in range(7) for j in range(i, 7)],  # 55 antpairs
+            antpairs = [(i,j) for i in range(3) for j in range(i, 3)],  # 6 antpairs
             pols = ('xx', 'yx'),
             freqs=np.linspace(140e6, 180e6, 3),
             redundantly_averaged=True,
         )
 
         out_files = lstbin_simple.lst_bin_files(
             config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5",
-            Nbls_to_load=11, rephase=False,
-            sigma_clip_thresh=0.0,
+            rephase=False,
+            sigma_clip_thresh=None,
             sigma_clip_min_N=2,
             output_flagged=True,
             output_inpainted=True
         )
 
-        assert len(out_files) == 2
+        assert len(out_files) == 1
 
         for flset in out_files:
             flagged = UVData.from_file(flset[('LST', False)])

From d1e268d45dbb9f1f06c7068ce8ffa6be64087099 Mon Sep 17 00:00:00 2001
From: Steven Murray
Date: Fri, 18 Aug 2023 10:21:06 -0700
Subject: [PATCH 04/81] fix: add --write-med-mad option to lstbin-simple
 script

---
 hera_cal/lstbin_simple.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py
index 97ad93f68..b526696ad 100644
--- a/hera_cal/lstbin_simple.py
+++ b/hera_cal/lstbin_simple.py
@@ -268,7 +268,7 @@ def reduce_lst_bins(
     flag_below_min_N: bool = False,
     flag_thresh: float = 0.7,
     get_mad: bool = False,
-) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+) -> dict[str, np.ndarray]:
     """
     From
a list of LST-binned data, produce reduced statistics.
 
@@ -310,12 +310,17 @@ def reduce_lst_bins(
         The fraction of integrations for a particular (antpair, pol, channel) combination
         within an LST-bin that can be flagged before that combination is flagged
         in the LST-average.
+    get_mad
+        Whether to compute the median and median absolute deviation of the data in each
+        LST bin, in addition to the mean and standard deviation.
 
     Returns
     -------
-    out_data, out_flags, out_std, out_nsamples
-        The reduced data, flags, standard deviation (across days) and nsamples.
-        Shape ``(nbl, nlst_bins, nfreq, npol)``.
+    dict
+        The reduced data in a dictionary. Keys are 'data' (the lst-binned mean),
+        'nsamples', 'flags', 'days_binned' (the number of days that went into each bin),
+        'std' (standard deviation) and *optionally* 'median' and 'mad' (if `get_mad` is
+        True). All values are arrays of the same shape: ``(nbl, nlst_bins, nfreq, npol)``.
     """
     nlst_bins = len(data)
     (_, nbl, nfreq, npol) = data[0].shape
@@ -365,7 +370,7 @@ def reduce_lst_bins(
             )
 
             if get_mad:
-                med[:, lstbin], mad[:, lstbin] = get_lst_mad(d)
+                med[:, lstbin], mad[:, lstbin] = get_lst_median_and_mad(d)
         else:
             out_data[:, lstbin] *= np.nan
             out_flags[:, lstbin] = True
@@ -418,7 +423,7 @@ def get_masked_data(
     return data, flags
 
 
-def get_lst_mad(
+def get_lst_median_and_mad(
     data: np.ndarray | np.ma.MaskedArray,
 ):
     """Compute the median and median absolute deviation of a set of data over its zeroth axis.
@@ -2120,4 +2125,5 @@ def lst_bin_arg_parser():
     a.add_argument("--do-inpaint-mode", action='store_true', default=None, help="turn on inpainting mode LST-binning")
     a.add_argument("--where-inpainted-file-rules", nargs='*', type=str, help="rules to convert datafile names to where-inpainted-file names. A series of two strings where the first will be replaced by the latter")
     a.add_argument('--sigma-clip-in-inpainted-mode', action='store_true', default=False, help='allow sigma-clipping in inpainted mode')
+    a.add_argument('--write-med-mad', action='store_true', default=False, help='option to write out MED/MAD files in addition to LST/STD files')
     return a

From e48e03518db97d18a81ec06fb3c4afdf45f8752c Mon Sep 17 00:00:00 2001
From: Josh Dillon
Date: Mon, 21 Aug 2023 18:00:13 -0700
Subject: [PATCH 05/81] move aipy to extra

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 9aaf78f23..0dca839a7 100644
--- a/setup.py
+++ b/setup.py
@@ -59,7 +59,6 @@ def package_files(package_dir, subdirectory):
         'scikit-learn',
         'hera-filters',
         "line_profiler",
-        'aipy',
         "rich",
         "jax",
         "jaxlib",
@@ -68,7 +67,8 @@ def package_files(package_dir, subdirectory):
     ],
     'extras_require': {
         "all": [
-            'optax'
+            'optax',
+            'aipy'
         ],
         'dev': [
             "pytest",

From 32fff7cff88823eeb452a834cfd34580733f7f12 Mon Sep 17 00:00:00 2001
From: Josh Dillon
Date: Mon, 21 Aug 2023 18:31:56 -0700
Subject: [PATCH 06/81] Revert "move aipy to extra"

This reverts commit e48e03518db97d18a81ec06fb3c4afdf45f8752c.
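Stepping back to the median/MAD feature introduced in the preceding patches: a minimal sketch of that reduction is below. This is an illustration only (median_and_mad and the toy data are stand-ins, not the library's get_lst_median_and_mad); note the patch takes the median of the complex array directly, whereas for clarity this sketch takes the real and imaginary medians separately. The factor 1.482579 rescales a MAD to a Gaussian-equivalent standard deviation.

import numpy as np

def median_and_mad(data, axis=0):
    # Median over nights; a complex "MAD" from the real and imaginary parts
    # separately, each scaled to robustly estimate a Gaussian sigma.
    med = np.median(data.real, axis=axis) + 1j * np.median(data.imag, axis=axis)
    mad_r = np.median(np.abs(data.real - med.real), axis=axis) * 1.482579
    mad_i = np.median(np.abs(data.imag - med.imag), axis=axis) * 1.482579
    return med, mad_r + 1j * mad_i

rng = np.random.default_rng(42)
nights = rng.normal(size=(5, 3)) + 1j * rng.normal(size=(5, 3))  # (nnights, nfreq)
med, mad = median_and_mad(nights)

Writing these alongside the mean/std (the LST/STD files) gives a robust cross-check: a night that drags the mean away from the median, or a std far above the MAD, flags an outlier population in that LST bin.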
--- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0dca839a7..9aaf78f23 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,7 @@ def package_files(package_dir, subdirectory): 'scikit-learn', 'hera-filters', "line_profiler", + 'aipy', "rich", "jax", "jaxlib", @@ -67,8 +68,7 @@ def package_files(package_dir, subdirectory): ], 'extras_require': { "all": [ - 'optax', - 'aipy' + 'optax' ], 'dev': [ "pytest", From 90b63a7b001c339a9378348aed9b428fe18c1b4a Mon Sep 17 00:00:00 2001 From: Bryna Hazelton Date: Tue, 22 Aug 2023 14:00:54 -0700 Subject: [PATCH 07/81] Fix some errors related to recent pyuvdata changes --- hera_cal/io.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index 155996e12..31c427641 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -115,7 +115,10 @@ def build_calcontainers(self): i, ip = self._antnum_indices[ant], self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] gains[(ant, pol)] = np.array(self.gain_array[i, :, :, ip].T) flags[(ant, pol)] = np.array(self.flag_array[i, :, :, ip].T) - quals[(ant, pol)] = np.array(self.quality_array[i, :, :, ip].T) + if self.quality_array is not None: + quals[(ant, pol)] = np.array(self.quality_array[i, :, :, ip].T) + else: + quals = None # build dict of total_qual if available for pol in self.pols: ip = self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] @@ -802,7 +805,7 @@ def read(self, bls=None, polarizations=None, times=None, time_range=None, lsts=N run_check_acceptability=run_check_acceptability, **kwargs) self.use_future_array_shapes() if self.filetype == 'uvfits': - self.unphase_to_drift() + self.unproject_phase() else: if not read_data: raise NotImplementedError('reading only metadata is not implemented for ' + self.filetype) @@ -2263,7 +2266,7 @@ def _write_HERAData_to_filetype(hd, outfilename, filetype_out='miriad', clobber= if filetype_out == 'miriad': hd.write_miriad(outfilename, clobber=clobber) elif filetype_out == 'uvfits': - hd.write_uvfits(outfilename, force_phase=True, spoof_nonessential=True) + hd.write_uvfits(outfilename, force_phase=True) elif filetype_out == 'uvh5': hd.write_uvh5(outfilename, clobber=clobber) else: From efeb2f2ee10a3129e60887794daa5b99c1a6ab67 Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 22 Aug 2023 17:10:07 -0700 Subject: [PATCH 08/81] fix array shape issue --- hera_cal/reflections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hera_cal/reflections.py b/hera_cal/reflections.py index a34ee5d5d..225c5de45 100644 --- a/hera_cal/reflections.py +++ b/hera_cal/reflections.py @@ -1320,7 +1320,7 @@ def L(x, amp, dly, phs, clean_data, clean_model, clean_wgts, dly_range, freqs, f else: x0.append(phs0[i]) _phs = None - x0 = np.array(x0) + x0 = np.squeeze(x0) # optimize res = minimize(L, x0, args=(_amp, _dly, _phs, clean_data[i], clean_model[i], clean_wgts[i], dly_range, freqs, fft_kwargs), method=method, tol=tol, options=dict(maxiter=maxiter)) From 467e76518dae48dd8059fad519fde9bc59766c1c Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Wed, 23 Aug 2023 10:24:45 -0700 Subject: [PATCH 09/81] fix issue where the number of iterations was unstable --- hera_cal/tests/test_redcal.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hera_cal/tests/test_redcal.py b/hera_cal/tests/test_redcal.py index 31e53a325..7cc293b5d 100644 --- a/hera_cal/tests/test_redcal.py +++ b/hera_cal/tests/test_redcal.py @@ -1890,9 +1890,11 @@ def 
test_redcal_run(self): sys.stdout = open(os.devnull, 'w') redcal_meta, hc_first, hc_omni, hd_vissol = om.redcal_run(input_data, verbose=True, ant_z_thresh=1.7, add_to_history='testing', a_priori_ex_ants_yaml=os.path.join(DATA_PATH, 'test_input', 'a_priori_flags_sample.yaml'), + flag_nchan_low=10, flag_nchan_high=10, check_every=100, check_after=100, iter0_prefix='.iter0', metrics_files=ant_metrics_file, clobber=True) hd = io.HERAData(input_data) - redcal_meta0, hc_first0, hc_omni0, hd_vissol0 = om.redcal_iteration(hd, ex_ants=[11, 50]) + redcal_meta0, hc_first0, hc_omni0, hd_vissol0 = om.redcal_iteration(hd, flag_nchan_low=10, flag_nchan_high=10, + check_every=100, check_after=100, ex_ants=[11, 50]) sys.stdout = sys.__stdout__ for prefix, hc_here, bad_ants in [('', hc_first, [11, 50, 12, 24]), ('.iter0', hc_first0, [11, 50])]: From f5f463ca62b5541194be1bcf90ca2ad11ec8fa0f Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Wed, 23 Aug 2023 11:19:32 -0700 Subject: [PATCH 10/81] guarantee exact same number of iterations in two versions of test --- hera_cal/tests/test_redcal.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hera_cal/tests/test_redcal.py b/hera_cal/tests/test_redcal.py index 7cc293b5d..72f79711d 100644 --- a/hera_cal/tests/test_redcal.py +++ b/hera_cal/tests/test_redcal.py @@ -1890,11 +1890,10 @@ def test_redcal_run(self): sys.stdout = open(os.devnull, 'w') redcal_meta, hc_first, hc_omni, hd_vissol = om.redcal_run(input_data, verbose=True, ant_z_thresh=1.7, add_to_history='testing', a_priori_ex_ants_yaml=os.path.join(DATA_PATH, 'test_input', 'a_priori_flags_sample.yaml'), - flag_nchan_low=10, flag_nchan_high=10, check_every=100, check_after=100, + flag_nchan_low=10, flag_nchan_high=10, check_after=500, iter0_prefix='.iter0', metrics_files=ant_metrics_file, clobber=True) hd = io.HERAData(input_data) - redcal_meta0, hc_first0, hc_omni0, hd_vissol0 = om.redcal_iteration(hd, flag_nchan_low=10, flag_nchan_high=10, - check_every=100, check_after=100, ex_ants=[11, 50]) + redcal_meta0, hc_first0, hc_omni0, hd_vissol0 = om.redcal_iteration(hd, flag_nchan_low=10, flag_nchan_high=10, check_after=500, ex_ants=[11, 50]) sys.stdout = sys.__stdout__ for prefix, hc_here, bad_ants in [('', hc_first, [11, 50, 12, 24]), ('.iter0', hc_first0, [11, 50])]: From 83607f9d8784f13a8af76664b7cd2f4c69cf93f9 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 23 Aug 2023 17:00:50 -0700 Subject: [PATCH 11/81] feat: add lst_branch_cut auto-detection for lst-binning --- hera_cal/lstbin_simple.py | 28 +++++++++++++++++++------- hera_cal/tests/test_utils.py | 38 +++++++++++++++++++++++++++++++++++- hera_cal/utils.py | 16 ++++++++++++++- 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index b526696ad..3d69f1bc4 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1382,6 +1382,7 @@ def lst_bin_files_single_outfile( start_jd=start_jd, freq_min=freq_min, freq_max=freq_max, + lst_branch_cut=metadata['lst_branch_cut'], **write_kwargs, ) out_files = {} @@ -1764,6 +1765,7 @@ def create_lstbin_output_file( channels: np.ndarray | list[int] | None = None, vis_units: str = "Jy", inpaint_mode: bool | None = None, + lst_branch_cut: float = 0.0, ) -> Path: outdir = Path(outdir) @@ -1772,6 +1774,9 @@ def create_lstbin_output_file( file_history = f"{history} Input files: {file_list_str}" _history = file_history + utils.history_string() + if lst < lst_branch_cut: + lst += 2*np.pi + fname = outdir / 
fname_format.format( kind=kind, lst=lst, pol=''.join(pols), inpaint_mode='.inpaint' if inpaint_mode else ('.flagged' if inpaint_mode is False else "") @@ -1801,6 +1806,7 @@ def create_lstbin_output_file( start_jd=start_jd, time_axis_faster_than_bls = True, vis_units=vis_units, + lst_branch_cut=lst_branch_cut, ) uvd_template.select(frequencies=freqs, polarizations=pols, inplace=True) # Need to set the polarization array manually because even though the select @@ -1866,12 +1872,7 @@ def config_lst_bin_files( Returns ------- lst_grid : float ndarray holding LST bin centers. - dlst : float, LST bin width of output lst_grid - file_lsts : list, contains the lst grid of each output file. Empty files are dropped. - begin_lst : float, starting lst for LST binner. If lst_start is not None, this equals lst_start. - lst_arrays : list, list of lst arrays for each file. These will have 2 pis added or subtracted - to match the range of lst_grid given lst_start - time_arrays : list, list of time arrays for each file + matched_files : list of lists of files, one list for each output file. """ logger.info("Configuring lst_grid") @@ -1959,6 +1960,7 @@ def make_lst_bin_config_file( blts_are_rectangular: bool | None = None, time_axis_faster_than_bls: bool | None = None, jd_regex: str = r"zen\.(\d+\.\d+)\.", + lst_branch_cut: float | None = None, ) -> dict[str, Any]: """Construct and write a YAML configuration file for lst-binning. @@ -2008,7 +2010,13 @@ def make_lst_bin_config_file( jd_regex : str, optional Regex to use to extract the JD from the file name. Set to None or empty to force the LST-matching to use the LSTs in the metadata within the file. - + lst_branch_cut + The LST at which to branch cut the LST grid for file writing. The JDs in the + output LST-binned files will be *lowest* at the lst_branch_cut, and all file + names will have LSTs that are higher than lst_branch_cut. If None, this will + be determined automatically by finding the largest gap in LSTs and starting + AFTER it. 
+ Returns ------- config : dict @@ -2030,6 +2038,11 @@ def make_lst_bin_config_file( ntimes_per_file=ntimes_per_file, ) + # Get the best lst_branch_cut by finding the largest gap in LSTs and starting + # AFTER it + if lst_branch_cut is None: + lst_branch_cut = utils.get_best_lst_branch_cut(np.concatenate(lst_grid)) + dlst = lst_grid[0][1] - lst_grid[0][0] # Make it a real list of floats to make the YAML easier to read lst_grid = [[float(l) for l in lsts] for lsts in lst_grid] @@ -2069,6 +2082,7 @@ def get_meta(): 'time_axis_faster_than_bls': meta.time_axis_faster_than_bls, 'start_jd': int(meta.times[0]), 'integration_time': float(tint), + 'lst_branch_cut': lst_branch_cut } } diff --git a/hera_cal/tests/test_utils.py b/hera_cal/tests/test_utils.py index 89e8b21a3..33e16d259 100644 --- a/hera_cal/tests/test_utils.py +++ b/hera_cal/tests/test_utils.py @@ -1069,4 +1069,40 @@ def test_one_file_per_lstbin(self, tmpdir): assert 0 < len(match) <= 2 # Ensure all files are matched - assert len(files_matched) == len(fls) \ No newline at end of file + assert len(files_matched) == len(fls) + + +class Test_LSTBranchCut: + def test_simple_ascending(self): + lsts = np.linspace(0, 1.0, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert best == 0 + + def test_simple_descending(self): + lsts = np.linspace(1.0, 0, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert best == 0 + + def test_simple_ascending_with_offset(self): + lsts = np.linspace(0.1, 1.1, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert np.isclose(best, 0.1) + + def test_wrap_around_2pi(self): + lsts = np.linspace(3*np.pi/2, 5*np.pi/2, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert np.isclose(best, 3*np.pi/2) + + def test_wrap_around_2pi_starting_low(self): + lsts0 = np.linspace(0, np.pi/2) + lsts1 = np.linspace(3*np.pi/2, 2*np.pi) + lsts = np.concatenate([lsts0, lsts1]) + best = utils.get_best_lst_branch_cut(lsts) + assert np.isclose(best, 3*np.pi/2) + + def test_with_crazy_periods(self): + lsts = np.linspace(0, 1.0, 100) + n = np.random.random_integers(10, size=100) + lsts += n*2*np.pi + best = utils.get_best_lst_branch_cut(lsts) + assert best == 0 \ No newline at end of file diff --git a/hera_cal/utils.py b/hera_cal/utils.py index 59ba43c34..1d1cf4001 100644 --- a/hera_cal/utils.py +++ b/hera_cal/utils.py @@ -563,7 +563,7 @@ def LST2JD(LST, start_jd, allow_other_jd=False, lst_branch_cut=0.0, latitude=-30 LSTs to correspond to previous or subsequent days if necessary lst_branch_cut : type=float, LST that must fall during start_jd even when allow_other_jd - is True. Used as the starting point starting point where LSTs below this + is True. 
Used as the starting point where LSTs below this value map to later JDs than this LST [radians] latitude : type=float, degrees North of observer, default=HERA latitude @@ -1778,6 +1778,20 @@ def chunk_baselines_by_redundant_groups(reds, max_chunk_size): baseline_chunks.append(grp) return baseline_chunks +def get_best_lst_branch_cut(lsts): + # Sort them + lsts = np.sort(lsts) % (2*np.pi) + + # wrap lstg around in case the biggest gap is right at the start + lsts = np.concatenate((lsts, [lsts[0]])) + + diff_lst = np.diff(lsts) + while np.any(diff_lst < 0): + lsts[np.where(diff_lst < 0)[0] + 1] += 2*np.pi + diff_lst = np.diff(lsts) + idxmax = np.argmax(diff_lst) + 1 + return lsts[idxmax] % (2*np.pi) + def select_spw_ranges(inputfilename, outputfilename, spw_ranges=None, clobber=False): """Utility function for selecting spw_ranges and writing out From 371a76108f2743dd2de569a650086c9c3f22a387 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 23 Aug 2023 17:34:38 -0700 Subject: [PATCH 12/81] fix: make sure lst_branch_cut is float --- hera_cal/lstbin_simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 3d69f1bc4..906254f9f 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -2041,7 +2041,7 @@ def make_lst_bin_config_file( # Get the best lst_branch_cut by finding the largest gap in LSTs and starting # AFTER it if lst_branch_cut is None: - lst_branch_cut = utils.get_best_lst_branch_cut(np.concatenate(lst_grid)) + lst_branch_cut = float(utils.get_best_lst_branch_cut(np.concatenate(lst_grid))) dlst = lst_grid[0][1] - lst_grid[0][0] # Make it a real list of floats to make the YAML easier to read From dab4f8239447fcd763342f44b47d73bde87e3d19 Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Wed, 23 Aug 2023 20:04:14 -0700 Subject: [PATCH 13/81] fix unit test and some pep8 --- hera_cal/tests/test_utils.py | 75 +++++++++++++++++++----------------- hera_cal/utils.py | 48 ++++++++++++----------- 2 files changed, 65 insertions(+), 58 deletions(-) diff --git a/hera_cal/tests/test_utils.py b/hera_cal/tests/test_utils.py index 33e16d259..67ad1e441 100644 --- a/hera_cal/tests/test_utils.py +++ b/hera_cal/tests/test_utils.py @@ -26,6 +26,7 @@ from . 
import mock_uvdata as mockuvd from pathlib import Path + class Test_Pol_Ops(object): def test_comply_pol(self): assert utils.comply_pol('XX') == 'xx' @@ -425,12 +426,12 @@ def test_lst_rephase_vectorized(): for i, antpair in enumerate(antpairs): for j, pol in enumerate(pols): data_drift[:, i, :, j] = data[antpair + (pol,)].copy() - + # basic test: single dlst for all integrations - _data = utils.lst_rephase(data_drift, blvec, freqs, dlst, lat=0.0, inplace=False) - + _data = utils.lst_rephase(data_drift, blvec, freqs, dlst, lat=0.0, inplace=False) + # check error at transit - phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime+1, ibl, ifreq, ipol]) + phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime + 1, ibl, ifreq, ipol]) assert np.isclose(phs_err, 0, atol=1e-7) # check error across file phs_err = np.angle(_data[:-1, ibl, ifreq, ipol] / data_drift[1:, ibl, ifreq, ipol]) @@ -440,7 +441,7 @@ def test_lst_rephase_vectorized(): dlst = np.array([np.median(np.diff(lsts))] * data.shape[0]) _data = utils.lst_rephase(data_drift, blvec, freqs, dlst, lat=0.0, inplace=False) # check error at transit - phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime+1, ibl, ifreq, ipol]) + phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime + 1, ibl, ifreq, ipol]) assert np.isclose(phs_err, 0, atol=1e-7) # check err across file phs_err = np.angle(_data[:-1, ibl, ifreq, ipol] / data_drift[1:, ibl, ifreq, ipol]) @@ -456,6 +457,7 @@ def test_lst_rephase_vectorized(): phs_err = np.angle(_data[:, ibl, ifreq, ipol] / data_drift[50, ibl, ifreq, ipol]) assert np.abs(phs_err).max() < 1e-4 + def test_lst_rephase(): # load point source sim w/ array at latitude = 0 fname = os.path.join(DATA_PATH, "PAPER_point_source_sim.uv") @@ -926,14 +928,15 @@ def test_select_spw_ranges_run_script_code(tmpdir): def tmpdir(tmp_path_factory): return tmp_path_factory.mktemp('test_match_files_to_lst_bins') + class TestMatchFilesToLSTBins: def make_empty_day(self, nfiles: int, **kwargs): - # We make the internal data really small here, since we care only about the + # We make the internal data really small here, since we care only about the # time metadata return mockuvd.make_day( nfiles=nfiles, ants=[0, 1, 2], - antpairs=[(0,0), (0,1)], + antpairs=[(0, 0), (0, 1)], freqs=np.array([155e6]), creator=mockuvd.create_mock_hera_obs, pols=('xx',), @@ -945,7 +948,7 @@ def make_empty_day(self, nfiles: int, **kwargs): @pytest.mark.parametrize('time_axis_faster_than_bls', [True, False, None]) @pytest.mark.parametrize('jd_regex', (r"zen\.(\d+\.\d+)\.", None)) def test_simple_zero2pi( - self, + self, files_sorted: bool, blts_are_rectangular: bool, time_axis_faster_than_bls: bool, @@ -956,14 +959,14 @@ def test_simple_zero2pi( which should match all files. This makes it easy to test different input parameter combinations. 
""" - + uvds = self.make_empty_day(nfiles=10, time_axis_faster_than_bls=bool(time_axis_faster_than_bls)) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) # Ensure that using lst-bin edges of (0, 2pi) works fine out_fls = utils.match_files_to_lst_bins( - lst_edges=(0, 2*np.pi), - file_list = fls, + lst_edges=(0, 2 * np.pi), + file_list=fls, files_sorted=files_sorted, blts_are_rectangular=blts_are_rectangular, jd_regex=jd_regex, @@ -979,13 +982,13 @@ def test_regex_vs_readtimes(self, tmpdir: Path): """ uvds = self.make_empty_day(nfiles=10) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) - + # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. - lst_edges=(uvds[0].lst_array.min(), (uvds[5].lst_array.min() +uvds[5].lst_array.max())/2 ) + lst_edges = (uvds[0].lst_array.min(), (uvds[5].lst_array.min() + uvds[5].lst_array.max()) / 2) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 @@ -993,7 +996,7 @@ def test_regex_vs_readtimes(self, tmpdir: Path): out_fls_meta = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, jd_regex=None, ) @@ -1011,16 +1014,16 @@ def test_lst_bins_crossing_2pi(self, tmpdir: Path): total_dlst = dlst * nfiles * 2 uvds = self.make_empty_day( - nfiles=nfiles, lst_start=2*np.pi - total_dlst/2, integration_time=tint, + nfiles=nfiles, lst_start=2 * np.pi - total_dlst / 2, integration_time=tint, ) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. - lst_edges=(3*np.pi/2, np.pi/2) + lst_edges = (3 * np.pi / 2, np.pi / 2) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 @@ -1028,38 +1031,38 @@ def test_lst_bins_crossing_2pi(self, tmpdir: Path): # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. - lst_edges=(3*np.pi/2, 2*np.pi) + lst_edges = (3 * np.pi / 2, 2 * np.pi) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 - assert len(out_fls[0]) == nfiles//2 + assert len(out_fls[0]) == nfiles // 2 # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. 
- lst_edges=(0, np.pi/2) + lst_edges = (0, np.pi / 2) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 - assert len(out_fls[0]) == nfiles//2 - + assert len(out_fls[0]) == nfiles // 2 + def test_one_file_per_lstbin(self, tmpdir): uvds = self.make_empty_day(nfiles=10, jd_start=0.2) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) - - lst_edges=[ + + lst_edges = [ uvd.lst_array.min() for uvd in uvds ] lst_edges.append(uvds[-1].lst_array.max()) - + out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == len(lst_edges) - 1 @@ -1089,20 +1092,20 @@ def test_simple_ascending_with_offset(self): assert np.isclose(best, 0.1) def test_wrap_around_2pi(self): - lsts = np.linspace(3*np.pi/2, 5*np.pi/2, 100) + lsts = np.linspace(3 * np.pi / 2, 5 * np.pi / 2, 100) best = utils.get_best_lst_branch_cut(lsts) - assert np.isclose(best, 3*np.pi/2) + assert np.isclose(best, 3 * np.pi / 2) def test_wrap_around_2pi_starting_low(self): - lsts0 = np.linspace(0, np.pi/2) - lsts1 = np.linspace(3*np.pi/2, 2*np.pi) + lsts0 = np.linspace(0, np.pi / 2) + lsts1 = np.linspace(3 * np.pi / 2, 2 * np.pi) lsts = np.concatenate([lsts0, lsts1]) best = utils.get_best_lst_branch_cut(lsts) - assert np.isclose(best, 3*np.pi/2) + assert np.isclose(best, 3 * np.pi / 2) def test_with_crazy_periods(self): lsts = np.linspace(0, 1.0, 100) n = np.random.random_integers(10, size=100) - lsts += n*2*np.pi + lsts += n * 2 * np.pi best = utils.get_best_lst_branch_cut(lsts) - assert best == 0 \ No newline at end of file + assert best == 0 diff --git a/hera_cal/utils.py b/hera_cal/utils.py index 1d1cf4001..21873c038 100644 --- a/hera_cal/utils.py +++ b/hera_cal/utils.py @@ -790,8 +790,8 @@ def lst_rephase( bls: dict[tp.Baseline, np.ndarray] | np.ndarray, freqs: np.ndarray, dlst: float | list | np.ndarray, - lat: float =-30.721526120689507, - inplace: bool=True, + lat=-30.721526120689507, + inplace=True, array=False ): """ @@ -820,7 +820,7 @@ def lst_rephase( if True edit arrays in data in memory, else make a copy and return array Deprecated parameter that specifies that the input data is an array - with shape (ntimes, nfreqs, [npols]). Don't write code with this + with shape (ntimes, nfreqs, [npols]). Don't write code with this parameter -- instead just set the baseline axis of data to be length-1. Notes: @@ -901,7 +901,7 @@ def lst_rephase( if array: warnings.warn( "the 'array' parameter to lst_rephase is deprecated. 
Please provide data with second-axis of Nbls", - category=DeprecationWarning, + category=DeprecationWarning, ) data = data[:, None] @@ -933,7 +933,7 @@ def lst_rephase( if not inplace: if orig_type in (dict, OrderedDict, DataContainer): - return orig_type({k: data[:,i] for i,k in enumerate(keys)}) + return orig_type({k: data[:, i] for i, k in enumerate(keys)}) else: if array: return data[:, 0] @@ -1465,6 +1465,7 @@ def red_average(data, reds=None, bl_tol=1.0, inplace=False, else: return data + def match_files_to_lst_bins( lst_edges: tuple[float], file_list: Sequence[str | Path], @@ -1554,12 +1555,12 @@ def match_files_to_lst_bins( if len(lst_edges) < 2: raise ValueError("lst_edges must have at least 2 elements.") lst0 = lst_edges[0] - lst_edges=np.array([(lst - lst0)%(2*np.pi) + lst0 for lst in lst_edges]) + lst_edges = np.array([(lst - lst0) % (2 * np.pi) + lst0 for lst in lst_edges]) # We can have the case that an edges is at lst0 + 2pi exactly, which would # get wrapped around to lst0, but it should stay at lst0+2pi. - lst_edges[1:][lst_edges[1:] == lst_edges[0]] += 2*np.pi + lst_edges[1:][lst_edges[1:] == lst_edges[0]] += 2 * np.pi - if np.any(np.diff(lst_edges) < 0 ): + if np.any(np.diff(lst_edges) < 0): raise ValueError("lst_edges must not extend beyond 2pi total radians from start to finish.") lstmin, lstmax = lst_edges[0], lst_edges[-1] @@ -1597,10 +1598,12 @@ def match_files_to_lst_bins( if jd_regex is not None: jd_regex = re.compile(jd_regex) + @cache def get_first_time(path: Path) -> float: return float(jd_regex.findall(path.name)[0]) else: + @cache def get_first_time(path: Path) -> float: meta = _metas[path] @@ -1609,7 +1612,7 @@ def get_first_time(path: Path) -> float: def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: jdstart, jdend = jd_range - jdstart_for_file = jdstart - tint*(ntimes_per_file-1) - atol + jdstart_for_file = jdstart - tint * (ntimes_per_file - 1) - atol # The lst-edges might wrap around within a day. If so, when comparing times # to the bins, we need to use an OR instead of an AND. @@ -1630,22 +1633,21 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: break diff = jdstart_for_file - thistime - in_range = cmp(jdstart_for_file <= thistime, thistime <= jdend+atol) + in_range = cmp(jdstart_for_file <= thistime, thistime <= jdend + atol) if np.abs(diff) < best_diff and in_range: # This file has a time that is in our range, and is closest so far # to the start of the range. best_diff = np.abs(diff) best_index = index - - move_nfiles = round(diff / (tint*ntimes_per_file)) + move_nfiles = round(diff / (tint * ntimes_per_file)) if move_nfiles == 0: if in_range: break else: # Move by one file in the right direction, just to be sure. - move_nfiles = int(1*np.sign(diff)) + move_nfiles = int(1 * np.sign(diff)) if diff > 0: minidx = max(minidx, index) @@ -1654,7 +1656,7 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: index += move_nfiles if not (minidx < index < maxidx): - index = min(maxidx-1, max(minidx+1, (minidx + maxidx) // 2)) + index = min(maxidx - 1, max(minidx + 1, (minidx + maxidx) // 2)) if index < 0: index = 0 elif index >= len(meta_list): @@ -1674,10 +1676,10 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: # that out later. 
first_file = get_first_file_in_range(metadata_list, (jdstart, jdend)) except ValueError: - return [[] for _ in range(len(lst_edges)-1)] + return [[] for _ in range(len(lst_edges) - 1)] # Now, we have to actually go through the bins in lst_edges and assign files. - lstbin_files = [[] for _ in range(len(lst_edges)-1)] + lstbin_files = [[] for _ in range(len(lst_edges) - 1)] _meta_list = metadata_list[first_file:] + metadata_list[:first_file] # roll the files for i, fl in enumerate(_meta_list): @@ -1686,7 +1688,7 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: for lstbin, (jdstart, jdend) in enumerate(zip(jd_edges[:-1], jd_edges[1:])): # Go through each lst bin cmp = operator.and_ if (jdstart < jdend) else operator.or_ - if cmp(jdstart - (ntimes_per_file-1)*tint - atol <= thistime, thistime < jdend + atol): + if cmp(jdstart - (ntimes_per_file - 1) * tint - atol <= thistime, thistime < jdend + atol): lstbin_files[lstbin].append(fl) fl_in_range = True @@ -1695,6 +1697,7 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: return lstbin_files + def eq2top_m(ha, dec): """Return the 3x3 matrix converting equatorial coordinates to topocentric at the given hour angle (ha) and declination (dec). @@ -1778,19 +1781,20 @@ def chunk_baselines_by_redundant_groups(reds, max_chunk_size): baseline_chunks.append(grp) return baseline_chunks -def get_best_lst_branch_cut(lsts): + +def get_best_lst_branch_cut(lsts_in): # Sort them - lsts = np.sort(lsts) % (2*np.pi) - + lsts = np.sort(lsts_in % (2 * np.pi)) + # wrap lstg around in case the biggest gap is right at the start lsts = np.concatenate((lsts, [lsts[0]])) diff_lst = np.diff(lsts) while np.any(diff_lst < 0): - lsts[np.where(diff_lst < 0)[0] + 1] += 2*np.pi + lsts[np.where(diff_lst < 0)[0] + 1] += 2 * np.pi diff_lst = np.diff(lsts) idxmax = np.argmax(diff_lst) + 1 - return lsts[idxmax] % (2*np.pi) + return lsts[idxmax] % (2 * np.pi) def select_spw_ranges(inputfilename, outputfilename, spw_ranges=None, clobber=False): From 9285f86c86301a25f963a836920dfc7b416aa2ef Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Mon, 28 Aug 2023 15:55:07 -0700 Subject: [PATCH 14/81] fix bug in script also pep8 --- scripts/lstbin_simple.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/lstbin_simple.py b/scripts/lstbin_simple.py index 3cb2b526a..b10cf0817 100644 --- a/scripts/lstbin_simple.py +++ b/scripts/lstbin_simple.py @@ -45,28 +45,27 @@ # Turn a list into a list of 2-tuples. 
 crules = kwargs.pop("calfile_rules")
 if crules is not None:
-    calfile_rules = [(crules[i], crules[i+1]) for i in range(len(crules)//2)]
+    calfile_rules = [(crules[i], crules[i + 1]) for i in range(len(crules) // 2)]
 else:
     calfile_rules = None
 
-inprules = kwargs.pop("calfile_rules")
+inprules = kwargs.pop("where-inpainted-file-rules")
 if inprules is not None:
-    inpaint_rules = [(inprules[i], inprules[i+1]) for i in range(len(inprules)//2)]
+    inpaint_rules = [(inprules[i], inprules[i + 1]) for i in range(len(inprules) // 2)]
 else:
     inpaint_rules = None
 
-
 kwargs['save_channels'] = tuple(int(ch) for ch in kwargs['save_channels'].split(','))
 kwargs['golden_lsts'] = tuple(float(lst) for lst in kwargs['golden_lsts'].split(','))
 kwargs['output_flagged'] = not kwargs.pop('no_flagged_mode')
 kwargs['output_inpainted'] = kwargs.pop("do_inpaint_mode")
 
 run_with_profiling(
-    lstbin.lst_bin_files, 
+    lstbin.lst_bin_files,
     args,
     config_file=kwargs.pop('configfile'),
-    calfile_rules=calfile_rules, 
-    where_inpainted_file_rules = inpaint_rules,
-    write_kwargs=write_kwargs, 
+    calfile_rules=calfile_rules,
+    where_inpainted_file_rules=inpaint_rules,
+    write_kwargs=write_kwargs,
     **kwargs
 )

From 67e43839f6055a6d76b9974f187c8c3f68aaa1fd Mon Sep 17 00:00:00 2001
From: Josh Dillon
Date: Mon, 28 Aug 2023 16:03:20 -0700
Subject: [PATCH 15/81] try underscores

---
 scripts/lstbin_simple.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/lstbin_simple.py b/scripts/lstbin_simple.py
index b10cf0817..83c754bd2 100644
--- a/scripts/lstbin_simple.py
+++ b/scripts/lstbin_simple.py
@@ -49,7 +49,7 @@
 else:
     calfile_rules = None
 
-inprules = kwargs.pop("where-inpainted-file-rules")
+inprules = kwargs.pop("where_inpainted_file_rules")
 if inprules is not None:
     inpaint_rules = [(inprules[i], inprules[i + 1]) for i in range(len(inprules) // 2)]
 else:

From 7a07eaf8b25c50432c05c01f3dce75403cc3eb2f Mon Sep 17 00:00:00 2001
From: Josh Dillon
Date: Mon, 28 Aug 2023 16:14:56 -0700
Subject: [PATCH 16/81] bug fix... enumerate wasn't being used

---
 hera_cal/lstbin_simple.py | 82 +++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py
index 906254f9f..f43de06c3 100644
--- a/hera_cal/lstbin_simple.py
+++ b/hera_cal/lstbin_simple.py
@@ -311,13 +311,13 @@ def reduce_lst_bins(
         within an LST-bin that can be flagged before that combination is flagged
         in the LST-average.
     get_mad
-        Whether to compute the median and median absolute deviation of the data in each 
+        Whether to compute the median and median absolute deviation of the data in each
         LST bin, in addition to the mean and standard deviation.
 
     Returns
     -------
     dict
-        The reduced data in a dictionary. Keys are 'data' (the lst-binned mean), 
+        The reduced data in a dictionary. Keys are 'data' (the lst-binned mean),
         'nsamples', 'flags', 'days_binned' (the number of days that went into each bin),
         'std' (standard deviation) and *optionally* 'median' and 'mad' (if `get_mad` is
         True). All values are arrays of the same shape: ``(nbl, nlst_bins, nfreq, npol)``.
     """
     nlst_bins = len(data)
     (_, nbl, nfreq, npol) = data[0].shape
@@ -349,7 +349,7 @@ def reduce_lst_bins(
 
         # TODO: check that this doesn't make yet another copy...
        # This is just the data in this particular lst-bin.
- + if d.size: d, f = get_masked_data(d, n, f, inpainted=inpf, inpainted_mode=inpainted_mode) f = threshold_flags(f, inplace=True, flag_thresh=flag_thresh) @@ -362,7 +362,7 @@ def reduce_lst_bins( out_nsamples[:, lstbin], days_binned[:, lstbin] ) = lst_average( - d, n, f, + d, n, f, inpainted_mode=inpainted_mode, sigma_clip_thresh=sigma_clip_thresh, sigma_clip_min_N=sigma_clip_min_N, @@ -411,23 +411,23 @@ def get_masked_data( elif inpainted is None: # Flag if a whole blt is flagged: allf = np.all(flags, axis=2)[:,:, None, :] - + # Assume everything else that's flagged is inpainted. inpainted = flags.copy() * (~allf) flags = flags | np.isnan(data) | np.isinf(data) | (nsamples == 0) data = np.ma.masked_array( - data, + data, mask=(flags & ~inpainted) ) return data, flags def get_lst_median_and_mad( - data: np.ndarray | np.ma.MaskedArray, + data: np.ndarray | np.ma.MaskedArray, ): """Compute the median absolute deviation of a set of data over its zeroth axis. - + Flagged data will be ignored in flagged mode, but included in inpainted mode. Nsamples is not taken into account at all, unless Nsamples=0. """ @@ -568,12 +568,12 @@ def lst_average( std[~normalizable] = np.inf - - # While the previous norm is correct for normalizing the mean, we now + + # While the previous norm is correct for normalizing the mean, we now # calculate nsamples as the unflagged samples in each LST bin. nsamples.mask = flags nsamples = np.sum(nsamples, axis=0) - + return meandata.data, lstbin_flagged, std.data, nsamples.data, ndays_binned def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: @@ -671,7 +671,7 @@ def lst_bin_files_for_baselines( all frequencies will be included. where_inpainted_files A list of lists of strings, one for each file, where each file is a UVFlag file - specifying which data are in-painted. If not provided, no inpainting will be + specifying which data are in-painted. If not provided, no inpainting will be assumed. Returns @@ -797,7 +797,7 @@ def lst_bin_files_for_baselines( inpainted = io.load_flags(inpfile) if not isinstance(inpainted, DataContainer): raise ValueError(f"Expected {inpfile} to be a DataContainer") - + # We need to down-selecton times/freqs (bls and pols will be sub-selected # based on those in the data through the next loop) inpainted.select_or_expand_times(times=tarr, skip_bda_check=True) @@ -968,14 +968,14 @@ def filter_required_files_by_times( return tinds, time_arrays, all_lsts, file_list, cals, where_inpainted def _get_where_inpainted_files( - data_files: list[list[str | Path]], + data_files: list[list[str | Path]], where_inpainted_file_rules: list[tuple[str, str]] | None ) -> list[list[str | Path]] | None: if where_inpainted_file_rules is None: return None - + where_inpainted_files = [] - for dflist in enumerate(data_files): + for dflist in data_files: this = [] where_inpainted_files.append(this) for df in dflist: @@ -992,7 +992,7 @@ def _configure_inpainted_mode(output_flagged, output_inpainted, where_inpainted_ # Sort out defaults for inpaint/flagging mode if output_inpainted is None: output_inpainted = bool(where_inpainted_files) - + if not output_inpainted and not output_flagged: raise ValueError("Both output_inpainted and output_flagged are False. One must be True.") @@ -1052,7 +1052,7 @@ def lst_bin_files_single_outfile( use *both* modes by default. .. note:: Both "flagged" and "inpainted" mode as implemented in this function are - *not* spectrally smooth if the input Nsamples are not spectrally uniform. 
+ *not* spectrally smooth if the input Nsamples are not spectrally uniform. Parameters ---------- @@ -1157,24 +1157,24 @@ def lst_bin_files_single_outfile( output_inpainted If True, output data LST-binned in in-painted mode. This mode does *not* flag data for the averaging, assuming that data that has flags has been in-painted - to improve spectral smoothness. It does take the flags into account for the + to improve spectral smoothness. It does take the flags into account for the LST-binned Nsamples, however. output_flagged If True, output data LST-binned in flagged mode. This mode *does* apply flags to the data before averaging. It will yield the same Nsamples as the in-painted mode, but simply ignores flagged data for the average, which can yield less - spectrally-smooth LST-binned results. + spectrally-smooth LST-binned results. where_inpainted_file_rules - Rules to transform the input data file names into the corresponding "where + Rules to transform the input data file names into the corresponding "where inpainted" files (which should be in UVFlag format). If provided, this indicates - that the data itself is in-painted, and the `output_inpainted` mode will be + that the data itself is in-painted, and the `output_inpainted` mode will be switched on by default. These files should specify which data is in-painted in the associated data file (which may be different than the in-situ flags of the data object). If not provided, but `output_inpainted` is set to True, all data-flags will be considered in-painted except for baseline-times that are fully flagged, which will be completely ignored. sigma_clip_in_inpainted_mode - If True, sigma clip the data in inpainted mode (if sigma-clipping is turned on). + If True, sigma clip the data in inpainted mode (if sigma-clipping is turned on). This is generally not a good idea, since the point of inpainting is to get smoother spectra, and sigma-clipping creates non-uniform Nsamples, which can lead to less smooth spectra. This option is only here to enable sigma-clipping @@ -1196,42 +1196,42 @@ def lst_bin_files_single_outfile( the data in flagged bl-channel-pols is used for the averaging, as it is considered to be in-painted. This gives the most spectrally-smooth results. In order to ignore a particular bl-channel-pol while still using inpaint mode, supply a "where inpainted" - file, which should be a UVFlag object that specifies which bl-channel-pols are - inpainted in the associated data file. Anything that's flagged but not inpainted is + file, which should be a UVFlag object that specifies which bl-channel-pols are + inpainted in the associated data file. Anything that's flagged but not inpainted is ignored for the averaging. In this inpainted mode, the Nsamples are the number of un-flagged integrations (whether they were in-painted or not). The LST-binned flags are only set to True if ALL of the nights for a given bl-channel-pol are flagged (again, whether they were in-painted or not). In flagged mode, both the Nsamples and Flags are the same as inpainted mode. The averaged data, however, ignores any - flagged data. This can lead to less spectrally-smooth results. The "STD" files + flagged data. This can lead to less spectrally-smooth results. The "STD" files contain LST-binned "data" that is the standard deviation of the data in each LST-bin. 
This differs between inpainted and flagged modes in the same way as the "LST" files:
    in inpainted mode, the flagged and inpainted data is used for calculating the sample
    variance, while in flagged mode, only the unflagged data is used. The final flags
-    in the "STD" files are equivalent to those in the "LST" files. The Nsamples in the 
+    in the "STD" files are equivalent to those in the "LST" files. The Nsamples in the
    "STD" files is actually the number of unflagged nights in the LST bin (so, not the
    sum of Nsamples), where "unflagged" really does mean unflagged -- whether inpainted
-    or not. 
+    or not.

    One note here about what is considered "flagged" vs. "flagged and inpainted" vs
    "flagged and not inpainted". In flagged mode, there are input flags that exist in
    the input files. These are potentially *augmented* by sigma clipping within the LST
    binner, and also by flagging whole LST bins if they have too few unflagged integrations.
-    In inpainted mode, input flags are considered as normal flags. However, only 
+    In inpainted mode, input flags are considered as normal flags. However, only
    "non-inpainted" flags are *ignored* for the averaging. By default, all flagged data
    is considered to be in-painted UNLESS it is a blt-pol that is fully flagged (i.e.
    all channels are flagged for an integration for a single bl and pol). However, you can
    tell the routine that other data is NOT in-painted by supplying a "where inpainted"
-    file. Now, integrations in LST-bins that end up having "too few" unflagged 
+    file. Now, integrations in LST-bins that end up having "too few" unflagged
    integrations will be flagged inside the binner; however, in inpainted mode, if these
    are considered "inpainted", they will still be used in averaging (this means they
-    will have "valid" data for the average, but their average will be flagged). 
-    On the other hand, flags that are applied by sigma-clipping will be considered 
-    NOT inpainted, i.e. those data will be ignored in the average, just like in flagged 
+    will have "valid" data for the average, but their average will be flagged).
+    On the other hand, flags that are applied by sigma-clipping will be considered
+    NOT inpainted, i.e. those data will be ignored in the average, just like in flagged
    mode. In this case, either choice is bad: to include them in the average is bad
    because even though they may have been actually in-painted, whatever value they have
    is clearly triggering the sigma-clipper and is therefore an outlier. On the other
-    hand, to ignore them is bad because the point of in-painting mode is to get 
+    hand, to ignore them is bad because the point of in-painting mode is to get
    smoother spectra, and this negates that. So, it's best just to not do sigma-clipping
    in inpainted mode.
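
    Examples
    --------
    A minimal sketch of driving this function by hand from a config produced by
    :func:`make_lst_bin_config_file` (the config path and the where-inpainted rule
    below are hypothetical)::

        import yaml
        with open("lstbin.yaml") as fl:
            cfg = yaml.safe_load(fl)

        out = lst_bin_files_single_outfile(
            config_opts=cfg["config_params"],
            metadata=cfg["metadata"],
            lst_bins=cfg["lst_grid"][0],
            data_files=cfg["matched_files"][0],
            where_inpainted_file_rules=[(".uvh5", ".where_inpainted.h5")],
        )

    :func:`lst_bin_files` performs essentially this loop over every output file in
    the config.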
""" @@ -1252,7 +1252,7 @@ def lst_bin_files_single_outfile( output_inpainted, output_flagged, where_inpainted_files ) - + # Prune empty nights (some nights start with files, but have files removed because # they have no associated calibration) data_files = [df for df in data_files if df] @@ -1768,7 +1768,7 @@ def create_lstbin_output_file( lst_branch_cut: float = 0.0, ) -> Path: outdir = Path(outdir) - + # update history file_list_str = "-".join(ff.path.name for ff in file_list) file_history = f"{history} Input files: {file_list_str}" @@ -1778,7 +1778,7 @@ def create_lstbin_output_file( lst += 2*np.pi fname = outdir / fname_format.format( - kind=kind, lst=lst, pol=''.join(pols), + kind=kind, lst=lst, pol=''.join(pols), inpaint_mode='.inpaint' if inpaint_mode else ('.flagged' if inpaint_mode is False else "") ) @@ -2011,12 +2011,12 @@ def make_lst_bin_config_file( Regex to use to extract the JD from the file name. Set to None or empty to force the LST-matching to use the LSTs in the metadata within the file. lst_branch_cut - The LST at which to branch cut the LST grid for file writing. The JDs in the + The LST at which to branch cut the LST grid for file writing. The JDs in the output LST-binned files will be *lowest* at the lst_branch_cut, and all file names will have LSTs that are higher than lst_branch_cut. If None, this will be determined automatically by finding the largest gap in LSTs and starting AFTER it. - + Returns ------- config : dict @@ -2038,11 +2038,11 @@ def make_lst_bin_config_file( ntimes_per_file=ntimes_per_file, ) - # Get the best lst_branch_cut by finding the largest gap in LSTs and starting + # Get the best lst_branch_cut by finding the largest gap in LSTs and starting # AFTER it if lst_branch_cut is None: lst_branch_cut = float(utils.get_best_lst_branch_cut(np.concatenate(lst_grid))) - + dlst = lst_grid[0][1] - lst_grid[0][0] # Make it a real list of floats to make the YAML easier to read lst_grid = [[float(l) for l in lsts] for lsts in lst_grid] From 2757b8f3b6ac6efe18474d1fc780a7da4aa040dd Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Mon, 28 Aug 2023 16:46:08 -0700 Subject: [PATCH 17/81] remove . 
from inpaint_mode --- hera_cal/lstbin_simple.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index f43de06c3..b7f886cdc 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1757,7 +1757,7 @@ def create_lstbin_output_file( times: np.ndarray | None = None, lsts: np.ndarray | None = None, history: str = "", - fname_format: str="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5", + fname_format: str="zen.{kind}.{lst:7.5f}.{inpaint_mode}.uvh5", overwrite: bool = False, antpairs: list[tuple[int, int]] | None = None, freq_min: float | None = None, @@ -1779,7 +1779,7 @@ def create_lstbin_output_file( fname = outdir / fname_format.format( kind=kind, lst=lst, pol=''.join(pols), - inpaint_mode='.inpaint' if inpaint_mode else ('.flagged' if inpaint_mode is False else "") + inpaint_mode='inpaint' if inpaint_mode else ('flagged' if inpaint_mode is False else "") ) logger.info(f"Initializing {fname}") From 3cbe6164c994dccbd46981aff05f3bf60373269e Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Mon, 4 Sep 2023 03:13:07 -0700 Subject: [PATCH 18/81] test: select_freqs on datacontainer --- hera_cal/datacontainer.py | 74 ++++++++++++++-------------- hera_cal/tests/test_datacontainer.py | 66 +++++++++++++++++++++---- 2 files changed, 93 insertions(+), 47 deletions(-) diff --git a/hera_cal/datacontainer.py b/hera_cal/datacontainer.py index 9902c7293..32a35fedb 100644 --- a/hera_cal/datacontainer.py +++ b/hera_cal/datacontainer.py @@ -12,6 +12,7 @@ from .utils import conj_pol, comply_pol, make_bl, comply_bl, reverse_bl from .red_groups import RedundantGroups, Baseline, AntPair + class DataContainer: """Dictionary-like object that abstracts away the pol/ant pair ordering of data dictionaries and the the polarization case (i.e. 'nn' vs. 'NN'). Keys are in @@ -70,8 +71,7 @@ def __init__(self, data): setattr(self, attr, getattr(data, attr)) else: setattr(self, attr, None) - - + @property def dtype(self): """The dtype of the underlying data.""" @@ -83,7 +83,6 @@ def dtype(self): else: return None - def antpairs(self, pol=None): '''Return a set of antenna pairs (with a specific pol or more generally).''' if pol is None: @@ -187,7 +186,7 @@ def __delitem__(self, key): @property def shape(self) -> tuple[int]: return self[next(iter(self.keys()))].shape - + def concatenate(self, D, axis=0): '''Concatenates D, a DataContainer or a list of DCs, with self along an axis''' # check type of D @@ -517,14 +516,14 @@ def select_or_expand_times(self, new_times, in_place=True, skip_bda_check=False) return dc def select_freqs( - self, - freqs: np.ndarray | None = None, - channels: np.ndarray | slice | None = None, - in_place: bool=True + self, + freqs: np.ndarray | None = None, + channels: np.ndarray | slice | None = None, + in_place: bool = True ): """Update the object with a subset of frequencies (which may be repeated). - - While typically this will be used to down-select frequencies, one can + + While typically this will be used to down-select frequencies, one can 'expand' the frequencies by duplicating channels. Parameters @@ -532,7 +531,7 @@ def select_freqs( freqs : np.ndarray, optional Frequencies to select. If given, all frequencies must be in the datacontainer. channels : np.ndarray, slice, optional - Channels to select. If given, all channels must be in the datacontainer. + Channels to select. If given, all channels must be in the datacontainer. Only one of freqs or channels can be given. 
in_place : bool, optional If True, modify the object in place. Otherwise, return a modified copy. @@ -548,12 +547,15 @@ def select_freqs( return obj elif freqs is not None and channels is not None: raise ValueError('Cannot specify both freqs and channels.') - + if freqs is not None: + if obj.freqs is None: + raise ValueError('Cannot select frequencies if self.freqs is None.') + if not np.all([fq in obj.freqs for fq in freqs]): raise ValueError('All freqs must be in self.freqs.') channels = np.searchsorted(obj.freqs, freqs) - + if obj.freqs is None: warnings.warn("It is impossible to automatically detect which axis is frequency. Trying last axis.") axis = -1 @@ -563,21 +565,23 @@ def select_freqs( obj[bl] = obj[bl].take(channels, axis=axis) # update metadata - obj.freqs = obj.freqs[channels] + if obj.freqs is not None: + obj.freqs = obj.freqs[channels] return obj + class RedDataContainer(DataContainer): '''Structure for containing redundant visibilities that can be accessed by any one of the redundant baseline keys (or their conjugate).''' def __init__( - self, - data: DataContainer | dict[Baseline, np.ndarray], - reds: RedundantGroups | Sequence[Sequence[Baseline | AntPair]] | None=None, - antpos: dict[int, np.ndarray] | None=None, - bl_error_tol: float=1.0 - ): + self, + data: DataContainer | dict[Baseline, np.ndarray], + reds: RedundantGroups | Sequence[Sequence[Baseline | AntPair]] | None = None, + antpos: dict[int, np.ndarray] | None = None, + bl_error_tol: float = 1.0 + ): '''Creates a RedDataContainer. Parameters @@ -585,7 +589,7 @@ def __init__( data : DataContainer or dictionary of visibilities, just as one would pass into DataContainer(). Will error if multiple baselines are part of the same redundant group. reds : :class:`RedundantGroups` object, or list of lists of redundant baseline tuples, e.g. (ind1, ind2, pol). - These are the redundant groups of baselines. If not provided, will try to + These are the redundant groups of baselines. If not provided, will try to infer them from antpos. antpos: dictionary of antenna positions in the form {ant_index: np.array([x, y, z])}. Will error if one tries to provide both reds and antpos. If neither is provided, @@ -597,10 +601,10 @@ def __init__( Attributes ---------- - reds - A :class:`RedundantGroups` object that contains the redundant groups for - the entire array, and methods to manipulate them. - + reds + A :class:`RedundantGroups` object that contains the redundant groups for + the entire array, and methods to manipulate them. + ''' if reds is not None and antpos is not None: raise ValueError('Can only provide reds or antpos, not both.') @@ -620,25 +624,24 @@ def __init__( ) else: raise ValueError('Must provide reds, antpos, or have antpos available at data.antpos') - + if not isinstance(reds, RedundantGroups): reds = RedundantGroups(red_list=reds, antpos=self.antpos) self.build_red_keys(reds) - def build_red_keys(self, reds: RedundantGroups | list[list[Baseline]]): '''Build the dictionaries that map baselines to redundant keys. Arguments: reds: list of lists of redundant baseline tuples, e.g. (ind1, ind2, pol). 
''' - + if isinstance(reds, RedundantGroups): self.reds = reds else: self.reds = RedundantGroups(red_list=reds, antpos=getattr(self, 'antpos', None)) - + self._reds_keyed_on_data = self.reds.keyed_on_bls(bls=self.bls()) # delete unused data to avoid leaking memory @@ -652,24 +655,23 @@ def build_red_keys(self, reds: RedundantGroups | list[list[Baseline]]): raise ValueError( 'RedDataContainer can only be constructed with (at most) one baseline per group, ' f'but {bl} is redundant with {redkeys[ubl]}.' - ) + ) else: redkeys[ubl] = bl - def get_ubl_key(self, bl): '''Returns the blkey used to internally denote the data stored. - + If this bl is in a redundant group present in the data, this will return the blkey that exists in the data. Otherwise, it will return the array-wide blkey representing this group. ''' return self._reds_keyed_on_data.get_ubl_key(bl) - + def get_red(self, key): '''Returns the list of baselines in the array redundant with this key. - - Note: this is not just baselines existing in the data itself, but in the + + Note: this is not just baselines existing in the data itself, but in the entire array. ''' return self.reds[key] @@ -694,8 +696,6 @@ def __setitem__(self, key, value): super().__setitem__(ubl_key, value) - def __contains__(self, key): '''Returns true if the baseline redundant with the key is in the data.''' return (key in self.reds) and (super().__contains__(self.get_ubl_key(key))) - \ No newline at end of file diff --git a/hera_cal/tests/test_datacontainer.py b/hera_cal/tests/test_datacontainer.py index 1ec1be146..683927005 100644 --- a/hera_cal/tests/test_datacontainer.py +++ b/hera_cal/tests/test_datacontainer.py @@ -180,10 +180,10 @@ def test_del(self): del dc['bad_key'] with pytest.raises(ValueError, match='Tuple keys to delete must be in the format'): - del dc[(1,2,3,4)] - + del dc[(1, 2, 3, 4)] + with pytest.raises(ValueError, match='Tuple keys to delete must be in the format'): - del dc[[1,2,'xx']] + del dc[[1, 2, 'xx']] def test_getitem(self): dc = datacontainer.DataContainer(self.blpol) @@ -314,7 +314,7 @@ def test_dtype(self): assert dc.dtype is None dc = datacontainer.DataContainer({(1, 2): {'xx': 2}}) assert dc.dtype is None - + dc = datacontainer.DataContainer(self.blpolarr) assert dc.dtype == complex dc = datacontainer.DataContainer(self.bools) @@ -326,18 +326,19 @@ class SmallDataContainer: def __init__(self, d: dict): self._data = deepcopy(d) self.ants = set(sum(self._data.keys(), ())) - + def __getitem__(self, key): return self._data[key] - + def keys(self): return self._data.keys() - + blpol = SmallDataContainer(self.blpol) - + dc = datacontainer.DataContainer(blpol) assert dc.ants == blpol.ants + @pytest.mark.filterwarnings("ignore:The default for the `center` keyword has changed") class TestDataContainerWithRealData: @@ -467,6 +468,52 @@ def test_select_or_expand_times(self): assert np.all(dc.lsts == (np.arange(10) * 2 * np.pi / 10)[new_times]) assert np.all(dc.lsts_by_bl[0, 1] == (np.arange(10) * 2 * np.pi / 10)[new_times]) + def test_select_freqs(self): + fq = np.linspace(0, 1, 10) + dc1 = datacontainer.DataContainer({(0, 1, 'ee'): np.arange(10)}) + dc1.freqs = fq + + # Both of the following should pick all the freqs + dc2 = dc1.select_freqs(fq, in_place=False) + dc1.select_freqs(fq, in_place=True) + dc1.select_freqs() + + for dc in (dc1, dc2): + assert np.all(dc[(0, 1, 'ee')] == np.arange(10)) + assert np.all(dc.freqs == fq) + + # Now actually sub-select + fq2 = fq[:9] + dc2 = dc1.select_freqs(fq2, in_place=False) + dc1.select_freqs(fq2, 
in_place=True) + + for dc in (dc1, dc2): + assert np.all(dc[(0, 1, 'ee')] == np.arange(9)) + assert np.all(dc.freqs == fq2) + + # Sub-select by channel + chans = np.arange(8) + dc2 = dc1.select_freqs(channels=chans, in_place=False) + dc1.select_freqs(channels=chans, in_place=True) + + for dc in (dc1, dc2): + assert np.all(dc[(0, 1, 'ee')] == np.arange(8)) + assert np.all(dc.freqs == fq[chans]) + + # Error if both freqs and chans given + with pytest.raises(ValueError, match='Cannot specify both freqs and channels'): + dc1.select_freqs(fq, channels=chans) + + # Ensure warning is raised without freqs + dc1.freqs = None + with pytest.warns(UserWarning, match='It is impossible'): + dc1.select_freqs(channels=chans) + + assert len(dc1[(0, 1, 'ee')]) == 8 + + with pytest.raises(ValueError, match='Cannot select frequencies'): + dc1.select_freqs(fq2) + def test_RedDataContainer(): # build hexagonal array and data which tells us which baseline it comes from @@ -486,7 +533,7 @@ def test_RedDataContainer(): # build an incomplete datacontainer, then finish it rdata6 = datacontainer.RedDataContainer(deepcopy(data), reds[:-1]) rdata6[reds[-1][0]] = deepcopy(data[reds[-1][0]]) - #rdata6.build_red_keys(reds) + # rdata6.build_red_keys(reds) # make sure that the data for a redundant group are being accessed from the same place in memory for i, rdata in enumerate([rdata1, rdata2, rdata3, rdata4, rdata5, rdata6]): @@ -555,4 +602,3 @@ def test_RedDataContainerKeyManipulation(): rdc[1, 3, 'ee'] = 21j with pytest.raises(ValueError): rdc.build_red_keys([[(0, 2, 'ee'), (1, 3, 'ee')]]) - From 9fea23c35f083d49b7b61608fe22d6ecd4c15597 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Mon, 4 Sep 2023 04:05:32 -0700 Subject: [PATCH 19/81] test: using where_inpainted_file_rules for lstbin --- hera_cal/io.py | 19 +- hera_cal/lstbin_simple.py | 761 ++++++++++++++++--------- hera_cal/tests/mock_uvdata.py | 156 +++-- hera_cal/tests/test_lstbin_simple.py | 815 ++++++++++++++++++--------- 4 files changed, 1150 insertions(+), 601 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index 3641b83bc..90230a9e6 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -460,7 +460,7 @@ def get_blt_slices(uvo, tried_to_reorder=False): if getattr(uvo, 'blts_are_rectangular', False): if uvo.time_axis_faster_than_bls: for i in range(uvo.Nbls): - start = i*uvo.Ntimes + start = i * uvo.Ntimes antp = (uvo.ant_1_array[start], uvo.ant_2_array[start]) blt_slices[antp] = slice(start, start + uvo.Ntimes, 1) assert uvo.Nbls == len(blt_slices) @@ -479,8 +479,10 @@ def get_blt_slices(uvo, tried_to_reorder=False): uvo.reorder_blts(order='time') return get_blt_slices(uvo, tried_to_reorder=True) else: - raise NotImplementedError('UVData objects with non-regular spacing of ' - 'baselines in its baseline-times are not supported.') + raise NotImplementedError( + 'UVData objects with non-regular spacing of ' + 'baselines in its baseline-times are not supported.' 
+ ) else: blt_slices[(ant1, ant2)] = slice(indices[0], indices[-1] + 1, indices[1] - indices[0]) return blt_slices @@ -797,7 +799,7 @@ def read(self, bls=None, polarizations=None, times=None, time_range=None, lsts=N locs = locals() partials = ['bls', 'polarizations', 'times', 'time_range', 'lsts', 'lst_range', 'frequencies', 'freq_chans'] self.last_read_kwargs = {p: locs[p] for p in partials} - + # if filepaths is None, this was converted to HERAData # from a different pre-loaded object with no history of filepath if self.filepaths is not None: @@ -1618,6 +1620,8 @@ def load_flags(flagfile, filetype='h5', return_meta=False): # data container only supports standard polarizations strings if np.issubdtype(uvf.polarization_array.dtype, np.signedinteger): flags = DataContainer(flags) + flags.times = times + flags.freqs = freqs elif uvf.type == 'antenna': # one time x freq waterfall per antenna for i, ant in enumerate(uvf.ant_array): @@ -2234,9 +2238,6 @@ def write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags return uvd - - - def update_uvdata(uvd, data=None, flags=None, nsamples=None, add_to_history='', **kwargs): '''Updates a UVData/HERAData object with data or parameters. Cannot modify the shape of data arrays. More than one spectral window is not supported. Assumes every baseline @@ -2719,6 +2720,7 @@ def throw_away_flagged_ants_parser(): ap.add_argument("--clobber", default=False, action="store_true", help='overwrites existing file at outfile') return ap + def uvdata_from_fastuvh5( meta: FastUVH5Meta, antpairs: list[tuple[int, int]] | None = None, @@ -2726,7 +2728,8 @@ def uvdata_from_fastuvh5( lsts: np.ndarray | None = None, start_jd: float | None = None, lst_branch_cut: float = 0.0, - **kwargs) -> UVData: + **kwargs +) -> UVData: """Convert a FastUVH5Meta object to a UVData object. This is a convenience function to convert a FastUVH5Meta object to a UVData diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index b7f886cdc..d7e00fcab 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -39,11 +39,14 @@ try: profile except NameError: + def profile(fnc): return fnc + logger = logging.getLogger(__name__) + @profile def lst_align( data: np.ndarray, @@ -139,9 +142,7 @@ def lst_align( flags = np.zeros(data.shape, dtype=bool) if flags.shape != data.shape: - raise ValueError( - f"flags should have shape {data.shape} but got {flags.shape}" - ) + raise ValueError(f"flags should have shape {data.shape} but got {flags.shape}") if nsamples is None: nsamples = np.ones(data.shape, dtype=float) @@ -158,13 +159,11 @@ def lst_align( adjust_lst_bin_edges(lst_bin_edges) if not np.all(np.diff(lst_bin_edges) > 0): - raise ValueError( - "lst_bin_edges must be monotonically increasing." - ) + raise ValueError("lst_bin_edges must be monotonically increasing.") # Now ensure that all the observed LSTs are wrapped so they start above the first bin edges grid_indices, data_lsts, lst_mask = get_lst_bins(data_lsts, lst_bin_edges) - lst_bin_centres = (lst_bin_edges[1:] + lst_bin_edges[:-1])/2 + lst_bin_centres = (lst_bin_edges[1:] + lst_bin_edges[:-1]) / 2 logger.info(f"Data Shape: {data.shape}") @@ -229,7 +228,10 @@ def lst_align( return lst_bin_centres, _data, _flags, _nsamples, _where_inpainted or None -def get_lst_bins(lsts: np.ndarray, edges: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + +def get_lst_bins( + lsts: np.ndarray, edges: np.ndarray +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Get the LST bin indices for a set of LSTs. 
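    LSTs are first wrapped into the range ``[edges[0], edges[0] + 2*pi)``, so bins
    that straddle the 2*pi boundary are handled transparently. A small sketch, with
    hypothetical values::

        bins, lsts, mask = get_lst_bins(np.array([0.1, 6.2]), np.array([0.0, 0.2, 0.4]))
        # bins -> [0, 2]; mask -> [True, False], since 6.2 rad lies past the last edge
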
Parameters @@ -252,14 +254,17 @@ def get_lst_bins(lsts: np.ndarray, edges: np.ndarray) -> tuple[np.ndarray, np.nd lsts = np.array(lsts).copy() # Now ensure that all the observed LSTs are wrapped so they start above the first bin edges - lsts %= 2*np.pi - lsts[lsts < edges[0]] += 2* np.pi + lsts %= 2 * np.pi + lsts[lsts < edges[0]] += 2 * np.pi bins = np.digitize(lsts, edges, right=True) - 1 - mask = (bins >= 0) & (bins < (len(edges)-1)) + mask = (bins >= 0) & (bins < (len(edges) - 1)) return bins, lsts, mask + def reduce_lst_bins( - data: list[np.ndarray], flags: list[np.ndarray], nsamples: list[np.ndarray], + data: list[np.ndarray], + flags: list[np.ndarray], + nsamples: list[np.ndarray], where_inpainted: list[np.ndarray] | None = None, inpainted_mode: bool = False, mutable: bool = False, @@ -344,14 +349,18 @@ def reduce_lst_bins( if where_inpainted is None: where_inpainted = [None] * nlst_bins - for lstbin, (d,n,f, inpf) in enumerate(zip(data, nsamples, flags, where_inpainted)): + for lstbin, (d, n, f, inpf) in enumerate( + zip(data, nsamples, flags, where_inpainted) + ): logger.info(f"Computing LST bin {lstbin+1} / {nlst_bins}") # TODO: check that this doesn't make yet another copy... # This is just the data in this particular lst-bin. if d.size: - d, f = get_masked_data(d, n, f, inpainted=inpf, inpainted_mode=inpainted_mode) + d, f = get_masked_data( + d, n, f, inpainted=inpf, inpainted_mode=inpainted_mode + ) f = threshold_flags(f, inplace=True, flag_thresh=flag_thresh) d.mask |= f @@ -360,9 +369,11 @@ def reduce_lst_bins( out_flags[:, lstbin], out_std[:, lstbin], out_nsamples[:, lstbin], - days_binned[:, lstbin] + days_binned[:, lstbin], ) = lst_average( - d, n, f, + d, + n, + f, inpainted_mode=inpainted_mode, sigma_clip_thresh=sigma_clip_thresh, sigma_clip_min_N=sigma_clip_min_N, @@ -394,14 +405,18 @@ def reduce_lst_bins( return out + def _allocate_dfn(shape: tuple[int], d=0.0, f=0, n=0): data = np.full(shape, d, dtype=complex) flags = np.full(shape, f, dtype=bool) nsamples = np.full(shape, n, dtype=float) return data, flags, nsamples + def get_masked_data( - data: np.ndarray, nsamples: np.ndarray, flags: np.ndarray, + data: np.ndarray, + nsamples: np.ndarray, + flags: np.ndarray, inpainted: np.ndarray | None = None, inpainted_mode: bool = False, ) -> np.ma.MaskedArray: @@ -410,16 +425,13 @@ def get_masked_data( inpainted = np.zeros(flags.shape, dtype=bool) elif inpainted is None: # Flag if a whole blt is flagged: - allf = np.all(flags, axis=2)[:,:, None, :] + allf = np.all(flags, axis=2)[:, :, None, :] # Assume everything else that's flagged is inpainted. 
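        # In boolean terms the next line is `inpainted = flags & ~allf`: channels
        # flagged within an otherwise-unflagged blt are presumed to be in-painted,
        # while fully-flagged blts are kept as genuine flags.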
inpainted = flags.copy() * (~allf) flags = flags | np.isnan(data) | np.isinf(data) | (nsamples == 0) - data = np.ma.masked_array( - data, - mask=(flags & ~inpainted) - ) + data = np.ma.masked_array(data, mask=(flags & ~inpainted)) return data, flags @@ -433,9 +445,9 @@ def get_lst_median_and_mad( """ fnc = np.ma.median if isinstance(data, np.ma.MaskedArray) else np.median med = fnc(data, axis=0) - madrl = fnc(np.abs(data.real - med.real), axis=0)*1.482579 - madim = fnc(np.abs(data.imag - med.imag), axis=0)*1.482579 - return med, madrl + madim*1j + madrl = fnc(np.abs(data.real - med.real), axis=0) * 1.482579 + madim = fnc(np.abs(data.imag - med.imag), axis=0) * 1.482579 + return med, madrl + madim * 1j def threshold_flags( @@ -449,18 +461,25 @@ def threshold_flags( # Flag entire LST bins if there are too many flags over time flag_frac = np.sum(flags, axis=0) / flags.shape[0] nflags = np.sum(flags) - logger.info(f"Percent of data flagged before thresholding: {100*nflags/flags.size:.2f}%") + logger.info( + f"Percent of data flagged before thresholding: {100*nflags/flags.size:.2f}%" + ) flags |= flag_frac > flag_thresh - logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to flag_frac > {flag_thresh}") + logger.info( + f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to flag_frac > {flag_thresh}" + ) return flags + @profile def lst_average( - data: np.ndarray | np.ma.MaskedArray, nsamples: np.ndarray, flags: np.ndarray, + data: np.ndarray | np.ma.MaskedArray, + nsamples: np.ndarray, + flags: np.ndarray, inpainted_mode: bool = False, - sigma_clip_thresh: float | None= None, - sigma_clip_min_N: int=4, + sigma_clip_thresh: float | None = None, + sigma_clip_min_N: int = 4, flag_below_min_N: bool = False, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ @@ -508,7 +527,9 @@ def lst_average( # the flags or the 'non-inpainted flags', as obtained by `threshold_flags`. # However, if this hasn't been called, and we just have an array, apply flags # appropriately here. - data, flags = get_masked_data(data, nsamples, flags, inpainted_mode=inpainted_mode) + data, flags = get_masked_data( + data, nsamples, flags, inpainted_mode=inpainted_mode + ) # Now do sigma-clipping. if sigma_clip_thresh is not None: @@ -520,8 +541,12 @@ def lst_average( ) nflags = np.sum(flags) - clip_flags = sigma_clip(data.real, sigma=sigma_clip_thresh, min_N = sigma_clip_min_N) - clip_flags |= sigma_clip(data.imag, sigma=sigma_clip_thresh, min_N = sigma_clip_min_N) + clip_flags = sigma_clip( + data.real, sigma=sigma_clip_thresh, min_N=sigma_clip_min_N + ) + clip_flags |= sigma_clip( + data.imag, sigma=sigma_clip_thresh, min_N=sigma_clip_min_N + ) # Need to restore min_N condition properly here because it's not done properly in sigma_clip sc_min_N = np.sum(~flags, axis=0) < sigma_clip_min_N @@ -531,13 +556,17 @@ def lst_average( data.mask |= clip_flags - logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to sigma clipping") + logger.info( + f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to sigma clipping" + ) # Here we do a check to make sure Nsamples is uniform across frequency. # Do this before setting non_inpainted to zero nsamples. ndiff = np.diff(nsamples, axis=2) if np.any(ndiff != 0): - warnings.warn("Nsamples is not uniform across frequency. 
This will result in spectral structure.") + warnings.warn( + "Nsamples is not uniform across frequency. This will result in spectral structure." + ) nsamples = np.ma.masked_array(nsamples, mask=data.mask) @@ -561,21 +590,23 @@ def lst_average( logger.info("Calculating std") with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="Degrees of freedom <= 0 for slice.") - std = np.square(data.real - meandata.real) + 1j*np.square(data.imag - meandata.imag) + std = np.square(data.real - meandata.real) + 1j * np.square( + data.imag - meandata.imag + ) std = np.sum(std * nsamples, axis=0) std[normalizable] /= norm[normalizable] - std = np.sqrt(std.real) + 1j*np.sqrt(std.imag) + std = np.sqrt(std.real) + 1j * np.sqrt(std.imag) std[~normalizable] = np.inf - # While the previous norm is correct for normalizing the mean, we now # calculate nsamples as the unflagged samples in each LST bin. nsamples.mask = flags - nsamples = np.sum(nsamples, axis=0) + nsamples = np.sum(nsamples, axis=0) return meandata.data, lstbin_flagged, std.data, nsamples.data, ndays_binned + def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: """ Adjust the LST bin edges so that they start in the range [0, 2pi) and increase. @@ -583,9 +614,10 @@ def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: Performs the adjustment in-place. """ while lst_bin_edges[0] < 0: - lst_bin_edges += 2*np.pi - while lst_bin_edges[0] >= 2*np.pi: - lst_bin_edges -= 2*np.pi + lst_bin_edges += 2 * np.pi + while lst_bin_edges[0] >= 2 * np.pi: + lst_bin_edges -= 2 * np.pi + def lst_bin_files_for_baselines( data_files: list[Path | FastUVH5Meta], @@ -691,7 +723,12 @@ def lst_bin_files_for_baselines( times_in_bins The JDs that are in each LST bin -- a list of arrays. """ - metas = [fl if isinstance(fl, FastUVH5Meta) else FastUVH5Meta(fl, blts_are_rectangular=True) for fl in data_files] + metas = [ + fl + if isinstance(fl, FastUVH5Meta) + else FastUVH5Meta(fl, blts_are_rectangular=True) + for fl in data_files + ] lst_bin_edges = np.array(lst_bin_edges) @@ -727,26 +764,28 @@ def lst_bin_files_for_baselines( if time_idx is None: adjust_lst_bin_edges(lst_bin_edges) - lst_bin_edges %= 2*np.pi + lst_bin_edges %= 2 * np.pi op = np.logical_and if lst_bin_edges[0] < lst_bin_edges[-1] else np.logical_or time_idx = [] for meta in metas: - _lsts = meta.get_transactional('lsts') + _lsts = meta.get_transactional("lsts") time_idx.append(op(_lsts >= lst_bin_edges[0], _lsts < lst_bin_edges[-1])) if time_arrays is None: - time_arrays = [meta.get_transactional('times')[idx] for meta, idx in zip(metas, time_idx)] + time_arrays = [ + meta.get_transactional("times")[idx] for meta, idx in zip(metas, time_idx) + ] if lsts is None: lsts = np.concatenate( - [meta.get_transactional('lsts')[idx] for meta, idx in zip(metas, time_idx)] + [meta.get_transactional("lsts")[idx] for meta, idx in zip(metas, time_idx)] ) # Now we can set up our master arrays of data. data, flags, nsamples = _allocate_dfn( (len(lsts), len(antpairs), len(freqs), len(pols)), - d=np.nan + np.nan*1j, - f=True + d=np.nan + np.nan * 1j, + f=True, ) if where_inpainted_files is None or all(w is None for w in where_inpainted_files): @@ -765,18 +804,29 @@ def lst_bin_files_for_baselines( # This loop actually reads the associated data in this LST bin. 
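    # Each pass of the loop below handles one night's file: it sub-selects the
    # requested baselines/times/frequencies, optionally applies that night's
    # calibration solutions, and copies the result into the time-slice `slc` of
    # the master (ntimes, nbl, nfreq, npol) arrays allocated above.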
ntimes_so_far = 0 - for meta, calfl, tind, tarr, inpfile in zip(metas, cal_files, time_idx, time_arrays, where_inpainted_files): + for meta, calfl, tind, tarr, inpfile in zip( + metas, cal_files, time_idx, time_arrays, where_inpainted_files + ): logger.info(f"Reading {meta.path}") - slc = slice(ntimes_so_far,ntimes_so_far+len(tarr)) + slc = slice(ntimes_so_far, ntimes_so_far + len(tarr)) ntimes_so_far += len(tarr) - #hd = io.HERAData(str(fl.path), filetype='uvh5') - data_antpairs = meta.get_transactional('antpairs') + # hd = io.HERAData(str(fl.path), filetype='uvh5') + data_antpairs = meta.get_transactional("antpairs") if redundantly_averaged: - bls_to_load = [bl for bl in data_antpairs if reds.get_ubl_key(bl) in antpairs or reds.get_ubl_key(bl[::-1]) in antpairs] + bls_to_load = [ + bl + for bl in data_antpairs + if reds.get_ubl_key(bl) in antpairs + or reds.get_ubl_key(bl[::-1]) in antpairs + ] else: - bls_to_load = [bl for bl in antpairs if bl in data_antpairs or bl[::-1] in data_antpairs] + bls_to_load = [ + bl + for bl in antpairs + if bl in data_antpairs or bl[::-1] in data_antpairs + ] if not bls_to_load or not np.any(tind): # If none of the requested baselines are in this file, then just @@ -789,7 +839,10 @@ def lst_bin_files_for_baselines( # TODO: use Fast readers here instead. _data, _flags, _nsamples = io.HERAData(meta.path).read( - bls=bls_to_load, times=tarr, freq_chans=freq_chans, polarizations=pols, + bls=bls_to_load, + times=tarr, + freq_chans=freq_chans, + polarizations=pols, ) if inpfile is not None: # This returns a DataContainer (unless something went wrong) since it should @@ -800,7 +853,7 @@ def lst_bin_files_for_baselines( # We need to down-selecton times/freqs (bls and pols will be sub-selected # based on those in the data through the next loop) - inpainted.select_or_expand_times(times=tarr, skip_bda_check=True) + inpainted.select_or_expand_times(new_times=tarr, skip_bda_check=True) inpainted.select_freqs(channels=freq_chans) else: inpainted = None @@ -820,8 +873,11 @@ def lst_bin_files_for_baselines( gains, cal_flags, _, _ = uvc.build_calcontainers() apply_cal.calibrate_in_place( - _data, gains, data_flags=_flags, cal_flags=cal_flags, - gain_convention=uvc.gain_convention + _data, + gains, + data_flags=_flags, + cal_flags=cal_flags, + gain_convention=uvc.gain_convention, ) for i, bl in enumerate(antpairs): @@ -855,8 +911,8 @@ def lst_bin_files_for_baselines( where_inpainted=where_inpainted, antpairs=antpairs, lst_bin_edges=lst_bin_edges, - freq_array = freqs, - rephase = rephase, + freq_array=freqs, + rephase=rephase, antpos=antpos, ) @@ -869,10 +925,11 @@ def lst_bin_files_for_baselines( return bin_lst, data, flags, nsamples, where_inpainted, times_in_bins + def apply_calfile_rules( data_files: list[list[str]], calfile_rules: list[tuple[str, str]], - ignore_missing: bool + ignore_missing: bool, ) -> tuple[list[list[str]], list[list[str]]]: """ Apply a set of rules to convert data file names to calibration file names. 
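
    Each rule is a ``(search, replace)`` pair applied in order as a plain string
    substitution on each data-file name. For example (hypothetical names), the
    single rule ``[(".uvh5", ".smooth_abs.calfits")]`` maps
    ``zen.2459855.12345.sum.uvh5`` to ``zen.2459855.12345.sum.smooth_abs.calfits``.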
@@ -918,16 +975,16 @@ def apply_calfile_rules( data_files[night] = [df for df in dflist if df not in missing] return data_files, input_cals + def filter_required_files_by_times( lst_range: tuple[float, float], data_metas: list[list[FastUVH5Meta]], cal_files: list[list[str]] | None = None, where_inpainted_files: list[list[str]] | None = None, ) -> tuple[list, list, list, list, list, list]: - lstmin, lstmax = lst_range - lstmin %= (2 * np.pi) - lstmax %= (2 * np.pi) + lstmin %= 2 * np.pi + lstmax %= 2 * np.pi if lstmin > lstmax: lstmax += 2 * np.pi @@ -952,7 +1009,7 @@ def filter_required_files_by_times( for night, callist, inplist in zip(data_metas, cal_files, where_inpainted_files): for meta, cal, inp in zip(night, callist, inplist): tarr = meta.times - lsts = meta.lsts % (2*np.pi) + lsts = meta.lsts % (2 * np.pi) lsts[lsts < lstmin] += 2 * np.pi tind = (lsts > lstmin) & (lsts < lstmax) @@ -967,9 +1024,10 @@ def filter_required_files_by_times( return tinds, time_arrays, all_lsts, file_list, cals, where_inpainted + def _get_where_inpainted_files( data_files: list[list[str | Path]], - where_inpainted_file_rules: list[tuple[str, str]] | None + where_inpainted_file_rules: list[tuple[str, str]] | None, ) -> list[list[str | Path]] | None: if where_inpainted_file_rules is None: return None @@ -979,8 +1037,9 @@ def _get_where_inpainted_files( this = [] where_inpainted_files.append(this) for df in dflist: - wif = df + wif = str(df) for rule in where_inpainted_file_rules: + print(rule[0], "HA", rule[1]) wif = wif.replace(rule[0], rule[1]) if os.path.exists(wif): this.append(wif) @@ -988,47 +1047,51 @@ def _get_where_inpainted_files( raise IOError(f"Where inpainted file {wif} does not exist") return where_inpainted_files + def _configure_inpainted_mode(output_flagged, output_inpainted, where_inpainted_files): # Sort out defaults for inpaint/flagging mode if output_inpainted is None: output_inpainted = bool(where_inpainted_files) if not output_inpainted and not output_flagged: - raise ValueError("Both output_inpainted and output_flagged are False. One must be True.") + raise ValueError( + "Both output_inpainted and output_flagged are False. One must be True." 
+ ) return output_flagged, output_inpainted + def lst_bin_files_single_outfile( config_opts: dict[str, Any], metadata: dict[str, Any], lst_bins: np.ndarray, data_files: list[list[str]], - calfile_rules: list[tuple[str, str]] | None= None, + calfile_rules: list[tuple[str, str]] | None = None, ignore_missing_calfiles: bool = False, outdir: str | Path | None = None, reds: RedundantGroups | None = None, redundantly_averaged: bool | None = None, only_last_file_per_night: bool = False, - history: str = '', - fname_format: str="zen.{kind}.{lst:7.5f}.uvh5", - overwrite: bool=False, - rephase: bool=False, - Nbls_to_load: int | None=None, - ignore_flags: bool=False, - include_autos: bool=True, + history: str = "", + fname_format: str = "zen.{kind}.{lst:7.5f}.uvh5", + overwrite: bool = False, + rephase: bool = False, + Nbls_to_load: int | None = None, + ignore_flags: bool = False, + include_autos: bool = True, ex_ant_yaml_files=None, - ignore_ants: tuple[int]=(), + ignore_ants: tuple[int] = (), write_kwargs: dict | None = None, save_channels: list[int] = (), golden_lsts: tuple[float] = (), - sigma_clip_thresh: float | None= None, + sigma_clip_thresh: float | None = None, sigma_clip_min_N: int = 4, flag_below_min_N: bool = False, flag_thresh: float = 0.7, freq_min: float | None = None, freq_max: float | None = None, output_inpainted: bool | None = None, - output_flagged: bool = True, + output_flagged: bool = True, where_inpainted_file_rules: list[tuple[str, str]] | None = None, sigma_clip_in_inpainted_mode: bool = False, write_med_mad: bool = False, @@ -1252,7 +1315,6 @@ def lst_bin_files_single_outfile( output_inpainted, output_flagged, where_inpainted_files ) - # Prune empty nights (some nights start with files, but have files removed because # they have no associated calibration) data_files = [df for df in data_files if df] @@ -1262,12 +1324,14 @@ def lst_bin_files_single_outfile( for dflist in data_files: logger.info(f"{dflist[0].split('/')[-1]}: {len(dflist)}") - data_metas = [[ - FastUVH5Meta( - df, - blts_are_rectangular=metadata['blts_are_rectangular'], - time_axis_faster_than_bls=metadata['time_axis_faster_than_bls'] - ) for df in dflist + data_metas = [ + [ + FastUVH5Meta( + df, + blts_are_rectangular=metadata["blts_are_rectangular"], + time_axis_faster_than_bls=metadata["time_axis_faster_than_bls"], + ) + for df in dflist ] for dflist in data_files ] @@ -1276,7 +1340,7 @@ def lst_bin_files_single_outfile( if outdir is None: outdir = os.path.dirname(os.path.commonprefix(abscal.flatten(data_files))) - start_jd = metadata['start_jd'] + start_jd = metadata["start_jd"] # get metadata logger.info("Getting metadata from first file...") @@ -1288,16 +1352,13 @@ def lst_bin_files_single_outfile( # all the antenna positions are included in every file. antpos = dict(zip(meta.antenna_numbers, meta.antpos_enu)) if reds is None: - reds = RedundantGroups.from_antpos( - antpos=antpos, - include_autos=include_autos - ) + reds = RedundantGroups.from_antpos(antpos=antpos, include_autos=include_autos) if redundantly_averaged is None: # Try to work out if the files are redundantly averaged. # just look at the middle file from each night. 
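        # Heuristic: map each antpair in the file onto its redundant-group key; if
        # two or more antpairs collapse onto the same key, the file still contains
        # individual baselines and therefore cannot already be redundantly averaged.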
for fl_list in data_metas: - meta = fl_list[len(fl_list)//2] + meta = fl_list[len(fl_list) // 2] antpairs = meta.get_transactional("antpairs") ubls = {reds.get_ubl_key(ap) for ap in antpairs} if len(ubls) != len(antpairs): @@ -1319,11 +1380,11 @@ def lst_bin_files_single_outfile( reds=reds, only_last_file_per_night=only_last_file_per_night, ) - nants0 = meta.header['antenna_numbers'].size + nants0 = meta.header["antenna_numbers"].size # Do a quick check to make sure all nights at least have the same number of Nants for dflist in data_metas: - _nants = dflist[0].header['antenna_numbers'].size + _nants = dflist[0].header["antenna_numbers"].size dflist[0].close() if _nants != nants0: raise ValueError( @@ -1336,19 +1397,29 @@ def lst_bin_files_single_outfile( if Nbls_to_load is None: Nbls_to_load = len(all_baselines) + 1 n_bl_chunks = len(all_baselines) // Nbls_to_load + 1 - bl_chunks = [all_baselines[i * Nbls_to_load:(i + 1) * Nbls_to_load] for i in range(n_bl_chunks)] + bl_chunks = [ + all_baselines[i * Nbls_to_load: (i + 1) * Nbls_to_load] + for i in range(n_bl_chunks) + ] bl_chunks = [blg for blg in bl_chunks if len(blg) > 0] - dlst = config_opts['dlst'] + dlst = config_opts["dlst"] lst_bin_edges = np.array( - [x - dlst/2 for x in lst_bins] + [lst_bins[-1] + dlst/2] + [x - dlst / 2 for x in lst_bins] + [lst_bins[-1] + dlst / 2] ) - tinds, time_arrays, all_lsts, file_list, cals, where_inpainted_files = filter_required_files_by_times( + ( + tinds, + time_arrays, + all_lsts, + file_list, + cals, + where_inpainted_files, + ) = filter_required_files_by_times( (lst_bin_edges[0], lst_bin_edges[-1]), data_metas, input_cals, - where_inpainted_files + where_inpainted_files, ) # If we have no times at all for this file, just return if len(all_lsts) == 0: @@ -1372,7 +1443,7 @@ def lst_bin_files_single_outfile( # make it a bit easier to create the outfiles create_outfile = partial( create_lstbin_output_file, - outdir = outdir, + outdir=outdir, pols=all_pols, file_list=file_list, history=history, @@ -1382,7 +1453,7 @@ def lst_bin_files_single_outfile( start_jd=start_jd, freq_min=freq_min, freq_max=freq_max, - lst_branch_cut=metadata['lst_branch_cut'], + lst_branch_cut=metadata["lst_branch_cut"], **write_kwargs, ) out_files = {} @@ -1392,16 +1463,16 @@ def lst_bin_files_single_outfile( if not inpaint_mode and not output_flagged: continue - kinds = ['LST', 'STD'] + kinds = ["LST", "STD"] if write_med_mad: - kinds += ['MED', 'MAD'] + kinds += ["MED", "MAD"] for kind in kinds: # Create the files we'll write to out_files[(kind, inpaint_mode)] = create_outfile( - kind = kind, + kind=kind, lst=lst_bin_edges[0], lsts=lst_bins, - inpaint_mode = inpaint_mode, + inpaint_mode=inpaint_mode, ) nbls_so_far = 0 @@ -1409,8 +1480,15 @@ def lst_bin_files_single_outfile( logger.info(f"Baseline Chunk {bi+1} / {len(bl_chunks)}") # data/flags/nsamples are *lists*, with nlst_bins entries, each being an # array, with shape (times, bls, freqs, npols) - bin_lst, data, flags, nsamples, where_inpainted, binned_times = lst_bin_files_for_baselines( - data_files = file_list, + ( + bin_lst, + data, + flags, + nsamples, + where_inpainted, + binned_times, + ) = lst_bin_files_for_baselines( + data_files=file_list, lst_bin_edges=lst_bin_edges, antpairs=bl_chunk, freqs=freq_array, @@ -1436,28 +1514,28 @@ def lst_bin_files_single_outfile( # TODO: we're not writing out the where_inpainted data for the GOLDEN # or REDUCEDCHAN files yet -- it looks like we'd have to write out # completely new UVFlag files for this. 
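                # "GOLDEN" files keep every night's un-averaged data for the
                # selected LST bins, and "REDUCEDCHAN" keeps the first bin's
                # un-averaged data for just the `save_channels` channels -- both
                # are diagnostic outputs rather than LST-binned averages.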
- out_files['GOLDEN'] = [] + out_files["GOLDEN"] = [] for nbin in golden_bins: out_files["GOLDEN"].append( create_outfile( - kind = 'GOLDEN', + kind="GOLDEN", lst=lst_bin_edges[nbin], times=binned_times[nbin], ) ) if save_channels and len(binned_times[0]) > 0: - out_files['REDUCEDCHAN'] = create_outfile( - kind = 'REDUCEDCHAN', + out_files["REDUCEDCHAN"] = create_outfile( + kind="REDUCEDCHAN", lst=lst_bin_edges[0], times=binned_times[0], channels=list(save_channels), ) - if len(golden_bins)>0: - for fl, nbin in zip(out_files['GOLDEN'], golden_bins): + if len(golden_bins) > 0: + for fl, nbin in zip(out_files["GOLDEN"], golden_bins): write_baseline_slc_to_file( - fl = fl, + fl=fl, slc=slc, data=data[nbin].transpose((1, 0, 2, 3)), flags=flags[nbin].transpose((1, 0, 2, 3)), @@ -1466,7 +1544,7 @@ def lst_bin_files_single_outfile( if "REDUCEDCHAN" in out_files: write_baseline_slc_to_file( - fl = out_files['REDUCEDCHAN'], + fl=out_files["REDUCEDCHAN"], slc=slc, data=data[0][:, :, save_channels].transpose((1, 0, 2, 3)), flags=flags[0][:, :, save_channels].transpose((1, 0, 2, 3)), @@ -1480,48 +1558,51 @@ def lst_bin_files_single_outfile( continue rdc = reduce_lst_bins( - data, flags, nsamples, + data, + flags, + nsamples, where_inpainted=where_inpainted, inpainted_mode=inpainted, flag_thresh=flag_thresh, - sigma_clip_thresh = ( - None if inpainted and not sigma_clip_in_inpainted_mode + sigma_clip_thresh=( + None + if inpainted and not sigma_clip_in_inpainted_mode else sigma_clip_thresh ), - sigma_clip_min_N = sigma_clip_min_N, + sigma_clip_min_N=sigma_clip_min_N, flag_below_min_N=flag_below_min_N, get_mad=write_med_mad, ) write_baseline_slc_to_file( - fl=out_files[('LST', inpainted)], + fl=out_files[("LST", inpainted)], slc=slc, - data=rdc['data'], - flags=rdc['flags'], - nsamples=rdc['nsamples'], + data=rdc["data"], + flags=rdc["flags"], + nsamples=rdc["nsamples"], ) write_baseline_slc_to_file( - fl=out_files[('STD', inpainted)], + fl=out_files[("STD", inpainted)], slc=slc, - data=rdc['std'], - flags=rdc['flags'], - nsamples=rdc['days_binned'], + data=rdc["std"], + flags=rdc["flags"], + nsamples=rdc["days_binned"], ) if write_med_mad: write_baseline_slc_to_file( - fl=out_files[('MED', inpainted)], + fl=out_files[("MED", inpainted)], slc=slc, - data=rdc['median'], - flags=rdc['flags'], - nsamples=rdc['nsamples'], + data=rdc["median"], + flags=rdc["flags"], + nsamples=rdc["nsamples"], ) write_baseline_slc_to_file( - fl=out_files[('STD', inpainted)], + fl=out_files[("STD", inpainted)], slc=slc, - data=rdc['mad'], - flags=rdc['flags'], - nsamples=rdc['days_binned'], + data=rdc["mad"], + flags=rdc["flags"], + nsamples=rdc["days_binned"], ) nbls_so_far += len(bl_chunk) @@ -1532,9 +1613,9 @@ def lst_bin_files_single_outfile( @profile def lst_bin_files( config_file: str | Path, - output_file_select: int | Sequence[int] | None=None, - include_autos: bool=True, - **kwargs + output_file_select: int | Sequence[int] | None = None, + include_autos: bool = True, + **kwargs, ) -> list[dict[str, Path]]: """ LST bin a series of UVH5 files. 
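
    A typical invocation (paths hypothetical) first writes a configuration file and
    then bins every output file defined in it; extra keyword arguments are passed
    through to :func:`lst_bin_files_single_outfile`::

        from hera_cal import lstbin_simple
        lstbin_simple.make_lst_bin_config_file("lstbin.yaml", data_files=nightly_file_lists)
        outputs = lstbin_simple.lst_bin_files("lstbin.yaml", rephase=True)

    where ``nightly_file_lists`` is a list of per-night lists of ``.uvh5`` paths.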
@@ -1573,10 +1654,10 @@ def lst_bin_files( with open(config_file, "r") as fl: configuration = yaml.safe_load(fl) - config_opts = configuration['config_params'] - lst_grid = configuration['lst_grid'] - matched_files = configuration['matched_files'] - metadata = configuration['metadata'] + config_opts = configuration["config_params"] + lst_grid = configuration["lst_grid"] + matched_files = configuration["matched_files"] + metadata = configuration["metadata"] if output_file_select is None: output_file_select = list(range(len(matched_files))) @@ -1591,33 +1672,31 @@ def lst_bin_files( meta = FastUVH5Meta( matched_files[0][0][0], - blts_are_rectangular=metadata['blts_are_rectangular'], - time_axis_faster_than_bls=metadata['time_axis_faster_than_bls'], + blts_are_rectangular=metadata["blts_are_rectangular"], + time_axis_faster_than_bls=metadata["time_axis_faster_than_bls"], ) antpos = dict(zip(meta.antenna_numbers, meta.antpos_enu)) - reds = RedundantGroups.from_antpos( - antpos=antpos, - include_autos=include_autos - ) + reds = RedundantGroups.from_antpos(antpos=antpos, include_autos=include_autos) output_files = [] for outfile_index in output_file_select: data_files = matched_files[outfile_index] output_files.append( lst_bin_files_single_outfile( - config_opts = config_opts, - metadata = metadata, - lst_bins = lst_grid[outfile_index], - data_files = data_files, - reds = reds, - include_autos = include_autos, - **kwargs + config_opts=config_opts, + metadata=metadata, + lst_bins=lst_grid[outfile_index], + data_files=data_files, + reds=reds, + include_autos=include_autos, + **kwargs, ) ) return output_files + @profile def get_all_unflagged_baselines( data_files: list[list[str | Path | FastUVH5Meta]], @@ -1654,33 +1733,37 @@ def get_all_unflagged_baselines( data_files = [ [ - fl if isinstance(fl, FastUVH5Meta) else - FastUVH5Meta( - fl, blts_are_rectangular=blts_are_rectangular, - time_axis_faster_than_bls=time_axis_faster_than_bls - ) for fl in fl_list - ] for fl_list in data_files + fl + if isinstance(fl, FastUVH5Meta) + else FastUVH5Meta( + fl, + blts_are_rectangular=blts_are_rectangular, + time_axis_faster_than_bls=time_axis_faster_than_bls, + ) + for fl in fl_list + ] + for fl_list in data_files ] - all_baselines = set() all_pols = set() meta0 = data_files[0][0] - x_orientation = meta0.get_transactional('x_orientation') + x_orientation = meta0.get_transactional("x_orientation") # reds will contain all of the redundant groups for the whole array, because # all the antenna positions are included in every file. if reds is None: reds = RedundantGroups.from_antpos( - antpos=dict(zip(meta0.antenna_numbers, meta0.antpos_enu)), include_autos=True + antpos=dict(zip(meta0.antenna_numbers, meta0.antpos_enu)), + include_autos=True, ) if redundantly_averaged is None: # Try to work out if the files are redundantly averaged. # just look at the middle file from each night. for fl_list in data_files: - meta = fl_list[len(fl_list)//2] + meta = fl_list[len(fl_list) // 2] antpairs = meta.get_transactional("antpairs") ubls = set(reds.get_ubl_key(ap) for ap in antpairs) if len(ubls) != len(antpairs): @@ -1734,19 +1817,19 @@ def get_all_unflagged_baselines( a1, a2 = reds.get_ubl_key((a1, a2)) if ( - (a1, a2) not in all_baselines and # Do this first because after the - (a2, a1) not in all_baselines and # first file it often triggers. 
- a1 not in ignore_ants and - a2 not in ignore_ants and - (include_autos or a1 != a2) and - a1 not in a_priori_antenna_flags and - a2 not in a_priori_antenna_flags + (a1, a2) not in all_baselines + and (a2, a1) not in all_baselines # Do this first because after the + and a1 not in ignore_ants # first file it often triggers. + and a2 not in ignore_ants + and (include_autos or a1 != a2) + and a1 not in a_priori_antenna_flags + and a2 not in a_priori_antenna_flags ): all_baselines.add((a1, a2)) - return sorted(all_baselines), sorted(all_pols) + def create_lstbin_output_file( outdir: Path, kind: str, @@ -1756,8 +1839,8 @@ def create_lstbin_output_file( start_jd: float, times: np.ndarray | None = None, lsts: np.ndarray | None = None, - history: str = "", - fname_format: str="zen.{kind}.{lst:7.5f}.{inpaint_mode}.uvh5", + history: str = "", + fname_format: str = "zen.{kind}.{lst:7.5f}.{inpaint_mode}.uvh5", overwrite: bool = False, antpairs: list[tuple[int, int]] | None = None, freq_min: float | None = None, @@ -1775,11 +1858,15 @@ def create_lstbin_output_file( _history = file_history + utils.history_string() if lst < lst_branch_cut: - lst += 2*np.pi + lst += 2 * np.pi fname = outdir / fname_format.format( - kind=kind, lst=lst, pol=''.join(pols), - inpaint_mode='inpaint' if inpaint_mode else ('flagged' if inpaint_mode is False else "") + kind=kind, + lst=lst, + pol="".join(pols), + inpaint_mode="inpaint" + if inpaint_mode + else ("flagged" if inpaint_mode is False else ""), ) logger.info(f"Initializing {fname}") @@ -1804,43 +1891,45 @@ def create_lstbin_output_file( times=times, history=_history, start_jd=start_jd, - time_axis_faster_than_bls = True, + time_axis_faster_than_bls=True, vis_units=vis_units, lst_branch_cut=lst_branch_cut, ) uvd_template.select(frequencies=freqs, polarizations=pols, inplace=True) # Need to set the polarization array manually because even though the select # operation does the down-select, it doesn't re-order the pols. 
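    # (uvutils.polstr2num, from pyuvdata, maps polarization strings such as
    # "nn"/"ee" to the integer codes stored in UVData.polarization_array, taking
    # the telescope x_orientation into account.)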
- uvd_template.polarization_array = np.array(uvutils.polstr2num(pols, x_orientation=uvd_template.x_orientation)) + uvd_template.polarization_array = np.array( + uvutils.polstr2num(pols, x_orientation=uvd_template.x_orientation) + ) uvd_template.initialize_uvh5_file(str(fname.absolute()), clobber=overwrite) return fname + def write_baseline_slc_to_file( - fl: Path, - slc: slice, - data: np.ndarray, - flags: np.ndarray, - nsamples: np.ndarray + fl: Path, slc: slice, data: np.ndarray, flags: np.ndarray, nsamples: np.ndarray ): """Write a baseline slice to a file.""" - with h5py.File(fl, 'r+') as f: - ntimes = int(f['Header']['Ntimes'][()]) - timefirst = bool(f['Header']['time_axis_faster_than_bls'][()]) + with h5py.File(fl, "r+") as f: + ntimes = int(f["Header"]["Ntimes"][()]) + timefirst = bool(f["Header"]["time_axis_faster_than_bls"][()]) if not timefirst and ntimes > 1: raise NotImplementedError("Can only do time-first files for now.") - slc = slice(slc.start*ntimes, slc.stop*ntimes, 1) - f['Data']['visdata'][slc] = data.reshape((-1, data.shape[2], data.shape[3])) - f['Data']['flags'][slc] = flags.reshape((-1, data.shape[2], data.shape[3])) - f['Data']['nsamples'][slc] = nsamples.reshape((-1, data.shape[2], data.shape[3])) + slc = slice(slc.start * ntimes, slc.stop * ntimes, 1) + f["Data"]["visdata"][slc] = data.reshape((-1, data.shape[2], data.shape[3])) + f["Data"]["flags"][slc] = flags.reshape((-1, data.shape[2], data.shape[3])) + f["Data"]["nsamples"][slc] = nsamples.reshape( + (-1, data.shape[2], data.shape[3]) + ) + def config_lst_bin_files( data_files: list[list[str | FastUVH5Meta]], - dlst: float | None=None, - atol: float=1e-10, - lst_start: float | None=None, - lst_width: float = 2*np.pi, + dlst: float | None = None, + atol: float = 1e-10, + lst_start: float | None = None, + lst_width: float = 2 * np.pi, blts_are_rectangular: bool | None = None, time_axis_faster_than_bls: bool | None = None, ntimes_per_file: int | None = None, @@ -1876,18 +1965,19 @@ def config_lst_bin_files( """ logger.info("Configuring lst_grid") - data_files=[sorted(df) for df in data_files] + data_files = [sorted(df) for df in data_files] df0 = data_files[0][0] # get dlst from first data file if None if dlst is None: - dlst = io.get_file_times(df0, filetype='uvh5')[0] + dlst = io.get_file_times(df0, filetype="uvh5")[0] # Get rectangularity of blts from first file if None meta = FastUVH5Meta( - df0, blts_are_rectangular=blts_are_rectangular, - time_axis_faster_than_bls=time_axis_faster_than_bls + df0, + blts_are_rectangular=blts_are_rectangular, + time_axis_faster_than_bls=time_axis_faster_than_bls, ) if blts_are_rectangular is None: @@ -1904,7 +1994,8 @@ def config_lst_bin_files( df[0], blts_are_rectangular=blts_are_rectangular, time_axis_faster_than_bls=time_axis_faster_than_bls, - ) for df in data_files + ) + for df in data_files ] # get begin_lst from lst_start or from the first JD in the data_files @@ -1916,7 +2007,7 @@ def config_lst_bin_files( lst_grid = make_lst_grid(dlst, begin_lst=begin_lst, lst_width=lst_width) dlst = lst_grid[1] - lst_grid[0] - lst_edges = np.concatenate([lst_grid - dlst/2, [lst_grid[-1] + dlst/2]]) + lst_edges = np.concatenate([lst_grid - dlst / 2, [lst_grid[-1] + dlst / 2]]) # Now, what we need here is actually the lst_edges of the *files*, not the actual # bins. 
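    # Sketch of the centre->edge relation built above (uniform dlst assumed):
    # centres [c0, ..., cN] give edges [c0 - dlst/2, c1 - dlst/2, ..., cN + dlst/2];
    # e.g. centres [0.1, 0.2, 0.3] rad with dlst = 0.1 yield edges
    # [0.05, 0.15, 0.25, 0.35], so bin i spans edges[i]..edges[i+1].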
@@ -1930,33 +2021,38 @@ def config_lst_bin_files( for fllist in data_files: matched = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fllist, + file_list=fllist, files_sorted=True, jd_regex=jd_regex, blts_are_rectangular=blts_are_rectangular, time_axis_faster_than_bls=time_axis_faster_than_bls, - atol=atol + atol=atol, ) for i, m in enumerate(matched): matched_files[i].append(m) nfiles = int(np.ceil(len(lst_grid) / ntimes_per_file)) - lst_grid = [lst_grid[ntimes_per_file*i:ntimes_per_file*(i+1)] for i in range(nfiles)] + lst_grid = [ + lst_grid[ntimes_per_file * i: ntimes_per_file * (i + 1)] for i in range(nfiles) + ] # Only keep output files that have data associated - lst_grid = [lg for lg, mf in zip(lst_grid, matched_files) if any(len(mff)>0 for mff in mf)] - matched_files = [mf for mf in matched_files if any(len(mff)>0 for mff in mf)] + lst_grid = [ + lg for lg, mf in zip(lst_grid, matched_files) if any(len(mff) > 0 for mff in mf) + ] + matched_files = [mf for mf in matched_files if any(len(mff) > 0 for mff in mf)] return lst_grid, matched_files + def make_lst_bin_config_file( config_file: str | Path, data_files: list[list[str | FastUVH5Meta]], clobber: bool = False, - dlst: float | None=None, - atol: float=1e-10, - lst_start: float | None=None, - lst_width: float = 2*np.pi, - ntimes_per_file: int=60, + dlst: float | None = None, + atol: float = 1e-10, + lst_start: float | None = None, + lst_width: float = 2 * np.pi, + ntimes_per_file: int = 60, blts_are_rectangular: bool | None = None, time_axis_faster_than_bls: bool | None = None, jd_regex: str = r"zen\.(\d+\.\d+)\.", @@ -2045,52 +2141,58 @@ def make_lst_bin_config_file( dlst = lst_grid[0][1] - lst_grid[0][0] # Make it a real list of floats to make the YAML easier to read - lst_grid = [[float(l) for l in lsts] for lsts in lst_grid] + lst_grid = [[float(lst) for lst in lsts] for lsts in lst_grid] # now matched files is a list of output files, each containing a list of nights, # each containing a list of files logger.info("Getting metadata from first file...") + def get_meta(): for outfile in matched_files: for night in outfile: for i, fl in enumerate(night): return fl + meta = get_meta() tint = np.median(meta.integration_time) if not np.all(np.abs(np.diff(np.diff(meta.times))) < 1e-6): raise ValueError( - 'All integrations must be of equal length (BDA not supported), got diffs: ' - f'{np.diff(meta.times)}' + "All integrations must be of equal length (BDA not supported), got diffs: " + f"{np.diff(meta.times)}" ) - matched_files = [[[str(m.path) for m in night] for night in outfiles] for outfiles in matched_files] + matched_files = [ + [[str(m.path) for m in night] for night in outfiles] + for outfiles in matched_files + ] output = { - 'config_params': { - 'dlst': float(dlst), - 'atol': atol, - 'lst_start': lst_start, - 'lst_width': lst_width, - 'jd_regex': jd_regex, + "config_params": { + "dlst": float(dlst), + "atol": atol, + "lst_start": lst_start, + "lst_width": lst_width, + "jd_regex": jd_regex, + }, + "lst_grid": lst_grid, + "matched_files": matched_files, + "metadata": { + "x_orientation": meta.x_orientation, + "blts_are_rectangular": meta.blts_are_rectangular, + "time_axis_faster_than_bls": meta.time_axis_faster_than_bls, + "start_jd": int(meta.times[0]), + "integration_time": float(tint), + "lst_branch_cut": lst_branch_cut, }, - 'lst_grid': lst_grid, - 'matched_files': matched_files, - 'metadata': { - 'x_orientation': meta.x_orientation, - 'blts_are_rectangular': meta.blts_are_rectangular, - 
'time_axis_faster_than_bls': meta.time_axis_faster_than_bls, - 'start_jd': int(meta.times[0]), - 'integration_time': float(tint), - 'lst_branch_cut': lst_branch_cut - } } - with open(config_file, 'w') as fl: + with open(config_file, "w") as fl: yaml.safe_dump(output, fl) return output + def lst_bin_arg_parser(): """ arg parser for lst_bin_files() function. data_files argument must be quotation-bounded @@ -2107,37 +2209,166 @@ def lst_bin_arg_parser(): "Consult lstbin.lst_bin_files() for further details on functionality." ) ) - a.add_argument('configfile', type=str, help="config file produced by lstbin.make_lst_bin_config_file") a.add_argument( - "--calfile-rules", nargs='*', type=str, - help="rules to convert datafile names to calfile names. A series of two strings where the first will be replaced by the latter" + "configfile", + type=str, + help="config file produced by lstbin.make_lst_bin_config_file", + ) + a.add_argument( + "--calfile-rules", + nargs="*", + type=str, + help="rules to convert datafile names to calfile names. A series of two strings where the first will be replaced by the latter", + ) + a.add_argument( + "--fname-format", + type=str, + default="zen.{kind}.{lst:7.5f}.uvh5", + help="filename format for output files. See docstring for details.", + ) + a.add_argument( + "--outdir", default=None, type=str, help="directory for writing output" + ) + a.add_argument( + "--overwrite", default=False, action="store_true", help="overwrite output files" + ) + a.add_argument( + "--rephase", + default=False, + action="store_true", + help="rephase data to center of LST bin before binning", + ) + a.add_argument( + "--history", default=" ", type=str, help="history to insert into output files" + ) + a.add_argument( + "--output_file_select", + default=None, + nargs="*", + type=int, + help="list of output file integers to run on. Default is all output files.", + ) + a.add_argument( + "--vis_units", default="Jy", type=str, help="visibility units of output files." + ) + a.add_argument( + "--ignore_flags", + default=False, + action="store_true", + help="Ignore flags in data files, such that all input data is included in binning.", + ) + a.add_argument( + "--Nbls_to_load", + default=None, + type=int, + help="Number of baselines to load and bin simultaneously. Default is all.", + ) + a.add_argument( + "--ex_ant_yaml_files", + default=None, + type=str, + nargs="+", + help="list of paths to yamls with lists of antennas from each night to exclude lstbinned data files.", + ) + a.add_argument( + "--ignore-ants", default=(), type=int, nargs="+", help="ants to ignore" + ) + a.add_argument( + "--ignore-missing-calfiles", + default=False, + action="store_true", + help="if true, any datafile with missing calfile will just be removed from lstbinning.", + ) + a.add_argument( + "--write_kwargs", + default="{}", + type=str, + help="json dictionary of arguments to the uvh5 writer", + ) + a.add_argument( + "--golden-lsts", + type=str, + help="LSTS (rad) to save longitudinal data for, separated by commas", + ) + a.add_argument( + "--save-channels", + type=str, + help="integer channels separated by commas to save longitudinal data for", + ) + a.add_argument( + "--sigma-clip-thresh", + type=float, + help="sigma clip threshold for flagging data in an LST bin over time. 
Zero means no clipping.", + default=None, + ) + a.add_argument( + "--sigma-clip-min-N", + type=int, + help="number of unflagged data points over time to require before considering sigma clipping", + default=4, + ) + a.add_argument( + "--flag-below-min-N", + action="store_true", + help="if true, flag all data in an LST bin if there are fewer than --sigma-clip-min-N unflagged data points over time", + ) + a.add_argument( + "--flag-thresh", + type=float, + help="fraction of integrations in an LST bin for a particular (antpair, pol, channel) that must be flagged for the entire bin to be flagged", + default=0.7, + ) + a.add_argument( + "--redundantly-averaged", + action="store_true", + default=None, + help="if true, assume input files are redundantly averaged", + ) + a.add_argument( + "--only-last-file-per-night", + action="store_true", + default=False, + help="if true, only use the first and last file every night to obtain antpairs", + ) + a.add_argument( + "--freq-min", + type=float, + default=None, + help="minimum frequency to include in lstbinning", + ) + a.add_argument( + "--freq-max", + type=float, + default=None, + help="maximum frequency to include in lstbinning", + ) + a.add_argument( + "--no-flagged-mode", + action="store_true", + help="turn off output of flagged mode LST-binning", + ) + a.add_argument( + "--do-inpaint-mode", + action="store_true", + default=None, + help="turn on inpainting mode LST-binning", + ) + a.add_argument( + "--where-inpainted-file-rules", + nargs="*", + type=str, + help="rules to convert datafile names to where-inpainted-file names. A series of two strings where the first will be replaced by the latter", + ) + a.add_argument( + "--sigma-clip-in-inpainted-mode", + action="store_true", + default=False, + help="allow sigma-clipping in inpainted mode", + ) + a.add_argument( + "--write-med-mad", + action="store_true", + default=False, + help="option to write out MED/MAD files in addition to LST/STD files", ) - a.add_argument("--fname-format", type=str, default="zen.{kind}.{lst:7.5f}.uvh5", help="filename format for output files. See docstring for details.") - a.add_argument("--outdir", default=None, type=str, help="directory for writing output") - a.add_argument("--overwrite", default=False, action='store_true', help="overwrite output files") - a.add_argument("--rephase", default=False, action='store_true', help="rephase data to center of LST bin before binning") - a.add_argument("--history", default=' ', type=str, help="history to insert into output files") - a.add_argument("--output_file_select", default=None, nargs='*', type=int, help="list of output file integers to run on. Default is all output files.") - a.add_argument("--vis_units", default='Jy', type=str, help="visibility units of output files.") - a.add_argument("--ignore_flags", default=False, action='store_true', help="Ignore flags in data files, such that all input data is included in binning.") - a.add_argument("--Nbls_to_load", default=None, type=int, help="Number of baselines to load and bin simultaneously. 
Default is all.") - a.add_argument("--ex_ant_yaml_files", default=None, type=str, nargs='+', help="list of paths to yamls with lists of antennas from each night to exclude lstbinned data files.") - a.add_argument("--ignore-ants", default=(), type=int, nargs='+', help='ants to ignore') - a.add_argument("--ignore-missing-calfiles", default=False,action='store_true', help='if true, any datafile with missing calfile will just be removed from lstbinning.') - a.add_argument("--write_kwargs", default='{}', type=str, help="json dictionary of arguments to the uvh5 writer") - a.add_argument("--golden-lsts", type=str, help="LSTS (rad) to save longitudinal data for, separated by commas") - a.add_argument("--save-channels", type=str, help="integer channels separated by commas to save longitudinal data for") - a.add_argument("--sigma-clip-thresh", type=float, help="sigma clip threshold for flagging data in an LST bin over time. Zero means no clipping.", default=None) - a.add_argument("--sigma-clip-min-N", type=int, help="number of unflagged data points over time to require before considering sigma clipping", default=4) - a.add_argument("--flag-below-min-N", action='store_true', help="if true, flag all data in an LST bin if there are fewer than --sigma-clip-min-N unflagged data points over time") - a.add_argument("--flag-thresh", type=float, help="fraction of integrations in an LST bin for a particular (antpair, pol, channel) that must be flagged for the entire bin to be flagged", default=0.7) - a.add_argument("--redundantly-averaged", action='store_true', default=None, help="if true, assume input files are redundantly averaged") - a.add_argument("--only-last-file-per-night", action='store_true', default=False, help="if true, only use the first and last file every night to obtain antpairs") - a.add_argument("--freq-min", type=float, default=None, help="minimum frequency to include in lstbinning") - a.add_argument("--freq-max", type=float, default=None, help="maximum frequency to include in lstbinning") - a.add_argument("--no-flagged-mode", action='store_true', help="turn off output of flagged mode LST-binning") - a.add_argument("--do-inpaint-mode", action='store_true', default=None, help="turn on inpainting mode LST-binning") - a.add_argument("--where-inpainted-file-rules", nargs='*', type=str, help="rules to convert datafile names to where-inpainted-file names. 
A series of two strings where the first will be replaced by the latter") - a.add_argument('--sigma-clip-in-inpainted-mode', action='store_true', default=False, help='allow sigma-clipping in inpainted mode') - a.add_argument('--write-med-mad', action='store_true', default=False, help='option to write out MED/MAD files in addition to LST/STD files') return a diff --git a/hera_cal/tests/mock_uvdata.py b/hera_cal/tests/mock_uvdata.py index 29822aa85..9003d3b12 100644 --- a/hera_cal/tests/mock_uvdata.py +++ b/hera_cal/tests/mock_uvdata.py @@ -1,7 +1,7 @@ """Functions for mocking UVdata objects for testing purposes.""" from __future__ import annotations -from pyuvdata import UVData, UVCal +from pyuvdata import UVData, UVCal, UVFlag from pyuvdata.uvdata import FastUVH5Meta from pyuvdata.telescopes import get_telescope @@ -25,16 +25,19 @@ HERA_ANTPOS = yaml.safe_load(fl) start: 46920776.3671875 -end: 234298706.0546875 +end: 234298706.0546875 delta: 122070.3125 + def create_mock_hera_obs( jdint: int = 2459855, integration_time=9.663677215576172, lst_start=0.1, jd_start: float | None = None, - ntimes: int=2, - freqs: np.ndarray = np.arange(46920776.3671875, 234298706.0546875 + 10.0, 122070.3125), + ntimes: int = 2, + freqs: np.ndarray = np.arange( + 46920776.3671875, 234298706.0546875 + 10.0, 122070.3125 + ), pols: list[str] = ["xx", "yy", "xy", "yx"], ants: list[int] | None = None, antpairs: list[tuple[int, int]] | None = None, @@ -49,17 +52,17 @@ def create_mock_hera_obs( if jd_start is None: # We ensure that the LSTs align exactly with the LST-grid that would be LST-binned. lst_start = make_lst_grid(dlst, begin_lst=lst_start)[0] - lsts = np.arange(lst_start, ntimes*dlst + lst_start, dlst)[:ntimes] + lsts = np.arange(lst_start, ntimes * dlst + lst_start, dlst)[:ntimes] times = utils.LST2JD(lsts, start_jd=jdint, allow_other_jd=False) else: if jd_start < 2000000: jd_start += jdint - times = np.arange(jd_start, ntimes*tint + jd_start, tint)[:ntimes] + times = np.arange(jd_start, ntimes * tint + jd_start, tint)[:ntimes] if ants is None: ants = list(HERA_ANTPOS.keys()) - antpos = {k: v for k,v in HERA_ANTPOS.items() if k in ants} + antpos = {k: v for k, v in HERA_ANTPOS.items() if k in ants} reds = RedundantGroups.from_antpos(antpos) @@ -75,51 +78,54 @@ def create_mock_hera_obs( # build a UVData object uvd = UVData.new( - freq_array= freqs, + freq_array=freqs, polarization_array=pols, - antenna_positions= antpos, + antenna_positions=antpos, antpairs=np.array(antpairs), - telescope_location= HERA_LOC, - telescope_name= "HERA", - times= times, + telescope_location=HERA_LOC, + telescope_name="HERA", + times=times, x_orientation=x_orientation, empty=empty, time_axis_faster_than_bls=time_axis_faster_than_bls, - do_blt_outer=True + do_blt_outer=True, ) uvd.polarization_array = np.array(uvd.polarization_array) return uvd def create_uvd_ones(**kwargs) -> UVData: - uvd = create_mock_hera_obs( - empty=True, - **kwargs - ) + uvd = create_mock_hera_obs(empty=True, **kwargs) uvd.data_array += 1.0 return uvd + def identifiable_data_from_uvd( ones: UVData, reshape: bool = True, ) -> np.ndarray: lsts = np.unique(ones.lst_array) - normfreqs = 2*np.pi * (ones.freq_array - 100e6) / 100e6 + normfreqs = 2 * np.pi * (ones.freq_array - 100e6) / 100e6 data = np.zeros_like(ones.data_array) orig_shape = data.shape # in-place reshape of data to make it easier for us. 
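    # (Assigning to ndarray.shape reshapes the existing buffer as a view and
    # raises AttributeError if that cannot be done without a copy, unlike
    # np.reshape, which may silently return a copy; that is what lets the
    # array be restored to orig_shape at the end for free. A tiny sketch:
    #   >>> a = np.zeros(12)
    #   >>> a.shape = (3, 4)   # same memory, just a new view
    # )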
- data.shape = (len(ones.get_antpairs()), ones.Ntimes, ones.Nfreqs, len(ones.polarization_array)) + data.shape = ( + len(ones.get_antpairs()), + ones.Ntimes, + ones.Nfreqs, + len(ones.polarization_array), + ) for i, ap in enumerate(ones.get_antpairs()): for j, p in enumerate(ones.polarization_array): if ap[0] == ap[1] and p in (-5, -6): - d = np.outer(-p*lsts*1000, (ones.freq_array / 75e6)**-2) + d = np.outer(-p * lsts * 1000, (ones.freq_array / 75e6) ** -2) else: d = np.outer( - p*lsts, np.cos(normfreqs*ap[0]) + np.sin(normfreqs*ap[1])*1j + p * lsts, np.cos(normfreqs * ap[0]) + np.sin(normfreqs * ap[1]) * 1j ) data[i, :, :, j] = d @@ -127,11 +133,13 @@ def identifiable_data_from_uvd( data.shape = orig_shape return data + def create_uvd_identifiable( with_noise: bool = False, autos_noise: bool = False, flag_frac: float = 0.0, - **kwargs) -> UVData: + **kwargs, +) -> UVData: """Make a UVData object with identifiable data. Each baseline, pol, LST and freq channel should be identifiable in the data with the @@ -143,18 +151,18 @@ def create_uvd_identifiable( uvd.data_array = identifiable_data_from_uvd(uvd) if with_noise: - add_noise_to_uvd(uvd, autos = autos_noise) + add_noise_to_uvd(uvd, autos=autos_noise) if flag_frac > 0: add_flags_to_uvd(uvd, flag_frac=flag_frac) return uvd + def add_noise_to_uvd(uvd, autos: bool = False): hd = io.to_HERAData(uvd) - data, flags, nsamples = hd.read() - dt = (data.times[1] - data.times[0])* units.si.day.in_units(units.si.s) + dt = (data.times[1] - data.times[0]) * units.si.day.in_units(units.si.s) df = data.freqs[1] - data.freqs[0] for bl in data.bls(): @@ -164,12 +172,12 @@ def add_noise_to_uvd(uvd, autos: bool = False): variance = noise.predict_noise_variance_from_autos( bl, data, dt=dt, df=df, nsamples=nsamples ) - data[bl] += ( - np.random.normal(scale=np.sqrt(variance/2)) - + 1j * np.random.normal(scale=np.sqrt(variance/2)) - ) + data[bl] += np.random.normal( + scale=np.sqrt(variance / 2) + ) + 1j * np.random.normal(scale=np.sqrt(variance / 2)) hd.update(data=data) + def add_flags_to_uvd(uvd, flag_frac: float = 0.1): hd = io.to_HERAData(uvd) data, flags, nsamples = hd.read() @@ -180,13 +188,14 @@ def add_flags_to_uvd(uvd, flag_frac: float = 0.1): flags[bl] = np.random.uniform(size=flags[bl].shape) < flag_frac hd.update(flags=flags) + def write_files_in_hera_format( uvds: list[UVData] | list[list[UVData]], tmpdir: Path, fmt: str = "zen.{jd:.5f}.sum.uvh5", in_jdint_dir: bool = True, + add_where_inpainted_files: bool = False, ) -> list[list[str]]: - if not tmpdir.exists(): tmpdir.mkdir() @@ -199,7 +208,7 @@ def write_files_in_hera_format( for uvdlist in uvds: if in_jdint_dir: jdint = int(uvdlist[0].time_array.min()) - daydir = (tmpdir / str(jdint)) + daydir = tmpdir / str(jdint) if not daydir.exists(): daydir.mkdir() else: @@ -212,6 +221,14 @@ def write_files_in_hera_format( obj.initialize_uvh5_file(fl, clobber=True) else: obj.write_uvh5(fl, clobber=True) + + if add_where_inpainted_files: + flg = UVFlag() + flg.from_uvdata(obj, copy_flags=True, waterfall=False) + flg.flag_array[:] = True # Everything inpainted. 
+ flgfile = fl.with_suffix(".where_inpainted.h5") + flg.write(flgfile, clobber=True) + _fls.append(str(fl)) fls.append(_fls) @@ -220,7 +237,10 @@ def write_files_in_hera_format( else: return fls -def make_day(nfiles: int, creator: callable = create_mock_hera_obs, **kwargs) -> list[UVData]: + +def make_day( + nfiles: int, creator: callable = create_mock_hera_obs, **kwargs +) -> list[UVData]: """Make a day of UVData objects.""" uvds = [] @@ -228,17 +248,21 @@ def make_day(nfiles: int, creator: callable = create_mock_hera_obs, **kwargs) -> for i in range(nfiles): uvds.append(creator(lst_start=lst_start, **kwargs)) - if i ==0: + if i == 0: lsts = np.unique(uvds[0].lst_array) dlst = lsts[1] - lsts[0] lst_start = uvds[-1].lst_array[-1] + dlst - kwargs['jd_start'] = None # only use for first file, after, use lst_start + kwargs["jd_start"] = None # only use for first file, after, use lst_start return uvds + def make_dataset( - ndays: int, nfiles: int, start_jdint: int = 2459855, creator: callable = create_mock_hera_obs, + ndays: int, + nfiles: int, + start_jdint: int = 2459855, + creator: callable = create_mock_hera_obs, random_ants_to_drop: int = 0, - **kwargs + **kwargs, ) -> list[list[UVData]]: """Make a dataset of UVData objects.""" @@ -246,64 +270,86 @@ def make_dataset( default = creator(jdint=start_jdint, **kwargs) antpairs = default.get_antpairs() data_ants = list(set([ap[0] for ap in antpairs] + [ap[1] for ap in antpairs])) - if 'antpairs' in kwargs: - del kwargs['antpairs'] + if "antpairs" in kwargs: + del kwargs["antpairs"] uvds = [] for i in range(ndays): if random_ants_to_drop > 0: drop_ants = np.random.choice(data_ants, random_ants_to_drop, replace=False) - _antpairs = [ap for ap in antpairs if ap[0] not in drop_ants and ap[1] not in drop_ants] - kwargs['antpairs'] = _antpairs + _antpairs = [ + ap + for ap in antpairs + if ap[0] not in drop_ants and ap[1] not in drop_ants + ] + kwargs["antpairs"] = _antpairs uvds.append(make_day(nfiles, jdint=start_jdint + i, creator=creator, **kwargs)) return uvds -def make_uvc_ones(uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0): + +def make_uvc_ones( + uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0 +): uvc = UVCal.initialize_from_uvdata( uvd, - cal_style = "redundant", - gain_convention = "multiply", - jones_array = "linear", - cal_type = "gain", - metadata_only=False + cal_style="redundant", + gain_convention="multiply", + jones_array="linear", + cal_type="gain", + metadata_only=False, ) if flag_full_ant > 0: - badants = np.random.choice(np.arange(uvc.Nants_data), flag_full_ant, replace=False) + badants = np.random.choice( + np.arange(uvc.Nants_data), flag_full_ant, replace=False + ) uvc.flag_array[badants] = True if flag_ant_time > 0: - badants = np.random.choice(np.arange(uvc.Nants_data), flag_ant_time, replace=False) + badants = np.random.choice( + np.arange(uvc.Nants_data), flag_ant_time, replace=False + ) badtime = np.random.randint(0, uvc.Ntimes) uvc.flag_array[badants, :, badtime] = True if flag_ant_freq > 0: - badants = np.random.choice(np.arange(uvc.Nants_data), flag_ant_freq, replace=False) + badants = np.random.choice( + np.arange(uvc.Nants_data), flag_ant_freq, replace=False + ) badfreq = np.random.randint(0, uvc.Nfreqs) uvc.flag_array[badants, badfreq, :] = True return uvc + def identifiable_gains_from_uvc(uvc: UVCal): gains = np.zeros_like(uvc.gain_array) for i, ant in enumerate(uvc.ant_array): for j in range(uvc.Njones): gains[i, :, :, j] = np.outer( - 
np.exp(2j*np.pi*uvc.freq_array*(j+1)), - np.ones_like(uvc.time_array)*(ant+1) + np.exp(2j * np.pi * uvc.freq_array * (j + 1)), + np.ones_like(uvc.time_array) * (ant + 1), ) return gains -def make_uvc_identifiable(uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0): - uvc = make_uvc_ones(uvd, flag_full_ant=flag_full_ant, flag_ant_time=flag_ant_time, flag_ant_freq=flag_ant_freq) + +def make_uvc_identifiable( + uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0 +): + uvc = make_uvc_ones( + uvd, + flag_full_ant=flag_full_ant, + flag_ant_time=flag_ant_time, + flag_ant_freq=flag_ant_freq, + ) uvc.gain_array = identifiable_gains_from_uvc(uvc) return uvc + def write_cals_in_hera_format( uvds: list[UVCal] | list[list[UVCal]], tmpdir: Path, fmt: str = "zen.{jd:.5f}.sum.calfits", in_jdint_dir: bool = True, ) -> list[list[str]]: - if not tmpdir.exists(): tmpdir.mkdir() @@ -316,7 +362,7 @@ def write_cals_in_hera_format( for uvdlist in uvds: if in_jdint_dir: jdint = int(uvdlist[0].time_array.min()) - daydir = (tmpdir / str(jdint)) + daydir = tmpdir / str(jdint) if not daydir.exists(): daydir.mkdir() else: diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py index 218e412af..1830dbefd 100644 --- a/hera_cal/tests/test_lstbin_simple.py +++ b/hera_cal/tests/test_lstbin_simple.py @@ -20,12 +20,15 @@ try: benchmark except NameError: - @pytest.fixture(scope='module') + + @pytest.fixture(scope="module") def benchmark(): def fnc(wrapped, *args, **kwargs): return wrapped(*args, **kwargs) + return fnc + class Test_LSTAlign: @classmethod def get_lst_align_data( @@ -33,48 +36,56 @@ def get_lst_align_data( ntimes: int = 10, nfreqs: int = 5, npols: int = 4, - nants: int = 4, + nants: int = 4, ndays: int = 1, creator: callable = mockuvd.create_uvd_identifiable, ): - uvds = mockuvd.make_dataset( ndays=ndays, nfiles=1, creator=creator, - pols=('xx', 'yy', 'xy', 'yx')[:npols], - freqs = np.linspace(100e6, 200e6, nfreqs), - ants = np.arange(nants), + pols=("xx", "yy", "xy", "yx")[:npols], + freqs=np.linspace(100e6, 200e6, nfreqs), + ants=np.arange(nants), ntimes=ntimes, time_axis_faster_than_bls=False, ) uvds = [uvd[0] for uvd in uvds] # flatten to single list - data = np.concatenate([uvd.data_array.reshape((ntimes, -1, nfreqs, npols)) for uvd in uvds], axis=0) + data = np.concatenate( + [uvd.data_array.reshape((ntimes, -1, nfreqs, npols)) for uvd in uvds], + axis=0, + ) freq_array = uvds[0].freq_array - data_lsts = np.concatenate([np.unique(uvds[0].lst_array),]*len(uvds)) + data_lsts = np.concatenate( + [ + np.unique(uvds[0].lst_array), + ] + * len(uvds) + ) antpairs = uvds[0].get_antpairs() antpos = uvds[0].antenna_positions # Ensure that each LST is in its own bin lsts = np.sort(np.unique(data_lsts)) - lst_bin_edges = np.concatenate(( - [lsts[0] - (lsts[1] - lsts[0])/2], - (lsts[1:] + lsts[:-1])/2, - [lsts[-1] + (lsts[-1] - lsts[-2])/2] - )) + lst_bin_edges = np.concatenate( + ( + [lsts[0] - (lsts[1] - lsts[0]) / 2], + (lsts[1:] + lsts[:-1]) / 2, + [lsts[-1] + (lsts[-1] - lsts[-2]) / 2], + ) + ) return dict( - data = data, - data_lsts = data_lsts, + data=data, + data_lsts=data_lsts, antpairs=antpairs, lst_bin_edges=lst_bin_edges, freq_array=freq_array, antpos=antpos, ) - def test_bad_inputs(self): # Test that we get the right errors for bad inputs kwargs = self.get_lst_align_data() @@ -82,12 +93,12 @@ def test_bad_inputs(self): def lst_align_with(**kw): return lstbin_simple.lst_align(**{**kwargs, **kw}) - data = 
np.ones(kwargs['data'].shape[:-1] + (5,)) + data = np.ones(kwargs["data"].shape[:-1] + (5,)) with pytest.raises(ValueError, match="data has more than 4 pols"): lst_align_with(data=data) with pytest.raises(ValueError, match="data should have shape"): - lst_align_with(freq_array=kwargs['freq_array'][:-1]) + lst_align_with(freq_array=kwargs["freq_array"][:-1]) with pytest.raises(ValueError, match="flags should have shape"): lst_align_with(flags=np.ones(13, dtype=bool)) @@ -97,11 +108,15 @@ def lst_align_with(**kw): lst_align_with(nsamples=np.ones(13)) # Use only one bin edge - with pytest.raises(ValueError, match="lst_bin_edges must have at least 2 elements"): + with pytest.raises( + ValueError, match="lst_bin_edges must have at least 2 elements" + ): lst_align_with(lst_bin_edges=np.ones(1)) # Try rephasing without freq_array or antpos - with pytest.raises(ValueError, match="freq_array and antpos is needed for rephase"): + with pytest.raises( + ValueError, match="freq_array and antpos is needed for rephase" + ): lst_align_with(rephase=True, antpos=None) def test_increasing_lsts_one_per_bin(self, benchmark): @@ -113,7 +128,7 @@ def test_increasing_lsts_one_per_bin(self, benchmark): f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d, kwargs['data']) + np.testing.assert_allclose(d, kwargs["data"]) assert not np.any(f) assert np.all(n == 1.0) assert len(bins) == 6 @@ -127,7 +142,7 @@ def test_multi_days_one_per_bin(self, benchmark): f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d[:, 0], kwargs['data'][:7]) + np.testing.assert_allclose(d[:, 0], kwargs["data"][:7]) assert not np.any(f) assert np.all(n == 1.0) assert len(bins) == 7 @@ -136,17 +151,19 @@ def test_multi_days_with_flagging(self, benchmark): kwargs = self.get_lst_align_data(ndays=2, ntimes=7) # Flag everything after the first day, and make the data there crazy. - flags = np.zeros_like(kwargs['data'], dtype=bool) + flags = np.zeros_like(kwargs["data"], dtype=bool) flags[7:] = True - kwargs['data'][7:] = 1000.0 + kwargs["data"][7:] = 1000.0 - bins, d, f, n, inp = benchmark(lstbin_simple.lst_align, rephase=False, flags=flags, **kwargs) + bins, d, f, n, inp = benchmark( + lstbin_simple.lst_align, rephase=False, flags=flags, **kwargs + ) d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d[:, 0], kwargs['data'][:7]) + np.testing.assert_allclose(d[:, 0], kwargs["data"][:7]) assert not np.any(f[:, 0]) assert np.all(f[:, 1]) assert len(bins) == 7 @@ -155,28 +172,31 @@ def test_multi_days_with_nsamples_zero(self, benchmark): kwargs = benchmark(self.get_lst_align_data, ndays=2, ntimes=7) # Flag everything after the first day, and make the data there crazy. 
- nsamples = np.ones_like(kwargs['data'], dtype=float) + nsamples = np.ones_like(kwargs["data"], dtype=float) nsamples[7:] = 0.0 - kwargs['data'][7:] = 1000.0 + kwargs["data"][7:] = 1000.0 - bins, d, f, n, inp = lstbin_simple.lst_align(rephase=False, nsamples=nsamples, **kwargs) + bins, d, f, n, inp = lstbin_simple.lst_align( + rephase=False, nsamples=nsamples, **kwargs + ) d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d[:, 0], kwargs['data'][:7]) + np.testing.assert_allclose(d[:, 0], kwargs["data"][:7]) assert not np.any(f) assert np.all(n[:, 0] == 1.0) assert np.all(n[:, 1] == 0.0) assert len(bins) == 7 - def test_rephase(self, benchmark): """Test that rephasing where each bin is already at center does nothing.""" kwargs = self.get_lst_align_data(ntimes=7) - bins0, d0, f0, n0, inp = benchmark(lstbin_simple.lst_align, rephase=True, **kwargs) + bins0, d0, f0, n0, inp = benchmark( + lstbin_simple.lst_align, rephase=True, **kwargs + ) bins, d, f, n, inp = lstbin_simple.lst_align(rephase=False, **kwargs) np.testing.assert_allclose(d, d0, rtol=1e-6) np.testing.assert_allclose(f, f0, rtol=1e-6) @@ -184,42 +204,57 @@ def test_rephase(self, benchmark): assert len(bins) == len(bins0) def test_lstbinedges_modulus(self, benchmark): - kwargs = self.get_lst_align_data(ntimes=7) edges = kwargs.pop("lst_bin_edges") lst_bin_edges = edges.copy() - lst_bin_edges -= 4*np.pi + lst_bin_edges -= 4 * np.pi - bins, d0, f0, n0, inp = benchmark(lstbin_simple.lst_align, lst_bin_edges=lst_bin_edges, **kwargs) + bins, d0, f0, n0, inp = benchmark( + lstbin_simple.lst_align, lst_bin_edges=lst_bin_edges, **kwargs + ) lst_bin_edges = edges.copy() - lst_bin_edges += 4*np.pi + lst_bin_edges += 4 * np.pi - bins, d, f, n, inp = lstbin_simple.lst_align(lst_bin_edges=lst_bin_edges, **kwargs) + bins, d, f, n, inp = lstbin_simple.lst_align( + lst_bin_edges=lst_bin_edges, **kwargs + ) np.testing.assert_allclose(d, d0) np.testing.assert_allclose(f, f0) np.testing.assert_allclose(n, n0) - with pytest.raises(ValueError, match="lst_bin_edges must be monotonically increasing."): + with pytest.raises( + ValueError, match="lst_bin_edges must be monotonically increasing." + ): lstbin_simple.lst_align(lst_bin_edges=lst_bin_edges[::-1], **kwargs) + def test_argparser_returns(): args = lstbin_simple.lst_bin_arg_parser() assert args is not None + class Test_ReduceLSTBins: @classmethod def get_input_data( - cls, - nfreqs: int=3, npols: int = 1, nbls: int = 6, ntimes: tuple[int] = (4, ) + cls, nfreqs: int = 3, npols: int = 1, nbls: int = 6, ntimes: tuple[int] = (4,) ): data = np.random.random((nbls, nfreqs, npols)) # Make len(ntimes) LST bins, each with ntimes[i] time-entries, all the same # data. 
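        # For the defaults (nbls=6, nfreqs=3, npols=1) with ntimes=(5, 4),
        # this produces a list of two arrays shaped (5, 6, 3, 1) and
        # (4, 6, 3, 1), the second bin's values doubled by the (i + 1) factor:
        #   >>> [d.shape for d in data]
        #   [(5, 6, 3, 1), (4, 6, 3, 1)]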
- data = [np.array([data, ]*nt).reshape((nt,) + data.shape)*(i+1) for i, nt in enumerate(ntimes)] + data = [ + np.array( + [ + data, + ] + * nt + ).reshape((nt,) + data.shape) + * (i + 1) + for i, nt in enumerate(ntimes) + ] flags = [np.zeros(d.shape, dtype=bool) for d in data] nsamples = [np.ones(d.shape, dtype=float) for d in data] @@ -227,14 +262,19 @@ def get_input_data( def test_one_point_per_bin(self, benchmark): d, f, n = self.get_input_data(ntimes=(1,)) - rdc= benchmark(lstbin_simple.reduce_lst_bins, d, f, n) + rdc = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) - assert rdc['data'].shape == rdc['flags'].shape == rdc['std'].shape == rdc['nsamples'].shape + assert ( + rdc["data"].shape + == rdc["flags"].shape + == rdc["std"].shape + == rdc["nsamples"].shape + ) # reduce_data swaps the order of bls/times - dd = rdc['data'].swapaxes(0, 1) - ff = rdc['flags'].swapaxes(0, 1) - nn = rdc['nsamples'].swapaxes(0, 1) + dd = rdc["data"].swapaxes(0, 1) + ff = rdc["flags"].swapaxes(0, 1) + nn = rdc["nsamples"].swapaxes(0, 1) np.testing.assert_allclose(dd[0], d[0][0]) assert not np.any(ff) @@ -245,22 +285,27 @@ def test_zerosize_bin(self): print(d[0].shape, len(d)) rdc = lstbin_simple.reduce_lst_bins(d, f, n) - assert rdc['data'].shape[1] == 2 # 2 LST bins - assert np.all(np.isnan(rdc['data'][:, 0])) - assert np.all(rdc['flags'][:, 0]) - assert np.all(rdc['nsamples'][:, 0] == 0.0) + assert rdc["data"].shape[1] == 2 # 2 LST bins + assert np.all(np.isnan(rdc["data"][:, 0])) + assert np.all(rdc["flags"][:, 0]) + assert np.all(rdc["nsamples"][:, 0] == 0.0) - @pytest.mark.parametrize("ntimes", [(4, ), (5, 4)]) + @pytest.mark.parametrize("ntimes", [(4,), (5, 4)]) def test_multi_points_per_bin(self, ntimes, benchmark): d, f, n = self.get_input_data(ntimes=ntimes) rdc = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) - assert rdc['data'].shape == rdc['flags'].shape == rdc['std'].shape == rdc['nsamples'].shape + assert ( + rdc["data"].shape + == rdc["flags"].shape + == rdc["std"].shape + == rdc["nsamples"].shape + ) # reduce_data swaps the order of bls/times - dd = rdc['data'].swapaxes(0, 1) - ff = rdc['flags'].swapaxes(0, 1) - nn = rdc['nsamples'].swapaxes(0, 1) + dd = rdc["data"].swapaxes(0, 1) + ff = rdc["flags"].swapaxes(0, 1) + nn = rdc["nsamples"].swapaxes(0, 1) assert not np.any(ff) for lst in range(len(ntimes)): @@ -271,15 +316,19 @@ def test_multi_points_per_bin_flagged(self): d, f, n = self.get_input_data(ntimes=(4,)) f[0][2:] = True d[0][2:] = 1000.0 - rdc = lstbin_simple.reduce_lst_bins(d,f,n) + rdc = lstbin_simple.reduce_lst_bins(d, f, n) - assert rdc['data'].shape == rdc['flags'].shape == rdc['std'].shape == rdc['nsamples'].shape + assert ( + rdc["data"].shape + == rdc["flags"].shape + == rdc["std"].shape + == rdc["nsamples"].shape + ) # reduce_data swaps the order of bls/times - dd = rdc['data'].swapaxes(0, 1) - ff = rdc['flags'].swapaxes(0, 1) - nn = rdc['nsamples'].swapaxes(0, 1) - + dd = rdc["data"].swapaxes(0, 1) + ff = rdc["flags"].swapaxes(0, 1) + nn = rdc["nsamples"].swapaxes(0, 1) np.testing.assert_allclose(dd[0], d[0][0]) assert not np.any(ff) @@ -289,7 +338,8 @@ def test_get_med_mad(self): d, f, n = self.get_input_data(ntimes=(4,)) rdc = lstbin_simple.reduce_lst_bins(d, f, n, get_mad=True) - assert np.all(rdc['median'] == rdc['data']) + assert np.all(rdc["median"] == rdc["data"]) + def test_apply_calfile_rules(tmpdir_factory): direc = tmpdir_factory.mktemp("test_apply_calfile_rules") @@ -304,8 +354,8 @@ def test_apply_calfile_rules(tmpdir_factory): data_files, calfiles = 
lstbin_simple.apply_calfile_rules( [[str(d) for d in datas]], - calfile_rules = [('.uvh5', '.calfile')], - ignore_missing=False + calfile_rules=[(".uvh5", ".calfile")], + ignore_missing=False, ) assert len(data_files[0]) == 3 assert len(calfiles[0]) == 3 @@ -314,29 +364,28 @@ def test_apply_calfile_rules(tmpdir_factory): with pytest.raises(IOError, match="does not exist"): lstbin_simple.apply_calfile_rules( [[str(d) for d in datas]], - calfile_rules = [('.uvh5', '.calfile')], - ignore_missing=False + calfile_rules=[(".uvh5", ".calfile")], + ignore_missing=False, ) data_files, calfiles = lstbin_simple.apply_calfile_rules( [[str(d) for d in datas]], - calfile_rules = [('.uvh5', '.calfile')], - ignore_missing=True + calfile_rules=[(".uvh5", ".calfile")], + ignore_missing=True, ) assert len(data_files[0]) == 2 assert len(calfiles[0]) == 2 class Test_LSTAverage: - def test_sigma_clip_without_outliers(self, benchmark): - shape = (7,8,9) + shape = (7, 8, 9) np.random.seed(42) - data = np.random.normal(size=shape) + np.random.normal(size=shape)*1j + data = np.random.normal(size=shape) + np.random.normal(size=shape) * 1j nsamples = np.ones_like(data) flags = np.zeros_like(data, dtype=bool) - data_n, flg_n, std_n, norm_n, daysbinned =lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, daysbinned = lstbin_simple.lst_average( data=data, nsamples=nsamples, flags=flags, @@ -352,18 +401,20 @@ def test_sigma_clip_without_outliers(self, benchmark): ) assert data.shape == flg.shape == std.shape == norm.shape == nsamples.shape[1:] - np.testing.assert_allclose(data,data_n) + np.testing.assert_allclose(data, data_n) def test_average_repeated(self): - shape = (7,8,9) - _data = np.random.random(shape) + np.random.random(shape)*1j + shape = (7, 8, 9) + _data = np.random.random(shape) + np.random.random(shape) * 1j data = np.array([_data, _data, _data]) nsamples = np.ones_like(data) flags = np.zeros_like(data, dtype=bool) data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data=data, + nsamples=nsamples, + flags=flags, ) assert np.allclose(data_n, _data) @@ -374,7 +425,9 @@ def test_average_repeated(self): # Now flag the last "night" flags[-1] = True data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data=data, + nsamples=nsamples, + flags=flags, ) assert np.allclose(data_n, _data) @@ -386,19 +439,24 @@ def test_std_simple(self): shape = (5000, 1, 2, 2) # 1000 nights, doesn't matter what the other axis is. 
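        # The atol used below is 6 * std / sqrt(shape[0]): with std = 2 and
        # 5000 samples along the nights axis that is 12 / sqrt(5000) ~ 0.17,
        # i.e. a 6-sigma bound on the scatter of the mean.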
std = 2.0 - data = np.random.normal(scale=std, size=shape) + np.random.normal(scale=std, size=shape)*1j + data = ( + np.random.normal(scale=std, size=shape) + + np.random.normal(scale=std, size=shape) * 1j + ) nsamples = np.ones_like(data, dtype=float) flags = np.zeros_like(data, dtype=bool) data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data=data, + nsamples=nsamples, + flags=flags, ) # Check the averaged data is within 6 sigma of the population mean - np.testing.assert_allclose(data_n, 0.0, atol=std*6/np.sqrt(shape[0])) + np.testing.assert_allclose(data_n, 0.0, atol=std * 6 / np.sqrt(shape[0])) # Check the standard deviation is within 20% of the true value - np.testing.assert_allclose(std_n, std + std*1j, rtol=0.2) + np.testing.assert_allclose(std_n, std + std * 1j, rtol=0.2) assert not np.any(flg_n) @@ -420,16 +478,18 @@ def test_std(self, nsamples, flags): else: flags = np.random.random(shape) > 0.1 - data = np.random.normal(scale=std) + np.random.normal(scale=std)*1j + data = np.random.normal(scale=std) + np.random.normal(scale=std) * 1j flags = np.zeros(data.shape, dtype=bool) data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data=data, + nsamples=nsamples, + flags=flags, ) # Check the averaged data is within 6 sigma of the population mean - assert np.allclose(data_n, 0.0, atol=std*6/np.sqrt(shape[0])) + assert np.allclose(data_n, 0.0, atol=std * 6 / np.sqrt(shape[0])) # In reality the std is infinity where flags is True std[flags] = np.inf @@ -437,13 +497,13 @@ def test_std(self, nsamples, flags): sample_var_expectation = sve = w * (shape[0] - 1) # Check the standard deviation is within 20% of the true value - np.testing.assert_allclose(std_n, np.sqrt(sve) + np.sqrt(sve)*1j, rtol=0.2) + np.testing.assert_allclose(std_n, np.sqrt(sve) + np.sqrt(sve) * 1j, rtol=0.2) assert not np.any(flg_n) def test_inpaint_mode(self): - shape = (3,2,4, 1) # nights, bls, freqs, pols - _data = np.random.random(shape) + np.random.random(shape)*1j + shape = (3, 2, 4, 1) # nights, bls, freqs, pols + _data = np.random.random(shape) + np.random.random(shape) * 1j nsamples = np.ones(_data.shape, dtype=float) flags = np.zeros(_data.shape, dtype=bool) @@ -462,7 +522,6 @@ def test_inpaint_mode(self): assert np.allclose(nf, ni) assert np.allclose(dbf, dbi) - # Now test with a whole LST bin flagged for a single bl-chan-pol (but inpainted) flags[:, 0, 0, 0] = True df, ff, stdf, nf, dbf = lstbin_simple.lst_average( @@ -475,10 +534,10 @@ def test_inpaint_mode(self): # The data and std in the fully-flagged bin should be different, but Nsamples # and Flags should be the same. - assert not np.allclose(df[0,0,0], di[0,0,0]) + assert not np.allclose(df[0, 0, 0], di[0, 0, 0]) assert np.allclose(df[1:], di[1:]) assert np.allclose(ff, fi) - assert not np.allclose(stdf[0,0,0], stdi[0,0,0]) + assert not np.allclose(stdf[0, 0, 0], stdi[0, 0, 0]) assert np.allclose(stdf[1:], stdi[1:]) assert np.allclose(nf, ni) assert np.allclose(dbf, dbi) @@ -501,125 +560,146 @@ def test_inpaint_mode(self): np.testing.assert_allclose(stdf, stdi) assert np.allclose(nf, ni) assert np.allclose(dbf, dbi) - + # However, if we had explicitly told the routine that the blt was inpainted, # we'd get a different result... 
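        # (The distinction being exercised here: in flagged mode, flagged
        # samples are masked to NaN and excluded from the average, while in
        # inpainted mode a sample that is flagged but marked inpainted is
        # kept as real data -- so declaring *everything* inpainted makes the
        # two modes disagree exactly where flags were set.)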
- _d, _f = lstbin_simple.get_masked_data(_data, nsamples, flags, inpainted=np.ones_like(flags),inpainted_mode=False) + _d, _f = lstbin_simple.get_masked_data( + _data, nsamples, flags, inpainted=np.ones_like(flags), inpainted_mode=False + ) df, ff, stdf, nf, dbf = lstbin_simple.lst_average( - data=_d, nsamples=nsamples, flags=_f, inpainted_mode=False, + data=_d, + nsamples=nsamples, + flags=_f, + inpainted_mode=False, ) - _d, _f = lstbin_simple.get_masked_data(_data, nsamples, flags, inpainted=np.ones_like(flags),inpainted_mode=True) + _d, _f = lstbin_simple.get_masked_data( + _data, nsamples, flags, inpainted=np.ones_like(flags), inpainted_mode=True + ) di, fi, stdi, ni, dbi = lstbin_simple.lst_average( - data=_d, nsamples=nsamples, flags=_f, inpainted_mode=True, + data=_d, + nsamples=nsamples, + flags=_f, + inpainted_mode=True, ) # The LST-binned data will be different for blt=0, pol=0: - assert not np.allclose(df[0,:,0], di[0,:,0]) + assert not np.allclose(df[0, :, 0], di[0, :, 0]) assert np.allclose(df[1:], di[1:]) assert np.allclose(ff, fi) - assert not np.allclose(stdf[0,:,0], stdi[0,:,0]) + assert not np.allclose(stdf[0, :, 0], stdi[0, :, 0]) assert np.allclose(stdf[1:], stdi[1:]) assert np.allclose(nf, ni) assert np.allclose(dbf, dbi) - - - def test_flag_below_min_N(self): - shape = (7,8,9, 2) - _data = np.random.random(shape) + np.random.random(shape)*1j + shape = (7, 8, 9, 2) + _data = np.random.random(shape) + np.random.random(shape) * 1j nsamples = np.ones(_data.shape, dtype=float) flags = np.zeros(_data.shape, dtype=bool) # No samples have more than min_N, so they should all be flagged. data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=8, - flag_below_min_N=True + data=_data, + nsamples=nsamples, + flags=flags, + sigma_clip_min_N=8, + flag_below_min_N=True, ) assert np.all(flg_n) - assert np.all(norm_n==7) # Even though they're flagged, we track the nsamples + assert np.all(norm_n == 7) # Even though they're flagged, we track the nsamples assert not np.all(np.isinf(std_n)) assert not np.all(np.isnan(data_n)) # this time, there's enough samples, but too many are flagged... flags[:5] = True data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=5, - flag_below_min_N=True + data=_data, + nsamples=nsamples, + flags=flags, + sigma_clip_min_N=5, + flag_below_min_N=True, ) assert np.all(flg_n) # just because we're flagging it, doesn't mean we need to set nsamples=0 # or the std to inf. We have info there, we're just choosing not to use it. - assert np.all(norm_n==2) + assert np.all(norm_n == 2) assert not np.any(np.isinf(std_n)) - + # this time, only one column is flagged too much... # this time, there's enough samples, but too many are flagged... 
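        # A quick tally of the case below: 7 nights per bin, the first 5
        # flagged for column 0 only, and sigma_clip_min_N=5. Column 0 is left
        # with 2 unflagged nights (< 5) and is flagged wholesale; every other
        # column keeps its 7 unflagged nights and stays unflagged.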
flags[:] = False flags[:5, 0] = True data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=5, - flag_below_min_N=True + data=_data, + nsamples=nsamples, + flags=flags, + sigma_clip_min_N=5, + flag_below_min_N=True, ) assert np.all(flg_n[0]) - assert np.all(norm_n[0]==2) + assert np.all(norm_n[0] == 2) assert not np.any(np.isinf(std_n[0])) print(np.sum(flg_n[1:]), flg_n[1:].size) assert not np.any(flg_n[1:]) - assert np.all(norm_n[1:]==7) + assert np.all(norm_n[1:] == 7) assert not np.any(np.isinf(std_n[1:])) assert not np.any(np.isnan(data_n[1:])) - + def create_small_array_uvd(identifiable: bool = False, **kwargs): kwargs.update( freqs=np.linspace(150e6, 160e6, 100), - ants=[0,1,2,127,128], - antpairs=[(0,0), (0,1), (0,2), (1, 1), (1,2), (2, 2)], - pols=('xx', 'yy') + ants=[0, 1, 2, 127, 128], + antpairs=[(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)], + pols=("xx", "yy"), ) if identifiable: return mockuvd.create_uvd_identifiable(**kwargs) else: return mockuvd.create_uvd_ones(**kwargs) + @pytest.fixture(scope="function") def uvd(): return create_small_array_uvd() + @pytest.fixture(scope="function") def uvd_redavg(): return create_small_array_uvd(redundantly_averaged=True) + @pytest.fixture(scope="function") def uvc(uvd): return UVCal.initialize_from_uvdata( uvd, - cal_style = "redundant", - gain_convention = "multiply", - jones_array = "linear", - cal_type = "gain", + cal_style="redundant", + gain_convention="multiply", + jones_array="linear", + cal_type="gain", metadata_only=False, ) + @pytest.fixture(scope="function") def uvd_file(uvd, tmpdir_factory) -> Path: # Write to file, so we can run lst_bin_files tmp = Path(tmpdir_factory.mktemp("test_partial_times")) - mock = tmp / 'mock.uvh5' + mock = tmp / "mock.uvh5" uvd.write_uvh5(str(mock), clobber=True) return mock + @pytest.fixture(scope="function") def uvd_redavg_file(uvd_redavg, tmpdir_factory) -> Path: # Write to file, so we can run lst_bin_files tmp = Path(tmpdir_factory.mktemp("test_partial_times")) - mock = tmp / 'mock.uvh5' + mock = tmp / "mock.uvh5" uvd_redavg.write_uvh5(str(mock), clobber=True) return mock @@ -628,29 +708,30 @@ def uvd_redavg_file(uvd_redavg, tmpdir_factory) -> Path: def uvc_file(uvc, uvd_file: Path) -> Path: # Write to file, so we can run lst_bin_files tmp = uvd_file.parent - fl = f'{tmp}/mock.calfits' + fl = f"{tmp}/mock.calfits" uvc.write_calfits(str(fl), clobber=True) return fl + class Test_LSTBinFilesForBaselines: def test_defaults(self, uvd, uvd_file): lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], - lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()+0.01], + lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max() + 0.01], antpairs=uvd.get_antpairs(), - rephase=False + rephase=False, ) lstbins, d, f, n, inpflg, times = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], - lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()+0.01], + lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max() + 0.01], antpairs=uvd.get_antpairs(), freqs=uvd.freq_array, pols=uvd.polarization_array, - time_idx = [np.ones(uvd.Ntimes, dtype=bool)], + time_idx=[np.ones(uvd.Ntimes, dtype=bool)], time_arrays=[np.unique(uvd.time_array)], - lsts = np.unique(uvd.lst_array), - rephase=False + lsts=np.unique(uvd.lst_array), + rephase=False, ) np.testing.assert_allclose(d0, d) @@ -663,7 +744,7 @@ def test_empty(self, uvd, uvd_file): data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), 
uvd.lst_array.max()], antpairs=[(127, 128)], - rephase=True + rephase=True, ) assert np.all(f0) @@ -674,17 +755,26 @@ def test_extra(self, uvd, uvd_file): data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], antpairs=uvd.get_antpairs() + [(127, 128)], - rephase=True + rephase=True, ) - assert np.all(f0[0][:, -1]) # last baseline is the extra one that's all flagged. + assert np.all( + f0[0][:, -1] + ) # last baseline is the extra one that's all flagged. def test_freqrange(self, uvd, uvd_file, uvc_file): """Test that providing freq_range works.""" - bins, data, flags, nsamples, inpflg, times = lstbin_simple.lst_bin_files_for_baselines( + ( + bins, + data, + flags, + nsamples, + inpflg, + times, + ) = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], - cal_files = [uvc_file], + cal_files=[uvc_file], freq_min=153e6, freq_max=158e6, antpairs=uvd.get_antpairs(), @@ -693,7 +783,7 @@ def test_freqrange(self, uvd, uvd_file, uvc_file): assert data[0].shape[-2] < uvd.Nfreqs def test_bad_pols(self, uvd, uvd_file): - with pytest.raises(KeyError, match='7'): + with pytest.raises(KeyError, match="7"): lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], @@ -702,7 +792,9 @@ def test_bad_pols(self, uvd, uvd_file): ) def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): - with pytest.raises(ValueError, match='reds must be provided if redundantly_averaged is True'): + with pytest.raises( + ValueError, match="reds must be provided if redundantly_averaged is True" + ): lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], @@ -710,7 +802,9 @@ def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): antpairs=uvd.get_antpairs(), ) - with pytest.raises(ValueError, match="Cannot apply calibration if redundantly_averaged is True"): + with pytest.raises( + ValueError, match="Cannot apply calibration if redundantly_averaged is True" + ): lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], @@ -718,7 +812,9 @@ def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): redundantly_averaged=True, reds=RedundantGroups.from_antpos( dict(zip(uvd.antenna_numbers, uvd.antenna_positions)), - pols=uvutils.polnum2str(uvd.polarization_array, x_orientation=uvd.x_orientation), + pols=uvutils.polnum2str( + uvd.polarization_array, x_orientation=uvd.x_orientation + ), ), antpairs=uvd.get_antpairs(), ) @@ -726,7 +822,10 @@ def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): def test_simple_redundant_averaged_file(self, uvd_redavg, uvd_redavg_file): lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_redavg_file], - lst_bin_edges=[uvd_redavg.lst_array.min()-0.1, uvd_redavg.lst_array.max()+0.1], + lst_bin_edges=[ + uvd_redavg.lst_array.min() - 0.1, + uvd_redavg.lst_array.max() + 0.1, + ], redundantly_averaged=True, rephase=False, antpairs=uvd_redavg.get_antpairs(), @@ -736,7 +835,13 @@ def test_simple_redundant_averaged_file(self, uvd_redavg, uvd_redavg_file): ) assert len(d0) == 1 - assert d0[0].shape == (uvd_redavg.Ntimes, uvd_redavg.Nbls, uvd_redavg.Nfreqs, uvd_redavg.Npols) + assert d0[0].shape == ( + uvd_redavg.Ntimes, + uvd_redavg.Nbls, + uvd_redavg.Nfreqs, + uvd_redavg.Npols, + ) + def test_make_lst_grid(): lst_grid = lstbin_simple.make_lst_grid(0.01, 
begin_lst=None) @@ -749,10 +854,13 @@ def test_make_lst_grid(): assert len(lst_grid) == 628 assert np.isclose(lst_grid[0], 3.1365901175171982) + class Test_GetAllUnflaggedBaselines: @pytest.mark.parametrize("redundantly_averaged", [True, False]) @pytest.mark.parametrize("only_last_file_per_night", [True, False]) - def test_get_all_baselines(self, tmp_path_factory, redundantly_averaged, only_last_file_per_night): + def test_get_all_baselines( + self, tmp_path_factory, redundantly_averaged, only_last_file_per_night + ): tmp = tmp_path_factory.mktemp("get_all_baselines") uvds = mockuvd.make_dataset( ndays=3, @@ -784,13 +892,19 @@ def test_bad_inputs(self, tmp_path_factory): ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) - with pytest.raises(ValueError, match='Cannot ignore antennas if the files are redundantly averaged'): + with pytest.raises( + ValueError, + match="Cannot ignore antennas if the files are redundantly averaged", + ): lstbin_simple.get_all_unflagged_baselines(data_files, ignore_ants=[0, 1]) - with pytest.raises(ValueError, match='Cannot exclude antennas if the files are redundantly averaged'): + with pytest.raises( + ValueError, + match="Cannot exclude antennas if the files are redundantly averaged", + ): lstbin_simple.get_all_unflagged_baselines( data_files, - ex_ant_yaml_files=['non-existent-file.yaml'], + ex_ant_yaml_files=["non-existent-file.yaml"], ) uvds_different_xorient = mockuvd.make_dataset( @@ -799,47 +913,58 @@ def test_bad_inputs(self, tmp_path_factory): ntimes=2, ants=np.arange(10), creator=create_small_array_uvd, - x_orientation='east', + x_orientation="east", redundantly_averaged=True, ) - data_files = mockuvd.write_files_in_hera_format(uvds + uvds_different_xorient, tmp) + data_files = mockuvd.write_files_in_hera_format( + uvds + uvds_different_xorient, tmp + ) - with pytest.raises(ValueError, match='Not all files have the same xorientation!'): + with pytest.raises( + ValueError, match="Not all files have the same xorientation!" 
+ ): lstbin_simple.get_all_unflagged_baselines(data_files) + class Test_LSTBinFiles: def test_golden_data(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("lstbin_golden_data") uvds = mockuvd.make_dataset( - ndays=3, nfiles=4, ntimes=2, identifiable=True, - creator=create_small_array_uvd, time_axis_faster_than_bls=True + ndays=3, + nfiles=4, + ntimes=2, + identifiable=True, + creator=create_small_array_uvd, + time_axis_faster_than_bls=True, ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) print(len(uvds)) cfl = tmp / "lstbin_config_file.yaml" print(cfl) lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, + cfl, + data_files, + ntimes_per_file=2, ) out_files = lstbin_simple.lst_bin_files( - config_file=cfl, rephase=False, - golden_lsts=uvds[0][1].lst_array.min() + 0.0001 + config_file=cfl, + rephase=False, + golden_lsts=uvds[0][1].lst_array.min() + 0.0001, ) assert len(out_files) == 4 - assert out_files[1]['GOLDEN'] + assert out_files[1]["GOLDEN"] assert not out_files[0]["GOLDEN"] assert not out_files[2]["GOLDEN"] assert not out_files[3]["GOLDEN"] uvd = UVData() - uvd.read(out_files[1]['GOLDEN']) - + uvd.read(out_files[1]["GOLDEN"]) # Read the Golden File - golden_hd = io.HERAData(out_files[1]['GOLDEN']) + golden_hd = io.HERAData(out_files[1]["GOLDEN"]) gd, gf, gn = golden_hd.read() assert gd.shape[0] == 3 # ndays @@ -850,8 +975,8 @@ def test_golden_data(self, tmp_path_factory): assert len(gd.keys()) == 12 # Check that autos are all the same - assert np.all(gd[(0,0,'ee')] == gd[(1, 1,'ee')]) - assert np.all(gd[(0,0,'ee')] == gd[(2, 2,'ee')]) + assert np.all(gd[(0, 0, "ee")] == gd[(1, 1, "ee")]) + assert np.all(gd[(0, 0, "ee")] == gd[(2, 2, "ee")]) # Since each day is at exactly the same LST by construction, the golden data # over time should be the same. 
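A note on why the per-day comparison above can be exact: the mock datasets pin
every day's LSTs to the same make_lst_grid() grid (create_mock_hera_obs snaps
lst_start onto the grid and converts via utils.LST2JD with
allow_other_jd=False), so day N and day N+1 sample identical LSTs. A minimal
sketch of that alignment, using the same calls as the mocks:

    lst0 = make_lst_grid(dlst, begin_lst=0.1)[0]
    lsts = lst0 + dlst * np.arange(ntimes)  # identical for every jdint
    times = utils.LST2JD(lsts, start_jd=jdint, allow_other_jd=False)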
@@ -861,19 +986,26 @@ def test_golden_data(self, tmp_path_factory): for day in data: np.testing.assert_allclose(data[0], day, atol=1e-6) - assert not np.allclose(gd[(0, 1, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 1, 'ee')][0]) - + assert not np.allclose(gd[(0, 1, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 1, "ee")][0]) def test_save_chans(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("lstbin_golden_data") - uvds = mockuvd.make_dataset(ndays=3, nfiles=4, ntimes=2, identifiable=True, creator=create_small_array_uvd) + uvds = mockuvd.make_dataset( + ndays=3, + nfiles=4, + ntimes=2, + identifiable=True, + creator=create_small_array_uvd, + ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cfl = tmp / "lstbin_config_file.yaml" lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, + cfl, + data_files, + ntimes_per_file=2, ) out_files = lstbin_simple.lst_bin_files( @@ -883,10 +1015,10 @@ def test_save_chans(self, tmp_path_factory): assert len(out_files) == 4 # Ensure there's a REDUCEDCHAN file for each output LST for fl in out_files: - assert fl['REDUCEDCHAN'] + assert fl["REDUCEDCHAN"] # Read the Golden File - hd = io.HERAData(fl['REDUCEDCHAN']) + hd = io.HERAData(fl["REDUCEDCHAN"]) gd, gf, gn = hd.read() assert gd.shape[0] == 3 # ndays @@ -897,8 +1029,8 @@ def test_save_chans(self, tmp_path_factory): assert len(gd.keys()) == 12 # Check that autos are all the same - assert np.all(gd[(0,0,'ee')] == gd[(1, 1,'ee')]) - assert np.all(gd[(0,0,'ee')] == gd[(2, 2,'ee')]) + assert np.all(gd[(0, 0, "ee")] == gd[(1, 1, "ee")]) + assert np.all(gd[(0, 0, "ee")] == gd[(2, 2, "ee")]) # Since each day is at exactly the same LST by construction, the golden data # over time should be the same. 
@@ -906,92 +1038,104 @@ def test_save_chans(self, tmp_path_factory): for day in data: np.testing.assert_allclose(data[0], day, rtol=1e-6) - assert not np.allclose(gd[(0, 1, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 1, 'ee')][0]) + assert not np.allclose(gd[(0, 1, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 1, "ee")][0]) def test_baseline_chunking(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("baseline_chunking") uvds = mockuvd.make_dataset( - ndays=3, nfiles=4, ntimes=2, + ndays=3, + nfiles=4, + ntimes=2, creator=mockuvd.create_uvd_identifiable, - antpairs = [(i,j) for i in range(10) for j in range(i, 10)], # 55 antpairs - pols = ['xx', 'yy'], + antpairs=[(i, j) for i in range(10) for j in range(i, 10)], # 55 antpairs + pols=["xx", "yy"], freqs=np.linspace(140e6, 180e6, 12), ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cfl = tmp / "lstbin_config_file.yaml" config_info = lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, + cfl, + data_files, + ntimes_per_file=2, ) out_files = lstbin_simple.lst_bin_files( - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", - write_med_mad=True + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.uvh5", + write_med_mad=True, ) out_files_chunked = lstbin_simple.lst_bin_files( - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.chunked.uvh5", + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.chunked.uvh5", Nbls_to_load=10, - write_med_mad=True + write_med_mad=True, ) for flset, flsetc in zip(out_files, out_files_chunked): - assert flset[('LST', False)] != flsetc[('LST', False)] + assert flset[("LST", False)] != flsetc[("LST", False)] uvdlst = UVData() - uvdlst.read(flset[('LST', False)]) + uvdlst.read(flset[("LST", False)]) uvdlstc = UVData() - uvdlstc.read(flsetc[('LST', False)]) + uvdlstc.read(flsetc[("LST", False)]) assert uvdlst == uvdlstc - assert flset[('MED', False)] != flsetc[('MED', False)] + assert flset[("MED", False)] != flsetc[("MED", False)] uvdlst = UVData() - uvdlst.read(flset[('MED', False)]) + uvdlst.read(flset[("MED", False)]) uvdlstc = UVData() - uvdlstc.read(flsetc[('MED', False)]) + uvdlstc.read(flsetc[("MED", False)]) assert uvdlst == uvdlstc - - def test_compare_nontrivial_cal( - self, tmp_path_factory - ): + + def test_compare_nontrivial_cal(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("nontrivial_cal") uvds = mockuvd.make_dataset( - ndays=3, nfiles=4, ntimes=2, + ndays=3, + nfiles=4, + ntimes=2, creator=mockuvd.create_uvd_identifiable, - antpairs = [(i,j) for i in range(7) for j in range(i, 7)], # 55 antpairs - pols = ('xx', 'yy'), + antpairs=[(i, j) for i in range(7) for j in range(i, 7)], # 55 antpairs + pols=("xx", "yy"), freqs=np.linspace(140e6, 180e6, 3), ) - uvcs = [ - [mockuvd.make_uvc_identifiable(d) for d in uvd ] for uvd in uvds - ] + uvcs = [[mockuvd.make_uvc_identifiable(d) for d in uvd] for uvd in uvds] for night in uvds: print([np.unique(night[0].lst_array)]) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cal_files = mockuvd.write_cals_in_hera_format(uvcs, tmp) - decal_files = [[df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files] + decal_files = [ + [df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files + ] for flist, clist, ulist in zip(data_files, cal_files, decal_files): 
for df, cf, uf in zip(flist, clist, ulist): apply_cal.apply_cal( - df, uf, cf, - gain_convention='divide', # go the wrong way + df, + uf, + cf, + gain_convention="divide", # go the wrong way clobber=True, ) # First, let's go the other way to check if we get the same thing back directly - recaled_files = [[df.replace(".uvh5", ".recal.uvh5") for df in dfl] for dfl in data_files] + recaled_files = [ + [df.replace(".uvh5", ".recal.uvh5") for df in dfl] for dfl in data_files + ] for flist, clist, ulist in zip(recaled_files, cal_files, decal_files): for df, cf, uf in zip(flist, clist, ulist): apply_cal.apply_cal( - uf, df, cf, - gain_convention='multiply', # go the wrong way + uf, + df, + cf, + gain_convention="multiply", # go the wrong way clobber=True, ) @@ -1006,32 +1150,39 @@ def test_compare_nontrivial_cal( cfl = tmp / "lstbin_config_file.yaml" lstbin_simple.make_lst_bin_config_file( - cfl, decal_files, ntimes_per_file=2, + cfl, + decal_files, + ntimes_per_file=2, ) out_files_recal = lstbin_simple.lst_bin_files( - config_file=cfl, calfile_rules=[(".decal.uvh5", ".calfits")], + config_file=cfl, + calfile_rules=[(".decal.uvh5", ".calfits")], fname_format="zen.{kind}.{lst:7.5f}.recal.uvh5", Nbls_to_load=10, - rephase=False + rephase=False, ) lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, clobber=True, + cfl, + data_files, + ntimes_per_file=2, + clobber=True, ) out_files = lstbin_simple.lst_bin_files( - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.uvh5", Nbls_to_load=11, - rephase=False + rephase=False, ) for flset, flsetc in zip(out_files, out_files_recal): - assert flset[('LST', False)] != flsetc[('LST', False)] + assert flset[("LST", False)] != flsetc[("LST", False)] uvdlst = UVData() - uvdlst.read(flset[('LST', False)]) + uvdlst.read(flset[("LST", False)]) uvdlstc = UVData() - uvdlstc.read(flsetc[('LST', False)]) + uvdlstc.read(flsetc[("LST", False)]) print(np.unique(uvdlstc.lst_array)) expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False) @@ -1044,9 +1195,15 @@ def test_compare_nontrivial_cal( # when we put in flags, we end up setting the data to nan (and # never using it...) 
np.testing.assert_allclose( - np.where(uvdlst.get_flags(ap+(pol,)), 1.0, uvdlstc.get_data(ap+(pol,))), - np.where(uvdlst.get_flags(ap+(pol,)), 1.0, expected[i, :, :, j]), - rtol=1e-4 + np.where( + uvdlst.get_flags(ap + (pol,)), + 1.0, + uvdlstc.get_data(ap + (pol,)), + ), + np.where( + uvdlst.get_flags(ap + (pol,)), 1.0, expected[i, :, :, j] + ), + rtol=1e-4, ) # Don't worry about history here, because we know they use different inputs @@ -1056,65 +1213,82 @@ def test_compare_nontrivial_cal( @pytest.mark.parametrize( "random_ants_to_drop, rephase, sigma_clip_thresh, flag_strategy, pols, freq_range", [ - (0, True, 0.0, (0,0,0), ('xx', 'yy'), None), - (0, True, 0.0, (0,0,0), ('xx', 'yy', 'xy', 'yx'), None), - (0, True, 0.0, (0,0,0), ('xx', 'yy'), (150e6, 180e6)), - (0, True, 0.0, (2,1,3), ('xx', 'yy'), None), - (0, True, 3.0, (0,0,0), ('xx', 'yy'), None), - (0, False, 0.0, (0,0,0), ('xx', 'yy'), None), - (3, True, 0.0, (0,0,0), ('xx', 'yy'), None), - ] + (0, True, 0.0, (0, 0, 0), ("xx", "yy"), None), + (0, True, 0.0, (0, 0, 0), ("xx", "yy", "xy", "yx"), None), + (0, True, 0.0, (0, 0, 0), ("xx", "yy"), (150e6, 180e6)), + (0, True, 0.0, (2, 1, 3), ("xx", "yy"), None), + (0, True, 3.0, (0, 0, 0), ("xx", "yy"), None), + (0, False, 0.0, (0, 0, 0), ("xx", "yy"), None), + (3, True, 0.0, (0, 0, 0), ("xx", "yy"), None), + ], ) def test_nontrivial_cal( - self, tmp_path_factory, random_ants_to_drop: int, rephase: bool, - sigma_clip_thresh: float, flag_strategy: tuple[int, int, int], - pols: tuple[str], freq_range: tuple[float, float] | None, - benchmark + self, + tmp_path_factory, + random_ants_to_drop: int, + rephase: bool, + sigma_clip_thresh: float, + flag_strategy: tuple[int, int, int], + pols: tuple[str], + freq_range: tuple[float, float] | None, + benchmark, ): - tmp = tmp_path_factory.mktemp("nontrivial_cal") uvds = mockuvd.make_dataset( - ndays=3, nfiles=2, ntimes=2, ants=np.arange(7), + ndays=3, + nfiles=2, + ntimes=2, + ants=np.arange(7), creator=mockuvd.create_uvd_identifiable, - pols = pols, + pols=pols, freqs=np.linspace(140e6, 180e6, 3), random_ants_to_drop=random_ants_to_drop, ) uvcs = [ - [mockuvd.make_uvc_identifiable(d, *flag_strategy) for d in uvd ] for uvd in uvds + [mockuvd.make_uvc_identifiable(d, *flag_strategy) for d in uvd] + for uvd in uvds ] data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cal_files = mockuvd.write_cals_in_hera_format(uvcs, tmp) - decal_files = [[df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files] + decal_files = [ + [df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files + ] for flist, clist, ulist in zip(data_files, cal_files, decal_files): for df, cf, uf in zip(flist, clist, ulist): apply_cal.apply_cal( - df, uf, cf, - gain_convention='divide', # go the wrong way + df, + uf, + cf, + gain_convention="divide", # go the wrong way clobber=True, ) cfl = tmp / "lstbin_config_file.yaml" lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, clobber=True, + cfl, + data_files, + ntimes_per_file=2, + clobber=True, ) out_files = benchmark( lstbin_simple.lst_bin_files, - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", - Nbls_to_load=11, rephase=rephase, + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.uvh5", + Nbls_to_load=11, + rephase=rephase, sigma_clip_thresh=sigma_clip_thresh, sigma_clip_min_N=2, freq_min=freq_range[0] if freq_range is not None else None, freq_max=freq_range[1] if freq_range is not None else None, - overwrite=True + overwrite=True, ) assert len(out_files) 
== 2
 
        for flset in out_files:
            uvdlst = UVData()
-            uvdlst.read(flset[('LST', False)])
+            uvdlst.read(flset[("LST", False)])
 
            # Don't worry about history here, because we know they use different inputs
            expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False)
@@ -1133,23 +1307,29 @@ def test_nontrivial_cal(
                    print(uvdlst.get_data(ap))
                    print(expected[i])
                    np.testing.assert_allclose(
-                        np.where(uvdlst.get_flags(ap+(pol,)), 1.0, uvdlst.get_data(ap+(pol,))),
-                        np.where(uvdlst.get_flags(ap+(pol,)), 1.0, expected[i, :, :, j]),
-                        rtol=1e-4 if (not rephase or (ap[0] == ap[1] and pol[0]==pol[1])) else 1e-3
+                        np.where(
+                            uvdlst.get_flags(ap + (pol,)),
+                            1.0,
+                            uvdlst.get_data(ap + (pol,)),
+                        ),
+                        np.where(
+                            uvdlst.get_flags(ap + (pol,)), 1.0, expected[i, :, :, j]
+                        ),
+                        rtol=1e-4
+                        if (not rephase or (ap[0] == ap[1] and pol[0] == pol[1]))
+                        else 1e-3,
                    )
 
-    @pytest.mark.parametrize("tell_it", (True, False))
-    def test_redundantly_averaged(
-        self, tmp_path_factory, tell_it
-    ):
-
+    @pytest.mark.parametrize("tell_it", (True, False))
+    def test_redundantly_averaged(self, tmp_path_factory, tell_it):
        tmp = tmp_path_factory.mktemp("nontrivial_cal")
        uvds = mockuvd.make_dataset(
-            ndays=3, nfiles=2, ntimes=2,
+            ndays=3,
+            nfiles=2,
+            ntimes=2,
            creator=mockuvd.create_uvd_identifiable,
-            antpairs = [(i,j) for i in range(7) for j in range(i, 7)],  # 55 antpairs
-            pols = ('xx', 'yx'),
+            antpairs=[(i, j) for i in range(7) for j in range(i, 7)],  # 28 antpairs
+            pols=("xx", "yx"),
            freqs=np.linspace(140e6, 180e6, 3),
            redundantly_averaged=True,
        )
@@ -1158,11 +1338,16 @@ def test_redundantly_averaged(
 
        cfl = tmp / "lstbin_config_file.yaml"
        config_info = lstbin_simple.make_lst_bin_config_file(
-            cfl, data_files, ntimes_per_file=2, clobber=True,
+            cfl,
+            data_files,
+            ntimes_per_file=2,
+            clobber=True,
        )
        out_files = lstbin_simple.lst_bin_files(
-            config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5",
-            Nbls_to_load=11, rephase=False,
+            config_file=cfl,
+            fname_format="zen.{kind}.{lst:7.5f}.uvh5",
+            Nbls_to_load=11,
+            rephase=False,
            sigma_clip_thresh=0.0,
            sigma_clip_min_N=2,
            redundantly_averaged=True if tell_it else None,
@@ -1172,7 +1357,7 @@ def test_redundantly_averaged(
 
        for flset in out_files:
            uvdlst = UVData()
-            uvdlst.read(flset[('LST', False)])
+            uvdlst.read(flset[("LST", False)])
 
            # Don't worry about history here, because we know they use different inputs
            expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False)
@@ -1190,18 +1375,22 @@ def test_redundantly_averaged(
                    # when we put in flags, we end up setting the data to 1.0 (and
                    # never using it...)
np.testing.assert_allclose(
-                        uvdlst.get_data(ap+(pol,)),
-                        np.where(uvdlst.get_flags(ap+(pol,)), 1.0, expected[i, :, :, j]),
-                        rtol=1e-4
+                        uvdlst.get_data(ap + (pol,)),
+                        np.where(
+                            uvdlst.get_flags(ap + (pol,)), 1.0, expected[i, :, :, j]
+                        ),
+                        rtol=1e-4,
                    )
 
    def test_output_file_select(self, tmp_path_factory):
        tmp = tmp_path_factory.mktemp("output_file_select")
        uvds = mockuvd.make_dataset(
-            ndays=3, nfiles=4, ntimes=2,
+            ndays=3,
+            nfiles=4,
+            ntimes=2,
            creator=mockuvd.create_uvd_identifiable,
-            antpairs = [(i,j) for i in range(4) for j in range(i, 4)],  # 55 antpairs
-            pols = ('xx', 'yx'),
+            antpairs=[(i, j) for i in range(4) for j in range(i, 4)],  # 10 antpairs
+            pols=("xx", "yx"),
            freqs=np.linspace(140e6, 180e6, 3),
        )
 
@@ -1209,33 +1398,41 @@ def test_output_file_select(self, tmp_path_factory):
 
        cfl = tmp / "lstbin_config_file.yaml"
        config_info = lstbin_simple.make_lst_bin_config_file(
-            cfl, data_files, ntimes_per_file=2, clobber=True,
+            cfl,
+            data_files,
+            ntimes_per_file=2,
+            clobber=True,
        )
 
        lstbf = partial(
-            lstbin_simple.lst_bin_files, config_file=cfl,
+            lstbin_simple.lst_bin_files,
+            config_file=cfl,
            fname_format="zen.{kind}.{lst:7.5f}.uvh5",
-            Nbls_to_load=11, rephase=False,
+            Nbls_to_load=11,
+            rephase=False,
        )
 
        out_files = lstbf(output_file_select=0)
        assert len(out_files) == 1
-
        out_files = lstbf(output_file_select=(1, 2))
        assert len(out_files) == 2
-        with pytest.raises(ValueError, match='output_file_select must be less than the number of output files'):
+        with pytest.raises(
+            ValueError,
+            match="output_file_select must be less than the number of output files",
+        ):
            lstbf(output_file_select=100)
-
    def test_inpaint_mode_no_flags(self, tmp_path_factory):
        """Test that using inpaint mode does nothing when there's no flags."""
        tmp = tmp_path_factory.mktemp("inpaint_no_flags")
        uvds = mockuvd.make_dataset(
-            ndays=3, nfiles=1, ntimes=2,
+            ndays=3,
+            nfiles=1,
+            ntimes=2,
            creator=mockuvd.create_uvd_identifiable,
-            antpairs = [(i,j) for i in range(3) for j in range(i, 3)],  # 55 antpairs
-            pols = ('xx', 'yx'),
+            antpairs=[(i, j) for i in range(3) for j in range(i, 3)],  # 6 antpairs
+            pols=("xx", "yx"),
            freqs=np.linspace(140e6, 180e6, 3),
            redundantly_averaged=True,
        )
@@ -1244,21 +1441,93 @@ def test_inpaint_mode_no_flags(self, tmp_path_factory):
 
        cfl = tmp / "lstbin_config_file.yaml"
        config_info = lstbin_simple.make_lst_bin_config_file(
-            cfl, data_files, ntimes_per_file=2, clobber=True,
+            cfl,
+            data_files,
+            ntimes_per_file=2,
+            clobber=True,
        )
        out_files = lstbin_simple.lst_bin_files(
-            config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5",
+            config_file=cfl,
+            fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5",
            rephase=False,
            sigma_clip_thresh=None,
            sigma_clip_min_N=2,
            output_flagged=True,
-            output_inpainted=True
+            output_inpainted=True,
        )
 
        assert len(out_files) == 1
 
        for flset in out_files:
-            flagged = UVData.from_file(flset[('LST', False)])
-            inpainted = UVData.from_file(flset[('LST', True)])
+            flagged = UVData.from_file(flset[("LST", False)])
+            inpainted = UVData.from_file(flset[("LST", True)])
 
            assert flagged == inpainted
+
+    def test_inpaint_mode_no_flags_where_inpainted(self, tmp_path_factory):
+        """Test that inpaint mode does nothing when there are no flags, even with where-inpainted files."""
+        tmp = tmp_path_factory.mktemp("inpaint_no_flags")
+        uvds = mockuvd.make_dataset(
+            ndays=3,
+            nfiles=1,
+            ntimes=2,
+            creator=mockuvd.create_uvd_identifiable,
+            antpairs=[(i, j) for i in range(3) for j in range(i, 3)],  # 6 antpairs
+            pols=("xx", "yx"),
+            freqs=np.linspace(140e6, 180e6, 3),
+            redundantly_averaged=True,
+        )
+
+        data_files = 
mockuvd.write_files_in_hera_format( + uvds, tmp, add_where_inpainted_files=True + ) + + cfl = tmp / "lstbin_config_file.yaml" + config_info = lstbin_simple.make_lst_bin_config_file( + cfl, + data_files, + ntimes_per_file=2, + clobber=True, + ) + out_files = lstbin_simple.lst_bin_files( + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5", + rephase=False, + sigma_clip_thresh=None, + sigma_clip_min_N=2, + output_flagged=True, + output_inpainted=True, + where_inpainted_file_rules=[(".uvh5", ".where_inpainted.h5")], + ) + + assert len(out_files) == 1 + + for flset in out_files: + flagged = UVData.from_file(flset[("LST", False)]) + inpainted = UVData.from_file(flset[("LST", True)]) + + assert flagged == inpainted + + +def test_get_where_inpainted(tmp_path_factory): + tmp: Path = tmp_path_factory.mktemp("get_where_inpainted") + + fls = [] + for outer in ["abc", "def"]: + these = [] + for filename in outer: + (tmp / f"{filename}.uvh5").touch() + (tmp / f"{filename}.where_inpainted.h5").touch() + these.append(tmp / f"{filename}.uvh5") + fls.append(these) + + out = lstbin_simple._get_where_inpainted_files( + fls, [(".uvh5", ".where_inpainted.h5")] + ) + + assert len(out) == 2 + assert len(out[0]) == 3 + assert len(out[1]) == 3 + + with pytest.raises(IOError, match="Where inpainted file"): + lstbin_simple._get_where_inpainted_files(fls, [(".uvh5", ".non_existent.h5")]) From 0af35b66194d1e9cb72c5ad0ae9cdcff648e9f8b Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 5 Sep 2023 03:27:47 -0700 Subject: [PATCH 20/81] test: add tests to cover corner cases --- hera_cal/io.py | 2 +- hera_cal/lstbin_simple.py | 10 +- hera_cal/tests/test_lstbin_simple.py | 139 ++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 10 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index 90230a9e6..0c7ad01b1 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -1600,7 +1600,7 @@ def load_flags(flagfile, filetype='h5', return_meta=False): elif filetype == 'h5': from pyuvdata import UVFlag uvf = UVFlag(flagfile) - assert uvf.mode == 'flag', 'The input h5-based UVFlag object must be in flag mode.' + assert uvf.mode == 'flag', f'The input h5-based UVFlag object must be in flag mode, got {uvf.mode}' assert (np.issubsctype(uvf.polarization_array.dtype, np.signedinteger) or np.issubsctype(uvf.polarization_array.dtype, np.str_)), \ "The input h5-based UVFlag object's polarization_array must be integers or byte strings." 
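A note on the lstbin_simple.py hunk that follows: `_get_where_inpainted_files` pairs each data file with its "where inpainted" flag file by applying simple (old, new) substring rules to the file name and then requiring the result to exist on disk, which is exactly what `test_get_where_inpainted` above exercises. A minimal standalone sketch of that mapping logic, assuming the same rule format (the function name here is illustrative, not part of the module):

    import os

    def map_inpainted_files(data_files, rules):
        """Sketch: mirror each data file name through (old, new) substring rules."""
        out = []
        for night in data_files:  # one list of paths per observing night
            this = []
            for df in night:
                wif = str(df)
                for old, new in rules:
                    wif = wif.replace(old, new)
                if not os.path.exists(wif):
                    raise IOError(f"Where inpainted file {wif} does not exist")
                this.append(wif)
            out.append(this)
        return out

With rules=[(".uvh5", ".where_inpainted.h5")], "zen.a.uvh5" maps to "zen.a.where_inpainted.h5", and a missing companion file fails loudly rather than being silently skipped.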
diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index d7e00fcab..a808b7881 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -343,8 +343,8 @@ def reduce_lst_bins( days_binned = np.zeros(out_nsamples.shape, dtype=int) if get_mad: - mad = np.zeros(out_data.shape, dtype=complex) - med = np.zeros(out_data.shape, dtype=complex) + mad = np.ones(out_data.shape, dtype=complex) + med = np.ones(out_data.shape, dtype=complex) if where_inpainted is None: where_inpainted = [None] * nlst_bins @@ -1039,12 +1039,12 @@ def _get_where_inpainted_files( for df in dflist: wif = str(df) for rule in where_inpainted_file_rules: - print(rule[0], "HA", rule[1]) wif = wif.replace(rule[0], rule[1]) if os.path.exists(wif): this.append(wif) else: raise IOError(f"Where inpainted file {wif} does not exist") + return where_inpainted_files @@ -1311,8 +1311,8 @@ def lst_bin_files_single_outfile( where_inpainted_files = _get_where_inpainted_files( data_files, where_inpainted_file_rules ) - output_inpainted, output_flagged = _configure_inpainted_mode( - output_inpainted, output_flagged, where_inpainted_files + output_flagged, output_inpainted = _configure_inpainted_mode( + output_flagged, output_inpainted, where_inpainted_files ) # Prune empty nights (some nights start with files, but have files removed because diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py index 1830dbefd..d17b1c924 100644 --- a/hera_cal/tests/test_lstbin_simple.py +++ b/hera_cal/tests/test_lstbin_simple.py @@ -3,14 +3,13 @@ from . import mock_uvdata as mockuvd import pytest from pathlib import Path -from pyuvdata import UVCal -from itertools import combinations_with_replacement +from pyuvdata import UVCal, UVFlag import numpy as np from hera_cal import lstbin_simple as lstbin_simple import pytest import numpy as np from pyuvdata import UVCal, UVData -from .. import io, utils, lstbin_simple, noise +from .. 
import io, utils, lstbin_simple
 from hera_cal import apply_cal
 from pyuvdata import utils as uvutils
 from hera_cal.red_groups import RedundantGroups
@@ -283,12 +282,14 @@ def test_one_point_per_bin(self, benchmark):
    def test_zerosize_bin(self):
        d, f, n = self.get_input_data(ntimes=(0, 1))
        print(d[0].shape, len(d))
-        rdc = lstbin_simple.reduce_lst_bins(d, f, n)
+        rdc = lstbin_simple.reduce_lst_bins(d, f, n, get_mad=True)
 
        assert rdc["data"].shape[1] == 2  # 2 LST bins
        assert np.all(np.isnan(rdc["data"][:, 0]))
        assert np.all(rdc["flags"][:, 0])
        assert np.all(rdc["nsamples"][:, 0] == 0.0)
+        assert np.all(np.isinf(rdc["mad"][:, 0]))
+        assert np.all(np.isnan(rdc["median"][:, 0]))
 
    @pytest.mark.parametrize("ntimes", [(4,), (5, 4)])
    def test_multi_points_per_bin(self, ntimes, benchmark):
@@ -650,6 +651,22 @@ def test_flag_below_min_N(self):
        assert not np.any(np.isinf(std_n[1:]))
        assert not np.any(np.isnan(data_n[1:]))
 
+    def test_sigma_clip_and_inpainted_warning(self):
+        """Test that a warning is raised if doing inpainted_mode as well as sigma-clip."""
+        shape = (7, 8, 9, 2)
+        _data = np.random.random(shape) + np.random.random(shape) * 1j
+        nsamples = np.ones(_data.shape, dtype=float)
+        flags = np.zeros(_data.shape, dtype=bool)
+
+        with pytest.warns(UserWarning, match="Sigma-clipping in in-painted mode"):
+            lstbin_simple.lst_average(
+                data=_data,
+                nsamples=nsamples,
+                flags=flags,
+                sigma_clip_thresh=2.0,
+                inpainted_mode=True,
+            )
+
 def create_small_array_uvd(identifiable: bool = False, **kwargs):
    kwargs.update(
@@ -1508,6 +1525,51 @@ def test_inpaint_mode_no_flags_where_inpainted(self, tmp_path_factory):
 
        assert flagged == inpainted
 
+    def test_where_inpainted_not_baseline_type(self, tmp_path_factory):
+        """Test that proper error is raised when using incorrect inpainted files."""
+        tmp = tmp_path_factory.mktemp("inpaint_not_baseline_type")
+        uvds = mockuvd.make_dataset(
+            ndays=3,
+            nfiles=1,
+            ntimes=2,
+            creator=mockuvd.create_uvd_identifiable,
+            antpairs=[(i, j) for i in range(3) for j in range(i, 3)],  # 6 antpairs
+            pols=("xx", "yx"),
+            freqs=np.linspace(140e6, 180e6, 3),
+            redundantly_averaged=True,
+        )
+
+        data_files = mockuvd.write_files_in_hera_format(
+            uvds, tmp, add_where_inpainted_files=True
+        )
+
+        # Now create dodgy where_inpainted files
+        inp = lstbin_simple._get_where_inpainted_files(
+            data_files, [(".uvh5", ".where_inpainted.h5")]
+        )
+        for fllist in inp:
+            for fl in fllist:
+                uvf = UVFlag()
+                uvf.read(fl)
+                uvf.to_waterfall()
+                uvf.to_flag()
+                uvf.write(fl.replace(".h5", ".waterfall.h5"), clobber=True)
+
+        cfl = tmp / "lstbin_config_file.yaml"
+        lstbin_simple.make_lst_bin_config_file(
+            cfl,
+            data_files,
+            ntimes_per_file=2,
+            clobber=True,
+        )
+        with pytest.raises(ValueError, match="to be a DataContainer"):
+            lstbin_simple.lst_bin_files(
+                config_file=cfl,
+                fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5",
+                output_flagged=False,
+                where_inpainted_file_rules=[(".uvh5", ".where_inpainted.waterfall.h5")],
+            )
+
 def test_get_where_inpainted(tmp_path_factory):
    tmp: Path = tmp_path_factory.mktemp("get_where_inpainted")
@@ -1531,3 +1593,72 @@ def test_get_where_inpainted(tmp_path_factory):
 
    with pytest.raises(IOError, match="Where inpainted file"):
        lstbin_simple._get_where_inpainted_files(fls, [(".uvh5", ".non_existent.h5")])
+
+
+def test_configure_inpainted_mode():
+    flg, inp = lstbin_simple._configure_inpainted_mode(
+        output_flagged=True, output_inpainted=True, where_inpainted_files=[]
+    )
+    assert flg
+    assert inp
+
+    flg, inp = lstbin_simple._configure_inpainted_mode(
output_flagged=True, output_inpainted=True, where_inpainted_files=["a_file.h5"] + ) + assert flg + assert inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=False, where_inpainted_files=[] + ) + assert flg + assert not inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=False, where_inpainted_files=["a_file.h5"] + ) + assert flg + assert not inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=None, where_inpainted_files=[] + ) + assert flg + assert not inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=None, where_inpainted_files=["a_file.h5"] + ) + assert flg + assert inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=False, output_inpainted=True, where_inpainted_files=[] + ) + assert not flg + assert inp + + with pytest.raises(ValueError, match="Both output_inpainted and output_flagged"): + lstbin_simple._configure_inpainted_mode( + output_flagged=False, output_inpainted=False, where_inpainted_files=[] + ) + + +class TestThresholdFlags: + def test_inplace(self): + flags = np.zeros((10, 12), dtype=bool) # shape = (ntime, ...) + flags[:8, 0] = True + + new = lstbin_simple.threshold_flags(flags, flag_thresh=0.7, inplace=True) + + assert np.all(flags[:, 0]) + assert np.all(new == flags) + + def test_not_inplace(self): + flags = np.zeros((10, 12), dtype=bool) # shape = (ntime, ...) + flags[:8, 0] = True + + new = lstbin_simple.threshold_flags(flags, flag_thresh=0.7, inplace=False) + + assert not np.all(flags[:, 0]) + assert np.all(new[:, 0]) From 6886f3b24807c74253dedd9be3875cc74bc58e75 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 5 Sep 2023 05:21:14 -0700 Subject: [PATCH 21/81] ci: make codecov upload more robust --- .github/workflows/ci.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3c24fd7d1..b2bcf0e9b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: PYTHON: ${{ matrix.python-version }} OS: ${{ matrix.os }} runs-on: ${{ matrix.os }} - + strategy: matrix: os: [ubuntu-latest, macos-latest] @@ -40,7 +40,12 @@ jobs: run: | pytest -n auto --pyargs hera_cal --cov=hera_cal --cov-config=./.coveragerc --cov-report xml:./coverage.xml --durations=15 - - name: Upload Coverage (Ubuntu) + - name: Upload coverage report if: matrix.os == 'ubuntu-latest' && success() - run: | - bash <(curl -s https://codecov.io/bash) -t ${{ secrets.CODECOV_TOKEN }} + uses: codecov/codecov-action@v3.1.3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./coverage.xml + flags: unittests + name: codecov-umbrella + fail_ci_if_error: true From d113c49a2aad5033eebf045bb5b3a5f547235c3d Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 5 Sep 2023 06:24:33 -0700 Subject: [PATCH 22/81] test: add new test of select_freqs --- hera_cal/lstbin_simple.py | 2 -- hera_cal/tests/test_datacontainer.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index a808b7881..507ad6490 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -181,8 +181,6 @@ def lst_align( nsamples = nsamples[lst_mask] data_lsts = data_lsts[lst_mask] grid_indices = grid_indices[lst_mask] - if where_inpainted is not None: - where_inpainted = where_inpainted[lst_mask] if freq_array is None or antpos is 
None: raise ValueError("freq_array and antpos is needed for rephase") diff --git a/hera_cal/tests/test_datacontainer.py b/hera_cal/tests/test_datacontainer.py index 683927005..29caf4e76 100644 --- a/hera_cal/tests/test_datacontainer.py +++ b/hera_cal/tests/test_datacontainer.py @@ -504,6 +504,9 @@ def test_select_freqs(self): with pytest.raises(ValueError, match='Cannot specify both freqs and channels'): dc1.select_freqs(fq, channels=chans) + with pytest.raises(ValueError, match="All freqs must be in self.freqs"): + dc1.select_freqs(np.array([100.0])) + # Ensure warning is raised without freqs dc1.freqs = None with pytest.warns(UserWarning, match='It is impossible'): From 89abe8fdc3ade2bc8058e3600c2ff3023b80576c Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 5 Sep 2023 06:28:13 -0700 Subject: [PATCH 23/81] chore: remove unnecessary comment --- hera_cal/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/hera_cal/utils.py b/hera_cal/utils.py index 21873c038..fd532a010 100644 --- a/hera_cal/utils.py +++ b/hera_cal/utils.py @@ -1671,9 +1671,6 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: return best_index try: - # We subtract one, because the file before the first file in the range - # may just get into the range by a tiny bit, and we allow the user to test - # that out later. first_file = get_first_file_in_range(metadata_list, (jdstart, jdend)) except ValueError: return [[] for _ in range(len(lst_edges) - 1)] From a3eaac75615d49bad7af76d0f8a8b63d256bddef Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 5 Sep 2023 11:09:51 -0700 Subject: [PATCH 24/81] fix: let where_inpainted be a RedDataContainer --- hera_cal/lstbin_simple.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 507ad6490..373539e3f 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -844,7 +844,7 @@ def lst_bin_files_for_baselines( ) if inpfile is not None: # This returns a DataContainer (unless something went wrong) since it should - # always be a 'baseline' type of UVFlag.s + # always be a 'baseline' type of UVFlag. inpainted = io.load_flags(inpfile) if not isinstance(inpainted, DataContainer): raise ValueError(f"Expected {inpfile} to be a DataContainer") @@ -881,6 +881,17 @@ def lst_bin_files_for_baselines( for i, bl in enumerate(antpairs): if redundantly_averaged: bl = keyed.get_ubl_key(bl) + + # Get the representative baseline key from this bl group that exists + # in the where_inpainted data. + if inpainted is not None: + for inpbl in reds[bl]: + if inpbl in inpainted: + break + else: + raise ValueError( + f"Could not find any baseline from group {bl} in inpainted file" + ) for j, pol in enumerate(pols): blpol = bl + (pol,) From 62a03e45bfbe947c6c3be21cb89c58f9c271d294 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 5 Sep 2023 11:59:56 -0700 Subject: [PATCH 25/81] fix: use blpol when checking existence --- hera_cal/lstbin_simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 373539e3f..71e43c876 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -886,7 +886,7 @@ def lst_bin_files_for_baselines( # in the where_inpainted data. 
if inpainted is not None: for inpbl in reds[bl]: - if inpbl in inpainted: + if inpbl + (pols[0],) in inpainted: break else: raise ValueError( From 34f70172fd092de684d65d579e3baf25de058bc7 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 6 Sep 2023 01:01:55 -0700 Subject: [PATCH 26/81] fix: actually use the where_inpainted blpol --- hera_cal/lstbin_simple.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 71e43c876..18a3102de 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -894,13 +894,14 @@ def lst_bin_files_for_baselines( ) for j, pol in enumerate(pols): blpol = bl + (pol,) + inp_blpol = inpbl + (pol,) if blpol in _data: # DataContainer takes care of conjugates. data[slc, i, :, j] = _data[blpol] flags[slc, i, :, j] = _flags[blpol] nsamples[slc, i, :, j] = _nsamples[blpol] if inpainted is not None: - where_inpainted[slc, i, :, j] = inpainted[blpol] + where_inpainted[slc, i, :, j] = inpainted[inp_blpol] else: # This baseline+pol doesn't exist in this file. That's # OK, we don't assume all baselines are in every file. From 3ba785249811fcfa0aa83b43fb9854b18c831f6d Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 6 Sep 2023 01:51:24 -0700 Subject: [PATCH 27/81] fix: only check for a bl in inpainted if it exists in data --- hera_cal/lstbin_simple.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 18a3102de..c43d66db6 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -882,26 +882,29 @@ def lst_bin_files_for_baselines( if redundantly_averaged: bl = keyed.get_ubl_key(bl) - # Get the representative baseline key from this bl group that exists - # in the where_inpainted data. - if inpainted is not None: - for inpbl in reds[bl]: - if inpbl + (pols[0],) in inpainted: - break - else: - raise ValueError( - f"Could not find any baseline from group {bl} in inpainted file" - ) for j, pol in enumerate(pols): blpol = bl + (pol,) - inp_blpol = inpbl + (pol,) if blpol in _data: # DataContainer takes care of conjugates. data[slc, i, :, j] = _data[blpol] flags[slc, i, :, j] = _flags[blpol] nsamples[slc, i, :, j] = _nsamples[blpol] + if inpainted is not None: - where_inpainted[slc, i, :, j] = inpainted[inp_blpol] + # Get the representative baseline key from this bl group that + # exists in the where_inpainted data. + if redundantly_averaged: + for inpbl in reds[bl]: + if inpbl + (pol,) in inpainted: + blpol = inpbl + (pol,) + break + else: + raise ValueError( + f"Could not find any baseline from group {bl} in " + "inpainted file" + ) + + where_inpainted[slc, i, :, j] = inpainted[blpol] else: # This baseline+pol doesn't exist in this file. That's # OK, we don't assume all baselines are in every file. 
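Patches 24 through 27 above converge on a single lookup rule: when the data are redundantly averaged, a where-inpainted container may be keyed by any member of a redundant-baseline group, so the binner must probe the group for a key the container actually holds before indexing it. A condensed sketch of the final behaviour, assuming `reds[bl]` lists the antpairs in bl's redundant group (the helper name here is illustrative, not the module's API):

    def find_inpainted_key(bl, pol, inpainted, reds, redundantly_averaged):
        """Return the (ant1, ant2, pol) key to use when indexing `inpainted`."""
        key = bl + (pol,)
        # DataContainer membership checks already account for conjugated keys.
        if not redundantly_averaged or key in inpainted:
            return key
        for inpbl in reds[bl]:
            if inpbl + (pol,) in inpainted:
                return inpbl + (pol,)
        raise ValueError(
            f"Could not find any baseline from group {bl} in inpainted file"
        )

Doing this probe per baseline and polarisation, and only for baselines that exist in the data, is what distinguishes the final version from the two intermediate attempts.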
From 1a83fac0799212321761d2caa0de20a1835730ca Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Fri, 8 Sep 2023 03:49:54 -0700 Subject: [PATCH 28/81] temporary prints for debug --- hera_cal/lstbin_simple.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index c43d66db6..2003cde76 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1453,6 +1453,7 @@ def lst_bin_files_single_outfile( f"lst_bin_edges={lst_bin_edges}" ) + logger.warning("BEFORE PARTIAL: ", outdir) # make it a bit easier to create the outfiles create_outfile = partial( create_lstbin_output_file, @@ -1522,6 +1523,7 @@ def lst_bin_files_single_outfile( slc = slice(nbls_so_far, nbls_so_far + len(bl_chunk)) + logger.warning("HEY HEY JUST BEFORE WRITIG: ", outdir) if bi == 0: # On the first baseline chunk, create the output file # TODO: we're not writing out the where_inpainted data for the GOLDEN @@ -1872,7 +1874,7 @@ def create_lstbin_output_file( if lst < lst_branch_cut: lst += 2 * np.pi - + logger.warning(f"IN CREATE LSTBIN OUTPUT FILE for {kind}: ", outdir) fname = outdir / fname_format.format( kind=kind, lst=lst, From 12c612ac6f89e967df504d0c952e19e0eeba588e Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Fri, 8 Sep 2023 05:11:35 -0700 Subject: [PATCH 29/81] temporary prints for debug fixed --- hera_cal/lstbin_simple.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 2003cde76..78eb4d18e 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1453,7 +1453,7 @@ def lst_bin_files_single_outfile( f"lst_bin_edges={lst_bin_edges}" ) - logger.warning("BEFORE PARTIAL: ", outdir) + logger.warning(f"BEFORE PARTIAL: {outdir}") # make it a bit easier to create the outfiles create_outfile = partial( create_lstbin_output_file, @@ -1523,7 +1523,7 @@ def lst_bin_files_single_outfile( slc = slice(nbls_so_far, nbls_so_far + len(bl_chunk)) - logger.warning("HEY HEY JUST BEFORE WRITIG: ", outdir) + logger.warning(f"HEY HEY JUST BEFORE WRITING: {outdir}") if bi == 0: # On the first baseline chunk, create the output file # TODO: we're not writing out the where_inpainted data for the GOLDEN @@ -1874,7 +1874,7 @@ def create_lstbin_output_file( if lst < lst_branch_cut: lst += 2 * np.pi - logger.warning(f"IN CREATE LSTBIN OUTPUT FILE for {kind}: ", outdir) + logger.warning(f"IN CREATE LSTBIN OUTPUT FILE for {kind}: {outdir}") fname = outdir / fname_format.format( kind=kind, lst=lst, From 30a22f1290e76ef0b4e6b7f7a54fde704559fac9 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Fri, 8 Sep 2023 05:30:27 -0700 Subject: [PATCH 30/81] temporary prints for debug fixed and added --- hera_cal/lstbin_simple.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 78eb4d18e..af83335f3 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1874,7 +1874,17 @@ def create_lstbin_output_file( if lst < lst_branch_cut: lst += 2 * np.pi - logger.warning(f"IN CREATE LSTBIN OUTPUT FILE for {kind}: {outdir}") + + herp = fname_format.format( + kind=kind, + lst=lst, + pol="".join(pols), + inpaint_mode="inpaint" + if inpaint_mode + else ("flagged" if inpaint_mode is False else ""), + ) + logger.warning(f"IN CREATE LSTBIN OUTPUT FILE for {kind}: {outdir} {fname_format}\n{herp}") + fname = outdir / fname_format.format( kind=kind, lst=lst, From 
ffd51c9b0dcc1cc843161f827c3b221ee4bf18d3 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Fri, 8 Sep 2023 05:49:30 -0700 Subject: [PATCH 31/81] remove / prefix from filename if it exists --- hera_cal/lstbin_simple.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index af83335f3..01e4fe567 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1453,7 +1453,6 @@ def lst_bin_files_single_outfile( f"lst_bin_edges={lst_bin_edges}" ) - logger.warning(f"BEFORE PARTIAL: {outdir}") # make it a bit easier to create the outfiles create_outfile = partial( create_lstbin_output_file, @@ -1523,7 +1522,6 @@ def lst_bin_files_single_outfile( slc = slice(nbls_so_far, nbls_so_far + len(bl_chunk)) - logger.warning(f"HEY HEY JUST BEFORE WRITING: {outdir}") if bi == 0: # On the first baseline chunk, create the output file # TODO: we're not writing out the where_inpainted data for the GOLDEN @@ -1875,17 +1873,7 @@ def create_lstbin_output_file( if lst < lst_branch_cut: lst += 2 * np.pi - herp = fname_format.format( - kind=kind, - lst=lst, - pol="".join(pols), - inpaint_mode="inpaint" - if inpaint_mode - else ("flagged" if inpaint_mode is False else ""), - ) - logger.warning(f"IN CREATE LSTBIN OUTPUT FILE for {kind}: {outdir} {fname_format}\n{herp}") - - fname = outdir / fname_format.format( + fname = fname_format.format( kind=kind, lst=lst, pol="".join(pols), @@ -1893,6 +1881,9 @@ def create_lstbin_output_file( if inpaint_mode else ("flagged" if inpaint_mode is False else ""), ) + # There's a weird gotcha with pathlib where if you do path / "/file.name" + # You get just "/file.name" which is in root. + fname = outdir / fname.removeprefix('/') logger.info(f"Initializing {fname}") From e634e17576ab9f5017e5cb8381e93a4d7b72c438 Mon Sep 17 00:00:00 2001 From: Bryna Hazelton Date: Thu, 7 Sep 2023 16:03:24 -0700 Subject: [PATCH 32/81] fix some things that will error in pyuvdata 3.0 --- hera_cal/io.py | 49 ++++++++++++++++++++++++++------------- hera_cal/smooth_cal.py | 5 +++- hera_cal/tests/test_io.py | 10 +++----- hera_cal/utils.py | 14 ++++++----- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index 0c7ad01b1..d08bc674d 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -1030,9 +1030,14 @@ def partial_write(self, output_path, data=None, flags=None, nsamples=None, # else: # make a copy of this object and then update the relevant arrays using DataContainers # this = copy.deepcopy(self) - hd_writer.write_uvh5_part(output_path, d, f, n, - run_check_acceptability=(output_path in self._writers), - **self.last_read_kwargs) + hd_writer.write_uvh5_part( + output_path, + data_array=d, + flag_array=f, + nsample_array=n, + run_check_acceptability=(output_path in self._writers), + **self.last_read_kwargs + ) def iterate_over_bls(self, Nbls=1, bls=None, chunk_by_redundant_group=False, reds=None, bl_error_tol=1.0, include_autos=True, frequencies=None): @@ -1435,8 +1440,12 @@ def _adapt_metadata(self, info_dict, skip_lsts=False): info_dict['antpairs'] = sorted(info_dict['bls']) info_dict['bls'] = sorted(set([ap + (pol, ) for ap in info_dict['antpairs'] for pol in info_dict['pols']])) XYZ = XYZ_from_LatLonAlt(info_dict['latitude'] * np.pi / 180, info_dict['longitude'] * np.pi / 180, info_dict['altitude']) - enu_antpos = ENU_from_ECEF(np.array([antpos for ant, antpos in info_dict['antpos'].items()]) + XYZ, - info_dict['latitude'] * np.pi / 180, info_dict['longitude'] * 
np.pi / 180, info_dict['altitude']) + enu_antpos = ENU_from_ECEF( + np.array([antpos for ant, antpos in info_dict['antpos'].items()]) + XYZ, + latitude=info_dict['latitude'] * np.pi / 180, + longitude=info_dict['longitude'] * np.pi / 180, + altitude=info_dict['altitude'] + ) info_dict['antpos'] = {ant: enu for enu, ant in zip(enu_antpos, info_dict['antpos'])} info_dict['data_antpos'] = {ant: info_dict['antpos'][ant] for ant in info_dict['data_ants']} info_dict['times'] = np.unique(info_dict['times']) @@ -1734,9 +1743,11 @@ def get_file_times(filepaths, filetype='uvh5'): _f[u'Header'][u'altitude'][()])) # figure out which baseline has the most times in order to handle BDA appropriately - baseline_array = uvutils.antnums_to_baseline(np.array(_f[u'Header'][u'ant_1_array']), - np.array(_f[u'Header'][u'ant_2_array']), - np.array(_f[u'Header'][u'Nants_telescope'])) + baseline_array = uvutils.antnums_to_baseline( + np.array(_f[u'Header'][u'ant_1_array']), + np.array(_f[u'Header'][u'ant_2_array']), + Nants_telescope=np.array(_f[u'Header'][u'Nants_telescope']) + ) most_common_bl_num = scipy.stats.mode(baseline_array, keepdims=True)[0][0] time_array = time_array[baseline_array == most_common_bl_num] lst_array = lst_array[baseline_array == most_common_bl_num] @@ -2114,18 +2125,25 @@ def write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags antenna_names = [f"HH{a}" for a in antenna_numbers] # get antenna positions in ITRF frame - tel_lat_lon_alt = uvutils.LatLonAlt_from_XYZ(telescope_location) + lat, lon, alt = uvutils.LatLonAlt_from_XYZ(telescope_location) antenna_positions = np.array([antpos[k] for k in antenna_numbers]) - antenna_positions = uvutils.ECEF_from_ENU(antenna_positions, *tel_lat_lon_alt) - telescope_location + antenna_positions = uvutils.ECEF_from_ENU( + antenna_positions, latitude=lat, longitude=lon, altitude=alt + ) - telescope_location # get times if time_array is None: if start_jd is None: raise AttributeError("if time_array is not fed, start_jd must be fed") - time_array = LST2JD(lst_array, start_jd, allow_other_jd=True, lst_branch_cut=lst_branch_cut, - latitude=(tel_lat_lon_alt[0] * 180 / np.pi), - longitude=(tel_lat_lon_alt[1] * 180 / np.pi), - altitude=tel_lat_lon_alt[2]) + time_array = LST2JD( + lst_array, + start_jd, + allow_other_jd=True, + lst_branch_cut=lst_branch_cut, + latitude=(lat * 180 / np.pi), + longitude=(lon * 180 / np.pi), + altitude=alt + ) Ntimes = len(time_array) # get freqs @@ -2450,7 +2468,6 @@ def write_cal(fname, gains, freqs, times, lsts=None, flags=None, quality=None, t # get time info time_array = np.array(times, float) Ntimes = len(time_array) - time_range = np.array([time_array.min(), time_array.max()], float) if len(time_array) > 1: integration_time = np.median(np.diff(time_array)) * 24. * 3600. 
else: @@ -2519,7 +2536,7 @@ def write_cal(fname, gains, freqs, times, lsts=None, flags=None, quality=None, t "ant_array", "antenna_numbers", "antenna_names", "cal_style", "history", "channel_width", "flag_array", "gain_array", "quality_array", "jones_array", "time_array", "lst_array", "spw_array", "freq_array", "history", "integration_time", - "time_range", "x_orientation", "telescope_name", "gain_convention", "total_quality_array"] + "x_orientation", "telescope_name", "gain_convention", "total_quality_array"] # create local parameter dict local_params = locals() diff --git a/hera_cal/smooth_cal.py b/hera_cal/smooth_cal.py index 7f04e5a26..2cbbb0bb8 100644 --- a/hera_cal/smooth_cal.py +++ b/hera_cal/smooth_cal.py @@ -648,7 +648,10 @@ def build_time_blacklist(time_grid, time_blacklists=[], lst_blacklists=[], lat_l raise NotImplementedError(f'No known position for telescope {telescope_name}. lat_lon_alt_degrees must be specified.') # calculate LST grid in hours from time grid and lat_lon_alt - lst_grid = pyuvdata.utils.get_lst_for_time(time_grid, *lat_lon_alt_degrees) * 12 / np.pi + lat, lon, alt = lat_lon_alt_degrees + lst_grid = pyuvdata.utils.get_lst_for_time( + time_grid, latitude=lat, longitude=lon, altitude=alt + ) * 12 / np.pi # add blacklisted times from lst_blacklists for bounds in lst_blacklists: diff --git a/hera_cal/tests/test_io.py b/hera_cal/tests/test_io.py index c447860b9..e28fba76a 100644 --- a/hera_cal/tests/test_io.py +++ b/hera_cal/tests/test_io.py @@ -779,6 +779,7 @@ def test_to_dataarray(self): nsample_array = hd.set_data_array_with_datacontainer(n, hd.nsample_array.copy()) assert nsample_array.dtype == n.dtype + class Test_ReadHeraHdf5(object): def setup_method(self): self.uvh5_1 = os.path.join(DATA_PATH, "zen.2458116.61019.xx.HH.XRS_downselected.uvh5") @@ -943,20 +944,16 @@ def test_comp_to_HERAData_dc(self, infile, bls, pols): hd2 = io.HERAData(indata) if bls == "first10": bls = hd2.get_antpairs()[:10] - print(bls) elif bls == "conjugated": bls = hd2.get_antpairs()[:15] bls = [(b, a) for a, b in bls] - d, f, n = hd.read(check=True,bls=bls, pols=pols) + d, f, n = hd.read(check=True, bls=bls, pols=pols) d2, f2, n2 = hd2.read(bls=bls, polarizations=pols) - print(d.antpairs()) - print(d.times_by_bl.keys()) # compare all data and metadata for dc1, dc2 in zip([d, f, n], [d2, f2, n2]): self.compare_datacontainers(dc1, dc2, allow_close=infile != 'uvh5_h4c') - @pytest.mark.parametrize( 'infile', (['uvh5_1'], 'uvh5_h4c') ) @@ -1758,6 +1755,7 @@ def test_throw_away_flagged_ants(tmpdir): else: assert ant in set(hdo.ant_1_array).union(set(hdo.ant_2_array)) + class Test_UVDataFromFastUVH5: """Test the uvdata_from_fastuvh5 function.""" @@ -1830,5 +1828,3 @@ def test_pass_metadata(self): uvd = io.uvdata_from_fastuvh5(self.meta_default, history='I made this!') assert uvd.history == 'I made this!' 
- - \ No newline at end of file diff --git a/hera_cal/utils.py b/hera_cal/utils.py index fd532a010..8ab68539c 100644 --- a/hera_cal/utils.py +++ b/hera_cal/utils.py @@ -119,7 +119,7 @@ def comply_pol(pol): compliant with pyuvdata and hera_cal.''' try: return _comply_vispol(pol) - except(KeyError): # happens if we have an antpol, not vispol + except KeyError: # happens if we have an antpol, not vispol return _comply_antpol(pol) @@ -399,7 +399,7 @@ def update(self): def get_params(self, ant_prms={'*': '*'}): try: prms = aipy.pol.AntennaArray.get_params(self, ant_prms) - except(IndexError): + except IndexError: return {} return prms @@ -411,17 +411,17 @@ def set_params(self, prms): try: top_pos[0] = prms[str(i)]['top_x'] ant_changed = True - except(KeyError): + except KeyError: pass try: top_pos[1] = prms[str(i)]['top_y'] ant_changed = True - except(KeyError): + except KeyError: pass try: top_pos[2] = prms[str(i)]['top_z'] ant_changed = True - except(KeyError): + except KeyError: pass if ant_changed: # rotate from zenith to equatorial, convert from meters to ns @@ -539,7 +539,9 @@ def JD2LST(JD, latitude=-30.721526120689507, longitude=21.428303826863015, altit JD = [JD] # use pyuvdata - LST = uvutils.get_lst_for_time(np.array(JD), latitude, longitude, altitude) + LST = uvutils.get_lst_for_time( + np.array(JD), latitude=latitude, longitude=longitude, altitude=altitude + ) if _array: return LST From 3eea16fba086b4916921d7cd75246fade507228f Mon Sep 17 00:00:00 2001 From: Bryna Hazelton Date: Fri, 8 Sep 2023 10:40:35 -0700 Subject: [PATCH 33/81] ensure current compatibility --- hera_cal/io.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index d08bc674d..caa90bade 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -9,6 +9,7 @@ import os import copy import warnings +from packaging import version from functools import reduce from collections.abc import Iterable from pyuvdata import UVCal, UVData @@ -31,6 +32,7 @@ from contextlib import contextmanager from functools import lru_cache from pyuvdata.uvdata import FastUVH5Meta +import pyuvdata try: import aipy @@ -1030,14 +1032,19 @@ def partial_write(self, output_path, data=None, flags=None, nsamples=None, # else: # make a copy of this object and then update the relevant arrays using DataContainers # this = copy.deepcopy(self) - hd_writer.write_uvh5_part( - output_path, - data_array=d, - flag_array=f, - nsample_array=n, - run_check_acceptability=(output_path in self._writers), - **self.last_read_kwargs - ) + if version.parse(pyuvdata.__version__) < version.parse("3.0"): + hd_writer.write_uvh5_part(output_path, d, f, n, + run_check_acceptability=(output_path in self._writers), + **self.last_read_kwargs) + else: + hd_writer.write_uvh5_part( + output_path, + data_array=d, + flag_array=f, + nsample_array=n, + run_check_acceptability=(output_path in self._writers), + **self.last_read_kwargs + ) def iterate_over_bls(self, Nbls=1, bls=None, chunk_by_redundant_group=False, reds=None, bl_error_tol=1.0, include_autos=True, frequencies=None): From 16430d2702519ce0eafc56cabb2b2d061c7cb91f Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Sun, 10 Sep 2023 05:36:55 -0700 Subject: [PATCH 34/81] tmp: print debug info --- hera_cal/lstbin_simple.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 01e4fe567..29e2774c7 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -851,6 +851,7 @@ def 
lst_bin_files_for_baselines( # We need to down-selecton times/freqs (bls and pols will be sub-selected # based on those in the data through the next loop) + print("HERE I AM: ", meta.path, inpfile, tarr, inpainted.times) inpainted.select_or_expand_times(new_times=tarr, skip_bda_check=True) inpainted.select_freqs(channels=freq_chans) else: From 11323002a25a27bba7a599a0d5f319eafc22339b Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Sun, 10 Sep 2023 05:52:11 -0700 Subject: [PATCH 35/81] tmp: print debug info --- hera_cal/lstbin_simple.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 29e2774c7..df1875369 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1422,6 +1422,11 @@ def lst_bin_files_single_outfile( [x - dlst / 2 for x in lst_bins] + [lst_bins[-1] + dlst / 2] ) + print('JUST BEFORE: ') + for dlist, inplist in zip(data_files, where_inpainted_files): + for df, inp in zip(dlist, inplist): + print(df.name, inp.name) + ( tinds, time_arrays, @@ -1435,6 +1440,12 @@ def lst_bin_files_single_outfile( input_cals, where_inpainted_files, ) + + print('JUST AFTER: ') + for dlist, inplist in zip(data_files, where_inpainted_files): + for df, inp in zip(dlist, inplist): + print(df.name, inp.name) + # If we have no times at all for this file, just return if len(all_lsts) == 0: return {} From add972a7da1bf686b7dcd3ddaca28b736d52dee9 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Sun, 10 Sep 2023 05:57:39 -0700 Subject: [PATCH 36/81] tmp: print debug info --- hera_cal/lstbin_simple.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index df1875369..4b4d1fb63 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1325,6 +1325,12 @@ def lst_bin_files_single_outfile( where_inpainted_files = _get_where_inpainted_files( data_files, where_inpainted_file_rules ) + + print('FRESH: ') + for dlist, inplist in zip(data_files, where_inpainted_files): + for df, inp in zip(dlist, inplist): + print(df, inp) + output_flagged, output_inpainted = _configure_inpainted_mode( output_flagged, output_inpainted, where_inpainted_files ) @@ -1425,7 +1431,7 @@ def lst_bin_files_single_outfile( print('JUST BEFORE: ') for dlist, inplist in zip(data_files, where_inpainted_files): for df, inp in zip(dlist, inplist): - print(df.name, inp.name) + print(df, inp) ( tinds, @@ -1444,7 +1450,7 @@ def lst_bin_files_single_outfile( print('JUST AFTER: ') for dlist, inplist in zip(data_files, where_inpainted_files): for df, inp in zip(dlist, inplist): - print(df.name, inp.name) + print(df, inp) # If we have no times at all for this file, just return if len(all_lsts) == 0: From ca4e3ee8cd942ff286807458cfdc9fea7f3a5f31 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Sun, 10 Sep 2023 06:12:49 -0700 Subject: [PATCH 37/81] tmp: print debug info --- hera_cal/lstbin_simple.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 4b4d1fb63..c826a417d 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1339,6 +1339,7 @@ def lst_bin_files_single_outfile( # they have no associated calibration) data_files = [df for df in data_files if df] input_cals = [cf for cf in input_cals if cf] + where_inpainted_files = [wif for wif in where_inpainted_files if wif] logger.info("Got the following numbers of data files per night:") for dflist in data_files: 
@@ -1412,6 +1413,11 @@ def lst_bin_files_single_outfile( f"{dflist[0].path} and {nants0} for {meta.path} for {meta.path}" ) + print('IN BETWEEN: ') + for dlist, inplist in zip(data_metas, where_inpainted_files): + for df, inp in zip(dlist, inplist): + print(df.path, inp) + # Split up the baselines into chunks that will be LST-binned together. # This is just to save on RAM. if Nbls_to_load is None: @@ -1448,9 +1454,8 @@ def lst_bin_files_single_outfile( ) print('JUST AFTER: ') - for dlist, inplist in zip(data_files, where_inpainted_files): - for df, inp in zip(dlist, inplist): - print(df, inp) + for fl, inp in zip(file_list, where_inpainted_files): + print(fl, inp) # If we have no times at all for this file, just return if len(all_lsts) == 0: From 811c800e7b2caeec4a8333324abfa621e44ce4ad Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Sun, 10 Sep 2023 06:26:50 -0700 Subject: [PATCH 38/81] fix: remove empty where_inpainted lists as well --- hera_cal/lstbin_simple.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index c826a417d..bd151e91f 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -851,7 +851,6 @@ def lst_bin_files_for_baselines( # We need to down-selecton times/freqs (bls and pols will be sub-selected # based on those in the data through the next loop) - print("HERE I AM: ", meta.path, inpfile, tarr, inpainted.times) inpainted.select_or_expand_times(new_times=tarr, skip_bda_check=True) inpainted.select_freqs(channels=freq_chans) else: @@ -1326,11 +1325,6 @@ def lst_bin_files_single_outfile( data_files, where_inpainted_file_rules ) - print('FRESH: ') - for dlist, inplist in zip(data_files, where_inpainted_files): - for df, inp in zip(dlist, inplist): - print(df, inp) - output_flagged, output_inpainted = _configure_inpainted_mode( output_flagged, output_inpainted, where_inpainted_files ) @@ -1413,11 +1407,6 @@ def lst_bin_files_single_outfile( f"{dflist[0].path} and {nants0} for {meta.path} for {meta.path}" ) - print('IN BETWEEN: ') - for dlist, inplist in zip(data_metas, where_inpainted_files): - for df, inp in zip(dlist, inplist): - print(df.path, inp) - # Split up the baselines into chunks that will be LST-binned together. # This is just to save on RAM. 
if Nbls_to_load is None: @@ -1434,11 +1423,6 @@ def lst_bin_files_single_outfile( [x - dlst / 2 for x in lst_bins] + [lst_bins[-1] + dlst / 2] ) - print('JUST BEFORE: ') - for dlist, inplist in zip(data_files, where_inpainted_files): - for df, inp in zip(dlist, inplist): - print(df, inp) - ( tinds, time_arrays, @@ -1453,10 +1437,6 @@ def lst_bin_files_single_outfile( where_inpainted_files, ) - print('JUST AFTER: ') - for fl, inp in zip(file_list, where_inpainted_files): - print(fl, inp) - # If we have no times at all for this file, just return if len(all_lsts) == 0: return {} From 3e293c23baed1e79416e4647c870e47a0ec65c06 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Fri, 15 Sep 2023 01:43:02 -0700 Subject: [PATCH 39/81] fix: allow where_inpainted_files to be none --- hera_cal/lstbin_simple.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index bd151e91f..2d28af9d8 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1333,7 +1333,8 @@ def lst_bin_files_single_outfile( # they have no associated calibration) data_files = [df for df in data_files if df] input_cals = [cf for cf in input_cals if cf] - where_inpainted_files = [wif for wif in where_inpainted_files if wif] + if where_inpainted_files is not None: + where_inpainted_files = [wif for wif in where_inpainted_files if wif] logger.info("Got the following numbers of data files per night:") for dflist in data_files: From 3bd8b890784c055b79fd42d06577a932ac40b3b5 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 19 Sep 2023 15:13:35 +0200 Subject: [PATCH 40/81] test: remove unnecessary script test --- hera_cal/tests/test_utils.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/hera_cal/tests/test_utils.py b/hera_cal/tests/test_utils.py index 67ad1e441..dd392f2fb 100644 --- a/hera_cal/tests/test_utils.py +++ b/hera_cal/tests/test_utils.py @@ -900,30 +900,6 @@ def test_select_spw_ranges_argparser(): assert args.spw_ranges == [(0, 20), (30, 100), (120, 150)] -def test_select_spw_ranges_run_script_code(tmpdir): - # test script code from scripts/test_select_spw_ranges.py - tmp_path = str(tmpdir) - new_cal = os.path.join(DATA_PATH, "test_input/zen.2458101.46106.xx.HH.uv.abs.calfits_54x_only") - uvh5 = os.path.join(DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5") - hd = io.HERAData(uvh5) - hd.read() - nf = hd.Nfreqs - output = os.path.join(tmp_path, 'test_calibrated_output.uvh5') - # construct bash script command - select_cmd = f'python ./scripts/select_spw_ranges.py {uvh5} {output} --clobber --spw_ranges 0~256,332~364,792~1000' - # and excecute inside of python - os.system(select_cmd) - # test that output has correct frequencies. - hdo = io.HERAData(output) - hdo.read() - assert np.allclose(hdo.freq_array, np.hstack([hd.freq_array[:256], hd.freq_array[332:364], hd.freq_array[792:1000]])) - freq_inds = np.hstack([np.arange(0, 256).astype(int), np.arange(332, 364).astype(int), np.arange(792, 1000).astype(int)]) - # and check that data, flags, nsamples make sense. 
- assert np.allclose(hdo.data_array, hd.data_array[:, freq_inds, :]) - assert np.allclose(hdo.flag_array, hd.flag_array[:, freq_inds, :]) - assert np.allclose(hdo.nsample_array, hd.nsample_array[:, freq_inds, :]) - - @pytest.fixture(scope='function') def tmpdir(tmp_path_factory): return tmp_path_factory.mktemp('test_match_files_to_lst_bins') From f6641363fbb62f8669d7305b88332bf651395a10 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 20 Sep 2023 09:49:28 +0200 Subject: [PATCH 41/81] temporary log.info for debugging --- hera_cal/lstbin_simple.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 507ad6490..060c29a81 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -584,6 +584,14 @@ def lst_average( # Multiply by nan instead of just setting as nan, so both real and imag parts are nan meandata[~normalizable] *= np.nan + logger.info(f"Number of flags in 1000th channel for bl 40: {np.sum(flags[:, 20, 1000, 0])}") + logger.info(f"Data: {data[:, 20, 1000, 0]}") + logger.info(f"Flags: {flags[:, 20, 1000, 0]}") + logger.info(f"Nsamples: {nsamples[:, 20, 1000, 0]}") + logger.info(f"Mean: {meandata[20, 1000, 0]}") + logger.info(f"Data Mask: {data.mask[:, 20, 1000, 0]}") + logger.info(f"NORM: {norm[20, 1000, 0]}") + # get other stats logger.info("Calculating std") with warnings.catch_warnings(): @@ -595,6 +603,8 @@ def lst_average( std[normalizable] /= norm[normalizable] std = np.sqrt(std.real) + 1j * np.sqrt(std.imag) + logger.info(f"STD: {std[20, 1000, 0]}") + std[~normalizable] = np.inf # While the previous norm is correct for normalizing the mean, we now @@ -1401,6 +1411,7 @@ def lst_bin_files_single_outfile( ] bl_chunks = [blg for blg in bl_chunks if len(blg) > 0] + logger.info(f"HERE IS BASELINE 20: {all_baselines[20]}") dlst = config_opts["dlst"] lst_bin_edges = np.array( [x - dlst / 2 for x in lst_bins] + [lst_bins[-1] + dlst / 2] From 182eb04ee71cf3abbeb791df57953af5b18f4576 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 20 Sep 2023 10:36:11 +0200 Subject: [PATCH 42/81] maint: ensure we don't use setuptools_scm=8.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 869b95217..e7d87af06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=30.3.0", "wheel", "setuptools_scm[toml]>=6.2"] +requires = ["setuptools>=30.3.0", "wheel", "setuptools_scm[toml]>=6.2,!=8.0"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] From 96f41e5edb7aa20c76c35630253034760f91851f Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 20 Sep 2023 13:33:02 +0200 Subject: [PATCH 43/81] fix: write MAD to MAD file, not STD --- hera_cal/lstbin_simple.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 729bb1f80..a516d525d 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1609,6 +1609,8 @@ def lst_bin_files_single_outfile( flags=rdc["flags"], nsamples=rdc["nsamples"], ) + + logger.info(f"JUST TO MAKE SURE: {rdc['std'][20,1000,0]}") write_baseline_slc_to_file( fl=out_files[("STD", inpainted)], slc=slc, @@ -1626,7 +1628,7 @@ def lst_bin_files_single_outfile( nsamples=rdc["nsamples"], ) write_baseline_slc_to_file( - fl=out_files[("STD", inpainted)], + fl=out_files[("MAD", inpainted)], slc=slc, data=rdc["mad"], flags=rdc["flags"], From 8351721554fb7b364eac4c6c7c513d8d7c1eb0d0 
Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 20 Sep 2023 13:47:07 +0200 Subject: [PATCH 44/81] remove unnecessary logging --- hera_cal/lstbin_simple.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index a516d525d..1b9536178 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -584,13 +584,13 @@ def lst_average( # Multiply by nan instead of just setting as nan, so both real and imag parts are nan meandata[~normalizable] *= np.nan - logger.info(f"Number of flags in 1000th channel for bl 40: {np.sum(flags[:, 20, 1000, 0])}") - logger.info(f"Data: {data[:, 20, 1000, 0]}") - logger.info(f"Flags: {flags[:, 20, 1000, 0]}") - logger.info(f"Nsamples: {nsamples[:, 20, 1000, 0]}") - logger.info(f"Mean: {meandata[20, 1000, 0]}") - logger.info(f"Data Mask: {data.mask[:, 20, 1000, 0]}") - logger.info(f"NORM: {norm[20, 1000, 0]}") + # logger.info(f"Number of flags in 1000th channel for bl 40: {np.sum(flags[:, 20, 1000, 0])}") + # logger.info(f"Data: {data[:, 20, 1000, 0]}") + # logger.info(f"Flags: {flags[:, 20, 1000, 0]}") + # logger.info(f"Nsamples: {nsamples[:, 20, 1000, 0]}") + # logger.info(f"Mean: {meandata[20, 1000, 0]}") + # logger.info(f"Data Mask: {data.mask[:, 20, 1000, 0]}") + # logger.info(f"NORM: {norm[20, 1000, 0]}") # get other stats logger.info("Calculating std") @@ -603,7 +603,7 @@ def lst_average( std[normalizable] /= norm[normalizable] std = np.sqrt(std.real) + 1j * np.sqrt(std.imag) - logger.info(f"STD: {std[20, 1000, 0]}") + # logger.info(f"STD: {std[20, 1000, 0]}") std[~normalizable] = np.inf @@ -1610,7 +1610,6 @@ def lst_bin_files_single_outfile( nsamples=rdc["nsamples"], ) - logger.info(f"JUST TO MAKE SURE: {rdc['std'][20,1000,0]}") write_baseline_slc_to_file( fl=out_files[("STD", inpainted)], slc=slc, From 372da724ae264f34b4e987e6c652c127a149851e Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 20 Sep 2023 15:05:03 +0200 Subject: [PATCH 45/81] retract superfluous commit --- hera_cal/lstbin_simple.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 060c29a81..507ad6490 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -584,14 +584,6 @@ def lst_average( # Multiply by nan instead of just setting as nan, so both real and imag parts are nan meandata[~normalizable] *= np.nan - logger.info(f"Number of flags in 1000th channel for bl 40: {np.sum(flags[:, 20, 1000, 0])}") - logger.info(f"Data: {data[:, 20, 1000, 0]}") - logger.info(f"Flags: {flags[:, 20, 1000, 0]}") - logger.info(f"Nsamples: {nsamples[:, 20, 1000, 0]}") - logger.info(f"Mean: {meandata[20, 1000, 0]}") - logger.info(f"Data Mask: {data.mask[:, 20, 1000, 0]}") - logger.info(f"NORM: {norm[20, 1000, 0]}") - # get other stats logger.info("Calculating std") with warnings.catch_warnings(): @@ -603,8 +595,6 @@ def lst_average( std[normalizable] /= norm[normalizable] std = np.sqrt(std.real) + 1j * np.sqrt(std.imag) - logger.info(f"STD: {std[20, 1000, 0]}") - std[~normalizable] = np.inf # While the previous norm is correct for normalizing the mean, we now @@ -1411,7 +1401,6 @@ def lst_bin_files_single_outfile( ] bl_chunks = [blg for blg in bl_chunks if len(blg) > 0] - logger.info(f"HERE IS BASELINE 20: {all_baselines[20]}") dlst = config_opts["dlst"] lst_bin_edges = np.array( [x - dlst / 2 for x in lst_bins] + [lst_bins[-1] + dlst / 2] From ab1af892f0079dfa934432b45f263f5100559e38 Mon Sep 17 00:00:00 2001 
From: Steven Murray Date: Wed, 20 Sep 2023 15:12:42 +0200 Subject: [PATCH 46/81] chore: remove unnecessary comment --- hera_cal/lstbin_simple.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 1b9536178..411340100 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -584,14 +584,6 @@ def lst_average( # Multiply by nan instead of just setting as nan, so both real and imag parts are nan meandata[~normalizable] *= np.nan - # logger.info(f"Number of flags in 1000th channel for bl 40: {np.sum(flags[:, 20, 1000, 0])}") - # logger.info(f"Data: {data[:, 20, 1000, 0]}") - # logger.info(f"Flags: {flags[:, 20, 1000, 0]}") - # logger.info(f"Nsamples: {nsamples[:, 20, 1000, 0]}") - # logger.info(f"Mean: {meandata[20, 1000, 0]}") - # logger.info(f"Data Mask: {data.mask[:, 20, 1000, 0]}") - # logger.info(f"NORM: {norm[20, 1000, 0]}") - # get other stats logger.info("Calculating std") with warnings.catch_warnings(): @@ -603,8 +595,6 @@ def lst_average( std[normalizable] /= norm[normalizable] std = np.sqrt(std.real) + 1j * np.sqrt(std.imag) - # logger.info(f"STD: {std[20, 1000, 0]}") - std[~normalizable] = np.inf # While the previous norm is correct for normalizing the mean, we now @@ -1429,7 +1419,6 @@ def lst_bin_files_single_outfile( ] bl_chunks = [blg for blg in bl_chunks if len(blg) > 0] - logger.info(f"HERE IS BASELINE 20: {all_baselines[20]}") dlst = config_opts["dlst"] lst_bin_edges = np.array( [x - dlst / 2 for x in lst_bins] + [lst_bins[-1] + dlst / 2] From eeb8485f9293eb122e856dc50d971b9f1d1a95d3 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 20 Sep 2023 15:33:46 +0200 Subject: [PATCH 47/81] fix: can't use removeprefix in py38 --- hera_cal/lstbin_simple.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 411340100..8fb00eacd 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -1888,7 +1888,9 @@ def create_lstbin_output_file( ) # There's a weird gotcha with pathlib where if you do path / "/file.name" # You get just "/file.name" which is in root. 
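A minimal sketch (illustrative, not part of the patch) of the gotcha described in the comment above, together with the Python 3.8-safe strip that the hunk below switches to; str.removeprefix only exists on Python 3.9+:

    from pathlib import Path

    # Joining a Path with an absolute string discards the left operand entirely.
    assert Path("/data/out") / "/zen.uvh5" == Path("/zen.uvh5")
    # Stripping the leading slash first keeps the file inside the output directory.
    fname = "/zen.uvh5"
    if fname.startswith('/'):
        fname = fname[1:]
    assert Path("/data/out") / fname == Path("/data/out/zen.uvh5")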
-    fname = outdir / fname.removeprefix('/')
+    if fname.startswith('/'):
+        fname = fname[1:]
+    fname = outdir / fname

     logger.info(f"Initializing {fname}")

From 70c11d679f171b0e624e7ecbd6183f883e2c48b3 Mon Sep 17 00:00:00 2001
From: Steven Murray
Date: Thu, 21 Sep 2023 10:36:46 +0200
Subject: [PATCH 48/81] test: add test to cover case where where-inpainted
 doesn't have bls

---
 hera_cal/tests/mock_uvdata.py        | 10 ++--
 hera_cal/tests/test_lstbin_simple.py | 83 +++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 8 deletions(-)

diff --git a/hera_cal/tests/mock_uvdata.py b/hera_cal/tests/mock_uvdata.py
index 9003d3b12..ee64e2afc 100644
--- a/hera_cal/tests/mock_uvdata.py
+++ b/hera_cal/tests/mock_uvdata.py
@@ -24,9 +24,9 @@
 with open(f"{DATA_PATH}/hera_antpos.yaml", "r") as fl:
     HERA_ANTPOS = yaml.safe_load(fl)

+PHASEII_FREQS = np.arange(
+    46920776.3671875, 234298706.0546875 + 10.0, 122070.3125
+)


 def create_mock_hera_obs(
@@ -35,9 +35,7 @@ def create_mock_hera_obs(
     lst_start=0.1,
     jd_start: float | None = None,
     ntimes: int = 2,
-    freqs: np.ndarray = np.arange(
-        46920776.3671875, 234298706.0546875 + 10.0, 122070.3125
-    ),
+    freqs: np.ndarray = PHASEII_FREQS,
     pols: list[str] = ["xx", "yy", "xy", "yx"],
     ants: list[int] | None = None,
     antpairs: list[tuple[int, int]] | None = None,
diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py
index d17b1c924..9ed847553 100644
--- a/hera_cal/tests/test_lstbin_simple.py
+++ b/hera_cal/tests/test_lstbin_simple.py
@@ -645,7 +645,6 @@ def test_flag_below_min_N(self):
         assert np.all(norm_n[0] == 2)
         assert not np.any(np.isinf(std_n[0]))

-        print(np.sum(flg_n[1:]), flg_n[1:].size)
         assert not np.any(flg_n[1:])
         assert np.all(norm_n[1:] == 7)
         assert not np.any(np.isinf(std_n[1:]))
@@ -859,6 +858,83 @@ def test_simple_redundant_averaged_file(self, uvd_redavg, uvd_redavg_file):
             uvd_redavg.Npols,
         )

+    def test_redavg_with_where_inpainted(self, tmp_path):
+        uvds = mockuvd.make_dataset(
+            ndays=2,
+            nfiles=3,
+            ntimes=2,
+            ants=np.arange(7),
+            creator=mockuvd.create_uvd_identifiable,
+            freqs=mockuvd.PHASEII_FREQS[:25],
+            pols=['xx', 'xy'],
+            redundantly_averaged=True,
+        )
+
+        uvd_files = mockuvd.write_files_in_hera_format(
+            uvds, tmp_path, add_where_inpainted_files=True
+        )
+
+        ap = uvds[0][0].get_antpairs()
+        reds = RedundantGroups.from_antpos(
+            dict(zip(uvds[0][0].antenna_numbers, uvds[0][0].antenna_positions)),
+        )
+        lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines(
+            data_files=sum(uvd_files, []),  # flatten the list-of-lists
+            lst_bin_edges=[0, 1.9 * np.pi],
+            redundantly_averaged=True,
+            rephase=False,
+            antpairs=ap,
+            reds=reds,
+            where_inpainted_files=[str(Path(f).with_suffix(".where_inpainted.h5")) for f in sum(uvd_files, [])],
+        )
+        assert len(lstbins) == 1
+
+        # Also test that if a where_inpainted file has missing baselines, an error is
+        # raised.
+        # This is kind of a dodgy way to test it: copy the original data files,
+        # write a whole new dataset in the same place but with fewer baselines, then
+        # copy the data files (but not the where_inpainted files) back, so they mismatch.
+ for flist in uvd_files: + for fl in flist: + fl = Path(fl) + fl.rename(fl.parent / f"{fl.with_suffix('.bk')}") + + winp = fl.with_suffix(".where_inpainted.h5") + winp.unlink() + + uvds = mockuvd.make_dataset( + ndays=2, + nfiles=3, + ntimes=2, + ants=np.arange(5), # less than the original + creator=mockuvd.create_uvd_identifiable, + freqs=mockuvd.PHASEII_FREQS[:25], + pols=['xx', 'xy'], + redundantly_averaged=True, + ) + + uvd_files = mockuvd.write_files_in_hera_format( + uvds, tmp_path, add_where_inpainted_files=True + ) + + # Move back the originals. + for flist in uvd_files: + for fl in flist: + fl = Path(fl) + fl.unlink() + (fl.parent / f"{fl.with_suffix('.bk')}").rename(fl) + + with pytest.raises(ValueError, match="Could not find any baseline from group"): + lstbin_simple.lst_bin_files_for_baselines( + data_files=sum(uvd_files, []), # flatten the list-of-lists + lst_bin_edges=[0, 1.9 * np.pi], + redundantly_averaged=True, + rephase=False, + antpairs=ap, + reds=reds, + where_inpainted_files=[str(Path(f).with_suffix(".where_inpainted.h5")) for f in sum(uvd_files, [])], + ) + def test_make_lst_grid(): lst_grid = lstbin_simple.make_lst_grid(0.01, begin_lst=None) @@ -1463,9 +1539,12 @@ def test_inpaint_mode_no_flags(self, tmp_path_factory): ntimes_per_file=2, clobber=True, ) + + # Additionally try fname format with leading / which should be removed + # automatically in the writing. out_files = lstbin_simple.lst_bin_files( config_file=cfl, - fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5", + fname_format="/zen.{kind}.{lst:7.5f}.{inpaint_mode}.uvh5", rephase=False, sigma_clip_thresh=None, sigma_clip_min_N=2, From 85d5cb5280c27630933d87a9c464b3a046eeebc4 Mon Sep 17 00:00:00 2001 From: heramgr Date: Mon, 25 Sep 2023 11:07:18 -0600 Subject: [PATCH 49/81] fixed duplicate kwarg passing for tophat_frfilter_run.py --- scripts/tophat_frfilter_run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/tophat_frfilter_run.py b/scripts/tophat_frfilter_run.py index 8a4b80986..4f9fb98fc 100644 --- a/scripts/tophat_frfilter_run.py +++ b/scripts/tophat_frfilter_run.py @@ -20,7 +20,6 @@ 'edgecut_low': ap.edgecut_low, 'gain': ap.gain} if ap.window == 'tukey': filter_kwargs['alpha'] = ap.alpha - filter_kwargs['flag_model_rms_outliers'] = ap.flag_model_rms_outliers elif ap.mode in ['dayenu', 'dpss_leastsq']: filter_kwargs = {'max_contiguous_edge_flags': ap.max_contiguous_edge_flags} From 959420a1234032ad0146a42622611bc46a375aec Mon Sep 17 00:00:00 2001 From: Bryna Hazelton Date: Fri, 20 Oct 2023 12:24:27 -0700 Subject: [PATCH 50/81] improve handling for future pyuvdata Test the thing that matters -- the function signature. 
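A minimal sketch of that signature-based feature detection, using the pyuvdata names from the hunk below (illustrative; it assumes only that write_uvh5_part accepts one of the two keyword spellings):

    import inspect
    from pyuvdata import UVData

    # Probe the keyword this pyuvdata version actually accepts, rather than
    # comparing version strings.
    params = inspect.signature(UVData.write_uvh5_part).parameters
    flag_kwarg = "flag_array" if "flag_array" in params else "flags_array"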
Fixes tests on pyuvdata PRs preparing for version 3.0 --- hera_cal/io.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index caa90bade..8413dd78c 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -9,7 +9,7 @@ import os import copy import warnings -from packaging import version +import inspect from functools import reduce from collections.abc import Iterable from pyuvdata import UVCal, UVData @@ -1032,19 +1032,18 @@ def partial_write(self, output_path, data=None, flags=None, nsamples=None, # else: # make a copy of this object and then update the relevant arrays using DataContainers # this = copy.deepcopy(self) - if version.parse(pyuvdata.__version__) < version.parse("3.0"): - hd_writer.write_uvh5_part(output_path, d, f, n, - run_check_acceptability=(output_path in self._writers), - **self.last_read_kwargs) + write_kwargs = { + "data_array": d, + "nsample_array": n, + "run_check_acceptability": (output_path in self._writers), + **self.last_read_kwargs, + } + # before pyuvdata 3.0, the "flag_array" parameter was called "flags_array" + if "flag_array" in inspect.signature(UVData.write_uvh5_part).parameters: + write_kwargs["flag_array"] = f else: - hd_writer.write_uvh5_part( - output_path, - data_array=d, - flag_array=f, - nsample_array=n, - run_check_acceptability=(output_path in self._writers), - **self.last_read_kwargs - ) + write_kwargs["flags_array"] = f + hd_writer.write_uvh5_part(output_path, **write_kwargs) def iterate_over_bls(self, Nbls=1, bls=None, chunk_by_redundant_group=False, reds=None, bl_error_tol=1.0, include_autos=True, frequencies=None): From 7cc9ec06776903319211d054e8ad32b4fd414c85 Mon Sep 17 00:00:00 2001 From: r-pascua Date: Mon, 30 Oct 2023 15:00:19 -0400 Subject: [PATCH 51/81] style: fix some style issues in FRF code Many of the lines were clipping in my text editor, so I inserted some line breaks to make it more readable. --- hera_cal/frf.py | 107 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 34 deletions(-) diff --git a/hera_cal/frf.py b/hera_cal/frf.py index e157134e7..42a6f5c83 100644 --- a/hera_cal/frf.py +++ b/hera_cal/frf.py @@ -1572,6 +1572,9 @@ def tophat_frfilter_argparser(mode='clean'): "and apply independent fringe-rate filters. Default is 1 (no interleaved filters).", "This does not change the format of the output files but it does change the nature of their content.") filt_options.add_argument("--ninterleave", default=1, type=int, help=desc) + filt_options.add_argument( + "--param_file", default="", type=str, help="File containing filter parameters" + ) return ap @@ -1672,6 +1675,8 @@ def load_tophat_frfilter_and_write(datafile_list, case, baseline_list=None, calf only used if case == 'uvbeam' ninterleave: int, optional Number of interleaved sets to run time filtering on. + param_file: str, optional + File containing filter parameters (e.g., centers and half-widths). 
filter_kwargs: additional keyword arguments to be passed to FRFilter.tophat_frfilter() ''' if baseline_list is not None and Nbls_per_load is not None: @@ -1730,30 +1735,49 @@ def load_tophat_frfilter_and_write(datafile_list, case, baseline_list=None, calf if len(keys) > 0: # Deal with interleaved sets frfil._deinterleave_data_in_time('data', ninterleave=ninterleave) - frfil._deinterleave_data_in_time('flags', ninterleave=ninterleave, set_time_sets=False) - frfil._deinterleave_data_in_time('nsamples', ninterleave=ninterleave, set_time_sets=False) + frfil._deinterleave_data_in_time( + 'flags', ninterleave=ninterleave, set_time_sets=False + ) + frfil._deinterleave_data_in_time( + 'nsamples', ninterleave=ninterleave, set_time_sets=False + ) # figure out frige rate centers and half-widths - assert case in ['sky', 'max_frate_coeffs', 'uvbeam'], f'case={case} is not valid.' - # use conservative nfr (lowest resolution set). - nfr = int(np.min([len(tset) for tset in frfil.time_sets])) - frate_centers, frate_half_widths = select_tophat_frates(uvd=frfil.hd, blvecs=frfil.blvecs, - case=case, keys=keys, uvb=uvb, - frate_standoff=frate_standoff, - frate_width_multiplier=frate_width_multiplier, - min_frate_half_width=min_frate_half_width, - max_frate_half_width=max_frate_half_width, - max_frate_coeffs=max_frate_coeffs, - percentile_low=percentile_low, - percentile_high=percentile_high, - fr_freq_skip=fr_freq_skip, - verbose=verbose, nfr=nfr) - # Lists of names of datacontainers that will hold each interleaved data set until they are - # recombined. - filtered_data_names = [f'clean_data_interleave_{inum}' for inum in range(ninterleave)] - filtered_flag_names = [fstr.replace('data', 'flags') for fstr in filtered_data_names] - filtered_resid_names = [fstr.replace('data', 'resid') for fstr in filtered_data_names] - filtered_model_names = [fstr.replace('data', 'model') for fstr in filtered_data_names] - filtered_resid_flag_names = [fstr.replace('data', 'resid_flags') for fstr in filtered_data_names] + if param_file: + pass + else: + assert case in ['sky', 'max_frate_coeffs', 'uvbeam'], f'case={case} is not valid.' + # use conservative nfr (lowest resolution set). + nfr = int(np.min([len(tset) for tset in frfil.time_sets])) + frate_centers, frate_half_widths = select_tophat_frates( + uvd=frfil.hd, blvecs=frfil.blvecs, + case=case, keys=keys, uvb=uvb, + frate_standoff=frate_standoff, + frate_width_multiplier=frate_width_multiplier, + min_frate_half_width=min_frate_half_width, + max_frate_half_width=max_frate_half_width, + max_frate_coeffs=max_frate_coeffs, + percentile_low=percentile_low, + percentile_high=percentile_high, + fr_freq_skip=fr_freq_skip, + verbose=verbose, nfr=nfr, + ) + # Lists of names of datacontainers that will hold each interleaved + # data set until they are recombined. 
+ filtered_data_names = [ + f'clean_data_interleave_{inum}' for inum in range(ninterleave) + ] + filtered_flag_names = [ + fstr.replace('data', 'flags') for fstr in filtered_data_names + ] + filtered_resid_names = [ + fstr.replace('data', 'resid') for fstr in filtered_data_names + ] + filtered_model_names = [ + fstr.replace('data', 'model') for fstr in filtered_data_names + ] + filtered_resid_flag_names = [ + fstr.replace('data', 'resid_flags') for fstr in filtered_data_names + ] for inum in range(ninterleave): @@ -1772,14 +1796,21 @@ def load_tophat_frfilter_and_write(datafile_list, case, baseline_list=None, calf is_blacklisted = (lsts >= lb[0] * np.pi / 12)\ & (lsts <= lb[1] * np.pi / 12) else: - is_blacklisted = (lsts >= lb[0] * np.pi / 12) | (lsts <= lb[1] * np.pi / 12) - wgts[k][is_blacklisted, :] = wgts[k][is_blacklisted, :] * blacklist_wgt + is_blacklisted = (lsts >= lb[0] * np.pi / 12) + | (lsts <= lb[1] * np.pi / 12) + wgts[k][is_blacklisted, :] = ( + wgts[k][is_blacklisted, :] * blacklist_wgt + ) # run tophat filter - frfil.tophat_frfilter(frate_centers=frate_centers, frate_half_widths=frate_half_widths, - keys=keys, verbose=verbose, wgts=wgts, flags=getattr(frfil, f'flags_interleave_{inum}'), - data=getattr(frfil, f'data_interleave_{inum}'), output_postfix=f'interleave_{inum}', - times=frfil.time_sets[inum] * SDAY_SEC * 1e-3, - **filter_kwargs) + frfil.tophat_frfilter( + frate_centers=frate_centers, frate_half_widths=frate_half_widths, + keys=keys, verbose=verbose, wgts=wgts, + flags=getattr(frfil, f'flags_interleave_{inum}'), + data=getattr(frfil, f'data_interleave_{inum}'), + output_postfix=f'interleave_{inum}', + times=frfil.time_sets[inum] * SDAY_SEC * 1e-3, + **filter_kwargs + ) frfil._interleave_data_in_time(filtered_data_names, 'clean_data') frfil._interleave_data_in_time(filtered_flag_names, 'clean_flags') @@ -1804,10 +1835,18 @@ def load_tophat_frfilter_and_write(datafile_list, case, baseline_list=None, calf frfil.clean_model[bl] = np.zeros_like(frfil.data[bl]) frfil.clean_resid_flags[bl] = frfil.flags[bl] - frfil.write_filtered_data(res_outfilename=res_outfilename, CLEAN_outfilename=CLEAN_outfilename, - filled_outfilename=filled_outfilename, partial_write=Nbls_per_load < len(baseline_list), - clobber=clobber, add_to_history=add_to_history, - extra_attrs={'Nfreqs': frfil.hd.Nfreqs, 'freq_array': frfil.hd.freq_array, 'channel_width': frfil.hd.channel_width, 'flex_spw_id_array': frfil.hd.flex_spw_id_array}) + frfil.write_filtered_data( + res_outfilename=res_outfilename, CLEAN_outfilename=CLEAN_outfilename, + filled_outfilename=filled_outfilename, + partial_write=Nbls_per_load < len(baseline_list), + clobber=clobber, add_to_history=add_to_history, + extra_attrs={ + 'Nfreqs': frfil.hd.Nfreqs, + 'freq_array': frfil.hd.freq_array, + 'channel_width': frfil.hd.channel_width, + 'flex_spw_id_array': frfil.hd.flex_spw_id_array + }, + ) frfil.hd.data_array = None # this forces a reload in the next loop From 9891e4b909820ce03265a71bb9e0fe104fb89eef Mon Sep 17 00:00:00 2001 From: r-pascua Date: Mon, 30 Oct 2023 17:30:03 -0400 Subject: [PATCH 52/81] feat: specify frate filter parameters from a file Also some more cleaning up of the filtering code. --- hera_cal/frf.py | 416 +++++++++++++++++++-------------- scripts/tophat_frfilter_run.py | 16 +- 2 files changed, 253 insertions(+), 179 deletions(-) diff --git a/hera_cal/frf.py b/hera_cal/frf.py index 42a6f5c83..b05f9ea73 100644 --- a/hera_cal/frf.py +++ b/hera_cal/frf.py @@ -27,6 +27,8 @@ import astropy.constants as const from . 
import redcal
 from .utils import echo
+import os
+import yaml

 SPEED_OF_LIGHT = const.c.si.value
 SDAY_SEC = units.sday.to("s")
@@ -1579,20 +1581,22 @@ def tophat_frfilter_argparser(mode='clean'):
     return ap


-def load_tophat_frfilter_and_write(datafile_list, case, baseline_list=None, calfile_list=None,
-                                   Nbls_per_load=None, spw_range=None, external_flags=None,
-                                   factorize_flags=False, time_thresh=0.05, wgt_by_nsample=False,
-                                   lst_blacklists=None, blacklist_wgt=0.0,
-                                   res_outfilename=None, CLEAN_outfilename=None, filled_outfilename=None,
-                                   clobber=False, add_to_history='', avg_red_bllens=False, polarizations=None,
-                                   overwrite_flags=False,
-                                   flag_yaml=None, skip_autos=False, beamfitsfile=None, verbose=False,
-                                   read_axis=None,
-                                   percentile_low=5., percentile_high=95.,
-                                   frate_standoff=0.0, frate_width_multiplier=1.0,
-                                   min_frate_half_width=0.025, max_frate_half_width=np.inf,
-                                   max_frate_coeffs=None, fr_freq_skip=1, ninterleave=1,
-                                   **filter_kwargs):
+def load_tophat_frfilter_and_write(
+    datafile_list, case, baseline_list=None, calfile_list=None,
+    Nbls_per_load=None, spw_range=None, external_flags=None,
+    factorize_flags=False, time_thresh=0.05, wgt_by_nsample=False,
+    lst_blacklists=None, blacklist_wgt=0.0,
+    res_outfilename=None, CLEAN_outfilename=None, filled_outfilename=None,
+    clobber=False, add_to_history='', avg_red_bllens=False, polarizations=None,
+    overwrite_flags=False,
+    flag_yaml=None, skip_autos=False, beamfitsfile=None, verbose=False,
+    read_axis=None,
+    percentile_low=5., percentile_high=95.,
+    frate_standoff=0.0, frate_width_multiplier=1.0,
+    min_frate_half_width=0.025, max_frate_half_width=np.inf,
+    max_frate_coeffs=None, fr_freq_skip=1, ninterleave=1, param_file="",
+    **filter_kwargs
+):
     ''' A tophat fr-filtering method that only simultaneously loads and writes
     user-provided list of baselines. This is to support parallelization over
     baseline (rather than time) if baseline_list is specified.
@@ -1681,173 +1685,237 @@
     '''
     if baseline_list is not None and Nbls_per_load is not None:
         raise NotImplementedError("baseline loading and partial i/o not yet implemented.")
-    hd = io.HERAData(datafile_list, filetype='uvh5')
     if baseline_list is not None and len(baseline_list) == 0:
-        warnings.warn("Length of baseline list is zero."
-                      "This can happen under normal circumstances when there are more files in datafile_list then baselines."
-                      "in your dataset. Exiting without writing any output.", RuntimeWarning)
+        warnings.warn(
+            "Length of baseline list is zero. This can happen under normal "
+            "circumstances when there are more files in datafile_list than baselines "
+            "in your dataset. Exiting without writing any output.", RuntimeWarning
+        )
+        return
+
+    hd = io.HERAData(datafile_list, filetype='uvh5')
+    # Figure out which baselines to load if not provided.
+    if baseline_list is None:
+        if len(hd.filepaths) > 1:
+            baseline_list = list(hd.antpairs.values())[0]
+        else:
+            baseline_list = hd.antpairs
+
+    # Figure out which frequencies to load.
+    if spw_range is None:
+        spw_range = [0, hd.Nfreqs]
+    freqs = hd.freq_array.flatten()[spw_range[0]:spw_range[1]]
+
+    # Figure out which antennas to load if calfiles are provided.
+    baseline_antennas = []
+    for blpolpair in baseline_list:
+        baseline_antennas += list(blpolpair[:2])
+    baseline_antennas = np.unique(baseline_antennas).astype(int)
+
+    # Read calibration solutions if provided.
+ if calfile_list is not None: + cals = io.HERACal(calfile_list) + cals.read(antenna_nums=baseline_antennas, frequencies=freqs) else: - if baseline_list is None: - if len(hd.filepaths) > 1: - baseline_list = list(hd.antpairs.values())[0] - else: - baseline_list = hd.antpairs - if spw_range is None: - spw_range = [0, hd.Nfreqs] - freqs = hd.freq_array.flatten()[spw_range[0]:spw_range[1]] - baseline_antennas = [] - for blpolpair in baseline_list: - baseline_antennas += list(blpolpair[:2]) - baseline_antennas = np.unique(baseline_antennas).astype(int) - if calfile_list is not None: - cals = io.HERACal(calfile_list) - cals.read(antenna_nums=baseline_antennas, frequencies=freqs) + cals = None + + # Figure out which polarizations to use. + if polarizations is None: + if len(hd.filepaths) > 1: + polarizations = list(hd.pols.values())[0] else: - cals = None - if polarizations is None: - if len(hd.filepaths) > 1: - polarizations = list(hd.pols.values())[0] - else: - polarizations = hd.pols - if Nbls_per_load is None: - Nbls_per_load = len(baseline_list) - for i in range(0, len(baseline_list), Nbls_per_load): - frfil = FRFilter(hd, input_cal=cals) - frfil.read(bls=baseline_list[i:i + Nbls_per_load], - frequencies=freqs, polarizations=polarizations, axis=read_axis) - if avg_red_bllens: - frfil.avg_red_baseline_vectors() - if external_flags is not None: - frfil.apply_flags(external_flags, overwrite_flags=overwrite_flags) - if flag_yaml is not None: - frfil.apply_flags(flag_yaml, overwrite_flags=overwrite_flags, filetype='yaml') - if factorize_flags: - frfil.factorize_flags(time_thresh=time_thresh, inplace=True) - keys = frfil.data.keys() - if skip_autos: - keys = [bl for bl in keys if bl[0] != bl[1]] - if beamfitsfile is not None: - uvb = UVBeam() - uvb.read_beamfits(beamfitsfile) - uvb.use_future_array_shapes() + polarizations = hd.pols + + # Load all baselines if a baseline list not provided. + if Nbls_per_load is None: + Nbls_per_load = len(baseline_list) + + # If a filter parameter file is provided, let's check that all of the + # baselines are present in the filter file. + if param_file and os.path.exists(param_file): + with open(param_file, "r") as f: + filter_info = yaml.load(f.read(), Loader=yaml.SafeLoader) + filter_antpairs = set(filter_info["filter_centers"].keys()) + have_bl_info = [ + (bl[:2] in filter_antpairs) or (bl[:2][::-1] in filter_antpairs) + for bl in baseline_list + ] + if not all(have_bl_info): + raise ValueError( + "Provided filter file doesn't have every baseline." + ) + + # Read the data in chunks and perform filtering. + for i in range(0, len(baseline_list), Nbls_per_load): + # Read data from this chunk of baselines. + frfil = FRFilter(hd, input_cal=cals) + frfil.read( + bls=baseline_list[i:i + Nbls_per_load], + frequencies=freqs, + polarizations=polarizations, + axis=read_axis, + ) + + # Some extra handling if requested. + if avg_red_bllens: + frfil.avg_red_baseline_vectors() + if external_flags is not None: + frfil.apply_flags(external_flags, overwrite_flags=overwrite_flags) + if flag_yaml is not None: + frfil.apply_flags(flag_yaml, overwrite_flags=overwrite_flags, filetype='yaml') + if factorize_flags: + frfil.factorize_flags(time_thresh=time_thresh, inplace=True) + + # Figure out which baselines we'll need for filtering. + keys = frfil.data.keys() + if skip_autos: + keys = [bl for bl in keys if bl[0] != bl[1]] + + # Read in the beam file if provided. 
+ if beamfitsfile is not None: + uvb = UVBeam() + uvb.read_beamfits(beamfitsfile) + uvb.use_future_array_shapes() + else: + uvb = None + + # Filter the data if there is data to filter. + if len(keys) > 0: + # Deal with interleaved sets + frfil._deinterleave_data_in_time('data', ninterleave=ninterleave) + frfil._deinterleave_data_in_time( + 'flags', ninterleave=ninterleave, set_time_sets=False + ) + frfil._deinterleave_data_in_time( + 'nsamples', ninterleave=ninterleave, set_time_sets=False + ) + + # Figure out fringe-rate centers and half-widths. + if param_file and os.path.exists(param_file): + # Read from a file if provided. + with open(param_file, "r") as f: + param_info = yaml.load(f.read(), Loader=yaml.SafeLoader) + frate_centers = {} + frate_half_widths = {} + # Assuming we use the same filters for all polarizations. + for key in keys: + ai, aj = key[:2] + if (ai, aj) in filter_antpairs: + frate_centers[key] = param_info["filter_centers"][(ai,aj)] + frate_half_widths[key] = param_info["filter_half_widths"][(ai,aj)] + else: + # We've already enforced that all the data baselines + # are in the filter file, so we should be safe here. + frate_centers[key] = -param_info["filter_centers"][(aj,ai)] + frate_centers[key] = param_info["filter_half_widths"][(aj,ai)] else: - uvb = None - if len(keys) > 0: - # Deal with interleaved sets - frfil._deinterleave_data_in_time('data', ninterleave=ninterleave) - frfil._deinterleave_data_in_time( - 'flags', ninterleave=ninterleave, set_time_sets=False + # Otherwise, we need to compute the filter parameters. + if case not in ("sky", "max_frate_coeffs", "uvbeam"): + raise ValueError(f"case={case} is not valid") + + # Use conservative nfr (lowest resolution set). + nfr = int(np.min([len(tset) for tset in frfil.time_sets])) + frate_centers, frate_half_widths = select_tophat_frates( + uvd=frfil.hd, blvecs=frfil.blvecs, + case=case, keys=keys, uvb=uvb, + frate_standoff=frate_standoff, + frate_width_multiplier=frate_width_multiplier, + min_frate_half_width=min_frate_half_width, + max_frate_half_width=max_frate_half_width, + max_frate_coeffs=max_frate_coeffs, + percentile_low=percentile_low, + percentile_high=percentile_high, + fr_freq_skip=fr_freq_skip, + verbose=verbose, nfr=nfr, ) - frfil._deinterleave_data_in_time( - 'nsamples', ninterleave=ninterleave, set_time_sets=False + # Lists of names of datacontainers that will hold each interleaved + # data set until they are recombined. 
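The lookup above encodes a sign convention worth spelling out: a baseline stored as (aj, ai) in the parameter file is the conjugate of the requested (ai, aj), so its fringe-rate center flips sign while its half-width is unchanged. A hedged sketch with a hypothetical helper name, not code from the patch:

    def lookup_filter(antpair, centers, half_widths):
        # Hypothetical helper, illustrative only.
        if antpair in centers:
            return centers[antpair], half_widths[antpair]
        rev = antpair[::-1]  # conjugate baseline: negate the center
        return -centers[rev], half_widths[rev]

    assert lookup_filter((54, 53), {(53, 54): 0.1}, {(53, 54): 0.05}) == (-0.1, 0.05)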
+ filtered_data_names = [ + f'clean_data_interleave_{inum}' for inum in range(ninterleave) + ] + filtered_flag_names = [ + fstr.replace('data', 'flags') for fstr in filtered_data_names + ] + filtered_resid_names = [ + fstr.replace('data', 'resid') for fstr in filtered_data_names + ] + filtered_model_names = [ + fstr.replace('data', 'model') for fstr in filtered_data_names + ] + filtered_resid_flag_names = [ + fstr.replace('data', 'resid_flags') for fstr in filtered_data_names + ] + + for inum in range(ninterleave): + + # Build weights using flags, nsamples, and exlcuded lsts + flags = getattr(frfil, f'flags_interleave_{inum}') + nsamples = getattr(frfil, f'nsamples_interleave_{inum}') + wgts = io.DataContainer({k: (~flags[k]).astype(float) for k in flags}) + + lsts = frfil.lst_sets[inum] + for k in wgts: + if wgt_by_nsample: + wgts[k] *= nsamples[k] + if lst_blacklists is not None: + for lb in lst_blacklists: + if lb[0] < lb[1]: + is_blacklisted = (lsts >= lb[0] * np.pi / 12)\ + & (lsts <= lb[1] * np.pi / 12) + else: + is_blacklisted = (lsts >= lb[0] * np.pi / 12) + | (lsts <= lb[1] * np.pi / 12) + wgts[k][is_blacklisted, :] = ( + wgts[k][is_blacklisted, :] * blacklist_wgt + ) + # run tophat filter + frfil.tophat_frfilter( + frate_centers=frate_centers, frate_half_widths=frate_half_widths, + keys=keys, verbose=verbose, wgts=wgts, + flags=getattr(frfil, f'flags_interleave_{inum}'), + data=getattr(frfil, f'data_interleave_{inum}'), + output_postfix=f'interleave_{inum}', + times=frfil.time_sets[inum] * SDAY_SEC * 1e-3, + **filter_kwargs ) - # figure out frige rate centers and half-widths - if param_file: - pass - else: - assert case in ['sky', 'max_frate_coeffs', 'uvbeam'], f'case={case} is not valid.' - # use conservative nfr (lowest resolution set). - nfr = int(np.min([len(tset) for tset in frfil.time_sets])) - frate_centers, frate_half_widths = select_tophat_frates( - uvd=frfil.hd, blvecs=frfil.blvecs, - case=case, keys=keys, uvb=uvb, - frate_standoff=frate_standoff, - frate_width_multiplier=frate_width_multiplier, - min_frate_half_width=min_frate_half_width, - max_frate_half_width=max_frate_half_width, - max_frate_coeffs=max_frate_coeffs, - percentile_low=percentile_low, - percentile_high=percentile_high, - fr_freq_skip=fr_freq_skip, - verbose=verbose, nfr=nfr, - ) - # Lists of names of datacontainers that will hold each interleaved - # data set until they are recombined. 
- filtered_data_names = [ - f'clean_data_interleave_{inum}' for inum in range(ninterleave) - ] - filtered_flag_names = [ - fstr.replace('data', 'flags') for fstr in filtered_data_names - ] - filtered_resid_names = [ - fstr.replace('data', 'resid') for fstr in filtered_data_names - ] - filtered_model_names = [ - fstr.replace('data', 'model') for fstr in filtered_data_names - ] - filtered_resid_flag_names = [ - fstr.replace('data', 'resid_flags') for fstr in filtered_data_names - ] - - for inum in range(ninterleave): - - # Build weights using flags, nsamples, and exlcuded lsts - flags = getattr(frfil, f'flags_interleave_{inum}') - nsamples = getattr(frfil, f'nsamples_interleave_{inum}') - wgts = io.DataContainer({k: (~flags[k]).astype(float) for k in flags}) - - lsts = frfil.lst_sets[inum] - for k in wgts: - if wgt_by_nsample: - wgts[k] *= nsamples[k] - if lst_blacklists is not None: - for lb in lst_blacklists: - if lb[0] < lb[1]: - is_blacklisted = (lsts >= lb[0] * np.pi / 12)\ - & (lsts <= lb[1] * np.pi / 12) - else: - is_blacklisted = (lsts >= lb[0] * np.pi / 12) - | (lsts <= lb[1] * np.pi / 12) - wgts[k][is_blacklisted, :] = ( - wgts[k][is_blacklisted, :] * blacklist_wgt - ) - # run tophat filter - frfil.tophat_frfilter( - frate_centers=frate_centers, frate_half_widths=frate_half_widths, - keys=keys, verbose=verbose, wgts=wgts, - flags=getattr(frfil, f'flags_interleave_{inum}'), - data=getattr(frfil, f'data_interleave_{inum}'), - output_postfix=f'interleave_{inum}', - times=frfil.time_sets[inum] * SDAY_SEC * 1e-3, - **filter_kwargs - ) - - frfil._interleave_data_in_time(filtered_data_names, 'clean_data') - frfil._interleave_data_in_time(filtered_flag_names, 'clean_flags') - frfil._interleave_data_in_time(filtered_resid_names, 'clean_resid') - frfil._interleave_data_in_time(filtered_resid_flag_names, 'clean_resid_flags') - frfil._interleave_data_in_time(filtered_model_names, 'clean_model') - else: - frfil.clean_data = DataContainer({}) - frfil.clean_flags = DataContainer({}) - frfil.clean_resid = DataContainer({}) - frfil.clean_resid_flags = DataContainer({}) - frfil.clean_model = DataContainer({}) - # put autocorr data into filtered data containers if skip_autos = True. - # so that it can be written out into the filtered files. 
- if skip_autos: - for bl in frfil.data.keys(): - if bl[0] == bl[1]: - frfil.clean_data[bl] = frfil.data[bl] - frfil.clean_flags[bl] = frfil.flags[bl] - frfil.clean_resid[bl] = frfil.data[bl] - frfil.clean_model[bl] = np.zeros_like(frfil.data[bl]) - frfil.clean_resid_flags[bl] = frfil.flags[bl] - - frfil.write_filtered_data( - res_outfilename=res_outfilename, CLEAN_outfilename=CLEAN_outfilename, - filled_outfilename=filled_outfilename, - partial_write=Nbls_per_load < len(baseline_list), - clobber=clobber, add_to_history=add_to_history, - extra_attrs={ - 'Nfreqs': frfil.hd.Nfreqs, - 'freq_array': frfil.hd.freq_array, - 'channel_width': frfil.hd.channel_width, - 'flex_spw_id_array': frfil.hd.flex_spw_id_array - }, - ) - frfil.hd.data_array = None # this forces a reload in the next loop + frfil._interleave_data_in_time(filtered_data_names, 'clean_data') + frfil._interleave_data_in_time(filtered_flag_names, 'clean_flags') + frfil._interleave_data_in_time(filtered_resid_names, 'clean_resid') + frfil._interleave_data_in_time(filtered_resid_flag_names, 'clean_resid_flags') + frfil._interleave_data_in_time(filtered_model_names, 'clean_model') + + else: + frfil.clean_data = DataContainer({}) + frfil.clean_flags = DataContainer({}) + frfil.clean_resid = DataContainer({}) + frfil.clean_resid_flags = DataContainer({}) + frfil.clean_model = DataContainer({}) + + # put autocorr data into filtered data containers if skip_autos = True. + # so that it can be written out into the filtered files. + if skip_autos: + for bl in frfil.data.keys(): + if bl[0] == bl[1]: + frfil.clean_data[bl] = frfil.data[bl] + frfil.clean_flags[bl] = frfil.flags[bl] + frfil.clean_resid[bl] = frfil.data[bl] + frfil.clean_model[bl] = np.zeros_like(frfil.data[bl]) + frfil.clean_resid_flags[bl] = frfil.flags[bl] + + frfil.write_filtered_data( + res_outfilename=res_outfilename, CLEAN_outfilename=CLEAN_outfilename, + filled_outfilename=filled_outfilename, + partial_write=Nbls_per_load < len(baseline_list), + clobber=clobber, add_to_history=add_to_history, + extra_attrs={ + 'Nfreqs': frfil.hd.Nfreqs, + 'freq_array': frfil.hd.freq_array, + 'channel_width': frfil.hd.channel_width, + 'flex_spw_id_array': frfil.hd.flex_spw_id_array + }, + ) + frfil.hd.data_array = None # this forces a reload in the next loop def time_average_argparser(): diff --git a/scripts/tophat_frfilter_run.py b/scripts/tophat_frfilter_run.py index 4f9fb98fc..6178524e1 100644 --- a/scripts/tophat_frfilter_run.py +++ b/scripts/tophat_frfilter_run.py @@ -43,20 +43,26 @@ clobber=ap.clobber, write_cache=ap.write_cache, CLEAN_outfilename=ap.CLEAN_outfilename, read_cache=ap.read_cache, mode=ap.mode, res_outfilename=ap.res_outfilename, factorize_flags=ap.factorize_flags, time_thresh=ap.time_thresh, - wgt_by_nsample=ap.wgt_by_nsample, lst_blacklists=ap.lst_blacklists, blacklist_wgt=ap.blacklist_wgt, + wgt_by_nsample=ap.wgt_by_nsample, lst_blacklists=ap.lst_blacklists, + blacklist_wgt=ap.blacklist_wgt, add_to_history=' '.join(sys.argv), verbose=ap.verbose, flag_yaml=ap.flag_yaml, Nbls_per_load=ap.Nbls_per_load, case=ap.case, external_flags=ap.external_flags, filter_spw_ranges=ap.filter_spw_ranges, overwrite_flags=ap.overwrite_flags, skip_autos=ap.skip_autos, skip_if_flag_within_edge_distance=ap.skip_if_flag_within_edge_distance, zeropad=ap.zeropad, tol=ap.tol, skip_wgt=ap.skip_wgt, max_frate_coeffs=ap.max_frate_coeffs, - frate_width_multiplier=ap.frate_width_multiplier, frate_standoff=ap.frate_standoff, fr_freq_skip=ap.fr_freq_skip, + 
frate_width_multiplier=ap.frate_width_multiplier, frate_standoff=ap.frate_standoff, + fr_freq_skip=ap.fr_freq_skip, min_frate_half_width=ap.min_frate_half_width, max_frate_half_width=ap.max_frate_half_width, - beamfitsfile=ap.beamfitsfile, percentile_low=ap.percentile_low, percentile_high=ap.percentile_high, - skip_contiguous_flags=not(ap.dont_skip_contiguous_flags), max_contiguous_flag=ap.max_contiguous_flag, + beamfitsfile=ap.beamfitsfile, percentile_low=ap.percentile_low, + percentile_high=ap.percentile_high, + skip_contiguous_flags=not(ap.dont_skip_contiguous_flags), + max_contiguous_flag=ap.max_contiguous_flag, skip_flagged_edges=not(ap.dont_skip_flagged_edges), - flag_model_rms_outliers=not(ap.dont_flag_model_rms_outliers), model_rms_threshold=ap.model_rms_threshold, + flag_model_rms_outliers=not(ap.dont_flag_model_rms_outliers), + model_rms_threshold=ap.model_rms_threshold, clean_flags_in_resid_flags=not(ap.clean_flags_not_in_resid_flags), pre_filter_modes_between_lobe_minimum_and_zero=ap.pre_filter_modes_between_lobe_minimum_and_zero, + param_file=ap.param_file, **filter_kwargs ) From 19c9ccdae6b7ba9a9f5ea4add8a073fcb8d8b125 Mon Sep 17 00:00:00 2001 From: r-pascua Date: Mon, 30 Oct 2023 17:34:37 -0400 Subject: [PATCH 53/81] fix: error with applying LST blacklist --- hera_cal/frf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hera_cal/frf.py b/hera_cal/frf.py index b05f9ea73..8f4505626 100644 --- a/hera_cal/frf.py +++ b/hera_cal/frf.py @@ -1860,11 +1860,13 @@ def load_tophat_frfilter_and_write( if lst_blacklists is not None: for lb in lst_blacklists: if lb[0] < lb[1]: - is_blacklisted = (lsts >= lb[0] * np.pi / 12)\ - & (lsts <= lb[1] * np.pi / 12) + is_blacklisted = ( + lsts >= lb[0] * np.pi / 12 + ) & (lsts <= lb[1] * np.pi / 12) else: - is_blacklisted = (lsts >= lb[0] * np.pi / 12) - | (lsts <= lb[1] * np.pi / 12) + is_blacklisted = ( + lsts >= lb[0] * np.pi / 12 + ) | (lsts <= lb[1] * np.pi / 12) wgts[k][is_blacklisted, :] = ( wgts[k][is_blacklisted, :] * blacklist_wgt ) From 09250e9f11304cd44370e628b475b348a64e9a2d Mon Sep 17 00:00:00 2001 From: r-pascua Date: Mon, 30 Oct 2023 18:27:05 -0400 Subject: [PATCH 54/81] tests: write tests for new features; fix things --- hera_cal/frf.py | 20 ++++----- hera_cal/tests/test_frf.py | 87 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 10 deletions(-) diff --git a/hera_cal/frf.py b/hera_cal/frf.py index 8f4505626..4b39c49f2 100644 --- a/hera_cal/frf.py +++ b/hera_cal/frf.py @@ -1734,12 +1734,15 @@ def load_tophat_frfilter_and_write( # baselines are present in the filter file. if param_file and os.path.exists(param_file): with open(param_file, "r") as f: - filter_info = yaml.load(f.read(), Loader=yaml.SafeLoader) + filter_info = yaml.load(f.read(), Loader=yaml.Loader) filter_antpairs = set(filter_info["filter_centers"].keys()) - have_bl_info = [ - (bl[:2] in filter_antpairs) or (bl[:2][::-1] in filter_antpairs) - for bl in baseline_list - ] + have_bl_info = [] + for bl in baseline_list: + if skip_autos and bl[0] == bl[1]: + continue + have_bl_info.append( + (bl[:2] in filter_antpairs) or (bl[:2][::-1] in filter_antpairs) + ) if not all(have_bl_info): raise ValueError( "Provided filter file doesn't have every baseline." @@ -1792,17 +1795,14 @@ def load_tophat_frfilter_and_write( # Figure out fringe-rate centers and half-widths. if param_file and os.path.exists(param_file): - # Read from a file if provided. 
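The corrected parenthesization in the fix above implements the LST-blacklist convention: each blacklist entry is a (start, end) pair in hours, with the window wrapping through 24 h when start > end. A standalone sketch (illustrative only):

    import numpy as np

    def lst_blacklist_mask(lsts, lb):
        # lsts are in radians; lb = (start, end) in hours, hence the pi / 12 factor.
        start, end = lb[0] * np.pi / 12, lb[1] * np.pi / 12
        if lb[0] < lb[1]:
            return (lsts >= start) & (lsts <= end)
        # The window wraps through 24h, so the two conditions are OR'd.
        return (lsts >= start) | (lsts <= end)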
- with open(param_file, "r") as f: - param_info = yaml.load(f.read(), Loader=yaml.SafeLoader) frate_centers = {} frate_half_widths = {} # Assuming we use the same filters for all polarizations. for key in keys: ai, aj = key[:2] if (ai, aj) in filter_antpairs: - frate_centers[key] = param_info["filter_centers"][(ai,aj)] - frate_half_widths[key] = param_info["filter_half_widths"][(ai,aj)] + frate_centers[key] = filter_info["filter_centers"][(ai,aj)] + frate_half_widths[key] = filter_info["filter_half_widths"][(ai,aj)] else: # We've already enforced that all the data baselines # are in the filter file, so we should be safe here. diff --git a/hera_cal/tests/test_frf.py b/hera_cal/tests/test_frf.py index 5f05c5f37..1342edefd 100644 --- a/hera_cal/tests/test_frf.py +++ b/hera_cal/tests/test_frf.py @@ -13,6 +13,7 @@ from pyuvdata import UVData from pyuvdata import utils as uvutils import unittest +import yaml from scipy import stats from scipy import constants from pyuvdata import UVFlag, UVBeam @@ -930,6 +931,92 @@ def test_load_dayenu_filter_and_write(self, tmpdir): os.remove(outfilename) shutil.rmtree(cdir) + def test_load_tophat_frfilter_and_write_with_filter_yaml(self, tmpdir): + tmp_path = tmpdir.strpath + uvh5 = os.path.join( + DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5" + ) + frate_centers = {(53, 54): 0.1} + frate_half_widths = {(53, 54): 0.05} + filter_info = dict( + filter_centers=frate_centers, + filter_half_widths=frate_half_widths, + ) + frate_centers = {k+("ee",): v for k, v in frate_centers.items()} + frate_half_widths = {k+("ee",): v for k, v in frate_half_widths.items()} + + with open(tmpdir / "filter_info.yaml", "w") as f: + yaml.dump(filter_info, f) + + outfilename = os.path.join(tmp_path, 'temp.h5') + CLEAN_outfilename = os.path.join(tmp_path, 'temp_clean.h5') + filled_outfilename = os.path.join(tmp_path, 'temp_filled.h5') + frf.load_tophat_frfilter_and_write( + uvh5, + res_outfilename=outfilename, + tol=1e-4, + clobber=True, + Nbls_per_load=1, + param_file=str(tmpdir / "filter_info.yaml"), + case="sky", + skip_autos=True, + ) + hd = io.HERAData(outfilename) + d, f, n = hd.read(bls=[(53, 54, 'ee')]) + for bl in d: + assert not np.allclose(d[bl], 0) + + frfil = frf.FRFilter(uvh5, filetype='uvh5') + frfil.read(bls=[(53, 54, 'ee')]) + frfil.tophat_frfilter( + keys=[(53, 54, 'ee')], + tol=1e-4, + verbose=True, + frate_centers=frate_centers, + frate_half_widths=frate_half_widths, + ) + + # Check that the filtered data both ways matches. + np.testing.assert_almost_equal( + d[(53, 54, 'ee')], frfil.clean_resid[(53, 54, 'ee')], decimal=5 + ) + + # Check that the flags match. 
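A minimal sketch of the param-file round trip these tests exercise, using the string-keyed convention the series settles on (antenna-pair keys stringified on write, regex-parsed back to integer tuples on read; illustrative only):

    import re
    import yaml

    info = {
        "filter_centers": {str((53, 54)): 0.1},
        "filter_half_widths": {str((53, 54)): 0.05},
    }
    loaded = yaml.safe_load(yaml.dump(info))
    centers = {
        tuple(int(a) for a in re.findall("[0-9]+", key)): val
        for key, val in loaded["filter_centers"].items()
    }
    assert centers == {(53, 54): 0.1}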
+ np.testing.assert_array_equal(f[(53, 54, 'ee')], frfil.flags[(53, 54, 'ee')]) + + + def test_load_tophat_frfilter_and_write_with_bad_filter_yaml(self, tmpdir): + tmp_path = tmpdir.strpath + uvh5 = os.path.join( + DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5" + ) + frate_centers = {(26, 37): 0.1} + frate_half_widths = {(26, 37): 0.05} + filter_info = dict( + filter_centers=frate_centers, + filter_half_widths=frate_half_widths, + ) + frate_centers = {k+("ee",): v for k, v in frate_centers.items()} + frate_half_widths = {k+("ee",): v for k, v in frate_half_widths.items()} + + with open(tmpdir / "filter_info.yaml", "w") as f: + yaml.dump(filter_info, f) + + outfilename = os.path.join(tmp_path, 'temp.h5') + CLEAN_outfilename = os.path.join(tmp_path, 'temp_clean.h5') + filled_outfilename = os.path.join(tmp_path, 'temp_filled.h5') + with pytest.raises(ValueError, match="doesn't have every baseline"): + frf.load_tophat_frfilter_and_write( + uvh5, + res_outfilename=outfilename, + tol=1e-4, + clobber=True, + Nbls_per_load=1, + param_file=str(tmpdir / "filter_info.yaml"), + case="sky", + ) + + def test_tophat_clean_argparser(self): sys.argv = [sys.argv[0], 'a', '--clobber', '--window', 'blackmanharris', '--max_frate_coeffs', '0.024', '-0.229'] parser = frf.tophat_frfilter_argparser() From 8912c7a272826f65cb6aea46b0c46fea3c43983e Mon Sep 17 00:00:00 2001 From: r-pascua Date: Thu, 2 Nov 2023 00:52:23 -0400 Subject: [PATCH 55/81] fix: address comments from PR --- hera_cal/frf.py | 68 +++++++++++++++++++++++++++++--------- hera_cal/tests/test_frf.py | 37 ++++++++++++--------- 2 files changed, 74 insertions(+), 31 deletions(-) diff --git a/hera_cal/frf.py b/hera_cal/frf.py index 4b39c49f2..181b76c91 100644 --- a/hera_cal/frf.py +++ b/hera_cal/frf.py @@ -29,6 +29,7 @@ from .utils import echo import os import yaml +import re SPEED_OF_LIGHT = const.c.si.value SDAY_SEC = units.sday.to("s") @@ -1562,21 +1563,30 @@ def tophat_frfilter_argparser(mode='clean'): filt_options.add_argument("--blacklist_wgt", type=float, default=0.0, help="Relative weight to assign to blacklisted lsts compared to 1.0. Default 0.0 \ means no weight. Note that 0.0 will create problems for DPSS at edge times and frequencies.") - desc = ("Filtering case ['max_frate_coeffs', 'uvbeam', 'sky']", + desc = ("Filtering case ['max_frate_coeffs', 'uvbeam', 'sky', 'param_file']", "If case == 'max_frate_coeffs', then determine fringe rate centers", "and half-widths based on the max_frate_coeffs arg (see below).", "If case == 'uvbeam', then determine fringe rate centers and half widths", "from histogram of main-beam wrt instantaneous sky fringe rates.", "If case == 'sky': then use fringe-rates corresponding to range of ", - "instantanous fringe-rates that include sky emission.") + "instantanous fringe-rates that include sky emission.", + "If case == 'param_file': then use a provided parameter file to determine", + "filter centers and filter half-widths. See param_file help for more ", + "information regarding parameter file structure.") filt_options.add_argument("--case", default="sky", help=' '.join(desc), type=str) desc = ("Number interleaved time subsets to split the data into ", "and apply independent fringe-rate filters. 
Default is 1 (no interleaved filters).", "This does not change the format of the output files but it does change the nature of their content.") filt_options.add_argument("--ninterleave", default=1, type=int, help=desc) - filt_options.add_argument( - "--param_file", default="", type=str, help="File containing filter parameters" - ) + + desc = ("File containing filter parameters. Parameter file must be yaml-readable ", + "and contain two entries: filter_centers and filter_half_widths. Each of ", + "these entries must be dictionaries whose keys are strings of antenna ", + "pairs and whose values are floats. Filter parameters are assumed to ", + "correspond to a particular range of frequencies (chosen when making the ", + "filter parameter file) and depend on baseline but not polarization (i.e., ", + "the 'ee' and 'nn' polarizations will have the same filter for a given baseline.") + filt_options.add_argument("--param_file", default="", type=str, help=desc) return ap @@ -1681,6 +1691,13 @@ def load_tophat_frfilter_and_write( Number of interleaved sets to run time filtering on. param_file: str, optional File containing filter parameters (e.g., centers and half-widths). + The file must be readable by yaml, with a "filter_centers" entry and + a "filter_half_widths" entry. Each of these entries should correspond + to dictionaries with antenna pair strings as keys and floating point + numbers as values. When writing filter parameters to a file, ensure + that the antenna pair keys have been converted to strings prior to + dumping the contents to the file. See the file + for an example. filter_kwargs: additional keyword arguments to be passed to FRFilter.tophat_frfilter() ''' if baseline_list is not None and Nbls_per_load is not None: @@ -1693,6 +1710,12 @@ def load_tophat_frfilter_and_write( ) return + if case == "param_file" and not os.path.exists(param_file): + raise ValueError( + "When using a filter parameter file, a valid parameter file must " + f"be provided. The provided file {param_file} could not be found." + ) + hd = io.HERAData(datafile_list, filetype='uvh5') # Figure out which baselines to load if not provided. if baseline_list is None: @@ -1732,20 +1755,35 @@ def load_tophat_frfilter_and_write( # If a filter parameter file is provided, let's check that all of the # baselines are present in the filter file. - if param_file and os.path.exists(param_file): + if case == "param_file": with open(param_file, "r") as f: - filter_info = yaml.load(f.read(), Loader=yaml.Loader) + _filter_info = yaml.load(f.read(), Loader=yaml.SafeLoader) + + # Convert the dictionary keys into tuples. + filter_info = {param: {} for param in _filter_info.keys()} + for filter_param, info in _filter_info.items(): + for antpair_str, value in info.items(): + antpair = tuple( + int(ant) for ant in re.findall("[0-9]+", antpair_str) + ) + filter_info[filter_param][antpair] = value + filter_antpairs = set(filter_info["filter_centers"].keys()) have_bl_info = [] + missing_bls = set() for bl in baseline_list: if skip_autos and bl[0] == bl[1]: continue - have_bl_info.append( - (bl[:2] in filter_antpairs) or (bl[:2][::-1] in filter_antpairs) - ) - if not all(have_bl_info): + have_bl = (bl[:2] in filter_antpairs) or (bl[:2][::-1] in filter_antpairs) + have_bl_info.append(have_bl) + if not have_bl: + missing_bls.add(bl[:2]) + + if missing_bls: + missing_bls = [str(bl) for bl in missing_bls] raise ValueError( - "Provided filter file doesn't have every baseline." + "Provided filter file doesn't have every baseline. 
The following" + "baselines could not be found: " + " ".join(missing_bls) ) # Read the data in chunks and perform filtering. @@ -1794,7 +1832,7 @@ def load_tophat_frfilter_and_write( ) # Figure out fringe-rate centers and half-widths. - if param_file and os.path.exists(param_file): + if case == "param_file": frate_centers = {} frate_half_widths = {} # Assuming we use the same filters for all polarizations. @@ -1806,8 +1844,8 @@ def load_tophat_frfilter_and_write( else: # We've already enforced that all the data baselines # are in the filter file, so we should be safe here. - frate_centers[key] = -param_info["filter_centers"][(aj,ai)] - frate_centers[key] = param_info["filter_half_widths"][(aj,ai)] + frate_centers[key] = -filter_info["filter_centers"][(aj,ai)] + frate_half_widths[key] = filter_info["filter_half_widths"][(aj,ai)] else: # Otherwise, we need to compute the filter parameters. if case not in ("sky", "max_frate_coeffs", "uvbeam"): diff --git a/hera_cal/tests/test_frf.py b/hera_cal/tests/test_frf.py index 1342edefd..f77a8270d 100644 --- a/hera_cal/tests/test_frf.py +++ b/hera_cal/tests/test_frf.py @@ -931,17 +931,28 @@ def test_load_dayenu_filter_and_write(self, tmpdir): os.remove(outfilename) shutil.rmtree(cdir) - def test_load_tophat_frfilter_and_write_with_filter_yaml(self, tmpdir): + @pytest.mark.parametrize("flip", [True, False]) + def test_load_tophat_frfilter_and_write_with_filter_yaml(self, tmpdir, flip): tmp_path = tmpdir.strpath uvh5 = os.path.join( DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5" ) frate_centers = {(53, 54): 0.1} frate_half_widths = {(53, 54): 0.05} - filter_info = dict( - filter_centers=frate_centers, - filter_half_widths=frate_half_widths, - ) + if flip: + filter_info = dict( + filter_centers={ + str(k[::-1]): -v for k, v in frate_centers.items() + }, + filter_half_widths={ + str(k[::-1]): v for k, v in frate_half_widths.items() + }, + ) + else: + filter_info = dict( + filter_centers={str(k): v for k, v in frate_centers.items()}, + filter_half_widths={str(k): v for k, v in frate_half_widths.items()}, + ) frate_centers = {k+("ee",): v for k, v in frate_centers.items()} frate_half_widths = {k+("ee",): v for k, v in frate_half_widths.items()} @@ -949,8 +960,6 @@ def test_load_tophat_frfilter_and_write_with_filter_yaml(self, tmpdir): yaml.dump(filter_info, f) outfilename = os.path.join(tmp_path, 'temp.h5') - CLEAN_outfilename = os.path.join(tmp_path, 'temp_clean.h5') - filled_outfilename = os.path.join(tmp_path, 'temp_filled.h5') frf.load_tophat_frfilter_and_write( uvh5, res_outfilename=outfilename, @@ -958,7 +967,7 @@ def test_load_tophat_frfilter_and_write_with_filter_yaml(self, tmpdir): clobber=True, Nbls_per_load=1, param_file=str(tmpdir / "filter_info.yaml"), - case="sky", + case="param_file", skip_autos=True, ) hd = io.HERAData(outfilename) @@ -993,19 +1002,15 @@ def test_load_tophat_frfilter_and_write_with_bad_filter_yaml(self, tmpdir): frate_centers = {(26, 37): 0.1} frate_half_widths = {(26, 37): 0.05} filter_info = dict( - filter_centers=frate_centers, - filter_half_widths=frate_half_widths, + filter_centers={str(k): v for k, v in frate_centers.items()}, + filter_half_widths={str(k): v for k, v in frate_half_widths.items()}, ) - frate_centers = {k+("ee",): v for k, v in frate_centers.items()} - frate_half_widths = {k+("ee",): v for k, v in frate_half_widths.items()} with open(tmpdir / "filter_info.yaml", "w") as f: yaml.dump(filter_info, f) outfilename = os.path.join(tmp_path, 'temp.h5') - CLEAN_outfilename = 
os.path.join(tmp_path, 'temp_clean.h5') - filled_outfilename = os.path.join(tmp_path, 'temp_filled.h5') - with pytest.raises(ValueError, match="doesn't have every baseline"): + with pytest.raises(ValueError, match="(53, 54)"): frf.load_tophat_frfilter_and_write( uvh5, res_outfilename=outfilename, @@ -1013,7 +1018,7 @@ def test_load_tophat_frfilter_and_write_with_bad_filter_yaml(self, tmpdir): clobber=True, Nbls_per_load=1, param_file=str(tmpdir / "filter_info.yaml"), - case="sky", + case="param_file", ) From 31bc3b7105c5f999d68ebd9ee6f7fa38a1301de7 Mon Sep 17 00:00:00 2001 From: r-pascua Date: Thu, 2 Nov 2023 00:54:30 -0400 Subject: [PATCH 56/81] doc: make example filter param file --- hera_cal/data/example_filter_params.yaml | 6 ++++++ hera_cal/frf.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 hera_cal/data/example_filter_params.yaml diff --git a/hera_cal/data/example_filter_params.yaml b/hera_cal/data/example_filter_params.yaml new file mode 100644 index 000000000..af52605e0 --- /dev/null +++ b/hera_cal/data/example_filter_params.yaml @@ -0,0 +1,6 @@ +filter_centers: + (0, 1): 0.1234 + (0, 2): 0.173 +filter_half_widths: + (0, 1): 0.05 + (0, 2): 0.08 diff --git a/hera_cal/frf.py b/hera_cal/frf.py index 181b76c91..edf8c3fa4 100644 --- a/hera_cal/frf.py +++ b/hera_cal/frf.py @@ -1696,7 +1696,7 @@ def load_tophat_frfilter_and_write( to dictionaries with antenna pair strings as keys and floating point numbers as values. When writing filter parameters to a file, ensure that the antenna pair keys have been converted to strings prior to - dumping the contents to the file. See the file + dumping the contents to the file. See the file example_filter_params.yaml for an example. filter_kwargs: additional keyword arguments to be passed to FRFilter.tophat_frfilter() ''' From d7d69b200fa46a78506dbb83302916ff6058d5ee Mon Sep 17 00:00:00 2001 From: r-pascua Date: Mon, 6 Nov 2023 14:48:16 -0500 Subject: [PATCH 57/81] test: use example filter file for a test --- hera_cal/tests/test_frf.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/hera_cal/tests/test_frf.py b/hera_cal/tests/test_frf.py index f77a8270d..1eb86c48e 100644 --- a/hera_cal/tests/test_frf.py +++ b/hera_cal/tests/test_frf.py @@ -999,15 +999,7 @@ def test_load_tophat_frfilter_and_write_with_bad_filter_yaml(self, tmpdir): uvh5 = os.path.join( DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5" ) - frate_centers = {(26, 37): 0.1} - frate_half_widths = {(26, 37): 0.05} - filter_info = dict( - filter_centers={str(k): v for k, v in frate_centers.items()}, - filter_half_widths={str(k): v for k, v in frate_half_widths.items()}, - ) - - with open(tmpdir / "filter_info.yaml", "w") as f: - yaml.dump(filter_info, f) + param_file = os.path.join(DATA_PATH, "example_filter_params.yaml") outfilename = os.path.join(tmp_path, 'temp.h5') with pytest.raises(ValueError, match="(53, 54)"): @@ -1017,7 +1009,7 @@ def test_load_tophat_frfilter_and_write_with_bad_filter_yaml(self, tmpdir): tol=1e-4, clobber=True, Nbls_per_load=1, - param_file=str(tmpdir / "filter_info.yaml"), + param_file=param_file, case="param_file", ) From 0f1b175e133e41abede4a8f3dd29a85905ebdedd Mon Sep 17 00:00:00 2001 From: r-pascua Date: Mon, 6 Nov 2023 14:49:34 -0500 Subject: [PATCH 58/81] doc: note that filter params are in mHz --- hera_cal/frf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hera_cal/frf.py b/hera_cal/frf.py index edf8c3fa4..fd34940d6 100644 --- a/hera_cal/frf.py +++ 
b/hera_cal/frf.py @@ -1694,7 +1694,8 @@ def load_tophat_frfilter_and_write( The file must be readable by yaml, with a "filter_centers" entry and a "filter_half_widths" entry. Each of these entries should correspond to dictionaries with antenna pair strings as keys and floating point - numbers as values. When writing filter parameters to a file, ensure + numbers as values. The filter centers and filter half widths are assumed + to be provided in mHz. When writing filter parameters to a file, ensure that the antenna pair keys have been converted to strings prior to dumping the contents to the file. See the file example_filter_params.yaml for an example. From 3bc4fcd23541c9e8f98301f1da51fd8defad1c62 Mon Sep 17 00:00:00 2001 From: Tyler Cox Date: Thu, 26 Oct 2023 16:15:55 -0700 Subject: [PATCH 59/81] Modify dpss modeling --- hera_cal/abscal.py | 18 ++++++++---------- hera_cal/nucal.py | 29 ++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/hera_cal/abscal.py b/hera_cal/abscal.py index 6700314d4..d133fe175 100644 --- a/hera_cal/abscal.py +++ b/hera_cal/abscal.py @@ -1486,22 +1486,20 @@ def _phase_gradient_solution(Z_coefficients, transformed_b_vecs, resolution_fact Nk = 2 * dim_maxes + 1 Nk_use = resolution_factor * Nk + 1 if resolution_factor > 1 else Nk - # Initialize the grid and the corresponding frequencies - grid = np.zeros(tuple(Nk_use), dtype=np.complex64) + # Initialize the grid frequencies ft_freqs = [- 2 * np.pi * fft.fftshift(fft.fftfreq(n)) for n in Nk_use] # Get the indices of the grid points corresponding to the transformed_b_vecs grid_indices = [tuple(ii for ii in transformed_b_vecs[nn]) for nn in range(Ngroups)] - + grid_indices = tuple(np.array(grid_indices).T) # Loop over times and frequency channels for i_t in range(Ntimes): for i_f in range(Nfreqs): - + # Populate grid with Fourier coeffcients of the objective function - Z_coeffs_t_f = Z_coefficients[i_t, i_f] - for nn in range(Ngroups): - grid[grid_indices[nn]] = Z_coeffs_t_f[nn] - + grid = np.zeros(tuple(Nk_use), dtype=np.complex64) + np.add.at(grid, grid_indices, Z_coefficients[i_t, i_f]) + # Evaluate the complex objective function Z on a grid of sample points grid_ft = fft.fftshift(fft.fftn(grid, workers=-1)) / Ngroups @@ -1513,11 +1511,11 @@ def _phase_gradient_solution(Z_coefficients, transformed_b_vecs, resolution_fact # Find the local maximum near that initial point, which should also # be the global maximum, in which case it is the desired phase gradient solution - Lambda_t_f, niter_t_f = _newton_solve(Lambda_init, transformed_b_vecs, Z_coeffs_t_f, 1e-8, maxiter=newton_maxiter) + Lambda_t_f, niter_t_f = _newton_solve(Lambda_init, transformed_b_vecs, Z_coefficients[i_t, i_f], 1e-8, maxiter=newton_maxiter) # Store results Lambda_sol[i_t, i_f] = Lambda_t_f - Z_sol[i_t, i_f] = _eval_Z(Lambda_t_f, transformed_b_vecs, Z_coeffs_t_f) + Z_sol[i_t, i_f] = _eval_Z(Lambda_t_f, transformed_b_vecs, Z_coefficients[i_t, i_f]) newton_iterations[i_t, i_f] = niter_t_f return Lambda_sol, Z_sol, newton_iterations diff --git a/hera_cal/nucal.py b/hera_cal/nucal.py index f3c533754..85532bb8d 100644 --- a/hera_cal/nucal.py +++ b/hera_cal/nucal.py @@ -503,7 +503,7 @@ def compute_spectral_filters(freqs, spectral_filter_half_width, eigenval_cutoff= return dspec.dpss_operator(freqs, [0], [spectral_filter_half_width], eigenval_cutoff=[eigenval_cutoff])[0].real -def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filter_half_width=1, eigenval_cutoff=1e-12): +def 
compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filter_half_width=1, eigenval_cutoff=1e-12, umin=None, umax=None): """ Compute prolate spheroidal wave function (PSWF) filters for a single radially redundant group. @@ -522,6 +522,14 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt the uv-plane to be modeled at half-wavelength scales. eigenval_cutoff : float Cutoff for the eigenvalues of the PSWF filter + umin : float, optional, default=None + Minimum u-mode at which the filters are computed. If None, filter bounds will be computed from the minimum frequency value and shortest + baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal modeling if one is uininterested in u-modes below + umin. + umax : float, optional, default=None + Maximum u-mode at which the filters are computed. If None, filter bounds will be computed from the maximum frequency value and longest + baseline length. Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal modeling particularly if the + baseline group has a few long baselines an one is uininterested in u-modes above umax. Returns: ------- @@ -531,8 +539,10 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt # Compute the minimum and maximum u values for the spatial filter group_bls_lengths = [bls_lengths[bl] for bl in group] - umin = np.min(group_bls_lengths) / SPEED_OF_LIGHT * np.min(freqs) - umax = np.max(group_bls_lengths) / SPEED_OF_LIGHT * np.max(freqs) + if umin is None: + umin = np.min(group_bls_lengths) / SPEED_OF_LIGHT * np.min(freqs) + if umax is None: + umax = np.max(group_bls_lengths) / SPEED_OF_LIGHT * np.max(freqs) # Create dictionary for storing spatial filters spatial_filters = {} @@ -550,7 +560,7 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt return spatial_filters -def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eigenval_cutoff=1e-12, cache={}): +def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eigenval_cutoff=1e-12, cache={}, umin=None, umax=None): """ Compute prolate spheroidal wave function (PSWF) filters for each radially redundant group in radial_reds. Note that if you are using a large array with a large range of short and long baselines in an individual radially @@ -570,6 +580,15 @@ def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eig Sinc matrix eigenvalue cutoffs to use for included PSWF modes. cache : dictionary, default={} Dictionary containing cached PSWF eigenvectors to speed up computation + umin : float, optional, default=None + Minimum u-mode at which the filters are computed. If None, filter bounds will be computed from the minimum frequency value and shortest + baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal modeling if one is uininterested in u-modes below + umin. If umin is not None, umin will be applied to all baseline groups in radial reds. + umax : float, optional, default=None + Maximum u-mode at which the filters are computed. If None, filter bounds will be computed from the maximum frequency value and longest + baseline length. Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal modeling particularly if the + baseline group has a few long baselines an one is uininterested in u-modes above umax. 
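For readers less familiar with u-mode bounds: the defaults above amount to u = |b| * nu / c evaluated at the extremes of the group. A minimal standalone sketch, under invented inputs (the baseline lengths and frequency grid below are made-up stand-ins, not HERA values; only the umin/umax formulas mirror compute_spatial_filters_single_group):

    import numpy as np

    SPEED_OF_LIGHT = 299792458.0  # m/s

    # Hypothetical radially redundant group: three east-west baselines (meters).
    bls_lengths = {(0, 1, "ee"): 14.6, (0, 2, "ee"): 29.2, (0, 3, "ee"): 43.8}
    group = list(bls_lengths)
    freqs = np.linspace(50e6, 250e6, 1536)  # Hz

    group_bls_lengths = [bls_lengths[bl] for bl in group]

    # Default bounds when umin/umax are None: u = |b| * nu / c, in wavelengths.
    umin = np.min(group_bls_lengths) / SPEED_OF_LIGHT * np.min(freqs)
    umax = np.max(group_bls_lengths) / SPEED_OF_LIGHT * np.max(freqs)
    print(f"umin={umin:.2f}, umax={umax:.2f}")  # roughly 2.4 and 36.5

Restricting umax below this default prunes the PSWF modes that would only model structure beyond the cut, which is where the reduction in degrees of freedom comes from.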
If umax is not None, umax will be applied to all + baseline groups in radial reds. Returns: ------- @@ -584,7 +603,7 @@ def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eig for group in radial_reds: # Compute spatial filters for each baseline in the group spatial_filters.update(compute_spatial_filters_single_group( - group, freqs, radial_reds.baseline_lengths, spatial_filter_half_width, eigenval_cutoff + group, freqs, radial_reds.baseline_lengths, spatial_filter_half_width, eigenval_cutoff, umin=umin, umax=umax ) ) From f6fb0449b9cb71134f216a5f790b8e3e4bff782c Mon Sep 17 00:00:00 2001 From: Tyler Cox Date: Mon, 6 Nov 2023 12:26:31 -0800 Subject: [PATCH 60/81] Allow user to set u-cut in compute_spatial_filters --- hera_cal/nucal.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hera_cal/nucal.py b/hera_cal/nucal.py index 85532bb8d..e3ccc58ea 100644 --- a/hera_cal/nucal.py +++ b/hera_cal/nucal.py @@ -560,7 +560,7 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt return spatial_filters -def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eigenval_cutoff=1e-12, cache={}, umin=None, umax=None): +def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eigenval_cutoff=1e-12, umin=None, umax=None): """ Compute prolate spheroidal wave function (PSWF) filters for each radially redundant group in radial_reds. Note that if you are using a large array with a large range of short and long baselines in an individual radially @@ -578,8 +578,6 @@ def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eig modeling foregrounds out to the horizon. eigenval_cutoff : float, default=1e-12 Sinc matrix eigenvalue cutoffs to use for included PSWF modes. - cache : dictionary, default={} - Dictionary containing cached PSWF eigenvectors to speed up computation umin : float, optional, default=None Minimum u-mode at which the filters are computed. If None, filter bounds will be computed from the minimum frequency value and shortest baseline length. 
Restricting the minimum u-mode can decrease the degrees of freedom in a nucal modeling if one is uininterested in u-modes below From 3b8ec0b8829e6a2f1f0b8624cec3e232c9ee4e4a Mon Sep 17 00:00:00 2001 From: Tyler Cox Date: Mon, 6 Nov 2023 12:41:33 -0800 Subject: [PATCH 61/81] Add additional test for u-cut --- hera_cal/tests/test_nucal.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/hera_cal/tests/test_nucal.py b/hera_cal/tests/test_nucal.py index 63c7b102c..2ea65b6e9 100644 --- a/hera_cal/tests/test_nucal.py +++ b/hera_cal/tests/test_nucal.py @@ -307,6 +307,17 @@ def test_compute_spatial_filters(): model = design_matrix @ (XTXinv @ Xy) np.allclose(model, data, atol=1e-6) + # Show that filters with a u-max cutoff are set to zero + umax = 15 + antpos = linear_array(6, sep=5) + radial_reds = nucal.RadialRedundancy(antpos) + spatial_filters = nucal.compute_spatial_filters(radial_reds, freqs, umax=umax) + + for rdgrp in radial_reds: + for bl in rdgrp: + umodes = radial_reds.baseline_lengths[bl] * freqs / 2.998e8 + assert np.allclose(spatial_filters[bl][umodes > umax], 0) + def test_build_nucal_wgts(): bls = [(0, 1, 'ee'), (0, 2, 'ee'), (1, 2, 'ee')] auto_bls = [(0, 0, 'ee'), (1, 1, 'ee'), (2, 2, 'ee')] From 69e23d439505e97a547f958d71634f95e9f9ea29 Mon Sep 17 00:00:00 2001 From: Tyler Cox Date: Mon, 6 Nov 2023 12:53:29 -0800 Subject: [PATCH 62/81] Revert abscal changes --- hera_cal/abscal.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/hera_cal/abscal.py b/hera_cal/abscal.py index d133fe175..6700314d4 100644 --- a/hera_cal/abscal.py +++ b/hera_cal/abscal.py @@ -1486,20 +1486,22 @@ def _phase_gradient_solution(Z_coefficients, transformed_b_vecs, resolution_fact Nk = 2 * dim_maxes + 1 Nk_use = resolution_factor * Nk + 1 if resolution_factor > 1 else Nk - # Initialize the grid frequencies + # Initialize the grid and the corresponding frequencies + grid = np.zeros(tuple(Nk_use), dtype=np.complex64) ft_freqs = [- 2 * np.pi * fft.fftshift(fft.fftfreq(n)) for n in Nk_use] # Get the indices of the grid points corresponding to the transformed_b_vecs grid_indices = [tuple(ii for ii in transformed_b_vecs[nn]) for nn in range(Ngroups)] - grid_indices = tuple(np.array(grid_indices).T) + # Loop over times and frequency channels for i_t in range(Ntimes): for i_f in range(Nfreqs): - + # Populate grid with Fourier coeffcients of the objective function - grid = np.zeros(tuple(Nk_use), dtype=np.complex64) - np.add.at(grid, grid_indices, Z_coefficients[i_t, i_f]) - + Z_coeffs_t_f = Z_coefficients[i_t, i_f] + for nn in range(Ngroups): + grid[grid_indices[nn]] = Z_coeffs_t_f[nn] + # Evaluate the complex objective function Z on a grid of sample points grid_ft = fft.fftshift(fft.fftn(grid, workers=-1)) / Ngroups @@ -1511,11 +1513,11 @@ def _phase_gradient_solution(Z_coefficients, transformed_b_vecs, resolution_fact # Find the local maximum near that initial point, which should also # be the global maximum, in which case it is the desired phase gradient solution - Lambda_t_f, niter_t_f = _newton_solve(Lambda_init, transformed_b_vecs, Z_coefficients[i_t, i_f], 1e-8, maxiter=newton_maxiter) + Lambda_t_f, niter_t_f = _newton_solve(Lambda_init, transformed_b_vecs, Z_coeffs_t_f, 1e-8, maxiter=newton_maxiter) # Store results Lambda_sol[i_t, i_f] = Lambda_t_f - Z_sol[i_t, i_f] = _eval_Z(Lambda_t_f, transformed_b_vecs, Z_coefficients[i_t, i_f]) + Z_sol[i_t, i_f] = _eval_Z(Lambda_t_f, transformed_b_vecs, Z_coeffs_t_f) newton_iterations[i_t, i_f] = niter_t_f return 
Lambda_sol, Z_sol, newton_iterations From d89aaffc1f83cde0574f0c820dcc31b5c50ebbc0 Mon Sep 17 00:00:00 2001 From: Tyler Cox Date: Fri, 10 Nov 2023 12:51:00 -0800 Subject: [PATCH 63/81] Fix docstrings --- hera_cal/nucal.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hera_cal/nucal.py b/hera_cal/nucal.py index e3ccc58ea..7fd1f9288 100644 --- a/hera_cal/nucal.py +++ b/hera_cal/nucal.py @@ -524,11 +524,11 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt Cutoff for the eigenvalues of the PSWF filter umin : float, optional, default=None Minimum u-mode at which the filters are computed. If None, filter bounds will be computed from the minimum frequency value and shortest - baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal modeling if one is uininterested in u-modes below + baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal model if one is uininterested in u-modes below umin. umax : float, optional, default=None Maximum u-mode at which the filters are computed. If None, filter bounds will be computed from the maximum frequency value and longest - baseline length. Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal modeling particularly if the + baseline length. Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal model particularly if the baseline group has a few long baselines an one is uininterested in u-modes above umax. Returns: @@ -580,11 +580,11 @@ def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eig Sinc matrix eigenvalue cutoffs to use for included PSWF modes. umin : float, optional, default=None Minimum u-mode at which the filters are computed. If None, filter bounds will be computed from the minimum frequency value and shortest - baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal modeling if one is uininterested in u-modes below + baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal model if one is uininterested in u-modes below umin. If umin is not None, umin will be applied to all baseline groups in radial reds. umax : float, optional, default=None Maximum u-mode at which the filters are computed. If None, filter bounds will be computed from the maximum frequency value and longest - baseline length. Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal modeling particularly if the + baseline length. Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal model particularly if the baseline group has a few long baselines an one is uininterested in u-modes above umax. If umax is not None, umax will be applied to all baseline groups in radial reds. From 9e513b5317ee0d9e5203fd098bf9ce60a64c446a Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 28 Nov 2023 12:19:16 +0100 Subject: [PATCH 64/81] add some more logging statements about inpainting --- hera_cal/lstbin_simple.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 8fb00eacd..09cb6e361 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -429,6 +429,7 @@ def get_masked_data( inpainted = flags.copy() * (~allf) flags = flags | np.isnan(data) | np.isinf(data) | (nsamples == 0) + logger.info(f"In inpainted_mode: {inpainted_mode}. 
Got {np.sum(inpainted)} inpainted samples, {np.sum(flags)} total flags, {np.sum(flags & ~inpainted)} non-inpainted flags.") data = np.ma.masked_array(data, mask=(flags & ~inpainted)) return data, flags From cbf04a76a3a2780b90f232377ba149fe4e6f1f7c Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Tue, 28 Nov 2023 15:31:45 +0100 Subject: [PATCH 65/81] fix nsamples calculation for in-painting --- hera_cal/lstbin_simple.py | 57 ++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 09cb6e361..634450602 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -357,10 +357,9 @@ def reduce_lst_bins( if d.size: d, f = get_masked_data( - d, n, f, inpainted=inpf, inpainted_mode=inpainted_mode + d, n, f, inpainted=inpf, + inpainted_mode=inpainted_mode, flag_thresh=flag_thresh ) - f = threshold_flags(f, inplace=True, flag_thresh=flag_thresh) - d.mask |= f ( out_data[:, lstbin], @@ -417,6 +416,7 @@ def get_masked_data( flags: np.ndarray, inpainted: np.ndarray | None = None, inpainted_mode: bool = False, + flag_thresh: float = 0.7, ) -> np.ma.MaskedArray: if not inpainted_mode: # Act like nothing is inpainted. @@ -429,6 +429,11 @@ def get_masked_data( inpainted = flags.copy() * (~allf) flags = flags | np.isnan(data) | np.isinf(data) | (nsamples == 0) + + # Threshold flags over time here, because we want the new flags to be treated on the + # same footing as the inpainted flags. + threshold_flags(flags, inplace=True, flag_thresh=flag_thresh) + logger.info(f"In inpainted_mode: {inpainted_mode}. Got {np.sum(inpainted)} inpainted samples, {np.sum(flags)} total flags, {np.sum(flags & ~inpainted)} non-inpainted flags.") data = np.ma.masked_array(data, mask=(flags & ~inpainted)) return data, flags @@ -454,6 +459,31 @@ def threshold_flags( inplace: bool = False, flag_thresh: float = 0.7, ): + """ + Thresholds the input flags array based on the flag fraction. + + Parameters + ---------- + flags : numpy.ndarray + A numpy array of shape (Nnights, ...) representing the flags. + inplace : bool, optional + If True, modifies the input flags array in place. If False, creates a copy of + the flags array. + flag_thresh : float, optional + The threshold value for the flag fraction. + + Returns + ------- + numpy.ndarray + A numpy array of shape (N, ...) with the thresholded flags. + + Examples + -------- + >>> flags = np.array([[True, False, True], [False, True, False]]) + >>> threshold_flags(flags, inplace=True, flag_thresh=0.5) + array([[ True, False, True], + [False, True, False]]) + """ if not inplace: flags = flags.copy() @@ -569,7 +599,12 @@ def lst_average( nsamples = np.ma.masked_array(nsamples, mask=data.mask) + # Norm is the total number of samples over the nights. In the in-painted case, + # it *counts* in-painted data as samples. In the non-inpainted case, it does not. norm = np.sum(nsamples, axis=0) + + # Ndays binned is the number of days that count towards the mean. This is the same + # in in-painted and flagged mode. 
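As a toy illustration of this normalization logic (not hera_cal API; the arrays are invented two-night inputs for a single pixel, mirroring the masking in get_masked_data above):

    import numpy as np

    # Night 0 is flagged but in-painted; night 1 is good.
    data = np.array([1.0 + 0j, 3.0 + 0j])
    nsamples = np.array([1.0, 1.0])
    flags = np.array([True, False])
    inpainted = np.array([True, False])

    # In-painted mode: mask only samples that are flagged AND not in-painted.
    d = np.ma.masked_array(data, mask=flags & ~inpainted)
    n = np.ma.masked_array(nsamples, mask=d.mask)

    norm = np.sum(n, axis=0)             # 2.0 -- the in-painted night counts
    mean = np.sum(d * n, axis=0) / norm  # (1 + 3) / 2 = 2.0

    # For the output nsamples (and the std), flags count as real flags again.
    n.mask = flags
    print(mean, np.sum(n, axis=0))       # (2+0j) and 1.0

This is exactly the distinction the comments draw: norm, used to normalize the mean, counts in-painted samples, while the returned nsamples does not.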
ndays_binned = np.sum((~flags).astype(int), axis=0) logger.info("Calculating mean") @@ -581,10 +616,18 @@ def lst_average( lstbin_flagged[ndays_binned < sigma_clip_min_N] = True normalizable = norm > 0 + meandata[normalizable] /= norm[normalizable] # Multiply by nan instead of just setting as nan, so both real and imag parts are nan meandata[~normalizable] *= np.nan + # While the previous nsamples is different for in-painted and flagged mode, whih is + # what we want for the mean, for the std and nsamples we want to treat flags as really + # flagged. + nsamples.mask = flags + norm = np.sum(nsamples, axis=0) + normalizable = norm > 0 + # get other stats logger.info("Calculating std") with warnings.catch_warnings(): @@ -598,12 +641,8 @@ def lst_average( std[~normalizable] = np.inf - # While the previous norm is correct for normalizing the mean, we now - # calculate nsamples as the unflagged samples in each LST bin. - nsamples.mask = flags - nsamples = np.sum(nsamples, axis=0) - - return meandata.data, lstbin_flagged, std.data, nsamples.data, ndays_binned + logger.info(f"Mean of meandata: {np.mean(meandata)}. Mean of std: {np.mean(std)}. Total nsamples: {np.sum(norm)}") + return meandata.data, lstbin_flagged, std.data, norm.data, ndays_binned def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: From b8d802652cabf19d150838312cbae0368a8d8d26 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 29 Nov 2023 09:50:09 +0100 Subject: [PATCH 66/81] testfix: expect nsamples=0 for totally flagged lst/freq/pol --- hera_cal/tests/test_lstbin_simple.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py index 9ed847553..a6f0b0ddb 100644 --- a/hera_cal/tests/test_lstbin_simple.py +++ b/hera_cal/tests/test_lstbin_simple.py @@ -624,13 +624,11 @@ def test_flag_below_min_N(self): ) assert np.all(flg_n) - # just because we're flagging it, doesn't mean we need to set nsamples=0 - # or the std to inf. We have info there, we're just choosing not to use it. - assert np.all(norm_n == 2) - assert not np.any(np.isinf(std_n)) + # nsamples is zero because all are flagged. + assert np.all(norm_n == 0) + assert np.all(np.isinf(std_n)) # this time, only one column is flagged too much... - # this time, there's enough samples, but too many are flagged... 
flags[:] = False flags[:5, 0] = True data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( @@ -642,8 +640,8 @@ def test_flag_below_min_N(self): ) assert np.all(flg_n[0]) assert np.all(norm_n[0] == 0) assert np.all(np.isinf(std_n[0])) assert not np.any(flg_n[1:]) assert np.all(norm_n[1:] == 7) From 7de3c49e071c22e9a2b88372d22e57319ee2c9e6 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 29 Nov 2023 09:53:41 +0100 Subject: [PATCH 67/81] maint: typo in comment --- hera_cal/lstbin_simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index 634450602..b4f502e3d 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -621,7 +621,7 @@ def lst_average( # Multiply by nan instead of just setting as nan, so both real and imag parts are nan meandata[~normalizable] *= np.nan - # While the previous nsamples is different for in-painted and flagged mode, whih is + # While the previous nsamples is different for in-painted and flagged mode, which is # what we want for the mean, for the std and nsamples we want to treat flags as really # flagged. nsamples.mask = flags From e8f27666a8014e504de2c5e9dcdbb97f3151d791 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Thu, 30 Nov 2023 17:37:47 +0100 Subject: [PATCH 68/81] fix: only flag lstbin if all are flagged and NOT inpainted --- hera_cal/lstbin_simple.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index b4f502e3d..d5980704d 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -611,7 +611,8 @@ def lst_average( # np.sum works the same for both masked and non-masked arrays. meandata = np.sum(data * nsamples, axis=0) - lstbin_flagged = np.all(flags, axis=0) + lstbin_flagged = np.all(data.mask, axis=0) + if flag_below_min_N: lstbin_flagged[ndays_binned < sigma_clip_min_N] = True From f19c965aca6bfcddecb8c63070e00a6f3739dca1 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Thu, 30 Nov 2023 18:03:46 +0100 Subject: [PATCH 69/81] testfix: expect flags to be different between inpaint/flagged modes --- hera_cal/tests/test_lstbin_simple.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py index a6f0b0ddb..42a1f4308 100644 --- a/hera_cal/tests/test_lstbin_simple.py +++ b/hera_cal/tests/test_lstbin_simple.py @@ -533,11 +533,11 @@ def test_inpaint_mode(self): data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True ) - # The data and std in the fully-flagged bin should be different, but Nsamples - # and Flags should be the same. + # The data, flags, and std in the fully-flagged bin should be different, but + # Nsamples should be the same.
assert not np.allclose(df[0, 0, 0], di[0, 0, 0]) assert np.allclose(df[1:], di[1:]) - assert np.allclose(ff, fi) + assert not np.allclose(ff, fi) assert not np.allclose(stdf[0, 0, 0], stdi[0, 0, 0]) assert np.allclose(stdf[1:], stdi[1:]) assert np.allclose(nf, ni) From 3afe61e362ebaa397ec573f280b5e0920903d854 Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 12:44:14 -0800 Subject: [PATCH 70/81] avoid unnecessary duplication using the new build_solver option This relies on https://github.com/HERA-Team/linsolve/pull/51 --- hera_cal/redcal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index cdf127acf..3cdaa5ea7 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -832,7 +832,7 @@ def __init__(self, data, sol0, wgts={}, gain=.3, **kwargs): **kwargs: keyword arguments of constants (python variables in keys of data that are not to be solved for) which are passed to linsolve.LinProductSolver. """ - linsolve.LinProductSolver.__init__(self, data, sol0, wgts=wgts, **kwargs) + linsolve.LinProductSolver.__init__(self, data, sol0, wgts=wgts, build_solver=False, **kwargs) self.gain = np.float32(gain) # float32 to avoid accidentally promoting data to doubles. def _get_ans0(self, sol, keys=None): @@ -1887,7 +1887,7 @@ def redcal_iteration(hd, nInt_to_load=None, pol_mode='2pol', bl_error_tol=1.0, e # try to solve for gains on antennas excluded from calibration, but keep them flagged expand_omni_gains(sol, all_reds_this_pol, data, nsamples, chisq_per_ant=meta['chisq_per_ant']) - # try one more time to expand visibilities, keeping new ones flagged, but + # try one more time to expand visibilities, keeping new ones flagged, but # don't update chisq or chisq_per_ant expand_omni_vis(sol, all_reds_this_pol, data, nsamples) sol.make_sol_finite() From f22ebc2e3c67ddd35fd78ed9166668176651d0b1 Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 12:46:25 -0800 Subject: [PATCH 71/81] avoid making copies when the clamping function is lambda x: 1 by instead making the default None --- hera_cal/redcal.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index 3cdaa5ea7..bfa2b8295 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -845,7 +845,7 @@ def _get_ans0(self, sol, keys=None): return {k: eval(k, _sol) for k in keys} def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, - wgt_func=lambda x: 1., verbose=False): + wgt_func=None, verbose=False): """Repeatedly solves and updates solution until convergence or maxiter is reached. Returns a meta-data about the solution and the solution itself. @@ -859,7 +859,7 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af wgt_func: a function f(abs^2 * wgt) operating on weighted absolute differences between data and model that returns an additional data weighting to apply to when calculating chisq and updating parameters. Example: lambda x: np.where(x>0, 5*np.tanh(x/5)/x, 1) - clamps deviations to 5 sigma. Default is no additional weighting (lambda x: 1.). + clamps deviations to 5 sigma. Default None is no additional weighting. 
Returns: meta, sol meta: a dictionary with metadata about the solution, including @@ -873,7 +873,7 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af for term in self.all_terms for (gi, gj, uij) in term] dmdl_u = self._get_ans0(sol) abs2_u = {k: np.abs(self.data[k] - dmdl_u[k])**2 * self.wgts[k] for k in self.keys} - chisq = sum([v * wgt_func(v) for v in abs2_u.values()]) + chisq = sum([(v if wgt_func is None else v * wgt_func(v)) for v in abs2_u.values()]) update = np.where(chisq > 0) abs2_u = {k: v[update] for k, v in abs2_u.items()} # variables with '_u' are flattened and only include pixels that need updating @@ -884,7 +884,7 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af # clamp_wgts_u adds additional sigma clamping done by wgt_func. # abs2_u holds abs(data - mdl)**2 * wgt (i.e. noise-weighted deviations), which is # passed to wgt_func to determine any additional weighting (to, e.g., clamp outliers). - clamp_wgts_u = {k: v * wgt_func(abs2_u[k]) for k, v in wgts_u.items()} + clamp_wgts_u = (wgts_u if wgt_func is None else {k: v * wgt_func(abs2_u[k]) for k, v in wgts_u.items()}) sol_u = {k: v[update].flatten() for k, v in sol.items()} iters = np.zeros(chisq.shape, dtype=int) conv = np.ones_like(chisq) @@ -920,7 +920,7 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af else: # Slow branch when we compute convergence/chisq abs2_u = {k: np.abs(v[update] - dmdl_u[k])**2 * wgts_u[k] for k, v in self.data.items()} - new_chisq_u = sum([v * wgt_func(v) for v in abs2_u.values()]) + new_chisq_u = sum([(v if wgt_func is None else v * wgt_func(v)) for v in abs2_u.values()]) chisq_u = chisq[update] gotbetter_u = (chisq_u > new_chisq_u) where_gotbetter_u = np.where(gotbetter_u) @@ -1289,7 +1289,7 @@ def lincal(self, data, sol0, wgts={}, sparse=False, mode='default', conv_crit=1e sol = RedSol(self.reds, sol_dict=prms) return meta, sol - def omnical(self, data, sol0, wgts={}, gain=.3, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, wgt_func=lambda x: 1.): + def omnical(self, data, sol0, wgts={}, gain=.3, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, wgt_func=None): """Use the Liu et al 2010 Omnical algorithm to linearize equations and iteratively minimize chi^2. Args: @@ -1307,7 +1307,7 @@ def omnical(self, data, sol0, wgts={}, gain=.3, conv_crit=1e-10, maxiter=50, che wgt_func: a function f(abs^2 * wgt) operating on weighted absolute differences between data and model that returns an additional data weighting to apply to when calculating chisq and updating parameters. Example: lambda x: np.where(x>0, 5*np.tanh(x/5)/x, 1) - clamps deviations to 5 sigma. Default is no additional weighting (lambda x: 1.). + clamps deviations to 5 sigma. Default None is no additional weighting. 
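For concreteness, here is a hedged sketch of a clamping wgt_func of the kind this docstring describes. It is equivalent to the 5-sigma tanh lambda given above, just written to avoid the 0/0 warning at x = 0; the sample values are purely illustrative:

    import numpy as np

    def clamp_5sigma(x):
        # x holds |data - model|^2 * wgt; leave small residuals alone and
        # smoothly down-weight anything beyond ~5 sigma.
        out = np.ones_like(x)
        np.divide(5 * np.tanh(x / 5), x, out=out, where=x > 0)
        return out

    x = np.array([0.0, 1.0, 25.0, 100.0])
    print(clamp_5sigma(x))  # ~[1.0, 0.987, 0.200, 0.050]

Passed as wgt_func=clamp_5sigma, this reproduces the behavior of the docstring's lambda while keeping the default (None) path free of extra copies, which is the point of this patch.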
Returns: meta: dictionary of information about the convergence and chi^2 of the solution From f0496e955ade6159b1fd3a8c55f52263b9897f9d Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 12:47:18 -0800 Subject: [PATCH 72/81] allow in-place updating for _get_ans0() --- hera_cal/redcal.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index bfa2b8295..5efe560ef 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -835,14 +835,19 @@ def __init__(self, data, sol0, wgts={}, gain=.3, **kwargs): linsolve.LinProductSolver.__init__(self, data, sol0, wgts=wgts, build_solver=False, **kwargs) self.gain = np.float32(gain) # float32 to avoid accidentally promoting data to doubles. - def _get_ans0(self, sol, keys=None): + def _get_ans0(self, sol, keys=None, to_update_inplace=None): '''Evaluate the system of equations given input sol. Specify keys to evaluate only a subset of the equations.''' if keys is None: keys = self.keys _sol = {k + '_': v.conj() for k, v in sol.items() if k.startswith('g')} _sol.update(sol) - return {k: eval(k, _sol) for k in keys} + if to_update_inplace is None: + return {k: eval(k, _sol) for k in keys} + else: + # update in place to save memory + for k in keys: + to_update_inplace[k] = eval(k, _sol) def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, wgt_func=None, verbose=False): @@ -912,7 +917,7 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af sol_sum_u[uij] += numerator new_sol_u = {k: v * ((1 - self.gain) + self.gain * sol_sum_u[k] / sol_wgt_u[k]) for k, v in sol_u.items()} - dmdl_u = self._get_ans0(new_sol_u) + self._get_ans0(new_sol_u, to_update_inplace=dmdl_u) # check if i % check_every is 0, which is purposely one less than the '1' up at the top of the loop if i < maxiter and (i < check_after or (i % check_every) != 0): # Fast branch when we aren't expensively computing convergence/chisq From 36934b6617c432d1e3567904ab1425ebee050fbf Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 12:48:21 -0800 Subject: [PATCH 73/81] update with loops rather than list comprehension This allows re-writing dictionaries one element at a time rather than only after the whole new dictionary is generated --- hera_cal/redcal.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index 5efe560ef..c0f1472d4 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -880,9 +880,11 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af abs2_u = {k: np.abs(self.data[k] - dmdl_u[k])**2 * self.wgts[k] for k in self.keys} chisq = sum([(v if wgt_func is None else v * wgt_func(v)) for v in abs2_u.values()]) update = np.where(chisq > 0) - abs2_u = {k: v[update] for k, v in abs2_u.items()} + for k, v in abs2_u.items(): + abs2_u[k] = v[update] # variables with '_u' are flattened and only include pixels that need updating - dmdl_u = {k: v[update].flatten() for k, v in dmdl_u.items()} + for k, v in dmdl_u.items(): + dmdl_u[k] = v[update].flatten() # wgts_u hold the wgts the user provides wgts_u = {k: (v * np.ones(chisq.shape, dtype=np.float32))[update].flatten() for k, v in self.wgts.items()} @@ -924,7 +926,8 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af sol_u = new_sol_u else: # Slow branch when we compute convergence/chisq - abs2_u = {k: np.abs(v[update] - dmdl_u[k])**2 * wgts_u[k] for 
k, v in self.data.items()} + for k, v in self.data.items(): + abs2_u[k] = np.abs(v[update] - dmdl_u[k])**2 * wgts_u[k] new_chisq_u = sum([(v if wgt_func is None else v * wgt_func(v)) for v in abs2_u.values()]) chisq_u = chisq[update] gotbetter_u = (chisq_u > new_chisq_u) @@ -932,7 +935,8 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af update_where = tuple(u[where_gotbetter_u] for u in update) chisq[update_where] = new_chisq_u[where_gotbetter_u] iters[update_where] = i - new_sol_u = {k: np.where(gotbetter_u, v, sol_u[k]) for k, v in new_sol_u.items()} + for k, v in new_sol_u.items(): + new_sol_u[k] = np.where(gotbetter_u, v, sol_u[k]) deltas_u = [v - sol_u[k] for k, v in new_sol_u.items()] conv_u = np.sqrt(sum([(v * v.conj()).real for v in deltas_u]) / sum([(v * v.conj()).real for v in new_sol_u.values()])) @@ -943,11 +947,16 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af if update_u[0].size == 0 or i == maxiter: meta = {'iter': iters, 'chisq': chisq, 'conv_crit': conv} return meta, sol - dmdl_u = {k: v[update_u] for k, v in dmdl_u.items()} - wgts_u = {k: v[update_u] for k, v in wgts_u.items()} - sol_u = {k: v[update_u] for k, v in new_sol_u.items()} - abs2_u = {k: v[update_u] for k, v in abs2_u.items()} - clamp_wgts_u = {k: v * wgt_func(abs2_u[k]) for k, v in wgts_u.items()} + for k, v in dmdl_u.items(): + dmdl_u[k] = v[update_u] + for k, v in wgts_u.items(): + wgts_u[k] = v[update_u] + for k, v in wgts_u.items(): + clamp_wgts_u[k] = (v if wgt_func is None else v * wgt_func(abs2_u[k])) + for k, v in sol_u.items(): + sol_u[k] = v[update_u] + for k, v in abs2_u.items(): + abs2_u[k] = v[update_u] update = tuple(u[update_u] for u in update) if verbose: print(' = %f, = %f, CNT = %d', (np.mean(chisq), np.mean(conv), update[0].size)) From 5dc69a3b876785708bb637229b0b39aa5b939c8f Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 12:49:53 -0800 Subject: [PATCH 74/81] avoid creating new memory objects when not doing detrend_phs --- hera_cal/redcal.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index c0f1472d4..9ad0c7ac9 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -1126,14 +1126,18 @@ def _solver(self, solver, data, wgts={}, detrend_phs=False, **kwargs): dc = DataContainer(data) eqs = self.build_eqs(dc) self.phs_avg = {} # detrend phases within redundant group, used for logcal to avoid phase wraps + d_ls, w_ls = {}, {} if detrend_phs: for grp in self.reds: self.phs_avg[grp[0]] = np.exp(-np.complex64(1j) * np.median(np.unwrap([np.log(dc[bl]).imag for bl in grp], axis=0), axis=0)) for bl in grp: self.phs_avg[bl] = self.phs_avg[grp[0]].astype(dc[bl].dtype) - d_ls, w_ls = {}, {} - for eq, key in eqs.items(): - d_ls[eq] = dc[key] * self.phs_avg.get(key, np.float32(1)) + for eq, key in eqs.items(): + # this makes a copy, which prevents modification of the data when detrending phase + d_ls[eq] = dc[key] * self.phs_avg.get(key, np.float32(1)) + else: + for eq, key in eqs.items(): + d_ls[eq] = dc[key] if len(wgts) > 0: wc = DataContainer(wgts) for eq, key in eqs.items(): From 810d49254737a00ea4488ece1b3a2d7cde5f0150 Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 12:50:57 -0800 Subject: [PATCH 75/81] avoid calculating the entirety of dwgts_u by doing more calculations inside the loop This works because self.keys is the same as self.data.keys() --- hera_cal/redcal.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6
deletions(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index 9ad0c7ac9..194c180f4 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -902,14 +902,14 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af # compute data wgts: dwgts = sum(V_mdl^2 / n^2) = sum(V_mdl^2 * wgts) # don't need to update data weighting with every iteration # clamped weighting is passed to dwgts_u, which is used to update parameters - dwgts_u = {k: dmdl_u[k] * dmdl_u[k].conj() * clamp_wgts_u[k] for k in self.keys} sol_wgt_u = {k: 0 for k in sol.keys()} + dw_u = {} for k, (gi, gj, uij) in zip(self.keys, terms): - w = dwgts_u[k] - sol_wgt_u[gi] += w - sol_wgt_u[gj] += w - sol_wgt_u[uij] += w - dw_u = {k: v[update] * dwgts_u[k] for k, v in self.data.items()} + dwgts_u = dmdl_u[k] * dmdl_u[k].conj() * clamp_wgts_u[k] + sol_wgt_u[gi] += dwgts_u + sol_wgt_u[gj] += dwgts_u + sol_wgt_u[uij] += dwgts_u + dw_u[k] = self.data[k][update] * dwgts_u sol_sum_u = {k: 0 for k in sol_u.keys()} for k, (gi, gj, uij) in zip(self.keys, terms): # compute sum(wgts * V_meas / V_mdl) From e8da5b32f985a20fbab8415ebc6db1dd2780223b Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 13:21:37 -0800 Subject: [PATCH 76/81] expand comment on _get_ans0 --- hera_cal/redcal.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index 194c180f4..a027f5238 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -837,7 +837,9 @@ def __init__(self, data, sol0, wgts={}, gain=.3, **kwargs): def _get_ans0(self, sol, keys=None, to_update_inplace=None): '''Evaluate the system of equations given input sol. - Specify keys to evaluate only a subset of the equations.''' + Specify keys to evaluate only a subset of the equations. 
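A toy version of what this eval-based evaluation does (the keys g0, g1_, and u01 are hypothetical linsolve-style equation terms, not taken from real data):

    import numpy as np

    sol = {"g0": np.array([1 + 1j]), "g1": np.array([2 - 1j]), "u01": np.array([0.5 + 0j])}

    # Mirror _get_ans0: add conjugated gain entries under trailing-underscore names.
    _sol = {k + "_": v.conj() for k, v in sol.items() if k.startswith("g")}
    _sol.update(sol)

    eq = "g0 * g1_ * u01"  # model visibility V_01 = g0 * conj(g1) * u01
    print(eval(eq, {}, _sol))  # [0.5+1.5j]

With to_update_inplace, the same eval results are written into an existing dictionary instead of allocating a new one, which is what saves memory here.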
+ to_update_inplace is a dictionary mapping keys or self.keys() + to numpy arrays that we want to overwrite with ans0 values.''' if keys is None: keys = self.keys _sol = {k + '_': v.conj() for k, v in sol.items() if k.startswith('g')} From a9e1297f23b43ba4f2b217f12af76f8b40fb4f79 Mon Sep 17 00:00:00 2001 From: Josh Dillon Date: Tue, 5 Dec 2023 13:56:08 -0800 Subject: [PATCH 77/81] fix order of overwrites --- hera_cal/redcal.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index a027f5238..c63b9a5a1 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -953,12 +953,12 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af dmdl_u[k] = v[update_u] for k, v in wgts_u.items(): wgts_u[k] = v[update_u] - for k, v in wgts_u.items(): - clamp_wgts_u[k] = (v if wgt_func is None else v * wgt_func(abs2_u[k])) - for k, v in sol_u.items(): - sol_u[k] = v[update_u] for k, v in abs2_u.items(): abs2_u[k] = v[update_u] + for k, v in sol_u.items(): + sol_u[k] = v[update_u] + for k, v in wgts_u.items(): + clamp_wgts_u[k] = (v if wgt_func is None else v * wgt_func(abs2_u[k])) update = tuple(u[update_u] for u in update) if verbose: print(' = %f, = %f, CNT = %d', (np.mean(chisq), np.mean(conv), update[0].size)) From c2f2c41ce455569a7d47d944f3bd6ffe42c83c90 Mon Sep 17 00:00:00 2001 From: Bryna Hazelton Date: Mon, 4 Dec 2023 19:14:18 -0800 Subject: [PATCH 78/81] Make error catching work with updated pyuvdata errors --- hera_cal/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index 8413dd78c..9c7cd1596 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -1832,9 +1832,9 @@ def partial_time_io(hd, times=None, time_range=None, lsts=None, lst_range=None, return_data=False, **kwargs) except ValueError as err: # check to see if the read failed because of the time range or lst range - if 'No elements in time range between ' in str(err): + if 'No elements in time range between ' in str(err) or 'No elements in time_array between ' in str(err): continue # no matching times, skip this file - elif 'No elements in LST range between ' in str(err): + elif 'No elements in LST range between ' in str(err) or 'No elements in lst_array between ' in str(err): continue # no matchings lsts, skip this file else: raise From 96dccf3ba17cfa17e81d7664591d247d4e0c9d59 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 6 Dec 2023 18:56:37 +0100 Subject: [PATCH 79/81] fix: enforce that ant numbers are uint64 --- hera_cal/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index 9c7cd1596..6f4a6dfac 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -2202,7 +2202,7 @@ def write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags flag_array = flag_array.reshape(Nblts, 1, Nfreqs, Npols) # configure baselines - antpairs = np.repeat(np.array(antpairs), Ntimes, axis=0) + antpairs = np.repeat(np.array(antpairs, dtype=np.uint64), Ntimes, axis=0) # get ant_1_array, ant_2_array ant_1_array = antpairs[:, 0] From 58154c36ab64a2c6b52eecf1404084dc9eddfffb Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 6 Dec 2023 19:14:16 +0100 Subject: [PATCH 80/81] fix: raise an error instead of warning --- hera_cal/io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index 6f4a6dfac..e9e8fad78 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -2202,7 +2202,9 @@ def 
write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags flag_array = flag_array.reshape(Nblts, 1, Nfreqs, Npols) # configure baselines - antpairs = np.repeat(np.array(antpairs, dtype=np.uint64), Ntimes, axis=0) + with warnings.catch_warnings(): + warnings.simplefilter("error") + antpairs = np.repeat(np.array(antpairs, dtype=np.uint64), Ntimes, axis=0) # get ant_1_array, ant_2_array ant_1_array = antpairs[:, 0] From f1ea8172820c69820aaceb8e49b15f373150e925 Mon Sep 17 00:00:00 2001 From: Steven Murray Date: Wed, 6 Dec 2023 19:30:25 +0100 Subject: [PATCH 81/81] fix: actually raise error --- hera_cal/io.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hera_cal/io.py b/hera_cal/io.py index e9e8fad78..3adf9fcd3 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -2202,13 +2202,15 @@ def write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags flag_array = flag_array.reshape(Nblts, 1, Nfreqs, Npols) # configure baselines - with warnings.catch_warnings(): - warnings.simplefilter("error") - antpairs = np.repeat(np.array(antpairs, dtype=np.uint64), Ntimes, axis=0) + antpairs = np.repeat(np.array(antpairs), Ntimes, axis=0) + + antpairs_int = antpairs.astype(np.uint64) + if not np.allclose(antpairs, antpairs_int): + raise ValueError("antenna numbers must be non-negative integers") # get ant_1_array, ant_2_array - ant_1_array = antpairs[:, 0] - ant_2_array = antpairs[:, 1] + ant_1_array = antpairs_int[:, 0] + ant_2_array = antpairs_int[:, 1] # get baseline array baseline_array = 2048 * (ant_1_array + 1) + (ant_2_array + 1) + 2**16
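As a sanity check on the baseline-number convention in this final hunk, a standalone sketch (the decoder below is inferred by inverting the formula and is an assumption, not a hera_cal function):

    import numpy as np

    def encode_bl(ant1, ant2):
        # The encoding used above: 2048-based antenna-pair numbering plus a 2**16 offset.
        return 2048 * (ant1 + 1) + (ant2 + 1) + 2**16

    def decode_bl(bl):
        # Hypothetical inverse, valid while antenna numbers stay below 2047.
        bl = bl - 2**16
        return bl // 2048 - 1, bl % 2048 - 1

    a1 = np.array([0, 53], dtype=np.uint64)
    a2 = np.array([1, 54], dtype=np.uint64)
    bls = encode_bl(a1, a2)
    d1, d2 = decode_bl(bls)
    assert np.array_equal(d1, a1) and np.array_equal(d2, a2)

Casting antpairs to np.uint64 before this encoding, as the patch does, keeps the arithmetic exact and makes negative or non-integer antenna numbers fail loudly rather than silently wrapping.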