diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3c24fd7d1..b2bcf0e9b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: PYTHON: ${{ matrix.python-version }} OS: ${{ matrix.os }} runs-on: ${{ matrix.os }} - + strategy: matrix: os: [ubuntu-latest, macos-latest] @@ -40,7 +40,12 @@ jobs: run: | pytest -n auto --pyargs hera_cal --cov=hera_cal --cov-config=./.coveragerc --cov-report xml:./coverage.xml --durations=15 - - name: Upload Coverage (Ubuntu) + - name: Upload coverage report if: matrix.os == 'ubuntu-latest' && success() - run: | - bash <(curl -s https://codecov.io/bash) -t ${{ secrets.CODECOV_TOKEN }} + uses: codecov/codecov-action@v3.1.3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./coverage.xml + flags: unittests + name: codecov-umbrella + fail_ci_if_error: true diff --git a/hera_cal/data/example_filter_params.yaml b/hera_cal/data/example_filter_params.yaml new file mode 100644 index 000000000..af52605e0 --- /dev/null +++ b/hera_cal/data/example_filter_params.yaml @@ -0,0 +1,6 @@ +filter_centers: + (0, 1): 0.1234 + (0, 2): 0.173 +filter_half_widths: + (0, 1): 0.05 + (0, 2): 0.08 diff --git a/hera_cal/datacontainer.py b/hera_cal/datacontainer.py index 7a99f1771..17b435116 100644 --- a/hera_cal/datacontainer.py +++ b/hera_cal/datacontainer.py @@ -5,11 +5,13 @@ import numpy as np from collections import OrderedDict as odict import copy +import warnings from typing import Sequence from .utils import conj_pol, comply_pol, make_bl, comply_bl, reverse_bl from .red_groups import RedundantGroups, Baseline, AntPair + class DataContainer: """Dictionary-like object that abstracts away the pol/ant pair ordering of data dictionaries and the the polarization case (i.e. 'nn' vs. 'NN'). Keys are in @@ -68,8 +70,7 @@ def __init__(self, data): setattr(self, attr, getattr(data, attr)) else: setattr(self, attr, None) - - + @property def dtype(self): """The dtype of the underlying data.""" @@ -81,7 +82,6 @@ def dtype(self): else: return None - def antpairs(self, pol=None): '''Return a set of antenna pairs (with a specific pol or more generally).''' if pol is None: @@ -185,7 +185,7 @@ def __delitem__(self, key): @property def shape(self) -> tuple[int]: return self[next(iter(self.keys()))].shape - + def concatenate(self, D, axis=0): '''Concatenates D, a DataContainer or a list of DCs, with self along an axis''' # check type of D @@ -514,18 +514,73 @@ def select_or_expand_times(self, new_times, in_place=True, skip_bda_check=False) if not in_place: return dc + def select_freqs( + self, + freqs: np.ndarray | None = None, + channels: np.ndarray | slice | None = None, + in_place: bool = True + ): + """Update the object with a subset of frequencies (which may be repeated). + + While typically this will be used to down-select frequencies, one can + 'expand' the frequencies by duplicating channels. + + Parameters + ---------- + freqs : np.ndarray, optional + Frequencies to select. If given, all frequencies must be in the datacontainer. + channels : np.ndarray, slice, optional + Channels to select. If given, all channels must be in the datacontainer. + Only one of freqs or channels can be given. + in_place : bool, optional + If True, modify the object in place. Otherwise, return a modified copy. + Even if `in_place` is True, the object is still returned for convenience. + + Returns + ------- + DataContainer + The modified object. If `in_place` is True, this is the same object. 
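Examples
--------
A minimal usage sketch (assuming ``dc`` is a DataContainer whose ``freqs``
attribute is set; the channel indices are purely illustrative):

>>> sub = dc.select_freqs(freqs=dc.freqs[10:20], in_place=False)
>>> dc.select_freqs(channels=np.arange(10, 20))  # same selection, done in place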
+ """ + obj = self if in_place else copy.deepcopy(self) + if freqs is None and channels is None: + return obj + elif freqs is not None and channels is not None: + raise ValueError('Cannot specify both freqs and channels.') + + if freqs is not None: + if obj.freqs is None: + raise ValueError('Cannot select frequencies if self.freqs is None.') + + if not np.all([fq in obj.freqs for fq in freqs]): + raise ValueError('All freqs must be in self.freqs.') + channels = np.searchsorted(obj.freqs, freqs) + + if obj.freqs is None: + warnings.warn("It is impossible to automatically detect which axis is frequency. Trying last axis.") + axis = -1 + else: + axis = obj[next(iter(obj.keys()))].shape.index(len(obj.freqs)) + for bl in obj: + obj[bl] = obj[bl].take(channels, axis=axis) + + # update metadata + if obj.freqs is not None: + obj.freqs = obj.freqs[channels] + + return obj + class RedDataContainer(DataContainer): '''Structure for containing redundant visibilities that can be accessed by any one of the redundant baseline keys (or their conjugate).''' def __init__( - self, - data: DataContainer | dict[Baseline, np.ndarray], - reds: RedundantGroups | Sequence[Sequence[Baseline | AntPair]] | None=None, - antpos: dict[int, np.ndarray] | None=None, - bl_error_tol: float=1.0 - ): + self, + data: DataContainer | dict[Baseline, np.ndarray], + reds: RedundantGroups | Sequence[Sequence[Baseline | AntPair]] | None = None, + antpos: dict[int, np.ndarray] | None = None, + bl_error_tol: float = 1.0 + ): '''Creates a RedDataContainer. Parameters @@ -533,7 +588,7 @@ def __init__( data : DataContainer or dictionary of visibilities, just as one would pass into DataContainer(). Will error if multiple baselines are part of the same redundant group. reds : :class:`RedundantGroups` object, or list of lists of redundant baseline tuples, e.g. (ind1, ind2, pol). - These are the redundant groups of baselines. If not provided, will try to + These are the redundant groups of baselines. If not provided, will try to infer them from antpos. antpos: dictionary of antenna positions in the form {ant_index: np.array([x, y, z])}. Will error if one tries to provide both reds and antpos. If neither is provided, @@ -545,10 +600,10 @@ def __init__( Attributes ---------- - reds - A :class:`RedundantGroups` object that contains the redundant groups for - the entire array, and methods to manipulate them. - + reds + A :class:`RedundantGroups` object that contains the redundant groups for + the entire array, and methods to manipulate them. + ''' if reds is not None and antpos is not None: raise ValueError('Can only provide reds or antpos, not both.') @@ -568,25 +623,24 @@ def __init__( ) else: raise ValueError('Must provide reds, antpos, or have antpos available at data.antpos') - + if not isinstance(reds, RedundantGroups): reds = RedundantGroups(red_list=reds, antpos=self.antpos) self.build_red_keys(reds) - def build_red_keys(self, reds: RedundantGroups | list[list[Baseline]]): '''Build the dictionaries that map baselines to redundant keys. Arguments: reds: list of lists of redundant baseline tuples, e.g. (ind1, ind2, pol). 
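For example (a hedged sketch with illustrative antenna numbers), after calling

    rdc.build_red_keys([[(0, 1, 'ee'), (1, 2, 'ee')], [(0, 2, 'ee')]])

data stored under (0, 1, 'ee') can also be retrieved with the key (1, 2, 'ee'),
since both baselines belong to the same redundant group.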
''' - + if isinstance(reds, RedundantGroups): self.reds = reds else: self.reds = RedundantGroups(red_list=reds, antpos=getattr(self, 'antpos', None)) - + self._reds_keyed_on_data = self.reds.keyed_on_bls(bls=self.bls()) # delete unused data to avoid leaking memory @@ -600,24 +654,23 @@ def build_red_keys(self, reds: RedundantGroups | list[list[Baseline]]): raise ValueError( 'RedDataContainer can only be constructed with (at most) one baseline per group, ' f'but {bl} is redundant with {redkeys[ubl]}.' - ) + ) else: redkeys[ubl] = bl - def get_ubl_key(self, bl): '''Returns the blkey used to internally denote the data stored. - + If this bl is in a redundant group present in the data, this will return the blkey that exists in the data. Otherwise, it will return the array-wide blkey representing this group. ''' return self._reds_keyed_on_data.get_ubl_key(bl) - + def get_red(self, key): '''Returns the list of baselines in the array redundant with this key. - - Note: this is not just baselines existing in the data itself, but in the + + Note: this is not just baselines existing in the data itself, but in the entire array. ''' return self.reds[key] @@ -642,8 +695,6 @@ def __setitem__(self, key, value): super().__setitem__(ubl_key, value) - def __contains__(self, key): '''Returns true if the baseline redundant with the key is in the data.''' return (key in self.reds) and (super().__contains__(self.get_ubl_key(key))) - \ No newline at end of file diff --git a/hera_cal/frf.py b/hera_cal/frf.py index e157134e7..fd34940d6 100644 --- a/hera_cal/frf.py +++ b/hera_cal/frf.py @@ -27,6 +27,9 @@ import astropy.constants as const from . import redcal from .utils import echo +import os +import yaml +import re SPEED_OF_LIGHT = const.c.si.value SDAY_SEC = units.sday.to("s") @@ -1560,36 +1563,50 @@ def tophat_frfilter_argparser(mode='clean'): filt_options.add_argument("--blacklist_wgt", type=float, default=0.0, help="Relative weight to assign to blacklisted lsts compared to 1.0. Default 0.0 \ means no weight. Note that 0.0 will create problems for DPSS at edge times and frequencies.") - desc = ("Filtering case ['max_frate_coeffs', 'uvbeam', 'sky']", + desc = ("Filtering case ['max_frate_coeffs', 'uvbeam', 'sky', 'param_file']", "If case == 'max_frate_coeffs', then determine fringe rate centers", "and half-widths based on the max_frate_coeffs arg (see below).", "If case == 'uvbeam', then determine fringe rate centers and half widths", "from histogram of main-beam wrt instantaneous sky fringe rates.", "If case == 'sky': then use fringe-rates corresponding to range of ", - "instantanous fringe-rates that include sky emission.") + "instantanous fringe-rates that include sky emission.", + "If case == 'param_file': then use a provided parameter file to determine", + "filter centers and filter half-widths. See param_file help for more ", + "information regarding parameter file structure.") filt_options.add_argument("--case", default="sky", help=' '.join(desc), type=str) desc = ("Number interleaved time subsets to split the data into ", "and apply independent fringe-rate filters. Default is 1 (no interleaved filters).", "This does not change the format of the output files but it does change the nature of their content.") filt_options.add_argument("--ninterleave", default=1, type=int, help=desc) + desc = ("File containing filter parameters. Parameter file must be yaml-readable ", + "and contain two entries: filter_centers and filter_half_widths. 
Each of ", + "these entries must be dictionaries whose keys are strings of antenna ", + "pairs and whose values are floats. Filter parameters are assumed to ", + "correspond to a particular range of frequencies (chosen when making the ", + "filter parameter file) and depend on baseline but not polarization (i.e., ", + "the 'ee' and 'nn' polarizations will have the same filter for a given baseline.") + filt_options.add_argument("--param_file", default="", type=str, help=desc) + return ap -def load_tophat_frfilter_and_write(datafile_list, case, baseline_list=None, calfile_list=None, - Nbls_per_load=None, spw_range=None, external_flags=None, - factorize_flags=False, time_thresh=0.05, wgt_by_nsample=False, - lst_blacklists=None, blacklist_wgt=0.0, - res_outfilename=None, CLEAN_outfilename=None, filled_outfilename=None, - clobber=False, add_to_history='', avg_red_bllens=False, polarizations=None, - overwrite_flags=False, - flag_yaml=None, skip_autos=False, beamfitsfile=None, verbose=False, - read_axis=None, - percentile_low=5., percentile_high=95., - frate_standoff=0.0, frate_width_multiplier=1.0, - min_frate_half_width=0.025, max_frate_half_width=np.inf, - max_frate_coeffs=None, fr_freq_skip=1, ninterleave=1, - **filter_kwargs): +def load_tophat_frfilter_and_write( + datafile_list, case, baseline_list=None, calfile_list=None, + Nbls_per_load=None, spw_range=None, external_flags=None, + factorize_flags=False, time_thresh=0.05, wgt_by_nsample=False, + lst_blacklists=None, blacklist_wgt=0.0, + res_outfilename=None, CLEAN_outfilename=None, filled_outfilename=None, + clobber=False, add_to_history='', avg_red_bllens=False, polarizations=None, + overwrite_flags=False, + flag_yaml=None, skip_autos=False, beamfitsfile=None, verbose=False, + read_axis=None, + percentile_low=5., percentile_high=95., + frate_standoff=0.0, frate_width_multiplier=1.0, + min_frate_half_width=0.025, max_frate_half_width=np.inf, + max_frate_coeffs=None, fr_freq_skip=1, ninterleave=1, param_file="", + **filter_kwargs +): ''' A tophat fr-filtering method that only simultaneously loads and writes user-provided list of baselines. This is to support parallelization over baseline (rather then time) if baseline_list is specified. @@ -1672,143 +1689,274 @@ def load_tophat_frfilter_and_write(datafile_list, case, baseline_list=None, calf only used if case == 'uvbeam' ninterleave: int, optional Number of interleaved sets to run time filtering on. + param_file: str, optional + File containing filter parameters (e.g., centers and half-widths). + The file must be readable by yaml, with a "filter_centers" entry and + a "filter_half_widths" entry. Each of these entries should correspond + to dictionaries with antenna pair strings as keys and floating point + numbers as values. The filter centers and filter half widths are assumed + to be provided in mHz. When writing filter parameters to a file, ensure + that the antenna pair keys have been converted to strings prior to + dumping the contents to the file. See the file example_filter_params.yaml + for an example. filter_kwargs: additional keyword arguments to be passed to FRFilter.tophat_frfilter() ''' if baseline_list is not None and Nbls_per_load is not None: raise NotImplementedError("baseline loading and partial i/o not yet implemented.") - hd = io.HERAData(datafile_list, filetype='uvh5') if baseline_list is not None and len(baseline_list) == 0: - warnings.warn("Length of baseline list is zero." 
- "This can happen under normal circumstances when there are more files in datafile_list then baselines." - "in your dataset. Exiting without writing any output.", RuntimeWarning) + warnings.warn( + "Length of baseline list is zero. This can happen under normal " + "circumstances when there are more files in datafile_list then baselines." + "in your dataset. Exiting without writing any output.", RuntimeWarning + ) + return + + if case == "param_file" and not os.path.exists(param_file): + raise ValueError( + "When using a filter parameter file, a valid parameter file must " + f"be provided. The provided file {param_file} could not be found." + ) + + hd = io.HERAData(datafile_list, filetype='uvh5') + # Figure out which baselines to load if not provided. + if baseline_list is None: + if len(hd.filepaths) > 1: + baseline_list = list(hd.antpairs.values())[0] + else: + baseline_list = hd.antpairs + + # Figure out which frequencies to load. + if spw_range is None: + spw_range = [0, hd.Nfreqs] + freqs = hd.freq_array.flatten()[spw_range[0]:spw_range[1]] + + # Figure out which antennas to load if calfiles are provided. + baseline_antennas = [] + for blpolpair in baseline_list: + baseline_antennas += list(blpolpair[:2]) + baseline_antennas = np.unique(baseline_antennas).astype(int) + + # Read calibration solutions if provided. + if calfile_list is not None: + cals = io.HERACal(calfile_list) + cals.read(antenna_nums=baseline_antennas, frequencies=freqs) else: - if baseline_list is None: - if len(hd.filepaths) > 1: - baseline_list = list(hd.antpairs.values())[0] - else: - baseline_list = hd.antpairs - if spw_range is None: - spw_range = [0, hd.Nfreqs] - freqs = hd.freq_array.flatten()[spw_range[0]:spw_range[1]] - baseline_antennas = [] - for blpolpair in baseline_list: - baseline_antennas += list(blpolpair[:2]) - baseline_antennas = np.unique(baseline_antennas).astype(int) - if calfile_list is not None: - cals = io.HERACal(calfile_list) - cals.read(antenna_nums=baseline_antennas, frequencies=freqs) + cals = None + + # Figure out which polarizations to use. + if polarizations is None: + if len(hd.filepaths) > 1: + polarizations = list(hd.pols.values())[0] else: - cals = None - if polarizations is None: - if len(hd.filepaths) > 1: - polarizations = list(hd.pols.values())[0] - else: - polarizations = hd.pols - if Nbls_per_load is None: - Nbls_per_load = len(baseline_list) - for i in range(0, len(baseline_list), Nbls_per_load): - frfil = FRFilter(hd, input_cal=cals) - frfil.read(bls=baseline_list[i:i + Nbls_per_load], - frequencies=freqs, polarizations=polarizations, axis=read_axis) - if avg_red_bllens: - frfil.avg_red_baseline_vectors() - if external_flags is not None: - frfil.apply_flags(external_flags, overwrite_flags=overwrite_flags) - if flag_yaml is not None: - frfil.apply_flags(flag_yaml, overwrite_flags=overwrite_flags, filetype='yaml') - if factorize_flags: - frfil.factorize_flags(time_thresh=time_thresh, inplace=True) - keys = frfil.data.keys() - if skip_autos: - keys = [bl for bl in keys if bl[0] != bl[1]] - if beamfitsfile is not None: - uvb = UVBeam() - uvb.read_beamfits(beamfitsfile) - uvb.use_future_array_shapes() + polarizations = hd.pols + + # Load all baselines if a baseline list not provided. + if Nbls_per_load is None: + Nbls_per_load = len(baseline_list) + + # If a filter parameter file is provided, let's check that all of the + # baselines are present in the filter file. 
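    # As a concrete illustration, mirroring hera_cal/data/example_filter_params.yaml,
    # the parameter file is expected to look like:
    #
    #     filter_centers:
    #       (0, 1): 0.1234
    #       (0, 2): 0.173
    #     filter_half_widths:
    #       (0, 1): 0.05
    #       (0, 2): 0.08
    #
    # i.e. string antenna-pair keys mapping to floats in mHz, which are parsed
    # back into integer antenna-pair tuples below.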
+ if case == "param_file": + with open(param_file, "r") as f: + _filter_info = yaml.load(f.read(), Loader=yaml.SafeLoader) + + # Convert the dictionary keys into tuples. + filter_info = {param: {} for param in _filter_info.keys()} + for filter_param, info in _filter_info.items(): + for antpair_str, value in info.items(): + antpair = tuple( + int(ant) for ant in re.findall("[0-9]+", antpair_str) + ) + filter_info[filter_param][antpair] = value + + filter_antpairs = set(filter_info["filter_centers"].keys()) + have_bl_info = [] + missing_bls = set() + for bl in baseline_list: + if skip_autos and bl[0] == bl[1]: + continue + have_bl = (bl[:2] in filter_antpairs) or (bl[:2][::-1] in filter_antpairs) + have_bl_info.append(have_bl) + if not have_bl: + missing_bls.add(bl[:2]) + + if missing_bls: + missing_bls = [str(bl) for bl in missing_bls] + raise ValueError( + "Provided filter file doesn't have every baseline. The following" + "baselines could not be found: " + " ".join(missing_bls) + ) + + # Read the data in chunks and perform filtering. + for i in range(0, len(baseline_list), Nbls_per_load): + # Read data from this chunk of baselines. + frfil = FRFilter(hd, input_cal=cals) + frfil.read( + bls=baseline_list[i:i + Nbls_per_load], + frequencies=freqs, + polarizations=polarizations, + axis=read_axis, + ) + + # Some extra handling if requested. + if avg_red_bllens: + frfil.avg_red_baseline_vectors() + if external_flags is not None: + frfil.apply_flags(external_flags, overwrite_flags=overwrite_flags) + if flag_yaml is not None: + frfil.apply_flags(flag_yaml, overwrite_flags=overwrite_flags, filetype='yaml') + if factorize_flags: + frfil.factorize_flags(time_thresh=time_thresh, inplace=True) + + # Figure out which baselines we'll need for filtering. + keys = frfil.data.keys() + if skip_autos: + keys = [bl for bl in keys if bl[0] != bl[1]] + + # Read in the beam file if provided. + if beamfitsfile is not None: + uvb = UVBeam() + uvb.read_beamfits(beamfitsfile) + uvb.use_future_array_shapes() + else: + uvb = None + + # Filter the data if there is data to filter. + if len(keys) > 0: + # Deal with interleaved sets + frfil._deinterleave_data_in_time('data', ninterleave=ninterleave) + frfil._deinterleave_data_in_time( + 'flags', ninterleave=ninterleave, set_time_sets=False + ) + frfil._deinterleave_data_in_time( + 'nsamples', ninterleave=ninterleave, set_time_sets=False + ) + + # Figure out fringe-rate centers and half-widths. + if case == "param_file": + frate_centers = {} + frate_half_widths = {} + # Assuming we use the same filters for all polarizations. + for key in keys: + ai, aj = key[:2] + if (ai, aj) in filter_antpairs: + frate_centers[key] = filter_info["filter_centers"][(ai,aj)] + frate_half_widths[key] = filter_info["filter_half_widths"][(ai,aj)] + else: + # We've already enforced that all the data baselines + # are in the filter file, so we should be safe here. + frate_centers[key] = -filter_info["filter_centers"][(aj,ai)] + frate_half_widths[key] = filter_info["filter_half_widths"][(aj,ai)] else: - uvb = None - if len(keys) > 0: - # Deal with interleaved sets - frfil._deinterleave_data_in_time('data', ninterleave=ninterleave) - frfil._deinterleave_data_in_time('flags', ninterleave=ninterleave, set_time_sets=False) - frfil._deinterleave_data_in_time('nsamples', ninterleave=ninterleave, set_time_sets=False) - # figure out frige rate centers and half-widths - assert case in ['sky', 'max_frate_coeffs', 'uvbeam'], f'case={case} is not valid.' - # use conservative nfr (lowest resolution set). 
+ # Otherwise, we need to compute the filter parameters. + if case not in ("sky", "max_frate_coeffs", "uvbeam"): + raise ValueError(f"case={case} is not valid") + + # Use conservative nfr (lowest resolution set). nfr = int(np.min([len(tset) for tset in frfil.time_sets])) - frate_centers, frate_half_widths = select_tophat_frates(uvd=frfil.hd, blvecs=frfil.blvecs, - case=case, keys=keys, uvb=uvb, - frate_standoff=frate_standoff, - frate_width_multiplier=frate_width_multiplier, - min_frate_half_width=min_frate_half_width, - max_frate_half_width=max_frate_half_width, - max_frate_coeffs=max_frate_coeffs, - percentile_low=percentile_low, - percentile_high=percentile_high, - fr_freq_skip=fr_freq_skip, - verbose=verbose, nfr=nfr) - # Lists of names of datacontainers that will hold each interleaved data set until they are - # recombined. - filtered_data_names = [f'clean_data_interleave_{inum}' for inum in range(ninterleave)] - filtered_flag_names = [fstr.replace('data', 'flags') for fstr in filtered_data_names] - filtered_resid_names = [fstr.replace('data', 'resid') for fstr in filtered_data_names] - filtered_model_names = [fstr.replace('data', 'model') for fstr in filtered_data_names] - filtered_resid_flag_names = [fstr.replace('data', 'resid_flags') for fstr in filtered_data_names] - - for inum in range(ninterleave): - - # Build weights using flags, nsamples, and exlcuded lsts - flags = getattr(frfil, f'flags_interleave_{inum}') - nsamples = getattr(frfil, f'nsamples_interleave_{inum}') - wgts = io.DataContainer({k: (~flags[k]).astype(float) for k in flags}) - - lsts = frfil.lst_sets[inum] - for k in wgts: - if wgt_by_nsample: - wgts[k] *= nsamples[k] - if lst_blacklists is not None: - for lb in lst_blacklists: - if lb[0] < lb[1]: - is_blacklisted = (lsts >= lb[0] * np.pi / 12)\ - & (lsts <= lb[1] * np.pi / 12) - else: - is_blacklisted = (lsts >= lb[0] * np.pi / 12) | (lsts <= lb[1] * np.pi / 12) - wgts[k][is_blacklisted, :] = wgts[k][is_blacklisted, :] * blacklist_wgt - # run tophat filter - frfil.tophat_frfilter(frate_centers=frate_centers, frate_half_widths=frate_half_widths, - keys=keys, verbose=verbose, wgts=wgts, flags=getattr(frfil, f'flags_interleave_{inum}'), - data=getattr(frfil, f'data_interleave_{inum}'), output_postfix=f'interleave_{inum}', - times=frfil.time_sets[inum] * SDAY_SEC * 1e-3, - **filter_kwargs) - - frfil._interleave_data_in_time(filtered_data_names, 'clean_data') - frfil._interleave_data_in_time(filtered_flag_names, 'clean_flags') - frfil._interleave_data_in_time(filtered_resid_names, 'clean_resid') - frfil._interleave_data_in_time(filtered_resid_flag_names, 'clean_resid_flags') - frfil._interleave_data_in_time(filtered_model_names, 'clean_model') + frate_centers, frate_half_widths = select_tophat_frates( + uvd=frfil.hd, blvecs=frfil.blvecs, + case=case, keys=keys, uvb=uvb, + frate_standoff=frate_standoff, + frate_width_multiplier=frate_width_multiplier, + min_frate_half_width=min_frate_half_width, + max_frate_half_width=max_frate_half_width, + max_frate_coeffs=max_frate_coeffs, + percentile_low=percentile_low, + percentile_high=percentile_high, + fr_freq_skip=fr_freq_skip, + verbose=verbose, nfr=nfr, + ) + # Lists of names of datacontainers that will hold each interleaved + # data set until they are recombined. 
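            # (For example, with ninterleave=2 these are 'clean_data_interleave_0'
            # and 'clean_data_interleave_1', plus the matching flags/resid/model names.)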
+ filtered_data_names = [ + f'clean_data_interleave_{inum}' for inum in range(ninterleave) + ] + filtered_flag_names = [ + fstr.replace('data', 'flags') for fstr in filtered_data_names + ] + filtered_resid_names = [ + fstr.replace('data', 'resid') for fstr in filtered_data_names + ] + filtered_model_names = [ + fstr.replace('data', 'model') for fstr in filtered_data_names + ] + filtered_resid_flag_names = [ + fstr.replace('data', 'resid_flags') for fstr in filtered_data_names + ] - else: - frfil.clean_data = DataContainer({}) - frfil.clean_flags = DataContainer({}) - frfil.clean_resid = DataContainer({}) - frfil.clean_resid_flags = DataContainer({}) - frfil.clean_model = DataContainer({}) - # put autocorr data into filtered data containers if skip_autos = True. - # so that it can be written out into the filtered files. - if skip_autos: - for bl in frfil.data.keys(): - if bl[0] == bl[1]: - frfil.clean_data[bl] = frfil.data[bl] - frfil.clean_flags[bl] = frfil.flags[bl] - frfil.clean_resid[bl] = frfil.data[bl] - frfil.clean_model[bl] = np.zeros_like(frfil.data[bl]) - frfil.clean_resid_flags[bl] = frfil.flags[bl] - - frfil.write_filtered_data(res_outfilename=res_outfilename, CLEAN_outfilename=CLEAN_outfilename, - filled_outfilename=filled_outfilename, partial_write=Nbls_per_load < len(baseline_list), - clobber=clobber, add_to_history=add_to_history, - extra_attrs={'Nfreqs': frfil.hd.Nfreqs, 'freq_array': frfil.hd.freq_array, 'channel_width': frfil.hd.channel_width, 'flex_spw_id_array': frfil.hd.flex_spw_id_array}) - frfil.hd.data_array = None # this forces a reload in the next loop + for inum in range(ninterleave): + + # Build weights using flags, nsamples, and exlcuded lsts + flags = getattr(frfil, f'flags_interleave_{inum}') + nsamples = getattr(frfil, f'nsamples_interleave_{inum}') + wgts = io.DataContainer({k: (~flags[k]).astype(float) for k in flags}) + + lsts = frfil.lst_sets[inum] + for k in wgts: + if wgt_by_nsample: + wgts[k] *= nsamples[k] + if lst_blacklists is not None: + for lb in lst_blacklists: + if lb[0] < lb[1]: + is_blacklisted = ( + lsts >= lb[0] * np.pi / 12 + ) & (lsts <= lb[1] * np.pi / 12) + else: + is_blacklisted = ( + lsts >= lb[0] * np.pi / 12 + ) | (lsts <= lb[1] * np.pi / 12) + wgts[k][is_blacklisted, :] = ( + wgts[k][is_blacklisted, :] * blacklist_wgt + ) + # run tophat filter + frfil.tophat_frfilter( + frate_centers=frate_centers, frate_half_widths=frate_half_widths, + keys=keys, verbose=verbose, wgts=wgts, + flags=getattr(frfil, f'flags_interleave_{inum}'), + data=getattr(frfil, f'data_interleave_{inum}'), + output_postfix=f'interleave_{inum}', + times=frfil.time_sets[inum] * SDAY_SEC * 1e-3, + **filter_kwargs + ) + + frfil._interleave_data_in_time(filtered_data_names, 'clean_data') + frfil._interleave_data_in_time(filtered_flag_names, 'clean_flags') + frfil._interleave_data_in_time(filtered_resid_names, 'clean_resid') + frfil._interleave_data_in_time(filtered_resid_flag_names, 'clean_resid_flags') + frfil._interleave_data_in_time(filtered_model_names, 'clean_model') + + else: + frfil.clean_data = DataContainer({}) + frfil.clean_flags = DataContainer({}) + frfil.clean_resid = DataContainer({}) + frfil.clean_resid_flags = DataContainer({}) + frfil.clean_model = DataContainer({}) + + # put autocorr data into filtered data containers if skip_autos = True. + # so that it can be written out into the filtered files. 
+ if skip_autos: + for bl in frfil.data.keys(): + if bl[0] == bl[1]: + frfil.clean_data[bl] = frfil.data[bl] + frfil.clean_flags[bl] = frfil.flags[bl] + frfil.clean_resid[bl] = frfil.data[bl] + frfil.clean_model[bl] = np.zeros_like(frfil.data[bl]) + frfil.clean_resid_flags[bl] = frfil.flags[bl] + + frfil.write_filtered_data( + res_outfilename=res_outfilename, CLEAN_outfilename=CLEAN_outfilename, + filled_outfilename=filled_outfilename, + partial_write=Nbls_per_load < len(baseline_list), + clobber=clobber, add_to_history=add_to_history, + extra_attrs={ + 'Nfreqs': frfil.hd.Nfreqs, + 'freq_array': frfil.hd.freq_array, + 'channel_width': frfil.hd.channel_width, + 'flex_spw_id_array': frfil.hd.flex_spw_id_array + }, + ) + frfil.hd.data_array = None # this forces a reload in the next loop def time_average_argparser(): diff --git a/hera_cal/io.py b/hera_cal/io.py index 155996e12..3adf9fcd3 100644 --- a/hera_cal/io.py +++ b/hera_cal/io.py @@ -9,6 +9,7 @@ import os import copy import warnings +import inspect from functools import reduce from collections.abc import Iterable from pyuvdata import UVCal, UVData @@ -31,6 +32,7 @@ from contextlib import contextmanager from functools import lru_cache from pyuvdata.uvdata import FastUVH5Meta +import pyuvdata try: import aipy @@ -108,21 +110,31 @@ def build_calcontainers(self): total_qual: dict mapping polarization to (Nint, Nfreq) float total quality array ''' self._extract_metadata() - gains, flags, quals, total_qual = odict(), odict(), odict(), odict() + gains, flags = odict(), odict() + + if self.total_quality_array is not None: + total_qual = odict() + else: + total_qual = None + + if self.quality_array is not None: + quals = odict() + else: + quals = None # build dict of gains, flags, and quals for (ant, pol) in self.ants: i, ip = self._antnum_indices[ant], self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] gains[(ant, pol)] = np.array(self.gain_array[i, :, :, ip].T) flags[(ant, pol)] = np.array(self.flag_array[i, :, :, ip].T) - quals[(ant, pol)] = np.array(self.quality_array[i, :, :, ip].T) + if quals is not None: + quals[(ant, pol)] = np.array(self.quality_array[i, :, :, ip].T) + # build dict of total_qual if available - for pol in self.pols: - ip = self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] - if self.total_quality_array is not None: + if total_qual is not None: + for pol in self.pols: + ip = self._jnum_indices[jstr2num(pol, x_orientation=self.x_orientation)] total_qual[pol] = np.array(self.total_quality_array[:, :, ip].T) - else: - total_qual = None return gains, flags, quals, total_qual @@ -450,7 +462,7 @@ def get_blt_slices(uvo, tried_to_reorder=False): if getattr(uvo, 'blts_are_rectangular', False): if uvo.time_axis_faster_than_bls: for i in range(uvo.Nbls): - start = i*uvo.Ntimes + start = i * uvo.Ntimes antp = (uvo.ant_1_array[start], uvo.ant_2_array[start]) blt_slices[antp] = slice(start, start + uvo.Ntimes, 1) assert uvo.Nbls == len(blt_slices) @@ -469,8 +481,10 @@ def get_blt_slices(uvo, tried_to_reorder=False): uvo.reorder_blts(order='time') return get_blt_slices(uvo, tried_to_reorder=True) else: - raise NotImplementedError('UVData objects with non-regular spacing of ' - 'baselines in its baseline-times are not supported.') + raise NotImplementedError( + 'UVData objects with non-regular spacing of ' + 'baselines in its baseline-times are not supported.' 
+ ) else: blt_slices[(ant1, ant2)] = slice(indices[0], indices[-1] + 1, indices[1] - indices[0]) return blt_slices @@ -787,7 +801,7 @@ def read(self, bls=None, polarizations=None, times=None, time_range=None, lsts=N locs = locals() partials = ['bls', 'polarizations', 'times', 'time_range', 'lsts', 'lst_range', 'frequencies', 'freq_chans'] self.last_read_kwargs = {p: locs[p] for p in partials} - + # if filepaths is None, this was converted to HERAData # from a different pre-loaded object with no history of filepath if self.filepaths is not None: @@ -802,7 +816,7 @@ def read(self, bls=None, polarizations=None, times=None, time_range=None, lsts=N run_check_acceptability=run_check_acceptability, **kwargs) self.use_future_array_shapes() if self.filetype == 'uvfits': - self.unphase_to_drift() + self.unproject_phase() else: if not read_data: raise NotImplementedError('reading only metadata is not implemented for ' + self.filetype) @@ -1018,9 +1032,18 @@ def partial_write(self, output_path, data=None, flags=None, nsamples=None, # else: # make a copy of this object and then update the relevant arrays using DataContainers # this = copy.deepcopy(self) - hd_writer.write_uvh5_part(output_path, d, f, n, - run_check_acceptability=(output_path in self._writers), - **self.last_read_kwargs) + write_kwargs = { + "data_array": d, + "nsample_array": n, + "run_check_acceptability": (output_path in self._writers), + **self.last_read_kwargs, + } + # before pyuvdata 3.0, the "flag_array" parameter was called "flags_array" + if "flag_array" in inspect.signature(UVData.write_uvh5_part).parameters: + write_kwargs["flag_array"] = f + else: + write_kwargs["flags_array"] = f + hd_writer.write_uvh5_part(output_path, **write_kwargs) def iterate_over_bls(self, Nbls=1, bls=None, chunk_by_redundant_group=False, reds=None, bl_error_tol=1.0, include_autos=True, frequencies=None): @@ -1423,8 +1446,12 @@ def _adapt_metadata(self, info_dict, skip_lsts=False): info_dict['antpairs'] = sorted(info_dict['bls']) info_dict['bls'] = sorted(set([ap + (pol, ) for ap in info_dict['antpairs'] for pol in info_dict['pols']])) XYZ = XYZ_from_LatLonAlt(info_dict['latitude'] * np.pi / 180, info_dict['longitude'] * np.pi / 180, info_dict['altitude']) - enu_antpos = ENU_from_ECEF(np.array([antpos for ant, antpos in info_dict['antpos'].items()]) + XYZ, - info_dict['latitude'] * np.pi / 180, info_dict['longitude'] * np.pi / 180, info_dict['altitude']) + enu_antpos = ENU_from_ECEF( + np.array([antpos for ant, antpos in info_dict['antpos'].items()]) + XYZ, + latitude=info_dict['latitude'] * np.pi / 180, + longitude=info_dict['longitude'] * np.pi / 180, + altitude=info_dict['altitude'] + ) info_dict['antpos'] = {ant: enu for enu, ant in zip(enu_antpos, info_dict['antpos'])} info_dict['data_antpos'] = {ant: info_dict['antpos'][ant] for ant in info_dict['data_ants']} info_dict['times'] = np.unique(info_dict['times']) @@ -1588,7 +1615,7 @@ def load_flags(flagfile, filetype='h5', return_meta=False): elif filetype == 'h5': from pyuvdata import UVFlag uvf = UVFlag(flagfile) - assert uvf.mode == 'flag', 'The input h5-based UVFlag object must be in flag mode.' + assert uvf.mode == 'flag', f'The input h5-based UVFlag object must be in flag mode, got {uvf.mode}' assert (np.issubsctype(uvf.polarization_array.dtype, np.signedinteger) or np.issubsctype(uvf.polarization_array.dtype, np.str_)), \ "The input h5-based UVFlag object's polarization_array must be integers or byte strings." 
@@ -1608,6 +1635,8 @@ def load_flags(flagfile, filetype='h5', return_meta=False): # data container only supports standard polarizations strings if np.issubdtype(uvf.polarization_array.dtype, np.signedinteger): flags = DataContainer(flags) + flags.times = times + flags.freqs = freqs elif uvf.type == 'antenna': # one time x freq waterfall per antenna for i, ant in enumerate(uvf.ant_array): @@ -1720,9 +1749,11 @@ def get_file_times(filepaths, filetype='uvh5'): _f[u'Header'][u'altitude'][()])) # figure out which baseline has the most times in order to handle BDA appropriately - baseline_array = uvutils.antnums_to_baseline(np.array(_f[u'Header'][u'ant_1_array']), - np.array(_f[u'Header'][u'ant_2_array']), - np.array(_f[u'Header'][u'Nants_telescope'])) + baseline_array = uvutils.antnums_to_baseline( + np.array(_f[u'Header'][u'ant_1_array']), + np.array(_f[u'Header'][u'ant_2_array']), + Nants_telescope=np.array(_f[u'Header'][u'Nants_telescope']) + ) most_common_bl_num = scipy.stats.mode(baseline_array, keepdims=True)[0][0] time_array = time_array[baseline_array == most_common_bl_num] lst_array = lst_array[baseline_array == most_common_bl_num] @@ -1801,9 +1832,9 @@ def partial_time_io(hd, times=None, time_range=None, lsts=None, lst_range=None, return_data=False, **kwargs) except ValueError as err: # check to see if the read failed because of the time range or lst range - if 'No elements in time range between ' in str(err): + if 'No elements in time range between ' in str(err) or 'No elements in time_array between ' in str(err): continue # no matching times, skip this file - elif 'No elements in LST range between ' in str(err): + elif 'No elements in LST range between ' in str(err) or 'No elements in lst_array between ' in str(err): continue # no matchings lsts, skip this file else: raise @@ -2100,18 +2131,25 @@ def write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags antenna_names = [f"HH{a}" for a in antenna_numbers] # get antenna positions in ITRF frame - tel_lat_lon_alt = uvutils.LatLonAlt_from_XYZ(telescope_location) + lat, lon, alt = uvutils.LatLonAlt_from_XYZ(telescope_location) antenna_positions = np.array([antpos[k] for k in antenna_numbers]) - antenna_positions = uvutils.ECEF_from_ENU(antenna_positions, *tel_lat_lon_alt) - telescope_location + antenna_positions = uvutils.ECEF_from_ENU( + antenna_positions, latitude=lat, longitude=lon, altitude=alt + ) - telescope_location # get times if time_array is None: if start_jd is None: raise AttributeError("if time_array is not fed, start_jd must be fed") - time_array = LST2JD(lst_array, start_jd, allow_other_jd=True, lst_branch_cut=lst_branch_cut, - latitude=(tel_lat_lon_alt[0] * 180 / np.pi), - longitude=(tel_lat_lon_alt[1] * 180 / np.pi), - altitude=tel_lat_lon_alt[2]) + time_array = LST2JD( + lst_array, + start_jd, + allow_other_jd=True, + lst_branch_cut=lst_branch_cut, + latitude=(lat * 180 / np.pi), + longitude=(lon * 180 / np.pi), + altitude=alt + ) Ntimes = len(time_array) # get freqs @@ -2166,9 +2204,13 @@ def write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags # configure baselines antpairs = np.repeat(np.array(antpairs), Ntimes, axis=0) + antpairs_int = antpairs.astype(np.uint64) + if not np.allclose(antpairs, antpairs_int): + raise ValueError("antenna numbers must be non-negative integers") + # get ant_1_array, ant_2_array - ant_1_array = antpairs[:, 0] - ant_2_array = antpairs[:, 1] + ant_1_array = antpairs_int[:, 0] + ant_2_array = antpairs_int[:, 1] # get baseline array baseline_array 
= 2048 * (ant_1_array + 1) + (ant_2_array + 1) + 2**16 @@ -2224,9 +2266,6 @@ def write_vis(fname, data, lst_array, freq_array, antpos, time_array=None, flags return uvd - - - def update_uvdata(uvd, data=None, flags=None, nsamples=None, add_to_history='', **kwargs): '''Updates a UVData/HERAData object with data or parameters. Cannot modify the shape of data arrays. More than one spectral window is not supported. Assumes every baseline @@ -2263,7 +2302,7 @@ def _write_HERAData_to_filetype(hd, outfilename, filetype_out='miriad', clobber= if filetype_out == 'miriad': hd.write_miriad(outfilename, clobber=clobber) elif filetype_out == 'uvfits': - hd.write_uvfits(outfilename, force_phase=True, spoof_nonessential=True) + hd.write_uvfits(outfilename, force_phase=True) elif filetype_out == 'uvh5': hd.write_uvh5(outfilename, clobber=clobber) else: @@ -2439,7 +2478,6 @@ def write_cal(fname, gains, freqs, times, lsts=None, flags=None, quality=None, t # get time info time_array = np.array(times, float) Ntimes = len(time_array) - time_range = np.array([time_array.min(), time_array.max()], float) if len(time_array) > 1: integration_time = np.median(np.diff(time_array)) * 24. * 3600. else: @@ -2508,7 +2546,7 @@ def write_cal(fname, gains, freqs, times, lsts=None, flags=None, quality=None, t "ant_array", "antenna_numbers", "antenna_names", "cal_style", "history", "channel_width", "flag_array", "gain_array", "quality_array", "jones_array", "time_array", "lst_array", "spw_array", "freq_array", "history", "integration_time", - "time_range", "x_orientation", "telescope_name", "gain_convention", "total_quality_array"] + "x_orientation", "telescope_name", "gain_convention", "total_quality_array"] # create local parameter dict local_params = locals() @@ -2709,6 +2747,7 @@ def throw_away_flagged_ants_parser(): ap.add_argument("--clobber", default=False, action="store_true", help='overwrites existing file at outfile') return ap + def uvdata_from_fastuvh5( meta: FastUVH5Meta, antpairs: list[tuple[int, int]] | None = None, @@ -2716,7 +2755,8 @@ def uvdata_from_fastuvh5( lsts: np.ndarray | None = None, start_jd: float | None = None, lst_branch_cut: float = 0.0, - **kwargs) -> UVData: + **kwargs +) -> UVData: """Convert a FastUVH5Meta object to a UVData object. This is a convenience function to convert a FastUVH5Meta object to a UVData diff --git a/hera_cal/lstbin.py b/hera_cal/lstbin.py index 2bb036b10..b4e99e69c 100644 --- a/hera_cal/lstbin.py +++ b/hera_cal/lstbin.py @@ -1045,7 +1045,7 @@ def make_lst_grid( return lst_grid -def sigma_clip(array: np.ndarray, sigma: float=4.0, min_N: int=4, axis: int = 0): +def sigma_clip(array: np.ndarray | np.ma.MaskedArray, sigma: float=4.0, min_N: int=4, axis: int = 0): """ One-iteration robust sigma clipping algorithm. 
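    For instance (a hedged sketch of the masked-array path; ``vis`` and ``flags``
    are illustrative names), flagged entries can be excluded from the median/MAD
    estimates by passing a masked array directly:

        arr = np.ma.masked_array(vis.real, mask=flags)
        clip = sigma_clip(arr, sigma=4.0, min_N=4, axis=0)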
@@ -1075,11 +1075,15 @@ def sigma_clip(array: np.ndarray, sigma: float=4.0, min_N: int=4, axis: int = 0) if array.shape[axis] < min_N: return np.zeros_like(array, dtype=bool) - # get robust location - location = np.expand_dims(np.nanmedian(array, axis=axis), axis=axis) + if isinstance(array, np.ma.MaskedArray): + location = np.expand_dims(np.ma.median(array, axis=axis), axis=axis) + scale = np.expand_dims(np.ma.median(np.abs(array - location), axis=axis) * 1.482579, axis=axis) + else: + # get robust location + location = np.expand_dims(np.nanmedian(array, axis=axis), axis=axis) - # get MAD * 1.482579 for consistency with Gaussian white noise - scale = np.expand_dims(np.nanmedian(np.abs(array - location), axis=axis) * 1.482579, axis=axis) + # get MAD * 1.482579 for consistency with Gaussian white noise + scale = np.expand_dims(np.nanmedian(np.abs(array - location), axis=axis) * 1.482579, axis=axis) # get clipped data clip_flags = np.abs(array - location) / scale > sigma diff --git a/hera_cal/lstbin_simple.py b/hera_cal/lstbin_simple.py index bb02c1218..d5980704d 100644 --- a/hera_cal/lstbin_simple.py +++ b/hera_cal/lstbin_simple.py @@ -33,16 +33,20 @@ import h5py from functools import partial import yaml -from .types import Antpair, Pol, Baseline +from .types import Antpair +from .datacontainer import DataContainer try: profile except NameError: + def profile(fnc): return fnc + logger = logging.getLogger(__name__) + @profile def lst_align( data: np.ndarray, @@ -52,6 +56,7 @@ def lst_align( freq_array: np.ndarray, flags: np.ndarray | None = None, nsamples: np.ndarray | None = None, + where_inpainted: np.ndarray | None = None, rephase: bool = True, antpos: dict[int, np.ndarray] | None = None, lat: float = -30.72152, @@ -137,9 +142,7 @@ def lst_align( flags = np.zeros(data.shape, dtype=bool) if flags.shape != data.shape: - raise ValueError( - f"flags should have shape {data.shape} but got {flags.shape}" - ) + raise ValueError(f"flags should have shape {data.shape} but got {flags.shape}") if nsamples is None: nsamples = np.ones(data.shape, dtype=float) @@ -156,13 +159,11 @@ def lst_align( adjust_lst_bin_edges(lst_bin_edges) if not np.all(np.diff(lst_bin_edges) > 0): - raise ValueError( - "lst_bin_edges must be monotonically increasing." - ) + raise ValueError("lst_bin_edges must be monotonically increasing.") # Now ensure that all the observed LSTs are wrapped so they start above the first bin edges grid_indices, data_lsts, lst_mask = get_lst_bins(data_lsts, lst_bin_edges) - lst_bin_centres = (lst_bin_edges[1:] + lst_bin_edges[:-1])/2 + lst_bin_centres = (lst_bin_edges[1:] + lst_bin_edges[:-1]) / 2 logger.info(f"Data Shape: {data.shape}") @@ -206,7 +207,7 @@ def lst_align( # in scope, and is therefore removed when the function returns). # shortcut -- just return all the data, re-organized. 
- _data, _flags, _nsamples = [], [], [] + _data, _flags, _nsamples, _where_inpainted = [], [], [], [] empty_shape = (0, len(antpairs), len(freq_array), npols) for lstbin in range(len(lst_bin_centres)): mask = grid_indices == lstbin @@ -214,14 +215,21 @@ def lst_align( _data.append(data[mask]) _flags.append(flags[mask]) _nsamples.append(nsamples[mask]) + if where_inpainted is not None: + _where_inpainted.append(where_inpainted[mask]) else: _data.append(np.zeros(empty_shape, complex)) _flags.append(np.zeros(empty_shape, bool)) _nsamples.append(np.zeros(empty_shape, int)) + if where_inpainted is not None: + _where_inpainted.append(np.zeros(empty_shape, bool)) - return lst_bin_centres, _data, _flags, _nsamples + return lst_bin_centres, _data, _flags, _nsamples, _where_inpainted or None -def get_lst_bins(lsts: np.ndarray, edges: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + +def get_lst_bins( + lsts: np.ndarray, edges: np.ndarray +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Get the LST bin indices for a set of LSTs. Parameters @@ -244,20 +252,26 @@ def get_lst_bins(lsts: np.ndarray, edges: np.ndarray) -> tuple[np.ndarray, np.nd lsts = np.array(lsts).copy() # Now ensure that all the observed LSTs are wrapped so they start above the first bin edges - lsts %= 2*np.pi - lsts[lsts < edges[0]] += 2* np.pi + lsts %= 2 * np.pi + lsts[lsts < edges[0]] += 2 * np.pi bins = np.digitize(lsts, edges, right=True) - 1 - mask = (bins >= 0) & (bins < (len(edges)-1)) + mask = (bins >= 0) & (bins < (len(edges) - 1)) return bins, lsts, mask + def reduce_lst_bins( - data: list[np.ndarray], flags: list[np.ndarray], nsamples: list[np.ndarray], + data: list[np.ndarray], + flags: list[np.ndarray], + nsamples: list[np.ndarray], + where_inpainted: list[np.ndarray] | None = None, + inpainted_mode: bool = False, mutable: bool = False, sigma_clip_thresh: float | None = None, sigma_clip_min_N: int = 4, flag_below_min_N: bool = False, flag_thresh: float = 0.7, -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + get_mad: bool = False, +) -> dict[str, np.ndarray]: """ From a list of LST-binned data, produce reduced statistics. @@ -277,6 +291,13 @@ def reduce_lst_bins( nsamples A list, the same length/shape as ``data``, containing the number of samples for each measurement. + where_inpainted + A list, the same length/shape as ``data``, containing a boolean mask indicating + which samples have been inpainted. + inpainted_mode + Whether to use the inpainted samples when calculating the statistics. If False, + the inpainted samples are ignored. If True, the inpainted samples are used, and + the statistics are calculated over the un-inpainted samples. mutable Whether the input data (and flags and nsamples) can be modified in place within the algorithm. Setting to true saves memory, and is safe for a one-shot script. @@ -292,12 +313,17 @@ def reduce_lst_bins( The fraction of integrations for a particular (antpair, pol, channel) combination within an LST-bin that can be flagged before that combination is flagged in the LST-average. + get_mad + Whether to compute the median and median absolute deviation of the data in each + LST bin, in addition to the mean and standard deviation. Returns ------- - out_data, out_flags, out_std, out_nsamples - The reduced data, flags, standard deviation (across days) and nsamples. - Shape ``(nbl, nlst_bins, nfreq, npol)``. + dict + The reduced data in a dictionary. 
Keys are 'data' (the lst-binned mean), + 'nsamples', 'flags', 'days_binned' (the number of days that went into each bin), + 'std' (standard deviation) and *otionally* 'median' and 'mad' (if `get_mad` is + True). All values are arrays of the same shape: ``(nbl, nlst_bins, nfreq, npol)``. """ nlst_bins = len(data) (_, nbl, nfreq, npol) = data[0].shape @@ -312,33 +338,70 @@ def reduce_lst_bins( out_flags = np.zeros(out_data.shape, dtype=bool) out_std = np.ones(out_data.shape, dtype=complex) out_nsamples = np.zeros(out_data.shape, dtype=float) + days_binned = np.zeros(out_nsamples.shape, dtype=int) + + if get_mad: + mad = np.ones(out_data.shape, dtype=complex) + med = np.ones(out_data.shape, dtype=complex) + + if where_inpainted is None: + where_inpainted = [None] * nlst_bins - for lstbin, (d,n,f) in enumerate(zip(data, nsamples, flags)): + for lstbin, (d, n, f, inpf) in enumerate( + zip(data, nsamples, flags, where_inpainted) + ): logger.info(f"Computing LST bin {lstbin+1} / {nlst_bins}") # TODO: check that this doesn't make yet another copy... # This is just the data in this particular lst-bin. + if d.size: + d, f = get_masked_data( + d, n, f, inpainted=inpf, + inpainted_mode=inpainted_mode, flag_thresh=flag_thresh + ) + ( out_data[:, lstbin], out_flags[:, lstbin], out_std[:, lstbin], - out_nsamples[:, lstbin] + out_nsamples[:, lstbin], + days_binned[:, lstbin], ) = lst_average( - d, n, f, mutable=mutable, - flag_thresh=flag_thresh, + d, + n, + f, + inpainted_mode=inpainted_mode, sigma_clip_thresh=sigma_clip_thresh, sigma_clip_min_N=sigma_clip_min_N, flag_below_min_N=flag_below_min_N, ) + + if get_mad: + med[:, lstbin], mad[:, lstbin] = get_lst_median_and_mad(d) else: out_data[:, lstbin] *= np.nan out_flags[:, lstbin] = True out_std[:, lstbin] *= np.inf out_nsamples[:, lstbin] = 0.0 + if get_mad: + mad[:, lstbin] *= np.inf + med[:, lstbin] *= np.nan + + out = { + "data": out_data, + "flags": out_flags, + "std": out_std, + "nsamples": out_nsamples, + "days_binned": days_binned, + } + if get_mad: + out["mad"] = mad + out["median"] = med + + return out - return out_data, out_flags, out_std, out_nsamples def _allocate_dfn(shape: tuple[int], d=0.0, f=0, n=0): data = np.full(shape, d, dtype=complex) @@ -346,13 +409,106 @@ def _allocate_dfn(shape: tuple[int], d=0.0, f=0, n=0): nsamples = np.full(shape, n, dtype=float) return data, flags, nsamples + +def get_masked_data( + data: np.ndarray, + nsamples: np.ndarray, + flags: np.ndarray, + inpainted: np.ndarray | None = None, + inpainted_mode: bool = False, + flag_thresh: float = 0.7, +) -> np.ma.MaskedArray: + if not inpainted_mode: + # Act like nothing is inpainted. + inpainted = np.zeros(flags.shape, dtype=bool) + elif inpainted is None: + # Flag if a whole blt is flagged: + allf = np.all(flags, axis=2)[:, :, None, :] + + # Assume everything else that's flagged is inpainted. + inpainted = flags.copy() * (~allf) + + flags = flags | np.isnan(data) | np.isinf(data) | (nsamples == 0) + + # Threshold flags over time here, because we want the new flags to be treated on the + # same footing as the inpainted flags. + threshold_flags(flags, inplace=True, flag_thresh=flag_thresh) + + logger.info(f"In inpainted_mode: {inpainted_mode}. 
Got {np.sum(inpainted)} inpainted samples, {np.sum(flags)} total flags, {np.sum(flags & ~inpainted)} non-inpainted flags.") + data = np.ma.masked_array(data, mask=(flags & ~inpainted)) + return data, flags + + +def get_lst_median_and_mad( + data: np.ndarray | np.ma.MaskedArray, +): + """Compute the median absolute deviation of a set of data over its zeroth axis. + + Flagged data will be ignored in flagged mode, but included in inpainted mode. + Nsamples is not taken into account at all, unless Nsamples=0. + """ + fnc = np.ma.median if isinstance(data, np.ma.MaskedArray) else np.median + med = fnc(data, axis=0) + madrl = fnc(np.abs(data.real - med.real), axis=0) * 1.482579 + madim = fnc(np.abs(data.imag - med.imag), axis=0) * 1.482579 + return med, madrl + madim * 1j + + +def threshold_flags( + flags: np.ndarray, + inplace: bool = False, + flag_thresh: float = 0.7, +): + """ + Thresholds the input flags array based on the flag fraction. + + Parameters + ---------- + flags : numpy.ndarray + A numpy array of shape (Nnights, ...) representing the flags. + inplace : bool, optional + If True, modifies the input flags array in place. If False, creates a copy of + the flags array. + flag_thresh : float, optional + The threshold value for the flag fraction. + + Returns + ------- + numpy.ndarray + A numpy array of shape (N, ...) with the thresholded flags. + + Examples + -------- + >>> flags = np.array([[True, False, True], [False, True, False]]) + >>> threshold_flags(flags, inplace=True, flag_thresh=0.5) + array([[ True, False, True], + [False, True, False]]) + """ + if not inplace: + flags = flags.copy() + + # Flag entire LST bins if there are too many flags over time + flag_frac = np.sum(flags, axis=0) / flags.shape[0] + nflags = np.sum(flags) + logger.info( + f"Percent of data flagged before thresholding: {100*nflags/flags.size:.2f}%" + ) + flags |= flag_frac > flag_thresh + logger.info( + f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to flag_frac > {flag_thresh}" + ) + + return flags + + @profile def lst_average( - data: np.ndarray, nsamples: np.ndarray, flags: np.ndarray, - flag_thresh: float = 0.7, - mutable: bool = False, - sigma_clip_thresh: float | None= None, - sigma_clip_min_N: int=4, + data: np.ndarray | np.ma.MaskedArray, + nsamples: np.ndarray, + flags: np.ndarray, + inpainted_mode: bool = False, + sigma_clip_thresh: float | None = None, + sigma_clip_min_N: int = 4, flag_below_min_N: bool = False, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ @@ -395,65 +551,100 @@ def lst_average( # data has shape (nnights, nbl, npols, nfreqs) # all data is assumed to be in the same LST bin. - assert data.shape == nsamples.shape == flags.shape - - if not mutable: - flags = flags.copy() - nsamples = nsamples.copy() - data = data.copy() - - flags[np.isnan(data) | np.isinf(data) | (nsamples == 0)] = True - - # Flag entire LST bins if there are too many flags over time - flag_frac = np.sum(flags, axis=0) / flags.shape[0] - nflags = np.sum(flags) - logger.info(f"Percent of data flagged before thresholding: {100*nflags/flags.size:.2f}%") - flags |= flag_frac > flag_thresh - data[flags] *= np.nan # do this so that we can do nansum later. 
multiply to get both real/imag as nan - logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to flag_frac > {flag_thresh}") + if not isinstance(data, np.ma.MaskedArray): + # Generally, we want a MaskedArray for 'data', where the mask is *either* + # the flags or the 'non-inpainted flags', as obtained by `threshold_flags`. + # However, if this hasn't been called, and we just have an array, apply flags + # appropriately here. + data, flags = get_masked_data( + data, nsamples, flags, inpainted_mode=inpainted_mode + ) # Now do sigma-clipping. if sigma_clip_thresh is not None: + if inpainted_mode: + warnings.warn( + "Sigma-clipping in in-painted mode is a bad idea, because it creates " + "non-uniform flags over frequency, which can cause artificial spectral " + "structure. In-painted mode specifically attempts to avoid this." + ) nflags = np.sum(flags) - clip_flags = sigma_clip(data.real, sigma=sigma_clip_thresh, min_N = sigma_clip_min_N) - clip_flags |= sigma_clip(data.imag, sigma=sigma_clip_thresh, min_N = sigma_clip_min_N) + clip_flags = sigma_clip( + data.real, sigma=sigma_clip_thresh, min_N=sigma_clip_min_N + ) + clip_flags |= sigma_clip( + data.imag, sigma=sigma_clip_thresh, min_N=sigma_clip_min_N + ) # Need to restore min_N condition properly here because it's not done properly in sigma_clip sc_min_N = np.sum(~flags, axis=0) < sigma_clip_min_N clip_flags[:, sc_min_N] = False flags |= clip_flags - data[flags] *= np.nan - logger.info(f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to sigma clipping") - nsamples[flags] = 0 - norm = np.sum(nsamples, axis=0) # missing a "clip" between 1e-99 and inf here... - ndays_binned = np.sum(nsamples > 0, axis=0) + data.mask |= clip_flags + + logger.info( + f"Flagged a further {100*(np.sum(flags) - nflags)/flags.size:.2f}% of visibilities due to sigma clipping" + ) + + # Here we do a check to make sure Nsamples is uniform across frequency. + # Do this before setting non_inpainted to zero nsamples. + ndiff = np.diff(nsamples, axis=2) + if np.any(ndiff != 0): + warnings.warn( + "Nsamples is not uniform across frequency. This will result in spectral structure." + ) + + nsamples = np.ma.masked_array(nsamples, mask=data.mask) + + # Norm is the total number of samples over the nights. In the in-painted case, + # it *counts* in-painted data as samples. In the non-inpainted case, it does not. + norm = np.sum(nsamples, axis=0) + + # Ndays binned is the number of days that count towards the mean. This is the same + # in in-painted and flagged mode. + ndays_binned = np.sum((~flags).astype(int), axis=0) logger.info("Calculating mean") - meandata = np.nansum(data * nsamples, axis=0) + # np.sum works the same for both masked and non-masked arrays. + meandata = np.sum(data * nsamples, axis=0) + + lstbin_flagged = np.all(data.mask, axis=0) - f_min = np.all(flags, axis=0) if flag_below_min_N: - f_min[ndays_binned < sigma_clip_min_N] = True + lstbin_flagged[ndays_binned < sigma_clip_min_N] = True + + normalizable = norm > 0 - meandata[~f_min] /= norm[~f_min] - meandata[f_min] *= np.nan + meandata[normalizable] /= norm[normalizable] + # Multiply by nan instead of just setting as nan, so both real and imag parts are nan + meandata[~normalizable] *= np.nan + + # While the previous nsamples is different for in-painted and flagged mode, which is + # what we want for the mean, for the std and nsamples we want to treat flags as really + # flagged. 
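    # (Illustrative example: with three nights where night 2 is flagged but
    # in-painted, the mean above effectively includes all three nights in
    # inpainted mode, while the std and nsamples computed below only count
    # nights 1 and 3.)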
+ nsamples.mask = flags + norm = np.sum(nsamples, axis=0) + normalizable = norm > 0 # get other stats logger.info("Calculating std") with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="Degrees of freedom <= 0 for slice.") - std = np.square(data.real - meandata.real) + 1j*np.square(data.imag - meandata.imag) - std = np.nansum(std * nsamples, axis=0) - std[~f_min] /= norm[~f_min] - std = np.sqrt(std.real) + 1j*np.sqrt(std.imag) + std = np.square(data.real - meandata.real) + 1j * np.square( + data.imag - meandata.imag + ) + std = np.sum(std * nsamples, axis=0) + std[normalizable] /= norm[normalizable] + std = np.sqrt(std.real) + 1j * np.sqrt(std.imag) - std[f_min] = np.inf - norm[f_min] = 0 + std[~normalizable] = np.inf + + logger.info(f"Mean of meandata: {np.mean(meandata)}. Mean of std: {np.mean(std)}. Total nsamples: {np.sum(norm)}") + return meandata.data, lstbin_flagged, std.data, norm.data, ndays_binned - return meandata, f_min, std, norm def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: """ @@ -462,9 +653,10 @@ def adjust_lst_bin_edges(lst_bin_edges: np.ndarray) -> np.ndarray: Performs the adjustment in-place. """ while lst_bin_edges[0] < 0: - lst_bin_edges += 2*np.pi - while lst_bin_edges[0] >= 2*np.pi: - lst_bin_edges -= 2*np.pi + lst_bin_edges += 2 * np.pi + while lst_bin_edges[0] >= 2 * np.pi: + lst_bin_edges -= 2 * np.pi + def lst_bin_files_for_baselines( data_files: list[Path | FastUVH5Meta], @@ -483,6 +675,7 @@ def lst_bin_files_for_baselines( reds: RedundantGroups | None = None, freq_min: float | None = None, freq_max: float | None = None, + where_inpainted_files: list[list[str | Path | None]] | None = None, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list[np.ndarray]]: """Produce a set of LST-binned (but not averaged) data for a set of baselines. @@ -547,6 +740,10 @@ def lst_bin_files_for_baselines( freq_min, freq_max Minimum and maximum frequencies to include in the data. If not provided, all frequencies will be included. + where_inpainted_files + A list of lists of strings, one for each file, where each file is a UVFlag file + specifying which data are in-painted. If not provided, no inpainting will be + assumed. Returns ------- @@ -560,10 +757,17 @@ def lst_bin_files_for_baselines( Same as ``data``, but boolean flags. nsamples Same as ``data``, but sample counts. + where_inpainted + Same as ``data``, but boolean flags indicating where inpainting has been done. times_in_bins The JDs that are in each LST bin -- a list of arrays. 
""" - metas = [fl if isinstance(fl, FastUVH5Meta) else FastUVH5Meta(fl, blts_are_rectangular=True) for fl in data_files] + metas = [ + fl + if isinstance(fl, FastUVH5Meta) + else FastUVH5Meta(fl, blts_are_rectangular=True) + for fl in data_files + ] lst_bin_edges = np.array(lst_bin_edges) @@ -599,28 +803,36 @@ def lst_bin_files_for_baselines( if time_idx is None: adjust_lst_bin_edges(lst_bin_edges) - lst_bin_edges %= 2*np.pi + lst_bin_edges %= 2 * np.pi op = np.logical_and if lst_bin_edges[0] < lst_bin_edges[-1] else np.logical_or time_idx = [] for meta in metas: - _lsts = meta.get_transactional('lsts') + _lsts = meta.get_transactional("lsts") time_idx.append(op(_lsts >= lst_bin_edges[0], _lsts < lst_bin_edges[-1])) if time_arrays is None: - time_arrays = [meta.get_transactional('times')[idx] for meta, idx in zip(metas, time_idx)] + time_arrays = [ + meta.get_transactional("times")[idx] for meta, idx in zip(metas, time_idx) + ] if lsts is None: lsts = np.concatenate( - [meta.get_transactional('lsts')[idx] for meta, idx in zip(metas, time_idx)] + [meta.get_transactional("lsts")[idx] for meta, idx in zip(metas, time_idx)] ) # Now we can set up our master arrays of data. data, flags, nsamples = _allocate_dfn( (len(lsts), len(antpairs), len(freqs), len(pols)), - d=np.nan + np.nan*1j, - f=True + d=np.nan + np.nan * 1j, + f=True, ) + if where_inpainted_files is None or all(w is None for w in where_inpainted_files): + where_inpainted_files = [None] * len(metas) + where_inpainted = None + else: + where_inpainted = np.zeros_like(flags) + if cal_files is None: cal_files = [None] * len(metas) @@ -631,18 +843,29 @@ def lst_bin_files_for_baselines( # This loop actually reads the associated data in this LST bin. ntimes_so_far = 0 - for meta, calfl, tind, tarr in zip(metas, cal_files, time_idx, time_arrays): + for meta, calfl, tind, tarr, inpfile in zip( + metas, cal_files, time_idx, time_arrays, where_inpainted_files + ): logger.info(f"Reading {meta.path}") - slc = slice(ntimes_so_far,ntimes_so_far+len(tarr)) + slc = slice(ntimes_so_far, ntimes_so_far + len(tarr)) ntimes_so_far += len(tarr) - #hd = io.HERAData(str(fl.path), filetype='uvh5') - data_antpairs = meta.get_transactional('antpairs') + # hd = io.HERAData(str(fl.path), filetype='uvh5') + data_antpairs = meta.get_transactional("antpairs") if redundantly_averaged: - bls_to_load = [bl for bl in data_antpairs if reds.get_ubl_key(bl) in antpairs or reds.get_ubl_key(bl[::-1]) in antpairs] + bls_to_load = [ + bl + for bl in data_antpairs + if reds.get_ubl_key(bl) in antpairs + or reds.get_ubl_key(bl[::-1]) in antpairs + ] else: - bls_to_load = [bl for bl in antpairs if bl in data_antpairs or bl[::-1] in data_antpairs] + bls_to_load = [ + bl + for bl in antpairs + if bl in data_antpairs or bl[::-1] in data_antpairs + ] if not bls_to_load or not np.any(tind): # If none of the requested baselines are in this file, then just @@ -655,8 +878,25 @@ def lst_bin_files_for_baselines( # TODO: use Fast readers here instead. _data, _flags, _nsamples = io.HERAData(meta.path).read( - bls=bls_to_load, times=tarr, freq_chans=freq_chans, polarizations=pols, + bls=bls_to_load, + times=tarr, + freq_chans=freq_chans, + polarizations=pols, ) + if inpfile is not None: + # This returns a DataContainer (unless something went wrong) since it should + # always be a 'baseline' type of UVFlag. 
+ inpainted = io.load_flags(inpfile) + if not isinstance(inpainted, DataContainer): + raise ValueError(f"Expected {inpfile} to be a DataContainer") + + # We need to down-selecton times/freqs (bls and pols will be sub-selected + # based on those in the data through the next loop) + inpainted.select_or_expand_times(new_times=tarr, skip_bda_check=True) + inpainted.select_freqs(channels=freq_chans) + else: + inpainted = None + if redundantly_averaged: keyed = reds.keyed_on_bls(_data.antpairs()) @@ -672,13 +912,17 @@ def lst_bin_files_for_baselines( gains, cal_flags, _, _ = uvc.build_calcontainers() apply_cal.calibrate_in_place( - _data, gains, data_flags=_flags, cal_flags=cal_flags, - gain_convention=uvc.gain_convention + _data, + gains, + data_flags=_flags, + cal_flags=cal_flags, + gain_convention=uvc.gain_convention, ) for i, bl in enumerate(antpairs): if redundantly_averaged: bl = keyed.get_ubl_key(bl) + for j, pol in enumerate(pols): blpol = bl + (pol,) @@ -686,6 +930,22 @@ def lst_bin_files_for_baselines( data[slc, i, :, j] = _data[blpol] flags[slc, i, :, j] = _flags[blpol] nsamples[slc, i, :, j] = _nsamples[blpol] + + if inpainted is not None: + # Get the representative baseline key from this bl group that + # exists in the where_inpainted data. + if redundantly_averaged: + for inpbl in reds[bl]: + if inpbl + (pol,) in inpainted: + blpol = inpbl + (pol,) + break + else: + raise ValueError( + f"Could not find any baseline from group {bl} in " + "inpainted file" + ) + + where_inpainted[slc, i, :, j] = inpainted[blpol] else: # This baseline+pol doesn't exist in this file. That's # OK, we don't assume all baselines are in every file. @@ -697,16 +957,16 @@ def lst_bin_files_for_baselines( # LST bin edges are the actual edges of the bins, so should have length # +1 of the LST centres. We use +dlst instead of +dlst/2 on the top edge # so that np.arange definitely gets the last edge. - # lst_edges = np.arange(outfile_lsts[0] - dlst/2, outfile_lsts[-1] + dlst, dlst) - bin_lst, data, flags, nsamples = lst_align( + bin_lst, data, flags, nsamples, where_inpainted = lst_align( data=data, flags=None if ignore_flags else flags, nsamples=nsamples, data_lsts=lsts, + where_inpainted=where_inpainted, antpairs=antpairs, lst_bin_edges=lst_bin_edges, - freq_array = freqs, - rephase = rephase, + freq_array=freqs, + rephase=rephase, antpos=antpos, ) @@ -717,12 +977,13 @@ def lst_bin_files_for_baselines( mask = bins == i times_in_bins.append(times[mask]) - return bin_lst, data, flags, nsamples, times_in_bins + return bin_lst, data, flags, nsamples, where_inpainted, times_in_bins + def apply_calfile_rules( data_files: list[list[str]], calfile_rules: list[tuple[str, str]], - ignore_missing: bool + ignore_missing: bool, ) -> tuple[list[list[str]], list[list[str]]]: """ Apply a set of rules to convert data file names to calibration file names. 
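# --------------------------------------------------------------------------
# Illustrative aside (not part of the diff): how a list of (search, replace)
# rules -- e.g. `calfile_rules` here, or `where_inpainted_file_rules` later in
# this changeset -- is typically turned into a companion file name via plain
# string replacement. The helper name and example file names below are
# hypothetical, shown only to make the rule format concrete.
def _apply_name_rules(path: str, rules: list[tuple[str, str]]) -> str:
    # Each (old, new) rule is applied in order as a simple string replacement.
    for old, new in rules:
        path = path.replace(old, new)
    return path

# _apply_name_rules("zen.2459122.30001.sum.uvh5",
#                   [(".sum.uvh5", ".sum.omni.calfits")])
# -> "zen.2459122.30001.sum.omni.calfits"
# --------------------------------------------------------------------------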
@@ -768,36 +1029,41 @@ def apply_calfile_rules( data_files[night] = [df for df in dflist if df not in missing] return data_files, input_cals + def filter_required_files_by_times( lst_range: tuple[float, float], data_metas: list[list[FastUVH5Meta]], cal_files: list[list[str]] | None = None, -) -> tuple[list, list, list, list, list]: - + where_inpainted_files: list[list[str]] | None = None, +) -> tuple[list, list, list, list, list, list]: lstmin, lstmax = lst_range - lstmin %= (2 * np.pi) - lstmax %= (2 * np.pi) + lstmin %= 2 * np.pi + lstmax %= 2 * np.pi if lstmin > lstmax: lstmax += 2 * np.pi if not cal_files: cal_files = [[None for _ in dm] for dm in data_metas] + if not where_inpainted_files: + where_inpainted_files = [[None for _ in dm] for dm in data_metas] + tinds = [] all_lsts = [] file_list = [] time_arrays = [] cals = [] + where_inpainted = [] # This loop just gets the number of times that we'll be reading. # Even though the input files should already have been configured to be those # that fall within the output LST range, we still need to read them in to # check exactly which time integrations we require. - for night, callist in zip(data_metas, cal_files): - for meta, cal in zip(night, callist): + for night, callist, inplist in zip(data_metas, cal_files, where_inpainted_files): + for meta, cal, inp in zip(night, callist, inplist): tarr = meta.times - lsts = meta.lsts % (2*np.pi) + lsts = meta.lsts % (2 * np.pi) lsts[lsts < lstmin] += 2 * np.pi tind = (lsts > lstmin) & (lsts < lstmax) @@ -808,7 +1074,45 @@ def filter_required_files_by_times( all_lsts.append(lsts[tind]) file_list.append(meta) cals.append(cal) - return tinds, time_arrays, all_lsts, file_list, cals + where_inpainted.append(inp) + + return tinds, time_arrays, all_lsts, file_list, cals, where_inpainted + + +def _get_where_inpainted_files( + data_files: list[list[str | Path]], + where_inpainted_file_rules: list[tuple[str, str]] | None, +) -> list[list[str | Path]] | None: + if where_inpainted_file_rules is None: + return None + + where_inpainted_files = [] + for dflist in data_files: + this = [] + where_inpainted_files.append(this) + for df in dflist: + wif = str(df) + for rule in where_inpainted_file_rules: + wif = wif.replace(rule[0], rule[1]) + if os.path.exists(wif): + this.append(wif) + else: + raise IOError(f"Where inpainted file {wif} does not exist") + + return where_inpainted_files + + +def _configure_inpainted_mode(output_flagged, output_inpainted, where_inpainted_files): + # Sort out defaults for inpaint/flagging mode + if output_inpainted is None: + output_inpainted = bool(where_inpainted_files) + + if not output_inpainted and not output_flagged: + raise ValueError( + "Both output_inpainted and output_flagged are False. One must be True." 
+ ) + + return output_flagged, output_inpainted def lst_bin_files_single_outfile( @@ -816,30 +1120,35 @@ def lst_bin_files_single_outfile( metadata: dict[str, Any], lst_bins: np.ndarray, data_files: list[list[str]], - calfile_rules: list[tuple[str, str]] | None= None, + calfile_rules: list[tuple[str, str]] | None = None, ignore_missing_calfiles: bool = False, outdir: str | Path | None = None, reds: RedundantGroups | None = None, redundantly_averaged: bool | None = None, only_last_file_per_night: bool = False, - history: str = '', - fname_format: str="zen.{kind}.{lst:7.5f}.uvh5", - overwrite: bool=False, - rephase: bool=False, - Nbls_to_load: int | None=None, - ignore_flags: bool=False, - include_autos: bool=True, + history: str = "", + fname_format: str = "zen.{kind}.{lst:7.5f}.uvh5", + overwrite: bool = False, + rephase: bool = False, + Nbls_to_load: int | None = None, + ignore_flags: bool = False, + include_autos: bool = True, ex_ant_yaml_files=None, - ignore_ants: tuple[int]=(), + ignore_ants: tuple[int] = (), write_kwargs: dict | None = None, save_channels: list[int] = (), golden_lsts: tuple[float] = (), - sigma_clip_thresh: float | None= None, + sigma_clip_thresh: float | None = None, sigma_clip_min_N: int = 4, flag_below_min_N: bool = False, flag_thresh: float = 0.7, freq_min: float | None = None, freq_max: float | None = None, + output_inpainted: bool | None = None, + output_flagged: bool = True, + where_inpainted_file_rules: list[tuple[str, str]] | None = None, + sigma_clip_in_inpainted_mode: bool = False, + write_med_mad: bool = False, ) -> dict[str, Path]: """ Bin data files into LST bins, and write all bins to disk in a single file. @@ -847,6 +1156,21 @@ def lst_bin_files_single_outfile( Note that this is generally not meant to be called directly, but rather through the :func:`lst_bin_files` function. + The mode(s) in which the function does the averaging can be specified via the + `output_inpainted`, `output_flagged` and `where_inpainted_file_rules` options. + Algorithmically, there are two modes of averaging: either flagged data is *ignored* + in the average (and in Nsamples) or the flagged data is included in the average but + ignored in Nsamples. These are called "flagged" and "inpainted" modes respectively. + The latter only makes sense if the data in the flagged regions has been inpainted, + rather than left as raw data. For delay-filtered data, either mode is equivalent, + since the flagged data itself is set to zero. By default, this function *only* + uses the "flagged" mode, *unless* the `where_inpainted_file_rules` option is set, + which indicates that the files are definitely in-painted, and in this case it will + use *both* modes by default. + + .. note:: Both "flagged" and "inpainted" mode as implemented in this function are + *not* spectrally smooth if the input Nsamples are not spectrally uniform. + Parameters ---------- config_opts @@ -947,12 +1271,86 @@ def lst_bin_files_single_outfile( freq_max The maximum frequency to include in the output files. If not provided, this is set to the maximum frequency in the first data file on the first night. + output_inpainted + If True, output data LST-binned in in-painted mode. This mode does *not* flag + data for the averaging, assuming that data that has flags has been in-painted + to improve spectral smoothness. It does take the flags into account for the + LST-binned Nsamples, however. + output_flagged + If True, output data LST-binned in flagged mode. This mode *does* apply flags + to the data before averaging. 
It will yield the same Nsamples as the in-painted + mode, but simply ignores flagged data for the average, which can yield less + spectrally-smooth LST-binned results. + where_inpainted_file_rules + Rules to transform the input data file names into the corresponding "where + inpainted" files (which should be in UVFlag format). If provided, this indicates + that the data itself is in-painted, and the `output_inpainted` mode will be + switched on by default. These files should specify which data is in-painted + in the associated data file (which may be different than the in-situ flags + of the data object). If not provided, but `output_inpainted` is set to True, + all data-flags will be considered in-painted except for baseline-times that are + fully flagged, which will be completely ignored. + sigma_clip_in_inpainted_mode + If True, sigma clip the data in inpainted mode (if sigma-clipping is turned on). + This is generally not a good idea, since the point of inpainting is to get + smoother spectra, and sigma-clipping creates non-uniform Nsamples, which can + lead to less smooth spectra. This option is only here to enable sigma-clipping + to be turned on for flagged mode, and off for inpainted mode. + write_med_mad + If True, write out the median and MAD of the data in each LST bin. Returns ------- out_files A dict of output files, keyed by the type of file (e.g. 'LST', 'STD', 'GOLDEN', 'REDUCEDCHAN'). + + Notes + ----- + It is worth describing in a bit more detail what is actually _in_ the output files. + The "LST" files contain the LST-binned data, with the data averaged over each LST + bin. There are two possible modes for this: inpainted and flagged. In inpainted mode, + the data in flagged bl-channel-pols is used for the averaging, as it is considered + to be in-painted. This gives the most spectrally-smooth results. In order to ignore + a particular bl-channel-pol while still using inpaint mode, supply a "where inpainted" + file, which should be a UVFlag object that specifies which bl-channel-pols are + inpainted in the associated data file. Anything that's flagged but not inpainted is + ignored for the averaging. In this inpainted mode, the Nsamples are the number of + un-flagged integrations (whether they were in-painted or not). The LST-binned flags + are only set to True if ALL of the nights for a given bl-channel-pol are flagged + (again, whether they were in-painted or not). In flagged mode, both the Nsamples + and Flags are the same as inpainted mode. The averaged data, however, ignores any + flagged data. This can lead to less spectrally-smooth results. The "STD" files + contain LST-binned "data" that is the standard deviation of the data in each LST-bin. + This differs between inpainted and flagged modes in the same way as the "LST" files: + in inpainted mode, the flagged and inpainted data is used for calculating the sample + variance, while in flagged mode, only the unflagged data is used. The final flags + in the "STD" files is equivalent to that in the "LST" files. The Nsamples in the + "STD" files is actually the number of unflagged nights in the LST bin (so, not the + sum of Nsamples), where "unflagged" really does mean unflagged -- whether inpainted + or not. + + One note here about what is considered "flagged" vs. "flagged and inpainted" vs + "flagged and not inpainted". In flagged mode, there are input flags that exist in the + input files. 
These are potentially *augmented* by sigma clipping within the LST + binner, and also by flagging whole LST bins if they have too few unflagged integrations. + In inpainted mode, input flags are considered as normal flags. However, only + "non-inpainted" flags are *ignored* for the averaging. By default, all flagged data + is considered to to be in-painted UNLESS it is a blt-pol that is fully flagged (i.e. + all channels are flagged for an integration for a single bl and pol). However, you + can tell the routine that other data is NOT in-painted by supplying a "where inpainted" + file. Now, integrations in LST-bins that end up having "too few" unflagged + integrations will be flagged inside the binner, however in inpainted mode, if these + are considered "inpainted", they will still be used in averaging (this means they + will have "valid" data for the average, but their average will be flagged). + On the other hand, flags that are applied by sigma-clipping will be considered + NOT inpainted, i.e. those data will be ignored in the averaged, just like flagging + mode. In this case, either choice is bad: to include them in the average is bad + because even though they may have been actually in-painted, whatever value they have + is clearly triggering the sigma-clipper and is therefore an outlier. On the other + hand, to ignore them is bad because the point of in-painting mode is to get + smoother spectra, and this negates that. So, it's best just to not do sigma-clipping + in inpainted mode. """ write_kwargs = write_kwargs or {} @@ -964,21 +1362,33 @@ def lst_bin_files_single_outfile( data_files, calfile_rules, ignore_missing=ignore_missing_calfiles ) + where_inpainted_files = _get_where_inpainted_files( + data_files, where_inpainted_file_rules + ) + + output_flagged, output_inpainted = _configure_inpainted_mode( + output_flagged, output_inpainted, where_inpainted_files + ) + # Prune empty nights (some nights start with files, but have files removed because # they have no associated calibration) data_files = [df for df in data_files if df] input_cals = [cf for cf in input_cals if cf] + if where_inpainted_files is not None: + where_inpainted_files = [wif for wif in where_inpainted_files if wif] logger.info("Got the following numbers of data files per night:") for dflist in data_files: logger.info(f"{dflist[0].split('/')[-1]}: {len(dflist)}") - data_metas = [[ - FastUVH5Meta( - df, - blts_are_rectangular=metadata['blts_are_rectangular'], - time_axis_faster_than_bls=metadata['time_axis_faster_than_bls'] - ) for df in dflist + data_metas = [ + [ + FastUVH5Meta( + df, + blts_are_rectangular=metadata["blts_are_rectangular"], + time_axis_faster_than_bls=metadata["time_axis_faster_than_bls"], + ) + for df in dflist ] for dflist in data_files ] @@ -987,9 +1397,7 @@ def lst_bin_files_single_outfile( if outdir is None: outdir = os.path.dirname(os.path.commonprefix(abscal.flatten(data_files))) - x_orientation = metadata['x_orientation'] - start_jd = metadata['start_jd'] - integration_time = metadata['integration_time'] + start_jd = metadata["start_jd"] # get metadata logger.info("Getting metadata from first file...") @@ -1001,18 +1409,15 @@ def lst_bin_files_single_outfile( # all the antenna positions are included in every file. 
antpos = dict(zip(meta.antenna_numbers, meta.antpos_enu)) if reds is None: - reds = RedundantGroups.from_antpos( - antpos=antpos, - include_autos=include_autos - ) + reds = RedundantGroups.from_antpos(antpos=antpos, include_autos=include_autos) if redundantly_averaged is None: # Try to work out if the files are redundantly averaged. # just look at the middle file from each night. for fl_list in data_metas: - meta = fl_list[len(fl_list)//2] + meta = fl_list[len(fl_list) // 2] antpairs = meta.get_transactional("antpairs") - ubls = set(reds.get_ubl_key(ap) for ap in antpairs) + ubls = {reds.get_ubl_key(ap) for ap in antpairs} if len(ubls) != len(antpairs): # At least two of the antpairs are in the same redundant group. redundantly_averaged = False @@ -1032,11 +1437,11 @@ def lst_bin_files_single_outfile( reds=reds, only_last_file_per_night=only_last_file_per_night, ) - nants0 = meta.header['antenna_numbers'].size + nants0 = meta.header["antenna_numbers"].size # Do a quick check to make sure all nights at least have the same number of Nants for dflist in data_metas: - _nants = dflist[0].header['antenna_numbers'].size + _nants = dflist[0].header["antenna_numbers"].size dflist[0].close() if _nants != nants0: raise ValueError( @@ -1049,26 +1454,37 @@ def lst_bin_files_single_outfile( if Nbls_to_load is None: Nbls_to_load = len(all_baselines) + 1 n_bl_chunks = len(all_baselines) // Nbls_to_load + 1 - bl_chunks = [all_baselines[i * Nbls_to_load:(i + 1) * Nbls_to_load] for i in range(n_bl_chunks)] + bl_chunks = [ + all_baselines[i * Nbls_to_load: (i + 1) * Nbls_to_load] + for i in range(n_bl_chunks) + ] bl_chunks = [blg for blg in bl_chunks if len(blg) > 0] - dlst = config_opts['dlst'] + dlst = config_opts["dlst"] lst_bin_edges = np.array( - [x - dlst/2 for x in lst_bins] + [lst_bins[-1] + dlst/2] + [x - dlst / 2 for x in lst_bins] + [lst_bins[-1] + dlst / 2] ) - tinds, time_arrays, all_lsts, file_list, cals = filter_required_files_by_times( + ( + tinds, + time_arrays, + all_lsts, + file_list, + cals, + where_inpainted_files, + ) = filter_required_files_by_times( (lst_bin_edges[0], lst_bin_edges[-1]), data_metas, input_cals, + where_inpainted_files, ) + # If we have no times at all for this file, just return if len(all_lsts) == 0: return {} all_lsts = np.concatenate(all_lsts) - # The "golden" data is the full data over all days for a small subset of LST # bins. This works best if the LST bins are small (similar to the size of the # raw integrations). Usually, the length of "bins" will be zero. 
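# --------------------------------------------------------------------------
# Illustrative aside (not part of the diff): a minimal sketch of the
# "flagged" vs "in-painted" night-averaging described in the
# lst_bin_files_single_outfile docstring above, written with plain numpy
# masked arrays. Names and shapes are hypothetical; the real logic lives in
# the reduce_lst_bins machinery.
import numpy as np

def _night_average(data, flags, nsamples, inpainted_mode):
    """Average (nights, freqs) arrays for one bl-pol in a single LST bin."""
    # In in-painted mode, flagged-but-inpainted samples still contribute to
    # the mean; in flagged mode they are masked out of the mean entirely.
    mean_mask = np.zeros_like(flags) if inpainted_mode else flags
    d = np.ma.masked_array(data, mask=mean_mask)
    n = np.ma.masked_array(nsamples, mask=mean_mask)
    norm = n.sum(axis=0)
    with np.errstate(divide="ignore", invalid="ignore"):
        mean = (d * n).sum(axis=0) / norm
    # The output Nsamples never counts flagged integrations, in either mode.
    nsamples_out = np.ma.masked_array(nsamples, mask=flags).sum(axis=0)
    return np.ma.getdata(mean), np.ma.getdata(nsamples_out)
# --------------------------------------------------------------------------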
@@ -1085,7 +1501,7 @@ def lst_bin_files_single_outfile( # make it a bit easier to create the outfiles create_outfile = partial( create_lstbin_output_file, - outdir = outdir, + outdir=outdir, pols=all_pols, file_list=file_list, history=history, @@ -1095,24 +1511,42 @@ def lst_bin_files_single_outfile( start_jd=start_jd, freq_min=freq_min, freq_max=freq_max, + lst_branch_cut=metadata["lst_branch_cut"], **write_kwargs, ) out_files = {} - for kind in ['LST', 'STD']: - # Create the files we'll write to - out_files[kind] = create_outfile( - kind = kind, - lst=lst_bin_edges[0], - lsts=lst_bins, - ) + for inpaint_mode in [True, False]: + if inpaint_mode and not output_inpainted: + continue + if not inpaint_mode and not output_flagged: + continue + + kinds = ["LST", "STD"] + if write_med_mad: + kinds += ["MED", "MAD"] + for kind in kinds: + # Create the files we'll write to + out_files[(kind, inpaint_mode)] = create_outfile( + kind=kind, + lst=lst_bin_edges[0], + lsts=lst_bins, + inpaint_mode=inpaint_mode, + ) nbls_so_far = 0 for bi, bl_chunk in enumerate(bl_chunks): logger.info(f"Baseline Chunk {bi+1} / {len(bl_chunks)}") # data/flags/nsamples are *lists*, with nlst_bins entries, each being an # array, with shape (times, bls, freqs, npols) - bin_lst, data, flags, nsamples, binned_times = lst_bin_files_for_baselines( - data_files = file_list, + ( + bin_lst, + data, + flags, + nsamples, + where_inpainted, + binned_times, + ) = lst_bin_files_for_baselines( + data_files=file_list, lst_bin_edges=lst_bin_edges, antpairs=bl_chunk, freqs=freq_array, @@ -1128,34 +1562,38 @@ def lst_bin_files_single_outfile( reds=reds, freq_min=freq_min, freq_max=freq_max, + where_inpainted_files=where_inpainted_files, ) slc = slice(nbls_so_far, nbls_so_far + len(bl_chunk)) if bi == 0: # On the first baseline chunk, create the output file - out_files['GOLDEN'] = [] + # TODO: we're not writing out the where_inpainted data for the GOLDEN + # or REDUCEDCHAN files yet -- it looks like we'd have to write out + # completely new UVFlag files for this. 
+ out_files["GOLDEN"] = [] for nbin in golden_bins: out_files["GOLDEN"].append( create_outfile( - kind = 'GOLDEN', + kind="GOLDEN", lst=lst_bin_edges[nbin], times=binned_times[nbin], ) ) if save_channels and len(binned_times[0]) > 0: - out_files['REDUCEDCHAN'] = create_outfile( - kind = 'REDUCEDCHAN', + out_files["REDUCEDCHAN"] = create_outfile( + kind="REDUCEDCHAN", lst=lst_bin_edges[0], times=binned_times[0], channels=list(save_channels), ) - if len(golden_bins)>0: - for fl, nbin in zip(out_files['GOLDEN'], golden_bins): + if len(golden_bins) > 0: + for fl, nbin in zip(out_files["GOLDEN"], golden_bins): write_baseline_slc_to_file( - fl = fl, + fl=fl, slc=slc, data=data[nbin].transpose((1, 0, 2, 3)), flags=flags[nbin].transpose((1, 0, 2, 3)), @@ -1164,35 +1602,67 @@ def lst_bin_files_single_outfile( if "REDUCEDCHAN" in out_files: write_baseline_slc_to_file( - fl = out_files['REDUCEDCHAN'], + fl=out_files["REDUCEDCHAN"], slc=slc, data=data[0][:, :, save_channels].transpose((1, 0, 2, 3)), flags=flags[0][:, :, save_channels].transpose((1, 0, 2, 3)), nsamples=nsamples[0][:, :, save_channels].transpose((1, 0, 2, 3)), ) - data, flags, std, nsamples = reduce_lst_bins( - data, flags, nsamples, - flag_thresh=flag_thresh, - sigma_clip_thresh = sigma_clip_thresh, - sigma_clip_min_N = sigma_clip_min_N, - flag_below_min_N=flag_below_min_N, - ) + for inpainted in [True, False]: + if inpainted and not output_inpainted: + continue + if not inpainted and not output_flagged: + continue + + rdc = reduce_lst_bins( + data, + flags, + nsamples, + where_inpainted=where_inpainted, + inpainted_mode=inpainted, + flag_thresh=flag_thresh, + sigma_clip_thresh=( + None + if inpainted and not sigma_clip_in_inpainted_mode + else sigma_clip_thresh + ), + sigma_clip_min_N=sigma_clip_min_N, + flag_below_min_N=flag_below_min_N, + get_mad=write_med_mad, + ) - write_baseline_slc_to_file( - fl=out_files['LST'], - slc=slc, - data=data, - flags=flags, - nsamples=nsamples, - ) - write_baseline_slc_to_file( - fl=out_files['STD'], - slc=slc, - data=std, - flags=flags, - nsamples=nsamples, - ) + write_baseline_slc_to_file( + fl=out_files[("LST", inpainted)], + slc=slc, + data=rdc["data"], + flags=rdc["flags"], + nsamples=rdc["nsamples"], + ) + + write_baseline_slc_to_file( + fl=out_files[("STD", inpainted)], + slc=slc, + data=rdc["std"], + flags=rdc["flags"], + nsamples=rdc["days_binned"], + ) + + if write_med_mad: + write_baseline_slc_to_file( + fl=out_files[("MED", inpainted)], + slc=slc, + data=rdc["median"], + flags=rdc["flags"], + nsamples=rdc["nsamples"], + ) + write_baseline_slc_to_file( + fl=out_files[("MAD", inpainted)], + slc=slc, + data=rdc["mad"], + flags=rdc["flags"], + nsamples=rdc["days_binned"], + ) nbls_so_far += len(bl_chunk) @@ -1202,9 +1672,9 @@ def lst_bin_files_single_outfile( @profile def lst_bin_files( config_file: str | Path, - output_file_select: int | Sequence[int] | None=None, - include_autos: bool=True, - **kwargs + output_file_select: int | Sequence[int] | None = None, + include_autos: bool = True, + **kwargs, ) -> list[dict[str, Path]]: """ LST bin a series of UVH5 files. 
@@ -1243,10 +1713,10 @@ def lst_bin_files( with open(config_file, "r") as fl: configuration = yaml.safe_load(fl) - config_opts = configuration['config_params'] - lst_grid = configuration['lst_grid'] - matched_files = configuration['matched_files'] - metadata = configuration['metadata'] + config_opts = configuration["config_params"] + lst_grid = configuration["lst_grid"] + matched_files = configuration["matched_files"] + metadata = configuration["metadata"] if output_file_select is None: output_file_select = list(range(len(matched_files))) @@ -1261,33 +1731,31 @@ def lst_bin_files( meta = FastUVH5Meta( matched_files[0][0][0], - blts_are_rectangular=metadata['blts_are_rectangular'], - time_axis_faster_than_bls=metadata['time_axis_faster_than_bls'], + blts_are_rectangular=metadata["blts_are_rectangular"], + time_axis_faster_than_bls=metadata["time_axis_faster_than_bls"], ) antpos = dict(zip(meta.antenna_numbers, meta.antpos_enu)) - reds = RedundantGroups.from_antpos( - antpos=antpos, - include_autos=include_autos - ) + reds = RedundantGroups.from_antpos(antpos=antpos, include_autos=include_autos) output_files = [] for outfile_index in output_file_select: data_files = matched_files[outfile_index] output_files.append( lst_bin_files_single_outfile( - config_opts = config_opts, - metadata = metadata, - lst_bins = lst_grid[outfile_index], - data_files = data_files, - reds = reds, - include_autos = include_autos, - **kwargs + config_opts=config_opts, + metadata=metadata, + lst_bins=lst_grid[outfile_index], + data_files=data_files, + reds=reds, + include_autos=include_autos, + **kwargs, ) ) return output_files + @profile def get_all_unflagged_baselines( data_files: list[list[str | Path | FastUVH5Meta]], @@ -1324,33 +1792,37 @@ def get_all_unflagged_baselines( data_files = [ [ - fl if isinstance(fl, FastUVH5Meta) else - FastUVH5Meta( - fl, blts_are_rectangular=blts_are_rectangular, - time_axis_faster_than_bls=time_axis_faster_than_bls - ) for fl in fl_list - ] for fl_list in data_files + fl + if isinstance(fl, FastUVH5Meta) + else FastUVH5Meta( + fl, + blts_are_rectangular=blts_are_rectangular, + time_axis_faster_than_bls=time_axis_faster_than_bls, + ) + for fl in fl_list + ] + for fl_list in data_files ] - all_baselines = set() all_pols = set() meta0 = data_files[0][0] - x_orientation = meta0.get_transactional('x_orientation') + x_orientation = meta0.get_transactional("x_orientation") # reds will contain all of the redundant groups for the whole array, because # all the antenna positions are included in every file. if reds is None: reds = RedundantGroups.from_antpos( - antpos=dict(zip(meta0.antenna_numbers, meta0.antpos_enu)), include_autos=True + antpos=dict(zip(meta0.antenna_numbers, meta0.antpos_enu)), + include_autos=True, ) if redundantly_averaged is None: # Try to work out if the files are redundantly averaged. # just look at the middle file from each night. for fl_list in data_files: - meta = fl_list[len(fl_list)//2] + meta = fl_list[len(fl_list) // 2] antpairs = meta.get_transactional("antpairs") ubls = set(reds.get_ubl_key(ap) for ap in antpairs) if len(ubls) != len(antpairs): @@ -1404,19 +1876,19 @@ def get_all_unflagged_baselines( a1, a2 = reds.get_ubl_key((a1, a2)) if ( - (a1, a2) not in all_baselines and # Do this first because after the - (a2, a1) not in all_baselines and # first file it often triggers. 
- a1 not in ignore_ants and - a2 not in ignore_ants and - (include_autos or a1 != a2) and - a1 not in a_priori_antenna_flags and - a2 not in a_priori_antenna_flags + (a1, a2) not in all_baselines + and (a2, a1) not in all_baselines # Do this first because after the + and a1 not in ignore_ants # first file it often triggers. + and a2 not in ignore_ants + and (include_autos or a1 != a2) + and a1 not in a_priori_antenna_flags + and a2 not in a_priori_antenna_flags ): all_baselines.add((a1, a2)) - return sorted(all_baselines), sorted(all_pols) + def create_lstbin_output_file( outdir: Path, kind: str, @@ -1426,23 +1898,40 @@ def create_lstbin_output_file( start_jd: float, times: np.ndarray | None = None, lsts: np.ndarray | None = None, - history: str = "", - fname_format: str="zen.{kind}.{lst:7.5f}.uvh5", + history: str = "", + fname_format: str = "zen.{kind}.{lst:7.5f}.{inpaint_mode}.uvh5", overwrite: bool = False, antpairs: list[tuple[int, int]] | None = None, freq_min: float | None = None, freq_max: float | None = None, channels: np.ndarray | list[int] | None = None, vis_units: str = "Jy", + inpaint_mode: bool | None = None, + lst_branch_cut: float = 0.0, ) -> Path: outdir = Path(outdir) - # pols = set(pols) + # update history file_list_str = "-".join(ff.path.name for ff in file_list) file_history = f"{history} Input files: {file_list_str}" _history = file_history + utils.history_string() - fname = outdir / fname_format.format(kind=kind, lst=lst, pol=''.join(pols)) + if lst < lst_branch_cut: + lst += 2 * np.pi + + fname = fname_format.format( + kind=kind, + lst=lst, + pol="".join(pols), + inpaint_mode="inpaint" + if inpaint_mode + else ("flagged" if inpaint_mode is False else ""), + ) + # There's a weird gotcha with pathlib where if you do path / "/file.name" + # You get just "/file.name" which is in root. + if fname.startswith('/'): + fname = fname[1:] + fname = outdir / fname logger.info(f"Initializing {fname}") @@ -1466,42 +1955,45 @@ def create_lstbin_output_file( times=times, history=_history, start_jd=start_jd, - time_axis_faster_than_bls = True, + time_axis_faster_than_bls=True, vis_units=vis_units, + lst_branch_cut=lst_branch_cut, ) uvd_template.select(frequencies=freqs, polarizations=pols, inplace=True) # Need to set the polarization array manually because even though the select # operation does the down-select, it doesn't re-order the pols. 
- uvd_template.polarization_array = np.array(uvutils.polstr2num(pols, x_orientation=uvd_template.x_orientation)) + uvd_template.polarization_array = np.array( + uvutils.polstr2num(pols, x_orientation=uvd_template.x_orientation) + ) uvd_template.initialize_uvh5_file(str(fname.absolute()), clobber=overwrite) return fname + def write_baseline_slc_to_file( - fl: Path, - slc: slice, - data: np.ndarray, - flags: np.ndarray, - nsamples: np.ndarray + fl: Path, slc: slice, data: np.ndarray, flags: np.ndarray, nsamples: np.ndarray ): """Write a baseline slice to a file.""" - with h5py.File(fl, 'r+') as f: - ntimes = int(f['Header']['Ntimes'][()]) - timefirst = bool(f['Header']['time_axis_faster_than_bls'][()]) + with h5py.File(fl, "r+") as f: + ntimes = int(f["Header"]["Ntimes"][()]) + timefirst = bool(f["Header"]["time_axis_faster_than_bls"][()]) if not timefirst and ntimes > 1: raise NotImplementedError("Can only do time-first files for now.") - slc = slice(slc.start*ntimes, slc.stop*ntimes, 1) - f['Data']['visdata'][slc] = data.reshape((-1, data.shape[2], data.shape[3])) - f['Data']['flags'][slc] = flags.reshape((-1, data.shape[2], data.shape[3])) - f['Data']['nsamples'][slc] = nsamples.reshape((-1, data.shape[2], data.shape[3])) + slc = slice(slc.start * ntimes, slc.stop * ntimes, 1) + f["Data"]["visdata"][slc] = data.reshape((-1, data.shape[2], data.shape[3])) + f["Data"]["flags"][slc] = flags.reshape((-1, data.shape[2], data.shape[3])) + f["Data"]["nsamples"][slc] = nsamples.reshape( + (-1, data.shape[2], data.shape[3]) + ) + def config_lst_bin_files( data_files: list[list[str | FastUVH5Meta]], - dlst: float | None=None, - atol: float=1e-10, - lst_start: float | None=None, - lst_width: float = 2*np.pi, + dlst: float | None = None, + atol: float = 1e-10, + lst_start: float | None = None, + lst_width: float = 2 * np.pi, blts_are_rectangular: bool | None = None, time_axis_faster_than_bls: bool | None = None, ntimes_per_file: int | None = None, @@ -1533,27 +2025,23 @@ def config_lst_bin_files( Returns ------- lst_grid : float ndarray holding LST bin centers. - dlst : float, LST bin width of output lst_grid - file_lsts : list, contains the lst grid of each output file. Empty files are dropped. - begin_lst : float, starting lst for LST binner. If lst_start is not None, this equals lst_start. - lst_arrays : list, list of lst arrays for each file. These will have 2 pis added or subtracted - to match the range of lst_grid given lst_start - time_arrays : list, list of time arrays for each file + matched_files : list of lists of files, one list for each output file. 
""" logger.info("Configuring lst_grid") - data_files=[sorted(df) for df in data_files] + data_files = [sorted(df) for df in data_files] df0 = data_files[0][0] # get dlst from first data file if None if dlst is None: - dlst = io.get_file_times(df0, filetype='uvh5')[0] + dlst = io.get_file_times(df0, filetype="uvh5")[0] # Get rectangularity of blts from first file if None meta = FastUVH5Meta( - df0, blts_are_rectangular=blts_are_rectangular, - time_axis_faster_than_bls=time_axis_faster_than_bls + df0, + blts_are_rectangular=blts_are_rectangular, + time_axis_faster_than_bls=time_axis_faster_than_bls, ) if blts_are_rectangular is None: @@ -1570,7 +2058,8 @@ def config_lst_bin_files( df[0], blts_are_rectangular=blts_are_rectangular, time_axis_faster_than_bls=time_axis_faster_than_bls, - ) for df in data_files + ) + for df in data_files ] # get begin_lst from lst_start or from the first JD in the data_files @@ -1582,7 +2071,7 @@ def config_lst_bin_files( lst_grid = make_lst_grid(dlst, begin_lst=begin_lst, lst_width=lst_width) dlst = lst_grid[1] - lst_grid[0] - lst_edges = np.concatenate([lst_grid - dlst/2, [lst_grid[-1] + dlst/2]]) + lst_edges = np.concatenate([lst_grid - dlst / 2, [lst_grid[-1] + dlst / 2]]) # Now, what we need here is actually the lst_edges of the *files*, not the actual # bins. @@ -1596,36 +2085,42 @@ def config_lst_bin_files( for fllist in data_files: matched = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fllist, + file_list=fllist, files_sorted=True, jd_regex=jd_regex, blts_are_rectangular=blts_are_rectangular, time_axis_faster_than_bls=time_axis_faster_than_bls, - atol=atol + atol=atol, ) for i, m in enumerate(matched): matched_files[i].append(m) nfiles = int(np.ceil(len(lst_grid) / ntimes_per_file)) - lst_grid = [lst_grid[ntimes_per_file*i:ntimes_per_file*(i+1)] for i in range(nfiles)] + lst_grid = [ + lst_grid[ntimes_per_file * i: ntimes_per_file * (i + 1)] for i in range(nfiles) + ] # Only keep output files that have data associated - lst_grid = [lg for lg, mf in zip(lst_grid, matched_files) if any(len(mff)>0 for mff in mf)] - matched_files = [mf for mf in matched_files if any(len(mff)>0 for mff in mf)] + lst_grid = [ + lg for lg, mf in zip(lst_grid, matched_files) if any(len(mff) > 0 for mff in mf) + ] + matched_files = [mf for mf in matched_files if any(len(mff) > 0 for mff in mf)] return lst_grid, matched_files + def make_lst_bin_config_file( config_file: str | Path, data_files: list[list[str | FastUVH5Meta]], clobber: bool = False, - dlst: float | None=None, - atol: float=1e-10, - lst_start: float | None=None, - lst_width: float = 2*np.pi, - ntimes_per_file: int=60, + dlst: float | None = None, + atol: float = 1e-10, + lst_start: float | None = None, + lst_width: float = 2 * np.pi, + ntimes_per_file: int = 60, blts_are_rectangular: bool | None = None, time_axis_faster_than_bls: bool | None = None, jd_regex: str = r"zen\.(\d+\.\d+)\.", + lst_branch_cut: float | None = None, ) -> dict[str, Any]: """Construct and write a YAML configuration file for lst-binning. @@ -1675,6 +2170,12 @@ def make_lst_bin_config_file( jd_regex : str, optional Regex to use to extract the JD from the file name. Set to None or empty to force the LST-matching to use the LSTs in the metadata within the file. + lst_branch_cut + The LST at which to branch cut the LST grid for file writing. The JDs in the + output LST-binned files will be *lowest* at the lst_branch_cut, and all file + names will have LSTs that are higher than lst_branch_cut. 
If None, this will + be determined automatically by finding the largest gap in LSTs and starting + AFTER it. Returns ------- @@ -1694,55 +2195,68 @@ def make_lst_bin_config_file( blts_are_rectangular=blts_are_rectangular, time_axis_faster_than_bls=time_axis_faster_than_bls, jd_regex=jd_regex, + ntimes_per_file=ntimes_per_file, ) + # Get the best lst_branch_cut by finding the largest gap in LSTs and starting + # AFTER it + if lst_branch_cut is None: + lst_branch_cut = float(utils.get_best_lst_branch_cut(np.concatenate(lst_grid))) + dlst = lst_grid[0][1] - lst_grid[0][0] # Make it a real list of floats to make the YAML easier to read - lst_grid = [[float(l) for l in lsts] for lsts in lst_grid] + lst_grid = [[float(lst) for lst in lsts] for lsts in lst_grid] # now matched files is a list of output files, each containing a list of nights, # each containing a list of files logger.info("Getting metadata from first file...") + def get_meta(): for outfile in matched_files: for night in outfile: for i, fl in enumerate(night): return fl + meta = get_meta() tint = np.median(meta.integration_time) if not np.all(np.abs(np.diff(np.diff(meta.times))) < 1e-6): raise ValueError( - 'All integrations must be of equal length (BDA not supported), got diffs: ' - f'{np.diff(meta.times)}' + "All integrations must be of equal length (BDA not supported), got diffs: " + f"{np.diff(meta.times)}" ) - matched_files = [[[str(m.path) for m in night] for night in outfiles] for outfiles in matched_files] + matched_files = [ + [[str(m.path) for m in night] for night in outfiles] + for outfiles in matched_files + ] output = { - 'config_params': { - 'dlst': float(dlst), - 'atol': atol, - 'lst_start': lst_start, - 'lst_width': lst_width, - 'jd_regex': jd_regex, + "config_params": { + "dlst": float(dlst), + "atol": atol, + "lst_start": lst_start, + "lst_width": lst_width, + "jd_regex": jd_regex, + }, + "lst_grid": lst_grid, + "matched_files": matched_files, + "metadata": { + "x_orientation": meta.x_orientation, + "blts_are_rectangular": meta.blts_are_rectangular, + "time_axis_faster_than_bls": meta.time_axis_faster_than_bls, + "start_jd": int(meta.times[0]), + "integration_time": float(tint), + "lst_branch_cut": lst_branch_cut, }, - 'lst_grid': lst_grid, - 'matched_files': matched_files, - 'metadata': { - 'x_orientation': meta.x_orientation, - 'blts_are_rectangular': meta.blts_are_rectangular, - 'time_axis_faster_than_bls': meta.time_axis_faster_than_bls, - 'start_jd': int(meta.times[0]), - 'integration_time': float(tint), - } } - with open(config_file, 'w') as fl: + with open(config_file, "w") as fl: yaml.safe_dump(output, fl) return output + def lst_bin_arg_parser(): """ arg parser for lst_bin_files() function. data_files argument must be quotation-bounded @@ -1759,32 +2273,166 @@ def lst_bin_arg_parser(): "Consult lstbin.lst_bin_files() for further details on functionality." ) ) - a.add_argument('configfile', type=str, help="config file produced by lstbin.make_lst_bin_config_file") a.add_argument( - "--calfile-rules", nargs='*', type=str, - help="rules to convert datafile names to calfile names. A series of two strings where the first will be replaced by the latter" + "configfile", + type=str, + help="config file produced by lstbin.make_lst_bin_config_file", + ) + a.add_argument( + "--calfile-rules", + nargs="*", + type=str, + help="rules to convert datafile names to calfile names. 
A series of two strings where the first will be replaced by the latter", + ) + a.add_argument( + "--fname-format", + type=str, + default="zen.{kind}.{lst:7.5f}.uvh5", + help="filename format for output files. See docstring for details.", + ) + a.add_argument( + "--outdir", default=None, type=str, help="directory for writing output" + ) + a.add_argument( + "--overwrite", default=False, action="store_true", help="overwrite output files" + ) + a.add_argument( + "--rephase", + default=False, + action="store_true", + help="rephase data to center of LST bin before binning", + ) + a.add_argument( + "--history", default=" ", type=str, help="history to insert into output files" + ) + a.add_argument( + "--output_file_select", + default=None, + nargs="*", + type=int, + help="list of output file integers to run on. Default is all output files.", + ) + a.add_argument( + "--vis_units", default="Jy", type=str, help="visibility units of output files." + ) + a.add_argument( + "--ignore_flags", + default=False, + action="store_true", + help="Ignore flags in data files, such that all input data is included in binning.", + ) + a.add_argument( + "--Nbls_to_load", + default=None, + type=int, + help="Number of baselines to load and bin simultaneously. Default is all.", + ) + a.add_argument( + "--ex_ant_yaml_files", + default=None, + type=str, + nargs="+", + help="list of paths to yamls with lists of antennas from each night to exclude lstbinned data files.", + ) + a.add_argument( + "--ignore-ants", default=(), type=int, nargs="+", help="ants to ignore" + ) + a.add_argument( + "--ignore-missing-calfiles", + default=False, + action="store_true", + help="if true, any datafile with missing calfile will just be removed from lstbinning.", + ) + a.add_argument( + "--write_kwargs", + default="{}", + type=str, + help="json dictionary of arguments to the uvh5 writer", + ) + a.add_argument( + "--golden-lsts", + type=str, + help="LSTS (rad) to save longitudinal data for, separated by commas", + ) + a.add_argument( + "--save-channels", + type=str, + help="integer channels separated by commas to save longitudinal data for", + ) + a.add_argument( + "--sigma-clip-thresh", + type=float, + help="sigma clip threshold for flagging data in an LST bin over time. 
Zero means no clipping.", + default=None, + ) + a.add_argument( + "--sigma-clip-min-N", + type=int, + help="number of unflagged data points over time to require before considering sigma clipping", + default=4, + ) + a.add_argument( + "--flag-below-min-N", + action="store_true", + help="if true, flag all data in an LST bin if there are fewer than --sigma-clip-min-N unflagged data points over time", + ) + a.add_argument( + "--flag-thresh", + type=float, + help="fraction of integrations in an LST bin for a particular (antpair, pol, channel) that must be flagged for the entire bin to be flagged", + default=0.7, + ) + a.add_argument( + "--redundantly-averaged", + action="store_true", + default=None, + help="if true, assume input files are redundantly averaged", + ) + a.add_argument( + "--only-last-file-per-night", + action="store_true", + default=False, + help="if true, only use the first and last file every night to obtain antpairs", + ) + a.add_argument( + "--freq-min", + type=float, + default=None, + help="minimum frequency to include in lstbinning", + ) + a.add_argument( + "--freq-max", + type=float, + default=None, + help="maximum frequency to include in lstbinning", + ) + a.add_argument( + "--no-flagged-mode", + action="store_true", + help="turn off output of flagged mode LST-binning", + ) + a.add_argument( + "--do-inpaint-mode", + action="store_true", + default=None, + help="turn on inpainting mode LST-binning", + ) + a.add_argument( + "--where-inpainted-file-rules", + nargs="*", + type=str, + help="rules to convert datafile names to where-inpainted-file names. A series of two strings where the first will be replaced by the latter", + ) + a.add_argument( + "--sigma-clip-in-inpainted-mode", + action="store_true", + default=False, + help="allow sigma-clipping in inpainted mode", + ) + a.add_argument( + "--write-med-mad", + action="store_true", + default=False, + help="option to write out MED/MAD files in addition to LST/STD files", ) - a.add_argument("--fname-format", type=str, default="zen.{kind}.{lst:7.5f}.uvh5", help="filename format for output files. See docstring for details.") - a.add_argument("--outdir", default=None, type=str, help="directory for writing output") - a.add_argument("--overwrite", default=False, action='store_true', help="overwrite output files") - a.add_argument("--rephase", default=False, action='store_true', help="rephase data to center of LST bin before binning") - a.add_argument("--history", default=' ', type=str, help="history to insert into output files") - a.add_argument("--output_file_select", default=None, nargs='*', type=int, help="list of output file integers to run on. Default is all output files.") - a.add_argument("--vis_units", default='Jy', type=str, help="visibility units of output files.") - a.add_argument("--ignore_flags", default=False, action='store_true', help="Ignore flags in data files, such that all input data is included in binning.") - a.add_argument("--Nbls_to_load", default=None, type=int, help="Number of baselines to load and bin simultaneously. 
Default is all.") - a.add_argument("--ex_ant_yaml_files", default=None, type=str, nargs='+', help="list of paths to yamls with lists of antennas from each night to exclude lstbinned data files.") - a.add_argument("--ignore-ants", default=(), type=int, nargs='+', help='ants to ignore') - a.add_argument("--ignore-missing-calfiles", default=False,action='store_true', help='if true, any datafile with missing calfile will just be removed from lstbinning.') - a.add_argument("--write_kwargs", default='{}', type=str, help="json dictionary of arguments to the uvh5 writer") - a.add_argument("--golden-lsts", type=str, help="LSTS (rad) to save longitudinal data for, separated by commas") - a.add_argument("--save-channels", type=str, help="integer channels separated by commas to save longitudinal data for") - a.add_argument("--sigma-clip-thresh", type=float, help="sigma clip threshold for flagging data in an LST bin over time. Zero means no clipping.", default=None) - a.add_argument("--sigma-clip-min-N", type=int, help="number of unflagged data points over time to require before considering sigma clipping", default=4) - a.add_argument("--flag-below-min-N", action='store_true', help="if true, flag all data in an LST bin if there are fewer than --sigma-clip-min-N unflagged data points over time") - a.add_argument("--flag-thresh", type=float, help="fraction of integrations in an LST bin for a particular (antpair, pol, channel) that must be flagged for the entire bin to be flagged", default=0.7) - a.add_argument("--redundantly-averaged", action='store_true', default=None, help="if true, assume input files are redundantly averaged") - a.add_argument("--only-last-file-per-night", action='store_true', default=False, help="if true, only use the first and last file every night to obtain antpairs") - a.add_argument("--freq-min", type=float, default=None, help="minimum frequency to include in lstbinning") - a.add_argument("--freq-max", type=float, default=None, help="maximum frequency to include in lstbinning") return a diff --git a/hera_cal/nucal.py b/hera_cal/nucal.py index f3c533754..7fd1f9288 100644 --- a/hera_cal/nucal.py +++ b/hera_cal/nucal.py @@ -503,7 +503,7 @@ def compute_spectral_filters(freqs, spectral_filter_half_width, eigenval_cutoff= return dspec.dpss_operator(freqs, [0], [spectral_filter_half_width], eigenval_cutoff=[eigenval_cutoff])[0].real -def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filter_half_width=1, eigenval_cutoff=1e-12): +def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filter_half_width=1, eigenval_cutoff=1e-12, umin=None, umax=None): """ Compute prolate spheroidal wave function (PSWF) filters for a single radially redundant group. @@ -522,6 +522,14 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt the uv-plane to be modeled at half-wavelength scales. eigenval_cutoff : float Cutoff for the eigenvalues of the PSWF filter + umin : float, optional, default=None + Minimum u-mode at which the filters are computed. If None, filter bounds will be computed from the minimum frequency value and shortest + baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal model if one is uininterested in u-modes below + umin. + umax : float, optional, default=None + Maximum u-mode at which the filters are computed. If None, filter bounds will be computed from the maximum frequency value and longest + baseline length. 
Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal model particularly if the + baseline group has a few long baselines an one is uininterested in u-modes above umax. Returns: ------- @@ -531,8 +539,10 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt # Compute the minimum and maximum u values for the spatial filter group_bls_lengths = [bls_lengths[bl] for bl in group] - umin = np.min(group_bls_lengths) / SPEED_OF_LIGHT * np.min(freqs) - umax = np.max(group_bls_lengths) / SPEED_OF_LIGHT * np.max(freqs) + if umin is None: + umin = np.min(group_bls_lengths) / SPEED_OF_LIGHT * np.min(freqs) + if umax is None: + umax = np.max(group_bls_lengths) / SPEED_OF_LIGHT * np.max(freqs) # Create dictionary for storing spatial filters spatial_filters = {} @@ -550,7 +560,7 @@ def compute_spatial_filters_single_group(group, freqs, bls_lengths, spatial_filt return spatial_filters -def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eigenval_cutoff=1e-12, cache={}): +def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eigenval_cutoff=1e-12, umin=None, umax=None): """ Compute prolate spheroidal wave function (PSWF) filters for each radially redundant group in radial_reds. Note that if you are using a large array with a large range of short and long baselines in an individual radially @@ -568,8 +578,15 @@ def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eig modeling foregrounds out to the horizon. eigenval_cutoff : float, default=1e-12 Sinc matrix eigenvalue cutoffs to use for included PSWF modes. - cache : dictionary, default={} - Dictionary containing cached PSWF eigenvectors to speed up computation + umin : float, optional, default=None + Minimum u-mode at which the filters are computed. If None, filter bounds will be computed from the minimum frequency value and shortest + baseline length. Restricting the minimum u-mode can decrease the degrees of freedom in a nucal model if one is uininterested in u-modes below + umin. If umin is not None, umin will be applied to all baseline groups in radial reds. + umax : float, optional, default=None + Maximum u-mode at which the filters are computed. If None, filter bounds will be computed from the maximum frequency value and longest + baseline length. Restricting the maximum u-mode can significantly decrease the degrees of freedom in a nucal model particularly if the + baseline group has a few long baselines an one is uininterested in u-modes above umax. If umax is not None, umax will be applied to all + baseline groups in radial reds. Returns: ------- @@ -584,7 +601,7 @@ def compute_spatial_filters(radial_reds, freqs, spatial_filter_half_width=1, eig for group in radial_reds: # Compute spatial filters for each baseline in the group spatial_filters.update(compute_spatial_filters_single_group( - group, freqs, radial_reds.baseline_lengths, spatial_filter_half_width, eigenval_cutoff + group, freqs, radial_reds.baseline_lengths, spatial_filter_half_width, eigenval_cutoff, umin=umin, umax=umax ) ) diff --git a/hera_cal/redcal.py b/hera_cal/redcal.py index cdf127acf..c63b9a5a1 100644 --- a/hera_cal/redcal.py +++ b/hera_cal/redcal.py @@ -832,20 +832,27 @@ def __init__(self, data, sol0, wgts={}, gain=.3, **kwargs): **kwargs: keyword arguments of constants (python variables in keys of data that are not to be solved for) which are passed to linsolve.LinProductSolver. 
""" - linsolve.LinProductSolver.__init__(self, data, sol0, wgts=wgts, **kwargs) + linsolve.LinProductSolver.__init__(self, data, sol0, wgts=wgts, build_solver=False, **kwargs) self.gain = np.float32(gain) # float32 to avoid accidentally promoting data to doubles. - def _get_ans0(self, sol, keys=None): + def _get_ans0(self, sol, keys=None, to_update_inplace=None): '''Evaluate the system of equations given input sol. - Specify keys to evaluate only a subset of the equations.''' + Specify keys to evaluate only a subset of the equations. + to_update_inplace is a dictionary mapping keys or self.keys() + to numpy arrays that we want to overwrite with ans0 values.''' if keys is None: keys = self.keys _sol = {k + '_': v.conj() for k, v in sol.items() if k.startswith('g')} _sol.update(sol) - return {k: eval(k, _sol) for k in keys} + if to_update_inplace is None: + return {k: eval(k, _sol) for k in keys} + else: + # update in place to save memory + for k in keys: + to_update_inplace[k] = eval(k, _sol) def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, - wgt_func=lambda x: 1., verbose=False): + wgt_func=None, verbose=False): """Repeatedly solves and updates solution until convergence or maxiter is reached. Returns a meta-data about the solution and the solution itself. @@ -859,7 +866,7 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af wgt_func: a function f(abs^2 * wgt) operating on weighted absolute differences between data and model that returns an additional data weighting to apply to when calculating chisq and updating parameters. Example: lambda x: np.where(x>0, 5*np.tanh(x/5)/x, 1) - clamps deviations to 5 sigma. Default is no additional weighting (lambda x: 1.). + clamps deviations to 5 sigma. Default None is no additional weighting. Returns: meta, sol meta: a dictionary with metadata about the solution, including @@ -873,18 +880,20 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af for term in self.all_terms for (gi, gj, uij) in term] dmdl_u = self._get_ans0(sol) abs2_u = {k: np.abs(self.data[k] - dmdl_u[k])**2 * self.wgts[k] for k in self.keys} - chisq = sum([v * wgt_func(v) for v in abs2_u.values()]) + chisq = sum([(v if wgt_func is None else v * wgt_func(v)) for v in abs2_u.values()]) update = np.where(chisq > 0) - abs2_u = {k: v[update] for k, v in abs2_u.items()} + for k, v in abs2_u.items(): + abs2_u[k] = v[update] # variables with '_u' are flattened and only include pixels that need updating - dmdl_u = {k: v[update].flatten() for k, v in dmdl_u.items()} + for k, v in dmdl_u.items(): + dmdl_u[k] = v[update].flatten() # wgts_u hold the wgts the user provides wgts_u = {k: (v * np.ones(chisq.shape, dtype=np.float32))[update].flatten() for k, v in self.wgts.items()} # clamp_wgts_u adds additional sigma clamping done by wgt_func. # abs2_u holds abs(data - mdl)**2 * wgt (i.e. noise-weighted deviations), which is # passed to wgt_func to determine any additional weighting (to, e.g., clamp outliers). 
- clamp_wgts_u = {k: v * wgt_func(abs2_u[k]) for k, v in wgts_u.items()} + clamp_wgts_u = (wgts_u if wgt_func is None else {k: v * wgt_func(abs2_u[k]) for k, v in wgts_u.items()}) sol_u = {k: v[update].flatten() for k, v in sol.items()} iters = np.zeros(chisq.shape, dtype=int) conv = np.ones_like(chisq) @@ -895,14 +904,14 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af # compute data wgts: dwgts = sum(V_mdl^2 / n^2) = sum(V_mdl^2 * wgts) # don't need to update data weighting with every iteration # clamped weighting is passed to dwgts_u, which is used to update parameters - dwgts_u = {k: dmdl_u[k] * dmdl_u[k].conj() * clamp_wgts_u[k] for k in self.keys} sol_wgt_u = {k: 0 for k in sol.keys()} + dw_u = {} for k, (gi, gj, uij) in zip(self.keys, terms): - w = dwgts_u[k] - sol_wgt_u[gi] += w - sol_wgt_u[gj] += w - sol_wgt_u[uij] += w - dw_u = {k: v[update] * dwgts_u[k] for k, v in self.data.items()} + dwgts_u = dmdl_u[k] * dmdl_u[k].conj() * clamp_wgts_u[k] + sol_wgt_u[gi] += dwgts_u + sol_wgt_u[gj] += dwgts_u + sol_wgt_u[uij] += dwgts_u + dw_u[k] = self.data[k][update] * dwgts_u sol_sum_u = {k: 0 for k in sol_u.keys()} for k, (gi, gj, uij) in zip(self.keys, terms): # compute sum(wgts * V_meas / V_mdl) @@ -912,22 +921,24 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af sol_sum_u[uij] += numerator new_sol_u = {k: v * ((1 - self.gain) + self.gain * sol_sum_u[k] / sol_wgt_u[k]) for k, v in sol_u.items()} - dmdl_u = self._get_ans0(new_sol_u) + self._get_ans0(new_sol_u, to_update_inplace=dmdl_u) # check if i % check_every is 0, which is purposely one less than the '1' up at the top of the loop if i < maxiter and (i < check_after or (i % check_every) != 0): # Fast branch when we aren't expensively computing convergence/chisq sol_u = new_sol_u else: # Slow branch when we compute convergence/chisq - abs2_u = {k: np.abs(v[update] - dmdl_u[k])**2 * wgts_u[k] for k, v in self.data.items()} - new_chisq_u = sum([v * wgt_func(v) for v in abs2_u.values()]) + for k, v in self.data.items(): + abs2_u[k] = np.abs(v[update] - dmdl_u[k])**2 * wgts_u[k] + new_chisq_u = sum([(v if wgt_func is None else v * wgt_func(v)) for v in abs2_u.values()]) chisq_u = chisq[update] gotbetter_u = (chisq_u > new_chisq_u) where_gotbetter_u = np.where(gotbetter_u) update_where = tuple(u[where_gotbetter_u] for u in update) chisq[update_where] = new_chisq_u[where_gotbetter_u] iters[update_where] = i - new_sol_u = {k: np.where(gotbetter_u, v, sol_u[k]) for k, v in new_sol_u.items()} + for k, v in new_sol_u.items(): + new_sol_u[k] = np.where(gotbetter_u, v, sol_u[k]) deltas_u = [v - sol_u[k] for k, v in new_sol_u.items()] conv_u = np.sqrt(sum([(v * v.conj()).real for v in deltas_u]) / sum([(v * v.conj()).real for v in new_sol_u.values()])) @@ -938,11 +949,16 @@ def solve_iteratively(self, conv_crit=1e-10, maxiter=50, check_every=4, check_af if update_u[0].size == 0 or i == maxiter: meta = {'iter': iters, 'chisq': chisq, 'conv_crit': conv} return meta, sol - dmdl_u = {k: v[update_u] for k, v in dmdl_u.items()} - wgts_u = {k: v[update_u] for k, v in wgts_u.items()} - sol_u = {k: v[update_u] for k, v in new_sol_u.items()} - abs2_u = {k: v[update_u] for k, v in abs2_u.items()} - clamp_wgts_u = {k: v * wgt_func(abs2_u[k]) for k, v in wgts_u.items()} + for k, v in dmdl_u.items(): + dmdl_u[k] = v[update_u] + for k, v in wgts_u.items(): + wgts_u[k] = v[update_u] + for k, v in abs2_u.items(): + abs2_u[k] = v[update_u] + for k, v in sol_u.items(): + sol_u[k] = v[update_u] + for 
k, v in wgts_u.items(): + clamp_wgts_u[k] = (v if wgt_func is None else v * wgt_func(abs2_u[k])) update = tuple(u[update_u] for u in update) if verbose: print(' = %f, = %f, CNT = %d', (np.mean(chisq), np.mean(conv), update[0].size)) @@ -1112,14 +1128,18 @@ def _solver(self, solver, data, wgts={}, detrend_phs=False, **kwargs): dc = DataContainer(data) eqs = self.build_eqs(dc) self.phs_avg = {} # detrend phases within redundant group, used for logcal to avoid phase wraps + d_ls, w_ls = {}, {} if detrend_phs: for grp in self.reds: self.phs_avg[grp[0]] = np.exp(-np.complex64(1j) * np.median(np.unwrap([np.log(dc[bl]).imag for bl in grp], axis=0), axis=0)) for bl in grp: self.phs_avg[bl] = self.phs_avg[grp[0]].astype(dc[bl].dtype) - d_ls, w_ls = {}, {} - for eq, key in eqs.items(): - d_ls[eq] = dc[key] * self.phs_avg.get(key, np.float32(1)) + for eq, key in eqs.items(): + # this makes a copy, which prevents modification of the data when detrending phase + d_ls[eq] = dc[key] * self.phs_avg.get(key, np.float32(1)) + else: + for eq, key in eqs.items(): + d_ls[eq] = dc[key] if len(wgts) > 0: wc = DataContainer(wgts) for eq, key in eqs.items(): @@ -1289,7 +1309,7 @@ def lincal(self, data, sol0, wgts={}, sparse=False, mode='default', conv_crit=1e sol = RedSol(self.reds, sol_dict=prms) return meta, sol - def omnical(self, data, sol0, wgts={}, gain=.3, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, wgt_func=lambda x: 1.): + def omnical(self, data, sol0, wgts={}, gain=.3, conv_crit=1e-10, maxiter=50, check_every=4, check_after=1, wgt_func=None): """Use the Liu et al 2010 Omnical algorithm to linearize equations and iteratively minimize chi^2. Args: @@ -1307,7 +1327,7 @@ def omnical(self, data, sol0, wgts={}, gain=.3, conv_crit=1e-10, maxiter=50, che wgt_func: a function f(abs^2 * wgt) operating on weighted absolute differences between data and model that returns an additional data weighting to apply to when calculating chisq and updating parameters. Example: lambda x: np.where(x>0, 5*np.tanh(x/5)/x, 1) - clamps deviations to 5 sigma. Default is no additional weighting (lambda x: 1.). + clamps deviations to 5 sigma. Default None is no additional weighting. 
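To make the new convention concrete, a small sketch with made-up numbers (not part of the diff): wgt_func=None is numerically equivalent to the old default lambda x: 1. but skips the per-pixel multiply, while the clamping example quoted above saturates each weighted squared deviation near 5.

import numpy as np

abs2 = np.array([1.0, 4.0, 25.0, 400.0])    # hypothetical |data - model|^2 * wgt values

def clamp(x):
    # the clamping example quoted in the docstring above
    return np.where(x > 0, 5 * np.tanh(x / 5) / x, 1)

chisq_plain = abs2.sum()                    # what wgt_func=None now computes: 430
chisq_clamped = (abs2 * clamp(abs2)).sum()  # each term becomes 5*tanh(abs2/5), so the 400 entry contributes only ~5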
Returns: meta: dictionary of information about the convergence and chi^2 of the solution @@ -1887,7 +1907,7 @@ def redcal_iteration(hd, nInt_to_load=None, pol_mode='2pol', bl_error_tol=1.0, e # try to solve for gains on antennas excluded from calibration, but keep them flagged expand_omni_gains(sol, all_reds_this_pol, data, nsamples, chisq_per_ant=meta['chisq_per_ant']) - # try one more time to expand visibilities, keeping new ones flagged, but + # try one more time to expand visibilities, keeping new ones flagged, but # don't update chisq or chisq_per_ant expand_omni_vis(sol, all_reds_this_pol, data, nsamples) sol.make_sol_finite() diff --git a/hera_cal/reflections.py b/hera_cal/reflections.py index a34ee5d5d..225c5de45 100644 --- a/hera_cal/reflections.py +++ b/hera_cal/reflections.py @@ -1320,7 +1320,7 @@ def L(x, amp, dly, phs, clean_data, clean_model, clean_wgts, dly_range, freqs, f else: x0.append(phs0[i]) _phs = None - x0 = np.array(x0) + x0 = np.squeeze(x0) # optimize res = minimize(L, x0, args=(_amp, _dly, _phs, clean_data[i], clean_model[i], clean_wgts[i], dly_range, freqs, fft_kwargs), method=method, tol=tol, options=dict(maxiter=maxiter)) diff --git a/hera_cal/smooth_cal.py b/hera_cal/smooth_cal.py index 7f04e5a26..2cbbb0bb8 100644 --- a/hera_cal/smooth_cal.py +++ b/hera_cal/smooth_cal.py @@ -648,7 +648,10 @@ def build_time_blacklist(time_grid, time_blacklists=[], lst_blacklists=[], lat_l raise NotImplementedError(f'No known position for telescope {telescope_name}. lat_lon_alt_degrees must be specified.') # calculate LST grid in hours from time grid and lat_lon_alt - lst_grid = pyuvdata.utils.get_lst_for_time(time_grid, *lat_lon_alt_degrees) * 12 / np.pi + lat, lon, alt = lat_lon_alt_degrees + lst_grid = pyuvdata.utils.get_lst_for_time( + time_grid, latitude=lat, longitude=lon, altitude=alt + ) * 12 / np.pi # add blacklisted times from lst_blacklists for bounds in lst_blacklists: diff --git a/hera_cal/tests/mock_uvdata.py b/hera_cal/tests/mock_uvdata.py index b96f80504..ee64e2afc 100644 --- a/hera_cal/tests/mock_uvdata.py +++ b/hera_cal/tests/mock_uvdata.py @@ -1,7 +1,7 @@ """Functions for mocking UVdata objects for testing purposes.""" from __future__ import annotations -from pyuvdata import UVData, UVCal +from pyuvdata import UVData, UVCal, UVFlag from pyuvdata.uvdata import FastUVH5Meta from pyuvdata.telescopes import get_telescope @@ -24,17 +24,18 @@ with open(f"{DATA_PATH}/hera_antpos.yaml", "r") as fl: HERA_ANTPOS = yaml.safe_load(fl) -start: 46920776.3671875 -end: 234298706.0546875 -delta: 122070.3125 +PHASEII_FREQS = np.arange( + 46920776.3671875, 234298706.0546875 + 10.0, 122070.3125 +) + def create_mock_hera_obs( jdint: int = 2459855, integration_time=9.663677215576172, lst_start=0.1, jd_start: float | None = None, - ntimes: int=2, - freqs: np.ndarray = np.arange(46920776.3671875, 234298706.0546875 + 10.0, 122070.3125), + ntimes: int = 2, + freqs: np.ndarray = PHASEII_FREQS, pols: list[str] = ["xx", "yy", "xy", "yx"], ants: list[int] | None = None, antpairs: list[tuple[int, int]] | None = None, @@ -49,17 +50,17 @@ def create_mock_hera_obs( if jd_start is None: # We ensure that the LSTs align exactly with the LST-grid that would be LST-binned. 
lst_start = make_lst_grid(dlst, begin_lst=lst_start)[0] - lsts = np.arange(lst_start, ntimes*dlst + lst_start, dlst)[:ntimes] + lsts = np.arange(lst_start, ntimes * dlst + lst_start, dlst)[:ntimes] times = utils.LST2JD(lsts, start_jd=jdint, allow_other_jd=False) else: if jd_start < 2000000: jd_start += jdint - times = np.arange(jd_start, ntimes*tint + jd_start, tint)[:ntimes] + times = np.arange(jd_start, ntimes * tint + jd_start, tint)[:ntimes] if ants is None: ants = list(HERA_ANTPOS.keys()) - antpos = {k: v for k,v in HERA_ANTPOS.items() if k in ants} + antpos = {k: v for k, v in HERA_ANTPOS.items() if k in ants} reds = RedundantGroups.from_antpos(antpos) @@ -75,51 +76,54 @@ def create_mock_hera_obs( # build a UVData object uvd = UVData.new( - freq_array= freqs, + freq_array=freqs, polarization_array=pols, - antenna_positions= antpos, + antenna_positions=antpos, antpairs=np.array(antpairs), - telescope_location= HERA_LOC, - telescope_name= "HERA", - times= times, + telescope_location=HERA_LOC, + telescope_name="HERA", + times=times, x_orientation=x_orientation, empty=empty, time_axis_faster_than_bls=time_axis_faster_than_bls, - do_blt_outer=True + do_blt_outer=True, ) uvd.polarization_array = np.array(uvd.polarization_array) return uvd def create_uvd_ones(**kwargs) -> UVData: - uvd = create_mock_hera_obs( - empty=True, - **kwargs - ) + uvd = create_mock_hera_obs(empty=True, **kwargs) uvd.data_array += 1.0 return uvd + def identifiable_data_from_uvd( ones: UVData, reshape: bool = True, ) -> np.ndarray: lsts = np.unique(ones.lst_array) - normfreqs = 2*np.pi * (ones.freq_array - 100e6) / 100e6 + normfreqs = 2 * np.pi * (ones.freq_array - 100e6) / 100e6 data = np.zeros_like(ones.data_array) orig_shape = data.shape # in-place reshape of data to make it easier for us. - data.shape = (len(ones.get_antpairs()), ones.Ntimes, ones.Nfreqs, len(ones.polarization_array)) + data.shape = ( + len(ones.get_antpairs()), + ones.Ntimes, + ones.Nfreqs, + len(ones.polarization_array), + ) for i, ap in enumerate(ones.get_antpairs()): for j, p in enumerate(ones.polarization_array): if ap[0] == ap[1] and p in (-5, -6): - d = np.outer(-p*lsts*1000, (ones.freq_array / 75e6)**-2) + d = np.outer(-p * lsts * 1000, (ones.freq_array / 75e6) ** -2) else: d = np.outer( - p*lsts, np.cos(normfreqs*ap[0]) + np.sin(normfreqs*ap[1])*1j + p * lsts, np.cos(normfreqs * ap[0]) + np.sin(normfreqs * ap[1]) * 1j ) data[i, :, :, j] = d @@ -127,10 +131,13 @@ def identifiable_data_from_uvd( data.shape = orig_shape return data + def create_uvd_identifiable( with_noise: bool = False, autos_noise: bool = False, - **kwargs) -> UVData: + flag_frac: float = 0.0, + **kwargs, +) -> UVData: """Make a UVData object with identifiable data. 
Each baseline, pol, LST and freq channel should be identifiable in the data with the @@ -142,16 +149,18 @@ def create_uvd_identifiable( uvd.data_array = identifiable_data_from_uvd(uvd) if with_noise: - add_noise_to_uvd(uvd, autos = autos_noise) + add_noise_to_uvd(uvd, autos=autos_noise) + if flag_frac > 0: + add_flags_to_uvd(uvd, flag_frac=flag_frac) return uvd + def add_noise_to_uvd(uvd, autos: bool = False): hd = io.to_HERAData(uvd) - data, flags, nsamples = hd.read() - dt = (data.times[1] - data.times[0])* units.si.day.in_units(units.si.s) + dt = (data.times[1] - data.times[0]) * units.si.day.in_units(units.si.s) df = data.freqs[1] - data.freqs[0] for bl in data.bls(): @@ -161,19 +170,30 @@ def add_noise_to_uvd(uvd, autos: bool = False): variance = noise.predict_noise_variance_from_autos( bl, data, dt=dt, df=df, nsamples=nsamples ) - data[bl] += ( - np.random.normal(scale=np.sqrt(variance/2)) - + 1j * np.random.normal(scale=np.sqrt(variance/2)) - ) + data[bl] += np.random.normal( + scale=np.sqrt(variance / 2) + ) + 1j * np.random.normal(scale=np.sqrt(variance / 2)) hd.update(data=data) + +def add_flags_to_uvd(uvd, flag_frac: float = 0.1): + hd = io.to_HERAData(uvd) + data, flags, nsamples = hd.read() + + for bl in data.bls(): + if bl[0] == bl[1] and bl[2][0] == bl[2][1]: + continue + flags[bl] = np.random.uniform(size=flags[bl].shape) < flag_frac + hd.update(flags=flags) + + def write_files_in_hera_format( uvds: list[UVData] | list[list[UVData]], tmpdir: Path, fmt: str = "zen.{jd:.5f}.sum.uvh5", in_jdint_dir: bool = True, + add_where_inpainted_files: bool = False, ) -> list[list[str]]: - if not tmpdir.exists(): tmpdir.mkdir() @@ -186,7 +206,7 @@ def write_files_in_hera_format( for uvdlist in uvds: if in_jdint_dir: jdint = int(uvdlist[0].time_array.min()) - daydir = (tmpdir / str(jdint)) + daydir = tmpdir / str(jdint) if not daydir.exists(): daydir.mkdir() else: @@ -199,6 +219,14 @@ def write_files_in_hera_format( obj.initialize_uvh5_file(fl, clobber=True) else: obj.write_uvh5(fl, clobber=True) + + if add_where_inpainted_files: + flg = UVFlag() + flg.from_uvdata(obj, copy_flags=True, waterfall=False) + flg.flag_array[:] = True # Everything inpainted. 
+ flgfile = fl.with_suffix(".where_inpainted.h5") + flg.write(flgfile, clobber=True) + _fls.append(str(fl)) fls.append(_fls) @@ -207,7 +235,10 @@ def write_files_in_hera_format( else: return fls -def make_day(nfiles: int, creator: callable = create_mock_hera_obs, **kwargs) -> list[UVData]: + +def make_day( + nfiles: int, creator: callable = create_mock_hera_obs, **kwargs +) -> list[UVData]: """Make a day of UVData objects.""" uvds = [] @@ -215,17 +246,21 @@ def make_day(nfiles: int, creator: callable = create_mock_hera_obs, **kwargs) -> for i in range(nfiles): uvds.append(creator(lst_start=lst_start, **kwargs)) - if i ==0: + if i == 0: lsts = np.unique(uvds[0].lst_array) dlst = lsts[1] - lsts[0] lst_start = uvds[-1].lst_array[-1] + dlst - kwargs['jd_start'] = None # only use for first file, after, use lst_start + kwargs["jd_start"] = None # only use for first file, after, use lst_start return uvds + def make_dataset( - ndays: int, nfiles: int, start_jdint: int = 2459855, creator: callable = create_mock_hera_obs, + ndays: int, + nfiles: int, + start_jdint: int = 2459855, + creator: callable = create_mock_hera_obs, random_ants_to_drop: int = 0, - **kwargs + **kwargs, ) -> list[list[UVData]]: """Make a dataset of UVData objects.""" @@ -233,64 +268,86 @@ def make_dataset( default = creator(jdint=start_jdint, **kwargs) antpairs = default.get_antpairs() data_ants = list(set([ap[0] for ap in antpairs] + [ap[1] for ap in antpairs])) - if 'antpairs' in kwargs: - del kwargs['antpairs'] + if "antpairs" in kwargs: + del kwargs["antpairs"] uvds = [] for i in range(ndays): if random_ants_to_drop > 0: drop_ants = np.random.choice(data_ants, random_ants_to_drop, replace=False) - _antpairs = [ap for ap in antpairs if ap[0] not in drop_ants and ap[1] not in drop_ants] - kwargs['antpairs'] = _antpairs + _antpairs = [ + ap + for ap in antpairs + if ap[0] not in drop_ants and ap[1] not in drop_ants + ] + kwargs["antpairs"] = _antpairs uvds.append(make_day(nfiles, jdint=start_jdint + i, creator=creator, **kwargs)) return uvds -def make_uvc_ones(uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0): + +def make_uvc_ones( + uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0 +): uvc = UVCal.initialize_from_uvdata( uvd, - cal_style = "redundant", - gain_convention = "multiply", - jones_array = "linear", - cal_type = "gain", - metadata_only=False + cal_style="redundant", + gain_convention="multiply", + jones_array="linear", + cal_type="gain", + metadata_only=False, ) if flag_full_ant > 0: - badants = np.random.choice(np.arange(uvc.Nants_data), flag_full_ant, replace=False) + badants = np.random.choice( + np.arange(uvc.Nants_data), flag_full_ant, replace=False + ) uvc.flag_array[badants] = True if flag_ant_time > 0: - badants = np.random.choice(np.arange(uvc.Nants_data), flag_ant_time, replace=False) + badants = np.random.choice( + np.arange(uvc.Nants_data), flag_ant_time, replace=False + ) badtime = np.random.randint(0, uvc.Ntimes) uvc.flag_array[badants, :, badtime] = True if flag_ant_freq > 0: - badants = np.random.choice(np.arange(uvc.Nants_data), flag_ant_freq, replace=False) + badants = np.random.choice( + np.arange(uvc.Nants_data), flag_ant_freq, replace=False + ) badfreq = np.random.randint(0, uvc.Nfreqs) uvc.flag_array[badants, badfreq, :] = True return uvc + def identifiable_gains_from_uvc(uvc: UVCal): gains = np.zeros_like(uvc.gain_array) for i, ant in enumerate(uvc.ant_array): for j in range(uvc.Njones): gains[i, :, :, j] = np.outer( - 
np.exp(2j*np.pi*uvc.freq_array*(j+1)), - np.ones_like(uvc.time_array)*(ant+1) + np.exp(2j * np.pi * uvc.freq_array * (j + 1)), + np.ones_like(uvc.time_array) * (ant + 1), ) return gains -def make_uvc_identifiable(uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0): - uvc = make_uvc_ones(uvd, flag_full_ant=flag_full_ant, flag_ant_time=flag_ant_time, flag_ant_freq=flag_ant_freq) + +def make_uvc_identifiable( + uvd: UVData, flag_full_ant: int = 0, flag_ant_time: int = 0, flag_ant_freq: int = 0 +): + uvc = make_uvc_ones( + uvd, + flag_full_ant=flag_full_ant, + flag_ant_time=flag_ant_time, + flag_ant_freq=flag_ant_freq, + ) uvc.gain_array = identifiable_gains_from_uvc(uvc) return uvc + def write_cals_in_hera_format( uvds: list[UVCal] | list[list[UVCal]], tmpdir: Path, fmt: str = "zen.{jd:.5f}.sum.calfits", in_jdint_dir: bool = True, ) -> list[list[str]]: - if not tmpdir.exists(): tmpdir.mkdir() @@ -303,7 +360,7 @@ def write_cals_in_hera_format( for uvdlist in uvds: if in_jdint_dir: jdint = int(uvdlist[0].time_array.min()) - daydir = (tmpdir / str(jdint)) + daydir = tmpdir / str(jdint) if not daydir.exists(): daydir.mkdir() else: diff --git a/hera_cal/tests/test_datacontainer.py b/hera_cal/tests/test_datacontainer.py index 1ec1be146..29caf4e76 100644 --- a/hera_cal/tests/test_datacontainer.py +++ b/hera_cal/tests/test_datacontainer.py @@ -180,10 +180,10 @@ def test_del(self): del dc['bad_key'] with pytest.raises(ValueError, match='Tuple keys to delete must be in the format'): - del dc[(1,2,3,4)] - + del dc[(1, 2, 3, 4)] + with pytest.raises(ValueError, match='Tuple keys to delete must be in the format'): - del dc[[1,2,'xx']] + del dc[[1, 2, 'xx']] def test_getitem(self): dc = datacontainer.DataContainer(self.blpol) @@ -314,7 +314,7 @@ def test_dtype(self): assert dc.dtype is None dc = datacontainer.DataContainer({(1, 2): {'xx': 2}}) assert dc.dtype is None - + dc = datacontainer.DataContainer(self.blpolarr) assert dc.dtype == complex dc = datacontainer.DataContainer(self.bools) @@ -326,18 +326,19 @@ class SmallDataContainer: def __init__(self, d: dict): self._data = deepcopy(d) self.ants = set(sum(self._data.keys(), ())) - + def __getitem__(self, key): return self._data[key] - + def keys(self): return self._data.keys() - + blpol = SmallDataContainer(self.blpol) - + dc = datacontainer.DataContainer(blpol) assert dc.ants == blpol.ants + @pytest.mark.filterwarnings("ignore:The default for the `center` keyword has changed") class TestDataContainerWithRealData: @@ -467,6 +468,55 @@ def test_select_or_expand_times(self): assert np.all(dc.lsts == (np.arange(10) * 2 * np.pi / 10)[new_times]) assert np.all(dc.lsts_by_bl[0, 1] == (np.arange(10) * 2 * np.pi / 10)[new_times]) + def test_select_freqs(self): + fq = np.linspace(0, 1, 10) + dc1 = datacontainer.DataContainer({(0, 1, 'ee'): np.arange(10)}) + dc1.freqs = fq + + # Both of the following should pick all the freqs + dc2 = dc1.select_freqs(fq, in_place=False) + dc1.select_freqs(fq, in_place=True) + dc1.select_freqs() + + for dc in (dc1, dc2): + assert np.all(dc[(0, 1, 'ee')] == np.arange(10)) + assert np.all(dc.freqs == fq) + + # Now actually sub-select + fq2 = fq[:9] + dc2 = dc1.select_freqs(fq2, in_place=False) + dc1.select_freqs(fq2, in_place=True) + + for dc in (dc1, dc2): + assert np.all(dc[(0, 1, 'ee')] == np.arange(9)) + assert np.all(dc.freqs == fq2) + + # Sub-select by channel + chans = np.arange(8) + dc2 = dc1.select_freqs(channels=chans, in_place=False) + dc1.select_freqs(channels=chans, in_place=True) + + 
for dc in (dc1, dc2): + assert np.all(dc[(0, 1, 'ee')] == np.arange(8)) + assert np.all(dc.freqs == fq[chans]) + + # Error if both freqs and chans given + with pytest.raises(ValueError, match='Cannot specify both freqs and channels'): + dc1.select_freqs(fq, channels=chans) + + with pytest.raises(ValueError, match="All freqs must be in self.freqs"): + dc1.select_freqs(np.array([100.0])) + + # Ensure warning is raised without freqs + dc1.freqs = None + with pytest.warns(UserWarning, match='It is impossible'): + dc1.select_freqs(channels=chans) + + assert len(dc1[(0, 1, 'ee')]) == 8 + + with pytest.raises(ValueError, match='Cannot select frequencies'): + dc1.select_freqs(fq2) + def test_RedDataContainer(): # build hexagonal array and data which tells us which baseline it comes from @@ -486,7 +536,7 @@ def test_RedDataContainer(): # build an incomplete datacontainer, then finish it rdata6 = datacontainer.RedDataContainer(deepcopy(data), reds[:-1]) rdata6[reds[-1][0]] = deepcopy(data[reds[-1][0]]) - #rdata6.build_red_keys(reds) + # rdata6.build_red_keys(reds) # make sure that the data for a redundant group are being accessed from the same place in memory for i, rdata in enumerate([rdata1, rdata2, rdata3, rdata4, rdata5, rdata6]): @@ -555,4 +605,3 @@ def test_RedDataContainerKeyManipulation(): rdc[1, 3, 'ee'] = 21j with pytest.raises(ValueError): rdc.build_red_keys([[(0, 2, 'ee'), (1, 3, 'ee')]]) - diff --git a/hera_cal/tests/test_frf.py b/hera_cal/tests/test_frf.py index 5f05c5f37..1eb86c48e 100644 --- a/hera_cal/tests/test_frf.py +++ b/hera_cal/tests/test_frf.py @@ -13,6 +13,7 @@ from pyuvdata import UVData from pyuvdata import utils as uvutils import unittest +import yaml from scipy import stats from scipy import constants from pyuvdata import UVFlag, UVBeam @@ -930,6 +931,89 @@ def test_load_dayenu_filter_and_write(self, tmpdir): os.remove(outfilename) shutil.rmtree(cdir) + @pytest.mark.parametrize("flip", [True, False]) + def test_load_tophat_frfilter_and_write_with_filter_yaml(self, tmpdir, flip): + tmp_path = tmpdir.strpath + uvh5 = os.path.join( + DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5" + ) + frate_centers = {(53, 54): 0.1} + frate_half_widths = {(53, 54): 0.05} + if flip: + filter_info = dict( + filter_centers={ + str(k[::-1]): -v for k, v in frate_centers.items() + }, + filter_half_widths={ + str(k[::-1]): v for k, v in frate_half_widths.items() + }, + ) + else: + filter_info = dict( + filter_centers={str(k): v for k, v in frate_centers.items()}, + filter_half_widths={str(k): v for k, v in frate_half_widths.items()}, + ) + frate_centers = {k+("ee",): v for k, v in frate_centers.items()} + frate_half_widths = {k+("ee",): v for k, v in frate_half_widths.items()} + + with open(tmpdir / "filter_info.yaml", "w") as f: + yaml.dump(filter_info, f) + + outfilename = os.path.join(tmp_path, 'temp.h5') + frf.load_tophat_frfilter_and_write( + uvh5, + res_outfilename=outfilename, + tol=1e-4, + clobber=True, + Nbls_per_load=1, + param_file=str(tmpdir / "filter_info.yaml"), + case="param_file", + skip_autos=True, + ) + hd = io.HERAData(outfilename) + d, f, n = hd.read(bls=[(53, 54, 'ee')]) + for bl in d: + assert not np.allclose(d[bl], 0) + + frfil = frf.FRFilter(uvh5, filetype='uvh5') + frfil.read(bls=[(53, 54, 'ee')]) + frfil.tophat_frfilter( + keys=[(53, 54, 'ee')], + tol=1e-4, + verbose=True, + frate_centers=frate_centers, + frate_half_widths=frate_half_widths, + ) + + # Check that the filtered data both ways matches. 
+ np.testing.assert_almost_equal( + d[(53, 54, 'ee')], frfil.clean_resid[(53, 54, 'ee')], decimal=5 + ) + + # Check that the flags match. + np.testing.assert_array_equal(f[(53, 54, 'ee')], frfil.flags[(53, 54, 'ee')]) + + + def test_load_tophat_frfilter_and_write_with_bad_filter_yaml(self, tmpdir): + tmp_path = tmpdir.strpath + uvh5 = os.path.join( + DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5" + ) + param_file = os.path.join(DATA_PATH, "example_filter_params.yaml") + + outfilename = os.path.join(tmp_path, 'temp.h5') + with pytest.raises(ValueError, match="(53, 54)"): + frf.load_tophat_frfilter_and_write( + uvh5, + res_outfilename=outfilename, + tol=1e-4, + clobber=True, + Nbls_per_load=1, + param_file=param_file, + case="param_file", + ) + + def test_tophat_clean_argparser(self): sys.argv = [sys.argv[0], 'a', '--clobber', '--window', 'blackmanharris', '--max_frate_coeffs', '0.024', '-0.229'] parser = frf.tophat_frfilter_argparser() diff --git a/hera_cal/tests/test_io.py b/hera_cal/tests/test_io.py index c447860b9..e28fba76a 100644 --- a/hera_cal/tests/test_io.py +++ b/hera_cal/tests/test_io.py @@ -779,6 +779,7 @@ def test_to_dataarray(self): nsample_array = hd.set_data_array_with_datacontainer(n, hd.nsample_array.copy()) assert nsample_array.dtype == n.dtype + class Test_ReadHeraHdf5(object): def setup_method(self): self.uvh5_1 = os.path.join(DATA_PATH, "zen.2458116.61019.xx.HH.XRS_downselected.uvh5") @@ -943,20 +944,16 @@ def test_comp_to_HERAData_dc(self, infile, bls, pols): hd2 = io.HERAData(indata) if bls == "first10": bls = hd2.get_antpairs()[:10] - print(bls) elif bls == "conjugated": bls = hd2.get_antpairs()[:15] bls = [(b, a) for a, b in bls] - d, f, n = hd.read(check=True,bls=bls, pols=pols) + d, f, n = hd.read(check=True, bls=bls, pols=pols) d2, f2, n2 = hd2.read(bls=bls, polarizations=pols) - print(d.antpairs()) - print(d.times_by_bl.keys()) # compare all data and metadata for dc1, dc2 in zip([d, f, n], [d2, f2, n2]): self.compare_datacontainers(dc1, dc2, allow_close=infile != 'uvh5_h4c') - @pytest.mark.parametrize( 'infile', (['uvh5_1'], 'uvh5_h4c') ) @@ -1758,6 +1755,7 @@ def test_throw_away_flagged_ants(tmpdir): else: assert ant in set(hdo.ant_1_array).union(set(hdo.ant_2_array)) + class Test_UVDataFromFastUVH5: """Test the uvdata_from_fastuvh5 function.""" @@ -1830,5 +1828,3 @@ def test_pass_metadata(self): uvd = io.uvdata_from_fastuvh5(self.meta_default, history='I made this!') assert uvd.history == 'I made this!' - - \ No newline at end of file diff --git a/hera_cal/tests/test_lstbin_simple.py b/hera_cal/tests/test_lstbin_simple.py index bfd60ef5b..42a1f4308 100644 --- a/hera_cal/tests/test_lstbin_simple.py +++ b/hera_cal/tests/test_lstbin_simple.py @@ -3,14 +3,13 @@ from . import mock_uvdata as mockuvd import pytest from pathlib import Path -from pyuvdata import UVCal -from itertools import combinations_with_replacement +from pyuvdata import UVCal, UVFlag import numpy as np from hera_cal import lstbin_simple as lstbin_simple import pytest import numpy as np from pyuvdata import UVCal, UVData -from .. import io, utils, lstbin_simple, noise +from .. 
import io, utils, lstbin_simple from hera_cal import apply_cal from pyuvdata import utils as uvutils from hera_cal.red_groups import RedundantGroups @@ -20,12 +19,15 @@ try: benchmark except NameError: - @pytest.fixture(scope='module') + + @pytest.fixture(scope="module") def benchmark(): def fnc(wrapped, *args, **kwargs): return wrapped(*args, **kwargs) + return fnc + class Test_LSTAlign: @classmethod def get_lst_align_data( @@ -33,48 +35,56 @@ def get_lst_align_data( ntimes: int = 10, nfreqs: int = 5, npols: int = 4, - nants: int = 4, + nants: int = 4, ndays: int = 1, creator: callable = mockuvd.create_uvd_identifiable, ): - uvds = mockuvd.make_dataset( ndays=ndays, nfiles=1, creator=creator, - pols=('xx', 'yy', 'xy', 'yx')[:npols], - freqs = np.linspace(100e6, 200e6, nfreqs), - ants = np.arange(nants), + pols=("xx", "yy", "xy", "yx")[:npols], + freqs=np.linspace(100e6, 200e6, nfreqs), + ants=np.arange(nants), ntimes=ntimes, time_axis_faster_than_bls=False, ) uvds = [uvd[0] for uvd in uvds] # flatten to single list - data = np.concatenate([uvd.data_array.reshape((ntimes, -1, nfreqs, npols)) for uvd in uvds], axis=0) + data = np.concatenate( + [uvd.data_array.reshape((ntimes, -1, nfreqs, npols)) for uvd in uvds], + axis=0, + ) freq_array = uvds[0].freq_array - data_lsts = np.concatenate([np.unique(uvds[0].lst_array),]*len(uvds)) + data_lsts = np.concatenate( + [ + np.unique(uvds[0].lst_array), + ] + * len(uvds) + ) antpairs = uvds[0].get_antpairs() antpos = uvds[0].antenna_positions # Ensure that each LST is in its own bin lsts = np.sort(np.unique(data_lsts)) - lst_bin_edges = np.concatenate(( - [lsts[0] - (lsts[1] - lsts[0])/2], - (lsts[1:] + lsts[:-1])/2, - [lsts[-1] + (lsts[-1] - lsts[-2])/2] - )) + lst_bin_edges = np.concatenate( + ( + [lsts[0] - (lsts[1] - lsts[0]) / 2], + (lsts[1:] + lsts[:-1]) / 2, + [lsts[-1] + (lsts[-1] - lsts[-2]) / 2], + ) + ) return dict( - data = data, - data_lsts = data_lsts, + data=data, + data_lsts=data_lsts, antpairs=antpairs, lst_bin_edges=lst_bin_edges, freq_array=freq_array, antpos=antpos, ) - def test_bad_inputs(self): # Test that we get the right errors for bad inputs kwargs = self.get_lst_align_data() @@ -82,12 +92,12 @@ def test_bad_inputs(self): def lst_align_with(**kw): return lstbin_simple.lst_align(**{**kwargs, **kw}) - data = np.ones(kwargs['data'].shape[:-1] + (5,)) + data = np.ones(kwargs["data"].shape[:-1] + (5,)) with pytest.raises(ValueError, match="data has more than 4 pols"): lst_align_with(data=data) with pytest.raises(ValueError, match="data should have shape"): - lst_align_with(freq_array=kwargs['freq_array'][:-1]) + lst_align_with(freq_array=kwargs["freq_array"][:-1]) with pytest.raises(ValueError, match="flags should have shape"): lst_align_with(flags=np.ones(13, dtype=bool)) @@ -97,37 +107,41 @@ def lst_align_with(**kw): lst_align_with(nsamples=np.ones(13)) # Use only one bin edge - with pytest.raises(ValueError, match="lst_bin_edges must have at least 2 elements"): + with pytest.raises( + ValueError, match="lst_bin_edges must have at least 2 elements" + ): lst_align_with(lst_bin_edges=np.ones(1)) # Try rephasing without freq_array or antpos - with pytest.raises(ValueError, match="freq_array and antpos is needed for rephase"): + with pytest.raises( + ValueError, match="freq_array and antpos is needed for rephase" + ): lst_align_with(rephase=True, antpos=None) def test_increasing_lsts_one_per_bin(self, benchmark): kwargs = self.get_lst_align_data(ntimes=6) - bins, d, f, n = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) 
+ bins, d, f, n, inp = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) # We should not be changing the data at all. d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d, kwargs['data']) + np.testing.assert_allclose(d, kwargs["data"]) assert not np.any(f) assert np.all(n == 1.0) assert len(bins) == 6 def test_multi_days_one_per_bin(self, benchmark): kwargs = self.get_lst_align_data(ndays=2, ntimes=7) - bins, d, f, n = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) + bins, d, f, n, inp = benchmark(lstbin_simple.lst_align, rephase=False, **kwargs) # We should not be changing the data at all. d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d[:, 0], kwargs['data'][:7]) + np.testing.assert_allclose(d[:, 0], kwargs["data"][:7]) assert not np.any(f) assert np.all(n == 1.0) assert len(bins) == 7 @@ -136,17 +150,19 @@ def test_multi_days_with_flagging(self, benchmark): kwargs = self.get_lst_align_data(ndays=2, ntimes=7) # Flag everything after the first day, and make the data there crazy. - flags = np.zeros_like(kwargs['data'], dtype=bool) + flags = np.zeros_like(kwargs["data"], dtype=bool) flags[7:] = True - kwargs['data'][7:] = 1000.0 + kwargs["data"][7:] = 1000.0 - bins, d, f, n = benchmark(lstbin_simple.lst_align, rephase=False, flags=flags, **kwargs) + bins, d, f, n, inp = benchmark( + lstbin_simple.lst_align, rephase=False, flags=flags, **kwargs + ) d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d[:, 0], kwargs['data'][:7]) + np.testing.assert_allclose(d[:, 0], kwargs["data"][:7]) assert not np.any(f[:, 0]) assert np.all(f[:, 1]) assert len(bins) == 7 @@ -155,71 +171,89 @@ def test_multi_days_with_nsamples_zero(self, benchmark): kwargs = benchmark(self.get_lst_align_data, ndays=2, ntimes=7) # Flag everything after the first day, and make the data there crazy. 
- nsamples = np.ones_like(kwargs['data'], dtype=float) + nsamples = np.ones_like(kwargs["data"], dtype=float) nsamples[7:] = 0.0 - kwargs['data'][7:] = 1000.0 + kwargs["data"][7:] = 1000.0 - bins, d, f, n = lstbin_simple.lst_align(rephase=False, nsamples=nsamples, **kwargs) + bins, d, f, n, inp = lstbin_simple.lst_align( + rephase=False, nsamples=nsamples, **kwargs + ) d = np.squeeze(np.asarray(d)) f = np.squeeze(np.asarray(f)) n = np.squeeze(np.asarray(n)) - np.testing.assert_allclose(d[:, 0], kwargs['data'][:7]) + np.testing.assert_allclose(d[:, 0], kwargs["data"][:7]) assert not np.any(f) assert np.all(n[:, 0] == 1.0) assert np.all(n[:, 1] == 0.0) assert len(bins) == 7 - def test_rephase(self, benchmark): """Test that rephasing where each bin is already at center does nothing.""" kwargs = self.get_lst_align_data(ntimes=7) - bins0, d0, f0, n0 = benchmark(lstbin_simple.lst_align, rephase=True, **kwargs) - bins, d, f, n = lstbin_simple.lst_align(rephase=False, **kwargs) + bins0, d0, f0, n0, inp = benchmark( + lstbin_simple.lst_align, rephase=True, **kwargs + ) + bins, d, f, n, inp = lstbin_simple.lst_align(rephase=False, **kwargs) np.testing.assert_allclose(d, d0, rtol=1e-6) np.testing.assert_allclose(f, f0, rtol=1e-6) np.testing.assert_allclose(n, n0, rtol=1e-6) assert len(bins) == len(bins0) def test_lstbinedges_modulus(self, benchmark): - kwargs = self.get_lst_align_data(ntimes=7) edges = kwargs.pop("lst_bin_edges") lst_bin_edges = edges.copy() - lst_bin_edges -= 4*np.pi + lst_bin_edges -= 4 * np.pi - bins, d0, f0, n0 = benchmark(lstbin_simple.lst_align, lst_bin_edges=lst_bin_edges, **kwargs) + bins, d0, f0, n0, inp = benchmark( + lstbin_simple.lst_align, lst_bin_edges=lst_bin_edges, **kwargs + ) lst_bin_edges = edges.copy() - lst_bin_edges += 4*np.pi + lst_bin_edges += 4 * np.pi - bins, d, f, n = lstbin_simple.lst_align(lst_bin_edges=lst_bin_edges, **kwargs) + bins, d, f, n, inp = lstbin_simple.lst_align( + lst_bin_edges=lst_bin_edges, **kwargs + ) np.testing.assert_allclose(d, d0) np.testing.assert_allclose(f, f0) np.testing.assert_allclose(n, n0) - with pytest.raises(ValueError, match="lst_bin_edges must be monotonically increasing."): + with pytest.raises( + ValueError, match="lst_bin_edges must be monotonically increasing." + ): lstbin_simple.lst_align(lst_bin_edges=lst_bin_edges[::-1], **kwargs) + def test_argparser_returns(): args = lstbin_simple.lst_bin_arg_parser() assert args is not None + class Test_ReduceLSTBins: @classmethod def get_input_data( - cls, - nfreqs: int=3, npols: int = 1, nbls: int = 6, ntimes: tuple[int] = (4, ) + cls, nfreqs: int = 3, npols: int = 1, nbls: int = 6, ntimes: tuple[int] = (4,) ): data = np.random.random((nbls, nfreqs, npols)) # Make len(ntimes) LST bins, each with ntimes[i] time-entries, all the same # data. 
- data = [np.array([data, ]*nt).reshape((nt,) + data.shape)*(i+1) for i, nt in enumerate(ntimes)] + data = [ + np.array( + [ + data, + ] + * nt + ).reshape((nt,) + data.shape) + * (i + 1) + for i, nt in enumerate(ntimes) + ] flags = [np.zeros(d.shape, dtype=bool) for d in data] nsamples = [np.ones(d.shape, dtype=float) for d in data] @@ -227,14 +261,19 @@ def get_input_data( def test_one_point_per_bin(self, benchmark): d, f, n = self.get_input_data(ntimes=(1,)) - dd, ff, std, nn = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) + rdc = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) - assert dd.shape == ff.shape == std.shape == nn.shape + assert ( + rdc["data"].shape + == rdc["flags"].shape + == rdc["std"].shape + == rdc["nsamples"].shape + ) # reduce_data swaps the order of bls/times - dd = dd.swapaxes(0, 1) - ff = ff.swapaxes(0, 1) - nn = nn.swapaxes(0, 1) + dd = rdc["data"].swapaxes(0, 1) + ff = rdc["flags"].swapaxes(0, 1) + nn = rdc["nsamples"].swapaxes(0, 1) np.testing.assert_allclose(dd[0], d[0][0]) assert not np.any(ff) @@ -243,24 +282,31 @@ def test_one_point_per_bin(self, benchmark): def test_zerosize_bin(self): d, f, n = self.get_input_data(ntimes=(0, 1)) print(d[0].shape, len(d)) - dd, ff, std, nn = lstbin_simple.reduce_lst_bins(d, f, n) + rdc = lstbin_simple.reduce_lst_bins(d, f, n, get_mad=True) - assert dd.shape[1] == 2 # 2 LST bins - assert np.all(np.isnan(dd[:, 0])) - assert np.all(ff[:, 0]) - assert np.all(nn[:, 0] == 0.0) + assert rdc["data"].shape[1] == 2 # 2 LST bins + assert np.all(np.isnan(rdc["data"][:, 0])) + assert np.all(rdc["flags"][:, 0]) + assert np.all(rdc["nsamples"][:, 0] == 0.0) + assert np.all(np.isinf(rdc["mad"][:, 0])) + assert np.all(np.isnan(rdc["median"][:, 0])) - @pytest.mark.parametrize("ntimes", [(4, ), (5, 4)]) + @pytest.mark.parametrize("ntimes", [(4,), (5, 4)]) def test_multi_points_per_bin(self, ntimes, benchmark): d, f, n = self.get_input_data(ntimes=ntimes) - dd, ff, std, nn = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) + rdc = benchmark(lstbin_simple.reduce_lst_bins, d, f, n) - assert dd.shape == ff.shape == std.shape == nn.shape + assert ( + rdc["data"].shape + == rdc["flags"].shape + == rdc["std"].shape + == rdc["nsamples"].shape + ) - # reduce_lst_bins swaps the order of bls/times - dd = dd.swapaxes(0, 1) - ff = ff.swapaxes(0, 1) - nn = nn.swapaxes(0, 1) + # reduce_data swaps the order of bls/times + dd = rdc["data"].swapaxes(0, 1) + ff = rdc["flags"].swapaxes(0, 1) + nn = rdc["nsamples"].swapaxes(0, 1) assert not np.any(ff) for lst in range(len(ntimes)): @@ -271,19 +317,31 @@ def test_multi_points_per_bin_flagged(self): d, f, n = self.get_input_data(ntimes=(4,)) f[0][2:] = True d[0][2:] = 1000.0 - dd, ff, std, nn = lstbin_simple.reduce_lst_bins(d,f,n) + rdc = lstbin_simple.reduce_lst_bins(d, f, n) - assert dd.shape == ff.shape == std.shape == nn.shape + assert ( + rdc["data"].shape + == rdc["flags"].shape + == rdc["std"].shape + == rdc["nsamples"].shape + ) # reduce_data swaps the order of bls/times - dd = dd.swapaxes(0, 1) - ff = ff.swapaxes(0, 1) - nn = nn.swapaxes(0, 1) + dd = rdc["data"].swapaxes(0, 1) + ff = rdc["flags"].swapaxes(0, 1) + nn = rdc["nsamples"].swapaxes(0, 1) np.testing.assert_allclose(dd[0], d[0][0]) assert not np.any(ff) np.testing.assert_allclose(nn, 2.0) + def test_get_med_mad(self): + d, f, n = self.get_input_data(ntimes=(4,)) + rdc = lstbin_simple.reduce_lst_bins(d, f, n, get_mad=True) + + assert np.all(rdc["median"] == rdc["data"]) + + def test_apply_calfile_rules(tmpdir_factory): direc = 
tmpdir_factory.mktemp("test_apply_calfile_rules") @@ -297,8 +355,8 @@ def test_apply_calfile_rules(tmpdir_factory): data_files, calfiles = lstbin_simple.apply_calfile_rules( [[str(d) for d in datas]], - calfile_rules = [('.uvh5', '.calfile')], - ignore_missing=False + calfile_rules=[(".uvh5", ".calfile")], + ignore_missing=False, ) assert len(data_files[0]) == 3 assert len(calfiles[0]) == 3 @@ -307,36 +365,35 @@ def test_apply_calfile_rules(tmpdir_factory): with pytest.raises(IOError, match="does not exist"): lstbin_simple.apply_calfile_rules( [[str(d) for d in datas]], - calfile_rules = [('.uvh5', '.calfile')], - ignore_missing=False + calfile_rules=[(".uvh5", ".calfile")], + ignore_missing=False, ) data_files, calfiles = lstbin_simple.apply_calfile_rules( [[str(d) for d in datas]], - calfile_rules = [('.uvh5', '.calfile')], - ignore_missing=True + calfile_rules=[(".uvh5", ".calfile")], + ignore_missing=True, ) assert len(data_files[0]) == 2 assert len(calfiles[0]) == 2 class Test_LSTAverage: - def test_sigma_clip_without_outliers(self, benchmark): - shape = (7,8,9) + shape = (7, 8, 9) np.random.seed(42) - data = np.random.normal(size=shape) + np.random.normal(size=shape)*1j + data = np.random.normal(size=shape) + np.random.normal(size=shape) * 1j nsamples = np.ones_like(data) flags = np.zeros_like(data, dtype=bool) - data_n, flg_n, std_n, norm_n =lstbin_simple.lst_average( + data_n, flg_n, std_n, norm_n, daysbinned = lstbin_simple.lst_average( data=data, nsamples=nsamples, flags=flags, sigma_clip_thresh=None, ) - data, flg, std, norm = benchmark( + data, flg, std, norm, daysbinned = benchmark( lstbin_simple.lst_average, data=data, nsamples=nsamples, @@ -345,18 +402,20 @@ def test_sigma_clip_without_outliers(self, benchmark): ) assert data.shape == flg.shape == std.shape == norm.shape == nsamples.shape[1:] - np.testing.assert_allclose(data,data_n) + np.testing.assert_allclose(data, data_n) def test_average_repeated(self): - shape = (7,8,9) - _data = np.random.random(shape) + np.random.random(shape)*1j + shape = (7, 8, 9) + _data = np.random.random(shape) + np.random.random(shape) * 1j data = np.array([_data, _data, _data]) nsamples = np.ones_like(data) flags = np.zeros_like(data, dtype=bool) - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( + data=data, + nsamples=nsamples, + flags=flags, ) assert np.allclose(data_n, _data) @@ -366,8 +425,10 @@ def test_average_repeated(self): # Now flag the last "night" flags[-1] = True - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( + data=data, + nsamples=nsamples, + flags=flags, ) assert np.allclose(data_n, _data) @@ -376,29 +437,34 @@ def test_average_repeated(self): assert np.allclose(norm_n, 2.0) def test_std_simple(self): - shape = (5000, 2) # 1000 nights, doesn't matter what the other axis is. + shape = (5000, 1, 2, 2) # 1000 nights, doesn't matter what the other axis is. 
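For orientation, the call signature exercised throughout this class (a sketch with arbitrary array shapes, not part of the diff): lst_average now unpacks into five arrays, the last of which, going by the names used in these tests, counts the nights binned into each output pixel.

import numpy as np
from hera_cal import lstbin_simple

shape = (4, 2, 3, 1)   # (nights, bls, freqs, pols), arbitrary small example
data = np.random.normal(size=shape) + 1j * np.random.normal(size=shape)
nsamples = np.ones(shape, dtype=float)
flags = np.zeros(shape, dtype=bool)

# The fifth return value (days binned) is new in this PR.
mean, flg, std, nsamp, days_binned = lstbin_simple.lst_average(
    data=data, nsamples=nsamples, flags=flags
)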
std = 2.0 - data = np.random.normal(scale=std, size=shape) + np.random.normal(scale=std, size=shape)*1j + data = ( + np.random.normal(scale=std, size=shape) + + np.random.normal(scale=std, size=shape) * 1j + ) nsamples = np.ones_like(data, dtype=float) flags = np.zeros_like(data, dtype=bool) - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( + data=data, + nsamples=nsamples, + flags=flags, ) # Check the averaged data is within 6 sigma of the population mean - np.testing.assert_allclose(data_n, 0.0, atol=std*6/np.sqrt(shape[0])) + np.testing.assert_allclose(data_n, 0.0, atol=std * 6 / np.sqrt(shape[0])) # Check the standard deviation is within 20% of the true value - np.testing.assert_allclose(std_n, std + std*1j, rtol=0.2) + np.testing.assert_allclose(std_n, std + std * 1j, rtol=0.2) assert not np.any(flg_n) @pytest.mark.parametrize("nsamples", ("ones", "random")) @pytest.mark.parametrize("flags", ("zeros", "random")) def test_std(self, nsamples, flags): - shape = (5000, 2) # 1000 nights, doesn't matter what the other axis is. + shape = (5000, 1, 2, 2) # 1000 nights, doesn't matter what the other axis is. std = 2.0 if nsamples == "ones": @@ -413,16 +479,18 @@ def test_std(self, nsamples, flags): else: flags = np.random.random(shape) > 0.1 - data = np.random.normal(scale=std) + np.random.normal(scale=std)*1j + data = np.random.normal(scale=std) + np.random.normal(scale=std) * 1j flags = np.zeros(data.shape, dtype=bool) - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( - data=data, nsamples=nsamples, flags=flags, + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( + data=data, + nsamples=nsamples, + flags=flags, ) # Check the averaged data is within 6 sigma of the population mean - assert np.allclose(data_n, 0.0, atol=std*6/np.sqrt(shape[0])) + assert np.allclose(data_n, 0.0, atol=std * 6 / np.sqrt(shape[0])) # In reality the std is infinity where flags is True std[flags] = np.inf @@ -430,103 +498,222 @@ def test_std(self, nsamples, flags): sample_var_expectation = sve = w * (shape[0] - 1) # Check the standard deviation is within 20% of the true value - np.testing.assert_allclose(std_n, np.sqrt(sve) + np.sqrt(sve)*1j, rtol=0.2) + np.testing.assert_allclose(std_n, np.sqrt(sve) + np.sqrt(sve) * 1j, rtol=0.2) assert not np.any(flg_n) + def test_inpaint_mode(self): + shape = (3, 2, 4, 1) # nights, bls, freqs, pols + _data = np.random.random(shape) + np.random.random(shape) * 1j + nsamples = np.ones(_data.shape, dtype=float) + flags = np.zeros(_data.shape, dtype=bool) + + # First test -- no flags should mean inpainted_mode does nothing. 
+ df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False + ) + + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True + ) + + assert np.allclose(df, di) + assert np.allclose(ff, fi) + np.testing.assert_allclose(stdf, stdi) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + + # Now test with a whole LST bin flagged for a single bl-chan-pol (but inpainted) + flags[:, 0, 0, 0] = True + df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False + ) + + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True + ) + + # The data, flags and std in the fully-flagged bin should be different, but + # Nsamples and Flags should be the same. + assert not np.allclose(df[0, 0, 0], di[0, 0, 0]) + assert np.allclose(df[1:], di[1:]) + assert not np.allclose(ff, fi) + assert not np.allclose(stdf[0, 0, 0], stdi[0, 0, 0]) + assert np.allclose(stdf[1:], stdi[1:]) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + + # Now test with a whole spectrum flagged for one night + flags[:] = False + flags[0, 0, :, 0] = True + df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=False + ) + + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_data, nsamples=nsamples, flags=flags, inpainted_mode=True + ) + + # This should give exactly the same results either way, because the full-flagged + # blt is considered to NOT be inpainted by default. + assert np.allclose(df, di) + assert np.allclose(ff, fi) + np.testing.assert_allclose(stdf, stdi) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + + # However, if we had explicitly told the routine that the blt was inpainted, + # we'd get a different result... + _d, _f = lstbin_simple.get_masked_data( + _data, nsamples, flags, inpainted=np.ones_like(flags), inpainted_mode=False + ) + df, ff, stdf, nf, dbf = lstbin_simple.lst_average( + data=_d, + nsamples=nsamples, + flags=_f, + inpainted_mode=False, + ) + + _d, _f = lstbin_simple.get_masked_data( + _data, nsamples, flags, inpainted=np.ones_like(flags), inpainted_mode=True + ) + di, fi, stdi, ni, dbi = lstbin_simple.lst_average( + data=_d, + nsamples=nsamples, + flags=_f, + inpainted_mode=True, + ) + + # The LST-binned data will be different for blt=0, pol=0: + assert not np.allclose(df[0, :, 0], di[0, :, 0]) + assert np.allclose(df[1:], di[1:]) + assert np.allclose(ff, fi) + assert not np.allclose(stdf[0, :, 0], stdi[0, :, 0]) + assert np.allclose(stdf[1:], stdi[1:]) + assert np.allclose(nf, ni) + assert np.allclose(dbf, dbi) + def test_flag_below_min_N(self): - shape = (7,8,9) - _data = np.random.random(shape) + np.random.random(shape)*1j + shape = (7, 8, 9, 2) + _data = np.random.random(shape) + np.random.random(shape) * 1j nsamples = np.ones(_data.shape, dtype=float) flags = np.zeros(_data.shape, dtype=bool) # No samples have more than min_N, so they should all be flagged. 
- data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=8, - flag_below_min_N=True + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( + data=_data, + nsamples=nsamples, + flags=flags, + sigma_clip_min_N=8, + flag_below_min_N=True, ) assert np.all(flg_n) - assert np.all(norm_n==0) - assert np.all(np.isinf(std_n)) - assert np.all(np.isnan(data_n)) + assert np.all(norm_n == 7) # Even though they're flagged, we track the nsamples + assert not np.all(np.isinf(std_n)) + assert not np.all(np.isnan(data_n)) # this time, there's enough samples, but too many are flagged... flags[:5] = True - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=5, - flag_below_min_N=True + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( + data=_data, + nsamples=nsamples, + flags=flags, + sigma_clip_min_N=5, + flag_below_min_N=True, ) assert np.all(flg_n) - assert np.all(norm_n==0) + # nsamples is zero because all are flagged. + assert np.all(norm_n == 0) assert np.all(np.isinf(std_n)) - assert np.all(np.isnan(data_n)) # this time, only one column is flagged too much... - # this time, there's enough samples, but too many are flagged... flags[:] = False flags[:5, 0] = True - data_n, flg_n, std_n, norm_n = lstbin_simple.lst_average( - data=_data, nsamples=nsamples, flags=flags, sigma_clip_min_N=5, - flag_below_min_N=True + data_n, flg_n, std_n, norm_n, db = lstbin_simple.lst_average( + data=_data, + nsamples=nsamples, + flags=flags, + sigma_clip_min_N=5, + flag_below_min_N=True, ) assert np.all(flg_n[0]) - assert np.all(norm_n[0]==0) + assert np.all(norm_n[0] == 0) assert np.all(np.isinf(std_n[0])) - assert np.all(np.isnan(data_n[0])) assert not np.any(flg_n[1:]) - assert np.all(norm_n[1:]>0) + assert np.all(norm_n[1:] == 7) assert not np.any(np.isinf(std_n[1:])) assert not np.any(np.isnan(data_n[1:])) - + + def test_sigma_clip_and_inpainted_warning(self): + """Test that a warning is raised if doing inpainted_mode as well as sigma-clip.""" + shape = (7, 8, 9, 2) + _data = np.random.random(shape) + np.random.random(shape) * 1j + nsamples = np.ones(_data.shape, dtype=float) + flags = np.zeros(_data.shape, dtype=bool) + + with pytest.warns(UserWarning, match="Sigma-clipping in in-painted mode"): + lstbin_simple.lst_average( + data=_data, + nsamples=nsamples, + flags=flags, + sigma_clip_thresh=2.0, + inpainted_mode=True, + ) + def create_small_array_uvd(identifiable: bool = False, **kwargs): kwargs.update( freqs=np.linspace(150e6, 160e6, 100), - ants=[0,1,2,127,128], - antpairs=[(0,0), (0,1), (0,2), (1, 1), (1,2), (2, 2)], - pols=('xx', 'yy') + ants=[0, 1, 2, 127, 128], + antpairs=[(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)], + pols=("xx", "yy"), ) if identifiable: return mockuvd.create_uvd_identifiable(**kwargs) else: return mockuvd.create_uvd_ones(**kwargs) + @pytest.fixture(scope="function") def uvd(): return create_small_array_uvd() + @pytest.fixture(scope="function") def uvd_redavg(): return create_small_array_uvd(redundantly_averaged=True) + @pytest.fixture(scope="function") def uvc(uvd): return UVCal.initialize_from_uvdata( uvd, - cal_style = "redundant", - gain_convention = "multiply", - jones_array = "linear", - cal_type = "gain", + cal_style="redundant", + gain_convention="multiply", + jones_array="linear", + cal_type="gain", metadata_only=False, ) + @pytest.fixture(scope="function") def uvd_file(uvd, tmpdir_factory) -> Path: # Write to file, so we 
can run lst_bin_files tmp = Path(tmpdir_factory.mktemp("test_partial_times")) - mock = tmp / 'mock.uvh5' + mock = tmp / "mock.uvh5" uvd.write_uvh5(str(mock), clobber=True) return mock + @pytest.fixture(scope="function") def uvd_redavg_file(uvd_redavg, tmpdir_factory) -> Path: # Write to file, so we can run lst_bin_files tmp = Path(tmpdir_factory.mktemp("test_partial_times")) - mock = tmp / 'mock.uvh5' + mock = tmp / "mock.uvh5" uvd_redavg.write_uvh5(str(mock), clobber=True) return mock @@ -535,29 +722,30 @@ def uvd_redavg_file(uvd_redavg, tmpdir_factory) -> Path: def uvc_file(uvc, uvd_file: Path) -> Path: # Write to file, so we can run lst_bin_files tmp = uvd_file.parent - fl = f'{tmp}/mock.calfits' + fl = f"{tmp}/mock.calfits" uvc.write_calfits(str(fl), clobber=True) return fl + class Test_LSTBinFilesForBaselines: def test_defaults(self, uvd, uvd_file): - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], - lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()+0.01], + lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max() + 0.01], antpairs=uvd.get_antpairs(), - rephase=False + rephase=False, ) - lstbins, d, f, n, times = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d, f, n, inpflg, times = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], - lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()+0.01], + lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max() + 0.01], antpairs=uvd.get_antpairs(), freqs=uvd.freq_array, pols=uvd.polarization_array, - time_idx = [np.ones(uvd.Ntimes, dtype=bool)], + time_idx=[np.ones(uvd.Ntimes, dtype=bool)], time_arrays=[np.unique(uvd.time_array)], - lsts = np.unique(uvd.lst_array), - rephase=False + lsts=np.unique(uvd.lst_array), + rephase=False, ) np.testing.assert_allclose(d0, d) @@ -566,32 +754,41 @@ def test_defaults(self, uvd, uvd_file): np.testing.assert_allclose(times0, times) def test_empty(self, uvd, uvd_file): - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], antpairs=[(127, 128)], - rephase=True + rephase=True, ) assert np.all(f0) def test_extra(self, uvd, uvd_file): # Providing baselines that don't exist in the file is fine, they're just ignored. - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], antpairs=uvd.get_antpairs() + [(127, 128)], - rephase=True + rephase=True, ) - assert np.all(f0[0][:, -1]) # last baseline is the extra one that's all flagged. + assert np.all( + f0[0][:, -1] + ) # last baseline is the extra one that's all flagged. 
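A minimal call sketch mirroring test_defaults above (the output filename and the small antenna set are arbitrary choices, and mock_uvdata is the test helper from this PR); note the return now unpacks six elements, with the new fifth one holding the per-file where-inpainted flags.

import numpy as np
from hera_cal import lstbin_simple
from hera_cal.tests import mock_uvdata as mockuvd

uvd = mockuvd.create_uvd_identifiable(ntimes=2, ants=np.arange(4), pols=["xx"])
uvd.write_uvh5("demo.uvh5", clobber=True)   # arbitrary scratch filename

bins, data, flags, nsamples, inpflg, times = lstbin_simple.lst_bin_files_for_baselines(
    data_files=["demo.uvh5"],
    lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max() + 0.01],
    antpairs=uvd.get_antpairs(),
    rephase=False,
)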
def test_freqrange(self, uvd, uvd_file, uvc_file): """Test that providing freq_range works.""" - bins, data, flags, nsamples, times = lstbin_simple.lst_bin_files_for_baselines( + ( + bins, + data, + flags, + nsamples, + inpflg, + times, + ) = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], - cal_files = [uvc_file], + cal_files=[uvc_file], freq_min=153e6, freq_max=158e6, antpairs=uvd.get_antpairs(), @@ -600,7 +797,7 @@ def test_freqrange(self, uvd, uvd_file, uvc_file): assert data[0].shape[-2] < uvd.Nfreqs def test_bad_pols(self, uvd, uvd_file): - with pytest.raises(KeyError, match='7'): + with pytest.raises(KeyError, match="7"): lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], @@ -609,7 +806,9 @@ def test_bad_pols(self, uvd, uvd_file): ) def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): - with pytest.raises(ValueError, match='reds must be provided if redundantly_averaged is True'): + with pytest.raises( + ValueError, match="reds must be provided if redundantly_averaged is True" + ): lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], @@ -617,7 +816,9 @@ def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): antpairs=uvd.get_antpairs(), ) - with pytest.raises(ValueError, match="Cannot apply calibration if redundantly_averaged is True"): + with pytest.raises( + ValueError, match="Cannot apply calibration if redundantly_averaged is True" + ): lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_file], lst_bin_edges=[uvd.lst_array.min(), uvd.lst_array.max()], @@ -625,15 +826,20 @@ def test_incorrect_red_input(self, uvd, uvd_file, uvc_file): redundantly_averaged=True, reds=RedundantGroups.from_antpos( dict(zip(uvd.antenna_numbers, uvd.antenna_positions)), - pols=uvutils.polnum2str(uvd.polarization_array, x_orientation=uvd.x_orientation), + pols=uvutils.polnum2str( + uvd.polarization_array, x_orientation=uvd.x_orientation + ), ), antpairs=uvd.get_antpairs(), ) def test_simple_redundant_averaged_file(self, uvd_redavg, uvd_redavg_file): - lstbins, d0, f0, n0, times0 = lstbin_simple.lst_bin_files_for_baselines( + lstbins, d0, f0, n0, inpflg, times0 = lstbin_simple.lst_bin_files_for_baselines( data_files=[uvd_redavg_file], - lst_bin_edges=[uvd_redavg.lst_array.min()-0.1, uvd_redavg.lst_array.max()+0.1], + lst_bin_edges=[ + uvd_redavg.lst_array.min() - 0.1, + uvd_redavg.lst_array.max() + 0.1, + ], redundantly_averaged=True, rephase=False, antpairs=uvd_redavg.get_antpairs(), @@ -643,7 +849,90 @@ def test_simple_redundant_averaged_file(self, uvd_redavg, uvd_redavg_file): ) assert len(d0) == 1 - assert d0[0].shape == (uvd_redavg.Ntimes, uvd_redavg.Nbls, uvd_redavg.Nfreqs, uvd_redavg.Npols) + assert d0[0].shape == ( + uvd_redavg.Ntimes, + uvd_redavg.Nbls, + uvd_redavg.Nfreqs, + uvd_redavg.Npols, + ) + + def test_redavg_with_where_inpainted(self, tmp_path): + uvds = mockuvd.make_dataset( + ndays=2, + nfiles=3, + ntimes=2, + ants=np.arange(7), + creator=mockuvd.create_uvd_identifiable, + freqs=mockuvd.PHASEII_FREQS[:25], + pols=['xx', 'xy'], + redundantly_averaged=True, + ) + + uvd_files = mockuvd.write_files_in_hera_format( + uvds, tmp_path, add_where_inpainted_files=True + ) + + ap = uvds[0][0].get_antpairs() + reds = RedundantGroups.from_antpos( + dict(zip(uvds[0][0].antenna_numbers, uvds[0][0].antenna_positions)), + ) + lstbins, d0, f0, n0, inpflg, times0 = 
lstbin_simple.lst_bin_files_for_baselines( + data_files=sum(uvd_files, []), # flatten the list-of-lists + lst_bin_edges=[0, 1.9 * np.pi], + redundantly_averaged=True, + rephase=False, + antpairs=ap, + reds=reds, + where_inpainted_files=[str(Path(f).with_suffix(".where_inpainted.h5")) for f in sum(uvd_files, [])], + ) + assert len(lstbins) == 1 + + # Also test that if a where_inpainted file has missing baselines, an error is + # raised. + # This is kind of a dodgy way to test it: copy the original data files, + # write a whole new dataset in the same place but with fewer baselines, then + # copy the data files (but not the where_inpainted files) back, so they mismatch. + for flist in uvd_files: + for fl in flist: + fl = Path(fl) + fl.rename(fl.parent / f"{fl.with_suffix('.bk')}") + + winp = fl.with_suffix(".where_inpainted.h5") + winp.unlink() + + uvds = mockuvd.make_dataset( + ndays=2, + nfiles=3, + ntimes=2, + ants=np.arange(5), # less than the original + creator=mockuvd.create_uvd_identifiable, + freqs=mockuvd.PHASEII_FREQS[:25], + pols=['xx', 'xy'], + redundantly_averaged=True, + ) + + uvd_files = mockuvd.write_files_in_hera_format( + uvds, tmp_path, add_where_inpainted_files=True + ) + + # Move back the originals. + for flist in uvd_files: + for fl in flist: + fl = Path(fl) + fl.unlink() + (fl.parent / f"{fl.with_suffix('.bk')}").rename(fl) + + with pytest.raises(ValueError, match="Could not find any baseline from group"): + lstbin_simple.lst_bin_files_for_baselines( + data_files=sum(uvd_files, []), # flatten the list-of-lists + lst_bin_edges=[0, 1.9 * np.pi], + redundantly_averaged=True, + rephase=False, + antpairs=ap, + reds=reds, + where_inpainted_files=[str(Path(f).with_suffix(".where_inpainted.h5")) for f in sum(uvd_files, [])], + ) + def test_make_lst_grid(): lst_grid = lstbin_simple.make_lst_grid(0.01, begin_lst=None) @@ -656,10 +945,13 @@ def test_make_lst_grid(): assert len(lst_grid) == 628 assert np.isclose(lst_grid[0], 3.1365901175171982) + class Test_GetAllUnflaggedBaselines: @pytest.mark.parametrize("redundantly_averaged", [True, False]) @pytest.mark.parametrize("only_last_file_per_night", [True, False]) - def test_get_all_baselines(self, tmp_path_factory, redundantly_averaged, only_last_file_per_night): + def test_get_all_baselines( + self, tmp_path_factory, redundantly_averaged, only_last_file_per_night + ): tmp = tmp_path_factory.mktemp("get_all_baselines") uvds = mockuvd.make_dataset( ndays=3, @@ -691,13 +983,19 @@ def test_bad_inputs(self, tmp_path_factory): ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) - with pytest.raises(ValueError, match='Cannot ignore antennas if the files are redundantly averaged'): + with pytest.raises( + ValueError, + match="Cannot ignore antennas if the files are redundantly averaged", + ): lstbin_simple.get_all_unflagged_baselines(data_files, ignore_ants=[0, 1]) - with pytest.raises(ValueError, match='Cannot exclude antennas if the files are redundantly averaged'): + with pytest.raises( + ValueError, + match="Cannot exclude antennas if the files are redundantly averaged", + ): lstbin_simple.get_all_unflagged_baselines( data_files, - ex_ant_yaml_files=['non-existent-file.yaml'], + ex_ant_yaml_files=["non-existent-file.yaml"], ) uvds_different_xorient = mockuvd.make_dataset( @@ -706,47 +1004,58 @@ def test_bad_inputs(self, tmp_path_factory): ntimes=2, ants=np.arange(10), creator=create_small_array_uvd, - x_orientation='east', + x_orientation="east", redundantly_averaged=True, ) - data_files = 
mockuvd.write_files_in_hera_format(uvds + uvds_different_xorient, tmp) + data_files = mockuvd.write_files_in_hera_format( + uvds + uvds_different_xorient, tmp + ) - with pytest.raises(ValueError, match='Not all files have the same xorientation!'): + with pytest.raises( + ValueError, match="Not all files have the same xorientation!" + ): lstbin_simple.get_all_unflagged_baselines(data_files) + class Test_LSTBinFiles: def test_golden_data(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("lstbin_golden_data") uvds = mockuvd.make_dataset( - ndays=3, nfiles=4, ntimes=2, identifiable=True, - creator=create_small_array_uvd, time_axis_faster_than_bls=True + ndays=3, + nfiles=4, + ntimes=2, + identifiable=True, + creator=create_small_array_uvd, + time_axis_faster_than_bls=True, ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) print(len(uvds)) cfl = tmp / "lstbin_config_file.yaml" print(cfl) lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, + cfl, + data_files, + ntimes_per_file=2, ) out_files = lstbin_simple.lst_bin_files( - config_file=cfl, rephase=False, - golden_lsts=uvds[0][1].lst_array.min() + 0.0001 + config_file=cfl, + rephase=False, + golden_lsts=uvds[0][1].lst_array.min() + 0.0001, ) assert len(out_files) == 4 - assert out_files[1]['GOLDEN'] + assert out_files[1]["GOLDEN"] assert not out_files[0]["GOLDEN"] assert not out_files[2]["GOLDEN"] assert not out_files[3]["GOLDEN"] uvd = UVData() - uvd.read(out_files[1]['GOLDEN']) - + uvd.read(out_files[1]["GOLDEN"]) # Read the Golden File - golden_hd = io.HERAData(out_files[1]['GOLDEN']) + golden_hd = io.HERAData(out_files[1]["GOLDEN"]) gd, gf, gn = golden_hd.read() assert gd.shape[0] == 3 # ndays @@ -757,8 +1066,8 @@ def test_golden_data(self, tmp_path_factory): assert len(gd.keys()) == 12 # Check that autos are all the same - assert np.all(gd[(0,0,'ee')] == gd[(1, 1,'ee')]) - assert np.all(gd[(0,0,'ee')] == gd[(2, 2,'ee')]) + assert np.all(gd[(0, 0, "ee")] == gd[(1, 1, "ee")]) + assert np.all(gd[(0, 0, "ee")] == gd[(2, 2, "ee")]) # Since each day is at exactly the same LST by construction, the golden data # over time should be the same. 
@@ -768,19 +1077,26 @@ def test_golden_data(self, tmp_path_factory): for day in data: np.testing.assert_allclose(data[0], day, atol=1e-6) - assert not np.allclose(gd[(0, 1, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 1, 'ee')][0]) - + assert not np.allclose(gd[(0, 1, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 1, "ee")][0]) def test_save_chans(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("lstbin_golden_data") - uvds = mockuvd.make_dataset(ndays=3, nfiles=4, ntimes=2, identifiable=True, creator=create_small_array_uvd) + uvds = mockuvd.make_dataset( + ndays=3, + nfiles=4, + ntimes=2, + identifiable=True, + creator=create_small_array_uvd, + ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cfl = tmp / "lstbin_config_file.yaml" lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, + cfl, + data_files, + ntimes_per_file=2, ) out_files = lstbin_simple.lst_bin_files( @@ -790,10 +1106,10 @@ def test_save_chans(self, tmp_path_factory): assert len(out_files) == 4 # Ensure there's a REDUCEDCHAN file for each output LST for fl in out_files: - assert fl['REDUCEDCHAN'] + assert fl["REDUCEDCHAN"] # Read the Golden File - hd = io.HERAData(fl['REDUCEDCHAN']) + hd = io.HERAData(fl["REDUCEDCHAN"]) gd, gf, gn = hd.read() assert gd.shape[0] == 3 # ndays @@ -804,8 +1120,8 @@ def test_save_chans(self, tmp_path_factory): assert len(gd.keys()) == 12 # Check that autos are all the same - assert np.all(gd[(0,0,'ee')] == gd[(1, 1,'ee')]) - assert np.all(gd[(0,0,'ee')] == gd[(2, 2,'ee')]) + assert np.all(gd[(0, 0, "ee")] == gd[(1, 1, "ee")]) + assert np.all(gd[(0, 0, "ee")] == gd[(2, 2, "ee")]) # Since each day is at exactly the same LST by construction, the golden data # over time should be the same. 
@@ -813,81 +1129,104 @@ def test_save_chans(self, tmp_path_factory): for day in data: np.testing.assert_allclose(data[0], day, rtol=1e-6) - assert not np.allclose(gd[(0, 1, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 2, 'ee')][0]) - assert not np.allclose(gd[(1, 2, 'ee')][0], gd[(0, 1, 'ee')][0]) + assert not np.allclose(gd[(0, 1, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 2, "ee")][0]) + assert not np.allclose(gd[(1, 2, "ee")][0], gd[(0, 1, "ee")][0]) def test_baseline_chunking(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("baseline_chunking") uvds = mockuvd.make_dataset( - ndays=3, nfiles=4, ntimes=2, + ndays=3, + nfiles=4, + ntimes=2, creator=mockuvd.create_uvd_identifiable, - antpairs = [(i,j) for i in range(10) for j in range(i, 10)], # 55 antpairs - pols = ['xx', 'yy'], + antpairs=[(i, j) for i in range(10) for j in range(i, 10)], # 55 antpairs + pols=["xx", "yy"], freqs=np.linspace(140e6, 180e6, 12), ) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cfl = tmp / "lstbin_config_file.yaml" config_info = lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, + cfl, + data_files, + ntimes_per_file=2, ) out_files = lstbin_simple.lst_bin_files( - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.uvh5", + write_med_mad=True, ) out_files_chunked = lstbin_simple.lst_bin_files( - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.chunked.uvh5", + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.chunked.uvh5", Nbls_to_load=10, + write_med_mad=True, ) for flset, flsetc in zip(out_files, out_files_chunked): - assert flset['LST'] != flsetc['LST'] + assert flset[("LST", False)] != flsetc[("LST", False)] uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[("LST", False)]) uvdlstc = UVData() - uvdlstc.read(flsetc['LST']) + uvdlstc.read(flsetc[("LST", False)]) assert uvdlst == uvdlstc - def test_compare_nontrivial_cal( - self, tmp_path_factory - ): + assert flset[("MED", False)] != flsetc[("MED", False)] + uvdlst = UVData() + uvdlst.read(flset[("MED", False)]) + + uvdlstc = UVData() + uvdlstc.read(flsetc[("MED", False)]) + + assert uvdlst == uvdlstc + + def test_compare_nontrivial_cal(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("nontrivial_cal") uvds = mockuvd.make_dataset( - ndays=3, nfiles=4, ntimes=2, + ndays=3, + nfiles=4, + ntimes=2, creator=mockuvd.create_uvd_identifiable, - antpairs = [(i,j) for i in range(7) for j in range(i, 7)], # 55 antpairs - pols = ('xx', 'yy'), + antpairs=[(i, j) for i in range(7) for j in range(i, 7)], # 55 antpairs + pols=("xx", "yy"), freqs=np.linspace(140e6, 180e6, 3), ) - uvcs = [ - [mockuvd.make_uvc_identifiable(d) for d in uvd ] for uvd in uvds - ] + uvcs = [[mockuvd.make_uvc_identifiable(d) for d in uvd] for uvd in uvds] for night in uvds: print([np.unique(night[0].lst_array)]) data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cal_files = mockuvd.write_cals_in_hera_format(uvcs, tmp) - decal_files = [[df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files] + decal_files = [ + [df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files + ] for flist, clist, ulist in zip(data_files, cal_files, decal_files): for df, cf, uf in zip(flist, clist, ulist): apply_cal.apply_cal( - df, uf, cf, - gain_convention='divide', # go the wrong way + df, + uf, + cf, + gain_convention="divide", # go the wrong way 
clobber=True, ) # First, let's go the other way to check if we get the same thing back directly - recaled_files = [[df.replace(".uvh5", ".recal.uvh5") for df in dfl] for dfl in data_files] + recaled_files = [ + [df.replace(".uvh5", ".recal.uvh5") for df in dfl] for dfl in data_files + ] for flist, clist, ulist in zip(recaled_files, cal_files, decal_files): for df, cf, uf in zip(flist, clist, ulist): apply_cal.apply_cal( - uf, df, cf, - gain_convention='multiply', # go the wrong way + uf, + df, + cf, + gain_convention="multiply", # go the wrong way clobber=True, ) @@ -902,32 +1241,39 @@ def test_compare_nontrivial_cal( cfl = tmp / "lstbin_config_file.yaml" lstbin_simple.make_lst_bin_config_file( - cfl, decal_files, ntimes_per_file=2, + cfl, + decal_files, + ntimes_per_file=2, ) out_files_recal = lstbin_simple.lst_bin_files( - config_file=cfl, calfile_rules=[(".decal.uvh5", ".calfits")], + config_file=cfl, + calfile_rules=[(".decal.uvh5", ".calfits")], fname_format="zen.{kind}.{lst:7.5f}.recal.uvh5", Nbls_to_load=10, - rephase=False + rephase=False, ) lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, clobber=True, + cfl, + data_files, + ntimes_per_file=2, + clobber=True, ) out_files = lstbin_simple.lst_bin_files( - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.uvh5", Nbls_to_load=11, - rephase=False + rephase=False, ) for flset, flsetc in zip(out_files, out_files_recal): - assert flset['LST'] != flsetc['LST'] + assert flset[("LST", False)] != flsetc[("LST", False)] uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[("LST", False)]) uvdlstc = UVData() - uvdlstc.read(flsetc['LST']) + uvdlstc.read(flsetc[("LST", False)]) print(np.unique(uvdlstc.lst_array)) expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False) @@ -940,9 +1286,15 @@ def test_compare_nontrivial_cal( # when we put in flags, we end up setting the data to nan (and # never using it...) 
np.testing.assert_allclose( - np.where(uvdlst.get_flags(ap+(pol,)), 1.0, uvdlstc.get_data(ap+(pol,))), - np.where(uvdlst.get_flags(ap+(pol,)), 1.0, expected[i, :, :, j]), - rtol=1e-4 + np.where( + uvdlst.get_flags(ap + (pol,)), + 1.0, + uvdlstc.get_data(ap + (pol,)), + ), + np.where( + uvdlst.get_flags(ap + (pol,)), 1.0, expected[i, :, :, j] + ), + rtol=1e-4, ) # Don't worry about history here, because we know they use different inputs @@ -952,65 +1304,82 @@ def test_compare_nontrivial_cal( @pytest.mark.parametrize( "random_ants_to_drop, rephase, sigma_clip_thresh, flag_strategy, pols, freq_range", [ - (0, True, 0.0, (0,0,0), ('xx', 'yy'), None), - (0, True, 0.0, (0,0,0), ('xx', 'yy', 'xy', 'yx'), None), - (0, True, 0.0, (0,0,0), ('xx', 'yy'), (150e6, 180e6)), - (0, True, 0.0, (2,1,3), ('xx', 'yy'), None), - (0, True, 3.0, (0,0,0), ('xx', 'yy'), None), - (0, False, 0.0, (0,0,0), ('xx', 'yy'), None), - (3, True, 0.0, (0,0,0), ('xx', 'yy'), None), - ] + (0, True, 0.0, (0, 0, 0), ("xx", "yy"), None), + (0, True, 0.0, (0, 0, 0), ("xx", "yy", "xy", "yx"), None), + (0, True, 0.0, (0, 0, 0), ("xx", "yy"), (150e6, 180e6)), + (0, True, 0.0, (2, 1, 3), ("xx", "yy"), None), + (0, True, 3.0, (0, 0, 0), ("xx", "yy"), None), + (0, False, 0.0, (0, 0, 0), ("xx", "yy"), None), + (3, True, 0.0, (0, 0, 0), ("xx", "yy"), None), + ], ) def test_nontrivial_cal( - self, tmp_path_factory, random_ants_to_drop: int, rephase: bool, - sigma_clip_thresh: float, flag_strategy: tuple[int, int, int], - pols: tuple[str], freq_range: tuple[float, float] | None, - benchmark + self, + tmp_path_factory, + random_ants_to_drop: int, + rephase: bool, + sigma_clip_thresh: float, + flag_strategy: tuple[int, int, int], + pols: tuple[str], + freq_range: tuple[float, float] | None, + benchmark, ): - tmp = tmp_path_factory.mktemp("nontrivial_cal") uvds = mockuvd.make_dataset( - ndays=3, nfiles=2, ntimes=2, ants=np.arange(7), + ndays=3, + nfiles=2, + ntimes=2, + ants=np.arange(7), creator=mockuvd.create_uvd_identifiable, - pols = pols, + pols=pols, freqs=np.linspace(140e6, 180e6, 3), random_ants_to_drop=random_ants_to_drop, ) uvcs = [ - [mockuvd.make_uvc_identifiable(d, *flag_strategy) for d in uvd ] for uvd in uvds + [mockuvd.make_uvc_identifiable(d, *flag_strategy) for d in uvd] + for uvd in uvds ] data_files = mockuvd.write_files_in_hera_format(uvds, tmp) cal_files = mockuvd.write_cals_in_hera_format(uvcs, tmp) - decal_files = [[df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files] + decal_files = [ + [df.replace(".uvh5", ".decal.uvh5") for df in dfl] for dfl in data_files + ] for flist, clist, ulist in zip(data_files, cal_files, decal_files): for df, cf, uf in zip(flist, clist, ulist): apply_cal.apply_cal( - df, uf, cf, - gain_convention='divide', # go the wrong way + df, + uf, + cf, + gain_convention="divide", # go the wrong way clobber=True, ) cfl = tmp / "lstbin_config_file.yaml" lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, clobber=True, + cfl, + data_files, + ntimes_per_file=2, + clobber=True, ) out_files = benchmark( lstbin_simple.lst_bin_files, - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", - Nbls_to_load=11, rephase=rephase, + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.uvh5", + Nbls_to_load=11, + rephase=rephase, sigma_clip_thresh=sigma_clip_thresh, sigma_clip_min_N=2, freq_min=freq_range[0] if freq_range is not None else None, freq_max=freq_range[1] if freq_range is not None else None, - overwrite=True + overwrite=True, ) assert len(out_files) == 
2 for flset in out_files: uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[("LST", False)]) # Don't worry about history here, because we know they use different inputs expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False) @@ -1029,23 +1398,29 @@ def test_nontrivial_cal( print(uvdlst.get_data(ap)) print(expected[i]) np.testing.assert_allclose( - np.where(uvdlst.get_flags(ap+(pol,)), 1.0, uvdlst.get_data(ap+(pol,))), - np.where(uvdlst.get_flags(ap+(pol,)), 1.0, expected[i, :, :, j]), - rtol=1e-4 if (not rephase or (ap[0] == ap[1] and pol[0]==pol[1])) else 1e-3 + np.where( + uvdlst.get_flags(ap + (pol,)), + 1.0, + uvdlst.get_data(ap + (pol,)), + ), + np.where( + uvdlst.get_flags(ap + (pol,)), 1.0, expected[i, :, :, j] + ), + rtol=1e-4 + if (not rephase or (ap[0] == ap[1] and pol[0] == pol[1])) + else 1e-3, ) - @pytest.mark.parametrize("tell_it", (True, False)) - def test_redundantly_averaged( - self, tmp_path_factory, tell_it - ): - + def test_redundantly_averaged(self, tmp_path_factory, tell_it): tmp = tmp_path_factory.mktemp("nontrivial_cal") uvds = mockuvd.make_dataset( - ndays=3, nfiles=2, ntimes=2, + ndays=3, + nfiles=2, + ntimes=2, creator=mockuvd.create_uvd_identifiable, - antpairs = [(i,j) for i in range(7) for j in range(i, 7)], # 55 antpairs - pols = ('xx', 'yx'), + antpairs=[(i, j) for i in range(7) for j in range(i, 7)], # 55 antpairs + pols=("xx", "yx"), freqs=np.linspace(140e6, 180e6, 3), redundantly_averaged=True, ) @@ -1054,11 +1429,16 @@ def test_redundantly_averaged( cfl = tmp / "lstbin_config_file.yaml" config_info = lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, clobber=True, + cfl, + data_files, + ntimes_per_file=2, + clobber=True, ) out_files = lstbin_simple.lst_bin_files( - config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", - Nbls_to_load=11, rephase=False, + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}.uvh5", + Nbls_to_load=11, + rephase=False, sigma_clip_thresh=0.0, sigma_clip_min_N=2, redundantly_averaged=True if tell_it else None, @@ -1068,7 +1448,7 @@ def test_redundantly_averaged( for flset in out_files: uvdlst = UVData() - uvdlst.read(flset['LST']) + uvdlst.read(flset[("LST", False)]) # Don't worry about history here, because we know they use different inputs expected = mockuvd.identifiable_data_from_uvd(uvdlst, reshape=False) @@ -1086,18 +1466,22 @@ def test_redundantly_averaged( # when we put in flags, we end up setting the data to 1.0 (and # never using it...) 
np.testing.assert_allclose( - uvdlst.get_data(ap+(pol,)), - np.where(uvdlst.get_flags(ap+(pol,)), 1.0, expected[i, :, :, j]), - rtol=1e-4 + uvdlst.get_data(ap + (pol,)), + np.where( + uvdlst.get_flags(ap + (pol,)), 1.0, expected[i, :, :, j] + ), + rtol=1e-4, ) def test_output_file_select(self, tmp_path_factory): tmp = tmp_path_factory.mktemp("output_file_select") uvds = mockuvd.make_dataset( - ndays=3, nfiles=4, ntimes=2, + ndays=3, + nfiles=4, + ntimes=2, creator=mockuvd.create_uvd_identifiable, - antpairs = [(i,j) for i in range(4) for j in range(i, 4)], # 55 antpairs - pols = ('xx', 'yx'), + antpairs=[(i, j) for i in range(4) for j in range(i, 4)], # 55 antpairs + pols=("xx", "yx"), freqs=np.linspace(140e6, 180e6, 3), ) @@ -1105,21 +1489,253 @@ def test_output_file_select(self, tmp_path_factory): cfl = tmp / "lstbin_config_file.yaml" config_info = lstbin_simple.make_lst_bin_config_file( - cfl, data_files, ntimes_per_file=2, clobber=True, + cfl, + data_files, + ntimes_per_file=2, + clobber=True, ) lstbf = partial( - lstbin_simple.lst_bin_files, config_file=cfl, + lstbin_simple.lst_bin_files, + config_file=cfl, fname_format="zen.{kind}.{lst:7.5f}.uvh5", - Nbls_to_load=11, rephase=False, + Nbls_to_load=11, + rephase=False, ) out_files = lstbf(output_file_select=0) assert len(out_files) == 1 - out_files = lstbf(output_file_select=(1, 2)) assert len(out_files) == 2 - with pytest.raises(ValueError, match='output_file_select must be less than the number of output files'): + with pytest.raises( + ValueError, + match="output_file_select must be less than the number of output files", + ): lstbf(output_file_select=100) + def test_inpaint_mode_no_flags(self, tmp_path_factory): + """Test that using inpaint mode does nothing when there's no flags.""" + tmp = tmp_path_factory.mktemp("inpaint_no_flags") + uvds = mockuvd.make_dataset( + ndays=3, + nfiles=1, + ntimes=2, + creator=mockuvd.create_uvd_identifiable, + antpairs=[(i, j) for i in range(3) for j in range(i, 3)], # 55 antpairs + pols=("xx", "yx"), + freqs=np.linspace(140e6, 180e6, 3), + redundantly_averaged=True, + ) + + data_files = mockuvd.write_files_in_hera_format(uvds, tmp) + + cfl = tmp / "lstbin_config_file.yaml" + config_info = lstbin_simple.make_lst_bin_config_file( + cfl, + data_files, + ntimes_per_file=2, + clobber=True, + ) + + # Additionally try fname format with leading / which should be removed + # automatically in the writing. 
+ out_files = lstbin_simple.lst_bin_files( + config_file=cfl, + fname_format="/zen.{kind}.{lst:7.5f}.{inpaint_mode}.uvh5", + rephase=False, + sigma_clip_thresh=None, + sigma_clip_min_N=2, + output_flagged=True, + output_inpainted=True, + ) + + assert len(out_files) == 1 + + for flset in out_files: + flagged = UVData.from_file(flset[("LST", False)]) + inpainted = UVData.from_file(flset[("LST", True)]) + + assert flagged == inpainted + + def test_inpaint_mode_no_flags_where_inpainted(self, tmp_path_factory): + """Test that .""" + tmp = tmp_path_factory.mktemp("inpaint_no_flags") + uvds = mockuvd.make_dataset( + ndays=3, + nfiles=1, + ntimes=2, + creator=mockuvd.create_uvd_identifiable, + antpairs=[(i, j) for i in range(3) for j in range(i, 3)], # 55 antpairs + pols=("xx", "yx"), + freqs=np.linspace(140e6, 180e6, 3), + redundantly_averaged=True, + ) + + data_files = mockuvd.write_files_in_hera_format( + uvds, tmp, add_where_inpainted_files=True + ) + + cfl = tmp / "lstbin_config_file.yaml" + config_info = lstbin_simple.make_lst_bin_config_file( + cfl, + data_files, + ntimes_per_file=2, + clobber=True, + ) + out_files = lstbin_simple.lst_bin_files( + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5", + rephase=False, + sigma_clip_thresh=None, + sigma_clip_min_N=2, + output_flagged=True, + output_inpainted=True, + where_inpainted_file_rules=[(".uvh5", ".where_inpainted.h5")], + ) + + assert len(out_files) == 1 + + for flset in out_files: + flagged = UVData.from_file(flset[("LST", False)]) + inpainted = UVData.from_file(flset[("LST", True)]) + + assert flagged == inpainted + + def test_where_inpainted_not_baseline_type(self, tmp_path_factory): + """Test that proper error is raised when using incorrect inpainted files.""" + tmp = tmp_path_factory.mktemp("inpaint_not_baseline_type") + uvds = mockuvd.make_dataset( + ndays=3, + nfiles=1, + ntimes=2, + creator=mockuvd.create_uvd_identifiable, + antpairs=[(i, j) for i in range(3) for j in range(i, 3)], # 55 antpairs + pols=("xx", "yx"), + freqs=np.linspace(140e6, 180e6, 3), + redundantly_averaged=True, + ) + + data_files = mockuvd.write_files_in_hera_format( + uvds, tmp, add_where_inpainted_files=True + ) + + # Now create dodgy where_inpainted files + inp = lstbin_simple._get_where_inpainted_files( + data_files, [(".uvh5", ".where_inpainted.h5")] + ) + for fllist in inp: + for fl in fllist: + uvf = UVFlag() + uvf.read(fl) + uvf.to_waterfall() + uvf.to_flag() + uvf.write(fl.replace(".h5", ".waterfall.h5"), clobber=True) + + cfl = tmp / "lstbin_config_file.yaml" + lstbin_simple.make_lst_bin_config_file( + cfl, + data_files, + ntimes_per_file=2, + clobber=True, + ) + with pytest.raises(ValueError, match="to be a DataContainer"): + lstbin_simple.lst_bin_files( + config_file=cfl, + fname_format="zen.{kind}.{lst:7.5f}{inpaint_mode}.uvh5", + output_flagged=False, + where_inpainted_file_rules=[(".uvh5", ".where_inpainted.waterfall.h5")], + ) + + +def test_get_where_inpainted(tmp_path_factory): + tmp: Path = tmp_path_factory.mktemp("get_where_inpainted") + + fls = [] + for outer in ["abc", "def"]: + these = [] + for filename in outer: + (tmp / f"{filename}.uvh5").touch() + (tmp / f"{filename}.where_inpainted.h5").touch() + these.append(tmp / f"{filename}.uvh5") + fls.append(these) + + out = lstbin_simple._get_where_inpainted_files( + fls, [(".uvh5", ".where_inpainted.h5")] + ) + + assert len(out) == 2 + assert len(out[0]) == 3 + assert len(out[1]) == 3 + + with pytest.raises(IOError, match="Where inpainted file"): + 
lstbin_simple._get_where_inpainted_files(fls, [(".uvh5", ".non_existent.h5")]) + + +def test_configure_inpainted_mode(): + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=True, where_inpainted_files=[] + ) + assert flg + assert inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=True, where_inpainted_files=["a_file.h5"] + ) + assert flg + assert inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=False, where_inpainted_files=[] + ) + assert flg + assert not inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=False, where_inpainted_files=["a_file.h5"] + ) + assert flg + assert not inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=None, where_inpainted_files=[] + ) + assert flg + assert not inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=True, output_inpainted=None, where_inpainted_files=["a_file.h5"] + ) + assert flg + assert inp + + flg, inp = lstbin_simple._configure_inpainted_mode( + output_flagged=False, output_inpainted=True, where_inpainted_files=[] + ) + assert not flg + assert inp + + with pytest.raises(ValueError, match="Both output_inpainted and output_flagged"): + lstbin_simple._configure_inpainted_mode( + output_flagged=False, output_inpainted=False, where_inpainted_files=[] + ) + + +class TestThresholdFlags: + def test_inplace(self): + flags = np.zeros((10, 12), dtype=bool) # shape = (ntime, ...) + flags[:8, 0] = True + + new = lstbin_simple.threshold_flags(flags, flag_thresh=0.7, inplace=True) + + assert np.all(flags[:, 0]) + assert np.all(new == flags) + + def test_not_inplace(self): + flags = np.zeros((10, 12), dtype=bool) # shape = (ntime, ...) 
+ flags[:8, 0] = True + + new = lstbin_simple.threshold_flags(flags, flag_thresh=0.7, inplace=False) + + assert not np.all(flags[:, 0]) + assert np.all(new[:, 0]) diff --git a/hera_cal/tests/test_nucal.py b/hera_cal/tests/test_nucal.py index 63c7b102c..2ea65b6e9 100644 --- a/hera_cal/tests/test_nucal.py +++ b/hera_cal/tests/test_nucal.py @@ -307,6 +307,17 @@ def test_compute_spatial_filters(): model = design_matrix @ (XTXinv @ Xy) np.allclose(model, data, atol=1e-6) + # Show that filters with a u-max cutoff are set to zero + umax = 15 + antpos = linear_array(6, sep=5) + radial_reds = nucal.RadialRedundancy(antpos) + spatial_filters = nucal.compute_spatial_filters(radial_reds, freqs, umax=umax) + + for rdgrp in radial_reds: + for bl in rdgrp: + umodes = radial_reds.baseline_lengths[bl] * freqs / 2.998e8 + assert np.allclose(spatial_filters[bl][umodes > umax], 0) + def test_build_nucal_wgts(): bls = [(0, 1, 'ee'), (0, 2, 'ee'), (1, 2, 'ee')] auto_bls = [(0, 0, 'ee'), (1, 1, 'ee'), (2, 2, 'ee')] diff --git a/hera_cal/tests/test_redcal.py b/hera_cal/tests/test_redcal.py index 31e53a325..72f79711d 100644 --- a/hera_cal/tests/test_redcal.py +++ b/hera_cal/tests/test_redcal.py @@ -1890,9 +1890,10 @@ def test_redcal_run(self): sys.stdout = open(os.devnull, 'w') redcal_meta, hc_first, hc_omni, hd_vissol = om.redcal_run(input_data, verbose=True, ant_z_thresh=1.7, add_to_history='testing', a_priori_ex_ants_yaml=os.path.join(DATA_PATH, 'test_input', 'a_priori_flags_sample.yaml'), + flag_nchan_low=10, flag_nchan_high=10, check_after=500, iter0_prefix='.iter0', metrics_files=ant_metrics_file, clobber=True) hd = io.HERAData(input_data) - redcal_meta0, hc_first0, hc_omni0, hd_vissol0 = om.redcal_iteration(hd, ex_ants=[11, 50]) + redcal_meta0, hc_first0, hc_omni0, hd_vissol0 = om.redcal_iteration(hd, flag_nchan_low=10, flag_nchan_high=10, check_after=500, ex_ants=[11, 50]) sys.stdout = sys.__stdout__ for prefix, hc_here, bad_ants in [('', hc_first, [11, 50, 12, 24]), ('.iter0', hc_first0, [11, 50])]: diff --git a/hera_cal/tests/test_utils.py b/hera_cal/tests/test_utils.py index 89e8b21a3..dd392f2fb 100644 --- a/hera_cal/tests/test_utils.py +++ b/hera_cal/tests/test_utils.py @@ -26,6 +26,7 @@ from . 
import mock_uvdata as mockuvd from pathlib import Path + class Test_Pol_Ops(object): def test_comply_pol(self): assert utils.comply_pol('XX') == 'xx' @@ -425,12 +426,12 @@ def test_lst_rephase_vectorized(): for i, antpair in enumerate(antpairs): for j, pol in enumerate(pols): data_drift[:, i, :, j] = data[antpair + (pol,)].copy() - + # basic test: single dlst for all integrations - _data = utils.lst_rephase(data_drift, blvec, freqs, dlst, lat=0.0, inplace=False) - + _data = utils.lst_rephase(data_drift, blvec, freqs, dlst, lat=0.0, inplace=False) + # check error at transit - phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime+1, ibl, ifreq, ipol]) + phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime + 1, ibl, ifreq, ipol]) assert np.isclose(phs_err, 0, atol=1e-7) # check error across file phs_err = np.angle(_data[:-1, ibl, ifreq, ipol] / data_drift[1:, ibl, ifreq, ipol]) @@ -440,7 +441,7 @@ def test_lst_rephase_vectorized(): dlst = np.array([np.median(np.diff(lsts))] * data.shape[0]) _data = utils.lst_rephase(data_drift, blvec, freqs, dlst, lat=0.0, inplace=False) # check error at transit - phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime+1, ibl, ifreq, ipol]) + phs_err = np.angle(_data[itime, ibl, ifreq, ipol] / data_drift[itime + 1, ibl, ifreq, ipol]) assert np.isclose(phs_err, 0, atol=1e-7) # check err across file phs_err = np.angle(_data[:-1, ibl, ifreq, ipol] / data_drift[1:, ibl, ifreq, ipol]) @@ -456,6 +457,7 @@ def test_lst_rephase_vectorized(): phs_err = np.angle(_data[:, ibl, ifreq, ipol] / data_drift[50, ibl, ifreq, ipol]) assert np.abs(phs_err).max() < 1e-4 + def test_lst_rephase(): # load point source sim w/ array at latitude = 0 fname = os.path.join(DATA_PATH, "PAPER_point_source_sim.uv") @@ -898,42 +900,19 @@ def test_select_spw_ranges_argparser(): assert args.spw_ranges == [(0, 20), (30, 100), (120, 150)] -def test_select_spw_ranges_run_script_code(tmpdir): - # test script code from scripts/test_select_spw_ranges.py - tmp_path = str(tmpdir) - new_cal = os.path.join(DATA_PATH, "test_input/zen.2458101.46106.xx.HH.uv.abs.calfits_54x_only") - uvh5 = os.path.join(DATA_PATH, "test_input/zen.2458101.46106.xx.HH.OCR_53x_54x_only.uvh5") - hd = io.HERAData(uvh5) - hd.read() - nf = hd.Nfreqs - output = os.path.join(tmp_path, 'test_calibrated_output.uvh5') - # construct bash script command - select_cmd = f'python ./scripts/select_spw_ranges.py {uvh5} {output} --clobber --spw_ranges 0~256,332~364,792~1000' - # and excecute inside of python - os.system(select_cmd) - # test that output has correct frequencies. - hdo = io.HERAData(output) - hdo.read() - assert np.allclose(hdo.freq_array, np.hstack([hd.freq_array[:256], hd.freq_array[332:364], hd.freq_array[792:1000]])) - freq_inds = np.hstack([np.arange(0, 256).astype(int), np.arange(332, 364).astype(int), np.arange(792, 1000).astype(int)]) - # and check that data, flags, nsamples make sense. 
- assert np.allclose(hdo.data_array, hd.data_array[:, freq_inds, :]) - assert np.allclose(hdo.flag_array, hd.flag_array[:, freq_inds, :]) - assert np.allclose(hdo.nsample_array, hd.nsample_array[:, freq_inds, :]) - - @pytest.fixture(scope='function') def tmpdir(tmp_path_factory): return tmp_path_factory.mktemp('test_match_files_to_lst_bins') + class TestMatchFilesToLSTBins: def make_empty_day(self, nfiles: int, **kwargs): - # We make the internal data really small here, since we care only about the + # We make the internal data really small here, since we care only about the # time metadata return mockuvd.make_day( nfiles=nfiles, ants=[0, 1, 2], - antpairs=[(0,0), (0,1)], + antpairs=[(0, 0), (0, 1)], freqs=np.array([155e6]), creator=mockuvd.create_mock_hera_obs, pols=('xx',), @@ -945,7 +924,7 @@ def make_empty_day(self, nfiles: int, **kwargs): @pytest.mark.parametrize('time_axis_faster_than_bls', [True, False, None]) @pytest.mark.parametrize('jd_regex', (r"zen\.(\d+\.\d+)\.", None)) def test_simple_zero2pi( - self, + self, files_sorted: bool, blts_are_rectangular: bool, time_axis_faster_than_bls: bool, @@ -956,14 +935,14 @@ def test_simple_zero2pi( which should match all files. This makes it easy to test different input parameter combinations. """ - + uvds = self.make_empty_day(nfiles=10, time_axis_faster_than_bls=bool(time_axis_faster_than_bls)) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) # Ensure that using lst-bin edges of (0, 2pi) works fine out_fls = utils.match_files_to_lst_bins( - lst_edges=(0, 2*np.pi), - file_list = fls, + lst_edges=(0, 2 * np.pi), + file_list=fls, files_sorted=files_sorted, blts_are_rectangular=blts_are_rectangular, jd_regex=jd_regex, @@ -979,13 +958,13 @@ def test_regex_vs_readtimes(self, tmpdir: Path): """ uvds = self.make_empty_day(nfiles=10) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) - + # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. - lst_edges=(uvds[0].lst_array.min(), (uvds[5].lst_array.min() +uvds[5].lst_array.max())/2 ) + lst_edges = (uvds[0].lst_array.min(), (uvds[5].lst_array.min() + uvds[5].lst_array.max()) / 2) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 @@ -993,7 +972,7 @@ def test_regex_vs_readtimes(self, tmpdir: Path): out_fls_meta = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, jd_regex=None, ) @@ -1011,16 +990,16 @@ def test_lst_bins_crossing_2pi(self, tmpdir: Path): total_dlst = dlst * nfiles * 2 uvds = self.make_empty_day( - nfiles=nfiles, lst_start=2*np.pi - total_dlst/2, integration_time=tint, + nfiles=nfiles, lst_start=2 * np.pi - total_dlst / 2, integration_time=tint, ) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. - lst_edges=(3*np.pi/2, np.pi/2) + lst_edges = (3 * np.pi / 2, np.pi / 2) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 @@ -1028,38 +1007,38 @@ def test_lst_bins_crossing_2pi(self, tmpdir: Path): # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. 
- lst_edges=(3*np.pi/2, 2*np.pi) + lst_edges = (3 * np.pi / 2, 2 * np.pi) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 - assert len(out_fls[0]) == nfiles//2 + assert len(out_fls[0]) == nfiles // 2 # To make it interesting, choose one big LST bin that covers about half the # files, including a partial file at the end. - lst_edges=(0, np.pi/2) + lst_edges = (0, np.pi / 2) out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == 1 - assert len(out_fls[0]) == nfiles//2 - + assert len(out_fls[0]) == nfiles // 2 + def test_one_file_per_lstbin(self, tmpdir): uvds = self.make_empty_day(nfiles=10, jd_start=0.2) fls = mockuvd.write_files_in_hera_format(uvds, tmpdir) - - lst_edges=[ + + lst_edges = [ uvd.lst_array.min() for uvd in uvds ] lst_edges.append(uvds[-1].lst_array.max()) - + out_fls = utils.match_files_to_lst_bins( lst_edges=lst_edges, - file_list = fls, + file_list=fls, files_sorted=True, ) assert len(out_fls) == len(lst_edges) - 1 @@ -1069,4 +1048,40 @@ def test_one_file_per_lstbin(self, tmpdir): assert 0 < len(match) <= 2 # Ensure all files are matched - assert len(files_matched) == len(fls) \ No newline at end of file + assert len(files_matched) == len(fls) + + +class Test_LSTBranchCut: + def test_simple_ascending(self): + lsts = np.linspace(0, 1.0, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert best == 0 + + def test_simple_descending(self): + lsts = np.linspace(1.0, 0, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert best == 0 + + def test_simple_ascending_with_offset(self): + lsts = np.linspace(0.1, 1.1, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert np.isclose(best, 0.1) + + def test_wrap_around_2pi(self): + lsts = np.linspace(3 * np.pi / 2, 5 * np.pi / 2, 100) + best = utils.get_best_lst_branch_cut(lsts) + assert np.isclose(best, 3 * np.pi / 2) + + def test_wrap_around_2pi_starting_low(self): + lsts0 = np.linspace(0, np.pi / 2) + lsts1 = np.linspace(3 * np.pi / 2, 2 * np.pi) + lsts = np.concatenate([lsts0, lsts1]) + best = utils.get_best_lst_branch_cut(lsts) + assert np.isclose(best, 3 * np.pi / 2) + + def test_with_crazy_periods(self): + lsts = np.linspace(0, 1.0, 100) + n = np.random.random_integers(10, size=100) + lsts += n * 2 * np.pi + best = utils.get_best_lst_branch_cut(lsts) + assert best == 0 diff --git a/hera_cal/utils.py b/hera_cal/utils.py index 59ba43c34..8ab68539c 100644 --- a/hera_cal/utils.py +++ b/hera_cal/utils.py @@ -119,7 +119,7 @@ def comply_pol(pol): compliant with pyuvdata and hera_cal.''' try: return _comply_vispol(pol) - except(KeyError): # happens if we have an antpol, not vispol + except KeyError: # happens if we have an antpol, not vispol return _comply_antpol(pol) @@ -399,7 +399,7 @@ def update(self): def get_params(self, ant_prms={'*': '*'}): try: prms = aipy.pol.AntennaArray.get_params(self, ant_prms) - except(IndexError): + except IndexError: return {} return prms @@ -411,17 +411,17 @@ def set_params(self, prms): try: top_pos[0] = prms[str(i)]['top_x'] ant_changed = True - except(KeyError): + except KeyError: pass try: top_pos[1] = prms[str(i)]['top_y'] ant_changed = True - except(KeyError): + except KeyError: pass try: top_pos[2] = prms[str(i)]['top_z'] ant_changed = True - except(KeyError): + except KeyError: pass if ant_changed: # rotate from zenith to equatorial, convert from meters to ns @@ -539,7 +539,9 @@ def JD2LST(JD, 
latitude=-30.721526120689507, longitude=21.428303826863015, altit JD = [JD] # use pyuvdata - LST = uvutils.get_lst_for_time(np.array(JD), latitude, longitude, altitude) + LST = uvutils.get_lst_for_time( + np.array(JD), latitude=latitude, longitude=longitude, altitude=altitude + ) if _array: return LST @@ -563,7 +565,7 @@ def LST2JD(LST, start_jd, allow_other_jd=False, lst_branch_cut=0.0, latitude=-30 LSTs to correspond to previous or subsequent days if necessary lst_branch_cut : type=float, LST that must fall during start_jd even when allow_other_jd - is True. Used as the starting point starting point where LSTs below this + is True. Used as the starting point where LSTs below this value map to later JDs than this LST [radians] latitude : type=float, degrees North of observer, default=HERA latitude @@ -790,8 +792,8 @@ def lst_rephase( bls: dict[tp.Baseline, np.ndarray] | np.ndarray, freqs: np.ndarray, dlst: float | list | np.ndarray, - lat: float =-30.721526120689507, - inplace: bool=True, + lat=-30.721526120689507, + inplace=True, array=False ): """ @@ -820,7 +822,7 @@ def lst_rephase( if True edit arrays in data in memory, else make a copy and return array Deprecated parameter that specifies that the input data is an array - with shape (ntimes, nfreqs, [npols]). Don't write code with this + with shape (ntimes, nfreqs, [npols]). Don't write code with this parameter -- instead just set the baseline axis of data to be length-1. Notes: @@ -901,7 +903,7 @@ def lst_rephase( if array: warnings.warn( "the 'array' parameter to lst_rephase is deprecated. Please provide data with second-axis of Nbls", - category=DeprecationWarning, + category=DeprecationWarning, ) data = data[:, None] @@ -933,7 +935,7 @@ def lst_rephase( if not inplace: if orig_type in (dict, OrderedDict, DataContainer): - return orig_type({k: data[:,i] for i,k in enumerate(keys)}) + return orig_type({k: data[:, i] for i, k in enumerate(keys)}) else: if array: return data[:, 0] @@ -1465,6 +1467,7 @@ def red_average(data, reds=None, bl_tol=1.0, inplace=False, else: return data + def match_files_to_lst_bins( lst_edges: tuple[float], file_list: Sequence[str | Path], @@ -1554,12 +1557,12 @@ def match_files_to_lst_bins( if len(lst_edges) < 2: raise ValueError("lst_edges must have at least 2 elements.") lst0 = lst_edges[0] - lst_edges=np.array([(lst - lst0)%(2*np.pi) + lst0 for lst in lst_edges]) + lst_edges = np.array([(lst - lst0) % (2 * np.pi) + lst0 for lst in lst_edges]) # We can have the case that an edges is at lst0 + 2pi exactly, which would # get wrapped around to lst0, but it should stay at lst0+2pi. 
- lst_edges[1:][lst_edges[1:] == lst_edges[0]] += 2*np.pi + lst_edges[1:][lst_edges[1:] == lst_edges[0]] += 2 * np.pi - if np.any(np.diff(lst_edges) < 0 ): + if np.any(np.diff(lst_edges) < 0): raise ValueError("lst_edges must not extend beyond 2pi total radians from start to finish.") lstmin, lstmax = lst_edges[0], lst_edges[-1] @@ -1597,10 +1600,12 @@ def match_files_to_lst_bins( if jd_regex is not None: jd_regex = re.compile(jd_regex) + @cache def get_first_time(path: Path) -> float: return float(jd_regex.findall(path.name)[0]) else: + @cache def get_first_time(path: Path) -> float: meta = _metas[path] @@ -1609,7 +1614,7 @@ def get_first_time(path: Path) -> float: def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: jdstart, jdend = jd_range - jdstart_for_file = jdstart - tint*(ntimes_per_file-1) - atol + jdstart_for_file = jdstart - tint * (ntimes_per_file - 1) - atol # The lst-edges might wrap around within a day. If so, when comparing times # to the bins, we need to use an OR instead of an AND. @@ -1630,22 +1635,21 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: break diff = jdstart_for_file - thistime - in_range = cmp(jdstart_for_file <= thistime, thistime <= jdend+atol) + in_range = cmp(jdstart_for_file <= thistime, thistime <= jdend + atol) if np.abs(diff) < best_diff and in_range: # This file has a time that is in our range, and is closest so far # to the start of the range. best_diff = np.abs(diff) best_index = index - - move_nfiles = round(diff / (tint*ntimes_per_file)) + move_nfiles = round(diff / (tint * ntimes_per_file)) if move_nfiles == 0: if in_range: break else: # Move by one file in the right direction, just to be sure. - move_nfiles = int(1*np.sign(diff)) + move_nfiles = int(1 * np.sign(diff)) if diff > 0: minidx = max(minidx, index) @@ -1654,7 +1658,7 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: index += move_nfiles if not (minidx < index < maxidx): - index = min(maxidx-1, max(minidx+1, (minidx + maxidx) // 2)) + index = min(maxidx - 1, max(minidx + 1, (minidx + maxidx) // 2)) if index < 0: index = 0 elif index >= len(meta_list): @@ -1669,15 +1673,12 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: return best_index try: - # We subtract one, because the file before the first file in the range - # may just get into the range by a tiny bit, and we allow the user to test - # that out later. first_file = get_first_file_in_range(metadata_list, (jdstart, jdend)) except ValueError: - return [[] for _ in range(len(lst_edges)-1)] + return [[] for _ in range(len(lst_edges) - 1)] # Now, we have to actually go through the bins in lst_edges and assign files. 
- lstbin_files = [[] for _ in range(len(lst_edges)-1)] + lstbin_files = [[] for _ in range(len(lst_edges) - 1)] _meta_list = metadata_list[first_file:] + metadata_list[:first_file] # roll the files for i, fl in enumerate(_meta_list): @@ -1686,7 +1687,7 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: for lstbin, (jdstart, jdend) in enumerate(zip(jd_edges[:-1], jd_edges[1:])): # Go through each lst bin cmp = operator.and_ if (jdstart < jdend) else operator.or_ - if cmp(jdstart - (ntimes_per_file-1)*tint - atol <= thistime, thistime < jdend + atol): + if cmp(jdstart - (ntimes_per_file - 1) * tint - atol <= thistime, thistime < jdend + atol): lstbin_files[lstbin].append(fl) fl_in_range = True @@ -1695,6 +1696,7 @@ def get_first_file_in_range(meta_list, jd_range: Tuple[float, float]) -> int: return lstbin_files + def eq2top_m(ha, dec): """Return the 3x3 matrix converting equatorial coordinates to topocentric at the given hour angle (ha) and declination (dec). @@ -1779,6 +1781,21 @@ def chunk_baselines_by_redundant_groups(reds, max_chunk_size): return baseline_chunks +def get_best_lst_branch_cut(lsts_in): + # Sort them + lsts = np.sort(lsts_in % (2 * np.pi)) + + # wrap lstg around in case the biggest gap is right at the start + lsts = np.concatenate((lsts, [lsts[0]])) + + diff_lst = np.diff(lsts) + while np.any(diff_lst < 0): + lsts[np.where(diff_lst < 0)[0] + 1] += 2 * np.pi + diff_lst = np.diff(lsts) + idxmax = np.argmax(diff_lst) + 1 + return lsts[idxmax] % (2 * np.pi) + + def select_spw_ranges(inputfilename, outputfilename, spw_ranges=None, clobber=False): """Utility function for selecting spw_ranges and writing out diff --git a/pyproject.toml b/pyproject.toml index 869b95217..e7d87af06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=30.3.0", "wheel", "setuptools_scm[toml]>=6.2"] +requires = ["setuptools>=30.3.0", "wheel", "setuptools_scm[toml]>=6.2,!=8.0"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] diff --git a/scripts/lstbin_simple.py b/scripts/lstbin_simple.py index d809635c5..83c754bd2 100644 --- a/scripts/lstbin_simple.py +++ b/scripts/lstbin_simple.py @@ -7,11 +7,8 @@ from hera_cal import lstbin_simple as lstbin import sys -import glob import json from hera_cal._cli_tools import setup_logger, parse_args, filter_kwargs, run_with_profiling -import logging -import importlib from pathlib import Path a = lstbin.lst_bin_arg_parser() @@ -48,19 +45,27 @@ # Turn a list into a list of 2-tuples. 
crules = kwargs.pop("calfile_rules") if crules is not None: - calfile_rules = [(crules[i], crules[i+1]) for i in range(len(crules)//2)] + calfile_rules = [(crules[i], crules[i + 1]) for i in range(len(crules) // 2)] else: calfile_rules = None +inprules = kwargs.pop("where_inpainted_file_rules") +if inprules is not None: + inpaint_rules = [(inprules[i], inprules[i + 1]) for i in range(len(inprules) // 2)] +else: + inpaint_rules = None kwargs['save_channels'] = tuple(int(ch) for ch in kwargs['save_channels'].split(',')) kwargs['golden_lsts'] = tuple(float(lst) for lst in kwargs['golden_lsts'].split(',')) +kwargs['output_flagged'] = not kwargs.pop('no_flagged_mode') +kwargs['output_inpainted'] = kwargs.pop("do_inpaint_mode") run_with_profiling( - lstbin.lst_bin_files, + lstbin.lst_bin_files, args, config_file=kwargs.pop('configfile'), - calfile_rules=calfile_rules, - write_kwargs=write_kwargs, + calfile_rules=calfile_rules, + where_inpainted_file_rules=inpaint_rules, + write_kwargs=write_kwargs, **kwargs ) diff --git a/scripts/tophat_frfilter_run.py b/scripts/tophat_frfilter_run.py index 8a4b80986..6178524e1 100644 --- a/scripts/tophat_frfilter_run.py +++ b/scripts/tophat_frfilter_run.py @@ -20,7 +20,6 @@ 'edgecut_low': ap.edgecut_low, 'gain': ap.gain} if ap.window == 'tukey': filter_kwargs['alpha'] = ap.alpha - filter_kwargs['flag_model_rms_outliers'] = ap.flag_model_rms_outliers elif ap.mode in ['dayenu', 'dpss_leastsq']: filter_kwargs = {'max_contiguous_edge_flags': ap.max_contiguous_edge_flags} @@ -44,20 +43,26 @@ clobber=ap.clobber, write_cache=ap.write_cache, CLEAN_outfilename=ap.CLEAN_outfilename, read_cache=ap.read_cache, mode=ap.mode, res_outfilename=ap.res_outfilename, factorize_flags=ap.factorize_flags, time_thresh=ap.time_thresh, - wgt_by_nsample=ap.wgt_by_nsample, lst_blacklists=ap.lst_blacklists, blacklist_wgt=ap.blacklist_wgt, + wgt_by_nsample=ap.wgt_by_nsample, lst_blacklists=ap.lst_blacklists, + blacklist_wgt=ap.blacklist_wgt, add_to_history=' '.join(sys.argv), verbose=ap.verbose, flag_yaml=ap.flag_yaml, Nbls_per_load=ap.Nbls_per_load, case=ap.case, external_flags=ap.external_flags, filter_spw_ranges=ap.filter_spw_ranges, overwrite_flags=ap.overwrite_flags, skip_autos=ap.skip_autos, skip_if_flag_within_edge_distance=ap.skip_if_flag_within_edge_distance, zeropad=ap.zeropad, tol=ap.tol, skip_wgt=ap.skip_wgt, max_frate_coeffs=ap.max_frate_coeffs, - frate_width_multiplier=ap.frate_width_multiplier, frate_standoff=ap.frate_standoff, fr_freq_skip=ap.fr_freq_skip, + frate_width_multiplier=ap.frate_width_multiplier, frate_standoff=ap.frate_standoff, + fr_freq_skip=ap.fr_freq_skip, min_frate_half_width=ap.min_frate_half_width, max_frate_half_width=ap.max_frate_half_width, - beamfitsfile=ap.beamfitsfile, percentile_low=ap.percentile_low, percentile_high=ap.percentile_high, - skip_contiguous_flags=not(ap.dont_skip_contiguous_flags), max_contiguous_flag=ap.max_contiguous_flag, + beamfitsfile=ap.beamfitsfile, percentile_low=ap.percentile_low, + percentile_high=ap.percentile_high, + skip_contiguous_flags=not(ap.dont_skip_contiguous_flags), + max_contiguous_flag=ap.max_contiguous_flag, skip_flagged_edges=not(ap.dont_skip_flagged_edges), - flag_model_rms_outliers=not(ap.dont_flag_model_rms_outliers), model_rms_threshold=ap.model_rms_threshold, + flag_model_rms_outliers=not(ap.dont_flag_model_rms_outliers), + model_rms_threshold=ap.model_rms_threshold, clean_flags_in_resid_flags=not(ap.clean_flags_not_in_resid_flags), 
pre_filter_modes_between_lobe_minimum_and_zero=ap.pre_filter_modes_between_lobe_minimum_and_zero, + param_file=ap.param_file, **filter_kwargs )
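For reference, the where-inpainted file rules wired into scripts/lstbin_simple.py above are (match, replacement) substring pairs applied to each data-file path, as exercised by test_get_where_inpainted. Below is a minimal sketch of that mapping under the same assumptions; it is not the hera_cal implementation, and the helper name is hypothetical:

from pathlib import Path

def map_where_inpainted(data_files, rules):
    """Hypothetical helper: map each data file to its where-inpainted companion."""
    out = []
    for night in data_files:              # list of per-night file lists
        these = []
        for fl in night:
            name = str(fl)
            for match, repl in rules:     # e.g. (".uvh5", ".where_inpainted.h5")
                name = name.replace(match, repl)
            if not Path(name).exists():
                raise IOError(f"Where inpainted file {name} does not exist")
            these.append(name)
        out.append(these)
    return out

# e.g. map_where_inpainted(data_files, [(".uvh5", ".where_inpainted.h5")]) returns
# one list of companion .where_inpainted.h5 paths per night, raising IOError if a
# companion file is missing, matching the behaviour asserted in the tests above.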