diff --git a/src/matvis/cli.py b/src/matvis/cli.py
index c6dbcd1..96c591f 100644
--- a/src/matvis/cli.py
+++ b/src/matvis/cli.py
@@ -21,6 +21,7 @@ from pathlib import Path
 from pyuvdata import UVBeam
 from pyuvsim import AnalyticBeam, simsetup
 from typing import Literal
 
 from matvis import DATA_PATH, HAVE_GPU, coordinates, cpu, simulate_vis
+from matvis.submatrices import get_matrix_sets
 
@@ -78,6 +79,7 @@ def run_profile(
     naz=360,
     nza=180,
     pairs=None,
+    matsets=None,
     nchunks=1,
     source_buffer=1.0,
 ):
@@ -136,6 +138,7 @@ def run_profile(
         beam_idx=beam_idx,
         matprod_method=f"{'GPU' if gpu else 'CPU'}{method}",
         antpairs=pairs,
+        matsets=matsets,
         min_chunks=nchunks,
         source_buffer=source_buffer,
     )
@@ -278,8 +281,9 @@
 )
 @click.option("-k", "--keep-ants", type=str, default="")
 @click.option("--outriggers/--no-outriggers", default=False)
+@click.option("-msm", "--matset-method", type=str, default="")
 @add_common_options
-def hera_profile(hex_num, nside, keep_ants, outriggers, **kwargs):
+def hera_profile(hex_num, nside, keep_ants, outriggers, matset_method, **kwargs):
     """Run profiling of matvis with a HERA-like array."""
 
     from py21cmsense.antpos import hera
@@ -291,7 +295,19 @@
     bls = antpos[np.newaxis, :, :2] - antpos[:, np.newaxis, :2]
     pairs = np.array(get_redundancies(bls.value))
 
-    run_profile(nsource=12 * nside**2, nants=antpos.shape[0], pairs=pairs, **kwargs)
+    if matset_method == "":
+        matsets = None
+    else:
+        # Any non-empty value currently selects the single available construction.
+        matsets = get_matrix_sets(bls.value)
+
+    run_profile(
+        nsource=12 * nside**2,
+        nants=antpos.shape[0],
+        pairs=pairs,
+        matsets=matsets,
+        **kwargs,
+    )
 
 
 def get_line_based_stats(lstats) -> tuple[dict, float]:
diff --git a/src/matvis/core/matprod.py b/src/matvis/core/matprod.py
index dbe49e9..788a9eb 100644
--- a/src/matvis/core/matprod.py
+++ b/src/matvis/core/matprod.py
@@ -20,6 +20,9 @@ class MatProd(ABC):
         Number of antennas.
     antpairs
         The antenna pairs to sum over. If None, all pairs are used.
+    matsets
+        The list of sub-matrices to compute, as (row, column) antenna-index array
+        pairs. If None, all pairs are used in a single matrix multiplication.
     precision
         The precision of the data (1 or 2).
     """
@@ -29,16 +32,19 @@ def __init__(
         nchunks: int,
         nfeed: int,
         nant: int,
-        antpairs: np.ndarray | None,
+        antpairs: np.ndarray | list | None,
+        matsets: list | None = None,
         precision=1,
     ):
         if antpairs is None:
             self.all_pairs = True
             self.antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
+            self.matsets = None
         else:
             self.all_pairs = False
             self.antpairs = antpairs
+            self.matsets = matsets
 
         self.nchunks = nchunks
         self.nfeed = nfeed
         self.nant = nant
@@ -60,10 +66,33 @@ def allocate_vis(self):
             (self.nchunks, self.npairs, self.nfeed, self.nfeed), 0.0, dtype=self.ctype
         )
 
+    def check_antpairs_in_matsets(self):
+        """Check that all non-redundant antenna pairs appear in the sub-matrices.
+
+        If using a MatChunk method, every non-redundant antenna pair must appear in
+        the set of sub-matrices to be calculated; otherwise an error is raised.
+ """ + antpairs_set = set() + matsets_set = set() + + for _, (ai, aj) in enumerate(self.antpairs): + antpairs_set.add((ai, aj)) + + for _, (ai, aj) in enumerate(self.matsets): + for i in range(len(ai)): + for j in range(len(aj)): + matsets_set.add((ai[i], aj[j])) + + if not antpairs_set.issubset(matsets_set): + raise Exception("Some non-redundant pairs are missing from sub-matrices. ") + def setup(self): """Setup the memory for the object.""" self.allocate_vis() + if self.matsets is not None: + self.check_antpairs_in_matsets() + @abstractmethod def compute(self, z: np.ndarray, out: np.ndarray): """ diff --git a/src/matvis/cpu/cpu.py b/src/matvis/cpu/cpu.py index 34f23dd..99c12dc 100644 --- a/src/matvis/cpu/cpu.py +++ b/src/matvis/cpu/cpu.py @@ -32,6 +32,7 @@ def simulate( I_sky: np.ndarray, beam_list: Sequence[UVBeam | Callable] | None, antpairs: np.ndarray | list[tuple[int, int]] | None = None, + matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None, precision: int = 1, polarized: bool = False, beam_idx: np.ndarray | None = None, @@ -81,6 +82,13 @@ def simulate( Either a 2D array, shape ``(Npairs, 2)``, or list of 2-tuples of ints, with the list of antenna-pairs to return as visibilities (all feed-pairs are always calculated). If None, all feed-pairs are returned. + matsets : array_like, optional + A list of containing a set of 2-tuples of numpy arrays. Each entry in the list + corresponds to a different sub-matrix. The first element of the ith tuple lists the rows of Z corresponding + to the rows of the ith sub-matrix, and the second tuple element contains the list of columns. In the case + of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only + one int, such that the sub-matrices contain only one element each. Outer dimension has to be a list because + numpy doesn't like inhomogeneous arrays. precision : int, optional Which precision level to use for floats and complex numbers. Allowed values: @@ -101,14 +109,15 @@ def simulate( allows). Default is 100. matprod_method : str, optional The method to use for the final matrix multiplication. Default is 'CPUMatMul', - which simply uses `np.dot` over the two full matrices. Currently, the other - option is `CPUVectorLoop`, which uses a loop over the antenna pairs, - computing the sum over sources as a vector dot product. - Whether to calculate visibilities for each antpair in antpairs as a vector - dot-product instead of using a full matrix-matrix multiplication for all - possible pairs. Default is False. Setting to True can be faster for large - arrays where `antpairs` is small (possibly from high redundancy). You should - run a performance test before using this. + which simply uses `np.dot` over the two full matrices. The second option is + `CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum + over sources as a vector dot product. The third option is `CPUMatChunk`, which + divides the product into several matrix multiplications that are optimized to have the + highest possible density of non-redundant pairs without invoking the overhead of a large + for loop over vector products. For very large arrays, with very high redundancy, CPUVectorLoop + is usually fastest, while for cases with low redundancy, `CPUMatMul` is fastest. The `CPUMatChunk` + can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays. You should + run a performance test before choosing one of these options. 
     max_memory : int, optional
         The maximum memory (in bytes) to use for the visibility calculation. This is
         not a hard-set limit, but rather a guideline for how much memory to use. If the
@@ -175,7 +184,7 @@ def simulate(
         source_buffer=source_buffer,
     )
     mpcls = getattr(mp, matprod_method)
-    matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+    matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
     zcalc = ZMatrixCalc(
         nsrc=nsrc_alloc,
         nfeed=nfeed,
diff --git a/src/matvis/cpu/matprod.py b/src/matvis/cpu/matprod.py
index 2e1191c..2b96d34 100644
--- a/src/matvis/cpu/matprod.py
+++ b/src/matvis/cpu/matprod.py
@@ -50,3 +50,38 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
             out[i] = z[ai].conj().dot(z[aj].T)
 
         return out
+
+
+class CPUMatChunk(MatProd):
+    """Compute sub-matrix products that together cover all non-redundant pairs."""
+
+    def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
+        """Perform the source-summing operation for a single time and chunk.
+
+        Parameters
+        ----------
+        z
+            Complex integrand. Shape=(Nfeed, Nant, Nax, Nsrc).
+        out
+            Output array, shaped as (Npairs, Nfeed, Nfeed).
+        """
+        z = z.reshape((self.nant, self.nfeed, -1))
+
+        mat_product = np.zeros(
+            (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+        )
+
+        # Fill each requested sub-matrix block of the full antenna-antenna product.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for ai, aj in self.matsets:
+                    AI, AJ = np.meshgrid(ai, aj)
+                    mat_product[AI, AJ, j, k] = z[ai, j].conj().dot(z[aj, k].T).T
+
+        # Pick out the requested (non-redundant) pairs into the output array.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for i, (ai, aj) in enumerate(self.antpairs):
+                    out[i, j, k] = mat_product[ai, aj, j, k]
+
+        return out
diff --git a/src/matvis/gpu/gpu.py b/src/matvis/gpu/gpu.py
index 9bf82ec..b93ee02 100644
--- a/src/matvis/gpu/gpu.py
+++ b/src/matvis/gpu/gpu.py
@@ -49,6 +49,7 @@ def simulate(
     beam_list: Sequence[UVBeam | Callable] | None,
     polarized: bool = False,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+    matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
     beam_idx: np.ndarray | None = None,
     max_memory: int = np.inf,
     min_chunks: int = 1,
@@ -120,7 +121,7 @@ def simulate(
         ctype=ctype,
     )
 
     mpcls = getattr(mp, matprod_method)
-    matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+    matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
 
     npixc = nsrc // nchunks
diff --git a/src/matvis/gpu/matprod.py b/src/matvis/gpu/matprod.py
index 6bff898..5bfd1f2 100644
--- a/src/matvis/gpu/matprod.py
+++ b/src/matvis/gpu/matprod.py
@@ -92,3 +92,53 @@ def sum_chunks(self, out: np.ndarray):
         out[:] = self.vis[0].transpose((2, 0, 1)).get()
 
         cp.cuda.Device().synchronize()
+
+
+class GPUMatChunk(MatProd):
+    """Compute sub-matrix products covering all non-redundant pairs, on the GPU."""
+
+    def allocate_vis(self):
+        """Allocate the visibilities and move the sub-matrix indices to the GPU."""
+        if self.matsets is not None:
+            self.matsets = [
+                (cp.asarray(ai), cp.asarray(aj)) for ai, aj in self.matsets
+            ]
+        self.vis = [
+            cp.full(
+                (self.nfeed, self.nfeed, self.npairs), 0.0, dtype=self.ctype, order="F"
+            )
+            for _ in range(self.nchunks)
+        ]
+
+    def compute(self, z: cp.ndarray, out: cp.ndarray) -> cp.ndarray:
+        """Perform the source-summing operation for a single time and chunk."""
+        z = z.reshape((self.nant, self.nfeed, -1))
+
+        mat_product = cp.zeros(
+            (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+        )
+
+        # Fill each requested sub-matrix block of the full antenna-antenna product.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for ai, aj in self.matsets:
+                    AI, AJ = cp.meshgrid(ai, aj)
+                    mat_product[AI, AJ, j, k] = z[ai, j].conj().dot(z[aj, k].T).T
+
+        # Pick out the requested (non-redundant) pairs into the output array.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for i, (ai, aj) in enumerate(self.antpairs):
+                    out[j, k, i] = mat_product[ai, aj, j, k]
+
+        cp.cuda.Device().synchronize()
+        return out
+
+    def sum_chunks(self, out: np.ndarray):
+        """Sum the chunks into the output array."""
+        if self.nchunks > 1:
+            for i in range(1, len(self.vis)):
+                self.vis[0] += self.vis[i]
+
+        out[:] = self.vis[0].transpose((2, 0, 1)).get()
+        cp.cuda.Device().synchronize()
diff --git a/src/matvis/submatrices.py b/src/matvis/submatrices.py
new file mode 100644
index 0000000..f755f2d
--- /dev/null
+++ b/src/matvis/submatrices.py
@@ -0,0 +1,22 @@
+"""Functions for constructing sub-matrix sets for the MatChunk matprod methods."""
+import numpy as np
+
+
+def get_matrix_sets(bls, ndecimals: int = 2):
+    """Construct one single-element sub-matrix for each non-redundant baseline."""
+    uvbins = set()
+    msets = []
+
+    # Everything here is in wavelengths
+    bls = np.round(bls, decimals=ndecimals)
+    nant = bls.shape[0]
+
+    # Group redundant baselines, keeping one representative pair per group.
+    for i in range(nant):
+        for j in range(i + 1, nant):
+            u, v = bls[i, j]
+            if (u, v) not in uvbins and (-u, -v) not in uvbins:
+                uvbins.add((u, v))
+                msets.append([np.array([i]), np.array([j])])
+
+    return msets
diff --git a/src/matvis/wrapper.py b/src/matvis/wrapper.py
index 9c7f7a1..831ca5a 100644
--- a/src/matvis/wrapper.py
+++ b/src/matvis/wrapper.py
@@ -28,6 +28,8 @@ def simulate_vis(
     beam_spline_opts: dict | None = None,
     beam_idx: np.ndarray | None = None,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+    matsets: list[tuple[np.ndarray[int], np.ndarray[int]]]
+    | None = None,  # sets of sub-matrices for the MatChunk methods
     source_buffer: float = 1.0,
     **backend_kwargs,
 ):
@@ -150,6 +152,7 @@ def simulate_vis(
         beam_spline_opts=beam_spline_opts,
         beam_idx=beam_idx,
         antpairs=antpairs,
+        matsets=matsets,
         source_buffer=source_buffer,
         **backend_kwargs,
     )
diff --git a/tests/test_matprod.py b/tests/test_matprod.py
index 154813b..6bf39bb 100644
--- a/tests/test_matprod.py
+++ b/tests/test_matprod.py
@@ -1,7 +1,8 @@
 """Tests that the various matprod methods produce the same result."""
+
 import pytest
 import numpy as np
 
 from matvis._utils import get_dtypes
 
@@ -16,13 +17,22 @@ def simple_matprod(z):
 @pytest.mark.parametrize("antpairs", [True, False])
+@pytest.mark.parametrize("matsets", [True, False])
 @pytest.mark.parametrize("precision", [1, 2])
 @pytest.mark.parametrize(
-    "method", ["CPUMatMul", "CPUVectorDot", "GPUMatMul", "GPUVectorDot"]
+    "method",
+    [
+        "CPUMatMul",
+        "CPUVectorDot",
+        "CPUMatChunk",
+        "GPUMatMul",
+        "GPUVectorDot",
+        "GPUMatChunk",
+    ],
 )
-def test_matprod(nfeed, antpairs, precision, method):
+def test_matprod(nfeed, antpairs, matsets, precision, method):
     """Test that the various matprod methods produce the same result."""
     if method.startswith("GPU"):
         pytest.importorskip("cupy")
         import cupy as cp
 
         from matvis.gpu import matprod as module
     else:
@@ -36,7 +46,27 @@
     else:
         antpairs = None
 
-    obj = cls(nchunks=1, nfeed=nfeed, nant=nant, antpairs=antpairs, precision=precision)
+    if matsets:
+        matsets = [
+            (np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3])),
+            (np.array([0, 1, 2, 3]), np.array([3, 4])),
+            (np.array([3, 4]), np.array([0, 1, 2, 3])),
+            (np.array([3, 4]), np.array([3, 4])),
+        ]
+    else:
+        matsets = None
+
+    if method.endswith("MatChunk") and (antpairs is None or matsets is None):
+        pytest.skip("the MatChunk methods need both antpairs and matsets")
+
+    obj = cls(
+        nchunks=1,
+        nfeed=nfeed,
+        nant=nant,
+        antpairs=antpairs,
+        matsets=matsets,
+        precision=precision,
+    )
     obj.setup()
 
     ctype = get_dtypes(precision)[1]
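For reference, a minimal sketch (not part of the patch) of what `get_matrix_sets` returns and how it relates to `antpairs`. The three-antenna layout is invented purely for illustration:

```python
import numpy as np

from matvis.submatrices import get_matrix_sets

# Toy east-west array with one redundant spacing: antennas at 0, 10 and 20 metres.
antpos = np.array([[0.0, 0.0], [10.0, 0.0], [20.0, 0.0]])
bls = antpos[np.newaxis, :, :] - antpos[:, np.newaxis, :]  # (nant, nant, 2) baselines

matsets = get_matrix_sets(bls)
# One single-element sub-matrix ([rows], [cols]) per non-redundant baseline.
# (0, 1) and (1, 2) are redundant, so only (0, 1) and (0, 2) survive:
#   [[array([0]), array([1])], [array([0]), array([2])]]
antpairs = [(int(ai[0]), int(aj[0])) for ai, aj in matsets]
print(antpairs)  # [(0, 1), (0, 2)]
```

These `matsets`/`antpairs` can then be passed together to `simulate_vis` (or `cpu.simulate`/`gpu.simulate`) with `matprod_method="CPUMatChunk"` or `"GPUMatChunk"`.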
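And a sketch of what the MatChunk computation does with those index sets, using the `CPUMatChunk` class added above. The antenna pairs and sub-matrix groupings below are made up; the constructor call mirrors the updated test:

```python
import numpy as np

from matvis.cpu.matprod import CPUMatChunk

nant, nsrc = 5, 100
rng = np.random.default_rng(0)
z = rng.normal(size=(nant, nsrc)) + 1j * rng.normal(size=(nant, nsrc))  # nfeed == 1

# Pairs we actually want, and sub-matrices (rows, cols) that collectively cover them.
antpairs = np.array([(0, 1), (0, 2), (1, 3), (3, 4)])
matsets = [
    (np.array([0, 1]), np.array([1, 2, 3])),  # covers (0, 1), (0, 2), (1, 3)
    (np.array([3]), np.array([4])),           # covers (3, 4)
]

obj = CPUMatChunk(
    nchunks=1, nfeed=1, nant=nant, antpairs=antpairs, matsets=matsets, precision=2
)
obj.setup()  # also verifies that every requested pair is covered by some sub-matrix

out = np.zeros((len(antpairs), 1, 1), dtype=complex)
obj.compute(z, out)

# Each requested pair matches the corresponding element of the full product.
full = z.conj() @ z.T
assert np.allclose(out[:, 0, 0], [full[ai, aj] for ai, aj in antpairs])
```

Only the blocks named in `matsets` are ever multiplied, which is the point of the method: fewer, denser matrix products than `CPUVectorDot`, without computing every antenna pair the way `CPUMatMul` does.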