diff --git a/src/matvis/cli.py b/src/matvis/cli.py
index c6dbcd1..96c591f 100644
--- a/src/matvis/cli.py
+++ b/src/matvis/cli.py
@@ -21,6 +21,7 @@ from pathlib import Path
 from pyuvdata import UVBeam
 from pyuvsim import AnalyticBeam, simsetup
 from typing import Literal
 
 from matvis import DATA_PATH, HAVE_GPU, coordinates, cpu, simulate_vis
+from matvis.submatrices import get_matrix_sets
 
@@ -78,6 +79,7 @@ def run_profile(
     naz=360,
     nza=180,
     pairs=None,
+    matsets=None,
     nchunks=1,
     source_buffer=1.0,
 ):
@@ -136,6 +138,7 @@ def run_profile(
         beam_idx=beam_idx,
         matprod_method=f"{'GPU' if gpu else 'CPU'}{method}",
         antpairs=pairs,
+        matsets=matsets,
         min_chunks=nchunks,
         source_buffer=source_buffer,
     )
@@ -278,8 +281,9 @@
 )
 @click.option("-k", "--keep-ants", type=str, default="")
 @click.option("--outriggers/--no-outriggers", default=False)
+@click.option("-msm", "--matset-method", type=str, default="")
 @add_common_options
-def hera_profile(hex_num, nside, keep_ants, outriggers, **kwargs):
+def hera_profile(hex_num, nside, keep_ants, outriggers, matset_method, **kwargs):
     """Run profiling of matvis with a HERA-like array."""
 
     from py21cmsense.antpos import hera
@@ -291,7 +295,19 @@
     bls = antpos[np.newaxis, :, :2] - antpos[:, np.newaxis, :2]
     pairs = np.array(get_redundancies(bls.value))
 
-    run_profile(nsource=12 * nside**2, nants=antpos.shape[0], pairs=pairs, **kwargs)
+    if matset_method == "":
+        matsets = None
+    else:
+        # Any non-empty value currently selects the single available construction.
+        matsets = get_matrix_sets(bls.value)
+
+    run_profile(
+        nsource=12 * nside**2,
+        nants=antpos.shape[0],
+        pairs=pairs,
+        matsets=matsets,
+        **kwargs,
+    )
 
 
 def get_line_based_stats(lstats) -> tuple[dict, float]:
diff --git a/src/matvis/core/matprod.py b/src/matvis/core/matprod.py
index dbe49e9..788a9eb 100644
--- a/src/matvis/core/matprod.py
+++ b/src/matvis/core/matprod.py
@@ -20,6 +20,9 @@ class MatProd(ABC):
         Number of antennas.
     antpairs
         The antenna pairs to sum over. If None, all pairs are used.
+    matsets
+        The list of sub-matrices to compute, as (row, column) antenna-index array
+        pairs. If None, all pairs are used in a single matrix multiplication.
     precision
         The precision of the data (1 or 2).
     """
@@ -29,16 +32,19 @@ def __init__(
         nchunks: int,
         nfeed: int,
         nant: int,
-        antpairs: np.ndarray | None,
+        antpairs: np.ndarray | list | None,
+        matsets: list | None = None,
         precision=1,
     ):
         if antpairs is None:
             self.all_pairs = True
             self.antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
+            self.matsets = None
         else:
             self.all_pairs = False
             self.antpairs = antpairs
+            self.matsets = matsets
 
         self.nchunks = nchunks
         self.nfeed = nfeed
         self.nant = nant
@@ -60,10 +66,33 @@ def allocate_vis(self):
             (self.nchunks, self.npairs, self.nfeed, self.nfeed), 0.0, dtype=self.ctype
         )
 
+    def check_antpairs_in_matsets(self):
+        """Check that all non-redundant antenna pairs appear in the sub-matrices.
+
+        If using a MatChunk method, every non-redundant antenna pair must appear in
+        the set of sub-matrices to be calculated; otherwise an error is raised.
+ """ + antpairs_set = set() + matsets_set = set() + + for _, (ai, aj) in enumerate(self.antpairs): + antpairs_set.add((ai, aj)) + + for _, (ai, aj) in enumerate(self.matsets): + for i in range(len(ai)): + for j in range(len(aj)): + matsets_set.add((ai[i], aj[j])) + + if not antpairs_set.issubset(matsets_set): + raise Exception("Some non-redundant pairs are missing from sub-matrices. ") + def setup(self): """Setup the memory for the object.""" self.allocate_vis() + if self.matsets is not None: + self.check_antpairs_in_matsets() + @abstractmethod def compute(self, z: np.ndarray, out: np.ndarray): """ diff --git a/src/matvis/cpu/cpu.py b/src/matvis/cpu/cpu.py index 34f23dd..99c12dc 100644 --- a/src/matvis/cpu/cpu.py +++ b/src/matvis/cpu/cpu.py @@ -32,6 +32,7 @@ def simulate( I_sky: np.ndarray, beam_list: Sequence[UVBeam | Callable] | None, antpairs: np.ndarray | list[tuple[int, int]] | None = None, + matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None, precision: int = 1, polarized: bool = False, beam_idx: np.ndarray | None = None, @@ -81,6 +82,13 @@ def simulate( Either a 2D array, shape ``(Npairs, 2)``, or list of 2-tuples of ints, with the list of antenna-pairs to return as visibilities (all feed-pairs are always calculated). If None, all feed-pairs are returned. + matsets : array_like, optional + A list of containing a set of 2-tuples of numpy arrays. Each entry in the list + corresponds to a different sub-matrix. The first element of the ith tuple lists the rows of Z corresponding + to the rows of the ith sub-matrix, and the second tuple element contains the list of columns. In the case + of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only + one int, such that the sub-matrices contain only one element each. Outer dimension has to be a list because + numpy doesn't like inhomogeneous arrays. precision : int, optional Which precision level to use for floats and complex numbers. Allowed values: @@ -101,14 +109,15 @@ def simulate( allows). Default is 100. matprod_method : str, optional The method to use for the final matrix multiplication. Default is 'CPUMatMul', - which simply uses `np.dot` over the two full matrices. Currently, the other - option is `CPUVectorLoop`, which uses a loop over the antenna pairs, - computing the sum over sources as a vector dot product. - Whether to calculate visibilities for each antpair in antpairs as a vector - dot-product instead of using a full matrix-matrix multiplication for all - possible pairs. Default is False. Setting to True can be faster for large - arrays where `antpairs` is small (possibly from high redundancy). You should - run a performance test before using this. + which simply uses `np.dot` over the two full matrices. The second option is + `CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum + over sources as a vector dot product. The third option is `CPUMatChunk`, which + divides the product into several matrix multiplications that are optimized to have the + highest possible density of non-redundant pairs without invoking the overhead of a large + for loop over vector products. For very large arrays, with very high redundancy, CPUVectorLoop + is usually fastest, while for cases with low redundancy, `CPUMatMul` is fastest. The `CPUMatChunk` + can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays. You should + run a performance test before choosing one of these options. 
     max_memory : int, optional
         The maximum memory (in bytes) to use for the visibility calculation. This is
         not a hard-set limit, but rather a guideline for how much memory to use. If the
@@ -175,7 +184,7 @@ def simulate(
         source_buffer=source_buffer,
     )
     mpcls = getattr(mp, matprod_method)
-    matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+    matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
     zcalc = ZMatrixCalc(
         nsrc=nsrc_alloc,
         nfeed=nfeed,
diff --git a/src/matvis/cpu/matprod.py b/src/matvis/cpu/matprod.py
index 2e1191c..2b96d34 100644
--- a/src/matvis/cpu/matprod.py
+++ b/src/matvis/cpu/matprod.py
@@ -50,3 +50,38 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
             out[i] = z[ai].conj().dot(z[aj].T)
 
         return out
+
+
+class CPUMatChunk(MatProd):
+    """Compute sub-matrix products that together cover all non-redundant pairs."""
+
+    def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
+        """Perform the source-summing operation for a single time and chunk.
+
+        Parameters
+        ----------
+        z
+            Complex integrand. Shape=(Nfeed, Nant, Nax, Nsrc).
+        out
+            Output array, shaped as (Npairs, Nfeed, Nfeed).
+        """
+        z = z.reshape((self.nant, self.nfeed, -1))
+
+        mat_product = np.zeros(
+            (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+        )
+
+        # Fill each requested sub-matrix block of the full antenna-antenna product.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for ai, aj in self.matsets:
+                    AI, AJ = np.meshgrid(ai, aj)
+                    mat_product[AI, AJ, j, k] = z[ai, j].conj().dot(z[aj, k].T).T
+
+        # Pick out the requested (non-redundant) pairs into the output array.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for i, (ai, aj) in enumerate(self.antpairs):
+                    out[i, j, k] = mat_product[ai, aj, j, k]
+
+        return out
diff --git a/src/matvis/gpu/gpu.py b/src/matvis/gpu/gpu.py
index 9bf82ec..b93ee02 100644
--- a/src/matvis/gpu/gpu.py
+++ b/src/matvis/gpu/gpu.py
@@ -49,6 +49,7 @@ def simulate(
     beam_list: Sequence[UVBeam | Callable] | None,
     polarized: bool = False,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+    matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
     beam_idx: np.ndarray | None = None,
     max_memory: int = np.inf,
     min_chunks: int = 1,
@@ -120,7 +121,7 @@ def simulate(
         ctype=ctype,
     )
 
     mpcls = getattr(mp, matprod_method)
-    matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+    matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
 
     npixc = nsrc // nchunks
diff --git a/src/matvis/gpu/matprod.py b/src/matvis/gpu/matprod.py
index 6bff898..5bfd1f2 100644
--- a/src/matvis/gpu/matprod.py
+++ b/src/matvis/gpu/matprod.py
@@ -92,3 +92,53 @@ def sum_chunks(self, out: np.ndarray):
         out[:] = self.vis[0].transpose((2, 0, 1)).get()
 
         cp.cuda.Device().synchronize()
+
+
+class GPUMatChunk(MatProd):
+    """Compute sub-matrix products covering all non-redundant pairs, on the GPU."""
+
+    def allocate_vis(self):
+        """Allocate the visibilities and move the sub-matrix indices to the GPU."""
+        if self.matsets is not None:
+            self.matsets = [
+                (cp.asarray(ai), cp.asarray(aj)) for ai, aj in self.matsets
+            ]
+        self.vis = [
+            cp.full(
+                (self.nfeed, self.nfeed, self.npairs), 0.0, dtype=self.ctype, order="F"
+            )
+            for _ in range(self.nchunks)
+        ]
+
+    def compute(self, z: cp.ndarray, out: cp.ndarray) -> cp.ndarray:
+        """Perform the source-summing operation for a single time and chunk."""
+        z = z.reshape((self.nant, self.nfeed, -1))
+
+        mat_product = cp.zeros(
+            (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+        )
+
+        # Fill each requested sub-matrix block of the full antenna-antenna product.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for ai, aj in self.matsets:
+                    AI, AJ = cp.meshgrid(ai, aj)
+                    mat_product[AI, AJ, j, k] = z[ai, j].conj().dot(z[aj, k].T).T
+
+        # Pick out the requested (non-redundant) pairs into the output array.
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for i, (ai, aj) in enumerate(self.antpairs):
+                    out[j, k, i] = mat_product[ai, aj, j, k]
+
+        cp.cuda.Device().synchronize()
+        return out
+
+    def sum_chunks(self, out: np.ndarray):
+        """Sum the chunks into the output array."""
+        if self.nchunks > 1:
+            for i in range(1, len(self.vis)):
+                self.vis[0] += self.vis[i]
+
+        out[:] = self.vis[0].transpose((2, 0, 1)).get()
+        cp.cuda.Device().synchronize()
diff --git a/src/matvis/submatrices.py b/src/matvis/submatrices.py
new file mode 100644
index 0000000..f755f2d
--- /dev/null
+++ b/src/matvis/submatrices.py
@@ -0,0 +1,22 @@
+"""Functions for constructing sub-matrix sets for the MatChunk matprod methods."""
+import numpy as np
+
+
+def get_matrix_sets(bls, ndecimals: int = 2):
+    """Construct one single-element sub-matrix for each non-redundant baseline."""
+    uvbins = set()
+    msets = []
+
+    # Everything here is in wavelengths
+    bls = np.round(bls, decimals=ndecimals)
+    nant = bls.shape[0]
+
+    # Group redundant baselines, keeping one representative pair per group.
+    for i in range(nant):
+        for j in range(i + 1, nant):
+            u, v = bls[i, j]
+            if (u, v) not in uvbins and (-u, -v) not in uvbins:
+                uvbins.add((u, v))
+                msets.append([np.array([i]), np.array([j])])
+
+    return msets
diff --git a/src/matvis/wrapper.py b/src/matvis/wrapper.py
index 9c7f7a1..831ca5a 100644
--- a/src/matvis/wrapper.py
+++ b/src/matvis/wrapper.py
@@ -28,6 +28,8 @@ def simulate_vis(
     beam_spline_opts: dict | None = None,
     beam_idx: np.ndarray | None = None,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+    matsets: list[tuple[np.ndarray[int], np.ndarray[int]]]
+    | None = None,  # sets of sub-matrices for the MatChunk methods
     source_buffer: float = 1.0,
     **backend_kwargs,
 ):
@@ -150,6 +152,7 @@ def simulate_vis(
         beam_spline_opts=beam_spline_opts,
         beam_idx=beam_idx,
         antpairs=antpairs,
+        matsets=matsets,
         source_buffer=source_buffer,
         **backend_kwargs,
     )
diff --git a/tests/test_matprod.py b/tests/test_matprod.py
index 154813b..6bf39bb 100644
--- a/tests/test_matprod.py
+++ b/tests/test_matprod.py
@@ -1,7 +1,8 @@
 """Tests that the various matprod methods produce the same result."""
+
 import pytest
 import numpy as np
 
 from matvis._utils import get_dtypes
 
@@ -16,13 +17,22 @@ def simple_matprod(z):
 @pytest.mark.parametrize("antpairs", [True, False])
+@pytest.mark.parametrize("matsets", [True, False])
 @pytest.mark.parametrize("precision", [1, 2])
 @pytest.mark.parametrize(
-    "method", ["CPUMatMul", "CPUVectorDot", "GPUMatMul", "GPUVectorDot"]
+    "method",
+    [
+        "CPUMatMul",
+        "CPUVectorDot",
+        "CPUMatChunk",
+        "GPUMatMul",
+        "GPUVectorDot",
+        "GPUMatChunk",
+    ],
 )
-def test_matprod(nfeed, antpairs, precision, method):
+def test_matprod(nfeed, antpairs, matsets, precision, method):
     """Test that the various matprod methods produce the same result."""
     if method.startswith("GPU"):
         pytest.importorskip("cupy")
         import cupy as cp
 
         from matvis.gpu import matprod as module
     else:
@@ -36,7 +46,27 @@
     else:
         antpairs = None
 
-    obj = cls(nchunks=1, nfeed=nfeed, nant=nant, antpairs=antpairs, precision=precision)
+    if matsets:
+        matsets = [
+            (np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3])),
+            (np.array([0, 1, 2, 3]), np.array([3, 4])),
+            (np.array([3, 4]), np.array([0, 1, 2, 3])),
+            (np.array([3, 4]), np.array([3, 4])),
+        ]
+    else:
+        matsets = None
+
+    if method.endswith("MatChunk") and (antpairs is None or matsets is None):
+        pytest.skip("the MatChunk methods need both antpairs and matsets")
+
+    obj = cls(
+        nchunks=1,
+        nfeed=nfeed,
+        nant=nant,
+        antpairs=antpairs,
+        matsets=matsets,
+        precision=precision,
+    )
     obj.setup()
 
     ctype = get_dtypes(precision)[1]
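For reference, a minimal sketch (not part of the patch) of what `get_matrix_sets` returns and how it relates to `antpairs`. The three-antenna layout is invented purely for illustration:

```python
import numpy as np

from matvis.submatrices import get_matrix_sets

# Toy east-west array with one redundant spacing: antennas at 0, 10 and 20 metres.
antpos = np.array([[0.0, 0.0], [10.0, 0.0], [20.0, 0.0]])
bls = antpos[np.newaxis, :, :] - antpos[:, np.newaxis, :]  # (nant, nant, 2) baselines

matsets = get_matrix_sets(bls)
# One single-element sub-matrix ([rows], [cols]) per non-redundant baseline.
# (0, 1) and (1, 2) are redundant, so only (0, 1) and (0, 2) survive:
#   [[array([0]), array([1])], [array([0]), array([2])]]
antpairs = [(int(ai[0]), int(aj[0])) for ai, aj in matsets]
print(antpairs)  # [(0, 1), (0, 2)]
```

These `matsets`/`antpairs` can then be passed together to `simulate_vis` (or `cpu.simulate`/`gpu.simulate`) with `matprod_method="CPUMatChunk"` or `"GPUMatChunk"`.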
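And a sketch of what the MatChunk computation does with those index sets, using the `CPUMatChunk` class added above. The antenna pairs and sub-matrix groupings below are made up; the constructor call mirrors the updated test:

```python
import numpy as np

from matvis.cpu.matprod import CPUMatChunk

nant, nsrc = 5, 100
rng = np.random.default_rng(0)
z = rng.normal(size=(nant, nsrc)) + 1j * rng.normal(size=(nant, nsrc))  # nfeed == 1

# Pairs we actually want, and sub-matrices (rows, cols) that collectively cover them.
antpairs = np.array([(0, 1), (0, 2), (1, 3), (3, 4)])
matsets = [
    (np.array([0, 1]), np.array([1, 2, 3])),  # covers (0, 1), (0, 2), (1, 3)
    (np.array([3]), np.array([4])),           # covers (3, 4)
]

obj = CPUMatChunk(
    nchunks=1, nfeed=1, nant=nant, antpairs=antpairs, matsets=matsets, precision=2
)
obj.setup()  # also verifies that every requested pair is covered by some sub-matrix

out = np.zeros((len(antpairs), 1, 1), dtype=complex)
obj.compute(z, out)

# Each requested pair matches the corresponding element of the full product.
full = z.conj() @ z.T
assert np.allclose(out[:, 0, 0], [full[ai, aj] for ai, aj in antpairs])
```

Only the blocks named in `matsets` are ever multiplied, which is the point of the method: fewer, denser matrix products than `CPUVectorDot`, without computing every antenna pair the way `CPUMatMul` does.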