From 2921f7d1151c0d91bb6b04ee43924055ab2678f3 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 7 Mar 2023 12:05:07 +0000 Subject: [PATCH 01/75] Sparse FDM preconditioner for the de Rham complex --- firedrake/preconditioners/fdm.py | 1513 ++++++++++++++++++++++++------ tests/regression/test_fdm.py | 149 ++- 2 files changed, 1342 insertions(+), 320 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 826e1f4447..4542178d7c 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1,11 +1,17 @@ -from functools import lru_cache, partial +from functools import partial, lru_cache +from itertools import product +from pyop2.sparsity import get_preallocation from firedrake.petsc import PETSc from firedrake.preconditioners.base import PCBase +from firedrake.preconditioners.facet_split import split_dofs, restricted_dofs +from firedrake_citations import Citations import firedrake.dmhooks as dmhooks import firedrake +import ctypes import numpy import ufl -from firedrake_citations import Citations +import FIAT +import finat Citations().add("Brubeck2021", """ @misc{Brubeck2021, @@ -18,32 +24,42 @@ } """) -__all__ = ("FDMPC",) +__all__ = ("FDMPC", "PoissonFDMPC") class FDMPC(PCBase): """ A preconditioner for tensor-product elements that changes the shape - functions so that the H^1 Riesz map is diagonalized in the interior of a - Cartesian cell, and assembles a global sparse matrix on which other - preconditioners, such as `ASMStarPC`, can be applied. + functions so that the H(d) Riesz map is sparse on Cartesian cells, + and assembles a global sparse matrix on which other preconditioners, + such as `ASMStarPC`, can be applied. Here we assume that the volume integrals in the Jacobian can be expressed as: - inner(grad(v), alpha(grad(u)))*dx + inner(v, beta(u))*dx + inner(d(v), alpha(d(u)))*dx + inner(v, beta(u))*dx where alpha and beta are linear functions (tensor contractions). - The sparse matrix is obtained by approximating alpha and beta by cell-wise - constants and discarding the coefficients in alpha that couple together - mixed derivatives and mixed components. - - For spaces that are not H^1-conforming, this preconditioner will use - the symmetric interior-penalty DG method. The penalty coefficient can be - provided in the application context, keyed on ``"eta"``. 
+ The sparse matrix is obtained by approximating (v, alpha u) and (v, beta u) as + diagonal mass matrices """ _prefix = "fdm_" + _variant = "fdm" + + _reference_tensor_cache = {} + _coefficient_cache = {} + _c_code_cache = {} + + @staticmethod + def load_set_values(triu=False): + cache = FDMPC._c_code_cache + key = triu + if key not in cache: + comm = PETSc.COMM_SELF + cache[key] = load_assemble_csr(comm, triu=triu) + return cache[key] + @PETSc.Log.EventDecorator("FDMInit") def initialize(self, pc): from firedrake.assemble import allocate_matrix, assemble @@ -52,77 +68,91 @@ def initialize(self, pc): Citations().register("Brubeck2021") self.comm = pc.comm - + Amat, Pmat = pc.getOperators() prefix = pc.getOptionsPrefix() options_prefix = prefix + self._prefix + options = PETSc.Options(options_prefix) + + use_amat = options.getBool("pc_use_amat", True) + pmat_type = options.getString("mat_type", PETSc.Mat.Type.AIJ) + diagonal_scale = options.getBool("diagonal_scale", False) appctx = self.get_appctx(pc) fcp = appctx.get("form_compiler_parameters") + self.appctx = appctx # Get original Jacobian form and bcs - octx = dmhooks.get_appctx(pc.getDM()) - mat_type = octx.mat_type - oproblem = octx._problem - J = oproblem.J - bcs = tuple(oproblem.bcs) + if Pmat.getType() == "python": + ctx = Pmat.getPythonContext() + J = ctx.a + bcs = tuple(ctx.bcs) + mat_type = "matfree" + else: + ctx = dmhooks.get_appctx(pc.getDM()) + J = ctx.Jp or ctx.J + bcs = tuple(ctx._problem.bcs) + mat_type = ctx.mat_type + + if isinstance(J, firedrake.slate.Add): + J = J.children[0].form + assert type(J) == ufl.Form # Transform the problem into the space with FDM shape functions - V = J.arguments()[0].function_space() + V = J.arguments()[-1].function_space() element = V.ufl_element() - e_fdm = element.reconstruct(variant="fdm") - - def interp_nullspace(I, nsp): - if not nsp: - return nsp - vectors = [] - for x in nsp.getVecs(): - y = I.createVecLeft() - I.mult(x, y) - vectors.append(y) - if nsp.hasConstant(): - y = I.createVecLeft() - x = I.createVecRight() - x.set(1.0E0) - I.mult(x, y) - vectors.append(y) - x.destroy() - return PETSc.NullSpace().create(constant=False, vectors=vectors, comm=nsp.getComm()) - - # Matrix-free assembly of the transformed Jacobian + e_fdm = element.reconstruct(variant=self._variant) + if element == e_fdm: V_fdm, J_fdm, bcs_fdm = (V, J, bcs) - Amat, _ = pc.getOperators() - self._ctx_ref = octx else: + # Matrix-free assembly of the transformed Jacobian V_fdm = firedrake.FunctionSpace(V.mesh(), e_fdm) - J_fdm = ufl.replace(J, {t: t.reconstruct(function_space=V_fdm) for t in J.arguments()}) - bcs_fdm = tuple(bc.reconstruct(V=V_fdm) for bc in bcs) - self.fdm_interp = prolongation_matrix_matfree(V, V_fdm, [], bcs_fdm) - self.A = allocate_matrix(J_fdm, bcs=bcs_fdm, form_compiler_parameters=fcp, mat_type=mat_type, - options_prefix=options_prefix) - self._assemble_A = partial(assemble, J_fdm, tensor=self.A, bcs=bcs_fdm, - form_compiler_parameters=fcp, mat_type=mat_type) - self._assemble_A() - Amat = self.A.petscmat + J_fdm = J(*[t.reconstruct(function_space=V_fdm) for t in J.arguments()], coefficients={}) + bcs_fdm = tuple(bc.reconstruct(V=V_fdm, g=0) for bc in bcs) - omat, _ = pc.getOperators() - inject = prolongation_matrix_matfree(V_fdm, V, [], []) - Amat.setNullSpace(interp_nullspace(inject, omat.getNullSpace())) - Amat.setTransposeNullSpace(interp_nullspace(inject, omat.getTransposeNullSpace())) - Amat.setNearNullSpace(interp_nullspace(inject, omat.getNearNullSpace())) + self.fdm_interp = 
prolongation_matrix_matfree(V, V_fdm, [], bcs_fdm) self.work_vec_x = Amat.createVecLeft() self.work_vec_y = Amat.createVecRight() - + if use_amat: + omat = Amat + self.A = allocate_matrix(J_fdm, bcs=bcs_fdm, form_compiler_parameters=fcp, + mat_type=mat_type, options_prefix=options_prefix) + self._assemble_A = partial(assemble, J_fdm, tensor=self.A, bcs=bcs_fdm, + form_compiler_parameters=fcp, mat_type=mat_type) + self._assemble_A() + Amat = self.A.petscmat + + def interp_nullspace(I, nsp): + if not nsp.handle: + return nsp + vectors = [] + for x in nsp.getVecs(): + y = I.createVecLeft() + I.mult(x, y) + vectors.append(y) + if nsp.hasConstant(): + y = I.createVecLeft() + x = I.createVecRight() + x.set(1.0E0) + I.mult(x, y) + vectors.append(y) + x.destroy() + return PETSc.NullSpace().create(constant=False, vectors=vectors, comm=nsp.getComm()) + + inject = prolongation_matrix_matfree(V_fdm, V, [], []) + Amat.setNullSpace(interp_nullspace(inject, omat.getNullSpace())) + Amat.setTransposeNullSpace(interp_nullspace(inject, omat.getTransposeNullSpace())) + Amat.setNearNullSpace(interp_nullspace(inject, omat.getNearNullSpace())) + + if len(bcs) > 0: + self.bc_nodes = numpy.unique(numpy.concatenate([bcdofs(bc, ghost=False) for bc in bcs])) + else: + self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType) self._ctx_ref = self.new_snes_ctx(pc, J_fdm, bcs_fdm, mat_type, fcp=fcp, options_prefix=options_prefix) - if len(bcs) > 0: - self.bc_nodes = numpy.unique(numpy.concatenate([bcdofs(bc, ghost=False) for bc in bcs])) - else: - self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType) - # Assemble the FDM preconditioner with sparse local matrices - Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, appctx) + Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, appctx, pmat_type, diagonal_scale) self._assemble_P() Pmat.setNullSpace(Amat.getNullSpace()) Pmat.setTransposeNullSpace(Amat.getTransposeNullSpace()) @@ -136,18 +166,174 @@ def interp_nullspace(I, nsp): # We set a DM and an appropriate SNESContext on the constructed PC so one # can do e.g. multigrid or patch solves. - fdm_dm = V_fdm.dm - self._dm = fdm_dm - - fdmpc.setDM(fdm_dm) + self._dm = V_fdm.dm + fdmpc.setDM(self._dm) fdmpc.setOptionsPrefix(options_prefix) fdmpc.setOperators(A=Amat, P=Pmat) - fdmpc.setUseAmat(True) + fdmpc.setUseAmat(use_amat) self.pc = fdmpc - - with dmhooks.add_hooks(fdm_dm, self, appctx=self._ctx_ref, save=False): + if hasattr(self, "_ctx_ref"): + with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref, save=False): + fdmpc.setFromOptions() + else: fdmpc.setFromOptions() + @PETSc.Log.EventDecorator("FDMPrealloc") + def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, appctx, pmat_type, diagonal_scale): + """ + Assemble the sparse preconditioner with cell-wise constant coefficients. 
+ + :arg V: the :class:`firedrake.FunctionSpace` of the form arguments + :arg J: the Jacobian bilinear form + :arg bcs: an iterable of boundary conditions on V + :arg appctx: the application context + + :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable + """ + ifacet, = numpy.nonzero([is_restricted(Vsub.finat_element)[1] for Vsub in V]) + if len(ifacet) == 0: + Vfacet = None + Vbig = V + _, fdofs = split_dofs(V.finat_element) + elif len(ifacet) == 1: + Vfacet = V[ifacet[0]] + ebig, = set(unrestrict_element(Vsub.ufl_element()) for Vsub in V) + Vbig = firedrake.FunctionSpace(V.mesh(), ebig) + if len(V) > 1: + dims = [Vsub.finat_element.space_dimension() for Vsub in V] + assert sum(dims) == Vbig.finat_element.space_dimension() + fdofs = restricted_dofs(Vfacet.finat_element, Vbig.finat_element) + else: + raise ValueError("Expecting at most one FunctionSpace restricted onto facets.") + + value_size = Vbig.value_size + if value_size != 1: + fdofs = numpy.add.outer(value_size * fdofs, numpy.arange(value_size, dtype=fdofs.dtype)) + dofs = numpy.arange(value_size * Vbig.finat_element.space_dimension(), dtype=fdofs.dtype) + idofs = numpy.setdiff1d(dofs, fdofs, assume_unique=True) + self.ises = tuple(PETSc.IS().createGeneral(indices, comm=PETSc.COMM_SELF) for indices in (idofs, fdofs)) + self.submats = [None for _ in range(7)] + + self.reference_tensor_on_diag = dict() + self.get_static_condensation = dict() + if Vfacet: + # If we are in a facet space, we build the Schur complement on its diagonal block + self.reference_tensor_on_diag[Vfacet] = self.assemble_reference_tensor(Vbig) + self.get_static_condensation[Vfacet] = lambda A: condense_element_mat(A, self.ises[0], self.ises[1], self.submats) + + elif len(fdofs) and V.finat_element.formdegree == 0: + # If we are in H(grad), we just pad with zeros on the statically-condensed pattern + i1 = PETSc.IS().createGeneral(dofs, comm=PETSc.COMM_SELF) + self.get_static_condensation[V] = lambda Ae: condense_element_pattern(Ae, self.ises[0], i1, self.submats) + + # dict of cell to global mappings for each function space + self.cell_to_global = dict() + self.lgmaps = dict() + + @PETSc.Log.EventDecorator("FDMGetIndices") + def cell_to_global(lgmap, cell_to_local, cell_index, result=None): + result = cell_to_local(cell_index, result=result) + return lgmap.apply(result, result=result) + + bc_rows = dict() + for Vsub in V: + lgmap = Vsub.local_to_global_map([bc.reconstruct(V=Vsub, g=0) for bc in bcs]) + bsize = Vsub.dof_dset.layout_vec.getBlockSize() + cell_to_local, nel = glonum_fun(Vsub.cell_node_map(), bsize=bsize) + self.cell_to_global[Vsub] = partial(cell_to_global, lgmap, cell_to_local) + self.lgmaps[Vsub] = lgmap + + own = Vsub.dof_dset.layout_vec.getLocalSize() + bdofs = numpy.nonzero(lgmap.indices[:own] < 0)[0].astype(PETSc.IntType) + bc_rows[Vsub] = Vsub.dof_dset.lgmap.apply(bdofs, result=bdofs) + + # get coefficients on a given cell + coefficients, assembly_callables = self.assemble_coef(J, form_compiler_parameters) + coeffs = [coefficients.get(k) for k in ("beta", "alpha")] + cmaps = [glonum_fun(ck.cell_node_map())[0] for ck in coeffs] + + @PETSc.Log.EventDecorator("FDMGetCoeffs") + def get_coeffs(e, result=None): + vals = [] + for k, (coeff, cmap) in enumerate(zip(coeffs, cmaps)): + get_coeffs.indices[k] = cmap(e, result=get_coeffs.indices[k]) + vals.append(coeff.dat.data_ro[get_coeffs.indices[k]]) + return numpy.concatenate(vals, out=result) + get_coeffs.indices = [None for _ in range(len(coeffs))] + 
self.get_coeffs = get_coeffs + + self.nel = nel + self.work_mats = dict() + + Pmats = dict() + addv = PETSc.InsertMode.ADD_VALUES + symmetric = pmat_type.endswith("sbaij") + + # Store only off-diagonal blocks with more columns than rows to save memory + Vsort = sorted(V, key=lambda Vsub: Vsub.dim()) + for Vrow, Vcol in product(Vsort, Vsort): + if symmetric and (Vcol, Vrow) in Pmats: + P = PETSc.Mat().createTranspose(Pmats[Vcol, Vrow]) + else: + on_diag = Vrow == Vcol + triu = on_diag and symmetric + ptype = pmat_type if on_diag else PETSc.Mat.Type.AIJ + sizes = tuple(Vsub.dof_dset.layout_vec.getSizes() for Vsub in (Vrow, Vcol)) + # bsizes = tuple(Vsub.dof_dset.layout_vec.getBlockSize() for Vsub in (Vrow, Vcol)) + + preallocator = PETSc.Mat().create(comm=self.comm) + preallocator.setType(PETSc.Mat.Type.PREALLOCATOR) + preallocator.setSizes(sizes) + preallocator.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, False) + preallocator.setUp() + self.set_values(preallocator, Vrow, Vcol, addv, triu=triu) + preallocator.assemble() + d_nnz, o_nnz = get_preallocation(preallocator, sizes[0][0]) + preallocator.destroy() + if on_diag: + numpy.maximum(d_nnz, 1, out=d_nnz) + + P = PETSc.Mat().create(comm=self.comm) + P.setType(ptype) + P.setSizes(sizes) + # P.setBlockSizes(*bsizes) + P.setPreallocationNNZ((d_nnz, o_nnz)) + P.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True) + if ptype.endswith("sbaij"): + P.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True) + P.setUp() + Pmats[Vrow, Vcol] = P + + if len(V) == 1: + Pmat = Pmats[V, V] + else: + Pmat = PETSc.Mat().createNest([[Pmats[Vrow, Vcol] for Vcol in V] for Vrow in V], comm=V.comm) + + self.diag = None + + @PETSc.Log.EventDecorator("FDMAssemble") + def assemble_P(): + for _assemble in assembly_callables: + _assemble() + for Vrow, Vcol in product(Vsort, Vsort): + P = Pmats[Vrow, Vcol] + if P.getType().endswith("aij"): + P.zeroEntries() + if Vrow == Vcol and len(bc_rows[Vrow]) > 0: + rows = bc_rows[Vrow][:, None] + vals = numpy.ones(rows.shape, dtype=PETSc.RealType) + P.setValuesRCV(rows, rows, vals, addv) + self.set_values(P, Vrow, Vcol, addv) + Pmat.assemble() + if diagonal_scale: + diag = Pmat.getDiagonal(result=self.diag) + diag.sqrtabs() + diag.reciprocal() + Pmat.diagonalScale(L=diag, R=diag) + self.diag = diag + + return Pmat, assemble_P + @PETSc.Log.EventDecorator("FDMUpdate") def update(self, pc): if hasattr(self, "A"): @@ -155,26 +341,24 @@ def update(self, pc): self._assemble_P() def apply(self, pc, x, y): - dm = self._dm - with dmhooks.add_hooks(dm, self, appctx=self._ctx_ref): - if hasattr(self, "fdm_interp"): - self.fdm_interp.multTranspose(x, self.work_vec_x) + if hasattr(self, "_ctx_ref"): + self.fdm_interp.multTranspose(x, self.work_vec_x) + with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref): self.pc.apply(self.work_vec_x, self.work_vec_y) - self.fdm_interp.mult(self.work_vec_y, y) - y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes] - else: - self.pc.apply(x, y) + self.fdm_interp.mult(self.work_vec_y, y) + y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes] + else: + self.pc.apply(x, y) def applyTranspose(self, pc, x, y): - dm = self._dm - with dmhooks.add_hooks(dm, self, appctx=self._ctx_ref): - if hasattr(self, "fdm_interp"): - self.fdm_interp.multTranspose(x, self.work_vec_y) + if hasattr(self, "_ctx_ref"): + self.fdm_interp.multTranspose(x, self.work_vec_y) + with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref): self.pc.applyTranspose(self.work_vec_y, self.work_vec_x) - 
self.fdm_interp.mult(self.work_vec_x, y) - y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes] - else: - self.pc.applyTranspose(x, y) + self.fdm_interp.mult(self.work_vec_x, y) + y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes] + else: + self.pc.applyTranspose(x, y) def view(self, pc, viewer=None): super(FDMPC, self).view(pc, viewer) @@ -182,27 +366,770 @@ def view(self, pc, viewer=None): viewer.printfASCII("PC to apply inverse\n") self.pc.view(viewer) - def assemble_fdm_op(self, V, J, bcs, appctx): + def destroy(self, pc): + objs = [] + if hasattr(self, "pc"): + objs.append(self.pc.getOperators()[-1]) + objs.append(self.pc) + if hasattr(self, "submats"): + objs.extend(self.submats) + if hasattr(self, "work_mats"): + objs.extend(list(self.work_mats.values())) + if hasattr(self, "ises"): + objs.extend(self.ises) + for obj in objs: + if hasattr(obj, "destroy"): + obj.destroy() + + @PETSc.Log.EventDecorator("FDMSetValues") + def set_values(self, A, Vrow, Vcol, addv, triu=False): + + def RtAP(R, A, P, result=None): + RtAP.buff = A.matMult(P, result=RtAP.buff) + return R.transposeMatMult(RtAP.buff, result=result) + RtAP.buff = None + + set_values_csr = self.load_set_values(triu=triu) + get_rindices = self.cell_to_global[Vrow] + if Vrow == Vcol: + get_cindices = lambda e, result=None: result + update_A = lambda Ae, rindices, cindices: set_values_csr(A, Ae, rindices, rindices, addv) + rtensor = self.reference_tensor_on_diag.get(Vrow, None) or self.assemble_reference_tensor(Vrow) + assemble_element_mat = lambda De, result=None: De.PtAP(rtensor, result=result) + condense_element_mat = self.get_static_condensation.get(Vrow, None) + else: + get_cindices = self.cell_to_global[Vcol] + update_A = lambda Ae, rindices, cindices: set_values_csr(A, Ae, rindices, cindices, addv) + rtensor = self.assemble_reference_tensor(Vrow) + ctensor = self.assemble_reference_tensor(Vcol) + assemble_element_mat = lambda De, result=None: RtAP(rtensor, De, ctensor, result=result) + condense_element_mat = None + + do_sort = True + if condense_element_mat is None: + condense_element_mat = lambda x: x + do_sort = False + + common_key = "coefs" + rindices = None + cindices = None + if A.getType() != PETSc.Mat.Type.PREALLOCATOR: + Ae = self.work_mats[Vrow, Vcol] + De = self.work_mats[common_key] + data = self.work_csr[2] + insert = PETSc.InsertMode.INSERT + work_vec = De.getDiagonal() + if len(data.shape) == 3: + @PETSc.Log.EventDecorator("FDMUpdateDiag") + def update_De(data): + De.setValuesCSR(*self.work_csr, addv=insert) + De.assemble() + return De + else: + @PETSc.Log.EventDecorator("FDMUpdateDiag") + def update_De(data): + work_vec.setArray(data) + De.setDiagonal(work_vec, addv=insert) + return De + + for e in range(self.nel): + rindices = get_rindices(e, result=rindices) + cindices = get_cindices(e, result=cindices) + data = self.get_coeffs(e, result=data) + Ae = assemble_element_mat(update_De(data), result=Ae) + update_A(condense_element_mat(Ae), rindices, cindices) + + work_vec.destroy() + + elif self.nel: + if common_key not in self.work_mats: + data = self.get_coeffs(0) + data.fill(1.0E0) + shape = data.shape + (1,)*(3-len(data.shape)) + nrows = shape[0] * shape[1] + ai = numpy.arange(nrows+1, dtype=PETSc.IntType) + aj = numpy.tile(ai[:-1].reshape((-1, shape[1])), (1, shape[2])) + if shape[2] > 1: + ai *= shape[2] + data = numpy.tile(numpy.eye(shape[2]), shape[:1] + (1,)*(len(shape)-1)) + + self.work_csr = (ai, aj, data) + De = PETSc.Mat().createAIJ((nrows, nrows), csr=self.work_csr, comm=PETSc.COMM_SELF) + 
self.work_mats[common_key] = De + + De = self.work_mats[common_key] + Ae = assemble_element_mat(De, result=None) + self.work_mats[Vrow, Vcol] = Ae + if do_sort: + sort_interior_dofs(self.ises[0], Ae) + Se = condense_element_mat(Ae) + + for e in range(self.nel): + rindices = get_rindices(e, result=rindices) + cindices = get_cindices(e, result=cindices) + update_A(Se, rindices, cindices) + else: + self.work_csr = (None, None, None) + self.work_mats[common_key] = None + self.work_mats[Vrow, Vcol] = None + if RtAP.buff: + RtAP.buff.destroy() + + @PETSc.Log.EventDecorator("FDMCoefficients") + def assemble_coef(self, J, form_compiler_parameters): """ - Assemble the sparse preconditioner with cell-wise constant coefficients. + Obtain coefficients as the diagonal of a weighted mass matrix in V^k x V^{k+1} + """ + from ufl.algorithms.ad import expand_derivatives + from ufl.algorithms.expand_indices import expand_indices + from firedrake.formmanipulation import ExtractSubBlock + from firedrake.assemble import assemble - :arg V: the :class:`~.FunctionSpace` of the form arguments - :arg J: the Jacobian bilinear form - :arg bcs: an iterable of boundary conditions on V - :arg appctx: the application context + index = len(J.arguments()[-1].function_space())-1 + if index: + splitter = ExtractSubBlock() + J = splitter.split(J, argument_indices=(index, index)) - :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable - """ - from pyop2.sparsity import get_preallocation + mesh = J.ufl_domain() + ndim = mesh.topological_dimension() + args_J = J.arguments() + e = args_J[0].ufl_element() + if isinstance(e, (ufl.VectorElement, ufl.TensorElement)): + e = e._sub_element + e = unrestrict_element(e) + sobolev = e.sobolev_space() + + map_grad = None + if sobolev == ufl.H1: + map_grad = lambda p: p + elif sobolev in [ufl.HCurl, ufl.HDiv]: + u = ufl.Coefficient(ufl.FunctionSpace(mesh, e)) + du = ufl.variable(ufl.grad(u)) + dku = ufl.div(u) if sobolev == ufl.HDiv else ufl.curl(u) + eps = expand_derivatives(ufl.diff(ufl.replace(expand_derivatives(dku), {ufl.grad(u): du}), du)) + if sobolev == ufl.HDiv: + map_grad = lambda p: ufl.outer(p, eps/ndim) + elif len(eps.ufl_shape) == 3: + map_grad = lambda p: ufl.dot(p, eps/2) + else: + map_grad = lambda p: p*(eps/2) + + V = args_J[0].function_space() + formdegree = V.finat_element.formdegree + degree = e.degree() + try: + degree = max(degree) + except TypeError: + pass + qdeg = degree + if formdegree == ndim: + qfam = "DG" if ndim == 1 else "DQ" + qdeg = 0 + elif formdegree == 0: + qfam = "DG" if ndim == 1 else "RTCE" if ndim == 2 else "NCE" + elif formdegree == 1 and ndim == 3: + qfam = "NCF" + else: + qfam = "DQ L2" + qdeg = degree - 1 + + qvariant = "fdm_quadrature" + elements = [e.reconstruct(variant=qvariant), + ufl.FiniteElement(qfam, cell=mesh.ufl_cell(), degree=qdeg, variant=qvariant)] + elements = list(map(ufl.BrokenElement, elements)) + if V.shape: + elements = [ufl.TensorElement(ele, shape=V.shape) for ele in elements] + + Z = firedrake.FunctionSpace(mesh, ufl.MixedElement(elements)) + args = (firedrake.TestFunctions(Z), firedrake.TrialFunctions(Z)) + repargs = {t: v[0] for t, v in zip(args_J, args)} + repgrad = {ufl.grad(t): map_grad(v[1]) for t, v in zip(args_J, args)} if map_grad else dict() + Jcell = expand_indices(expand_derivatives(ufl.Form(J.integrals_by_type("cell")))) + mixed_form = ufl.replace(ufl.replace(Jcell, repgrad), repargs) + + key = (mixed_form.signature(), mesh) + block_diagonal = True + + if key not in 
self._coefficient_cache and False: + M = assemble(mixed_form, mat_type="matfree", + form_compiler_parameters=form_compiler_parameters) + + coefs = [] + mats = [] + for iset in Z.dof_dset.field_ises: + Msub = M.petscmat.createSubMatrix(iset, iset) + coefs.append(Msub.getPythonContext()._diagonal) + mats.append(Msub) + + def scale_coefficients(): + for Msub, coef in zip(mats, coefs): + ksp = PETSc.KSP().create(comm=V.comm) + ksp.setOperators(A=Msub, P=Msub) + ksp.setType(PETSc.KSP.Type.CG) + ksp.setNormType(PETSc.KSP.NormType.NATURAL) + ksp.pc.setType(PETSc.PC.Type.JACOBI) + ksp.setTolerances(rtol=1E-3, atol=0.0E0, max_it=8) + ksp.setComputeEigenvalues(True) + ksp.setUp() + + x = Msub.createVecRight() + b = Msub.createVecLeft() + x.set(0) + b.setRandom() + ksp.solve(b, x) + ew = numpy.real(ksp.computeEigenvalues()) + ksp.destroy() + x.destroy() + b.destroy() + dscale = (max(ew) + min(ew))/2 + dscale = sum(ew) / len(ew) + scale = dscale if dscale == dscale else 1 + with coef.dat.vec as diag: + diag.scale(scale) + + coefficients = {"beta": coefs[0], "alpha": coefs[1]} + assembly_callables = [scale_coefficients] + self._coefficient_cache[key] = (coefficients, assembly_callables) + return self._coefficient_cache[key] + + if key not in self._coefficient_cache: + if not block_diagonal or not V.shape: + tensor = firedrake.Function(Z) + coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} + assembly_callables = [partial(assemble, mixed_form, tensor=tensor, diagonal=True, + form_compiler_parameters=form_compiler_parameters)] + else: + M = assemble(mixed_form, mat_type="matfree", + form_compiler_parameters=form_compiler_parameters) + coefficients = dict() + assembly_callables = [] + for iset, name in zip(Z.dof_dset.field_ises, ("beta", "alpha")): + sub = M.petscmat.createSubMatrix(iset, iset) + ctx = sub.getPythonContext() + coefficients[name] = ctx._block_diagonal + assembly_callables.append(ctx._assemble_block_diagonal) + + self._coefficient_cache[key] = (coefficients, assembly_callables) + return self._coefficient_cache[key] + + @PETSc.Log.EventDecorator("FDMRefTensor") + def assemble_reference_tensor(self, V): + ndim = V.mesh().topological_dimension() + value_size = V.value_size + formdegree = V.finat_element.formdegree + degree = V.finat_element.degree + try: + degree = max(degree) + except TypeError: + pass + if formdegree == ndim: + degree = degree + 1 + is_interior, is_facet = is_restricted(V.finat_element) + key = (degree, ndim, formdegree, V.value_size, is_interior, is_facet) + cache = self._reference_tensor_cache + if key not in cache: + full_key = (degree, ndim, formdegree, V.value_size, 0, 0) + + if is_facet and full_key in cache: + result = cache[full_key] + noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) + cache[key] = result.createSubMatrix(noperm, self.ises[1]) + noperm.destroy() + return cache[key] + + elements = sorted(get_base_elements(V.finat_element), key=lambda e: e.formdegree) + ref_el = elements[0].get_reference_element() + eq = FIAT.FDMQuadrature(ref_el, degree) + e0 = elements[0] if elements[0].formdegree == 0 else FIAT.FDMLagrange(ref_el, degree) + e1 = elements[-1] if elements[-1].formdegree == 1 else FIAT.FDMDiscontinuousLagrange(ref_el, degree-1) + if is_interior: + e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") + if hasattr(eq.dual, "rule"): + rule = eq.dual.rule + else: + rule = FIAT.quadrature.make_quadrature(ref_el, degree+1) + + pts = rule.get_points() + wts = rule.get_weights() + 
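+            # Tabulate the 1D reference elements at the quadrature points and form the
+            # interval moment matrices: A00 and A11 are mass-type moments, while A10,
+            # after solving against A11, expresses the derivative of the formdegree-0
+            # element e0 in the basis of the formdegree-1 element e1. These feed the
+            # Kronecker-product mass (Ihat) and differentiation (Dhat) reference
+            # tensors assembled below.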
+ phiq = eq.tabulate(0, pts) + phi1 = e1.tabulate(0, pts) + phi0 = e0.tabulate(1, pts) + + moments = lambda v, u: numpy.dot(numpy.multiply(v, wts), u.T) + A00 = moments(phiq[(0, )], phi0[(0, )]) + A11 = moments(phi1[(0, )], phi1[(0, )]) + A10 = moments(phi1[(0, )], phi0[(1, )]) + A10 = numpy.linalg.solve(A11, A10) + A11 = numpy.eye(A11.shape[0]) + + Ihat = mass_matrix(ndim, formdegree, A00, A11) + Dhat = diff_matrix(ndim, formdegree, A00, A11, A10) + result = block_mat([[Ihat], [Dhat]]) + Ihat.destroy() + Dhat.destroy() + + if value_size != 1: + eye = petsc_sparse(numpy.eye(value_size)) + temp = result + result = temp.kron(eye) + temp.destroy() + eye.destroy() + + if is_facet: + cache[full_key] = result + noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) + result = result.createSubMatrix(noperm, self.ises[1]) + noperm.destroy() + + cache[key] = result + return cache[key] + + +def factor_interior_mat(A00): + # Assume that interior DOF list i0 is ordered such that A00 is block diagonal + # with blocks of increasing dimension + indptr, indices, data = A00.getValuesCSR() + degree = numpy.diff(indptr) + + # TODO handle non-symmetric case with LU, requires scipy + invchol = lambda X: numpy.linalg.inv(numpy.linalg.cholesky(X)) + nblocks = numpy.count_nonzero(degree == 1) + zlice = slice(0, nblocks) + numpy.sqrt(data[zlice], out=data[zlice]) + numpy.reciprocal(data[zlice], out=data[zlice]) + PETSc.Log.logFlops(2*nblocks) + for k in range(2, degree[-1]+1): + nblocks = numpy.count_nonzero(degree == k) + zlice = slice(zlice.stop, zlice.stop + k*nblocks) + data[zlice] = invchol(data[zlice].reshape((-1, k, k))).reshape((-1,)) + flops = ((k+1)**3 + 5*(k+1)-12)//3 + k**3 + PETSc.Log.logFlops(flops*nblocks) + + A00.setValuesCSR(indptr, indices, data) + A00.assemble() + + +@PETSc.Log.EventDecorator("FDMCondense") +def condense_element_mat(A, i0, i1, submats): + isrows = [i0, i0, i1, i1] + iscols = [i0, i1, i0, i1] + submats[:4] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:4] if submats[0] else None) + A00, A01, A10, A11 = submats[:4] + factor_interior_mat(A00) + submats[4] = A00.matMult(A01, result=submats[4]) + submats[5] = A10.matTransposeMult(A00, result=submats[5]) + submats[6] = submats[5].matMult(submats[4], result=submats[6]) + submats[6].aypx(-1.0, A11) + return submats[6] + + +@PETSc.Log.EventDecorator("FDMCondense") +def condense_element_pattern(A, i0, i1, submats): + isrows = [i0, i0, i1] + iscols = [i0, i1, i0] + submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) + A00, A01, A10 = submats[:3] + submats[4] = A10.matTransposeMult(A00, result=submats[4]) + submats[5] = A00.matMult(A01, result=submats[5]) + submats[6] = submats[4].matMult(submats[5], result=submats[6]) + submats[6].aypx(0.0, A) + return submats[6] + + +@PETSc.Log.EventDecorator("LoadCode") +def load_c_code(code, name, **kwargs): + from pyop2.compilation import load + from pyop2.utils import get_petsc_dir + cppargs = ["-I%s/include" % d for d in get_petsc_dir()] + ldargs = (["-L%s/lib" % d for d in get_petsc_dir()] + + ["-Wl,-rpath,%s/lib" % d for d in get_petsc_dir()] + + ["-lpetsc", "-lm"]) + funptr = load(code, "c", name, + cppargs=cppargs, ldargs=ldargs, + **kwargs) + + def get_pointer(obj): + if isinstance(obj, (PETSc.Mat, PETSc.Vec)): + return obj.handle + elif isinstance(obj, numpy.ndarray): + return obj.ctypes.data + return obj + + @PETSc.Log.EventDecorator(name) + def wrapper(*args): + return 
funptr(*list(map(get_pointer, args))) + return wrapper + + +def load_assemble_csr(comm, triu=False): + if triu: + name = "setSubMatCSR_SBAIJ" + select_cols = "icol < irow ? -1: icol" + else: + name = "setSubMatCSR_AIJ" + select_cols = "icol" + code = f""" +#include + +PetscErrorCode {name}(Mat A, + Mat B, + PetscInt *rindices, + PetscInt *cindices, + InsertMode addv) +{{ + PetscInt ncols, irow, icol; + PetscInt *cols, *indices; + PetscScalar *vals; + + PetscInt m, n; + PetscErrorCode ierr; + PetscFunctionBeginUser; + MatGetSize(B, &m, NULL); + + n = 0; + for (PetscInt i = 0; i < m; i++) {{ + ierr = MatGetRow(B, i, &ncols, NULL, NULL);CHKERRQ(ierr); + n = ncols > n ? ncols : n; + ierr = MatRestoreRow(B, i, &ncols, NULL, NULL);CHKERRQ(ierr); + }} + PetscMalloc1(n, &indices); + for (PetscInt i = 0; i < m; i++) {{ + ierr = MatGetRow(B, i, &ncols, &cols, &vals);CHKERRQ(ierr); + irow = rindices[i]; + for (PetscInt j = 0; j < ncols; j++) {{ + icol = cindices[cols[j]]; + indices[j] = {select_cols}; + }} + ierr = MatSetValues(A, 1, &irow, ncols, indices, vals, addv);CHKERRQ(ierr); + ierr = MatRestoreRow(B, i, &ncols, &cols, &vals);CHKERRQ(ierr); + }} + PetscFree(indices); + PetscFunctionReturn(0); +}} +""" + argtypes = [ctypes.c_voidp, ctypes.c_voidp, + ctypes.c_voidp, ctypes.c_voidp, ctypes.c_int] + return load_c_code(code, name, comm=comm, argtypes=argtypes, + restype=ctypes.c_int) + + +def petsc_sparse(A_numpy, rtol=1E-10): + Amax = max(A_numpy.min(), A_numpy.max(), key=abs) + atol = rtol*Amax + nnz = numpy.count_nonzero(abs(A_numpy) > atol, axis=1).astype(PETSc.IntType) + A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=PETSc.COMM_SELF) + for row, Arow in enumerate(A_numpy): + cols = numpy.argwhere(abs(Arow) > atol).astype(PETSc.IntType).flat + A.setValues(row, cols, Arow[cols], PETSc.InsertMode.INSERT) + A.assemble() + return A + + +def block_mat(A_blocks): + if len(A_blocks) == 1: + if len(A_blocks[0]) == 1: + return A_blocks[0][0] + + nrows = sum([Arow[0].size[0] for Arow in A_blocks]) + ncols = sum([Aij.size[1] for Aij in A_blocks[0]]) + nnz = numpy.concatenate([sum([numpy.diff(Aij.getValuesCSR()[0]) for Aij in Arow]) for Arow in A_blocks]) + A = PETSc.Mat().createAIJ((nrows, ncols), nnz=(nnz, 0), comm=PETSc.COMM_SELF) + imode = PETSc.InsertMode.INSERT + insert_block = FDMPC.load_set_values() + rsizes = [sum([Ai[0].size[0] for Ai in A_blocks[:k]]) for k in range(len(A_blocks)+1)] + csizes = [sum([Aij.size[1] for Aij in A_blocks[0][:k]]) for k in range(len(A_blocks[0])+1)] + rows = [numpy.arange(*rsizes[i:i+2], dtype=PETSc.IntType) for i in range(len(A_blocks))] + cols = [numpy.arange(*csizes[j:j+2], dtype=PETSc.IntType) for j in range(len(A_blocks[0]))] + for Ai, irows in zip(A_blocks, rows): + for Aij, jcols in zip(Ai, cols): + insert_block(A, Aij, irows, jcols, imode) + + A.assemble() + return A + + +def is_restricted(finat_element): + is_interior = True + is_facet = True + tdim = finat_element.cell.get_spatial_dimension() + entity_dofs = finat_element.entity_dofs() + for edim in sorted(entity_dofs): + v = sum(list(entity_dofs[edim].values()), []) + if len(v): + try: + edim = sum(edim) + except TypeError: + pass + if edim == tdim: + is_facet = False + else: + is_interior = False + return is_interior, is_facet + + +def sort_interior_dofs(idofs, A): + Aii = A.createSubMatrix(idofs, idofs) + indptr, indices, _ = Aii.getValuesCSR() + n = idofs.getSize() + visit = numpy.zeros((n, ), dtype=bool) + perm = [] + degree = 0 + while not visit.all(): + degree += 1 + for i in range(n): + if 
not visit[i]: + neigh = indices[slice(*indptr[i:i+2])] + if len(neigh) == degree: + visit[neigh] = True + perm.extend(neigh) + + idofs.setIndices(idofs.getIndices()[perm]) + + +def kron3(A, B, C, scale=None): + temp = B.kron(C) + if scale is not None: + temp.scale(scale) + result = A.kron(temp) + temp.destroy() + return result + + +def mass_matrix(ndim, formdegree, B00, B11): + B00 = petsc_sparse(B00) + B11 = petsc_sparse(B11) + if ndim == 1: + B_blocks = [B11 if formdegree else B00] + elif ndim == 2: + if formdegree == 0: + B_blocks = [B00.kron(B00)] + elif formdegree == 1: + B_blocks = [B00.kron(B11), B11.kron(B00)] + else: + B_blocks = [B11.kron(B11)] + elif ndim == 3: + if formdegree == 0: + B_blocks = [kron3(B00, B00, B00)] + elif formdegree == 1: + B_blocks = [kron3(B00, B00, B11), kron3(B00, B11, B00), kron3(B11, B00, B00)] + elif formdegree == 2: + B_blocks = [kron3(B00, B11, B11), kron3(B11, B00, B11), kron3(B11, B11, B00)] + else: + B_blocks = [kron3(B11, B11, B11)] + + B00.destroy() + B11.destroy() + if len(B_blocks) == 1: + result = B_blocks[0] + else: + nrows = sum(Bk.size[0] for Bk in B_blocks) + ncols = sum(Bk.size[1] for Bk in B_blocks) + csr_block = [Bk.getValuesCSR() for Bk in B_blocks] + ishift = numpy.cumsum([0] + [csr[0][-1] for csr in csr_block]) + jshift = numpy.cumsum([0] + [Bk.size[1] for Bk in B_blocks]) + indptr = numpy.concatenate([csr[0][bool(shift):]+shift for csr, shift in zip(csr_block, ishift[:-1])]) + indices = numpy.concatenate([csr[1]+shift for csr, shift in zip(csr_block, jshift[:-1])]) + data = numpy.concatenate([csr[2] for csr in csr_block]) + result = PETSc.Mat().createAIJ((nrows, ncols), csr=(indptr, indices, data), comm=PETSc.COMM_SELF) + for B in B_blocks: + B.destroy() + return result + + +def diff_matrix(ndim, formdegree, A00, A11, A10): + if formdegree == ndim: + ncols = A10.shape[0]**ndim + A_zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=PETSc.COMM_SELF) + A_zero.assemble() + return A_zero + + A00 = petsc_sparse(A00) + A11 = petsc_sparse(A11) + A10 = petsc_sparse(A10) + if ndim == 1: + return A10 + elif ndim == 2: + if formdegree == 0: + A_blocks = [[A00.kron(A10)], [A10.kron(A00)]] + elif formdegree == 1: + A_blocks = [[A10.kron(A11), A11.kron(A10)]] + A_blocks[-1][-1].scale(-1) + elif ndim == 3: + if formdegree == 0: + A_blocks = [[kron3(A00, A00, A10)], [kron3(A00, A10, A00)], [kron3(A10, A00, A00)]] + elif formdegree == 1: + size = tuple(A11.getSize()[k] * A10.getSize()[k] * A00.getSize()[k] for k in range(2)) + A_zero = PETSc.Mat().createAIJ(size, nnz=(0, 0), comm=PETSc.COMM_SELF) + A_zero.assemble() + A_blocks = [[kron3(A00, A10, A11, scale=-1), kron3(A00, A11, A10), A_zero], + [kron3(A10, A00, A11, scale=-1), A_zero, kron3(A11, A00, A10)], + [A_zero, kron3(A10, A11, A00), kron3(A11, A10, A00, scale=-1)]] + elif formdegree == 2: + A_blocks = [[kron3(A10, A11, A11, scale=-1), kron3(A11, A10, A11), kron3(A11, A11, A10)]] + + A00.destroy() + A11.destroy() + A10.destroy() + result = block_mat(A_blocks) + for A_row in A_blocks: + for A in A_row: + A.destroy() + return result + + +def diff_prolongator(Vf, Vc, fbcs=[], cbcs=[]): + from tsfc.finatinterface import create_element + from firedrake.preconditioners.pmg import fiat_reference_prolongator + + ef = Vf.finat_element + ec = Vc.finat_element + if ef.formdegree - ec.formdegree != 1: + raise ValueError("Expecting Vf = d(Vc)") + + elements = list(set(get_base_elements(ec) + get_base_elements(ef))) + elements = sorted(elements, key=lambda e: e.formdegree) + e0, e1 = 
elements[::len(elements)-1] + + degree = e0.degree() + A11 = numpy.eye(degree, dtype=PETSc.RealType) + A00 = numpy.eye(degree+1, dtype=PETSc.RealType) + A10 = fiat_reference_prolongator(e1, e0, derivative=True) + + ndim = Vc.mesh().topological_dimension() + Dhat = diff_matrix(ndim, ec.formdegree, A00, A11, A10) + + scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e + fdofs = restricted_dofs(ef, create_element(unrestrict_element(scalar_element(Vf.ufl_element())))) + cdofs = restricted_dofs(ec, create_element(unrestrict_element(scalar_element(Vc.ufl_element())))) + fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) + cises = PETSc.IS().createGeneral(cdofs, comm=PETSc.COMM_SELF) + temp = Dhat + Dhat = temp.createSubMatrix(fises, cises) + fises.destroy() + cises.destroy() + temp.destroy() + if Vf.value_size > 1: + temp = Dhat + eye = petsc_sparse(numpy.eye(Vf.value_size, dtype=PETSc.RealType)) + Dhat = temp.kron(eye) + temp.destroy() + eye.destroy() + + rmap = Vf.local_to_global_map(fbcs) + cmap = Vc.local_to_global_map(cbcs) + rlocal, nel = glonum_fun(Vf.cell_node_map(), bsize=Vf.value_size) + clocal, nel = glonum_fun(Vc.cell_node_map(), bsize=Vc.value_size) + + def cell_to_global(lgmap, cell_to_local, e, result=None): + result = cell_to_local(e, result=result) + return lgmap.apply(result, result=result) + + imode = PETSc.InsertMode.INSERT + update_Dmat = FDMPC.load_set_values() + + sizes = tuple(V.dof_dset.layout_vec.getSizes() for V in (Vf, Vc)) + block_size = Vf.dof_dset.layout_vec.getBlockSize() + preallocator = PETSc.Mat().create(comm=Vf.comm) + preallocator.setType(PETSc.Mat.Type.PREALLOCATOR) + preallocator.setSizes(sizes) + preallocator.setUp() + + rindices = None + cindices = None + for e in range(nel): + rindices = cell_to_global(rmap, rlocal, e, result=rindices) + cindices = cell_to_global(cmap, clocal, e, result=cindices) + update_Dmat(preallocator, Dhat, rindices, cindices, imode) + + preallocator.assemble() + nnz = get_preallocation(preallocator, sizes[0][0]) + preallocator.destroy() + Dmat = PETSc.Mat().createAIJ(sizes, block_size, nnz=nnz, comm=Vf.comm) + Dmat.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True) + + for e in range(nel): + rindices = cell_to_global(rmap, rlocal, e, result=rindices) + cindices = cell_to_global(cmap, clocal, e, result=cindices) + update_Dmat(Dmat, Dhat, rindices, cindices, imode) + + Dmat.assemble() + Dhat.destroy() + return Dmat + + +def unrestrict_element(ele): + if isinstance(ele, ufl.VectorElement): + return type(ele)(unrestrict_element(ele._sub_element), dim=ele.num_sub_elements()) + elif isinstance(ele, ufl.TensorElement): + return type(ele)(unrestrict_element(ele._sub_element), shape=ele._shape, symmetry=ele.symmetry()) + elif isinstance(ele, ufl.EnrichedElement): + return type(ele)(*list(dict.fromkeys(unrestrict_element(e) for e in ele._elements))) + elif isinstance(ele, ufl.TensorProductElement): + return type(ele)(*(unrestrict_element(e) for e in ele.sub_elements()), cell=ele.cell()) + elif isinstance(ele, ufl.MixedElement): + return type(ele)(*(unrestrict_element(e) for e in ele.sub_elements())) + elif isinstance(ele, ufl.WithMapping): + return type(ele)(unrestrict_element(ele.wrapee), ele.mapping()) + elif isinstance(ele, ufl.RestrictedElement): + return unrestrict_element(ele._element) + elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement)): + return type(ele)(unrestrict_element(ele._element)) + else: + return ele + + +def 
get_base_elements(e): + if isinstance(e, finat.EnrichedElement): + return sum(list(map(get_base_elements, e.elements)), []) + elif isinstance(e, finat.TensorProductElement): + return sum(list(map(get_base_elements, e.factors)), []) + elif isinstance(e, finat.cube.FlattenedDimensions): + return get_base_elements(e.product) + elif isinstance(e, (finat.HCurlElement, finat.HDivElement)): + return get_base_elements(e.wrappee) + elif isinstance(e, finat.finiteelementbase.FiniteElementBase): + return get_base_elements(e.fiat_equivalent) + elif isinstance(e, FIAT.RestrictedElement): + return get_base_elements(e._element) + return [e] + + +class PoissonFDMPC(FDMPC): + """ + A preconditioner for tensor-product elements that changes the shape + functions so that the H^1 Riesz map is sparse in the interior of a + Cartesian cell, and assembles a global sparse matrix on which other + preconditioners, such as `ASMStarPC`, can be applied. + + Here we assume that the volume integrals in the Jacobian can be expressed as: + + inner(grad(v), alpha(grad(u)))*dx + inner(v, beta(u))*dx + + where alpha and beta are linear functions (tensor contractions). + The sparse matrix is obtained by approximating alpha and beta by cell-wise + constants and discarding the coefficients in alpha that couple together + mixed derivatives and mixed components. + + For spaces that are not H^1-conforming, this preconditioner will use + the symmetric interior-penalty DG method. The penalty coefficient can be + provided in the application context, keyed on ``"eta"``. + """ + + _variant = "fdm_ipdg" + + def assemble_reference_tensor(self, V): from firedrake.preconditioners.pmg import get_line_elements try: - line_elements = get_line_elements(V) + line_elements, shifts = get_line_elements(V) except ValueError: raise ValueError("FDMPC does not support the element %s" % V.ufl_element()) + line_elements, = line_elements + self.axes_shifts, = shifts + degree = max(e.degree() for e in line_elements) - eta = float(appctx.get("eta", (degree+1)**2)) - quad_degree = 2*degree+1 + eta = float(self.appctx.get("eta", degree*(degree+1))) element = V.finat_element is_dg = element.entity_dofs() == element.entity_closure_dofs() @@ -213,83 +1140,59 @@ def assemble_fdm_op(self, V, J, bcs, appctx): Afdm[:0], Dfdm[:0], bdof[:0] = tuple(zip(fdm_setup_ipdg(e, eta))) if not (e.formdegree or is_dg): Dfdm[0] = None + return Afdm, Dfdm, bdof - # coefficients w.r.t. 
the reference values - coefficients, self.assembly_callables = self.assemble_coef(J, quad_degree) - # set arbitrary non-zero coefficients for preallocation - for coef in coefficients.values(): - with coef.dat.vec as cvec: - cvec.set(1.0E0) - - bcflags = get_weak_bc_flags(J) - - # preallocate by calling the assembly routine on a PREALLOCATOR Mat - sizes = (V.dof_dset.layout_vec.getSizes(),)*2 - block_size = V.dof_dset.layout_vec.getBlockSize() - prealloc = PETSc.Mat().create(comm=self.comm) - prealloc.setType(PETSc.Mat.Type.PREALLOCATOR) - prealloc.setSizes(sizes) - prealloc.setUp() - self.assemble_kron(prealloc, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags) - nnz = get_preallocation(prealloc, block_size * V.dof_dset.set.size) - Pmat = PETSc.Mat().createAIJ(sizes, block_size, nnz=nnz, comm=self.comm) - Pmat.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True) - assemble_P = partial(self.assemble_kron, Pmat, V, bcs, eta, - coefficients, Afdm, Dfdm, bdof, bcflags) - prealloc.destroy() - return Pmat, assemble_P - - def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags): + @PETSc.Log.EventDecorator("FDMSetValues") + def set_values(self, A, Vrow, Vcol, addv, triu=False): """ Assemble the stiffness matrix in the FDM basis using Kronecker products of interval matrices :arg A: the :class:`PETSc.Mat` to assemble - :arg V: the :class:`~.FunctionSpace` of the form arguments - :arg bcs: an iterable of :class:`~.DirichletBC` s - :arg eta: a ``float`` penalty parameter for the symmetric interior penalty method - :arg coefficients: a ``dict`` mapping strings to :class:`firedrake.function.Function` s with the form coefficients - :arg Afdm: the list with sparse interval matrices - :arg Dfdm: the list with normal derivatives matrices - :arg bcflags: the :class:`numpy.ndarray` with BC facet flags returned by ``get_weak_bc_flags`` + :arg Vrow: the :class:`firedrake.FunctionSpace` test space + :arg Vcol: the :class:`firedrake.FunctionSpace` trial space """ - from firedrake.preconditioners.pmg import get_axes_shift - Gq = coefficients.get("Gq") - Bq = coefficients.get("Bq") - Gq_facet = coefficients.get("Gq_facet") - PT_facet = coefficients.get("PT_facet") - - imode = PETSc.InsertMode.ADD_VALUES - lgmap = V.local_to_global_map(bcs) - + set_values_csr = self.load_set_values(triu=triu) + update_A = lambda A, Ae, rindices: set_values_csr(A, Ae, rindices, rindices, addv) + condense_element_mat = self.get_static_condensation.get(Vrow, lambda x: x) + condense_element_mat = lambda x: x + + get_rindices = self.cell_to_global[Vrow] + rtensor = self.reference_tensor_on_diag.get(Vrow, None) or self.assemble_reference_tensor(Vrow) + self.reference_tensor_on_diag[Vrow] = rtensor + Afdm, Dfdm, bdof = rtensor + + Gq = self.coefficients.get("alpha") + Bq = self.coefficients.get("beta") + bcflags = self.coefficients.get("bcflags") + Gq_facet = self.coefficients.get("Gq_facet") + PT_facet = self.coefficients.get("PT_facet") + + V = Vrow bsize = V.value_size ncomp = V.ufl_element().reference_value_size() sdim = (V.finat_element.space_dimension() * bsize) // ncomp # dimension of a single component ndim = V.ufl_domain().topological_dimension() - shift = get_axes_shift(V.finat_element) % ndim + shift = self.axes_shifts * bsize - index_cell, nel = glonum_fun(V.cell_node_map()) - index_coef, _ = glonum_fun(Gq.cell_node_map()) + index_coef, _ = glonum_fun((Gq or Bq).cell_node_map()) + index_bc, _ = glonum_fun(bcflags.cell_node_map()) flag2id = numpy.kron(numpy.eye(ndim, ndim, 
dtype=PETSc.IntType), [[1], [2]]) # pshape is the shape of the DOFs in the tensor product pshape = tuple(Ak[0].size[0] for Ak in Afdm) - if shift: - assert ncomp == ndim - pshape = [tuple(numpy.roll(pshape, -shift*k)) for k in range(ncomp)] + static_condensation = False + if sdim != numpy.prod(pshape): + static_condensation = True - if A.getType() != PETSc.Mat.Type.PREALLOCATOR: - A.zeroEntries() - for assemble_coef in self.assembly_callables: - assemble_coef() - - # insert the identity in the Dirichlet rows and columns - for row in V.dof_dset.lgmap.indices[lgmap.indices < 0]: - A.setValue(row, row, 1.0E0, imode) + if set(shift) != {0}: + assert ncomp == ndim + pshape = [tuple(numpy.roll(pshape, -shift[k])) for k in range(ncomp)] # assemble zero-th order term separately, including off-diagonals (mixed components) # I cannot do this for hdiv elements as off-diagonals are not sparse, this is because # the FDM eigenbases for GLL(N) and GLL(N-1) are not orthogonal to each other - use_diag_Bq = Bq is None or len(Bq.ufl_shape) != 2 + rindices = None + use_diag_Bq = Bq is None or len(Bq.ufl_shape) != 2 or static_condensation if not use_diag_Bq: bshape = Bq.ufl_shape # Be = Bhat kron ... kron Bhat @@ -299,67 +1202,86 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags) aptr = numpy.arange(0, (bshape[0]+1)*bshape[1], bshape[1], dtype=PETSc.IntType) aidx = numpy.tile(numpy.arange(bshape[1], dtype=PETSc.IntType), bshape[0]) - for e in range(nel): + for e in range(self.nel): # Ae = Be kron Bq[e] adata = numpy.sum(Bq.dat.data_ro[index_coef(e)], axis=0) Ae = PETSc.Mat().createAIJWithArrays(bshape, (aptr, aidx, adata), comm=PETSc.COMM_SELF) Ae = Be.kron(Ae) - - ie = index_cell(e) - ie = numpy.repeat(ie*bsize, bsize) + numpy.tile(numpy.arange(bsize, dtype=ie.dtype), len(ie)) - rows = lgmap.apply(ie) - set_submat_csr(A, Ae, rows, imode) + rindices = get_rindices(e, result=rindices) + update_A(A, Ae, rindices) Ae.destroy() Be.destroy() Bq = None # assemble the second order term and the zero-th order term if any, - # discarding mixed derivatives and mixed components - for e in range(nel): - ie = numpy.reshape(index_cell(e), (ncomp//bsize, -1)) + # discarding mixed derivatives and mixed componentsget_weak_bc_flags(J) + mue = numpy.zeros((ncomp, ndim), dtype=PETSc.RealType) + bqe = numpy.zeros((ncomp,), dtype=PETSc.RealType) + + for e in range(self.nel): je = index_coef(e) - bce = bcflags[e] + bce = bcflags.dat.data_ro_with_halos[index_bc(e)] > 1E-8 + + rindices = get_rindices(e, result=rindices) + rows = numpy.reshape(rindices, (-1, bsize)) + rows = numpy.transpose(rows) + rows = numpy.reshape(rows, (ncomp, -1)) # get second order coefficient on this cell - mue = numpy.atleast_1d(numpy.sum(Gq.dat.data_ro[je], axis=0)) + if Gq is not None: + mue.flat[:] = numpy.sum(Gq.dat.data_ro[je], axis=0) + # get zero-th order coefficient on this cell if Bq is not None: - # get zero-th order coefficient on this cell - bqe = numpy.atleast_1d(numpy.sum(Bq.dat.data_ro[je], axis=0)) + bqe.flat[:] = numpy.sum(Bq.dat.data_ro[je], axis=0) for k in range(ncomp): # permutation of axes with respect to the first vector component - axes = numpy.roll(numpy.arange(ndim), -shift*k) + axes = numpy.roll(numpy.arange(ndim), -shift[k]) # for each component: compute the stiffness matrix Ae - muk = mue[k] if len(mue.shape) == 2 else mue bck = bce[:, k] if len(bce.shape) == 2 else bce fbc = numpy.dot(bck, flag2id) - # Ae = mue[k][0] Ahat + bqe[k] Bhat - Be = Afdm[axes[0]][0].copy() - Ae = 
Afdm[axes[0]][1+fbc[0]].copy() - Ae.scale(muk[0]) - if Bq is not None: - Ae.axpy(bqe[k], Be) - - if ndim > 1: - # Ae = Ae kron Bhat + mue[k][1] Bhat kron Ahat - Ae = Ae.kron(Afdm[axes[1]][0]) - Ae.axpy(muk[1], Be.kron(Afdm[axes[1]][1+fbc[1]])) - if ndim > 2: - # Ae = Ae kron Bhat + mue[k][2] Bhat kron Bhat kron Ahat - Be = Be.kron(Afdm[axes[1]][0]) - Ae = Ae.kron(Afdm[axes[2]][0]) - Ae.axpy(muk[2], Be.kron(Afdm[axes[2]][1+fbc[2]])) - - rows = lgmap.apply(ie[0]*bsize+k if bsize == ncomp else ie[k]) - set_submat_csr(A, Ae, rows, imode) + if Gq is not None: + # Ae = mue[k][0] Ahat + bqe[k] Bhat + Be = Afdm[axes[0]][0].copy() + Ae = Afdm[axes[0]][1+fbc[0]].copy() + Ae.scale(mue[k][0]) + if Bq is not None: + Ae.axpy(bqe[k], Be) + + if ndim > 1: + # Ae = Ae kron Bhat + mue[k][1] Bhat kron Ahat + Ae = Ae.kron(Afdm[axes[1]][0]) + if Gq is not None: + Ae.axpy(mue[k][1], Be.kron(Afdm[axes[1]][1+fbc[1]])) + + if ndim > 2: + # Ae = Ae kron Bhat + mue[k][2] Bhat kron Bhat kron Ahat + Be = Be.kron(Afdm[axes[1]][0]) + Ae = Ae.kron(Afdm[axes[2]][0]) + if Gq is not None: + Ae.axpy(mue[k][2], Be.kron(Afdm[axes[2]][1+fbc[2]])) + Be.destroy() + + elif Bq is not None: + Ae = Afdm[axes[0]][0] + for m in range(1, ndim): + Ae = Ae.kron(Afdm[axes[m]][0]) + Ae.scale(bqe[k]) + + Ae = condense_element_mat(Ae) + update_A(A, Ae, rows[k].astype(PETSc.IntType)) Ae.destroy() - Be.destroy() # assemble SIPG interior facet terms if the normal derivatives have been set up if any(Dk is not None for Dk in Dfdm): + if static_condensation: + raise NotImplementedError("Static condensation for SIPG not implemented") if ndim < V.ufl_domain().geometric_dimension(): raise NotImplementedError("SIPG on immersed meshes is not implemented") + eta = float(self.appctx.get("eta")) + + lgmap = self.lgmaps[V] index_facet, local_facet_data, nfacets = get_interior_facet_maps(V) index_coef, _, _ = get_interior_facet_maps(Gq_facet or Gq) rows = numpy.zeros((2, sdim), dtype=PETSc.IntType) @@ -382,7 +1304,7 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags) Gfacet = numpy.sum(Gq.dat.data_ro_with_halos[je], axis=1) for k in range(ncomp): - axes = numpy.roll(numpy.arange(ndim), -shift*k) + axes = numpy.roll(numpy.arange(ndim), -shift[k]) Dfacet = Dfdm[axes[0]] if Dfacet is None: continue @@ -439,31 +1361,30 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags) rows[0] = pull_axis(icell[0][k0], pshape[k0], idir[0]) rows[1] = pull_axis(icell[1][k1], pshape[k1], idir[1]) - set_submat_csr(A, Ae, rows, imode) + update_A(A, Ae, rows) Ae.destroy() - A.assemble() - def assemble_coef(self, J, quad_deg, discard_mixed=True, cell_average=True): - """ - Return the coefficients of the Jacobian form arguments and their gradient with respect to the reference coordinates. 
- - :arg J: the Jacobian bilinear form - :arg quad_deg: the quadrature degree used for the coefficients - :arg discard_mixed: discard entries in second order coefficient with mixed derivatives and mixed components - :arg cell_average: to return the coefficients as DG_0 Functions - - :returns: a 2-tuple of - coefficients: a dictionary mapping strings to :class:`firedrake.function.Function` s with the coefficients of the form, - assembly_callables: a list of assembly callables for each coefficient of the form - """ + @PETSc.Log.EventDecorator("FDMCoefficients") + def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_average=True): from ufl import inner, diff from ufl.algorithms.ad import expand_derivatives + coefficients = {} assembly_callables = [] mesh = J.ufl_domain() tdim = mesh.topological_dimension() Finv = ufl.JacobianInverse(mesh) + + args_J = J.arguments() + V = args_J[-1].function_space() + degree = V.ufl_element().degree() + try: + degree = max(degree) + except TypeError: + pass + quad_deg = 2*degree+1 + quad_deg = (form_compiler_parameters or {}).get("degree", quad_deg) dx = firedrake.dx(degree=quad_deg) if cell_average: @@ -474,7 +1395,6 @@ def assemble_coef(self, J, quad_deg, discard_mixed=True, cell_average=True): degree = quad_deg # extract coefficients directly from the bilinear form - args_J = J.arguments() integrals_J = J.integrals_by_type("cell") mapping = args_J[0].ufl_element().mapping().lower() Piola = get_piola_tensor(mapping, mesh) @@ -517,11 +1437,12 @@ def assemble_coef(self, J, quad_deg, discard_mixed=True, cell_average=True): Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default", shape=G.ufl_shape, symmetry=True) # assemble second order coefficient - Q = firedrake.FunctionSpace(mesh, Qe) - q = firedrake.TestFunction(Q) - Gq = firedrake.Function(Q) - coefficients["Gq"] = Gq - assembly_callables.append(partial(firedrake.assemble, inner(G, q)*dx, Gq)) + if not isinstance(alpha, ufl.constantvalue.Zero): + Q = firedrake.FunctionSpace(mesh, Qe) + q = firedrake.TestFunction(Q) + Gq = firedrake.Function(Q) + coefficients["alpha"] = Gq + assembly_callables.append(partial(firedrake.assemble, inner(G, q)*dx, Gq)) # assemble zero-th order coefficient if not isinstance(beta, ufl.constantvalue.Zero): @@ -535,7 +1456,7 @@ def assemble_coef(self, J, quad_deg, discard_mixed=True, cell_average=True): Q = firedrake.FunctionSpace(mesh, Qe) q = firedrake.TestFunction(Q) Bq = firedrake.Function(Q) - coefficients["Bq"] = Bq + coefficients["beta"] = Bq assembly_callables.append(partial(firedrake.assemble, inner(beta, q)*dx, Bq)) if Piola: @@ -565,6 +1486,45 @@ def assemble_coef(self, J, quad_deg, discard_mixed=True, cell_average=True): PT_facet = firedrake.Function(Q) coefficients["PT_facet"] = PT_facet assembly_callables.append(partial(firedrake.assemble, ((inner(q('+'), PT('+')) + inner(q('-'), PT('-')))/area)*dS_int, PT_facet)) + + # make DGT functions with BC flags + rvs = V.ufl_element().reference_value_shape() + cell = mesh.ufl_cell() + family = "CG" if cell.topological_dimension() == 1 else "DGT" + degree = 1 if cell.topological_dimension() == 1 else 0 + Qe = ufl.FiniteElement(family, cell=cell, degree=degree) + if rvs: + Qe = ufl.TensorElement(Qe, shape=rvs) + Q = firedrake.FunctionSpace(mesh, Qe) + q = firedrake.TestFunction(Q) + bcflags = firedrake.Function(Q) + + ref_args = [ufl.variable(t) for t in args_J] + replace_args = {t: s for t, s in zip(args_J, ref_args)} + + forms = [] + md = {"quadrature_degree": 0} + for it in 
J.integrals(): + itype = it.integral_type() + if itype.startswith("exterior_facet"): + beta = ufl.diff(ufl.diff(ufl.replace(it.integrand(), replace_args), ref_args[0]), ref_args[1]) + beta = expand_derivatives(beta) + if rvs: + beta = ufl.diag_vector(beta) + ds_ext = ufl.Measure(itype, domain=mesh, subdomain_id=it.subdomain_id(), metadata=md) + forms.append(ufl.inner(q, beta)*ds_ext) + + if len(forms): + form = sum(forms) + if len(form.arguments()) == 1: + assembly_callables.append(partial(firedrake.assemble, form, bcflags)) + coefficients["bcflags"] = bcflags + + # set arbitrary non-zero coefficients for preallocation + for coef in coefficients.values(): + with coef.dat.vec as cvec: + cvec.set(1.0E0) + self.coefficients = coefficients return coefficients, assembly_callables @@ -643,12 +1603,13 @@ def fdm_setup_ipdg(fdm_element, eta): Dfdm: the tabulation of the normal derivatives of the Dirichlet eigenfunctions. bdof: the indices of PointEvaluation dofs. """ - from FIAT.quadrature import GaussLegendreQuadratureLineRule - from FIAT.functional import PointEvaluation ref_el = fdm_element.get_reference_element() degree = fdm_element.degree() - rule = GaussLegendreQuadratureLineRule(ref_el, degree+1) - bdof = [k for k, f in enumerate(fdm_element.dual_basis()) if isinstance(f, PointEvaluation)] + if hasattr(fdm_element.dual, "rule"): + rule = fdm_element.dual.rule + else: + rule = FIAT.quadrature.make_quadrature(ref_el, degree+1) + bdof = [k for k, f in enumerate(fdm_element.dual_basis()) if isinstance(f, FIAT.functional.PointEvaluation)] phi = fdm_element.tabulate(1, rule.get_points()) Jhat = phi[(0, )] @@ -680,7 +1641,7 @@ def get_interior_facet_maps(V): """ Extrude V.interior_facet_node_map and V.ufl_domain().interior_facets.local_facet_dat - :arg V: a :class:`~.FunctionSpace` + :arg V: a :class:`FunctionSpace` :returns: the 3-tuple of facet_to_nodes_fun: maps interior facets to the nodes of the two cells sharing it, @@ -747,35 +1708,75 @@ def get_interior_facet_maps(V): return facet_to_nodes_fun, local_facet_data_fun, nfacets -@lru_cache(maxsize=10) -def glonum_fun(node_map): +@lru_cache(maxsize=20) +def glonum_fun(node_map, bsize=1): """ - Return a function that maps each topological entity to its nodes and the total number of entities. + Return a the local numbering given an non-extruded local map and the total number of entities. - :arg node_map: a :class:`pyop2.Map` mapping entities to their nodes, including ghost entities. + :arg node_map: a :class:`pyop2.Map` mapping entities to their local dofs, including ghost entities. 
- :returns: a 2-tuple with the map and the number of cells owned by this process + :returns: a 2-tuple with the map and the number of entities owned by this process """ nelv = node_map.values.shape[0] if node_map.offset is None: - return lambda e: node_map.values_with_halo[e], nelv + nel = nelv + + def glonum(e, result=None): + if result is None: + result = numpy.copy(node_map.values_with_halo[e]) + else: + numpy.copyto(result, node_map.values_with_halo[e]) + return result + else: layers = node_map.iterset.layers_array if layers.shape[0] == 1: nelz = layers[0, 1]-layers[0, 0]-1 nel = nelz*nelv - return lambda e: node_map.values_with_halo[e//nelz] + (e % nelz)*node_map.offset, nel + + def _glonum(node_map, nelz, e, result=None): + if result is None: + result = numpy.copy(node_map.values_with_halo[e // nelz]) + else: + numpy.copyto(result, node_map.values_with_halo[e // nelz]) + result += (e % nelz)*node_map.offset + return result + glonum = partial(_glonum, node_map, nelz) + else: nelz = layers[:, 1]-layers[:, 0]-1 nel = sum(nelz[:nelv]) to_base = numpy.repeat(numpy.arange(node_map.values_with_halo.shape[0], dtype=node_map.offset.dtype), nelz) to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz]) - return lambda e: node_map.values_with_halo[to_base[e]] + to_layer[e]*node_map.offset, nel + + def _glonum(node_map, to_base, to_layer, e, result=None): + if result is None: + result = numpy.copy(node_map.values_with_halo[to_base[e]]) + else: + numpy.copyto(result, node_map.values_with_halo[to_base[e]]) + result += to_layer[e]*node_map.offset + return result + glonum = partial(_glonum, node_map, to_base, to_layer) + + if bsize == 1: + return glonum, nel + + ibase = numpy.arange(bsize, dtype=node_map.values.dtype) + + def vector_glonum(bsize, ibase, e, result=None): + index = None + if result is not None: + index = result[:, 0] + index = glonum(e, result=index) + index *= bsize + return numpy.add.outer(index, ibase, out=result) + + return partial(vector_glonum, bsize, ibase), nel def glonum(node_map): """ - Return an array with the nodes of each topological entity of a certain kind. + Return an array with the node map. :arg node_map: a :class:`pyop2.Map` mapping entities to their nodes, including ghost entities. 
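[Note on the hunk above] The `bsize` branch of `glonum_fun` expands the scalar cell-to-node map into per-component DOF indices. The following standalone numpy sketch (illustrative values only, not part of the patch) shows the same `numpy.add.outer` interleaving that `vector_glonum` performs:

    import numpy

    nodes = numpy.array([3, 7, 1])   # scalar node indices of one cell (made-up values)
    bsize = 2                        # components per node (block size)
    ibase = numpy.arange(bsize)
    dofs = numpy.add.outer(nodes * bsize, ibase)
    # dofs -> [[ 6,  7], [14, 15], [ 2,  3]]: each node contributes bsize consecutive DOFs
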
@@ -794,43 +1795,19 @@ def glonum(node_map): return numpy.repeat(node_map.values_with_halo, nelz, axis=0) + numpy.kron(to_layer.reshape((-1, 1)), node_map.offset) -def get_weak_bc_flags(J): - """ - Return flags indicating whether the zero-th order coefficient on each facet of every cell is non-zero - """ - from ufl.algorithms.ad import expand_derivatives - mesh = J.ufl_domain() - args_J = J.arguments() - V = args_J[0].function_space() - rvs = V.ufl_element().reference_value_shape() - cell = mesh.ufl_cell() - family = "CG" if cell.topological_dimension() == 1 else "DGT" - degree = 1 if cell.topological_dimension() == 1 else 0 - Qe = ufl.FiniteElement(family, cell=cell, degree=degree) - if rvs: - Qe = ufl.TensorElement(Qe, shape=rvs) - Q = firedrake.FunctionSpace(mesh, Qe) - q = firedrake.TestFunction(Q) - - ref_args = [ufl.variable(t) for t in args_J] - replace_args = {t: s for t, s in zip(args_J, ref_args)} - - forms = [] - md = {"quadrature_degree": 0} - for it in J.integrals(): - itype = it.integral_type() - if itype.startswith("exterior_facet"): - beta = ufl.diff(ufl.diff(ufl.replace(it.integrand(), replace_args), ref_args[0]), ref_args[1]) - beta = expand_derivatives(beta) - if rvs: - beta = ufl.diag_vector(beta) - ds_ext = ufl.Measure(itype, domain=mesh, subdomain_id=it.subdomain_id(), metadata=md) - forms.append(ufl.inner(q, beta)*ds_ext) - - tol = 1E-8 - if len(forms): - bq = firedrake.assemble(sum(forms)) - fbc = bq.dat.data_with_halos[glonum(Q.cell_node_map())] - return (abs(fbc) > tol).astype(PETSc.IntType) - else: - return numpy.zeros(glonum(Q.cell_node_map()).shape, dtype=PETSc.IntType) +def spy(A, comm=None): + import matplotlib.pyplot as plt + import scipy.sparse as sp + if comm is None: + comm = A.comm + nnz = A.getInfo()["nz_used"] + if A.getType().endswith("sbaij"): + A.setOption(PETSc.Mat.Option.GETROW_UPPERTRIANGULAR, True) + csr = tuple(reversed(A.getValuesCSR())) + if comm.rank == 0: + csr[0].fill(1) + scipy_mat = sp.csr_matrix(csr, shape=A.getSize()) + fig, axes = plt.subplots(nrows=1, ncols=1) + axes.spy(scipy_mat, marker=".", markersize=2) + plt.title("nnz(A) = %d" % nnz) + plt.show() diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 34e6469396..825ee81c36 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -3,19 +3,19 @@ fdmstar = { - "mat_type": "matfree", - "ksp_type": "cg", - "ksp_atol": 0.0E0, - "ksp_rtol": 1.0E-8, - "ksp_norm_type": "unpreconditioned", - "ksp_monitor_true_residual": None, - "ksp_converged_reason": None, - "pc_type": "python", - "pc_python_type": "firedrake.P1PC", - "pmg_coarse_mat_type": "aij", - "pmg_mg_coarse": { - "ksp_type": "preonly", - "pc_type": "cholesky", + 'mat_type': 'matfree', + 'ksp_type': 'cg', + 'ksp_atol': 0.0E0, + 'ksp_rtol': 1.0E-8, + 'ksp_norm_type': 'unpreconditioned', + 'ksp_monitor_true_residual': None, + 'ksp_converged_reason': None, + 'pc_type': 'python', + 'pc_python_type': 'firedrake.P1PC', + 'pmg_mg_coarse': { + 'mat_type': 'aij', + 'ksp_type': 'preonly', + 'pc_type': 'cholesky', }, "pmg_mg_levels": { "ksp_type": "chebyshev", @@ -40,10 +40,10 @@ @pytest.fixture(params=[2, 3], - ids=["Rectangle", "Box"]) + ids=['Rectangle', 'Box']) def mesh(request): nx = 4 - distribution = {"overlap_type": (DistributedMeshOverlapType.VERTEX, 1)} + distribution = {'overlap_type': (DistributedMeshOverlapType.VERTEX, 1)} m = UnitSquareMesh(nx, nx, quadrilateral=True, distribution_parameters=distribution) if request.param == 3: m = ExtrudedMesh(m, nx) @@ -62,7 +62,7 @@ def 
expected(mesh): return [8, 8, 8] -@pytest.fixture(params=[None, "fdm"], ids=["spectral", "fdm"]) +@pytest.fixture(params=[None, 'fdm'], ids=['spectral', 'fdm']) def variant(request): return request.param @@ -71,7 +71,7 @@ def variant(request): def test_p_independence(mesh, expected, variant): nits = [] for p in range(3, 6): - e = FiniteElement("Lagrange", cell=mesh.ufl_cell(), degree=p, variant=variant) + e = FiniteElement('Lagrange', cell=mesh.ufl_cell(), degree=p, variant=variant) V = FunctionSpace(mesh, e) u = TrialFunction(V) v = TestFunction(V) @@ -86,9 +86,9 @@ def test_p_independence(mesh, expected, variant): a = inner(grad(v), grad(u))*dx L = inner(v, B)*dx - subs = ("on_boundary",) + subs = ('on_boundary',) if mesh.cell_set._extruded: - subs += ("top", "bottom") + subs += ('top', 'bottom') bcs = [DirichletBC(V, u_exact, sub) for sub in subs] uh = Function(V) @@ -104,7 +104,7 @@ def test_p_independence(mesh, expected, variant): def test_variable_coefficient(mesh): ndim = mesh.geometric_dimension() k = 4 - V = FunctionSpace(mesh, "Lagrange", k) + V = FunctionSpace(mesh, 'Lagrange', k) u = TrialFunction(V) v = TestFunction(V) x = SpatialCoordinate(mesh) @@ -119,9 +119,9 @@ def test_variable_coefficient(mesh): a = (inner(grad(v), dot(alpha, grad(u))) + inner(v, beta*u))*dx(degree=3*k+2) L = inner(v, Constant(1))*dx - subs = ("on_boundary",) + subs = ('on_boundary',) if mesh.cell_set._extruded: - subs += ("top", "bottom") + subs += ('top', 'bottom') bcs = [DirichletBC(V, zero(V.ufl_element().value_shape()), sub) for sub in subs] uh = Function(V) @@ -131,27 +131,27 @@ def test_variable_coefficient(mesh): assert solver.snes.ksp.getIterationNumber() <= 14 -@pytest.fixture(params=["cg", "dg", "rt"], - ids=["cg", "dg", "rt"]) +@pytest.fixture(params=['cg', 'dg', 'rt'], + ids=['cg', 'dg', 'rt']) def fs(request, mesh): degree = 3 ndim = mesh.topological_dimension() cell = mesh.ufl_cell() element = request.param - variant = None - if element == "rt": - family = "RTCF" if ndim == 2 else "NCF" + variant = 'fdm_ipdg' + if element == 'rt': + family = 'RTCF' if ndim == 2 else 'NCF' return FunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant)) else: if ndim == 1: - family = "DG" if element == "dg" else "CG" + family = 'DG' if element == 'dg' else 'CG' else: - family = "DQ" if element == "dg" else "Q" + family = 'DQ' if element == 'dg' else 'Q' return VectorFunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant), dim=5-ndim) @pytest.mark.skipcomplex -def test_direct_solver(fs): +def test_ipdg_direct_solver(fs): mesh = fs.mesh() x = SpatialCoordinate(mesh) ndim = mesh.geometric_dimension() @@ -187,31 +187,31 @@ def test_direct_solver(fs): if ndim > 1: subs += (3,) if extruded: - subs += ("top",) + subs += ('top',) bcs = [DirichletBC(fs, u_exact, sub) for sub in subs] dirichlet_ids = subs - if "on_boundary" in dirichlet_ids: + if 'on_boundary' in dirichlet_ids: neumann_ids = [] else: make_tuple = lambda s: s if type(s) == tuple else (s,) neumann_ids = list(set(mesh.exterior_facets.unique_markers) - set(sum([make_tuple(s) for s in subs if type(s) != str], ()))) if extruded: - if "top" not in dirichlet_ids: - neumann_ids.append("top") - if "bottom" not in dirichlet_ids: - neumann_ids.append("bottom") + if 'top' not in dirichlet_ids: + neumann_ids.append('top') + if 'bottom' not in dirichlet_ids: + neumann_ids.append('bottom') dxq = dx(degree=quad_degree, domain=mesh) if extruded: dS_int = dS_v(degree=quad_degree) + dS_h(degree=quad_degree) - ds_ext = 
{"on_boundary": ds_v(degree=quad_degree), "bottom": ds_b(degree=quad_degree), "top": ds_t(degree=quad_degree)} + ds_ext = {'on_boundary': ds_v(degree=quad_degree), 'bottom': ds_b(degree=quad_degree), 'top': ds_t(degree=quad_degree)} ds_Dir = [ds_ext.get(s) or ds_v(s, degree=quad_degree) for s in dirichlet_ids] ds_Neu = [ds_ext.get(s) or ds_v(s, degree=quad_degree) for s in neumann_ids] else: dS_int = dS(degree=quad_degree) - ds_ext = {"on_boundary": ds(degree=quad_degree)} + ds_ext = {'on_boundary': ds(degree=quad_degree)} ds_Dir = [ds_ext.get(s) or ds(s, degree=quad_degree) for s in dirichlet_ids] ds_Neu = [ds_ext.get(s) or ds(s, degree=quad_degree) for s in neumann_ids] @@ -238,20 +238,65 @@ def test_direct_solver(fs): problem = LinearVariationalProblem(a, L, uh, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters={ - "mat_type": "matfree", - "ksp_type": "cg", - "ksp_atol": 0.0E0, - "ksp_rtol": 1.0E-8, - "ksp_max_it": 3, - "ksp_monitor": None, - "ksp_norm_type": "unpreconditioned", - "pc_type": "python", - "pc_python_type": "firedrake.FDMPC", - "fdm_pc_type": "cholesky", - "fdm_pc_factor_mat_solver_type": "mumps", - "fdm_pc_factor_mat_ordering_type": "nd", - }, appctx={"eta": eta, }) + 'mat_type': 'matfree', + 'ksp_type': 'cg', + 'ksp_atol': 0.0E0, + 'ksp_rtol': 1.0E-8, + 'ksp_max_it': 3, + 'ksp_monitor': None, + 'ksp_norm_type': 'unpreconditioned', + 'pc_type': 'python', + 'pc_python_type': 'firedrake.PoissonFDMPC', + 'fdm_pc_type': 'cholesky', + 'fdm_pc_factor_mat_solver_type': 'mumps', + 'fdm_pc_factor_mat_ordering_type': 'nd', + }, appctx={'eta': eta, }) solver.solve() assert solver.snes.ksp.getIterationNumber() == 1 - assert norm(u_exact-uh, "H1") < 1.0E-8 + assert norm(u_exact-uh, 'H1') < 1.0E-8 + + +@pytest.mark.skipcomplex +def test_static_condensation(mesh): + degree = 3 + quad_degree = 2*degree+1 + cell = mesh.ufl_cell() + e = FiniteElement('Lagrange', cell=cell, degree=degree, variant='fdm') + Z = FunctionSpace(mesh, MixedElement(*[RestrictedElement(e, d) for d in ("interior", "facet")])) + z = Function(Z) + u = sum(split(z)) + + f = Constant(1) + U = ((1/2)*inner(grad(u), grad(u)) - inner(u, f))*dx(degree=quad_degree) + F = derivative(U, z, TestFunction(Z)) + a = derivative(F, z, TrialFunction(Z)) + + subs = ['on_boundary'] + if mesh.cell_set._extruded: + subs += ['top', 'bottom'] + bcs = [DirichletBC(Z.sub(1), zero(), sub) for sub in subs] + + problem = LinearVariationalProblem(a, -F, z, bcs=bcs) + solver = LinearVariationalSolver(problem, solver_parameters={ + 'mat_type': 'matfree', + 'ksp_monitor': None, + 'ksp_type': 'preonly', + 'ksp_norm_type': 'unpreconditioned', + 'pc_type': 'python', + 'pc_python_type': 'firedrake.SCPC', + 'pc_sc_eliminate_fields': '0', + 'condensed_field': { + 'mat_type': 'matfree', + 'ksp_monitor': None, + 'ksp_type': 'preonly', + 'ksp_norm_type': 'unpreconditioned', + 'pc_type': 'python', + 'pc_python_type': 'firedrake.FDMPC', + 'fdm_pc_type': 'lu', + 'fdm_pc_mat_factor_solver_type': 'mumps' + } + }) + solver.solve() + residual = solver.snes.ksp.buildResidual() + assert residual.norm() < 1E-14 From 283fbc5f94ef33940c0c8962e60fba6d7aa8022e Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 7 Mar 2023 12:07:25 +0000 Subject: [PATCH 02/75] DROP BEFORE MERGE --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 04f377d84f..d1709a8e16 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,7 +48,7 
@@ jobs: - name: Build Firedrake run: | cd .. - ./firedrake/scripts/firedrake-install $COMPLEX --venv-name build --tinyasm --disable-ssh --minimal-petsc --slepc --documentation-dependencies --install thetis --install gusto --install icepack --install irksome --install femlium --no-package-manager || (cat firedrake-install.log && /bin/false) + ./firedrake/scripts/firedrake-install $COMPLEX --venv-name build --tinyasm --disable-ssh --minimal-petsc --slepc --documentation-dependencies --install thetis --install gusto --install icepack --install irksome --install femlium --no-package-manager --package-branch tsfc pbrubeck/fdm-discontinuous || (cat firedrake-install.log && /bin/false) - name: Install test dependencies run: | . ../build/bin/activate From aa38d80c8fa4140e5bc2642599f0d3d32f7ffdb1 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 7 Mar 2023 12:17:03 +0000 Subject: [PATCH 03/75] replace apostrophe with quotes on test --- tests/regression/test_fdm.py | 142 +++++++++++++++++------------------ 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 825ee81c36..6934bf0a53 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -3,19 +3,19 @@ fdmstar = { - 'mat_type': 'matfree', - 'ksp_type': 'cg', - 'ksp_atol': 0.0E0, - 'ksp_rtol': 1.0E-8, - 'ksp_norm_type': 'unpreconditioned', - 'ksp_monitor_true_residual': None, - 'ksp_converged_reason': None, - 'pc_type': 'python', - 'pc_python_type': 'firedrake.P1PC', - 'pmg_mg_coarse': { - 'mat_type': 'aij', - 'ksp_type': 'preonly', - 'pc_type': 'cholesky', + "mat_type": "matfree", + "ksp_type": "cg", + "ksp_atol": 0.0E0, + "ksp_rtol": 1.0E-8, + "ksp_norm_type": "unpreconditioned", + "ksp_monitor_true_residual": None, + "ksp_converged_reason": None, + "pc_type": "python", + "pc_python_type": "firedrake.P1PC", + "pmg_mg_coarse": { + "mat_type": "aij", + "ksp_type": "preonly", + "pc_type": "cholesky", }, "pmg_mg_levels": { "ksp_type": "chebyshev", @@ -40,10 +40,10 @@ @pytest.fixture(params=[2, 3], - ids=['Rectangle', 'Box']) + ids=["Rectangle", "Box"]) def mesh(request): nx = 4 - distribution = {'overlap_type': (DistributedMeshOverlapType.VERTEX, 1)} + distribution = {"overlap_type": (DistributedMeshOverlapType.VERTEX, 1)} m = UnitSquareMesh(nx, nx, quadrilateral=True, distribution_parameters=distribution) if request.param == 3: m = ExtrudedMesh(m, nx) @@ -62,7 +62,7 @@ def expected(mesh): return [8, 8, 8] -@pytest.fixture(params=[None, 'fdm'], ids=['spectral', 'fdm']) +@pytest.fixture(params=[None, "fdm"], ids=["spectral", "fdm"]) def variant(request): return request.param @@ -71,7 +71,7 @@ def variant(request): def test_p_independence(mesh, expected, variant): nits = [] for p in range(3, 6): - e = FiniteElement('Lagrange', cell=mesh.ufl_cell(), degree=p, variant=variant) + e = FiniteElement("Lagrange", cell=mesh.ufl_cell(), degree=p, variant=variant) V = FunctionSpace(mesh, e) u = TrialFunction(V) v = TestFunction(V) @@ -86,9 +86,9 @@ def test_p_independence(mesh, expected, variant): a = inner(grad(v), grad(u))*dx L = inner(v, B)*dx - subs = ('on_boundary',) + subs = ("on_boundary",) if mesh.cell_set._extruded: - subs += ('top', 'bottom') + subs += ("top", "bottom") bcs = [DirichletBC(V, u_exact, sub) for sub in subs] uh = Function(V) @@ -104,7 +104,7 @@ def test_p_independence(mesh, expected, variant): def test_variable_coefficient(mesh): ndim = mesh.geometric_dimension() k = 4 - V = FunctionSpace(mesh, 'Lagrange', k) + V = FunctionSpace(mesh, 
"Lagrange", k) u = TrialFunction(V) v = TestFunction(V) x = SpatialCoordinate(mesh) @@ -119,9 +119,9 @@ def test_variable_coefficient(mesh): a = (inner(grad(v), dot(alpha, grad(u))) + inner(v, beta*u))*dx(degree=3*k+2) L = inner(v, Constant(1))*dx - subs = ('on_boundary',) + subs = ("on_boundary",) if mesh.cell_set._extruded: - subs += ('top', 'bottom') + subs += ("top", "bottom") bcs = [DirichletBC(V, zero(V.ufl_element().value_shape()), sub) for sub in subs] uh = Function(V) @@ -131,22 +131,22 @@ def test_variable_coefficient(mesh): assert solver.snes.ksp.getIterationNumber() <= 14 -@pytest.fixture(params=['cg', 'dg', 'rt'], - ids=['cg', 'dg', 'rt']) +@pytest.fixture(params=["cg", "dg", "rt"], + ids=["cg", "dg", "rt"]) def fs(request, mesh): degree = 3 ndim = mesh.topological_dimension() cell = mesh.ufl_cell() element = request.param - variant = 'fdm_ipdg' - if element == 'rt': - family = 'RTCF' if ndim == 2 else 'NCF' + variant = "fdm_ipdg" + if element == "rt": + family = "RTCF" if ndim == 2 else "NCF" return FunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant)) else: if ndim == 1: - family = 'DG' if element == 'dg' else 'CG' + family = "DG" if element == "dg" else "CG" else: - family = 'DQ' if element == 'dg' else 'Q' + family = "DQ" if element == "dg" else "Q" return VectorFunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant), dim=5-ndim) @@ -187,31 +187,31 @@ def test_ipdg_direct_solver(fs): if ndim > 1: subs += (3,) if extruded: - subs += ('top',) + subs += ("top",) bcs = [DirichletBC(fs, u_exact, sub) for sub in subs] dirichlet_ids = subs - if 'on_boundary' in dirichlet_ids: + if "on_boundary" in dirichlet_ids: neumann_ids = [] else: make_tuple = lambda s: s if type(s) == tuple else (s,) neumann_ids = list(set(mesh.exterior_facets.unique_markers) - set(sum([make_tuple(s) for s in subs if type(s) != str], ()))) if extruded: - if 'top' not in dirichlet_ids: - neumann_ids.append('top') - if 'bottom' not in dirichlet_ids: - neumann_ids.append('bottom') + if "top" not in dirichlet_ids: + neumann_ids.append("top") + if "bottom" not in dirichlet_ids: + neumann_ids.append("bottom") dxq = dx(degree=quad_degree, domain=mesh) if extruded: dS_int = dS_v(degree=quad_degree) + dS_h(degree=quad_degree) - ds_ext = {'on_boundary': ds_v(degree=quad_degree), 'bottom': ds_b(degree=quad_degree), 'top': ds_t(degree=quad_degree)} + ds_ext = {"on_boundary": ds_v(degree=quad_degree), "bottom": ds_b(degree=quad_degree), "top": ds_t(degree=quad_degree)} ds_Dir = [ds_ext.get(s) or ds_v(s, degree=quad_degree) for s in dirichlet_ids] ds_Neu = [ds_ext.get(s) or ds_v(s, degree=quad_degree) for s in neumann_ids] else: dS_int = dS(degree=quad_degree) - ds_ext = {'on_boundary': ds(degree=quad_degree)} + ds_ext = {"on_boundary": ds(degree=quad_degree)} ds_Dir = [ds_ext.get(s) or ds(s, degree=quad_degree) for s in dirichlet_ids] ds_Neu = [ds_ext.get(s) or ds(s, degree=quad_degree) for s in neumann_ids] @@ -221,7 +221,7 @@ def test_ipdg_direct_solver(fs): h = CellVolume(mesh)/FacetArea(mesh) penalty = eta/h - outer_jump = lambda w, n: outer(w('+'), n('+')) + outer(w('-'), n('-')) + outer_jump = lambda w, n: outer(w("+"), n("+")) + outer(w("-"), n("-")) num_flux = lambda w: alpha(avg(penalty/2) * outer_jump(w, n)) num_flux_b = lambda w: alpha((penalty/2) * outer(w, n)) @@ -238,23 +238,23 @@ def test_ipdg_direct_solver(fs): problem = LinearVariationalProblem(a, L, uh, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters={ - 'mat_type': 'matfree', - 
'ksp_type': 'cg', - 'ksp_atol': 0.0E0, - 'ksp_rtol': 1.0E-8, - 'ksp_max_it': 3, - 'ksp_monitor': None, - 'ksp_norm_type': 'unpreconditioned', - 'pc_type': 'python', - 'pc_python_type': 'firedrake.PoissonFDMPC', - 'fdm_pc_type': 'cholesky', - 'fdm_pc_factor_mat_solver_type': 'mumps', - 'fdm_pc_factor_mat_ordering_type': 'nd', - }, appctx={'eta': eta, }) + "mat_type": "matfree", + "ksp_type": "cg", + "ksp_atol": 0.0E0, + "ksp_rtol": 1.0E-8, + "ksp_max_it": 3, + "ksp_monitor": None, + "ksp_norm_type": "unpreconditioned", + "pc_type": "python", + "pc_python_type": "firedrake.PoissonFDMPC", + "fdm_pc_type": "cholesky", + "fdm_pc_factor_mat_solver_type": "mumps", + "fdm_pc_factor_mat_ordering_type": "nd", + }, appctx={"eta": eta, }) solver.solve() assert solver.snes.ksp.getIterationNumber() == 1 - assert norm(u_exact-uh, 'H1') < 1.0E-8 + assert norm(u_exact-uh, "H1") < 1.0E-8 @pytest.mark.skipcomplex @@ -262,7 +262,7 @@ def test_static_condensation(mesh): degree = 3 quad_degree = 2*degree+1 cell = mesh.ufl_cell() - e = FiniteElement('Lagrange', cell=cell, degree=degree, variant='fdm') + e = FiniteElement("Lagrange", cell=cell, degree=degree, variant="fdm") Z = FunctionSpace(mesh, MixedElement(*[RestrictedElement(e, d) for d in ("interior", "facet")])) z = Function(Z) u = sum(split(z)) @@ -272,29 +272,29 @@ def test_static_condensation(mesh): F = derivative(U, z, TestFunction(Z)) a = derivative(F, z, TrialFunction(Z)) - subs = ['on_boundary'] + subs = ["on_boundary"] if mesh.cell_set._extruded: - subs += ['top', 'bottom'] + subs += ["top", "bottom"] bcs = [DirichletBC(Z.sub(1), zero(), sub) for sub in subs] problem = LinearVariationalProblem(a, -F, z, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters={ - 'mat_type': 'matfree', - 'ksp_monitor': None, - 'ksp_type': 'preonly', - 'ksp_norm_type': 'unpreconditioned', - 'pc_type': 'python', - 'pc_python_type': 'firedrake.SCPC', - 'pc_sc_eliminate_fields': '0', - 'condensed_field': { - 'mat_type': 'matfree', - 'ksp_monitor': None, - 'ksp_type': 'preonly', - 'ksp_norm_type': 'unpreconditioned', - 'pc_type': 'python', - 'pc_python_type': 'firedrake.FDMPC', - 'fdm_pc_type': 'lu', - 'fdm_pc_mat_factor_solver_type': 'mumps' + "mat_type": "matfree", + "ksp_monitor": None, + "ksp_type": "preonly", + "ksp_norm_type": "unpreconditioned", + "pc_type": "python", + "pc_python_type": "firedrake.SCPC", + "pc_sc_eliminate_fields": "0", + "condensed_field": { + "mat_type": "matfree", + "ksp_monitor": None, + "ksp_type": "preonly", + "ksp_norm_type": "unpreconditioned", + "pc_type": "python", + "pc_python_type": "firedrake.FDMPC", + "fdm_pc_type": "lu", + "fdm_pc_mat_factor_solver_type": "mumps" } }) solver.solve() From 8554c497414b9dc9f345bee1788fe48eed898ab8 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 7 Mar 2023 16:40:46 +0000 Subject: [PATCH 04/75] fix tests, support more general TP elements in BLAS prolongation kernels --- firedrake/preconditioners/fdm.py | 17 +- firedrake/preconditioners/pmg.py | 922 +++++++++++++++++++++++-------- tests/regression/test_fdm.py | 1 + 3 files changed, 701 insertions(+), 239 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 4542178d7c..88be79b597 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -75,7 +75,6 @@ def initialize(self, pc): use_amat = options.getBool("pc_use_amat", True) pmat_type = options.getString("mat_type", PETSc.Mat.Type.AIJ) - diagonal_scale = options.getBool("diagonal_scale", False) appctx = 
self.get_appctx(pc) fcp = appctx.get("form_compiler_parameters") @@ -152,7 +151,7 @@ def interp_nullspace(I, nsp): fcp=fcp, options_prefix=options_prefix) # Assemble the FDM preconditioner with sparse local matrices - Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, appctx, pmat_type, diagonal_scale) + Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, appctx, pmat_type) self._assemble_P() Pmat.setNullSpace(Amat.getNullSpace()) Pmat.setTransposeNullSpace(Amat.getTransposeNullSpace()) @@ -179,7 +178,7 @@ def interp_nullspace(I, nsp): fdmpc.setFromOptions() @PETSc.Log.EventDecorator("FDMPrealloc") - def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, appctx, pmat_type, diagonal_scale): + def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, appctx, pmat_type): """ Assemble the sparse preconditioner with cell-wise constant coefficients. @@ -309,8 +308,6 @@ def get_coeffs(e, result=None): else: Pmat = PETSc.Mat().createNest([[Pmats[Vrow, Vcol] for Vcol in V] for Vrow in V], comm=V.comm) - self.diag = None - @PETSc.Log.EventDecorator("FDMAssemble") def assemble_P(): for _assemble in assembly_callables: @@ -325,12 +322,6 @@ def assemble_P(): P.setValuesRCV(rows, rows, vals, addv) self.set_values(P, Vrow, Vcol, addv) Pmat.assemble() - if diagonal_scale: - diag = Pmat.getDiagonal(result=self.diag) - diag.sqrtabs() - diag.reciprocal() - Pmat.diagonalScale(L=diag, R=diag) - self.diag = diag return Pmat, assemble_P @@ -341,7 +332,7 @@ def update(self, pc): self._assemble_P() def apply(self, pc, x, y): - if hasattr(self, "_ctx_ref"): + if hasattr(self, "fdm_interp"): self.fdm_interp.multTranspose(x, self.work_vec_x) with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref): self.pc.apply(self.work_vec_x, self.work_vec_y) @@ -351,7 +342,7 @@ def apply(self, pc, x, y): self.pc.apply(x, y) def applyTranspose(self, pc, x, y): - if hasattr(self, "_ctx_ref"): + if hasattr(self, "fdm_interp"): self.fdm_interp.multTranspose(x, self.work_vec_y) with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref): self.pc.applyTranspose(self.work_vec_y, self.work_vec_x) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 88cc0f1b10..81791ebb77 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -7,7 +7,7 @@ get_function_space, set_function_space) from firedrake.solving_utils import _SNESContext from firedrake.tsfc_interface import extract_numbered_coefficients -from firedrake.utils import ScalarType_c, IntType_c +from firedrake.utils import ScalarType_c, IntType_c, cached_property from firedrake.petsc import PETSc import firedrake import ufl @@ -30,9 +30,9 @@ class PMGBase(PCSNESBase): or any other solver in firedrake may be applied to the coarse problem. Other PETSc options inspected by this class are: - - 'pmg_coarse_degree': polynomial degree of the coarse level - - 'pmg_coarse_mat_type': can be either 'aij' or 'matfree' - - 'pmg_coarse_form_compiler_mode': can be 'spectral' (default), 'vanilla', 'coffee', or 'tensor' + - 'pmg_mg_coarse_degree': polynomial degree of the coarse level + - 'pmg_mg_coarse_mat_type': can be either 'aij' or 'matfree' + - 'pmg_mg_coarse_form_compiler_mode': can be 'spectral' (default), 'vanilla', 'coffee', or 'tensor' - 'pmg_mg_levels_transfer_mat_type': can be either 'aij' or 'matfree' The p-coarsening is implemented in the `coarsen_element` routine. 
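[Note on the options listed above] These renamed knobs are read from the PETSc options database under the `pmg_` prefix, so they can be passed through `solver_parameters`. A minimal sketch is given below; the smoother and coarse-grid choices are illustrative placeholders, not settings prescribed by this patch:

    parameters = {
        "mat_type": "matfree",
        "ksp_type": "cg",
        "pc_type": "python",
        "pc_python_type": "firedrake.PMGPC",
        # coarse level: drop to degree 1 and assemble an AIJ matrix
        "pmg_mg_coarse_degree": 1,
        "pmg_mg_coarse_mat_type": "aij",
        "pmg_mg_coarse": {
            "ksp_type": "preonly",
            "pc_type": "cholesky",
        },
        # intermediate levels: matrix-free transfer with a simple smoother
        "pmg_mg_levels_transfer_mat_type": "matfree",
        "pmg_mg_levels": {
            "ksp_type": "chebyshev",
            "pc_type": "jacobi",
        },
    }
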
@@ -80,6 +80,10 @@ def initialize(self, pc): odm = pc.getDM() ctx = get_appctx(odm) + if ctx is None: + raise ValueError("No context found.") + if not isinstance(ctx, _SNESContext): + raise ValueError("Don't know how to get form from %r", ctx) test, trial = ctx.J.arguments() if test.function_space() != trial.function_space(): @@ -94,10 +98,10 @@ def initialize(self, pc): # Get the coarse degree from PETSc options fcp = ctx._problem.form_compiler_parameters mode = fcp.get("mode", "spectral") if fcp is not None else "spectral" - self.coarse_degree = opts.getInt("coarse_degree", default=1) - self.coarse_mat_type = opts.getString("coarse_mat_type", default=ctx.mat_type) - self.coarse_pmat_type = opts.getString("coarse_pmat_type", default=self.coarse_mat_type) - self.coarse_form_compiler_mode = opts.getString("coarse_form_compiler_mode", default=mode) + self.coarse_degree = opts.getInt("mg_coarse_degree", default=1) + self.coarse_mat_type = opts.getString("mg_coarse_mat_type", default=ctx.mat_type) + self.coarse_pmat_type = opts.getString("mg_coarse_pmat_type", default=self.coarse_mat_type) + self.coarse_form_compiler_mode = opts.getString("mg_coarse_form_compiler_mode", default=mode) # Construct a list with the elements we'll be using V = test.function_space() @@ -176,12 +180,15 @@ def coarsen(self, fdm, comm): fu = fproblem.u cu = firedrake.Function(cV) + is_linear = fu not in fctx.J.coefficients() + fdeg = PMGBase.max_degree(fV.ufl_element()) cdeg = PMGBase.max_degree(cV.ufl_element()) - fine_to_coarse_map = {fu: cu, - test: test.reconstruct(function_space=cV), + fine_to_coarse_map = {test: test.reconstruct(function_space=cV), trial: trial.reconstruct(function_space=cV)} + if not is_linear: + fine_to_coarse_map[fu] = cu def _coarsen_form(a): if isinstance(a, ufl.Form): @@ -195,6 +202,7 @@ def _coarsen_form(a): cJp = _coarsen_form(fctx.Jp) fcp = self.coarsen_quadrature(fproblem.form_compiler_parameters, fdeg, cdeg) cbcs = self.coarsen_bcs(fproblem.bcs, cV) + cF = self.coarsen_residual(cF, cJ, cu) # Coarsen the appctx: the user might want to provide solution-dependant expressions and forms cappctx = dict(fctx.appctx) @@ -227,7 +235,7 @@ def _coarsen_form(a): # Coarsen the problem and the _SNESContext cproblem = firedrake.NonlinearVariationalProblem(cF, cu, bcs=cbcs, J=cJ, Jp=cJp, form_compiler_parameters=fcp, - is_linear=fproblem.is_linear) + is_linear=is_linear) cctx = type(fctx)(cproblem, mat_type, pmat_type, appctx=cappctx, @@ -250,13 +258,16 @@ def _coarsen_form(a): cdm.setCreateInterpolation(self.create_interpolation) cdm.setCreateInjection(self.create_injection) - # injection of the initial state - def inject_state(mat): - with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf: - mat.multTranspose(xf, xc) + interp_petscmat, _ = cdm.createInterpolation(fdm) + inject_petscmat = cdm.createInjection(fdm) + + if not is_linear: + # injection of the initial state + def inject_state(): + with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf: + inject_petscmat.mult(xf, xc) - injection = self.create_injection(cdm, fdm) - add_hook(parent, setup=partial(inject_state, injection), call_setup=True) + add_hook(parent, setup=inject_state, call_setup=True) # coarsen the nullspace basis def coarsen_nullspace(coarse_V, mat, fine_nullspace): @@ -287,14 +298,13 @@ def coarsen_nullspace(coarse_V, mat, fine_nullspace): else: return fine_nullspace - I, _ = self.create_interpolation(cdm, fdm) ises = cV._ises - cctx._nullspace = coarsen_nullspace(cV, I, fctx._nullspace) + cctx._nullspace = coarsen_nullspace(cV, inject_petscmat, 
fctx._nullspace) cctx.set_nullspace(cctx._nullspace, ises, transpose=False, near=False) - cctx._nullspace_T = coarsen_nullspace(cV, I, fctx._nullspace_T) - cctx.set_nullspace(cctx._nullspace_T, ises, transpose=True, near=False) - cctx._near_nullspace = coarsen_nullspace(cV, injection, fctx._near_nullspace) + cctx._near_nullspace = coarsen_nullspace(cV, inject_petscmat, fctx._near_nullspace) cctx.set_nullspace(cctx._near_nullspace, ises, transpose=False, near=True) + cctx._nullspace_T = coarsen_nullspace(cV, interp_petscmat, fctx._nullspace_T) + cctx.set_nullspace(cctx._nullspace_T, ises, transpose=True, near=False) return cdm def coarsen_quadrature(self, metadata, fdeg, cdeg): @@ -315,25 +325,19 @@ def coarsen_bcs(self, fbcs, cV): for index in bc._indices: cV_ = cV_.sub(index) cbc_value = self.coarsen_bc_value(bc, cV_) - if type(bc) == firedrake.DirichletBC: - cbcs.append(firedrake.DirichletBC(cV_, cbc_value, - bc.sub_domain)) + if isinstance(bc, firedrake.DirichletBC): + cbcs.append(bc.reconstruct(V=cV, g=cbc_value)) else: raise NotImplementedError("Unsupported BC type, please get in touch if you need this") return cbcs @staticmethod @lru_cache(maxsize=20) - def create_transfer(cctx, fctx, mat_type, cbcs, fbcs, inject): + def create_transfer(cctx, fctx, mat_type, cbcs, fbcs): cbcs = cctx._problem.bcs if cbcs else [] fbcs = fctx._problem.bcs if fbcs else [] - if inject: - cV = cctx._problem.u - fV = fctx._problem.u - else: - cV = cctx.J.arguments()[0].function_space() - fV = fctx.J.arguments()[0].function_space() - + cV = cctx.J.arguments()[0].function_space() + fV = fctx.J.arguments()[0].function_space() if mat_type == "matfree": return prolongation_matrix_matfree(fV, cV, fbcs, cbcs) elif mat_type == "aij": @@ -344,13 +348,12 @@ def create_transfer(cctx, fctx, mat_type, cbcs, fbcs, inject): def create_interpolation(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - return self.create_transfer(get_appctx(dmc), get_appctx(dmf), mat_type, True, False, False), None + return self.create_transfer(get_appctx(dmc), get_appctx(dmf), mat_type, True, False), None def create_injection(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - I = self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False, True) - return PETSc.Mat().createTranspose(I) + return self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False) @staticmethod def max_degree(ele): @@ -392,7 +395,7 @@ def reconstruct_degree(ele, degree): if isinstance(ele, ufl.VectorElement): return type(ele)(PMGBase.reconstruct_degree(ele._sub_element, degree), dim=ele.num_sub_elements()) elif isinstance(ele, ufl.TensorElement): - return type(ele)(PMGBase.reconstruct_degree(ele._sub_element, degree), shape=ele.value_shape(), symmetry=ele.symmetry()) + return type(ele)(PMGBase.reconstruct_degree(ele._sub_element, degree), shape=ele._shape, symmetry=ele.symmetry()) elif isinstance(ele, ufl.EnrichedElement): shift = degree-PMGBase.max_degree(ele) return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele._elements)) @@ -404,8 +407,10 @@ def reconstruct_degree(ele, degree): return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele.sub_elements())) elif isinstance(ele, ufl.WithMapping): return type(ele)(PMGBase.reconstruct_degree(ele.wrapee, degree), ele.mapping()) - elif 
isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement, ufl.RestrictedElement)): + elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement)): return type(ele)(PMGBase.reconstruct_degree(ele._element, degree)) + elif isinstance(ele, ufl.RestrictedElement): + return type(ele)(PMGBase.reconstruct_degree(ele._element, degree), restriction_domain=ele._restriction_domain) else: return ele.reconstruct(degree=degree) @@ -431,8 +436,7 @@ def configure_pmg(self, pc, pdm): # for the user, if they haven't already; I don't know any # other way to get PETSc to know this at the right time. opts = PETSc.Options(pc.getOptionsPrefix() + "pmg_") - if "mg_coarse_pc_mg_levels" not in opts: - opts["mg_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1 + opts["mg_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1 return ppc @@ -443,7 +447,10 @@ def applyTranspose(self, pc, x, y): return self.ppc.applyTranspose(x, y) def coarsen_bc_value(self, bc, cV): - return firedrake.zero(cV.shape) + return 0 + + def coarsen_residual(self, Fc, Jc, uc): + return ufl.action(Jc, uc) class PMGSNES(SNESBase, PMGBase): @@ -475,10 +482,8 @@ def configure_pmg(self, snes, pdm): # for the user, if they haven't already; I don't know any # other way to get PETSc to know this at the right time. opts = PETSc.Options(snes.getOptionsPrefix() + "pfas_") - if "fas_coarse_pc_mg_levels" not in opts: - opts["fas_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1 - if "fas_coarse_snes_fas_levels" not in opts: - opts["fas_coarse_snes_fas_levels"] = odm.getRefineLevel() + 1 + opts["fas_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1 + opts["fas_coarse_snes_fas_levels"] = odm.getRefineLevel() + 1 return psnes @@ -499,6 +504,89 @@ def coarsen_bc_value(self, bc, cV): coarse.interpolate(bc._original_arg) return coarse + def coarsen_residual(self, Fc, Jc, uc): + return Fc + + +def load_c_code(code, name, argtypes, comm): + from pyop2.compilation import load + from pyop2.utils import get_petsc_dir + cppargs = ["-I%s/include" % d for d in get_petsc_dir()] + ldargs = (["-L%s/lib" % d for d in get_petsc_dir()] + + ["-Wl,-rpath,%s/lib" % d for d in get_petsc_dir()] + + ["-lpetsc", "-lm"]) + return load(code, "c", name, argtypes=argtypes, + cppargs=cppargs, ldargs=ldargs, + comm=comm) + + +def reference_moments(*args, **kwargs): + import ctypes + from tsfc import compile_form + quad_degree = 1+sum([PMGBase.max_degree(t.ufl_element()) for t in args]) + form = ufl.inner(*args)*ufl.dx(degree=quad_degree) + kernel, = compile_form(form, parameters=dict(mode="spectral"), + log=PETSc.Log.isActive(), **kwargs) + op2kernel = op2.Kernel(kernel.ast, kernel.name, + requires_zeroed_output_arguments=True, + flop_count=kernel.flop_count, + events=(kernel.event,)) + code = op2kernel.code.gencode().replace("static inline void", "void") + coords = None + mesh = form.ufl_domain() + if len(kernel.arguments) > 3-len(form.arguments()): + mesh_element = mesh.coordinates.function_space().finat_element + nodes = mesh_element.fiat_equivalent.dual.get_nodes() + points = [list(node.get_point_dict().keys())[0] for node in nodes] + coords = numpy.array(points, dtype=PETSc.ScalarType) + + argtypes = [ctypes.c_voidp]*len(kernel.arguments) + funptr = load_c_code(code, op2kernel.code.name, argtypes, mesh.comm) + + def _wrapper(*args): + args[0].fill(0.0E0) + _args = list(args) + if coords is not None: + _args.insert(1, coords) + return funptr(*[a.ctypes.data for a in _args]) + + return _wrapper + + +@lru_cache(maxsize=10) +def matfree_reference_prolongator(Vf, Vc): 
+ dimf = Vf.value_size * Vf.finat_element.space_dimension() + dimc = Vc.value_size * Vc.finat_element.space_dimension() + build_Afc = reference_moments(ufl.TestFunction(Vf), ufl.TrialFunction(Vc)) + apply_Aff = reference_moments(ufl.TestFunction(Vf), ufl.Coefficient(Vf)) + diag_Aff = reference_moments(ufl.TestFunction(Vf), ufl.TrialFunction(Vf), diagonal=True) + Ax = numpy.empty((dimf,), dtype=PETSc.ScalarType) + Dx = numpy.empty((dimf,), dtype=PETSc.ScalarType) + diagonal = numpy.empty((dimf,), dtype=PETSc.ScalarType) + result = numpy.empty((dimf, dimc), dtype=PETSc.ScalarType) + + def _afun(x): + nonlocal Ax, Dx, diagonal + numpy.multiply(x, diagonal, out=Dx) + apply_Aff(Ax, Dx) + numpy.multiply(Ax, diagonal, out=Ax) + return Ax + + if Vf.comm.rank == 0: + from scipy.sparse.linalg import cg, LinearOperator + build_Afc(result) + diag_Aff(diagonal) + numpy.sqrt(diagonal, out=diagonal) + numpy.reciprocal(diagonal, out=diagonal) + A = LinearOperator((dimf, dimf), _afun, dtype=result.dtype) + for k in range(dimc): + numpy.multiply(result[:, k], diagonal, out=result[:, k]) + result[:, k], _ = cg(A, result[:, k], tol=1E-12) + numpy.multiply(result[:, k], diagonal, out=result[:, k]) + + result = Vf.comm.bcast(result, root=0) + return result + def prolongation_transfer_kernel_action(Vf, expr): from tsfc import compile_expression_dual_evaluation @@ -515,12 +603,12 @@ def prolongation_transfer_kernel_action(Vf, expr): events=(kernel.event,)), coefficients -@lru_cache(maxsize=10) def expand_element(ele): """ Expand a FiniteElement as an EnrichedElement of TensorProductElements, discarding modifiers. """ - if ele.cell() == ufl.quadrilateral: + + if ele.cell().cellname().startswith("quadrilateral"): quadrilateral_tpc = ufl.TensorProductCell(ufl.interval, ufl.interval) return expand_element(ele.reconstruct(cell=quadrilateral_tpc)) elif ele.cell() == ufl.hexahedron: @@ -528,12 +616,14 @@ def expand_element(ele): return expand_element(ele.reconstruct(cell=hexahedron_tpc)) elif isinstance(ele, (ufl.TensorElement, ufl.VectorElement)): return expand_element(ele._sub_element) - elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement, ufl.RestrictedElement)): + elif isinstance(ele, ufl.MixedElement): + return type(ele)(*[expand_element(e) for e in ele.sub_elements()]) + elif isinstance(ele, ufl.RestrictedElement): + return type(ele)(expand_element(ele._element), restriction_domain=ele._restriction_domain) + elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement)): return expand_element(ele._element) elif isinstance(ele, ufl.WithMapping): return expand_element(ele.wrapee) - elif isinstance(ele, ufl.MixedElement): - return ufl.MixedElement(*[expand_element(e) for e in ele.sub_elements()]) elif isinstance(ele, ufl.EnrichedElement): terms = [] for e in ele._elements: @@ -542,6 +632,7 @@ def expand_element(ele): terms.extend(ee._elements) else: terms.append(ee) + cell, = set([t.cell() for t in terms]) return ufl.EnrichedElement(*terms) elif isinstance(ele, ufl.TensorProductElement): factors = [expand_element(e) for e in ele.sub_elements()] @@ -552,55 +643,188 @@ def expand_element(ele): f_factors = f.sub_elements() if isinstance(f, ufl.TensorProductElement) else (f,) new_terms.extend([t_factors + f_factors for t_factors in terms]) terms = new_terms + if len(terms) == 1: return ufl.TensorProductElement(*terms[0]) else: - return ufl.EnrichedElement(*[ufl.TensorProductElement(*k) for k in terms]) + terms = [ufl.TensorProductElement(*k) for k in terms] + cell, = set([t.cell() for 
t in terms]) + return ufl.EnrichedElement(*terms) else: return ele +def evaluate_dual(dual, element, key=None): + keys = set(tuple(phi.get_point_dict().keys()) for phi in dual) + pts = list(set(sum(keys, ()))) + if key is None: + key = (0, ) * len(pts[0]) + tab = element.tabulate(sum(key), pts)[key] + result = numpy.empty((len(dual), element.space_dimension()), dtype=tab.dtype) + zero = [(0.0, ())] + for k, phi in enumerate(dual): + wts = phi.get_point_dict() + wts = numpy.array([wts.get(pt, zero)[0][0] for pt in pts]) + result[k] = tab.dot(wts).T + return result + + +def compare_element(e1, e2): + if e1 is e2: + return True + if e1.space_dimension() != e2.space_dimension(): + return False + B = evaluate_dual(e1.dual_basis(), e2) + numpy.fill_diagonal(B, numpy.diagonal(B)-1.0) + return numpy.allclose(B, 0.0, rtol=1E-14, atol=1E-14) + + +def compare_dual(b1, b2): + p1 = b1.get_point_dict() + p2 = b2.get_point_dict() + if len(p1) != len(p2): + return False + + k1 = numpy.array(list(p1.keys())) + k2 = numpy.array(list(p2.keys())) + if not numpy.allclose(k1, k2, rtol=1E-16, atol=1E-16): + return False + + k1 = numpy.array([p1[k][0][0] for k in p1]) + k2 = numpy.array([p2[k][0][0] for k in p2]) + if not numpy.allclose(k1, k2, rtol=1E-16, atol=1E-16): + return False + return True + + +def compare_dual_basis(l1, l2): + if len(l1) != len(l2): + return False + for b1, b2 in zip(l1, l2): + if not compare_dual(b1, b2): + return False + return True + + +@lru_cache(maxsize=10) +@PETSc.Log.EventDecorator("GetLineElements") def get_line_elements(V): from FIAT.reference_element import LINE from tsfc.finatinterface import create_element ele = V.ufl_element() if isinstance(ele, ufl.MixedElement) and not isinstance(ele, (ufl.TensorElement, ufl.VectorElement)): raise ValueError("MixedElements are not decomposed into tensor products") - rvs = ele.reference_value_size() - ele = expand_element(ele) - if isinstance(ele, ufl.EnrichedElement): - ele = ele._elements[0] + ele = expand_element(ele) finat_ele = create_element(ele) - if rvs*finat_ele.space_dimension() != V.value_size*V.finat_element.space_dimension(): - raise ValueError("Failed to decompose %s into a single tensor product" % V.ufl_element()) - factors = finat_ele.factors if hasattr(finat_ele, "factors") else (finat_ele,) + if finat_ele.space_dimension() != V.finat_element.space_dimension(): + raise ValueError("Failed to decompose %s into tensor products" % V.ufl_element()) + + def cyclic_perm(a): + return [a[i:] + a[:i] for i in range(len(a))] + + permutations = [] line_elements = [] - for e in reversed(factors): - fiat_ele = e.fiat_equivalent - if fiat_ele.get_reference_element().shape != LINE: - raise ValueError("Expecting %s to be on the interval" % fiat_ele) - line_elements.append(fiat_ele) - return line_elements + axes_shifts = [] + + terms = finat_ele.elements if hasattr(finat_ele, "elements") else [finat_ele] + for term in terms: + factors = term.factors if hasattr(term, "factors") else (term,) + expansion = tuple(e.fiat_equivalent for e in reversed(factors)) + if not all([e.get_reference_element().shape == LINE for e in expansion]): + raise ValueError("Failed to decompose %s into line elements" % V.ufl_element()) + + shift = -1 + for k, perm in enumerate(permutations): + is_perm = all([e1.space_dimension() == e2.space_dimension() + for e1, e2 in zip(perm, expansion)]) + for e1, e2 in zip(perm, expansion): + if is_perm: + is_perm = compare_element(e1, e2) + + if is_perm: + shift = len(expansion) - k + axes_shifts[-1] = axes_shifts[-1] + (shift, ) 
+ break + + if shift == -1: + line_elements.append(expansion) + axes_shifts.append((0, )) + permutations = cyclic_perm(expansion) + + return line_elements, axes_shifts @lru_cache(maxsize=10) -def get_line_interpolator(felem, celem): - from FIAT import functional, make_quadrature +def fiat_reference_prolongator(felem, celem, derivative=False): + from FIAT.reference_element import flatten_reference_cube + + ref_el = flatten_reference_cube(felem.get_reference_element()) + tdim = ref_el.get_spatial_dimension() + if derivative and tdim > 1: + raise NotImplementedError("Derivative prolongator is only available on the interval") + ckey = (felem.formdegree,) if derivative else (0,)*tdim + fkey = (celem.formdegree,) if derivative else (0,)*tdim + fdual = felem.dual_basis() cdual = celem.dual_basis() - if len(fdual) == len(cdual): - if all(f.get_point_dict() == c.get_point_dict() for f, c in zip(fdual, cdual)): - return numpy.array([]) + if fkey == ckey and compare_dual_basis(fdual, cdual): + return numpy.array([]) + return evaluate_dual(fdual, celem, ckey) + - if all(isinstance(phi, functional.PointEvaluation) for phi in fdual): - pts = [list(phi.get_point_dict().keys())[0] for phi in fdual] - return celem.tabulate(0, pts)[(0,)] +@lru_cache(maxsize=10) +def finat_reference_prolongator(felem, celem): + from finat.quadrature import make_quadrature + from gem.interpreter import evaluate + + ref_el = felem.cell + ndim = ref_el.get_spatial_dimension() + degree = felem.degree + try: + degree = max(degree) + except TypeError: + pass + quad_degree = 2*degree+1 + + def _tabulate(e, ps, entity=None): + results = evaluate(e.basis_evaluation(0, ps, entity).values()) + return results[0].arr.reshape((len(ps.points), -1)) + + is_facet_element = True + entity_dofs = felem.entity_dofs() + for key in entity_dofs: + v = sum(list(entity_dofs[key].values()), []) + if len(v): + edim = sum(key) if type(key) == tuple else key + if edim == ndim: + is_facet_element = False + + if is_facet_element and degree > 5: + entities = [] + quadratures = [] + for key in ref_el.sub_entities: + edim = sum(key) if type(key) == tuple else key + if edim == ndim-1: + sub_entities = ref_el.sub_entities[key] + entities.extend([(key, f) for f in sub_entities]) + quadratures.extend([make_quadrature(ref_el.construct_subelement(key), quad_degree)]*len(sub_entities)) + + wts = numpy.concatenate([evaluate([q.weight_expression])[0].arr.reshape((-1,)) for q in quadratures]) + cphi = numpy.concatenate([_tabulate(celem, q.point_set, entity=e) for q, e in zip(quadratures, entities)]).T + fphi = numpy.concatenate([_tabulate(felem, q.point_set, entity=e) for q, e in zip(quadratures, entities)]).T else: - pts = make_quadrature(felem.get_reference_element(), - felem.space_dimension()).get_points() - return numpy.dot(celem.tabulate(0, pts)[(0,)], - numpy.linalg.inv(felem.tabulate(0, pts)[(0,)])) + quadrature = make_quadrature(ref_el, quad_degree) + wts = evaluate([quadrature.weight_expression])[0].arr.reshape((-1,)) + cphi = _tabulate(celem, quadrature.point_set).T + fphi = _tabulate(felem, quadrature.point_set).T + + numpy.sqrt(wts, out=wts) + numpy.multiply(fphi, wts, out=fphi) + numpy.multiply(cphi, wts, out=cphi) + cphi = cphi.reshape((celem.space_dimension(), -1)) + fphi = fphi.reshape((felem.space_dimension(), -1)) + return numpy.linalg.solve(fphi.dot(fphi.T), fphi.dot(cphi.T)) # Common kernel to compute y = kron(A3, kron(A2, A1)) * x @@ -609,7 +833,7 @@ def get_line_interpolator(felem, celem): #include #include -static inline void kronmxv(PetscBLASInt 
tflag, +static inline void kronmxv_inplace(PetscBLASInt tflag, PetscBLASInt mx, PetscBLASInt my, PetscBLASInt mz, PetscBLASInt nx, PetscBLASInt ny, PetscBLASInt nz, PetscBLASInt nel, PetscScalar *A1, PetscScalar *A2, PetscScalar *A3, @@ -675,42 +899,206 @@ def get_line_interpolator(felem, celem): *y = ptr[ires]; return; } -""" +static inline void kronmxv(PetscBLASInt tflag, + PetscBLASInt mx, PetscBLASInt my, PetscBLASInt mz, + PetscBLASInt nx, PetscBLASInt ny, PetscBLASInt nz, PetscBLASInt nel, + PetscScalar *A1, PetscScalar *A2, PetscScalar *A3, + PetscScalar *x, PetscScalar *y, PetscScalar *xwork, PetscScalar *ywork){ + + PetscScalar *ptr[2] = {xwork, ywork}; + + if(ptr[0] != x) + for(PetscBLASInt j=0; j 3: - raise ValueError("More than three factors are not supported") + if(ptr[1] != y) + for(PetscBLASInt j=0; j 3 or len(celems) > 3: + raise ValueError("The expansion is too complicated") + + shifts = fshifts + in_place = False + if len(felems) == len(celems): + in_place = all([(len(fs)*Vf.value_size == len(cs)*Vc.value_size) for fs, cs in zip(fshifts, cshifts)]) + psize = Vf.value_size + + if not in_place: + if len(celems) == 1: + psize = Vc.value_size + pelem = celems[0] + perm_name = "perm_%s" % t_in + celems = celems*len(felems) + elif len(felems) == 1: + shifts = cshifts + psize = Vf.value_size + pelem = felems[0] + perm_name = "perm_%s" % t_out + felems = felems*len(celems) + else: + raise ValueError("Cannot assign fine to coarse DOFs") + + for k in range(len(shifts)): + if Vc.value_size*len(shifts[k]) < Vf.value_size: + shifts[k] = shifts[k]*(Vf.value_size//Vc.value_size) + + perm = sum(shifts, tuple()) + perm_data = ", ".join(map(str, perm)) + operator_decl.append(f""" + PetscBLASInt {perm_name}[{len(perm)}] = {{ {perm_data} }}; + """) + + pshape = [e.space_dimension() for e in pelem] + pargs = ", ".join(map(str, pshape+[1]*(3-len(pshape)))) + pstride = psize * numpy.prod(pshape) + if shifts == fshifts: + prolong_code.append(f""" + for({IntType_c} j=1; j<{len(perm)}; j++) + permute_axis({perm_name}[j], {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); + """) + restrict_code.append(f""" + for({IntType_c} j=1; j<{len(perm)}; j++) + ipermute_axis({perm_name}[j], {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); + """) + + fskip = 0 + cskip = 0 + Jlen = 0 + Jmats = [] + fshapes = [] + cshapes = [] + has_code = False + for felem, celem, shift in zip(felems, celems, shifts): + if len(felem) != len(celem): + raise ValueError("Fine and coarse elements do not have the same number of factors") + if len(felem) > 3: + raise ValueError("More than three factors are not supported") + + # Declare array shapes to be used as literals inside the kernels + nscal = psize*len(shift) + fshape = [e.space_dimension() for e in felem] + cshape = [e.space_dimension() for e in celem] + fshapes.append((nscal,) + tuple(fshape)) + cshapes.append((nscal,) + tuple(cshape)) + + J = [fiat_reference_prolongator(fe, ce).T for fe, ce in zip(felem, celem)] + if any([Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J]): + prolong_code.append(f""" + for({IntType_c} i=0; i<{nscal*numpy.prod(fshape)}; i++) {t_out}[i+{fskip}] = 0.0E0; + """) + restrict_code.append(f""" + for({IntType_c} i=0; i<{nscal*numpy.prod(cshape)}; i++) {t_in}[i+{cskip}] = 0.0E0; + """) + else: + Jsize = numpy.cumsum([Jlen]+[Jk.size for Jk in J]) + Jptrs = ["%s+%d" % (mat_name, Jsize[k]) if J[k].size else "NULL" for k in range(len(J))] + Jmats.extend(J) + Jlen = Jsize[-1] + + # The Kronecker product routines assume 3D shapes, so in 1D and 2D we pass 
NULL instead of J + Jargs = ", ".join(Jptrs+["NULL"]*(3-len(Jptrs))) + fargs = ", ".join(map(str, fshape+[1]*(3-len(fshape)))) + cargs = ", ".join(map(str, cshape+[1]*(3-len(cshape)))) + if in_place: + prolong_code.append(f""" + kronmxv_inplace(0, {fargs}, {cargs}, {nscal}, {Jargs}, &{t_in}, &{t_out}); + """) + restrict_code.append(f""" + kronmxv_inplace(1, {cargs}, {fargs}, {nscal}, {Jargs}, &{t_out}, &{t_in}); + """) + elif shifts == fshifts: + if has_code and psize > 1: + raise ValueError("Single tensor product to many tensor products not implemented for vectors") + # Single tensor product to many + prolong_code.append(f""" + kronmxv(0, {fargs}, {cargs}, {nscal}, {Jargs}, {t_in}+{cskip}, {t_out}+{fskip}, {scratch}, {t_out}+{fskip}); + """) + restrict_code.append(f""" + kronmxv(1, {cargs}, {fargs}, {nscal}, {Jargs}, {t_out}+{fskip}, {t_in}+{cskip}, {t_out}+{fskip}, {scratch}); + """) + else: + # Many tensor products to single tensor product + if has_code: + raise ValueError("Many tensor products to single tensor product not implemented") + fskip = 0 + prolong_code.append(f""" + kronmxv(0, {fargs}, {cargs}, {nscal}, {Jargs}, {t_in}+{cskip}, {t_out}+{fskip}, {t_in}+{cskip}, {t_out}+{fskip}); + """) + restrict_code.append(f""" + kronmxv(1, {cargs}, {fargs}, {nscal}, {Jargs}, {t_out}+{fskip}, {t_in}+{cskip}, {t_out}+{fskip}, {t_in}+{cskip}); + """) + has_code = True + fskip += nscal*numpy.prod(fshape) + cskip += nscal*numpy.prod(cshape) # Pass the 1D interpolators as a hexadecimal string - J = [get_line_interpolator(fe, ce) for fe, ce in zip(felems, celems)] - Jdata = ", ".join(map(float.hex, chain(*[Jk.flat for Jk in J]))) - Jsize = numpy.cumsum([0]+[Jk.size for Jk in J]) - Jptrs = ["%s+%d" % (mat_name, Jsize[k]) if J[k].size else "NULL" for k in range(len(J))] - - # The Kronecker product routines assume 3D shapes, so in 1D and 2D we pass NULL instead of J - Jargs = ", ".join(Jptrs+["NULL"]*(3-len(Jptrs))) - fargs = ", ".join(map(str, fshape+[1]*(3-len(fshape)))) - cargs = ", ".join(map(str, cshape+[1]*(3-len(cshape)))) - operator_decl = f""" - PetscScalar {mat_name}[{Jsize[-1]}] = {{ {Jdata} }}; - """ - prolong_code = f""" - kronmxv(0, {fargs}, {cargs}, {nscal}, {Jargs}, &{t_in}, &{t_out}); - """ - restrict_code = f""" - kronmxv(1, {cargs}, {fargs}, {nscal}, {Jargs}, &{t_out}, &{t_in}); - """ + Jdata = ", ".join(map(float.hex, chain(*[Jk.flat for Jk in Jmats]))) + operator_decl.append(f""" + PetscScalar {mat_name}[{Jlen}] = {{ {Jdata} }}; + """) + + operator_decl = "".join(operator_decl) + prolong_code = "".join(prolong_code) + restrict_code = "".join(reversed(restrict_code)) + shapes = [tuple(map(max, zip(*fshapes))), tuple(map(max, zip(*cshapes)))] return operator_decl, prolong_code, restrict_code, shapes @@ -754,6 +1142,8 @@ def cache_generate_code(kernel, comm): def make_mapping_code(Q, fmapping, cmapping, t_in, t_out): + if fmapping == cmapping: + return None domain = Q.ufl_domain() A = get_piola_tensor(cmapping, domain, inverse=False) B = get_piola_tensor(fmapping, domain, inverse=True) @@ -795,28 +1185,20 @@ def make_mapping_code(Q, fmapping, cmapping, t_in, t_out): return coef_decl, prolong_code, restrict_code, mapping_code, coefficients -def get_axes_shift(ele): - """Return the form degree of a FInAT element after discarding modifiers""" - if hasattr(ele, "element"): - return get_axes_shift(ele.element) - else: - return ele.formdegree - - def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name): - shift = get_axes_shift(V.finat_element) - tdim = 
V.mesh().topological_dimension() - if shift % tdim: + _, shifts = get_line_elements(V) + shift = shifts[0] + if shift != (0,): ndof = numpy.prod(vshape) permutation = numpy.reshape(numpy.arange(ndof), pshape) - axes = numpy.arange(tdim) + axes = numpy.arange(len(shift)) for k in range(permutation.shape[0]): - permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, -shift*k)), pshape[1:]) + permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, -shift[k])), pshape[1:]) nflip = 0 mapping = V.ufl_element().mapping().lower() if mapping == "contravariant piola": # flip the sign of the first component - nflip = ndof//tdim + nflip = ndof//len(shift) elif mapping == "covariant piola": # flip the order of reference components permutation = numpy.flip(permutation, axis=0) @@ -850,30 +1232,38 @@ def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name): return decl, prolong, restrict +@PETSc.Log.EventDecorator("GetPermutedMap") def get_permuted_map(V): """ Return a PermutedMap with the same tensor product shape for every component of H(div) or H(curl) tensor product elements """ - shift = get_axes_shift(V.finat_element) - if shift % V.mesh().topological_dimension() == 0: + expansion, shifts = get_line_elements(V) + if {(0, )} == set(shifts): return V.cell_node_map() - elements = get_line_elements(V) - axes = numpy.arange(len(elements)) - pshape = [-1] + [e.space_dimension() for e in elements] - permutation = numpy.reshape(numpy.arange(V.finat_element.space_dimension()), pshape) - for k in range(permutation.shape[0]): - permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, shift*k)), pshape[1:]) + istart = 0 + perm = [] + for factors, shift in zip(expansion, shifts): + axes = numpy.arange(len(factors)) + pshape = [len(shift)] + [e.space_dimension() for e in factors] + iend = istart + numpy.prod(pshape) + permutation = numpy.reshape(numpy.arange(istart, iend), pshape) + for k in range(permutation.shape[0]): + permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, shift[k])), pshape[1:]) + perm.extend(permutation.flat) + istart = iend - permutation = numpy.reshape(permutation, (-1,)) - return PermutedMap(V.cell_node_map(), permutation) + return PermutedMap(V.cell_node_map(), perm) class StandaloneInterpolationMatrix(object): """ Interpolation matrix for a single standalone space. 
""" + + _cache_work = {} + def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): self.Vf_bcs = Vf_bcs self.Vc_bcs = Vc_bcs @@ -881,29 +1271,47 @@ def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): self.uf = Vf Vf = Vf.function_space() else: - self.uf = firedrake.Function(Vf) + self.uf = self._cache_work.get(Vf, firedrake.Function(Vf)) + self._cache_work[Vf] = self.uf if isinstance(Vc, firedrake.Function): self.uc = Vc Vc = Vc.function_space() else: - self.uc = firedrake.Function(Vc) - - self.weight = self.multiplicity(Vf) - with self.weight.dat.vec as w: + self.uc = self._cache_work.get(Vc, firedrake.Function(Vc)) + self._cache_work[Vc] = self.uc + self.Vf = Vf + self.Vc = Vc + + @cached_property + def _weight(self): + weight = firedrake.Function(self.Vf) + size = self.Vf.finat_element.space_dimension() * self.Vf.value_size + kernel_code = f""" + void weight(PetscScalar *restrict w){{ + for(PetscInt i=0; i<{size}; i++) w[i] += 1.0; + return; + }} + """ + kernel = op2.Kernel(kernel_code, "weight", requires_zeroed_output_arguments=True) + op2.par_loop(kernel, weight.cell_set, weight.dat(op2.INC, weight.cell_node_map())) + with weight.dat.vec as w: w.reciprocal() + return weight + @cached_property + def _kernels(self): try: - uf_map = get_permuted_map(Vf) - uc_map = get_permuted_map(Vc) - prolong_kernel, restrict_kernel, coefficients = self.make_blas_kernels(Vf, Vc) + uf_map = get_permuted_map(self.Vf) + uc_map = get_permuted_map(self.Vc) + prolong_kernel, restrict_kernel, coefficients = self.make_blas_kernels(self.Vf, self.Vc) prolong_args = [prolong_kernel, self.uf.cell_set, self.uf.dat(op2.INC, uf_map), self.uc.dat(op2.READ, uc_map), - self.weight.dat(op2.READ, uf_map)] + self._weight.dat(op2.READ, uf_map)] except ValueError: - uf_map = Vf.cell_node_map() - uc_map = Vc.cell_node_map() - prolong_kernel, restrict_kernel, coefficients = self.make_kernels(Vf, Vc) + uf_map = self.Vf.cell_node_map() + uc_map = self.Vc.cell_node_map() + prolong_kernel, restrict_kernel, coefficients = self.make_kernels(self.Vf, self.Vc) prolong_args = [prolong_kernel, self.uf.cell_set, self.uf.dat(op2.WRITE, uf_map), self.uc.dat(op2.READ, uc_map)] @@ -911,10 +1319,38 @@ def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): restrict_args = [restrict_kernel, self.uf.cell_set, self.uc.dat(op2.INC, uc_map), self.uf.dat(op2.READ, uf_map), - self.weight.dat(op2.READ, uf_map)] + self._weight.dat(op2.READ, uf_map)] coefficient_args = [c.dat(op2.READ, c.cell_node_map()) for c in coefficients] - self._prolong = partial(op2.par_loop, *prolong_args, *coefficient_args) - self._restrict = partial(op2.par_loop, *restrict_args, *coefficient_args) + prolong = partial(op2.par_loop, *prolong_args, *coefficient_args) + restrict = partial(op2.par_loop, *restrict_args, *coefficient_args) + return prolong, restrict + + def view(self, mat, viewer=None): + if viewer is None: + return + typ = viewer.getType() + if typ != PETSc.Viewer.Type.ASCII: + return + viewer.printfASCII("Firedrake matrix-free prolongator %s\n" % + type(self).__name__) + + def getInfo(self, mat, info=None): + from mpi4py import MPI + memory = self.uf.dat.nbytes + self.uc.dat.nbytes + if self._weight is not None: + memory += self._weight.dat.nbytes + if info is None: + info = PETSc.Mat.InfoType.GLOBAL_SUM + if info == PETSc.Mat.InfoType.LOCAL: + return {"memory": memory} + elif info == PETSc.Mat.InfoType.GLOBAL_SUM: + gmem = mat.comm.tompi4py().allreduce(memory, op=MPI.SUM) + return {"memory": gmem} + elif info == PETSc.Mat.InfoType.GLOBAL_MAX: + gmem = mat.comm.tompi4py().allreduce(memory, 
op=MPI.MAX) + return {"memory": gmem} + else: + raise ValueError("Unknown info type %s" % info) @staticmethod def make_blas_kernels(Vf, Vc): @@ -922,8 +1358,8 @@ def make_blas_kernels(Vf, Vc): Interpolation and restriction kernels between CG / DG tensor product spaces on quads and hexes. - Works by tabulating the coarse 1D Lagrange basis - functions as the (fdegree+1)-by-(cdegree+1) matrix Jhat, + Works by tabulating the coarse 1D basis functions + as the (fdegree+1)-by-(cdegree+1) matrix Jhat, and using the fact that the 2D / 3D tabulation is the tensor product J = kron(Jhat, kron(Jhat, Jhat)) """ @@ -936,43 +1372,47 @@ def make_blas_kernels(Vf, Vc): coefficients = [] mapping_code = "" coef_decl = "" + if fmapping == cmapping: # interpolate on each direction via Kroncker product - operator_decl, prolong_code, restrict_code, shapes = make_kron_code(Vf, Vc, "t0", "t1", "J0") + operator_decl, prolong_code, restrict_code, shapes = make_kron_code(Vf, Vc, "t0", "t1", "J0", "t2") else: decl = [""]*4 prolong = [""]*5 restrict = [""]*5 # get embedding element for Vf with identity mapping and collocated vector component DOFs try: - Q = Vf if fmapping == "identity" else firedrake.FunctionSpace(Vf.ufl_domain(), - felem.reconstruct(mapping="identity")) - mapping_output = make_mapping_code(Q, fmapping, cmapping, "t0", "t1") + qelem = felem + if qelem.mapping() != "identity": + qelem = qelem.reconstruct(mapping="identity") + Qf = Vf if qelem == felem else firedrake.FunctionSpace(Vf.ufl_domain(), qelem) + mapping_output = make_mapping_code(Qf, fmapping, cmapping, "t0", "t1") in_place_mapping = True except Exception: - Qe = ufl.FiniteElement("DQ", cell=felem.cell(), degree=PMGBase.max_degree(felem)) + qelem = ufl.FiniteElement("DQ", cell=felem.cell(), degree=PMGBase.max_degree(felem)) if felem.value_shape(): - Qe = ufl.TensorElement(Qe, shape=felem.value_shape(), symmetry=felem.symmetry()) - Q = firedrake.FunctionSpace(Vf.ufl_domain(), Qe) - mapping_output = make_mapping_code(Q, fmapping, cmapping, "t0", "t1") - - qshape = (Q.value_size, Q.finat_element.space_dimension()) - # interpolate to embedding fine space, permute to FInAT ordering, and apply the mapping - decl[0], prolong[0], restrict[0], shapes = make_kron_code(Q, Vc, "t0", "t1", "J0") - decl[1], restrict[1], prolong[1] = make_permutation_code(Vc, qshape, shapes[0], "t0", "t1", "perm0") - coef_decl, prolong[2], restrict[2], mapping_code, coefficients = mapping_output - - if not in_place_mapping: - # permute to Kronecker-friendly ordering and interpolate to fine space - decl[2], prolong[3], restrict[3] = make_permutation_code(Vf, qshape, shapes[0], "t1", "t0", "perm1") - decl[3], prolong[4], restrict[4], _shapes = make_kron_code(Vf, Q, "t0", "t1", "J1") - shapes.extend(_shapes) + qelem = ufl.TensorElement(qelem, shape=felem._shape, symmetry=felem.symmetry()) + Qf = firedrake.FunctionSpace(Vf.ufl_domain(), qelem) + mapping_output = make_mapping_code(Qf, fmapping, cmapping, "t0", "t1") + + qshape = (Qf.value_size, Qf.finat_element.space_dimension()) + # interpolate to embedding fine space + decl[0], prolong[0], restrict[0], shapes = make_kron_code(Qf, Vc, "t0", "t1", "J0", "t2") + + if mapping_output is not None: + # permute to FInAT ordering, and apply the mapping + decl[1], restrict[1], prolong[1] = make_permutation_code(Vc, qshape, shapes[0], "t0", "t1", "perm0") + coef_decl, prolong[2], restrict[2], mapping_code, coefficients = mapping_output + if not in_place_mapping: + # permute to Kronecker-friendly ordering and interpolate to fine space + 
decl[2], prolong[3], restrict[3] = make_permutation_code(Vf, qshape, shapes[0], "t1", "t0", "perm1") + decl[3], prolong[4], restrict[4], _shapes = make_kron_code(Vf, Qf, "t0", "t1", "J1", "t2") + shapes.extend(_shapes) operator_decl = "".join(decl) prolong_code = "".join(prolong) restrict_code = "".join(reversed(restrict)) - lwork = numpy.prod([max(*dims) for dims in zip(*shapes)]) # FInAT elements order the component DOFs related to the same node contiguously. # We transpose before and after the multiplication times J to have each component # stored contiguously as a scalar field, thus reducing the number of dgemm calls. @@ -982,6 +1422,10 @@ def make_blas_kernels(Vf, Vc): fshape = (Vf.value_size, Vf.finat_element.space_dimension()) cshape = (Vc.value_size, Vc.finat_element.space_dimension()) + + lwork = numpy.prod([max(*dims) for dims in zip(*shapes)]) + lwork = max(lwork, max(numpy.prod(fshape), numpy.prod(cshape))) + if cshape[0] == 1: coarse_read = f"""for({IntType_c} i=0; i<{numpy.prod(cshape)}; i++) t0[i] = x[i];""" coarse_write = f"""for({IntType_c} i=0; i<{numpy.prod(cshape)}; i++) x[i] += t0[i];""" @@ -1017,9 +1461,10 @@ def make_blas_kernels(Vf, Vc): void prolongation(PetscScalar *restrict y, const PetscScalar *restrict x, const PetscScalar *restrict w{coef_decl}){{ - PetscScalar work[2][{lwork}]; + PetscScalar work[3][{lwork}] = {{0.0E0}}; PetscScalar *t0 = work[0]; PetscScalar *t1 = work[1]; + PetscScalar *t2 = work[2]; {operator_decl} {coarse_read} {prolong_code} @@ -1029,9 +1474,10 @@ def make_blas_kernels(Vf, Vc): void restriction(PetscScalar *restrict x, const PetscScalar *restrict y, const PetscScalar *restrict w{coef_decl}){{ - PetscScalar work[2][{lwork}]; + PetscScalar work[3][{lwork}] = {{0.0E0}}; PetscScalar *t0 = work[0]; PetscScalar *t1 = work[1]; + PetscScalar *t2 = work[2]; {operator_decl} {fine_read} {restrict_code} @@ -1052,47 +1498,67 @@ def make_kernels(self, Vf, Vc): This is temporary while we wait for dual evaluation in FInAT. """ - prolong_kernel, _ = prolongation_transfer_kernel_action(Vf, self.uc) - matrix_kernel, coefficients = prolongation_transfer_kernel_action(Vf, firedrake.TestFunction(Vc)) - # The way we transpose the prolongation kernel is suboptimal. - # A local matrix is generated each time the kernel is executed. 
- element_kernel = loopy.generate_code_v2(matrix_kernel.code).device_code() - element_kernel = element_kernel.replace("void expression_kernel", "static void expression_kernel") - dimc = Vc.finat_element.space_dimension() * Vc.value_size - dimf = Vf.finat_element.space_dimension() * Vf.value_size - - coef_args = "".join([", c%d" % i for i in range(len(coefficients))]) - coef_decl = "".join([", const %s *restrict c%d" % (ScalarType_c, i) for i in range(len(coefficients))]) - restrict_code = f""" - {element_kernel} - - void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w{coef_decl}) - {{ - {ScalarType_c} Afc[{dimf}*{dimc}] = {{0}}; - expression_kernel(Afc{coef_args}); - for ({IntType_c} i = 0; i < {dimf}; i++) - for ({IntType_c} j = 0; j < {dimc}; j++) - Rc[j] += Afc[i*{dimc} + j] * Rf[i] * w[i]; - }} - """ - restrict_kernel = op2.Kernel(restrict_code, "restriction", requires_zeroed_output_arguments=True) - return prolong_kernel, restrict_kernel, coefficients + try: + prolong_kernel, _ = prolongation_transfer_kernel_action(Vf, self.uc) + matrix_kernel, coefficients = prolongation_transfer_kernel_action(Vf, firedrake.TestFunction(Vc)) + # The way we transpose the prolongation kernel is suboptimal. + # A local matrix is generated each time the kernel is executed. + element_kernel = loopy.generate_code_v2(matrix_kernel.code).device_code() + element_kernel = element_kernel.replace("void expression_kernel", "static void expression_kernel") + coef_args = "".join([", c%d" % i for i in range(len(coefficients))]) + coef_decl = "".join([", const %s *restrict c%d" % (ScalarType_c, i) for i in range(len(coefficients))]) + dimc = Vc.finat_element.space_dimension() * Vc.value_size + dimf = Vf.finat_element.space_dimension() * Vf.value_size + restrict_code = f""" + {element_kernel} + + void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w{coef_decl}) + {{ + {ScalarType_c} Afc[{dimf}*{dimc}] = {{0}}; + expression_kernel(Afc{coef_args}); + for ({IntType_c} i = 0; i < {dimf}; i++) + for ({IntType_c} j = 0; j < {dimc}; j++) + Rc[j] += Afc[i*{dimc} + j] * Rf[i] * w[i]; + }} + """ + restrict_kernel = op2.Kernel(restrict_code, "restriction", requires_zeroed_output_arguments=True) + except NotImplementedError: + if Vc.ufl_element().mapping() != Vf.ufl_element().mapping(): + raise NotImplementedError("Prolongation not supported from %s to %s" % (Vc.ufl_element(), Vf.ufl_element())) + if Vf.finat_element.space_dimension() < 400: + Jmat = finat_reference_prolongator(Vf.finat_element, Vc.finat_element) + else: + Jmat = matfree_reference_prolongator(Vf, Vc) + dimf, dimc = Jmat.shape + vsize = (Vc.value_size*Vc.finat_element.space_dimension())//dimc + Jdata = ", ".join(map(float.hex, Jmat.flat)) + kernel_code = f""" + void prolongation({ScalarType_c} *restrict uf, const {ScalarType_c} *restrict uc) + {{ + {ScalarType_c} Afc[{dimf}*{dimc}] = {{ {Jdata} }}; + for ({IntType_c} i = 0; i < {vsize}*{dimf}; i++) + uf[i] = 0.0E0; + + for ({IntType_c} i = 0; i < {dimf}; i++) + for ({IntType_c} j = 0; j < {dimc}; j++) + for ({IntType_c} k = 0; k < {vsize}; k++) + uf[i*{vsize}+k] += Afc[i*{dimc} + j] * uc[j*{vsize}+k]; + }} + + void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w) + {{ + {ScalarType_c} Afc[{dimf}*{dimc}] = {{ {Jdata} }}; + for ({IntType_c} i = 0; i < {dimf}; i++) + for ({IntType_c} j = 0; j < {dimc}; j++) + for ({IntType_c} k = 0; k < 
{vsize}; k++) + Rc[j*{vsize}+k] += Afc[i*{dimc} + j] * Rf[i*{vsize}+k] * w[i*{vsize}+k]; + }} + """ + prolong_kernel = op2.Kernel(kernel_code, "prolongation", requires_zeroed_output_arguments=True) + restrict_kernel = op2.Kernel(kernel_code, "restriction", requires_zeroed_output_arguments=True) + coefficients = [] - @staticmethod - def multiplicity(V): - # Lawrence's magic code for calculating dof multiplicities - shapes = (V.finat_element.space_dimension(), - numpy.prod(V.shape)) - domain = "{[i,j]: 0 <= i < %d and 0 <= j < %d}" % shapes - instructions = """ - for i, j - w[i,j] = w[i,j] + 1 - end - """ - weight = firedrake.Function(V) - firedrake.par_loop((domain, instructions), firedrake.dx, - {"w": (weight, op2.INC)}, is_loopy_kernel=True) - return weight + return prolong_kernel, restrict_kernel, coefficients def multTranspose(self, mat, rf, rc): """ @@ -1105,7 +1571,7 @@ def multTranspose(self, mat, rf, rc): with self.uc.dat.vec_wo as uc: uc.set(0.0E0) - self._restrict() + self._kernels[1]() for bc in self.Vc_bcs: bc.zero(self.uc) @@ -1123,7 +1589,7 @@ def mult(self, mat, xc, xf, inc=False): with self.uf.dat.vec_wo as uf: uf.set(0.0E0) - self._prolong() + self._kernels[0]() for bc in self.Vf_bcs: bc.zero(self.uf) @@ -1146,25 +1612,29 @@ class MixedInterpolationMatrix(StandaloneInterpolationMatrix): """ Interpolation matrix for a mixed finite element space. """ - def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): - self.Vf_bcs = Vf_bcs - self.Vc_bcs = Vc_bcs - self.uf = Vf if isinstance(Vf, firedrake.Function) else firedrake.Function(Vf) - self.uc = Vc if isinstance(Vc, firedrake.Function) else firedrake.Function(Vc) + @cached_property + def _weight(self): + return None - self.standalones = [] + @cached_property + def _standalones(self): + standalones = [] for (i, (uf_sub, uc_sub)) in enumerate(zip(self.uf.subfunctions, self.uc.subfunctions)): - Vf_sub_bcs = [bc for bc in Vf_bcs if bc.function_space().index == i] - Vc_sub_bcs = [bc for bc in Vc_bcs if bc.function_space().index == i] + Vf_sub_bcs = [bc for bc in self.Vf_bcs if bc.function_space().index == i] + Vc_sub_bcs = [bc for bc in self.Vc_bcs if bc.function_space().index == i] standalone = StandaloneInterpolationMatrix(uf_sub, uc_sub, Vf_sub_bcs, Vc_sub_bcs) - self.standalones.append(standalone) + standalones.append(standalone) + return standalones - self._prolong = lambda: [standalone._prolong() for standalone in self.standalones] - self._restrict = lambda: [standalone._restrict() for standalone in self.standalones] + @cached_property + def _kernels(self): + prolong = lambda: [standalone._kernels[0]() for standalone in self._standalones] + restrict = lambda: [standalone._kernels[1]() for standalone in self._standalones] + return prolong, restrict def getNestSubMatrix(self, i, j): if i == j: - s = self.standalones[i] + s = self._standalones[i] sizes = (s.uf.dof_dset.layout_vec.getSizes(), s.uc.dof_dset.layout_vec.getSizes()) M_shll = PETSc.Mat().createPython(sizes, s, comm=s.uf._comm) M_shll.setUp() diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 6934bf0a53..cefb464680 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -12,6 +12,7 @@ "ksp_converged_reason": None, "pc_type": "python", "pc_python_type": "firedrake.P1PC", + "pmg_coarse_mat_type": "aij", "pmg_mg_coarse": { "mat_type": "aij", "ksp_type": "preonly", From d0a7b01d197398ded05dc9fabdc05c538e180e1f Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 8 Mar 2023 10:50:19 +0000 Subject: [PATCH 05/75] pfas tests now 
passing, but need to create the transpose of injection --- firedrake/preconditioners/pmg.py | 46 ++++++++++++++++------------- tests/multigrid/test_p_multigrid.py | 23 ++++++++------- 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 81791ebb77..bc533370c0 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -48,6 +48,7 @@ class PMGBase(PCSNESBase): """ _prefix = "pmg_" + _is_linear = False def coarsen_element(self, ele): """ @@ -95,13 +96,19 @@ def initialize(self, pc): pdm = PETSc.DMShell().create(comm=pc.comm) pdm.setOptionsPrefix(options_prefix) + self.ppc = self.configure_pmg(pc, pdm) + self.ppc.setFromOptions() + + print(self.ppc.getOptionsPrefix()) + copts = PETSc.Options(self.ppc.getOptionsPrefix()+self.ppc.getType()+"_coarse_") + # Get the coarse degree from PETSc options fcp = ctx._problem.form_compiler_parameters mode = fcp.get("mode", "spectral") if fcp is not None else "spectral" - self.coarse_degree = opts.getInt("mg_coarse_degree", default=1) - self.coarse_mat_type = opts.getString("mg_coarse_mat_type", default=ctx.mat_type) - self.coarse_pmat_type = opts.getString("mg_coarse_pmat_type", default=self.coarse_mat_type) - self.coarse_form_compiler_mode = opts.getString("mg_coarse_form_compiler_mode", default=mode) + self.coarse_degree = copts.getInt("degree", default=1) + self.coarse_mat_type = copts.getString("mat_type", default=ctx.mat_type) + self.coarse_pmat_type = copts.getString("pmat_type", default=self.coarse_mat_type) + self.coarse_form_compiler_mode = copts.getString("form_compiler_mode", default=mode) # Construct a list with the elements we'll be using V = test.function_space() @@ -125,8 +132,8 @@ def initialize(self, pc): pdm.setCreateInterpolation(self.create_interpolation) # We need this for p-FAS pdm.setCreateInjection(self.create_injection) - pdm.setSNESJacobian(_SNESContext.form_jacobian) pdm.setSNESFunction(_SNESContext.form_function) + pdm.setSNESJacobian(_SNESContext.form_jacobian) pdm.setKSPComputeOperators(_SNESContext.compute_operators) set_function_space(pdm, get_function_space(odm)) @@ -135,11 +142,9 @@ def initialize(self, pc): assert parent is not None add_hook(parent, setup=partial(push_parent, pdm, parent), teardown=partial(pop_parent, pdm, parent), call_setup=True) add_hook(parent, setup=partial(push_appctx, pdm, ctx), teardown=partial(pop_appctx, pdm, ctx), call_setup=True) - - self.ppc = self.configure_pmg(pc, pdm) - self.ppc.setFromOptions() self.ppc.setUp() + def update(self, pc): pass @@ -147,7 +152,8 @@ def view(self, pc, viewer=None): if viewer is None: viewer = PETSc.Viewer.STDOUT viewer.printfASCII("p-multigrid PC\n") - self.ppc.view(viewer) + if hasattr(self, "ppc"): + self.ppc.view(viewer) def destroy(self, pc): if hasattr(self, "ppc"): @@ -180,14 +186,12 @@ def coarsen(self, fdm, comm): fu = fproblem.u cu = firedrake.Function(cV) - is_linear = fu not in fctx.J.coefficients() - fdeg = PMGBase.max_degree(fV.ufl_element()) cdeg = PMGBase.max_degree(cV.ufl_element()) fine_to_coarse_map = {test: test.reconstruct(function_space=cV), trial: trial.reconstruct(function_space=cV)} - if not is_linear: + if not self._is_linear: fine_to_coarse_map[fu] = cu def _coarsen_form(a): @@ -235,7 +239,7 @@ def _coarsen_form(a): # Coarsen the problem and the _SNESContext cproblem = firedrake.NonlinearVariationalProblem(cF, cu, bcs=cbcs, J=cJ, Jp=cJp, form_compiler_parameters=fcp, - is_linear=is_linear) + is_linear=self._is_linear) cctx = 
type(fctx)(cproblem, mat_type, pmat_type, appctx=cappctx, @@ -261,11 +265,11 @@ def _coarsen_form(a): interp_petscmat, _ = cdm.createInterpolation(fdm) inject_petscmat = cdm.createInjection(fdm) - if not is_linear: + if not self._is_linear: # injection of the initial state def inject_state(): with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf: - inject_petscmat.mult(xf, xc) + inject_petscmat.multTranspose(xf, xc) add_hook(parent, setup=inject_state, call_setup=True) @@ -326,7 +330,7 @@ def coarsen_bcs(self, fbcs, cV): cV_ = cV_.sub(index) cbc_value = self.coarsen_bc_value(bc, cV_) if isinstance(bc, firedrake.DirichletBC): - cbcs.append(bc.reconstruct(V=cV, g=cbc_value)) + cbcs.append(bc.reconstruct(V=cV_, g=cbc_value)) else: raise NotImplementedError("Unsupported BC type, please get in touch if you need this") return cbcs @@ -353,7 +357,8 @@ def create_interpolation(self, dmc, dmf): def create_injection(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - return self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False) + I = self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False) + return PETSc.Mat().createTranspose(I) @staticmethod def max_degree(ele): @@ -417,6 +422,7 @@ def reconstruct_degree(ele, degree): class PMGPC(PCBase, PMGBase): _prefix = "pmg_" + _is_linear = True def configure_pmg(self, pc, pdm): odm = pc.getDM() @@ -455,6 +461,7 @@ def coarsen_residual(self, Fc, Jc, uc): class PMGSNES(SNESBase, PMGBase): _prefix = "pfas_" + _is_linear = False def configure_pmg(self, snes, pdm): odm = snes.getDM() @@ -470,7 +477,6 @@ def configure_pmg(self, snes, pdm): psnes.setFunction(fun, f.duplicate(), args=args, kargs=kargs) pdm.setGlobalVector(f.duplicate()) - self.dummy = f.duplicate() psnes.setSolution(f.duplicate()) # PETSc unfortunately requires us to make an ugly hack. 
@@ -491,7 +497,7 @@ def step(self, snes, x, f, y): ctx = get_appctx(snes.dm) push_appctx(self.ppc.dm, ctx) x.copy(y) - self.ppc.solve(snes.vec_rhs or self.dummy, y) + self.ppc.solve(snes.vec_rhs or None, y) y.aypx(-1, x) snes.setConvergedReason(self.ppc.getConvergedReason()) pop_appctx(self.ppc.dm) @@ -1391,7 +1397,7 @@ def make_blas_kernels(Vf, Vc): except Exception: qelem = ufl.FiniteElement("DQ", cell=felem.cell(), degree=PMGBase.max_degree(felem)) if felem.value_shape(): - qelem = ufl.TensorElement(qelem, shape=felem._shape, symmetry=felem.symmetry()) + qelem = ufl.TensorElement(qelem, shape=felem.value_shape(), symmetry=felem.symmetry()) Qf = firedrake.FunctionSpace(Vf.ufl_domain(), qelem) mapping_output = make_mapping_code(Qf, fmapping, cmapping, "t0", "t1") diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index 17e4e33296..17f4ba790e 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -52,7 +52,7 @@ def test_prolongation_matrix_matfree(): if u != v: v.assign(0) P = prolongation_matrix_matfree(v, u).getPythonContext() - P._prolong() + P._kernels[0]() assert norm(v-expr, "L2") < tol @@ -240,7 +240,8 @@ def test_p_multigrid_mixed(mat_type): "ksp_max_it": 3, "pc_type": "jacobi"} - coarse = {"ksp_type": "richardson", + coarse = {"mat_type": "aij", + "ksp_type": "richardson", "ksp_max_it": 1, "ksp_norm_type": "unpreconditioned", "ksp_monitor": None, @@ -255,12 +256,12 @@ def test_p_multigrid_mixed(mat_type): "ksp_monitor_true_residual": None, "pc_type": "python", "pc_python_type": "firedrake.PMGPC", - # "mat_type": mat_type, # FIXME bug with mat-free jacobi on MixedFunctionSpace + "mat_type": mat_type, "pmg_pc_mg_type": "multiplicative", "pmg_mg_levels": relax, "pmg_mg_coarse": coarse} - basis = VectorSpaceBasis([assemble(TestFunction(Z.sub(1))*dx)]) + basis = VectorSpaceBasis([interpolate(Constant(1), Z.sub(1))]) basis.orthonormalize() nullspace = MixedVectorSpaceBasis(Z, [Z.sub(0), basis]) problem = NonlinearVariationalProblem(F, z, bcs) @@ -313,6 +314,7 @@ def test_p_fas_scalar(): atol = rtol * Fnorm coarse = { + "mat_type": "aij", "ksp_type": "preonly", "ksp_norm_type": None, "pc_type": "cholesky"} @@ -321,7 +323,6 @@ def test_p_fas_scalar(): "ksp_type": "chebyshev", "ksp_monitor_true_residual": None, "ksp_norm_type": "unpreconditioned", - "ksp_max_it": 3, "pc_type": "jacobi"} pmg = { @@ -340,7 +341,7 @@ def test_p_fas_scalar(): "pmg_mg_coarse": coarse} pfas = { - "mat_type": "aij", + "mat_type": mat_type, "snes_monitor": None, "snes_converged_reason": None, "snes_atol": atol, @@ -364,23 +365,23 @@ def test_p_fas_scalar(): @pytest.mark.skipcomplex def test_p_fas_nonlinear_scalar(): mat_type = "matfree" - N = 4 - dxq = dx(degree=3*N+2) # here we also test coarsening of quadrature degree + degree = 4 + dxq = dx(degree=3*degree+2) # here we also test coarsening of quadrature degree mesh = UnitSquareMesh(4, 4, quadrilateral=True) - V = FunctionSpace(mesh, "CG", N) + V = FunctionSpace(mesh, "CG", degree) u = Function(V) f = Constant(1) bcs = DirichletBC(V, 0, "on_boundary") # Regularized p-Laplacian p = 5 - eps = 1 + eps = Constant(1) y = eps + inner(grad(u), grad(u)) E = (1/p)*(y**(p/2))*dxq - inner(f, u)*dxq F = derivative(E, u, TestFunction(V)) - fcp = {"quadrature_degree": 3*N+2} + fcp = {"quadrature_degree": 3*degree+2} problem = NonlinearVariationalProblem(F, u, bcs, form_compiler_parameters=fcp) # Due to the convoluted nature of the nested iteration From 5c5d861ce43acf4252c7f42beab769e16f09dde2 Mon Sep 
17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 8 Mar 2023 15:31:43 +0000 Subject: [PATCH 06/75] fix tests --- firedrake/preconditioners/pmg.py | 28 +++++++++++----------------- tests/multigrid/test_p_multigrid.py | 2 +- tests/regression/test_fdm.py | 1 - 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index bc533370c0..c0378afd8f 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -48,7 +48,6 @@ class PMGBase(PCSNESBase): """ _prefix = "pmg_" - _is_linear = False def coarsen_element(self, ele): """ @@ -92,7 +91,6 @@ def initialize(self, pc): prefix = pc.getOptionsPrefix() options_prefix = prefix + self._prefix - opts = PETSc.Options(options_prefix) pdm = PETSc.DMShell().create(comm=pc.comm) pdm.setOptionsPrefix(options_prefix) @@ -144,7 +142,6 @@ def initialize(self, pc): add_hook(parent, setup=partial(push_appctx, pdm, ctx), teardown=partial(pop_appctx, pdm, ctx), call_setup=True) self.ppc.setUp() - def update(self, pc): pass @@ -189,10 +186,9 @@ def coarsen(self, fdm, comm): fdeg = PMGBase.max_degree(fV.ufl_element()) cdeg = PMGBase.max_degree(cV.ufl_element()) - fine_to_coarse_map = {test: test.reconstruct(function_space=cV), + fine_to_coarse_map = {fu: cu, + test: test.reconstruct(function_space=cV), trial: trial.reconstruct(function_space=cV)} - if not self._is_linear: - fine_to_coarse_map[fu] = cu def _coarsen_form(a): if isinstance(a, ufl.Form): @@ -238,8 +234,7 @@ def _coarsen_form(a): # Coarsen the problem and the _SNESContext cproblem = firedrake.NonlinearVariationalProblem(cF, cu, bcs=cbcs, J=cJ, Jp=cJp, - form_compiler_parameters=fcp, - is_linear=self._is_linear) + form_compiler_parameters=fcp) cctx = type(fctx)(cproblem, mat_type, pmat_type, appctx=cappctx, @@ -265,11 +260,11 @@ def _coarsen_form(a): interp_petscmat, _ = cdm.createInterpolation(fdm) inject_petscmat = cdm.createInjection(fdm) - if not self._is_linear: + if cu in cJ.coefficients(): # injection of the initial state def inject_state(): with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf: - inject_petscmat.multTranspose(xf, xc) + inject_petscmat.mult(xf, xc) add_hook(parent, setup=inject_state, call_setup=True) @@ -303,9 +298,9 @@ def coarsen_nullspace(coarse_V, mat, fine_nullspace): return fine_nullspace ises = cV._ises - cctx._nullspace = coarsen_nullspace(cV, inject_petscmat, fctx._nullspace) + cctx._nullspace = coarsen_nullspace(cV, interp_petscmat, fctx._nullspace) cctx.set_nullspace(cctx._nullspace, ises, transpose=False, near=False) - cctx._near_nullspace = coarsen_nullspace(cV, inject_petscmat, fctx._near_nullspace) + cctx._near_nullspace = coarsen_nullspace(cV, interp_petscmat, fctx._near_nullspace) cctx.set_nullspace(cctx._near_nullspace, ises, transpose=False, near=True) cctx._nullspace_T = coarsen_nullspace(cV, interp_petscmat, fctx._nullspace_T) cctx.set_nullspace(cctx._nullspace_T, ises, transpose=True, near=False) @@ -352,13 +347,14 @@ def create_transfer(cctx, fctx, mat_type, cbcs, fbcs): def create_interpolation(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - return self.create_transfer(get_appctx(dmc), get_appctx(dmf), mat_type, True, False), None + interpolate = self.create_transfer(get_appctx(dmc), get_appctx(dmf), mat_type, True, False) + rscale = interpolate.createVecRight() # only used as a workaround in the creation of coarse vecs + return interpolate, rscale def 
create_injection(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - I = self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False) - return PETSc.Mat().createTranspose(I) + return self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False) @staticmethod def max_degree(ele): @@ -422,7 +418,6 @@ def reconstruct_degree(ele, degree): class PMGPC(PCBase, PMGBase): _prefix = "pmg_" - _is_linear = True def configure_pmg(self, pc, pdm): odm = pc.getDM() @@ -461,7 +456,6 @@ def coarsen_residual(self, Fc, Jc, uc): class PMGSNES(SNESBase, PMGBase): _prefix = "pfas_" - _is_linear = False def configure_pmg(self, snes, pdm): odm = snes.getDM() diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index 17f4ba790e..89d7c94efb 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -261,7 +261,7 @@ def test_p_multigrid_mixed(mat_type): "pmg_mg_levels": relax, "pmg_mg_coarse": coarse} - basis = VectorSpaceBasis([interpolate(Constant(1), Z.sub(1))]) + basis = VectorSpaceBasis([assemble(TestFunction(Z.sub(1))*dx)]) basis.orthonormalize() nullspace = MixedVectorSpaceBasis(Z, [Z.sub(0), basis]) problem = NonlinearVariationalProblem(F, z, bcs) diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index cefb464680..6934bf0a53 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -12,7 +12,6 @@ "ksp_converged_reason": None, "pc_type": "python", "pc_python_type": "firedrake.P1PC", - "pmg_coarse_mat_type": "aij", "pmg_mg_coarse": { "mat_type": "aij", "ksp_type": "preonly", From d9a743ca098d8a72905178294b66727d3f0437da Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 8 Mar 2023 16:20:30 +0000 Subject: [PATCH 07/75] fix docs --- firedrake/preconditioners/fdm.py | 12 ++++++++---- firedrake/preconditioners/pmg.py | 5 +++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 88be79b597..1cd769ae27 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -182,10 +182,12 @@ def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, appctx, pmat_type """ Assemble the sparse preconditioner with cell-wise constant coefficients. - :arg V: the :class:`firedrake.FunctionSpace` of the form arguments + :arg V: the :class:`.FunctionSpace` of the form arguments :arg J: the Jacobian bilinear form :arg bcs: an iterable of boundary conditions on V + :arg form_compiler_parameters: parameters to assemble diagonal factors :arg appctx: the application context + :pmat_type: the preconditioner `PETSc.Mat.Type` :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable """ @@ -1139,8 +1141,10 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): Assemble the stiffness matrix in the FDM basis using Kronecker products of interval matrices :arg A: the :class:`PETSc.Mat` to assemble - :arg Vrow: the :class:`firedrake.FunctionSpace` test space - :arg Vcol: the :class:`firedrake.FunctionSpace` trial space + :arg Vrow: the :class:`.FunctionSpace` test space + :arg Vcol: the :class:`.FunctionSpace` trial space + :arg addv: a `PETSc.Mat.InsertMode` + :arg triu: are we assembling only the upper triangular part? 
""" set_values_csr = self.load_set_values(triu=triu) update_A = lambda A, Ae, rindices: set_values_csr(A, Ae, rindices, rindices, addv) @@ -1632,7 +1636,7 @@ def get_interior_facet_maps(V): """ Extrude V.interior_facet_node_map and V.ufl_domain().interior_facets.local_facet_dat - :arg V: a :class:`FunctionSpace` + :arg V: a :class:`.FunctionSpace` :returns: the 3-tuple of facet_to_nodes_fun: maps interior facets to the nodes of the two cells sharing it, diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index c0378afd8f..74120b81a2 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -234,7 +234,8 @@ def _coarsen_form(a): # Coarsen the problem and the _SNESContext cproblem = firedrake.NonlinearVariationalProblem(cF, cu, bcs=cbcs, J=cJ, Jp=cJp, - form_compiler_parameters=fcp) + form_compiler_parameters=fcp, + is_linear=fproblem.is_linear) cctx = type(fctx)(cproblem, mat_type, pmat_type, appctx=cappctx, @@ -471,7 +472,7 @@ def configure_pmg(self, snes, pdm): psnes.setFunction(fun, f.duplicate(), args=args, kargs=kargs) pdm.setGlobalVector(f.duplicate()) - psnes.setSolution(f.duplicate()) + psnes.setSolution(snes.getSolution()) # PETSc unfortunately requires us to make an ugly hack. # We would like to use GMG for the coarse solve, at least From 49a34f0cd805183c480f54ba037377b5434c0f7b Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 8 Mar 2023 17:21:33 +0000 Subject: [PATCH 08/75] add FDM tests for H(curl) and H(div) --- tests/regression/test_fdm.py | 52 +++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 6934bf0a53..aa4946c24b 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -70,15 +70,14 @@ def variant(request): @pytest.mark.skipcomplex def test_p_independence(mesh, expected, variant): nits = [] - for p in range(3, 6): - e = FiniteElement("Lagrange", cell=mesh.ufl_cell(), degree=p, variant=variant) + for degree, nits in zip(range(3, 6), expected): + e = FiniteElement("Lagrange", cell=mesh.ufl_cell(), degree=degree, variant=variant) V = FunctionSpace(mesh, e) u = TrialFunction(V) v = TestFunction(V) - ndim = mesh.geometric_dimension() x = SpatialCoordinate(mesh) - x -= Constant([0.5]*ndim) + x -= Constant([0.5]*len(x)) u_exact = dot(x, x) f_exact = grad(u_exact) B = -div(f_exact) @@ -95,9 +94,50 @@ def test_p_independence(mesh, expected, variant): problem = LinearVariationalProblem(a, L, uh, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters=fdmstar) solver.solve() - nits.append(solver.snes.ksp.getIterationNumber()) + assert solver.snes.ksp.getIterationNumber() <= nits assert norm(u_exact-uh, "H1") < 2.0E-7 - assert nits <= expected + + +def solve_riesz_map(V, d): + beta = Constant(1E-8) + subs = [(1, 3)] + + x = SpatialCoordinate(V.mesh()) + x -= Constant([0.5]*len(x)) + expr = x * exp(-10*dot(x, x)) + if V.mesh().extruded: + subs += ["top"] + + u_exact = Function(V) + u_exact.project(expr, solver_parameters={"mat_type": "matfree", "pc_type": "jacobi"}) + bcs = [DirichletBC(V, u_exact, sub) for sub in subs] + + uh = Function(V) + test = TestFunction(V) + trial = TrialFunction(V) + a = lambda v, u: inner(v, beta*u)*dx + inner(d(v), d(u))*dx + problem = LinearVariationalProblem(a(test, trial), a(test, u_exact), uh, bcs=bcs) + solver = LinearVariationalSolver(problem, solver_parameters=fdmstar) + solver.solve() + return solver.snes.ksp.getIterationNumber() + + 
+@pytest.mark.skipcomplex +def test_hcurl(mesh, expected): + family = "NCE" if mesh.topological_dimension() == 3 else "RTCE" + for degree, nits in zip(range(3, 6), expected): + element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") + V = FunctionSpace(mesh, element) + assert solve_riesz_map(V, curl) <= nits + + +@pytest.mark.skipcomplex +def test_hdiv(mesh, expected): + family = "NCF" if mesh.topological_dimension() == 3 else "RTCF" + for degree, nits in zip(range(3, 6), expected): + element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") + V = FunctionSpace(mesh, element) + assert solve_riesz_map(V, div) <= nits @pytest.mark.skipcomplex From 73976775b4d2888c434b2b38e38b7f24fb4dc319 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 9 Mar 2023 09:53:42 +0000 Subject: [PATCH 09/75] fix serendipity p1pc test --- tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py | 2 +- tests/regression/test_fdm.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py b/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py index f82acdf62e..b9eb6e830a 100644 --- a/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py +++ b/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py @@ -10,12 +10,12 @@ def run_poisson(): "ksp_monitor": None, "pc_type": "python", "pc_python_type": "firedrake.P1PC", - "pmg_coarse_degree": coarse_deg, "pmg_mg_levels": { "ksp_type": "chebyshev", "ksp_max_it": 2, "pc_type": "jacobi"}, "pmg_mg_coarse": { + "degree": coarse_deg, "ksp_type": "preonly", "pc_type": "lu", "pc_factor_mat_solver_type": "mumps" diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index aa4946c24b..125881797d 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -101,12 +101,12 @@ def test_p_independence(mesh, expected, variant): def solve_riesz_map(V, d): beta = Constant(1E-8) subs = [(1, 3)] + if V.mesh().extruded: + subs += ["top"] x = SpatialCoordinate(V.mesh()) x -= Constant([0.5]*len(x)) expr = x * exp(-10*dot(x, x)) - if V.mesh().extruded: - subs += ["top"] u_exact = Function(V) u_exact.project(expr, solver_parameters={"mat_type": "matfree", "pc_type": "jacobi"}) From da2c6b50c33620c1e6162a7e4afaee68f9e8fda4 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 9 Mar 2023 10:04:54 +0000 Subject: [PATCH 10/75] remove print statement --- firedrake/preconditioners/pmg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 74120b81a2..0e00b2008f 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -97,7 +97,6 @@ def initialize(self, pc): self.ppc = self.configure_pmg(pc, pdm) self.ppc.setFromOptions() - print(self.ppc.getOptionsPrefix()) copts = PETSc.Options(self.ppc.getOptionsPrefix()+self.ppc.getType()+"_coarse_") # Get the coarse degree from PETSc options From 8b2a415bd91f4d04880c88fd819113c02578f4c8 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 9 Mar 2023 15:31:03 +0000 Subject: [PATCH 11/75] fix typo in ValueError --- firedrake/preconditioners/gtmg.py | 6 +++--- firedrake/preconditioners/patch.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/firedrake/preconditioners/gtmg.py b/firedrake/preconditioners/gtmg.py index 35f1f1e570..d11cc548c1 100644 --- a/firedrake/preconditioners/gtmg.py +++ b/firedrake/preconditioners/gtmg.py @@ -29,7 +29,7 @@ def 
initialize(self, pc): if ctx is None: raise ValueError("No context found.") if not isinstance(ctx, _SNESContext): - raise ValueError("Don't know how to get form from %r", ctx) + raise ValueError("Don't know how to get form from %r" % ctx) prefix = pc.getOptionsPrefix() options_prefix = prefix + self._prefix @@ -41,7 +41,7 @@ def initialize(self, pc): if ictx is None: raise ValueError("No context found on matrix") if not isinstance(ictx, ImplicitMatrixContext): - raise ValueError("Don't know how to get form from %r", ictx) + raise ValueError("Don't know how to get form from %r" % ictx) fine_operator = ictx.a fine_bcs = ictx.row_bcs @@ -70,7 +70,7 @@ def initialize(self, pc): fine_petscmat.setTransposeNullSpace(fine_transpose_nullspace) # Handle the coarse operator - coarse_options_prefix = options_prefix + "mg_coarse" + coarse_options_prefix = options_prefix + "mg_coarse_" coarse_mat_type = opts.getString(coarse_options_prefix + "mat_type", parameters["default_matrix_type"]) diff --git a/firedrake/preconditioners/patch.py b/firedrake/preconditioners/patch.py index 2a20f2e2f8..8c1919aecd 100644 --- a/firedrake/preconditioners/patch.py +++ b/firedrake/preconditioners/patch.py @@ -747,14 +747,14 @@ def initialize(self, obj): if ctx is None: raise ValueError("No context found on form") if not isinstance(ctx, _SNESContext): - raise ValueError("Don't know how to get form from %r", ctx) + raise ValueError("Don't know how to get form from %r" % ctx) if P.getType() == "python": ictx = P.getPythonContext() if ictx is None: raise ValueError("No context found on matrix") if not isinstance(ictx, ImplicitMatrixContext): - raise ValueError("Don't know how to get form from %r", ictx) + raise ValueError("Don't know how to get form from %r" % ictx) J = ictx.a bcs = ictx.row_bcs if bcs != ictx.col_bcs: From 15034350791ebd9c4bcd13b92a4bbbc1e6a50180 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 9 Mar 2023 17:12:31 +0000 Subject: [PATCH 12/75] address review comments --- firedrake/preconditioners/fdm.py | 52 ++------------ firedrake/preconditioners/pmg.py | 22 ++++-- tests/regression/test_fdm.py | 116 ++++++++++++------------------- 3 files changed, 68 insertions(+), 122 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 1cd769ae27..9c02a5c1f0 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -387,9 +387,9 @@ def RtAP(R, A, P, result=None): if Vrow == Vcol: get_cindices = lambda e, result=None: result update_A = lambda Ae, rindices, cindices: set_values_csr(A, Ae, rindices, rindices, addv) - rtensor = self.reference_tensor_on_diag.get(Vrow, None) or self.assemble_reference_tensor(Vrow) + rtensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) assemble_element_mat = lambda De, result=None: De.PtAP(rtensor, result=result) - condense_element_mat = self.get_static_condensation.get(Vrow, None) + condense_element_mat = self.get_static_condensation.get(Vrow) else: get_cindices = self.cell_to_global[Vcol] update_A = lambda Ae, rindices, cindices: set_values_csr(A, Ae, rindices, cindices, addv) @@ -543,48 +543,6 @@ def assemble_coef(self, J, form_compiler_parameters): key = (mixed_form.signature(), mesh) block_diagonal = True - if key not in self._coefficient_cache and False: - M = assemble(mixed_form, mat_type="matfree", - form_compiler_parameters=form_compiler_parameters) - - coefs = [] - mats = [] - for iset in Z.dof_dset.field_ises: - Msub = M.petscmat.createSubMatrix(iset, iset) - 
coefs.append(Msub.getPythonContext()._diagonal) - mats.append(Msub) - - def scale_coefficients(): - for Msub, coef in zip(mats, coefs): - ksp = PETSc.KSP().create(comm=V.comm) - ksp.setOperators(A=Msub, P=Msub) - ksp.setType(PETSc.KSP.Type.CG) - ksp.setNormType(PETSc.KSP.NormType.NATURAL) - ksp.pc.setType(PETSc.PC.Type.JACOBI) - ksp.setTolerances(rtol=1E-3, atol=0.0E0, max_it=8) - ksp.setComputeEigenvalues(True) - ksp.setUp() - - x = Msub.createVecRight() - b = Msub.createVecLeft() - x.set(0) - b.setRandom() - ksp.solve(b, x) - ew = numpy.real(ksp.computeEigenvalues()) - ksp.destroy() - x.destroy() - b.destroy() - dscale = (max(ew) + min(ew))/2 - dscale = sum(ew) / len(ew) - scale = dscale if dscale == dscale else 1 - with coef.dat.vec as diag: - diag.scale(scale) - - coefficients = {"beta": coefs[0], "alpha": coefs[1]} - assembly_callables = [scale_coefficients] - self._coefficient_cache[key] = (coefficients, assembly_callables) - return self._coefficient_cache[key] - if key not in self._coefficient_cache: if not block_diagonal or not V.shape: tensor = firedrake.Function(Z) @@ -621,8 +579,7 @@ def assemble_reference_tensor(self, V): key = (degree, ndim, formdegree, V.value_size, is_interior, is_facet) cache = self._reference_tensor_cache if key not in cache: - full_key = (degree, ndim, formdegree, V.value_size, 0, 0) - + full_key = (degree, ndim, formdegree, V.value_size, False, False) if is_facet and full_key in cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) @@ -1148,11 +1105,10 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): """ set_values_csr = self.load_set_values(triu=triu) update_A = lambda A, Ae, rindices: set_values_csr(A, Ae, rindices, rindices, addv) - condense_element_mat = self.get_static_condensation.get(Vrow, lambda x: x) condense_element_mat = lambda x: x get_rindices = self.cell_to_global[Vrow] - rtensor = self.reference_tensor_on_diag.get(Vrow, None) or self.assemble_reference_tensor(Vrow) + rtensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) self.reference_tensor_on_diag[Vrow] = rtensor Afdm, Dfdm, bdof = rtensor diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 0e00b2008f..56341599a6 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -83,7 +83,7 @@ def initialize(self, pc): if ctx is None: raise ValueError("No context found.") if not isinstance(ctx, _SNESContext): - raise ValueError("Don't know how to get form from %r", ctx) + raise ValueError("Don't know how to get form from %r" % ctx) test, trial = ctx.J.arguments() if test.function_space() != trial.function_space(): @@ -521,6 +521,14 @@ def load_c_code(code, name, argtypes, comm): def reference_moments(*args, **kwargs): + """ + Return a python function that computes the L2 inner product of the + arguments in the reference cell. + + :arg test: the test `ufl.Argument` + :arg trial: the trial `ufl.Argument` or `ufl.Coefficient` + :kwarg diagonal: are we assembling the diagonal of the bilinear form? + """ import ctypes from tsfc import compile_form quad_degree = 1+sum([PMGBase.max_degree(t.ufl_element()) for t in args]) @@ -555,6 +563,9 @@ def _wrapper(*args): @lru_cache(maxsize=10) def matfree_reference_prolongator(Vf, Vc): + """ + Return the prolongation from Vc to Vf on the reference element. 
+ """ dimf = Vf.value_size * Vf.finat_element.space_dimension() dimc = Vc.value_size * Vc.finat_element.space_dimension() build_Afc = reference_moments(ufl.TestFunction(Vf), ufl.TrialFunction(Vc)) @@ -607,8 +618,8 @@ def expand_element(ele): """ Expand a FiniteElement as an EnrichedElement of TensorProductElements, discarding modifiers. """ - if ele.cell().cellname().startswith("quadrilateral"): + # Handle immersed quadrilaterals quadrilateral_tpc = ufl.TensorProductCell(ufl.interval, ufl.interval) return expand_element(ele.reconstruct(cell=quadrilateral_tpc)) elif ele.cell() == ufl.hexahedron: @@ -967,6 +978,9 @@ def _tabulate(e, ps, entity=None): @PETSc.Log.EventDecorator("MakeKronCode") def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): + """ + Return interpolation and restriction sub-kernels between enriched tensor product elements + """ operator_decl = [] prolong_code = [] restrict_code = [] @@ -1271,13 +1285,13 @@ def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): self.uf = Vf Vf = Vf.function_space() else: - self.uf = self._cache_work.get(Vf, firedrake.Function(Vf)) + self.uf = self._cache_work.get(Vf) or firedrake.Function(Vf) self._cache_work[Vf] = self.uf if isinstance(Vc, firedrake.Function): self.uc = Vc Vc = Vc.function_space() else: - self.uc = self._cache_work.get(Vc, firedrake.Function(Vc)) + self.uc = self._cache_work.get(Vc) or firedrake.Function(Vc) self._cache_work[Vc] = self.uc self.Vf = Vf self.Vc = Vc diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 125881797d..4699b129b5 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -7,9 +7,8 @@ "ksp_type": "cg", "ksp_atol": 0.0E0, "ksp_rtol": 1.0E-8, - "ksp_norm_type": "unpreconditioned", - "ksp_monitor_true_residual": None, - "ksp_converged_reason": None, + "ksp_norm_type": "natural", + "ksp_monitor": None, "pc_type": "python", "pc_python_type": "firedrake.P1PC", "pmg_mg_coarse": { @@ -39,6 +38,34 @@ } +def solve_riesz_map(V, d): + beta = Constant(1E-8) + subs = [(1, 3)] + if V.mesh().cell_set._extruded: + subs += ["top"] + + x = SpatialCoordinate(V.mesh()) + x -= Constant([0.5]*len(x)) + if V.ufl_element().value_shape() == (): + u_exact = exp(-10*dot(x, x)) + u_bc = u_exact + else: + u_exact = x * exp(-10*dot(x, x)) + u_bc = Function(V) + u_bc.project(u_exact, solver_parameters={"mat_type": "matfree", "pc_type": "jacobi"}) + + bcs = [DirichletBC(V, u_bc, sub) for sub in subs] + + uh = Function(V) + test = TestFunction(V) + trial = TrialFunction(V) + a = lambda v, u: inner(v, beta*u)*dx + inner(d(v), d(u))*dx + problem = LinearVariationalProblem(a(test, trial), a(test, u_exact), uh, bcs=bcs) + solver = LinearVariationalSolver(problem, solver_parameters=fdmstar) + solver.solve() + return solver.snes.ksp.getIterationNumber() + + @pytest.fixture(params=[2, 3], ids=["Rectangle", "Box"]) def mesh(request): @@ -54,90 +81,39 @@ def mesh(request): return m -@pytest.fixture -def expected(mesh): - if mesh.topological_dimension() == 2: - return [5, 5, 5] - elif mesh.topological_dimension() == 3: - return [8, 8, 8] - - @pytest.fixture(params=[None, "fdm"], ids=["spectral", "fdm"]) def variant(request): return request.param @pytest.mark.skipcomplex -def test_p_independence(mesh, expected, variant): - nits = [] - for degree, nits in zip(range(3, 6), expected): - e = FiniteElement("Lagrange", cell=mesh.ufl_cell(), degree=degree, variant=variant) - V = FunctionSpace(mesh, e) - u = TrialFunction(V) - v = TestFunction(V) - - x = SpatialCoordinate(mesh) - x -= Constant([0.5]*len(x)) - 
u_exact = dot(x, x) - f_exact = grad(u_exact) - B = -div(f_exact) - - a = inner(grad(v), grad(u))*dx - L = inner(v, B)*dx - - subs = ("on_boundary",) - if mesh.cell_set._extruded: - subs += ("top", "bottom") - bcs = [DirichletBC(V, u_exact, sub) for sub in subs] - - uh = Function(V) - problem = LinearVariationalProblem(a, L, uh, bcs=bcs) - solver = LinearVariationalSolver(problem, solver_parameters=fdmstar) - solver.solve() - assert solver.snes.ksp.getIterationNumber() <= nits - assert norm(u_exact-uh, "H1") < 2.0E-7 - - -def solve_riesz_map(V, d): - beta = Constant(1E-8) - subs = [(1, 3)] - if V.mesh().extruded: - subs += ["top"] - - x = SpatialCoordinate(V.mesh()) - x -= Constant([0.5]*len(x)) - expr = x * exp(-10*dot(x, x)) - - u_exact = Function(V) - u_exact.project(expr, solver_parameters={"mat_type": "matfree", "pc_type": "jacobi"}) - bcs = [DirichletBC(V, u_exact, sub) for sub in subs] - - uh = Function(V) - test = TestFunction(V) - trial = TrialFunction(V) - a = lambda v, u: inner(v, beta*u)*dx + inner(d(v), d(u))*dx - problem = LinearVariationalProblem(a(test, trial), a(test, u_exact), uh, bcs=bcs) - solver = LinearVariationalSolver(problem, solver_parameters=fdmstar) - solver.solve() - return solver.snes.ksp.getIterationNumber() +def test_p_independence_hgrad(mesh, variant): + family = "Lagrange" + expected = 9 if mesh.topological_dimension() == 3 else 5 + for degree in range(3, 6): + element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant=variant) + V = FunctionSpace(mesh, element) + assert solve_riesz_map(V, grad) <= expected @pytest.mark.skipcomplex -def test_hcurl(mesh, expected): +def test_p_independence_hcurl(mesh): family = "NCE" if mesh.topological_dimension() == 3 else "RTCE" - for degree, nits in zip(range(3, 6), expected): + expected = 6 if mesh.topological_dimension() == 3 else 3 + for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) - assert solve_riesz_map(V, curl) <= nits + assert solve_riesz_map(V, curl) <= expected @pytest.mark.skipcomplex -def test_hdiv(mesh, expected): +def test_p_independence_hdiv(mesh): family = "NCF" if mesh.topological_dimension() == 3 else "RTCF" - for degree, nits in zip(range(3, 6), expected): + expected = 2 + for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) - assert solve_riesz_map(V, div) <= nits + assert solve_riesz_map(V, div) <= expected @pytest.mark.skipcomplex @@ -148,7 +124,7 @@ def test_variable_coefficient(mesh): u = TrialFunction(V) v = TestFunction(V) x = SpatialCoordinate(mesh) - x -= Constant([0.5]*ndim) + x -= Constant([0.5]*len(x)) # variable coefficients alphas = [0.1+10*dot(x, x)]*ndim From 93088e6d38c526977819bb571b5d6b981acc7958 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 9 Mar 2023 18:19:39 +0000 Subject: [PATCH 13/75] add facetsplit tests --- tests/regression/test_fdm.py | 121 +++++++++++++++++------------------ 1 file changed, 57 insertions(+), 64 deletions(-) diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 4699b129b5..63129c455b 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -1,29 +1,31 @@ import pytest from firedrake import * - -fdmstar = { +ksp = { "mat_type": "matfree", "ksp_type": "cg", "ksp_atol": 0.0E0, "ksp_rtol": 1.0E-8, "ksp_norm_type": "natural", "ksp_monitor": None, +} + +coarse = { + "mat_type": "aij", + "ksp_type": 
"preonly", + "pc_type": "cholesky", +} + +fdmstar = { "pc_type": "python", "pc_python_type": "firedrake.P1PC", - "pmg_mg_coarse": { - "mat_type": "aij", - "ksp_type": "preonly", - "pc_type": "cholesky", - }, + "pmg_mg_coarse": coarse, "pmg_mg_levels": { "ksp_type": "chebyshev", "ksp_norm_type": "none", "esteig_ksp_type": "cg", "esteig_ksp_norm_type": "natural", "ksp_chebyshev_esteig": "0.75,0.25,0.0,1.0", - "ksp_chebyshev_esteig_noisy": True, - "ksp_chebyshev_esteig_steps": 8, "pc_type": "python", "pc_python_type": "firedrake.FDMPC", "fdm": { @@ -31,12 +33,44 @@ "pc_python_type": "firedrake.ASMExtrudedStarPC", "pc_star_mat_ordering_type": "nd", "pc_star_sub_sub_pc_type": "cholesky", - "pc_star_sub_sub_pc_factor_mat_solver_type": "petsc", - "pc_star_sub_sub_pc_factor_mat_ordering_type": "natural", } } } +facetstar = { + "pc_type": "python", + "pc_python_type": "firedrake.FacetSplitPC", + "facet_pc_type": "python", + "facet_pc_python_type": "firedrake.FDMPC", + "facet_fdm_pc_use_amat": False, + "facet_fdm_pc_type": "fieldsplit", + "facet_fdm_pc_fieldsplit_type": "symmetric_multiplicative", + "facet_fdm_fieldsplit_0": { + "ksp_type": "preonly", + "pc_type": "icc", + }, + "facet_fdm_fieldsplit_1": { + "ksp_type": "preonly", + "pc_type": "python", + "pc_python_type": "firedrake.P1PC", + "pmg_mg_coarse": coarse, + "pmg_mg_levels": { + "ksp_type": "chebyshev", + "ksp_norm_type": "none", + "esteig_ksp_type": "cg", + "esteig_ksp_norm_type": "natural", + "ksp_chebyshev_esteig": "0.75,0.25,0.0,1.0", + "pc_type": "python", + "pc_python_type": "firedrake.ASMExtrudedStarPC", + "pc_star_mat_ordering_type": "nd", + "pc_star_sub_sub_pc_type": "cholesky", + } + } +} + +fdmstar.update(ksp) +facetstar.update(ksp) + def solve_riesz_map(V, d): beta = Constant(1E-8) @@ -61,9 +95,13 @@ def solve_riesz_map(V, d): trial = TrialFunction(V) a = lambda v, u: inner(v, beta*u)*dx + inner(d(v), d(u))*dx problem = LinearVariationalProblem(a(test, trial), a(test, u_exact), uh, bcs=bcs) - solver = LinearVariationalSolver(problem, solver_parameters=fdmstar) - solver.solve() - return solver.snes.ksp.getIterationNumber() + its = [] + for sparams in [fdmstar, facetstar]: + uh.assign(0) + solver = LinearVariationalSolver(problem, solver_parameters=sparams) + solver.solve() + its.append(solver.snes.ksp.getIterationNumber()) + return its @pytest.fixture(params=[2, 3], @@ -87,11 +125,11 @@ def variant(request): @pytest.mark.skipcomplex -def test_p_independence_hgrad(mesh, variant): +def test_p_independence_hgrad(mesh): family = "Lagrange" - expected = 9 if mesh.topological_dimension() == 3 else 5 + expected = [9, 9] if mesh.topological_dimension() == 3 else [5, 5] for degree in range(3, 6): - element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant=variant) + element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) assert solve_riesz_map(V, grad) <= expected @@ -99,7 +137,7 @@ def test_p_independence_hgrad(mesh, variant): @pytest.mark.skipcomplex def test_p_independence_hcurl(mesh): family = "NCE" if mesh.topological_dimension() == 3 else "RTCE" - expected = 6 if mesh.topological_dimension() == 3 else 3 + expected = [6, 6] if mesh.topological_dimension() == 3 else [3, 3] for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) @@ -109,7 +147,7 @@ def test_p_independence_hcurl(mesh): @pytest.mark.skipcomplex def test_p_independence_hdiv(mesh): family = "NCF" if 
mesh.topological_dimension() == 3 else "RTCF" - expected = 2 + expected = [2, 2] for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) @@ -271,48 +309,3 @@ def test_ipdg_direct_solver(fs): assert solver.snes.ksp.getIterationNumber() == 1 assert norm(u_exact-uh, "H1") < 1.0E-8 - - -@pytest.mark.skipcomplex -def test_static_condensation(mesh): - degree = 3 - quad_degree = 2*degree+1 - cell = mesh.ufl_cell() - e = FiniteElement("Lagrange", cell=cell, degree=degree, variant="fdm") - Z = FunctionSpace(mesh, MixedElement(*[RestrictedElement(e, d) for d in ("interior", "facet")])) - z = Function(Z) - u = sum(split(z)) - - f = Constant(1) - U = ((1/2)*inner(grad(u), grad(u)) - inner(u, f))*dx(degree=quad_degree) - F = derivative(U, z, TestFunction(Z)) - a = derivative(F, z, TrialFunction(Z)) - - subs = ["on_boundary"] - if mesh.cell_set._extruded: - subs += ["top", "bottom"] - bcs = [DirichletBC(Z.sub(1), zero(), sub) for sub in subs] - - problem = LinearVariationalProblem(a, -F, z, bcs=bcs) - solver = LinearVariationalSolver(problem, solver_parameters={ - "mat_type": "matfree", - "ksp_monitor": None, - "ksp_type": "preonly", - "ksp_norm_type": "unpreconditioned", - "pc_type": "python", - "pc_python_type": "firedrake.SCPC", - "pc_sc_eliminate_fields": "0", - "condensed_field": { - "mat_type": "matfree", - "ksp_monitor": None, - "ksp_type": "preonly", - "ksp_norm_type": "unpreconditioned", - "pc_type": "python", - "pc_python_type": "firedrake.FDMPC", - "fdm_pc_type": "lu", - "fdm_pc_mat_factor_solver_type": "mumps" - } - }) - solver.solve() - residual = solver.snes.ksp.buildResidual() - assert residual.norm() < 1E-14 From 4f95d7d781523f3efefd6e850d5d75985a0ceca1 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Fri, 10 Mar 2023 17:51:45 +0000 Subject: [PATCH 14/75] test statically-condensed star-relaxation --- firedrake/preconditioners/fdm.py | 113 +++++++++++----------- firedrake/preconditioners/pmg.py | 63 +++++++----- tests/multigrid/test_p_multigrid.py | 144 ++++++++++++++++++---------- tests/regression/test_fdm.py | 34 +++---- 4 files changed, 207 insertions(+), 147 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 9c02a5c1f0..8918bc56e3 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -36,11 +36,11 @@ class FDMPC(PCBase): Here we assume that the volume integrals in the Jacobian can be expressed as: - inner(d(v), alpha(d(u)))*dx + inner(v, beta(u))*dx + inner(d(v), alpha * d(u))*dx + inner(v, beta * u)*dx - where alpha and beta are linear functions (tensor contractions). - The sparse matrix is obtained by approximating (v, alpha u) and (v, beta u) as - diagonal mass matrices + where alpha and beta are possibly tensor-valued. The sparse matrix is + obtained by approximating (v, alpha * u) and (v, beta * u) as diagonal mass + matrices. 
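+
+    A minimal way to enable this preconditioner from the solver options is
+    sketched below. This is illustrative only: the particular option values
+    are assumptions rather than part of this patch, and any PETSc PC type may
+    be configured under the ``fdm_`` prefix to act on the assembled sparse
+    matrix:
+
+        solver_parameters = {
+            "mat_type": "matfree",
+            "ksp_type": "cg",
+            "pc_type": "python",
+            "pc_python_type": "firedrake.FDMPC",
+            "fdm_pc_type": "jacobi",
+        }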
""" _prefix = "fdm_" @@ -107,7 +107,12 @@ def initialize(self, pc): # Matrix-free assembly of the transformed Jacobian V_fdm = firedrake.FunctionSpace(V.mesh(), e_fdm) J_fdm = J(*[t.reconstruct(function_space=V_fdm) for t in J.arguments()], coefficients={}) - bcs_fdm = tuple(bc.reconstruct(V=V_fdm, g=0) for bc in bcs) + bcs_fdm = [] + for bc in bcs: + W = V_fdm + for index in bc._indices: + W = W.sub(index) + bcs_fdm.append(bc.reconstruct(V=W, g=0)) self.fdm_interp = prolongation_matrix_matfree(V, V_fdm, [], bcs_fdm) self.work_vec_x = Amat.createVecLeft() @@ -280,7 +285,6 @@ def get_coeffs(e, result=None): triu = on_diag and symmetric ptype = pmat_type if on_diag else PETSc.Mat.Type.AIJ sizes = tuple(Vsub.dof_dset.layout_vec.getSizes() for Vsub in (Vrow, Vcol)) - # bsizes = tuple(Vsub.dof_dset.layout_vec.getBlockSize() for Vsub in (Vrow, Vcol)) preallocator = PETSc.Mat().create(comm=self.comm) preallocator.setType(PETSc.Mat.Type.PREALLOCATOR) @@ -297,7 +301,6 @@ def get_coeffs(e, result=None): P = PETSc.Mat().create(comm=self.comm) P.setType(ptype) P.setSizes(sizes) - # P.setBlockSizes(*bsizes) P.setPreallocationNNZ((d_nnz, o_nnz)) P.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True) if ptype.endswith("sbaij"): @@ -483,10 +486,10 @@ def assemble_coef(self, J, form_compiler_parameters): splitter = ExtractSubBlock() J = splitter.split(J, argument_indices=(index, index)) - mesh = J.ufl_domain() - ndim = mesh.topological_dimension() args_J = J.arguments() e = args_J[0].ufl_element() + mesh = args_J[0].function_space().mesh() + tdim = mesh.topological_dimension() if isinstance(e, (ufl.VectorElement, ufl.TensorElement)): e = e._sub_element e = unrestrict_element(e) @@ -501,7 +504,7 @@ def assemble_coef(self, J, form_compiler_parameters): dku = ufl.div(u) if sobolev == ufl.HDiv else ufl.curl(u) eps = expand_derivatives(ufl.diff(ufl.replace(expand_derivatives(dku), {ufl.grad(u): du}), du)) if sobolev == ufl.HDiv: - map_grad = lambda p: ufl.outer(p, eps/ndim) + map_grad = lambda p: ufl.outer(p, eps/tdim) elif len(eps.ufl_shape) == 3: map_grad = lambda p: ufl.dot(p, eps/2) else: @@ -515,12 +518,12 @@ def assemble_coef(self, J, form_compiler_parameters): except TypeError: pass qdeg = degree - if formdegree == ndim: - qfam = "DG" if ndim == 1 else "DQ" + if formdegree == tdim: + qfam = "DG" if tdim == 1 else "DQ" qdeg = 0 elif formdegree == 0: - qfam = "DG" if ndim == 1 else "RTCE" if ndim == 2 else "NCE" - elif formdegree == 1 and ndim == 3: + qfam = "DG" if tdim == 1 else "RTCE" if tdim == 2 else "NCE" + elif formdegree == 1 and tdim == 3: qfam = "NCF" else: qfam = "DQ L2" @@ -565,7 +568,7 @@ def assemble_coef(self, J, form_compiler_parameters): @PETSc.Log.EventDecorator("FDMRefTensor") def assemble_reference_tensor(self, V): - ndim = V.mesh().topological_dimension() + tdim = V.mesh().topological_dimension() value_size = V.value_size formdegree = V.finat_element.formdegree degree = V.finat_element.degree @@ -573,13 +576,13 @@ def assemble_reference_tensor(self, V): degree = max(degree) except TypeError: pass - if formdegree == ndim: + if formdegree == tdim: degree = degree + 1 is_interior, is_facet = is_restricted(V.finat_element) - key = (degree, ndim, formdegree, V.value_size, is_interior, is_facet) + key = (degree, tdim, formdegree, V.value_size, is_interior, is_facet) cache = self._reference_tensor_cache if key not in cache: - full_key = (degree, ndim, formdegree, V.value_size, False, False) + full_key = (degree, tdim, formdegree, V.value_size, False, False) if is_facet and full_key in 
cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) @@ -613,8 +616,8 @@ def assemble_reference_tensor(self, V): A10 = numpy.linalg.solve(A11, A10) A11 = numpy.eye(A11.shape[0]) - Ihat = mass_matrix(ndim, formdegree, A00, A11) - Dhat = diff_matrix(ndim, formdegree, A00, A11, A10) + Ihat = mass_matrix(tdim, formdegree, A00, A11) + Dhat = diff_matrix(tdim, formdegree, A00, A11, A10) result = block_mat([[Ihat], [Dhat]]) Ihat.destroy() Dhat.destroy() @@ -846,19 +849,19 @@ def kron3(A, B, C, scale=None): return result -def mass_matrix(ndim, formdegree, B00, B11): +def mass_matrix(tdim, formdegree, B00, B11): B00 = petsc_sparse(B00) B11 = petsc_sparse(B11) - if ndim == 1: + if tdim == 1: B_blocks = [B11 if formdegree else B00] - elif ndim == 2: + elif tdim == 2: if formdegree == 0: B_blocks = [B00.kron(B00)] elif formdegree == 1: B_blocks = [B00.kron(B11), B11.kron(B00)] else: B_blocks = [B11.kron(B11)] - elif ndim == 3: + elif tdim == 3: if formdegree == 0: B_blocks = [kron3(B00, B00, B00)] elif formdegree == 1: @@ -887,9 +890,9 @@ def mass_matrix(ndim, formdegree, B00, B11): return result -def diff_matrix(ndim, formdegree, A00, A11, A10): - if formdegree == ndim: - ncols = A10.shape[0]**ndim +def diff_matrix(tdim, formdegree, A00, A11, A10): + if formdegree == tdim: + ncols = A10.shape[0]**tdim A_zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=PETSc.COMM_SELF) A_zero.assemble() return A_zero @@ -897,15 +900,15 @@ def diff_matrix(ndim, formdegree, A00, A11, A10): A00 = petsc_sparse(A00) A11 = petsc_sparse(A11) A10 = petsc_sparse(A10) - if ndim == 1: + if tdim == 1: return A10 - elif ndim == 2: + elif tdim == 2: if formdegree == 0: A_blocks = [[A00.kron(A10)], [A10.kron(A00)]] elif formdegree == 1: A_blocks = [[A10.kron(A11), A11.kron(A10)]] A_blocks[-1][-1].scale(-1) - elif ndim == 3: + elif tdim == 3: if formdegree == 0: A_blocks = [[kron3(A00, A00, A10)], [kron3(A00, A10, A00)], [kron3(A10, A00, A00)]] elif formdegree == 1: @@ -946,8 +949,8 @@ def diff_prolongator(Vf, Vc, fbcs=[], cbcs=[]): A00 = numpy.eye(degree+1, dtype=PETSc.RealType) A10 = fiat_reference_prolongator(e1, e0, derivative=True) - ndim = Vc.mesh().topological_dimension() - Dhat = diff_matrix(ndim, ec.formdegree, A00, A11, A10) + tdim = Vc.mesh().topological_dimension() + Dhat = diff_matrix(tdim, ec.formdegree, A00, A11, A10) scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e fdofs = restricted_dofs(ef, create_element(unrestrict_element(scalar_element(Vf.ufl_element())))) @@ -1056,7 +1059,7 @@ class PoissonFDMPC(FDMPC): inner(grad(v), alpha(grad(u)))*dx + inner(v, beta(u))*dx - where alpha and beta are linear functions (tensor contractions). + where alpha and beta are possibly tensor-valued. The sparse matrix is obtained by approximating alpha and beta by cell-wise constants and discarding the coefficients in alpha that couple together mixed derivatives and mixed components. 
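The cell-wise constant approximation described in the docstring above reduces each component of the operator to sums of Kronecker products of 1D interval matrices. The standalone sketch below (not taken from this patch; it uses random SPD stand-ins for the 1D stiffness and mass matrices) illustrates the generalized eigenproblem behind an FDM-type basis: the transformed basis renders any 1D operator of the form c0*B + c1*A diagonal.

    # Illustration of the simultaneous diagonalization used by an FDM-type basis.
    import numpy
    from scipy.linalg import eigh

    rng = numpy.random.default_rng(0)
    n = 5
    A = rng.standard_normal((n, n))
    A = A @ A.T + n*numpy.eye(n)      # stand-in for the 1D stiffness matrix
    B = rng.standard_normal((n, n))
    B = B @ B.T + n*numpy.eye(n)      # stand-in for the 1D mass matrix

    lam, S = eigh(A, B)               # A @ S = B @ S @ diag(lam), S is B-orthonormal
    assert numpy.allclose(S.T @ B @ S, numpy.eye(n))
    assert numpy.allclose(S.T @ A @ S, numpy.diag(lam))
    # hence c0*B + c1*A is diagonal in the basis given by the columns of S.
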
@@ -1122,12 +1125,12 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): bsize = V.value_size ncomp = V.ufl_element().reference_value_size() sdim = (V.finat_element.space_dimension() * bsize) // ncomp # dimension of a single component - ndim = V.ufl_domain().topological_dimension() + tdim = V.mesh().topological_dimension() shift = self.axes_shifts * bsize index_coef, _ = glonum_fun((Gq or Bq).cell_node_map()) index_bc, _ = glonum_fun(bcflags.cell_node_map()) - flag2id = numpy.kron(numpy.eye(ndim, ndim, dtype=PETSc.IntType), [[1], [2]]) + flag2id = numpy.kron(numpy.eye(tdim, tdim, dtype=PETSc.IntType), [[1], [2]]) # pshape is the shape of the DOFs in the tensor product pshape = tuple(Ak[0].size[0] for Ak in Afdm) @@ -1136,7 +1139,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): static_condensation = True if set(shift) != {0}: - assert ncomp == ndim + assert ncomp == tdim pshape = [tuple(numpy.roll(pshape, -shift[k])) for k in range(ncomp)] # assemble zero-th order term separately, including off-diagonals (mixed components) @@ -1148,7 +1151,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): bshape = Bq.ufl_shape # Be = Bhat kron ... kron Bhat Be = Afdm[0][0].copy() - for k in range(1, ndim): + for k in range(1, tdim): Be = Be.kron(Afdm[k][0]) aptr = numpy.arange(0, (bshape[0]+1)*bshape[1], bshape[1], dtype=PETSc.IntType) @@ -1166,7 +1169,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): # assemble the second order term and the zero-th order term if any, # discarding mixed derivatives and mixed componentsget_weak_bc_flags(J) - mue = numpy.zeros((ncomp, ndim), dtype=PETSc.RealType) + mue = numpy.zeros((ncomp, tdim), dtype=PETSc.RealType) bqe = numpy.zeros((ncomp,), dtype=PETSc.RealType) for e in range(self.nel): @@ -1187,7 +1190,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): for k in range(ncomp): # permutation of axes with respect to the first vector component - axes = numpy.roll(numpy.arange(ndim), -shift[k]) + axes = numpy.roll(numpy.arange(tdim), -shift[k]) # for each component: compute the stiffness matrix Ae bck = bce[:, k] if len(bce.shape) == 2 else bce fbc = numpy.dot(bck, flag2id) @@ -1200,13 +1203,13 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): if Bq is not None: Ae.axpy(bqe[k], Be) - if ndim > 1: + if tdim > 1: # Ae = Ae kron Bhat + mue[k][1] Bhat kron Ahat Ae = Ae.kron(Afdm[axes[1]][0]) if Gq is not None: Ae.axpy(mue[k][1], Be.kron(Afdm[axes[1]][1+fbc[1]])) - if ndim > 2: + if tdim > 2: # Ae = Ae kron Bhat + mue[k][2] Bhat kron Bhat kron Ahat Be = Be.kron(Afdm[axes[1]][0]) Ae = Ae.kron(Afdm[axes[2]][0]) @@ -1216,7 +1219,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): elif Bq is not None: Ae = Afdm[axes[0]][0] - for m in range(1, ndim): + for m in range(1, tdim): Ae = Ae.kron(Afdm[axes[m]][0]) Ae.scale(bqe[k]) @@ -1228,7 +1231,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): if any(Dk is not None for Dk in Dfdm): if static_condensation: raise NotImplementedError("Static condensation for SIPG not implemented") - if ndim < V.ufl_domain().geometric_dimension(): + if tdim < V.mesh().geometric_dimension(): raise NotImplementedError("SIPG on immersed meshes is not implemented") eta = float(self.appctx.get("eta")) @@ -1246,8 +1249,8 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): if PT_facet: icell = numpy.reshape(lgmap.apply(ie), (2, ncomp, -1)) - iord0 = numpy.insert(numpy.delete(numpy.arange(ndim), idir[0]), 0, idir[0]) - iord1 = numpy.insert(numpy.delete(numpy.arange(ndim), idir[1]), 0, 
idir[1]) + iord0 = numpy.insert(numpy.delete(numpy.arange(tdim), idir[0]), 0, idir[0]) + iord1 = numpy.insert(numpy.delete(numpy.arange(tdim), idir[1]), 0, idir[1]) je = je[[0, 1], lfd] Pfacet = PT_facet.dat.data_ro_with_halos[je] Gfacet = Gq_facet.dat.data_ro_with_halos[je] @@ -1255,14 +1258,14 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): Gfacet = numpy.sum(Gq.dat.data_ro_with_halos[je], axis=1) for k in range(ncomp): - axes = numpy.roll(numpy.arange(ndim), -shift[k]) + axes = numpy.roll(numpy.arange(tdim), -shift[k]) Dfacet = Dfdm[axes[0]] if Dfacet is None: continue if PT_facet: - k0 = iord0[k] if shift != 1 else ndim-1-iord0[-k-1] - k1 = iord1[k] if shift != 1 else ndim-1-iord1[-k-1] + k0 = iord0[k] if shift != 1 else tdim-1-iord0[-k-1] + k1 = iord1[k] if shift != 1 else tdim-1-iord1[-k-1] Piola = Pfacet[[0, 1], [k0, k1]] mu = Gfacet[[0, 1], idir] else: @@ -1297,10 +1300,10 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): Adense[ii, j0:j1] -= smu[j] * Dfacet[:, jface % 2] Ae = numpy_to_petsc(Adense, dense_indices, diag=False) - if ndim > 1: + if tdim > 1: # assume that the mesh is oriented Ae = Ae.kron(Afdm[axes[1]][0]) - if ndim > 2: + if tdim > 2: Ae = Ae.kron(Afdm[axes[2]][0]) if bsize == ncomp: @@ -1323,12 +1326,12 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av coefficients = {} assembly_callables = [] - mesh = J.ufl_domain() + args_J = J.arguments() + V = args_J[-1].function_space() + mesh = V.mesh() tdim = mesh.topological_dimension() Finv = ufl.JacobianInverse(mesh) - args_J = J.arguments() - V = args_J[-1].function_space() degree = V.ufl_element().degree() try: degree = max(degree) @@ -1590,7 +1593,7 @@ def fdm_setup_ipdg(fdm_element, eta): @lru_cache(maxsize=10) def get_interior_facet_maps(V): """ - Extrude V.interior_facet_node_map and V.ufl_domain().interior_facets.local_facet_dat + Extrude V.interior_facet_node_map and V.mesh().interior_facets.local_facet_dat :arg V: a :class:`.FunctionSpace` @@ -1599,7 +1602,7 @@ def get_interior_facet_maps(V): local_facet_data_fun: maps interior facets to the local facet numbering in the two cells sharing it, nfacets: the total number of interior facets owned by this process """ - mesh = V.ufl_domain() + mesh = V.mesh() intfacets = mesh.interior_facets facet_to_cells = intfacets.facet_cell_map.values local_facet_data = intfacets.local_facet_dat.data_ro diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 56341599a6..6511be38b5 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -606,7 +606,7 @@ def prolongation_transfer_kernel_action(Vf, expr): kernel = compile_expression_dual_evaluation(expr, to_element, Vf.ufl_element(), log=PETSc.Log.isActive()) coefficients = extract_numbered_coefficients(expr, kernel.coefficient_numbers) if kernel.needs_external_coords: - coefficients = [Vf.ufl_domain().coordinates] + coefficients + coefficients = [Vf.mesh().coordinates] + coefficients return op2.Kernel(kernel.ast, kernel.name, requires_zeroed_output_arguments=True, @@ -790,7 +790,7 @@ def finat_reference_prolongator(felem, celem): from gem.interpreter import evaluate ref_el = felem.cell - ndim = ref_el.get_spatial_dimension() + tdim = ref_el.get_spatial_dimension() degree = felem.degree try: degree = max(degree) @@ -804,11 +804,12 @@ def _tabulate(e, ps, entity=None): is_facet_element = True entity_dofs = felem.entity_dofs() - for key in entity_dofs: - v = sum(list(entity_dofs[key].values()), []) + for edim in 
sorted(entity_dofs): + v = sum(list(entity_dofs[edim].values()), []) if len(v): - edim = sum(key) if type(key) == tuple else key - if edim == ndim: + if type(edim) == tuple: + edim = sum(edim) + if edim == tdim: is_facet_element = False if is_facet_element and degree > 5: @@ -816,7 +817,7 @@ def _tabulate(e, ps, entity=None): quadratures = [] for key in ref_el.sub_entities: edim = sum(key) if type(key) == tuple else key - if edim == ndim-1: + if edim == tdim-1: sub_entities = ref_el.sub_entities[key] entities.extend([(key, f) for f in sub_entities]) quadratures.extend([make_quadrature(ref_el.construct_subelement(key), quad_degree)]*len(sub_entities)) @@ -1113,6 +1114,11 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): prolong_code = "".join(prolong_code) restrict_code = "".join(reversed(restrict_code)) shapes = [tuple(map(max, zip(*fshapes))), tuple(map(max, zip(*cshapes)))] + + if fskip > numpy.prod(shapes[0]): + shapes[0] = (fskip, 1, 1, 1) + if cskip > numpy.prod(shapes[1]): + shapes[1] = (cskip, 1, 1, 1) return operator_decl, prolong_code, restrict_code, shapes @@ -1158,9 +1164,8 @@ def cache_generate_code(kernel, comm): def make_mapping_code(Q, fmapping, cmapping, t_in, t_out): if fmapping == cmapping: return None - domain = Q.ufl_domain() - A = get_piola_tensor(cmapping, domain, inverse=False) - B = get_piola_tensor(fmapping, domain, inverse=True) + A = get_piola_tensor(cmapping, Q.mesh(), inverse=False) + B = get_piola_tensor(fmapping, Q.mesh(), inverse=True) tensor = A if B: tensor = ufl.dot(B, tensor) if tensor else B @@ -1285,14 +1290,16 @@ def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): self.uf = Vf Vf = Vf.function_space() else: - self.uf = self._cache_work.get(Vf) or firedrake.Function(Vf) - self._cache_work[Vf] = self.uf + if Vf not in self._cache_work: + self._cache_work[Vf] = firedrake.Function(Vf) + self.uf = self._cache_work[Vf] if isinstance(Vc, firedrake.Function): self.uc = Vc Vc = Vc.function_space() else: - self.uc = self._cache_work.get(Vc) or firedrake.Function(Vc) - self._cache_work[Vc] = self.uc + if Vc not in self._cache_work: + self._cache_work[Vc] = firedrake.Function(Vc) + self.uc = self._cache_work[Vc] self.Vf = Vf self.Vc = Vc @@ -1339,6 +1346,16 @@ def _kernels(self): restrict = partial(op2.par_loop, *restrict_args, *coefficient_args) return prolong, restrict + def _prolong(self): + with self.uf.dat.vec_wo as uf: + uf.set(0.0E0) + self._kernels[0]() + + def _restrict(self): + with self.uc.dat.vec_wo as uc: + uc.set(0.0E0) + self._kernels[1]() + def view(self, mat, viewer=None): if viewer is None: return @@ -1399,14 +1416,14 @@ def make_blas_kernels(Vf, Vc): qelem = felem if qelem.mapping() != "identity": qelem = qelem.reconstruct(mapping="identity") - Qf = Vf if qelem == felem else firedrake.FunctionSpace(Vf.ufl_domain(), qelem) + Qf = Vf if qelem == felem else firedrake.FunctionSpace(Vf.mesh(), qelem) mapping_output = make_mapping_code(Qf, fmapping, cmapping, "t0", "t1") in_place_mapping = True except Exception: qelem = ufl.FiniteElement("DQ", cell=felem.cell(), degree=PMGBase.max_degree(felem)) if felem.value_shape(): qelem = ufl.TensorElement(qelem, shape=felem.value_shape(), symmetry=felem.symmetry()) - Qf = firedrake.FunctionSpace(Vf.ufl_domain(), qelem) + Qf = firedrake.FunctionSpace(Vf.mesh(), qelem) mapping_output = make_mapping_code(Qf, fmapping, cmapping, "t0", "t1") qshape = (Qf.value_size, Qf.finat_element.space_dimension()) @@ -1583,9 +1600,7 @@ def multTranspose(self, mat, rf, rc): for bc in self.Vf_bcs: bc.zero(self.uf) - with 
self.uc.dat.vec_wo as uc: - uc.set(0.0E0) - self._kernels[1]() + self._restrict() for bc in self.Vc_bcs: bc.zero(self.uc) @@ -1601,9 +1616,7 @@ def mult(self, mat, xc, xf, inc=False): for bc in self.Vc_bcs: bc.zero(self.uc) - with self.uf.dat.vec_wo as uf: - uf.set(0.0E0) - self._kernels[0]() + self._prolong() for bc in self.Vf_bcs: bc.zero(self.uf) @@ -1642,8 +1655,8 @@ def _standalones(self): @cached_property def _kernels(self): - prolong = lambda: [standalone._kernels[0]() for standalone in self._standalones] - restrict = lambda: [standalone._kernels[1]() for standalone in self._standalones] + prolong = lambda: [s._prolong() for s in self._standalones] + restrict = lambda: [s._restrict() for s in self._standalones] return prolong, restrict def getNestSubMatrix(self, i, j): @@ -1667,7 +1680,7 @@ def prolongation_matrix_aij(Pk, P1, Pk_bcs=[], P1_bcs=[]): (Pk.cell_node_map(), P1.cell_node_map())) mat = op2.Mat(sp, PETSc.ScalarType) - mesh = Pk.ufl_domain() + mesh = Pk.mesh() fele = Pk.ufl_element() if isinstance(fele, ufl.MixedElement) and not isinstance(fele, (ufl.VectorElement, ufl.TensorElement)): diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index 89d7c94efb..ff6d51be54 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -2,58 +2,102 @@ from firedrake import * -def test_reconstruct_degree(): - meshes = [UnitSquareMesh(1, 1, quadrilateral=True)] - meshes.append(ExtrudedMesh(meshes[0], layers=1)) - for mesh in meshes: - ndim = mesh.topological_dimension() - elist = [] - for degree in [7, 2, 31]: - V = VectorFunctionSpace(mesh, "Q", degree) - Q = FunctionSpace(mesh, "DQ", degree-2) - Z = MixedFunctionSpace([V, Q]) - e = Z.ufl_element() - elist.append(e) - assert e == PMGPC.reconstruct_degree(elist[0], degree) - - elist = [] - for degree in [7, 2, 31]: - V = FunctionSpace(mesh, "NCF" if ndim == 3 else "RTCF", degree) - Q = FunctionSpace(mesh, "DQ", degree-1) - Z = MixedFunctionSpace([V, Q]) - e = Z.ufl_element() - elist.append(e) - assert e == PMGPC.reconstruct_degree(elist[0], degree) - - -def test_prolongation_matrix_matfree(): +@pytest.fixture(params=[2, 3], + ids=["Rectangle", "Box"]) +def tp_mesh(request): + nx = 4 + distribution = {"overlap_type": (DistributedMeshOverlapType.VERTEX, 1)} + m = UnitSquareMesh(nx, nx, quadrilateral=True, distribution_parameters=distribution) + if request.param == 3: + m = ExtrudedMesh(m, nx) + + x = SpatialCoordinate(m) + xnew = as_vector([acos(1-2*xj)/pi for xj in x]) + m.coordinates.interpolate(xnew) + return m + + +@pytest.fixture(params=[0, 1, 2], + ids=["H1", "HCurl", "HDiv"]) +def tp_family(tp_mesh, request): + tdim = tp_mesh.topological_dimension() + if tdim == 3: + families = ["Q", "NCE", "NCF"] + else: + families = ["Q", "RTCE", "RTCF"] + return families[request.param] + + +@pytest.fixture(params=[None, "fdm", "hierarchical"], ids=["spectral", "fdm", "hierarchical"]) +def variant(request): + return request.param + + +def test_reconstruct_degree(tp_mesh): + tdim = tp_mesh.topological_dimension() + elist = [] + for degree in [7, 2, 31]: + V = VectorFunctionSpace(tp_mesh, "Q", degree) + Q = FunctionSpace(tp_mesh, "DQ", degree-2) + Z = MixedFunctionSpace([V, Q]) + e = Z.ufl_element() + elist.append(e) + assert e == PMGPC.reconstruct_degree(elist[0], degree) + + elist = [] + for degree in [7, 2, 31]: + V = FunctionSpace(tp_mesh, "NCF" if tdim == 3 else "RTCF", degree) + Q = FunctionSpace(tp_mesh, "DQ", degree-1) + Z = MixedFunctionSpace([V, Q]) + e = Z.ufl_element() + 
elist.append(e) + assert e == PMGPC.reconstruct_degree(elist[0], degree) + + +def test_prolong_de_rham(tp_mesh): + from firedrake.preconditioners.pmg import prolongation_matrix_matfree + + tdim = tp_mesh.topological_dimension() + b = Constant(list(range(tdim))) + mat = diag(Constant([tdim+1]*tdim)) + Constant([[-1]*tdim]*tdim) + expr = dot(mat, SpatialCoordinate(tp_mesh)) + b + + cell = tp_mesh.ufl_cell() + elems = [VectorElement(FiniteElement("Q", cell=cell, degree=2)), + FiniteElement("NCE" if tdim == 3 else "RTCE", cell=cell, degree=2), + FiniteElement("NCF" if tdim == 3 else "RTCF", cell=cell, degree=2)] + fs = [FunctionSpace(tp_mesh, e) for e in elems] + us = [Function(V) for V in fs] + us[0].interpolate(expr) + for u in us: + for v in us: + if u != v: + P = prolongation_matrix_matfree(v, u).getPythonContext() + P._prolong() + assert norm(v-expr, "L2") < 1E-14 + + +def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): from firedrake.preconditioners.pmg import prolongation_matrix_matfree - tol = 1E-14 - meshes = [UnitSquareMesh(3, 2, quadrilateral=True)] - meshes.append(ExtrudedMesh(meshes[0], layers=2)) - for mesh in meshes: - ndim = mesh.topological_dimension() - b = Constant(list(range(ndim))) - mat = diag(Constant([ndim+1]*ndim)) + Constant([[-1]*ndim]*ndim) - expr = dot(mat, SpatialCoordinate(mesh)) + b - - variant = None - cell = mesh.ufl_cell() - elems = [] - elems.append(VectorElement(FiniteElement("Q", cell=cell, degree=3, variant=variant))) - elems.append(FiniteElement("NCF" if ndim == 3 else "RTCF", cell=cell, degree=2, variant=variant)) - elems.append(FiniteElement("NCE" if ndim == 3 else "RTCE", cell=cell, degree=2, variant=variant)) - fs = [FunctionSpace(mesh, e) for e in elems] - us = [Function(V) for V in fs] - us[0].interpolate(expr) - for u in us: - for v in us: - if u != v: - v.assign(0) - P = prolongation_matrix_matfree(v, u).getPythonContext() - P._kernels[0]() - assert norm(v-expr, "L2") < tol + degree = 3 + cell = tp_mesh.ufl_cell() + element = FiniteElement(tp_family, cell=cell, degree=degree, variant=variant) + Vi = FunctionSpace(tp_mesh, RestrictedElement(element, restriction_domain="interior")) + Vf = FunctionSpace(tp_mesh, RestrictedElement(element, restriction_domain="facet")) + Vc = FunctionSpace(tp_mesh, tp_family, degree=1) + + ui = Function(Vi) + uf = Function(Vf) + uc = Function(Vc) + uc.dat.data[0::2] = 0.0 + uc.dat.data[1::2] = 1.0 + + for v in [ui, uf]: + P = prolongation_matrix_matfree(v, uc).getPythonContext() + P._prolong() + + assert norm(ui + uf - uc, "L2") < 2E-14 @pytest.fixture(params=["triangles", "quadrilaterals"], scope="module") diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 63129c455b..e4217b5129 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -73,7 +73,7 @@ def solve_riesz_map(V, d): - beta = Constant(1E-8) + beta = Constant(1E-4) subs = [(1, 3)] if V.mesh().cell_set._extruded: subs += ["top"] @@ -137,7 +137,7 @@ def test_p_independence_hgrad(mesh): @pytest.mark.skipcomplex def test_p_independence_hcurl(mesh): family = "NCE" if mesh.topological_dimension() == 3 else "RTCE" - expected = [6, 6] if mesh.topological_dimension() == 3 else [3, 3] + expected = [8, 7] if mesh.topological_dimension() == 3 else [4, 4] for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) @@ -147,7 +147,7 @@ def test_p_independence_hcurl(mesh): @pytest.mark.skipcomplex def 
test_p_independence_hdiv(mesh): family = "NCF" if mesh.topological_dimension() == 3 else "RTCF" - expected = [2, 2] + expected = [3, 3] for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) @@ -156,7 +156,7 @@ def test_p_independence_hdiv(mesh): @pytest.mark.skipcomplex def test_variable_coefficient(mesh): - ndim = mesh.geometric_dimension() + gdim = mesh.geometric_dimension() k = 4 V = FunctionSpace(mesh, "Lagrange", k) u = TrialFunction(V) @@ -165,10 +165,10 @@ def test_variable_coefficient(mesh): x -= Constant([0.5]*len(x)) # variable coefficients - alphas = [0.1+10*dot(x, x)]*ndim + alphas = [0.1+10*dot(x, x)]*gdim alphas[0] = 1+10*exp(-dot(x, x)) alpha = diag(as_vector(alphas)) - beta = ((10*cos(3*pi*x[0]) + 20*sin(2*pi*x[1]))*cos(pi*x[ndim-1]))**2 + beta = ((10*cos(3*pi*x[0]) + 20*sin(2*pi*x[1]))*cos(pi*x[gdim-1]))**2 a = (inner(grad(v), dot(alpha, grad(u))) + inner(v, beta*u))*dx(degree=3*k+2) L = inner(v, Constant(1))*dx @@ -189,44 +189,44 @@ def test_variable_coefficient(mesh): ids=["cg", "dg", "rt"]) def fs(request, mesh): degree = 3 - ndim = mesh.topological_dimension() + tdim = mesh.topological_dimension() cell = mesh.ufl_cell() element = request.param variant = "fdm_ipdg" if element == "rt": - family = "RTCF" if ndim == 2 else "NCF" + family = "RTCF" if tdim == 2 else "NCF" return FunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant)) else: - if ndim == 1: + if tdim == 1: family = "DG" if element == "dg" else "CG" else: family = "DQ" if element == "dg" else "Q" - return VectorFunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant), dim=5-ndim) + return VectorFunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant), dim=5-tdim) @pytest.mark.skipcomplex def test_ipdg_direct_solver(fs): mesh = fs.mesh() x = SpatialCoordinate(mesh) - ndim = mesh.geometric_dimension() + gdim = mesh.geometric_dimension() ncomp = fs.ufl_element().value_size() u_exact = dot(x, x) if ncomp: u_exact = as_vector([u_exact + Constant(k) for k in range(ncomp)]) - N = fs.ufl_element().degree() + degree = fs.ufl_element().degree() try: - N, = set(N) + degree, = set(degree) except TypeError: pass - quad_degree = 2*(N+1)-1 + quad_degree = 2*(degree+1)-1 uh = Function(fs) u = TrialFunction(fs) v = TestFunction(fs) # problem coefficients - A1 = diag(Constant(range(1, ndim+1))) + A1 = diag(Constant(range(1, gdim+1))) A2 = diag(Constant(range(1, ncomp+1))) alpha = lambda grad_u: dot(dot(A2, grad_u), A1) beta = diag(Constant(range(2, ncomp+2))) @@ -238,7 +238,7 @@ def test_ipdg_direct_solver(fs): extruded = mesh.cell_set._extruded subs = (1,) - if ndim > 1: + if gdim > 1: subs += (3,) if extruded: subs += ("top",) @@ -271,7 +271,7 @@ def test_ipdg_direct_solver(fs): ds_Dir = sum(ds_Dir, ds(tuple())) ds_Neu = sum(ds_Neu, ds(tuple())) - eta = Constant((N+1)**2) + eta = Constant((degree+1)**2) h = CellVolume(mesh)/FacetArea(mesh) penalty = eta/h From 9a548e51bb2d3b9c3d31ca1e8dc40411b42b049c Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 13 Mar 2023 18:22:05 +0000 Subject: [PATCH 15/75] attempt to write restricted NCE prolongator --- firedrake/preconditioners/fdm.py | 6 +- firedrake/preconditioners/pmg.py | 422 +++++++++------------------- tests/multigrid/test_p_multigrid.py | 50 ++-- tests/regression/test_fdm.py | 55 ++-- 4 files changed, 196 insertions(+), 337 deletions(-) diff --git a/firedrake/preconditioners/fdm.py 
b/firedrake/preconditioners/fdm.py index 8918bc56e3..a4dd248622 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1072,9 +1072,9 @@ class PoissonFDMPC(FDMPC): _variant = "fdm_ipdg" def assemble_reference_tensor(self, V): - from firedrake.preconditioners.pmg import get_line_elements + from firedrake.preconditioners.pmg import get_permutation_to_line_elements try: - line_elements, shifts = get_line_elements(V) + _, line_elements, shifts = get_permutation_to_line_elements(V) except ValueError: raise ValueError("FDMPC does not support the element %s" % V.ufl_element()) @@ -1602,6 +1602,8 @@ def get_interior_facet_maps(V): local_facet_data_fun: maps interior facets to the local facet numbering in the two cells sharing it, nfacets: the total number of interior facets owned by this process """ + if isinstance(V, firedrake.Function): + V = V.function_space() mesh = V.mesh() intfacets = mesh.interior_facets facet_to_cells = intfacets.facet_cell_map.values diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 6511be38b5..61ed8e06d9 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -72,13 +72,13 @@ def coarsen_form(self, form, fine_to_coarse_map): """ return ufl.replace(form, fine_to_coarse_map) - def initialize(self, pc): + def initialize(self, obj): # Make a new DM. # Hook up a (new) coarsen routine on that DM. # Make a new PC, of type MG. # Assign the DM to that PC. - odm = pc.getDM() + odm = obj.getDM() ctx = get_appctx(odm) if ctx is None: raise ValueError("No context found.") @@ -89,15 +89,15 @@ def initialize(self, pc): if test.function_space() != trial.function_space(): raise NotImplementedError("test and trial spaces must be the same") - prefix = pc.getOptionsPrefix() + prefix = obj.getOptionsPrefix() options_prefix = prefix + self._prefix - pdm = PETSc.DMShell().create(comm=pc.comm) + pdm = PETSc.DMShell().create(comm=obj.comm) pdm.setOptionsPrefix(options_prefix) - self.ppc = self.configure_pmg(pc, pdm) - self.ppc.setFromOptions() + ppc = self.configure_pmg(obj, pdm) + is_snes = isinstance(obj, PETSc.SNES) - copts = PETSc.Options(self.ppc.getOptionsPrefix()+self.ppc.getType()+"_coarse_") + copts = PETSc.Options(ppc.getOptionsPrefix()+ppc.getType()+"_coarse_") # Get the coarse degree from PETSc options fcp = ctx._problem.form_compiler_parameters @@ -126,12 +126,10 @@ def initialize(self, pc): # Now overwrite some routines on the DM pdm.setRefine(None) pdm.setCoarsen(self.coarsen) - pdm.setCreateInterpolation(self.create_interpolation) - # We need this for p-FAS - pdm.setCreateInjection(self.create_injection) - pdm.setSNESFunction(_SNESContext.form_function) - pdm.setSNESJacobian(_SNESContext.form_jacobian) - pdm.setKSPComputeOperators(_SNESContext.compute_operators) + if is_snes: + pdm.setSNESFunction(_SNESContext.form_function) + pdm.setSNESJacobian(_SNESContext.form_jacobian) + pdm.setKSPComputeOperators(_SNESContext.compute_operators) set_function_space(pdm, get_function_space(odm)) @@ -139,19 +137,23 @@ def initialize(self, pc): assert parent is not None add_hook(parent, setup=partial(push_parent, pdm, parent), teardown=partial(pop_parent, pdm, parent), call_setup=True) add_hook(parent, setup=partial(push_appctx, pdm, ctx), teardown=partial(pop_appctx, pdm, ctx), call_setup=True) - self.ppc.setUp() - def update(self, pc): - pass + ppc.incrementTabLevel(1, parent=obj) + ppc.setFromOptions() + ppc.setUp() + self.ppc = ppc + + def update(self, obj): + self.ppc.setUp() - def 
view(self, pc, viewer=None): + def view(self, obj, viewer=None): if viewer is None: viewer = PETSc.Viewer.STDOUT viewer.printfASCII("p-multigrid PC\n") if hasattr(self, "ppc"): - self.ppc.view(viewer) + self.ppc.view(viewer=viewer) - def destroy(self, pc): + def destroy(self, obj): if hasattr(self, "ppc"): self.ppc.destroy() @@ -385,11 +387,11 @@ def reconstruct_degree(ele, degree): By default, reconstructed EnrichedElements, TensorProductElements, and MixedElements will have the degree of the sub-elements shifted - by the same amount so that the maximum degree is N. - This is useful to coarsen spaces like NCF(N) x DQ(N-1). + by the same amount so that the maximum degree is `degree`. + This is useful to coarsen spaces like NCF(k) x DQ(k-1). :arg ele: a :class:`ufl.FiniteElement` to reconstruct, - :arg N: an integer degree. + :arg degree: an integer degree. :returns: the reconstructed element """ @@ -398,14 +400,14 @@ def reconstruct_degree(ele, degree): elif isinstance(ele, ufl.TensorElement): return type(ele)(PMGBase.reconstruct_degree(ele._sub_element, degree), shape=ele._shape, symmetry=ele.symmetry()) elif isinstance(ele, ufl.EnrichedElement): - shift = degree-PMGBase.max_degree(ele) - return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele._elements)) + shift = degree - PMGBase.max_degree(ele) + return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e) + shift) for e in ele._elements)) elif isinstance(ele, ufl.TensorProductElement): - shift = degree-PMGBase.max_degree(ele) - return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele.sub_elements()), cell=ele.cell()) + shift = degree - PMGBase.max_degree(ele) + return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e) + shift) for e in ele.sub_elements()), cell=ele.cell()) elif isinstance(ele, ufl.MixedElement): - shift = degree-PMGBase.max_degree(ele) - return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele.sub_elements())) + shift = degree - PMGBase.max_degree(ele) + return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e) + shift) for e in ele.sub_elements())) elif isinstance(ele, ufl.WithMapping): return type(ele)(PMGBase.reconstruct_degree(ele.wrapee, degree), ele.mapping()) elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement)): @@ -426,7 +428,6 @@ def configure_pmg(self, pc, pdm): ppc.setType("mg") ppc.setOperators(*pc.getOperators()) ppc.setDM(pdm) - ppc.incrementTabLevel(1, parent=pc) # PETSc unfortunately requires us to make an ugly hack. 
# We would like to use GMG for the coarse solve, at least @@ -463,7 +464,8 @@ def configure_pmg(self, snes, pdm): psnes.setOptionsPrefix(snes.getOptionsPrefix() + "pfas_") psnes.setType("fas") psnes.setDM(pdm) - psnes.incrementTabLevel(1, parent=snes) + psnes.setTolerances(max_it=1) + psnes.setConvergenceTest("skip") (f, residual) = snes.getFunction() assert residual is not None @@ -508,97 +510,6 @@ def coarsen_residual(self, Fc, Jc, uc): return Fc -def load_c_code(code, name, argtypes, comm): - from pyop2.compilation import load - from pyop2.utils import get_petsc_dir - cppargs = ["-I%s/include" % d for d in get_petsc_dir()] - ldargs = (["-L%s/lib" % d for d in get_petsc_dir()] - + ["-Wl,-rpath,%s/lib" % d for d in get_petsc_dir()] - + ["-lpetsc", "-lm"]) - return load(code, "c", name, argtypes=argtypes, - cppargs=cppargs, ldargs=ldargs, - comm=comm) - - -def reference_moments(*args, **kwargs): - """ - Return a python function that computes the L2 inner product of the - arguments in the reference cell. - - :arg test: the test `ufl.Argument` - :arg trial: the trial `ufl.Argument` or `ufl.Coefficient` - :kwarg diagonal: are we assembling the diagonal of the bilinear form? - """ - import ctypes - from tsfc import compile_form - quad_degree = 1+sum([PMGBase.max_degree(t.ufl_element()) for t in args]) - form = ufl.inner(*args)*ufl.dx(degree=quad_degree) - kernel, = compile_form(form, parameters=dict(mode="spectral"), - log=PETSc.Log.isActive(), **kwargs) - op2kernel = op2.Kernel(kernel.ast, kernel.name, - requires_zeroed_output_arguments=True, - flop_count=kernel.flop_count, - events=(kernel.event,)) - code = op2kernel.code.gencode().replace("static inline void", "void") - coords = None - mesh = form.ufl_domain() - if len(kernel.arguments) > 3-len(form.arguments()): - mesh_element = mesh.coordinates.function_space().finat_element - nodes = mesh_element.fiat_equivalent.dual.get_nodes() - points = [list(node.get_point_dict().keys())[0] for node in nodes] - coords = numpy.array(points, dtype=PETSc.ScalarType) - - argtypes = [ctypes.c_voidp]*len(kernel.arguments) - funptr = load_c_code(code, op2kernel.code.name, argtypes, mesh.comm) - - def _wrapper(*args): - args[0].fill(0.0E0) - _args = list(args) - if coords is not None: - _args.insert(1, coords) - return funptr(*[a.ctypes.data for a in _args]) - - return _wrapper - - -@lru_cache(maxsize=10) -def matfree_reference_prolongator(Vf, Vc): - """ - Return the prolongation from Vc to Vf on the reference element. 
- """ - dimf = Vf.value_size * Vf.finat_element.space_dimension() - dimc = Vc.value_size * Vc.finat_element.space_dimension() - build_Afc = reference_moments(ufl.TestFunction(Vf), ufl.TrialFunction(Vc)) - apply_Aff = reference_moments(ufl.TestFunction(Vf), ufl.Coefficient(Vf)) - diag_Aff = reference_moments(ufl.TestFunction(Vf), ufl.TrialFunction(Vf), diagonal=True) - Ax = numpy.empty((dimf,), dtype=PETSc.ScalarType) - Dx = numpy.empty((dimf,), dtype=PETSc.ScalarType) - diagonal = numpy.empty((dimf,), dtype=PETSc.ScalarType) - result = numpy.empty((dimf, dimc), dtype=PETSc.ScalarType) - - def _afun(x): - nonlocal Ax, Dx, diagonal - numpy.multiply(x, diagonal, out=Dx) - apply_Aff(Ax, Dx) - numpy.multiply(Ax, diagonal, out=Ax) - return Ax - - if Vf.comm.rank == 0: - from scipy.sparse.linalg import cg, LinearOperator - build_Afc(result) - diag_Aff(diagonal) - numpy.sqrt(diagonal, out=diagonal) - numpy.reciprocal(diagonal, out=diagonal) - A = LinearOperator((dimf, dimf), _afun, dtype=result.dtype) - for k in range(dimc): - numpy.multiply(result[:, k], diagonal, out=result[:, k]) - result[:, k], _ = cg(A, result[:, k], tol=1E-12) - numpy.multiply(result[:, k], diagonal, out=result[:, k]) - - result = Vf.comm.bcast(result, root=0) - return result - - def prolongation_transfer_kernel_action(Vf, expr): from tsfc import compile_expression_dual_evaluation from tsfc.finatinterface import create_element @@ -719,7 +630,7 @@ def compare_dual_basis(l1, l2): @lru_cache(maxsize=10) @PETSc.Log.EventDecorator("GetLineElements") -def get_line_elements(V): +def get_permutation_to_line_elements(V): from FIAT.reference_element import LINE from tsfc.finatinterface import create_element ele = V.ufl_element() @@ -731,10 +642,6 @@ def get_line_elements(V): if finat_ele.space_dimension() != V.finat_element.space_dimension(): raise ValueError("Failed to decompose %s into tensor products" % V.ufl_element()) - def cyclic_perm(a): - return [a[i:] + a[:i] for i in range(len(a))] - - permutations = [] line_elements = [] axes_shifts = [] @@ -744,26 +651,48 @@ def cyclic_perm(a): expansion = tuple(e.fiat_equivalent for e in reversed(factors)) if not all([e.get_reference_element().shape == LINE for e in expansion]): raise ValueError("Failed to decompose %s into line elements" % V.ufl_element()) - - shift = -1 - for k, perm in enumerate(permutations): - is_perm = all([e1.space_dimension() == e2.space_dimension() - for e1, e2 in zip(perm, expansion)]) - for e1, e2 in zip(perm, expansion): - if is_perm: - is_perm = compare_element(e1, e2) - - if is_perm: - shift = len(expansion) - k - axes_shifts[-1] = axes_shifts[-1] + (shift, ) - break - - if shift == -1: - line_elements.append(expansion) - axes_shifts.append((0, )) - permutations = cyclic_perm(expansion) - - return line_elements, axes_shifts + line_elements.append(expansion) + + tp_shape = [tuple(e.space_dimension() for e in expansion) for expansion in line_elements] + sizes = list(map(numpy.prod, tp_shape)) + dof_ranges = numpy.cumsum([0] + sizes) + + dof_perm = [] + shifts = [] + + grouped = [False for e in line_elements] + nterms = len(line_elements) + unique_line_elements = [] + while not all(grouped): + istart = grouped.index(False) + expansion = line_elements[istart] + unique_line_elements.append(expansion) + axes_shifts = tuple() + + tdim = len(expansion) + permutations = [expansion[k:] + expansion[:k] for k in range(tdim)] + for i in range(istart, nterms): + ecur = line_elements[i] + if not grouped[i]: + for shift, perm in enumerate(permutations): + is_perm = 
all([e1.space_dimension() == e2.space_dimension() + for e1, e2 in zip(perm, ecur)]) + for e1, e2 in zip(perm, ecur): + if is_perm: + is_perm = compare_element(e1, e2) + + if is_perm: + axes_shifts += ((tdim - shift) % tdim,) + axes = numpy.arange(tdim) + dofs = numpy.arange(*dof_ranges[i:i+2], dtype=PETSc.IntType).reshape(tp_shape[istart]) + dofs = numpy.transpose(dofs, axes=numpy.roll(axes, -shift)) + dof_perm.extend(dofs.flat) + grouped[i] = True + break + + shifts.append(axes_shifts) + + return dof_perm, unique_line_elements, shifts @lru_cache(maxsize=10) @@ -784,61 +713,6 @@ def fiat_reference_prolongator(felem, celem, derivative=False): return evaluate_dual(fdual, celem, ckey) -@lru_cache(maxsize=10) -def finat_reference_prolongator(felem, celem): - from finat.quadrature import make_quadrature - from gem.interpreter import evaluate - - ref_el = felem.cell - tdim = ref_el.get_spatial_dimension() - degree = felem.degree - try: - degree = max(degree) - except TypeError: - pass - quad_degree = 2*degree+1 - - def _tabulate(e, ps, entity=None): - results = evaluate(e.basis_evaluation(0, ps, entity).values()) - return results[0].arr.reshape((len(ps.points), -1)) - - is_facet_element = True - entity_dofs = felem.entity_dofs() - for edim in sorted(entity_dofs): - v = sum(list(entity_dofs[edim].values()), []) - if len(v): - if type(edim) == tuple: - edim = sum(edim) - if edim == tdim: - is_facet_element = False - - if is_facet_element and degree > 5: - entities = [] - quadratures = [] - for key in ref_el.sub_entities: - edim = sum(key) if type(key) == tuple else key - if edim == tdim-1: - sub_entities = ref_el.sub_entities[key] - entities.extend([(key, f) for f in sub_entities]) - quadratures.extend([make_quadrature(ref_el.construct_subelement(key), quad_degree)]*len(sub_entities)) - - wts = numpy.concatenate([evaluate([q.weight_expression])[0].arr.reshape((-1,)) for q in quadratures]) - cphi = numpy.concatenate([_tabulate(celem, q.point_set, entity=e) for q, e in zip(quadratures, entities)]).T - fphi = numpy.concatenate([_tabulate(felem, q.point_set, entity=e) for q, e in zip(quadratures, entities)]).T - else: - quadrature = make_quadrature(ref_el, quad_degree) - wts = evaluate([quadrature.weight_expression])[0].arr.reshape((-1,)) - cphi = _tabulate(celem, quadrature.point_set).T - fphi = _tabulate(felem, quadrature.point_set).T - - numpy.sqrt(wts, out=wts) - numpy.multiply(fphi, wts, out=fphi) - numpy.multiply(cphi, wts, out=cphi) - cphi = cphi.reshape((celem.space_dimension(), -1)) - fphi = fphi.reshape((felem.space_dimension(), -1)) - return numpy.linalg.solve(fphi.dot(fphi.T), fphi.dot(cphi.T)) - - # Common kernel to compute y = kron(A3, kron(A2, A1)) * x # Vector and tensor field generalization from Deville, Fischer, and Mund section 8.3.1. 
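# A rough numpy transliteration of what this kernel computes (for exposition
# only; the storage convention shown here is an assumption and may not match
# the BLAS layout used below) would be
#     Y = numpy.einsum("il,jm,kn,lmnf->ijkf", A1, A2, A3, X)
# with X holding the n1*n2*n3 tensor-product coefficients of each of the f
# fields, i.e. the Kronecker product is applied one axis at a time instead of
# being formed explicitly (sum factorization).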
kronmxv_code = """ @@ -980,15 +854,13 @@ def _tabulate(e, ps, entity=None): @PETSc.Log.EventDecorator("MakeKronCode") def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): """ - Return interpolation and restriction sub-kernels between enriched tensor product elements + Return interpolation and restriction kernels between enriched tensor product elements """ operator_decl = [] prolong_code = [] restrict_code = [] - felems, fshifts = get_line_elements(Vf) - celems, cshifts = get_line_elements(Vc) - if len(felems) > 3 or len(celems) > 3: - raise ValueError("The expansion is too complicated") + _, felems, fshifts = get_permutation_to_line_elements(Vf) + _, celems, cshifts = get_permutation_to_line_elements(Vc) shifts = fshifts in_place = False @@ -1002,6 +874,7 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): pelem = celems[0] perm_name = "perm_%s" % t_in celems = celems*len(felems) + elif len(felems) == 1: shifts = cshifts psize = Vf.value_size @@ -1011,20 +884,34 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): else: raise ValueError("Cannot assign fine to coarse DOFs") - for k in range(len(shifts)): - if Vc.value_size*len(shifts[k]) < Vf.value_size: - shifts[k] = shifts[k]*(Vf.value_size//Vc.value_size) - - perm = sum(shifts, tuple()) - perm_data = ", ".join(map(str, perm)) - operator_decl.append(f""" - PetscBLASInt {perm_name}[{len(perm)}] = {{ {perm_data} }}; - """) - pshape = [e.space_dimension() for e in pelem] pargs = ", ".join(map(str, pshape+[1]*(3-len(pshape)))) pstride = psize * numpy.prod(pshape) - if shifts == fshifts: + + if set(cshifts) == set(fshifts): + psize *= len(cshifts[0]) + pstride *= len(cshifts[0]) + prolong_code.append(f""" + for({IntType_c} j=1; j<{len(fshifts)}; j++) + permute_axis(0, {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); + """) + restrict_code.append(f""" + for({IntType_c} j=1; j<{len(fshifts)}; j++) + ipermute_axis(0, {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); + """) + psize = 1 + + elif pelem == celems[0]: + for k in range(len(shifts)): + if Vc.value_size*len(shifts[k]) < Vf.value_size: + shifts[k] = shifts[k]*(Vf.value_size//Vc.value_size) + + perm = sum(shifts, tuple()) + perm_data = ", ".join(map(str, perm)) + operator_decl.append(f""" + PetscBLASInt {perm_name}[{len(perm)}] = {{ {perm_data} }}; + """) + prolong_code.append(f""" for({IntType_c} j=1; j<{len(perm)}; j++) permute_axis({perm_name}[j], {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); @@ -1205,7 +1092,7 @@ def make_mapping_code(Q, fmapping, cmapping, t_in, t_out): def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name): - _, shifts = get_line_elements(V) + _, _, shifts = get_permutation_to_line_elements(V) shift = shifts[0] if shift != (0,): ndof = numpy.prod(vshape) @@ -1257,22 +1144,10 @@ def get_permuted_map(V): Return a PermutedMap with the same tensor product shape for every component of H(div) or H(curl) tensor product elements """ - expansion, shifts = get_line_elements(V) + + perm, _, shifts = get_permutation_to_line_elements(V) if {(0, )} == set(shifts): return V.cell_node_map() - - istart = 0 - perm = [] - for factors, shift in zip(expansion, shifts): - axes = numpy.arange(len(factors)) - pshape = [len(shift)] + [e.space_dimension() for e in factors] - iend = istart + numpy.prod(pshape) - permutation = numpy.reshape(numpy.arange(istart, iend), pshape) - for k in range(permutation.shape[0]): - permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, shift[k])), pshape[1:]) - perm.extend(permutation.flat) 
- istart = iend - return PermutedMap(V.cell_node_map(), perm) @@ -1529,66 +1404,29 @@ def make_kernels(self, Vf, Vc): This is temporary while we wait for dual evaluation in FInAT. """ - try: - prolong_kernel, _ = prolongation_transfer_kernel_action(Vf, self.uc) - matrix_kernel, coefficients = prolongation_transfer_kernel_action(Vf, firedrake.TestFunction(Vc)) - # The way we transpose the prolongation kernel is suboptimal. - # A local matrix is generated each time the kernel is executed. - element_kernel = loopy.generate_code_v2(matrix_kernel.code).device_code() - element_kernel = element_kernel.replace("void expression_kernel", "static void expression_kernel") - coef_args = "".join([", c%d" % i for i in range(len(coefficients))]) - coef_decl = "".join([", const %s *restrict c%d" % (ScalarType_c, i) for i in range(len(coefficients))]) - dimc = Vc.finat_element.space_dimension() * Vc.value_size - dimf = Vf.finat_element.space_dimension() * Vf.value_size - restrict_code = f""" - {element_kernel} - - void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w{coef_decl}) - {{ - {ScalarType_c} Afc[{dimf}*{dimc}] = {{0}}; - expression_kernel(Afc{coef_args}); - for ({IntType_c} i = 0; i < {dimf}; i++) - for ({IntType_c} j = 0; j < {dimc}; j++) - Rc[j] += Afc[i*{dimc} + j] * Rf[i] * w[i]; - }} - """ - restrict_kernel = op2.Kernel(restrict_code, "restriction", requires_zeroed_output_arguments=True) - except NotImplementedError: - if Vc.ufl_element().mapping() != Vf.ufl_element().mapping(): - raise NotImplementedError("Prolongation not supported from %s to %s" % (Vc.ufl_element(), Vf.ufl_element())) - if Vf.finat_element.space_dimension() < 400: - Jmat = finat_reference_prolongator(Vf.finat_element, Vc.finat_element) - else: - Jmat = matfree_reference_prolongator(Vf, Vc) - dimf, dimc = Jmat.shape - vsize = (Vc.value_size*Vc.finat_element.space_dimension())//dimc - Jdata = ", ".join(map(float.hex, Jmat.flat)) - kernel_code = f""" - void prolongation({ScalarType_c} *restrict uf, const {ScalarType_c} *restrict uc) - {{ - {ScalarType_c} Afc[{dimf}*{dimc}] = {{ {Jdata} }}; - for ({IntType_c} i = 0; i < {vsize}*{dimf}; i++) - uf[i] = 0.0E0; - - for ({IntType_c} i = 0; i < {dimf}; i++) - for ({IntType_c} j = 0; j < {dimc}; j++) - for ({IntType_c} k = 0; k < {vsize}; k++) - uf[i*{vsize}+k] += Afc[i*{dimc} + j] * uc[j*{vsize}+k]; - }} - - void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w) - {{ - {ScalarType_c} Afc[{dimf}*{dimc}] = {{ {Jdata} }}; - for ({IntType_c} i = 0; i < {dimf}; i++) - for ({IntType_c} j = 0; j < {dimc}; j++) - for ({IntType_c} k = 0; k < {vsize}; k++) - Rc[j*{vsize}+k] += Afc[i*{dimc} + j] * Rf[i*{vsize}+k] * w[i*{vsize}+k]; - }} - """ - prolong_kernel = op2.Kernel(kernel_code, "prolongation", requires_zeroed_output_arguments=True) - restrict_kernel = op2.Kernel(kernel_code, "restriction", requires_zeroed_output_arguments=True) - coefficients = [] - + prolong_kernel, _ = prolongation_transfer_kernel_action(Vf, self.uc) + matrix_kernel, coefficients = prolongation_transfer_kernel_action(Vf, firedrake.TestFunction(Vc)) + # The way we transpose the prolongation kernel is suboptimal. + # A local matrix is generated each time the kernel is executed. 
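+        # (Expository note: the generated restriction kernel below effectively
+        # computes Rc += Afc^T * (w .* Rf), i.e. the transpose of the local
+        # prolongation matrix applied to the fine-grid residual Rf weighted
+        # entrywise by w.)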
+ element_kernel = loopy.generate_code_v2(matrix_kernel.code).device_code() + element_kernel = element_kernel.replace("void expression_kernel", "static void expression_kernel") + coef_args = "".join([", c%d" % i for i in range(len(coefficients))]) + coef_decl = "".join([", const %s *restrict c%d" % (ScalarType_c, i) for i in range(len(coefficients))]) + dimc = Vc.finat_element.space_dimension() * Vc.value_size + dimf = Vf.finat_element.space_dimension() * Vf.value_size + restrict_code = f""" + {element_kernel} + + void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w{coef_decl}) + {{ + {ScalarType_c} Afc[{dimf}*{dimc}] = {{0}}; + expression_kernel(Afc{coef_args}); + for ({IntType_c} i = 0; i < {dimf}; i++) + for ({IntType_c} j = 0; j < {dimc}; j++) + Rc[j] += Afc[i*{dimc} + j] * Rf[i] * w[i]; + }} + """ + restrict_kernel = op2.Kernel(restrict_code, "restriction", requires_zeroed_output_arguments=True) return prolong_kernel, restrict_kernel, coefficients def multTranspose(self, mat, rf, rc): diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index ff6d51be54..fb6d882617 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -5,7 +5,7 @@ @pytest.fixture(params=[2, 3], ids=["Rectangle", "Box"]) def tp_mesh(request): - nx = 4 + nx = 1 distribution = {"overlap_type": (DistributedMeshOverlapType.VERTEX, 1)} m = UnitSquareMesh(nx, nx, quadrilateral=True, distribution_parameters=distribution) if request.param == 3: @@ -28,26 +28,32 @@ def tp_family(tp_mesh, request): return families[request.param] -@pytest.fixture(params=[None, "fdm", "hierarchical"], ids=["spectral", "fdm", "hierarchical"]) +@pytest.fixture(params=[None, "hierarchical", "fdm"], ids=["spectral", "hierarchical", "fdm"]) def variant(request): return request.param -def test_reconstruct_degree(tp_mesh): - tdim = tp_mesh.topological_dimension() - elist = [] - for degree in [7, 2, 31]: - V = VectorFunctionSpace(tp_mesh, "Q", degree) - Q = FunctionSpace(tp_mesh, "DQ", degree-2) - Z = MixedFunctionSpace([V, Q]) - e = Z.ufl_element() - elist.append(e) - assert e == PMGPC.reconstruct_degree(elist[0], degree) +@pytest.fixture(params=[0, 1], + ids=["CG-DG", "HDiv-DG"]) +def mixed_family(tp_mesh, request): + if request.param == 0: + Vfamily = "Q" + else: + tdim = tp_mesh.topological_dimension() + Vfamily = "NCF" if tdim == 3 else "RTCF" + Qfamily = "DQ" + return Vfamily, Qfamily + +def test_reconstruct_degree(tp_mesh, mixed_family): elist = [] + Vfamily, Qfamily = mixed_family for degree in [7, 2, 31]: - V = FunctionSpace(tp_mesh, "NCF" if tdim == 3 else "RTCF", degree) - Q = FunctionSpace(tp_mesh, "DQ", degree-1) + if Vfamily in ["NCF", "RTCF"]: + V = FunctionSpace(tp_mesh, Vfamily, degree) + else: + V = VectorFunctionSpace(tp_mesh, Vfamily, degree) + Q = FunctionSpace(tp_mesh, Qfamily, degree-2) Z = MixedFunctionSpace([V, Q]) e = Z.ufl_element() elist.append(e) @@ -80,7 +86,7 @@ def test_prolong_de_rham(tp_mesh): def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): from firedrake.preconditioners.pmg import prolongation_matrix_matfree - degree = 3 + degree = 2 cell = tp_mesh.ufl_cell() element = FiniteElement(tp_family, cell=cell, degree=degree, variant=variant) Vi = FunctionSpace(tp_mesh, RestrictedElement(element, restriction_domain="interior")) @@ -90,14 +96,14 @@ def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): ui = Function(Vi) uf = Function(Vf) uc = Function(Vc) - 
uc.dat.data[0::2] = 0.0 + uc.dat.data[0::2] = 2.0 uc.dat.data[1::2] = 1.0 for v in [ui, uf]: P = prolongation_matrix_matfree(v, uc).getPythonContext() P._prolong() - assert norm(ui + uf - uc, "L2") < 2E-14 + assert norm(ui + uf - uc, "L2") < 2E-14 @pytest.fixture(params=["triangles", "quadrilaterals"], scope="module") @@ -436,7 +442,7 @@ def test_p_fas_nonlinear_scalar(): rtol = 1E-8 atol = rtol * Fnorm - + rtol = 0.0 newton = { "mat_type": "aij", "snes_monitor": None, @@ -444,7 +450,7 @@ def test_p_fas_nonlinear_scalar(): "snes_type": "newtonls", "snes_max_it": 20, "snes_atol": atol, - "snes_rtol": 1E-50} + "snes_rtol": rtol} coarse = { "ksp_type": "preonly", @@ -459,7 +465,7 @@ def test_p_fas_nonlinear_scalar(): pmg = { "ksp_atol": atol*1E-1, - "ksp_rtol": 1E-50, + "ksp_rtol": rtol, "ksp_type": "cg", "ksp_converged_reason": None, "ksp_monitor_true_residual": None, @@ -479,7 +485,7 @@ def test_p_fas_nonlinear_scalar(): "snes_monitor": None, "snes_converged_reason": None, "snes_atol": atol, - "snes_rtol": 1E-50, + "snes_rtol": rtol, "snes_type": "python", "snes_python_type": "firedrake.PMGSNES", "pfas_snes_fas_type": "kaskade", @@ -503,7 +509,7 @@ def check_coarsen_quadrature(solver): Nq, = Nq Nl = p.u.ufl_element().degree() try: - Nl, = set(Nl) + Nl = max(Nl) except TypeError: pass assert Nq == 3*Nl+2 diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index e4217b5129..dc76294909 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -21,11 +21,12 @@ "pc_python_type": "firedrake.P1PC", "pmg_mg_coarse": coarse, "pmg_mg_levels": { + "ksp_max_it": 1, "ksp_type": "chebyshev", "ksp_norm_type": "none", "esteig_ksp_type": "cg", "esteig_ksp_norm_type": "natural", - "ksp_chebyshev_esteig": "0.75,0.25,0.0,1.0", + "ksp_chebyshev_esteig": "0.5,0.5,0.0,1.0", "pc_type": "python", "pc_python_type": "firedrake.FDMPC", "fdm": { @@ -55,11 +56,12 @@ "pc_python_type": "firedrake.P1PC", "pmg_mg_coarse": coarse, "pmg_mg_levels": { + "ksp_max_it": 1, "ksp_type": "chebyshev", "ksp_norm_type": "none", "esteig_ksp_type": "cg", "esteig_ksp_norm_type": "natural", - "ksp_chebyshev_esteig": "0.75,0.25,0.0,1.0", + "ksp_chebyshev_esteig": "0.5,0.5,0.0,1.0", "pc_type": "python", "pc_python_type": "firedrake.ASMExtrudedStarPC", "pc_star_mat_ordering_type": "nd", @@ -72,7 +74,7 @@ facetstar.update(ksp) -def solve_riesz_map(V, d): +def build_riesz_map(V, d): beta = Constant(1E-4) subs = [(1, 3)] if V.mesh().cell_set._extruded: @@ -84,7 +86,8 @@ def solve_riesz_map(V, d): u_exact = exp(-10*dot(x, x)) u_bc = u_exact else: - u_exact = x * exp(-10*dot(x, x)) + A = Constant([[-1.]*len(x)]*len(x)) + diag(Constant([len(x)]*len(x))) + u_exact = dot(A, x) * exp(-10*dot(x, x)) u_bc = Function(V) u_bc.project(u_exact, solver_parameters={"mat_type": "matfree", "pc_type": "jacobi"}) @@ -94,14 +97,14 @@ def solve_riesz_map(V, d): test = TestFunction(V) trial = TrialFunction(V) a = lambda v, u: inner(v, beta*u)*dx + inner(d(v), d(u))*dx - problem = LinearVariationalProblem(a(test, trial), a(test, u_exact), uh, bcs=bcs) - its = [] - for sparams in [fdmstar, facetstar]: - uh.assign(0) - solver = LinearVariationalSolver(problem, solver_parameters=sparams) - solver.solve() - its.append(solver.snes.ksp.getIterationNumber()) - return its + return LinearVariationalProblem(a(test, trial), a(test, u_exact), uh, bcs=bcs) + + +def solve_riesz_map(problem, solver_parameters): + problem.u.assign(0) + solver = LinearVariationalSolver(problem, solver_parameters=solver_parameters) + solver.solve() + return 
solver.snes.ksp.getIterationNumber() @pytest.fixture(params=[2, 3], @@ -125,33 +128,42 @@ def variant(request): @pytest.mark.skipcomplex -def test_p_independence_hgrad(mesh): +def test_p_independence_hgrad(mesh, variant): family = "Lagrange" - expected = [9, 9] if mesh.topological_dimension() == 3 else [5, 5] + expected = [16, 12] if mesh.topological_dimension() == 3 else [9, 7] + solvers = [fdmstar] if variant is None else [fdmstar, facetstar] for degree in range(3, 6): - element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") + element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant=variant) V = FunctionSpace(mesh, element) - assert solve_riesz_map(V, grad) <= expected + problem = build_riesz_map(V, grad) + for sp, max_it in zip(solvers, expected[:len(solvers)]): + assert solve_riesz_map(problem, sp) <= max_it @pytest.mark.skipcomplex def test_p_independence_hcurl(mesh): family = "NCE" if mesh.topological_dimension() == 3 else "RTCE" - expected = [8, 7] if mesh.topological_dimension() == 3 else [4, 4] + expected = [13, 10] if mesh.topological_dimension() == 3 else [6, 6] + solvers = [fdmstar, facetstar] for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) - assert solve_riesz_map(V, curl) <= expected + problem = build_riesz_map(V, curl) + for sp, max_it in zip(solvers, expected[:len(solvers)]): + assert solve_riesz_map(problem, sp) <= max_it @pytest.mark.skipcomplex def test_p_independence_hdiv(mesh): family = "NCF" if mesh.topological_dimension() == 3 else "RTCF" - expected = [3, 3] + expected = [6, 6] + solvers = [fdmstar, facetstar] for degree in range(3, 6): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) - assert solve_riesz_map(V, div) <= expected + problem = build_riesz_map(V, div) + for sp, max_it in zip(solvers, expected[:len(solvers)]): + assert solve_riesz_map(problem, sp) <= max_it @pytest.mark.skipcomplex @@ -182,7 +194,8 @@ def test_variable_coefficient(mesh): problem = LinearVariationalProblem(a, L, uh, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters=fdmstar) solver.solve() - assert solver.snes.ksp.getIterationNumber() <= 14 + expected = 23 if gdim == 3 else 14 + assert solver.snes.ksp.getIterationNumber() <= expected @pytest.fixture(params=["cg", "dg", "rt"], From 8fab0081bd805a2192161cb9acf64154e698a674 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 14 Mar 2023 22:32:50 +0000 Subject: [PATCH 16/75] expand FInAT elements --- firedrake/preconditioners/pmg.py | 115 +++++++++++----------------- tests/multigrid/test_p_multigrid.py | 28 ++++++- 2 files changed, 73 insertions(+), 70 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 61ed8e06d9..b8638d1095 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -10,6 +10,7 @@ from firedrake.utils import ScalarType_c, IntType_c, cached_property from firedrake.petsc import PETSc import firedrake +import finat import ufl import loopy import numpy @@ -97,7 +98,7 @@ def initialize(self, obj): ppc = self.configure_pmg(obj, pdm) is_snes = isinstance(obj, PETSc.SNES) - copts = PETSc.Options(ppc.getOptionsPrefix()+ppc.getType()+"_coarse_") + copts = PETSc.Options(ppc.getOptionsPrefix() + ppc.getType() + "_coarse_") # Get the coarse degree from PETSc options fcp = ctx._problem.form_compiler_parameters @@ -311,7 +312,7 @@ def 
coarsen_nullspace(coarse_V, mat, fine_nullspace): def coarsen_quadrature(self, metadata, fdeg, cdeg): if isinstance(metadata, dict): # Coarsen the quadrature degree in a dictionary - # such that the ratio of quadrature nodes to interpolation nodes (qdeg+1)//(fdeg+1) is preserved + # preserving the ratio of quadrature nodes to interpolation nodes (qdeg+1)//(fdeg+1) qdeg = metadata.get("quadrature_degree", None) if qdeg is not None: cmd = dict(metadata) @@ -529,49 +530,26 @@ def expand_element(ele): """ Expand a FiniteElement as an EnrichedElement of TensorProductElements, discarding modifiers. """ - if ele.cell().cellname().startswith("quadrilateral"): - # Handle immersed quadrilaterals - quadrilateral_tpc = ufl.TensorProductCell(ufl.interval, ufl.interval) - return expand_element(ele.reconstruct(cell=quadrilateral_tpc)) - elif ele.cell() == ufl.hexahedron: - hexahedron_tpc = ufl.TensorProductCell(ufl.quadrilateral, ufl.interval) - return expand_element(ele.reconstruct(cell=hexahedron_tpc)) - elif isinstance(ele, (ufl.TensorElement, ufl.VectorElement)): - return expand_element(ele._sub_element) - elif isinstance(ele, ufl.MixedElement): - return type(ele)(*[expand_element(e) for e in ele.sub_elements()]) - elif isinstance(ele, ufl.RestrictedElement): - return type(ele)(expand_element(ele._element), restriction_domain=ele._restriction_domain) - elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement)): - return expand_element(ele._element) - elif isinstance(ele, ufl.WithMapping): - return expand_element(ele.wrapee) - elif isinstance(ele, ufl.EnrichedElement): - terms = [] - for e in ele._elements: - ee = expand_element(e) - if isinstance(ee, ufl.EnrichedElement): - terms.extend(ee._elements) - else: - terms.append(ee) - cell, = set([t.cell() for t in terms]) - return ufl.EnrichedElement(*terms) - elif isinstance(ele, ufl.TensorProductElement): - factors = [expand_element(e) for e in ele.sub_elements()] + if isinstance(ele, finat.FlattenedDimensions): + return expand_element(ele.product) + elif isinstance(ele, (finat.HDivElement, finat.HCurlElement)): + return expand_element(ele.wrappee) + elif isinstance(ele, finat.DiscontinuousElement): + return expand_element(ele.element) + elif isinstance(ele, finat.EnrichedElement): + terms = list(map(expand_element, ele.elements)) + return finat.EnrichedElement(terms) + elif isinstance(ele, finat.TensorProductElement): + factors = list(map(expand_element, ele.factors)) terms = [tuple()] for e in factors: new_terms = [] - for f in e._elements if isinstance(e, ufl.EnrichedElement) else [e]: - f_factors = f.sub_elements() if isinstance(f, ufl.TensorProductElement) else (f,) + for f in e.elements if isinstance(e, finat.EnrichedElement) else [e]: + f_factors = tuple(f.factors) if isinstance(f, finat.TensorProductElement) else (f,) new_terms.extend([t_factors + f_factors for t_factors in terms]) terms = new_terms - - if len(terms) == 1: - return ufl.TensorProductElement(*terms[0]) - else: - terms = [ufl.TensorProductElement(*k) for k in terms] - cell, = set([t.cell() for t in terms]) - return ufl.EnrichedElement(*terms) + terms = list(map(finat.TensorProductElement, terms)) + return finat.EnrichedElement(terms) else: return ele @@ -632,19 +610,15 @@ def compare_dual_basis(l1, l2): @PETSc.Log.EventDecorator("GetLineElements") def get_permutation_to_line_elements(V): from FIAT.reference_element import LINE - from tsfc.finatinterface import create_element ele = V.ufl_element() if isinstance(ele, ufl.MixedElement) and not isinstance(ele, 
(ufl.TensorElement, ufl.VectorElement)): raise ValueError("MixedElements are not decomposed into tensor products") - ele = expand_element(ele) - finat_ele = create_element(ele) + finat_ele = expand_element(V.finat_element) if finat_ele.space_dimension() != V.finat_element.space_dimension(): raise ValueError("Failed to decompose %s into tensor products" % V.ufl_element()) line_elements = [] - axes_shifts = [] - terms = finat_ele.elements if hasattr(finat_ele, "elements") else [finat_ele] for term in terms: factors = term.factors if hasattr(term, "factors") else (term,) @@ -658,40 +632,43 @@ def get_permutation_to_line_elements(V): dof_ranges = numpy.cumsum([0] + sizes) dof_perm = [] + unique_line_elements = [] shifts = [] - grouped = [False for e in line_elements] - nterms = len(line_elements) - unique_line_elements = [] - while not all(grouped): - istart = grouped.index(False) - expansion = line_elements[istart] - unique_line_elements.append(expansion) + visit = [False for e in line_elements] + while False in visit: + base = line_elements[visit.index(False)] + tdim = len(base) + pshape = tuple(e.space_dimension() for e in base) + unique_line_elements.append(base) + axes_shifts = tuple() + for shift in range(tdim): + if V.finat_element.formdegree != 2: + shift = (tdim - shift) % tdim - tdim = len(expansion) - permutations = [expansion[k:] + expansion[:k] for k in range(tdim)] - for i in range(istart, nterms): - ecur = line_elements[i] - if not grouped[i]: - for shift, perm in enumerate(permutations): + perm = base[shift:] + base[:shift] + for i, expansion in enumerate(line_elements): + if not visit[i]: is_perm = all([e1.space_dimension() == e2.space_dimension() - for e1, e2 in zip(perm, ecur)]) - for e1, e2 in zip(perm, ecur): + for e1, e2 in zip(perm, expansion)]) + for e1, e2 in zip(perm, expansion): if is_perm: is_perm = compare_element(e1, e2) if is_perm: - axes_shifts += ((tdim - shift) % tdim,) - axes = numpy.arange(tdim) - dofs = numpy.arange(*dof_ranges[i:i+2], dtype=PETSc.IntType).reshape(tp_shape[istart]) - dofs = numpy.transpose(dofs, axes=numpy.roll(axes, -shift)) - dof_perm.extend(dofs.flat) - grouped[i] = True + axes_shifts += ((tdim - shift) % tdim, ) + dofs = numpy.arange(*dof_ranges[i:i+2], dtype=PETSc.IntType).reshape(tp_shape[i]) + dofs = numpy.transpose(dofs, axes=numpy.roll(numpy.arange(tdim), shift)) + assert dofs.shape == pshape + dof_perm.append(dofs.flat) + visit[i] = True break shifts.append(axes_shifts) + dof_perm = numpy.concatenate(dof_perm) + dof_perm = numpy.argsort(dof_perm) return dof_perm, unique_line_elements, shifts @@ -1138,16 +1115,16 @@ def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name): return decl, prolong, restrict -@PETSc.Log.EventDecorator("GetPermutedMap") def get_permuted_map(V): """ Return a PermutedMap with the same tensor product shape for every component of H(div) or H(curl) tensor product elements """ - perm, _, shifts = get_permutation_to_line_elements(V) - if {(0, )} == set(shifts): + perm, _, _ = get_permutation_to_line_elements(V) + if all(perm[:-1] < perm[1:]): return V.cell_node_map() + return PermutedMap(V.cell_node_map(), perm) diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index fb6d882617..37c56a37fa 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -96,13 +96,39 @@ def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): ui = Function(Vi) uf = Function(Vf) uc = Function(Vc) - uc.dat.data[0::2] = 2.0 + 
uc.dat.data[0::2] = 0.0 uc.dat.data[1::2] = 1.0 + # import numpy + # from firedrake.preconditioners.pmg import get_permutation_to_line_elements + + # cperm, _, _ = get_permutation_to_line_elements(Vc) + # cnum = Vc.cell_node_map().values[0] + # uc.dat.data[cnum[cperm]] = numpy.arange(1, 1+len(uc.dat.data)) + + # fperm, _, _ = get_permutation_to_line_elements(Vf) + # fnum = Vf.cell_node_map().values[0] + + # print() + # # print("cperm", cperm) + # # print("fperm", fperm) + # print("inv(cperm)", numpy.argsort(cperm)) + # print("inv(fperm)", numpy.argsort(fperm)) + + # expr = Constant([0]*3) + # for row in numpy.eye(3): + # expr.assign(row) + # uf.project(expr, solver_parameters={"mat_type": "matfree"}) + # print(numpy.nonzero(numpy.rint(uf.dat.data[fnum]).astype(int))[0]) + + # uf.project(uc, solver_parameters={"mat_type": "matfree"}) + # print(numpy.rint(uf.dat.data[fnum[fperm]]).astype(int)) for v in [ui, uf]: P = prolongation_matrix_matfree(v, uc).getPythonContext() P._prolong() + # print(numpy.rint(uf.dat.data[fnum[fperm]]).astype(int)) + assert norm(ui + uf - uc, "L2") < 2E-14 From fc88b9a18fed0d6c572b251904193852f9f40d16 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 15 Mar 2023 11:02:45 +0000 Subject: [PATCH 17/75] fix BLAS kernels for FacetElement(NCE) --- firedrake/preconditioners/fdm.py | 2 +- firedrake/preconditioners/pmg.py | 62 ++++++++++++++--------------- tests/multigrid/test_p_multigrid.py | 31 ++------------- 3 files changed, 33 insertions(+), 62 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index a4dd248622..1aeaa8b3a3 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1074,7 +1074,7 @@ class PoissonFDMPC(FDMPC): def assemble_reference_tensor(self, V): from firedrake.preconditioners.pmg import get_permutation_to_line_elements try: - _, line_elements, shifts = get_permutation_to_line_elements(V) + _, line_elements, shifts = get_permutation_to_line_elements(V.finat_element) except ValueError: raise ValueError("FDMPC does not support the element %s" % V.ufl_element()) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index b8638d1095..6777b1ecd9 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -608,27 +608,24 @@ def compare_dual_basis(l1, l2): @lru_cache(maxsize=10) @PETSc.Log.EventDecorator("GetLineElements") -def get_permutation_to_line_elements(V): +def get_permutation_to_line_elements(finat_element): from FIAT.reference_element import LINE - ele = V.ufl_element() - if isinstance(ele, ufl.MixedElement) and not isinstance(ele, (ufl.TensorElement, ufl.VectorElement)): - raise ValueError("MixedElements are not decomposed into tensor products") - finat_ele = expand_element(V.finat_element) - if finat_ele.space_dimension() != V.finat_element.space_dimension(): - raise ValueError("Failed to decompose %s into tensor products" % V.ufl_element()) + expansion = expand_element(finat_element) + if expansion.space_dimension() != finat_element.space_dimension(): + raise ValueError("Failed to decompose %s into tensor products" % finat_element) line_elements = [] - terms = finat_ele.elements if hasattr(finat_ele, "elements") else [finat_ele] + terms = expansion.elements if hasattr(expansion, "elements") else [expansion] for term in terms: factors = term.factors if hasattr(term, "factors") else (term,) - expansion = tuple(e.fiat_equivalent for e in reversed(factors)) - if not all([e.get_reference_element().shape == LINE for e in 
expansion]): - raise ValueError("Failed to decompose %s into line elements" % V.ufl_element()) - line_elements.append(expansion) + fiat_factors = tuple(e.fiat_equivalent for e in reversed(factors)) + if not all([e.get_reference_element().shape == LINE for e in fiat_factors]): + raise ValueError("Failed to decompose %s into line elements" % fiat_factors) + line_elements.append(fiat_factors) - tp_shape = [tuple(e.space_dimension() for e in expansion) for expansion in line_elements] - sizes = list(map(numpy.prod, tp_shape)) + shapes = [tuple(e.space_dimension() for e in factors) for factors in line_elements] + sizes = list(map(numpy.prod, shapes)) dof_ranges = numpy.cumsum([0] + sizes) dof_perm = [] @@ -644,23 +641,23 @@ def get_permutation_to_line_elements(V): axes_shifts = tuple() for shift in range(tdim): - if V.finat_element.formdegree != 2: + if finat_element.formdegree != 2: shift = (tdim - shift) % tdim perm = base[shift:] + base[:shift] - for i, expansion in enumerate(line_elements): + for i, term in enumerate(line_elements): if not visit[i]: is_perm = all([e1.space_dimension() == e2.space_dimension() - for e1, e2 in zip(perm, expansion)]) - for e1, e2 in zip(perm, expansion): + for e1, e2 in zip(perm, term)]) + for e1, e2 in zip(perm, term): if is_perm: is_perm = compare_element(e1, e2) if is_perm: axes_shifts += ((tdim - shift) % tdim, ) - dofs = numpy.arange(*dof_ranges[i:i+2], dtype=PETSc.IntType).reshape(tp_shape[i]) - dofs = numpy.transpose(dofs, axes=numpy.roll(numpy.arange(tdim), shift)) - assert dofs.shape == pshape + dofs = numpy.arange(*dof_ranges[i:i+2], dtype=PETSc.IntType).reshape(pshape) + dofs = numpy.transpose(dofs, axes=numpy.roll(numpy.arange(tdim), -shift)) + assert dofs.shape == shapes[i] dof_perm.append(dofs.flat) visit[i] = True break @@ -668,7 +665,6 @@ def get_permutation_to_line_elements(V): shifts.append(axes_shifts) dof_perm = numpy.concatenate(dof_perm) - dof_perm = numpy.argsort(dof_perm) return dof_perm, unique_line_elements, shifts @@ -836,8 +832,8 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): operator_decl = [] prolong_code = [] restrict_code = [] - _, felems, fshifts = get_permutation_to_line_elements(Vf) - _, celems, cshifts = get_permutation_to_line_elements(Vc) + _, felems, fshifts = get_permutation_to_line_elements(Vf.finat_element) + _, celems, cshifts = get_permutation_to_line_elements(Vc.finat_element) shifts = fshifts in_place = False @@ -866,17 +862,17 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): pstride = psize * numpy.prod(pshape) if set(cshifts) == set(fshifts): - psize *= len(cshifts[0]) - pstride *= len(cshifts[0]) + csize = Vc.value_size * Vc.finat_element.space_dimension() prolong_code.append(f""" - for({IntType_c} j=1; j<{len(fshifts)}; j++) - permute_axis(0, {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); + for({IntType_c} i=1; i<{len(fshifts)}; i++) + for({IntType_c} j=0; j<{csize}; j++) + {t_in}[i*{csize} + j] = {t_in}[j]; """) restrict_code.append(f""" - for({IntType_c} j=1; j<{len(fshifts)}; j++) - ipermute_axis(0, {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); + for({IntType_c} i=1; i<{len(fshifts)}; i++) + for({IntType_c} j=0; j<{csize}; j++) + {t_in}[j] += {t_in}[i*{csize} + j]; """) - psize = 1 elif pelem == celems[0]: for k in range(len(shifts)): @@ -1069,7 +1065,7 @@ def make_mapping_code(Q, fmapping, cmapping, t_in, t_out): def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name): - _, _, shifts = get_permutation_to_line_elements(V) + _, _, shifts = 
get_permutation_to_line_elements(V.finat_element) shift = shifts[0] if shift != (0,): ndof = numpy.prod(vshape) @@ -1121,7 +1117,7 @@ def get_permuted_map(V): every component of H(div) or H(curl) tensor product elements """ - perm, _, _ = get_permutation_to_line_elements(V) + perm, _, _ = get_permutation_to_line_elements(V.finat_element) if all(perm[:-1] < perm[1:]): return V.cell_node_map() diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index 37c56a37fa..f81b707fda 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -28,7 +28,8 @@ def tp_family(tp_mesh, request): return families[request.param] -@pytest.fixture(params=[None, "hierarchical", "fdm"], ids=["spectral", "hierarchical", "fdm"]) +@pytest.fixture(params=[None, "hierarchical", "fdm"], + ids=["spectral", "hierarchical", "fdm"]) def variant(request): return request.param @@ -86,7 +87,7 @@ def test_prolong_de_rham(tp_mesh): def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): from firedrake.preconditioners.pmg import prolongation_matrix_matfree - degree = 2 + degree = 3 cell = tp_mesh.ufl_cell() element = FiniteElement(tp_family, cell=cell, degree=degree, variant=variant) Vi = FunctionSpace(tp_mesh, RestrictedElement(element, restriction_domain="interior")) @@ -98,37 +99,11 @@ def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): uc = Function(Vc) uc.dat.data[0::2] = 0.0 uc.dat.data[1::2] = 1.0 - # import numpy - # from firedrake.preconditioners.pmg import get_permutation_to_line_elements - - # cperm, _, _ = get_permutation_to_line_elements(Vc) - # cnum = Vc.cell_node_map().values[0] - # uc.dat.data[cnum[cperm]] = numpy.arange(1, 1+len(uc.dat.data)) - - # fperm, _, _ = get_permutation_to_line_elements(Vf) - # fnum = Vf.cell_node_map().values[0] - - # print() - # # print("cperm", cperm) - # # print("fperm", fperm) - # print("inv(cperm)", numpy.argsort(cperm)) - # print("inv(fperm)", numpy.argsort(fperm)) - - # expr = Constant([0]*3) - # for row in numpy.eye(3): - # expr.assign(row) - # uf.project(expr, solver_parameters={"mat_type": "matfree"}) - # print(numpy.nonzero(numpy.rint(uf.dat.data[fnum]).astype(int))[0]) - - # uf.project(uc, solver_parameters={"mat_type": "matfree"}) - # print(numpy.rint(uf.dat.data[fnum[fperm]]).astype(int)) for v in [ui, uf]: P = prolongation_matrix_matfree(v, uc).getPythonContext() P._prolong() - # print(numpy.rint(uf.dat.data[fnum[fperm]]).astype(int)) - assert norm(ui + uf - uc, "L2") < 2E-14 From 8df4c25f58b48df4dced2122fceb195c003ad1c2 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 15 Mar 2023 12:18:36 +0000 Subject: [PATCH 18/75] clean up --- firedrake/preconditioners/pmg.py | 30 +++++++++++++---------------- tests/multigrid/test_p_multigrid.py | 2 +- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 6777b1ecd9..e626999b11 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -857,21 +857,17 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): else: raise ValueError("Cannot assign fine to coarse DOFs") - pshape = [e.space_dimension() for e in pelem] - pargs = ", ".join(map(str, pshape+[1]*(3-len(pshape)))) - pstride = psize * numpy.prod(pshape) - if set(cshifts) == set(fshifts): csize = Vc.value_size * Vc.finat_element.space_dimension() prolong_code.append(f""" - for({IntType_c} i=1; i<{len(fshifts)}; i++) - for({IntType_c} j=0; j<{csize}; j++) - 
{t_in}[i*{csize} + j] = {t_in}[j]; + for({IntType_c} j=1; j<{len(fshifts)}; j++) + for({IntType_c} i=0; i<{csize}; i++) + {t_in}[j*{csize} + i] = {t_in}[i]; """) restrict_code.append(f""" - for({IntType_c} i=1; i<{len(fshifts)}; i++) - for({IntType_c} j=0; j<{csize}; j++) - {t_in}[j] += {t_in}[i*{csize} + j]; + for({IntType_c} j=1; j<{len(fshifts)}; j++) + for({IntType_c} i=0; i<{csize}; i++) + {t_in}[i] += {t_in}[j*{csize} + i]; """) elif pelem == celems[0]: @@ -879,12 +875,15 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): if Vc.value_size*len(shifts[k]) < Vf.value_size: shifts[k] = shifts[k]*(Vf.value_size//Vc.value_size) + pshape = [e.space_dimension() for e in pelem] + pargs = ", ".join(map(str, pshape+[1]*(3-len(pshape)))) + pstride = psize * numpy.prod(pshape) + perm = sum(shifts, tuple()) perm_data = ", ".join(map(str, perm)) operator_decl.append(f""" PetscBLASInt {perm_name}[{len(perm)}] = {{ {perm_data} }}; """) - prolong_code.append(f""" for({IntType_c} j=1; j<{len(perm)}; j++) permute_axis({perm_name}[j], {pargs}, {psize}, {t_in}, {t_in}+j*{pstride}); @@ -1116,12 +1115,10 @@ def get_permuted_map(V): Return a PermutedMap with the same tensor product shape for every component of H(div) or H(curl) tensor product elements """ - - perm, _, _ = get_permutation_to_line_elements(V.finat_element) - if all(perm[:-1] < perm[1:]): + indices, _, _ = get_permutation_to_line_elements(V.finat_element) + if all(indices[:-1] < indices[1:]): return V.cell_node_map() - - return PermutedMap(V.cell_node_map(), perm) + return PermutedMap(V.cell_node_map(), indices) class StandaloneInterpolationMatrix(object): @@ -1546,5 +1543,4 @@ def prolongation_matrix_matfree(Vf, Vc, Vf_bcs=[], Vc_bcs=[]): sizes = (Vf.dof_dset.layout_vec.getSizes(), Vc.dof_dset.layout_vec.getSizes()) M_shll = PETSc.Mat().createPython(sizes, ctx, comm=Vf._comm) M_shll.setUp() - return M_shll diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index f81b707fda..212a06bae1 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -87,7 +87,7 @@ def test_prolong_de_rham(tp_mesh): def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): from firedrake.preconditioners.pmg import prolongation_matrix_matfree - degree = 3 + degree = 5 cell = tp_mesh.ufl_cell() element = FiniteElement(tp_family, cell=cell, degree=degree, variant=variant) Vi = FunctionSpace(tp_mesh, RestrictedElement(element, restriction_domain="interior")) From f33456fff58d5dae7dfbbe0d6c0a692149c43058 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 15 Mar 2023 17:29:33 +0000 Subject: [PATCH 19/75] fix H(div) IPDG solver, use more elegant caching --- firedrake/preconditioners/fdm.py | 67 ++++++------------ firedrake/preconditioners/pmg.py | 101 +++++++++++++++------------- tests/multigrid/test_p_multigrid.py | 2 +- 3 files changed, 76 insertions(+), 94 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 1aeaa8b3a3..c315b9dbd5 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -53,12 +53,12 @@ class FDMPC(PCBase): @staticmethod def load_set_values(triu=False): - cache = FDMPC._c_code_cache key = triu - if key not in cache: - comm = PETSc.COMM_SELF - cache[key] = load_assemble_csr(comm, triu=triu) - return cache[key] + cache = FDMPC._c_code_cache + try: + return cache[key] + except KeyError: + return cache.setdefault(key, load_assemble_csr(PETSc.COMM_SELF, triu=triu)) 
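
Patch 19/75 ("use more elegant caching") swaps the earlier "if key not in cache" checks for a look-up in a class-level dict through try/except KeyError and dict.setdefault, as in load_set_values above; the same idiom reappears in assemble_coef, assemble_reference_tensor and PMGBase.create_transfer further down the series. A minimal, self-contained sketch of the pattern follows; Cached and expensive_build are illustrative stand-ins, not Firedrake API.

def expensive_build(key):
    # stand-in for e.g. load_assemble_csr(...) or a prolongation matrix constructor
    return object()

class Cached:
    _cache = {}

    @staticmethod
    def get(key):
        cache = Cached._cache
        try:
            return cache[key]
        except KeyError:
            # setdefault hands back the value already stored under key if one exists,
            # so repeated calls always return the same cached object
            return cache.setdefault(key, expensive_build(key))

assert Cached.get("fdm") is Cached.get("fdm")   # built once, then reused

The advantage of setdefault over plain assignment is that the first value stored under a key wins, so re-entrant or repeated calls agree on a single cached object.
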
@PETSc.Log.EventDecorator("FDMInit") def initialize(self, pc): @@ -545,8 +545,9 @@ def assemble_coef(self, J, form_compiler_parameters): key = (mixed_form.signature(), mesh) block_diagonal = True - - if key not in self._coefficient_cache: + try: + return self._coefficient_cache[key] + except KeyError: if not block_diagonal or not V.shape: tensor = firedrake.Function(Z) coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} @@ -562,9 +563,7 @@ def assemble_coef(self, J, form_compiler_parameters): ctx = sub.getPythonContext() coefficients[name] = ctx._block_diagonal assembly_callables.append(ctx._assemble_block_diagonal) - - self._coefficient_cache[key] = (coefficients, assembly_callables) - return self._coefficient_cache[key] + return self._coefficient_cache.setdefault(key, (coefficients, assembly_callables)) @PETSc.Log.EventDecorator("FDMRefTensor") def assemble_reference_tensor(self, V): @@ -581,14 +580,16 @@ def assemble_reference_tensor(self, V): is_interior, is_facet = is_restricted(V.finat_element) key = (degree, tdim, formdegree, V.value_size, is_interior, is_facet) cache = self._reference_tensor_cache - if key not in cache: + try: + return cache[key] + except KeyError: full_key = (degree, tdim, formdegree, V.value_size, False, False) if is_facet and full_key in cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) - cache[key] = result.createSubMatrix(noperm, self.ises[1]) + result = result.createSubMatrix(noperm, self.ises[1]) noperm.destroy() - return cache[key] + return cache.setdefault(key, result) elements = sorted(get_base_elements(V.finat_element), key=lambda e: e.formdegree) ref_el = elements[0].get_reference_element() @@ -635,8 +636,7 @@ def assemble_reference_tensor(self, V): result = result.createSubMatrix(noperm, self.ises[1]) noperm.destroy() - cache[key] = result - return cache[key] + return cache.setdefault(key, result) def factor_interior_mat(A00): @@ -1091,7 +1091,8 @@ def assemble_reference_tensor(self, V): bdof = [] # indices of point evaluation dofs for each direction for e in line_elements: Afdm[:0], Dfdm[:0], bdof[:0] = tuple(zip(fdm_setup_ipdg(e, eta))) - if not (e.formdegree or is_dg): + if not is_dg and e.degree() == degree: + # do not apply SIPG along continuous directions Dfdm[0] = None return Afdm, Dfdm, bdof @@ -1144,7 +1145,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): # assemble zero-th order term separately, including off-diagonals (mixed components) # I cannot do this for hdiv elements as off-diagonals are not sparse, this is because - # the FDM eigenbases for GLL(N) and GLL(N-1) are not orthogonal to each other + # the FDM eigenbases for CG(k) and DG(k-1) are not orthogonal to each other rindices = None use_diag_Bq = Bq is None or len(Bq.ufl_shape) != 2 or static_condensation if not use_diag_Bq: @@ -1500,15 +1501,6 @@ def pull_axis(x, pshape, idir): return numpy.reshape(numpy.moveaxis(numpy.reshape(x.copy(), pshape), idir, 0), x.shape) -def set_submat_csr(A_global, A_local, global_indices, imode): - """insert values from A_local to A_global on the diagonal block with indices global_indices""" - indptr, indices, data = A_local.getValuesCSR() - for i, row in enumerate(global_indices.flat): - i0 = indptr[i] - i1 = indptr[i+1] - A_global.setValues(row, global_indices.flat[indices[i0:i1]], data[i0:i1], imode) - - def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False): """ Create a SeqAIJ Mat from a dense matrix using the diagonal and a 
subset of rows and columns. @@ -1555,7 +1547,7 @@ def fdm_setup_ipdg(fdm_element, eta): Bhat, and bcs(Ahat) for every combination of either natural or weak Dirichlet BCs on each endpoint. Dfdm: the tabulation of the normal derivatives of the Dirichlet eigenfunctions. - bdof: the indices of PointEvaluation dofs. + bdof: the indices of the vertex degrees of freedom. """ ref_el = fdm_element.get_reference_element() degree = fdm_element.degree() @@ -1563,7 +1555,8 @@ def fdm_setup_ipdg(fdm_element, eta): rule = fdm_element.dual.rule else: rule = FIAT.quadrature.make_quadrature(ref_el, degree+1) - bdof = [k for k, f in enumerate(fdm_element.dual_basis()) if isinstance(f, FIAT.functional.PointEvaluation)] + edof = fdm_element.entity_dofs() + bdof = edof[0][0] + edof[0][1] phi = fdm_element.tabulate(1, rule.get_points()) Jhat = phi[(0, )] @@ -1749,21 +1742,3 @@ def glonum(node_map): nelz = layers[:, 1]-layers[:, 0]-1 to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz]) return numpy.repeat(node_map.values_with_halo, nelz, axis=0) + numpy.kron(to_layer.reshape((-1, 1)), node_map.offset) - - -def spy(A, comm=None): - import matplotlib.pyplot as plt - import scipy.sparse as sp - if comm is None: - comm = A.comm - nnz = A.getInfo()["nz_used"] - if A.getType().endswith("sbaij"): - A.setOption(PETSc.Mat.Option.GETROW_UPPERTRIANGULAR, True) - csr = tuple(reversed(A.getValuesCSR())) - if comm.rank == 0: - csr[0].fill(1) - scipy_mat = sp.csr_matrix(csr, shape=A.getSize()) - fig, axes = plt.subplots(nrows=1, ncols=1) - axes.spy(scipy_mat, marker=".", markersize=2) - plt.title("nnz(A) = %d" % nnz) - plt.show() diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index e626999b11..bf3a462e0a 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -50,6 +50,8 @@ class PMGBase(PCSNESBase): _prefix = "pmg_" + _cache_transfer = {} + def coarsen_element(self, ele): """ Coarsen a given element to form the next problem down in the p-hierarchy. 
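
The hunk below rewrites PMGBase.create_transfer to fetch transfer operators from the class-level _cache_transfer dict introduced above, keyed on the fine and coarse spaces, both sets of boundary conditions, and the matrix type; the boundary-condition lists are converted to tuples because dict keys must be hashable, and mat_type is part of the key because the "matfree" and "aij" constructors return different operators. A small plain-Python sketch of the hashability point, with placeholder strings standing in for Firedrake function spaces and DirichletBC lists:

cache = {}
fbcs = []                                  # list of boundary conditions in the real code
key = ("Vf", "Vc", tuple(fbcs), "aij")     # hashable: tuples only
cache[key] = "prolongation operator"
try:
    cache[("Vf", "Vc", fbcs, "aij")] = "prolongation operator"
except TypeError as err:
    print(err)                             # lists are unhashable and cannot be used in dict keys
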
@@ -333,19 +335,22 @@ def coarsen_bcs(self, fbcs, cV): raise NotImplementedError("Unsupported BC type, please get in touch if you need this") return cbcs - @staticmethod - @lru_cache(maxsize=20) - def create_transfer(cctx, fctx, mat_type, cbcs, fbcs): - cbcs = cctx._problem.bcs if cbcs else [] - fbcs = fctx._problem.bcs if fbcs else [] + def create_transfer(self, cctx, fctx, mat_type, cbcs, fbcs): cV = cctx.J.arguments()[0].function_space() fV = fctx.J.arguments()[0].function_space() - if mat_type == "matfree": - return prolongation_matrix_matfree(fV, cV, fbcs, cbcs) - elif mat_type == "aij": - return prolongation_matrix_aij(fV, cV, fbcs, cbcs) - else: - raise ValueError("Unknown matrix type") + cbcs = tuple(cctx._problem.bcs) if cbcs else tuple() + fbcs = tuple(fctx._problem.bcs) if fbcs else tuple() + key = (fV, cV, cbcs, fbcs, mat_type) + try: + return self._cache_transfer[key] + except KeyError: + if mat_type == "matfree": + construct_mat = prolongation_matrix_matfree + elif mat_type == "aij": + construct_mat = prolongation_matrix_aij + else: + raise ValueError("Unknown matrix type") + return self._cache_transfer.setdefault(key, construct_mat(fV, cV, fbcs, cbcs)) def create_interpolation(self, dmc, dmf): prefix = dmc.getOptionsPrefix() @@ -592,18 +597,13 @@ def compare_dual(b1, b2): k1 = numpy.array([p1[k][0][0] for k in p1]) k2 = numpy.array([p2[k][0][0] for k in p2]) - if not numpy.allclose(k1, k2, rtol=1E-16, atol=1E-16): - return False - return True + return numpy.allclose(k1, k2, rtol=1E-16, atol=1E-16) def compare_dual_basis(l1, l2): if len(l1) != len(l2): return False - for b1, b2 in zip(l1, l2): - if not compare_dual(b1, b2): - return False - return True + return all(compare_dual(b1, b2) for b1, b2 in zip(l1, l2)) @lru_cache(maxsize=10) @@ -615,14 +615,26 @@ def get_permutation_to_line_elements(finat_element): if expansion.space_dimension() != finat_element.space_dimension(): raise ValueError("Failed to decompose %s into tensor products" % finat_element) + unique_factors = [] line_elements = [] terms = expansion.elements if hasattr(expansion, "elements") else [expansion] for term in terms: factors = term.factors if hasattr(term, "factors") else (term,) - fiat_factors = tuple(e.fiat_equivalent for e in reversed(factors)) - if not all([e.get_reference_element().shape == LINE for e in fiat_factors]): + fiat_factors = [e.fiat_equivalent for e in reversed(factors)] + if any(e.get_reference_element().shape != LINE for e in fiat_factors): raise ValueError("Failed to decompose %s into line elements" % fiat_factors) - line_elements.append(fiat_factors) + + # use the same FIAT element if it appears multiple times in the expansion + for i in range(len(fiat_factors)): + n = fiat_factors[i] + for f in unique_factors: + if compare_element(n, f): + n = f + break + if n is fiat_factors[i]: + unique_factors.append(n) + fiat_factors[i] = n + line_elements.append(tuple(fiat_factors)) shapes = [tuple(e.space_dimension() for e in factors) for factors in line_elements] sizes = list(map(numpy.prod, shapes)) @@ -647,11 +659,10 @@ def get_permutation_to_line_elements(finat_element): perm = base[shift:] + base[:shift] for i, term in enumerate(line_elements): if not visit[i]: - is_perm = all([e1.space_dimension() == e2.space_dimension() - for e1, e2 in zip(perm, term)]) - for e1, e2 in zip(perm, term): - if is_perm: - is_perm = compare_element(e1, e2) + is_perm = all(e1.space_dimension() == e2.space_dimension() + for e1, e2 in zip(perm, term)) + if is_perm: + is_perm = all(compare_element(e1, e2) for e1, 
e2 in zip(perm, term)) if is_perm: axes_shifts += ((tdim - shift) % tdim, ) @@ -660,7 +671,6 @@ def get_permutation_to_line_elements(finat_element): assert dofs.shape == shapes[i] dof_perm.append(dofs.flat) visit[i] = True - break shifts.append(axes_shifts) @@ -838,7 +848,7 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): shifts = fshifts in_place = False if len(felems) == len(celems): - in_place = all([(len(fs)*Vf.value_size == len(cs)*Vc.value_size) for fs, cs in zip(fshifts, cshifts)]) + in_place = all((len(fs)*Vf.value_size == len(cs)*Vc.value_size) for fs, cs in zip(fshifts, cshifts)) psize = Vf.value_size if not in_place: @@ -914,7 +924,7 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): cshapes.append((nscal,) + tuple(cshape)) J = [fiat_reference_prolongator(fe, ce).T for fe, ce in zip(felem, celem)] - if any([Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J]): + if any(Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J): prolong_code.append(f""" for({IntType_c} i=0; i<{nscal*numpy.prod(fshape)}; i++) {t_out}[i+{fskip}] = 0.0E0; """) @@ -1116,7 +1126,7 @@ def get_permuted_map(V): every component of H(div) or H(curl) tensor product elements """ indices, _, _ = get_permutation_to_line_elements(V.finat_element) - if all(indices[:-1] < indices[1:]): + if numpy.all(indices[:-1] < indices[1:]): return V.cell_node_map() return PermutedMap(V.cell_node_map(), indices) @@ -1129,24 +1139,21 @@ class StandaloneInterpolationMatrix(object): _cache_work = {} def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): + self.uf = self.work_function(Vf) + self.uc = self.work_function(Vc) + self.Vf = self.uf.function_space() + self.Vc = self.uc.function_space() self.Vf_bcs = Vf_bcs self.Vc_bcs = Vc_bcs - if isinstance(Vf, firedrake.Function): - self.uf = Vf - Vf = Vf.function_space() - else: - if Vf not in self._cache_work: - self._cache_work[Vf] = firedrake.Function(Vf) - self.uf = self._cache_work[Vf] - if isinstance(Vc, firedrake.Function): - self.uc = Vc - Vc = Vc.function_space() + + def work_function(self, V): + if isinstance(V, firedrake.Function): + return V else: - if Vc not in self._cache_work: - self._cache_work[Vc] = firedrake.Function(Vc) - self.uc = self._cache_work[Vc] - self.Vf = Vf - self.Vc = Vc + try: + return self._cache_work[V] + except KeyError: + return self._cache_work.setdefault(V, firedrake.Function(V)) @cached_property def _weight(self): @@ -1187,8 +1194,8 @@ def _kernels(self): self.uf.dat(op2.READ, uf_map), self._weight.dat(op2.READ, uf_map)] coefficient_args = [c.dat(op2.READ, c.cell_node_map()) for c in coefficients] - prolong = partial(op2.par_loop, *prolong_args, *coefficient_args) - restrict = partial(op2.par_loop, *restrict_args, *coefficient_args) + prolong = op2.ParLoop(*prolong_args, *coefficient_args) + restrict = op2.ParLoop(*restrict_args, *coefficient_args) return prolong, restrict def _prolong(self): diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index 212a06bae1..0b6c36f0b9 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -5,7 +5,7 @@ @pytest.fixture(params=[2, 3], ids=["Rectangle", "Box"]) def tp_mesh(request): - nx = 1 + nx = 4 distribution = {"overlap_type": (DistributedMeshOverlapType.VERTEX, 1)} m = UnitSquareMesh(nx, nx, quadrilateral=True, distribution_parameters=distribution) if request.param == 3: From 0f2e80892b6b579bfd1e72c1818a72b8ce76c143 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 13:21:33 +0000 Subject: [PATCH 
20/75] add comments --- firedrake/preconditioners/fdm.py | 106 +++++++++++++++++----------- firedrake/preconditioners/pmg.py | 45 ++++++------ tests/multigrid/test_p_multigrid.py | 14 +++- tests/regression/test_fdm.py | 16 +++-- 4 files changed, 105 insertions(+), 76 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index c315b9dbd5..35aed9fed4 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -104,7 +104,7 @@ def initialize(self, pc): if element == e_fdm: V_fdm, J_fdm, bcs_fdm = (V, J, bcs) else: - # Matrix-free assembly of the transformed Jacobian + # Reconstruct forms with variant element V_fdm = firedrake.FunctionSpace(V.mesh(), e_fdm) J_fdm = J(*[t.reconstruct(function_space=V_fdm) for t in J.arguments()], coefficients={}) bcs_fdm = [] @@ -114,6 +114,7 @@ def initialize(self, pc): W = W.sub(index) bcs_fdm.append(bc.reconstruct(V=W, g=0)) + # Construct interpolation from original to variant spaces self.fdm_interp = prolongation_matrix_matfree(V, V_fdm, [], bcs_fdm) self.work_vec_x = Amat.createVecLeft() self.work_vec_y = Amat.createVecRight() @@ -156,7 +157,7 @@ def interp_nullspace(I, nsp): fcp=fcp, options_prefix=options_prefix) # Assemble the FDM preconditioner with sparse local matrices - Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, appctx, pmat_type) + Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type) self._assemble_P() Pmat.setNullSpace(Amat.getNullSpace()) Pmat.setTransposeNullSpace(Amat.getTransposeNullSpace()) @@ -183,15 +184,14 @@ def interp_nullspace(I, nsp): fdmpc.setFromOptions() @PETSc.Log.EventDecorator("FDMPrealloc") - def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, appctx, pmat_type): + def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type): """ - Assemble the sparse preconditioner with cell-wise constant coefficients. + Assemble the sparse preconditioner from diagonal mass matrices. 
:arg V: the :class:`.FunctionSpace` of the form arguments :arg J: the Jacobian bilinear form :arg bcs: an iterable of boundary conditions on V :arg form_compiler_parameters: parameters to assemble diagonal factors - :arg appctx: the application context :pmat_type: the preconditioner `PETSc.Mat.Type` :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable @@ -232,15 +232,15 @@ def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, appctx, pmat_type i1 = PETSc.IS().createGeneral(dofs, comm=PETSc.COMM_SELF) self.get_static_condensation[V] = lambda Ae: condense_element_pattern(Ae, self.ises[0], i1, self.submats) - # dict of cell to global mappings for each function space - self.cell_to_global = dict() - self.lgmaps = dict() - @PETSc.Log.EventDecorator("FDMGetIndices") def cell_to_global(lgmap, cell_to_local, cell_index, result=None): + # Be careful not to create new arrays result = cell_to_local(cell_index, result=result) return lgmap.apply(result, result=result) + # Create data strctures needed for assembly + self.cell_to_global = dict() + self.lgmaps = dict() bc_rows = dict() for Vsub in V: lgmap = Vsub.local_to_global_map([bc.reconstruct(V=Vsub, g=0) for bc in bcs]) @@ -253,13 +253,13 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): bdofs = numpy.nonzero(lgmap.indices[:own] < 0)[0].astype(PETSc.IntType) bc_rows[Vsub] = Vsub.dof_dset.lgmap.apply(bdofs, result=bdofs) - # get coefficients on a given cell coefficients, assembly_callables = self.assemble_coef(J, form_compiler_parameters) coeffs = [coefficients.get(k) for k in ("beta", "alpha")] cmaps = [glonum_fun(ck.cell_node_map())[0] for ck in coeffs] @PETSc.Log.EventDecorator("FDMGetCoeffs") def get_coeffs(e, result=None): + # Get vector for betas and alphas on a cell vals = [] for k, (coeff, cmap) in enumerate(zip(coeffs, cmaps)): get_coeffs.indices[k] = cmap(e, result=get_coeffs.indices[k]) @@ -277,6 +277,7 @@ def get_coeffs(e, result=None): # Store only off-diagonal blocks with more columns than rows to save memory Vsort = sorted(V, key=lambda Vsub: Vsub.dim()) + # Loop over all pairs of subspaces for Vrow, Vcol in product(Vsort, Vsort): if symmetric and (Vcol, Vrow) in Pmats: P = PETSc.Mat().createTranspose(Pmats[Vcol, Vrow]) @@ -428,6 +429,7 @@ def update_De(data): De.setDiagonal(work_vec, addv=insert) return De + # Core assembly loop for e in range(self.nel): rindices = get_rindices(e, result=rindices) cindices = get_cindices(e, result=cindices) @@ -438,6 +440,7 @@ def update_De(data): work_vec.destroy() elif self.nel: + # Preallocation of the sparsity pattern if common_key not in self.work_mats: data = self.get_coeffs(0) data.fill(1.0E0) @@ -640,8 +643,11 @@ def assemble_reference_tensor(self, V): def factor_interior_mat(A00): - # Assume that interior DOF list i0 is ordered such that A00 is block diagonal - # with blocks of increasing dimension + """ + Used in static condensation. Take in A00 on a cell, return its Cholesky + factorisation. Assumes that interior DOF have been reordered to make A00 + block diagonal with blocks of increasing dimension. 
+ """ indptr, indices, data = A00.getValuesCSR() degree = numpy.diff(indptr) @@ -665,6 +671,7 @@ def factor_interior_mat(A00): @PETSc.Log.EventDecorator("FDMCondense") def condense_element_mat(A, i0, i1, submats): + # Return the Schur complement associated to indices in i1, condensing i0 out isrows = [i0, i0, i1, i1] iscols = [i0, i1, i0, i1] submats[:4] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:4] if submats[0] else None) @@ -679,6 +686,7 @@ def condense_element_mat(A, i0, i1, submats): @PETSc.Log.EventDecorator("FDMCondense") def condense_element_pattern(A, i0, i1, submats): + # Add zeroes on the statically condensed pattern so that you can run ICC(0) isrows = [i0, i0, i1] iscols = [i0, i1, i0] submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) @@ -716,6 +724,8 @@ def wrapper(*args): def load_assemble_csr(comm, triu=False): + # Insert one sparse matrix into another sparse matrix. + # Done in C for efficiency, since it loops over rows. if triu: name = "setSubMatCSR_SBAIJ" select_cols = "icol < irow ? -1: icol" @@ -767,11 +777,12 @@ def load_assemble_csr(comm, triu=False): restype=ctypes.c_int) -def petsc_sparse(A_numpy, rtol=1E-10): +def petsc_sparse(A_numpy, rtol=1E-10, comm=None): + # Convert dense numpy matrix into a sparse PETSc matrix Amax = max(A_numpy.min(), A_numpy.max(), key=abs) atol = rtol*Amax nnz = numpy.count_nonzero(abs(A_numpy) > atol, axis=1).astype(PETSc.IntType) - A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=PETSc.COMM_SELF) + A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm) for row, Arow in enumerate(A_numpy): cols = numpy.argwhere(abs(Arow) > atol).astype(PETSc.IntType).flat A.setValues(row, cols, Arow[cols], PETSc.InsertMode.INSERT) @@ -780,29 +791,18 @@ def petsc_sparse(A_numpy, rtol=1E-10): def block_mat(A_blocks): + # Return a concrete Mat corresponding to a block matrix given as a list of lists if len(A_blocks) == 1: if len(A_blocks[0]) == 1: return A_blocks[0][0] - nrows = sum([Arow[0].size[0] for Arow in A_blocks]) - ncols = sum([Aij.size[1] for Aij in A_blocks[0]]) - nnz = numpy.concatenate([sum([numpy.diff(Aij.getValuesCSR()[0]) for Aij in Arow]) for Arow in A_blocks]) - A = PETSc.Mat().createAIJ((nrows, ncols), nnz=(nnz, 0), comm=PETSc.COMM_SELF) - imode = PETSc.InsertMode.INSERT - insert_block = FDMPC.load_set_values() - rsizes = [sum([Ai[0].size[0] for Ai in A_blocks[:k]]) for k in range(len(A_blocks)+1)] - csizes = [sum([Aij.size[1] for Aij in A_blocks[0][:k]]) for k in range(len(A_blocks[0])+1)] - rows = [numpy.arange(*rsizes[i:i+2], dtype=PETSc.IntType) for i in range(len(A_blocks))] - cols = [numpy.arange(*csizes[j:j+2], dtype=PETSc.IntType) for j in range(len(A_blocks[0]))] - for Ai, irows in zip(A_blocks, rows): - for Aij, jcols in zip(Ai, cols): - insert_block(A, Aij, irows, jcols, imode) - - A.assemble() - return A + nest = PETSc.Mat().createNest(A_blocks, comm=A_blocks[0][0].getComm()) + # A nest Mat would not allow us to take matrix-matrix products + return nest.convert(mat_type=A_blocks[0][0].getType()) def is_restricted(finat_element): + # Determine if an element is a restriction onto interior or facets is_interior = True is_facet = True tdim = finat_element.cell.get_spatial_dimension() @@ -822,6 +822,7 @@ def is_restricted(finat_element): def sort_interior_dofs(idofs, A): + # Permute `idofs` to have A[idofs, idofs] with contiguous 1x1, 2x2, 3x3, ... 
blocks Aii = A.createSubMatrix(idofs, idofs) indptr, indices, _ = Aii.getValuesCSR() n = idofs.getSize() @@ -836,8 +837,8 @@ def sort_interior_dofs(idofs, A): if len(neigh) == degree: visit[neigh] = True perm.extend(neigh) - idofs.setIndices(idofs.getIndices()[perm]) + Aii.destroy() def kron3(A, B, C, scale=None): @@ -849,9 +850,13 @@ def kron3(A, B, C, scale=None): return result -def mass_matrix(tdim, formdegree, B00, B11): - B00 = petsc_sparse(B00) - B11 = petsc_sparse(B11) +def mass_matrix(tdim, formdegree, B00, B11, comm=None): + # Construct mass matrix on reference cell from 1D mass matrices B00 and B11. + # It can be applied with either broken or conforming test and trial spaces. + if comm is None: + comm = PETSc.COMM_SELF + B00 = petsc_sparse(B00, comm=comm) + B11 = petsc_sparse(B11, comm=comm) if tdim == 1: B_blocks = [B11 if formdegree else B00] elif tdim == 2: @@ -871,8 +876,6 @@ def mass_matrix(tdim, formdegree, B00, B11): else: B_blocks = [kron3(B11, B11, B11)] - B00.destroy() - B11.destroy() if len(B_blocks) == 1: result = B_blocks[0] else: @@ -884,23 +887,35 @@ def mass_matrix(tdim, formdegree, B00, B11): indptr = numpy.concatenate([csr[0][bool(shift):]+shift for csr, shift in zip(csr_block, ishift[:-1])]) indices = numpy.concatenate([csr[1]+shift for csr, shift in zip(csr_block, jshift[:-1])]) data = numpy.concatenate([csr[2] for csr in csr_block]) - result = PETSc.Mat().createAIJ((nrows, ncols), csr=(indptr, indices, data), comm=PETSc.COMM_SELF) + result = PETSc.Mat().createAIJ((nrows, ncols), csr=(indptr, indices, data), comm=comm) for B in B_blocks: B.destroy() + if not (B00 is result): + B00.destroy() + if not (B11 is result): + B11.destroy() return result -def diff_matrix(tdim, formdegree, A00, A11, A10): +def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): + # Construct exterior derivative matrix on reference cell from 1D mass matrices A00 and A11, + # and exterior derivative moments A10. + # It can be applied with either broken or conforming test and trial spaces. + if comm is None: + comm = PETSc.COMM_SELF if formdegree == tdim: ncols = A10.shape[0]**tdim - A_zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=PETSc.COMM_SELF) + A_zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=comm) A_zero.assemble() return A_zero - A00 = petsc_sparse(A00) - A11 = petsc_sparse(A11) - A10 = petsc_sparse(A10) + A00 = petsc_sparse(A00, comm=comm) + A11 = petsc_sparse(A11, comm=comm) + A10 = petsc_sparse(A10, comm=comm) if tdim == 1: + A00.destroy() + A11.destroy() + return A10 elif tdim == 2: if formdegree == 0: @@ -913,7 +928,7 @@ def diff_matrix(tdim, formdegree, A00, A11, A10): A_blocks = [[kron3(A00, A00, A10)], [kron3(A00, A10, A00)], [kron3(A10, A00, A00)]] elif formdegree == 1: size = tuple(A11.getSize()[k] * A10.getSize()[k] * A00.getSize()[k] for k in range(2)) - A_zero = PETSc.Mat().createAIJ(size, nnz=(0, 0), comm=PETSc.COMM_SELF) + A_zero = PETSc.Mat().createAIJ(size, nnz=(0, 0), comm=comm) A_zero.assemble() A_blocks = [[kron3(A00, A10, A11, scale=-1), kron3(A00, A11, A10), A_zero], [kron3(A10, A00, A11, scale=-1), A_zero, kron3(A11, A00, A10)], @@ -932,6 +947,10 @@ def diff_matrix(tdim, formdegree, A00, A11, A10): def diff_prolongator(Vf, Vc, fbcs=[], cbcs=[]): + """ + Magic. Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix. + Works for any basis. These are the same matrices one needs for HypreAMS and friends. 
+ """ from tsfc.finatinterface import create_element from firedrake.preconditioners.pmg import fiat_reference_prolongator @@ -1012,6 +1031,7 @@ def cell_to_global(lgmap, cell_to_local, e, result=None): def unrestrict_element(ele): + # Get an element that might or might not be restricted and return the parent unrestricted element. if isinstance(ele, ufl.VectorElement): return type(ele)(unrestrict_element(ele._sub_element), dim=ele.num_sub_elements()) elif isinstance(ele, ufl.TensorElement): diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index bf3a462e0a..6757864e7a 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -78,8 +78,8 @@ def coarsen_form(self, form, fine_to_coarse_map): def initialize(self, obj): # Make a new DM. # Hook up a (new) coarsen routine on that DM. - # Make a new PC, of type MG. - # Assign the DM to that PC. + # Make a new PC, of type MG (or SNES of type FAS). + # Assign the DM to that PC (or SNES). odm = obj.getDM() ctx = get_appctx(odm) @@ -262,18 +262,17 @@ def _coarsen_form(a): cdm.setCreateInterpolation(self.create_interpolation) cdm.setCreateInjection(self.create_injection) - interp_petscmat, _ = cdm.createInterpolation(fdm) - inject_petscmat = cdm.createInjection(fdm) - if cu in cJ.coefficients(): - # injection of the initial state + # Only inject state if the coarse state is a dependency of the coarse Jacobian. + inject_petscmat = cdm.createInjection(fdm) + def inject_state(): with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf: inject_petscmat.mult(xf, xc) add_hook(parent, setup=inject_state, call_setup=True) - # coarsen the nullspace basis + # Coarsen the nullspace basis def coarsen_nullspace(coarse_V, mat, fine_nullspace): if isinstance(fine_nullspace, MixedVectorSpaceBasis): if mat.type == 'python': @@ -302,13 +301,16 @@ def coarsen_nullspace(coarse_V, mat, fine_nullspace): else: return fine_nullspace - ises = cV._ises + if fctx._nullspace or fctx._near_nullspace or fctx._nullspace_T: + interp_petscmat, _ = cdm.createInterpolation(fdm) + else: + interp_petscmat = None cctx._nullspace = coarsen_nullspace(cV, interp_petscmat, fctx._nullspace) - cctx.set_nullspace(cctx._nullspace, ises, transpose=False, near=False) + cctx.set_nullspace(cctx._nullspace, cV._ises, transpose=False, near=False) cctx._near_nullspace = coarsen_nullspace(cV, interp_petscmat, fctx._near_nullspace) - cctx.set_nullspace(cctx._near_nullspace, ises, transpose=False, near=True) + cctx.set_nullspace(cctx._near_nullspace, cV._ises, transpose=False, near=True) cctx._nullspace_T = coarsen_nullspace(cV, interp_petscmat, fctx._nullspace_T) - cctx.set_nullspace(cctx._nullspace_T, ises, transpose=True, near=False) + cctx.set_nullspace(cctx._nullspace_T, cV._ises, transpose=True, near=False) return cdm def coarsen_quadrature(self, metadata, fdeg, cdeg): @@ -336,6 +338,7 @@ def coarsen_bcs(self, fbcs, cV): return cbcs def create_transfer(self, cctx, fctx, mat_type, cbcs, fbcs): + # Create a transfer or retrieve it from the class cache cV = cctx.J.arguments()[0].function_space() fV = fctx.J.arguments()[0].function_space() cbcs = tuple(cctx._problem.bcs) if cbcs else tuple() @@ -532,9 +535,7 @@ def prolongation_transfer_kernel_action(Vf, expr): def expand_element(ele): - """ - Expand a FiniteElement as an EnrichedElement of TensorProductElements, discarding modifiers. - """ + # Expand a FiniteElement as an EnrichedElement of TensorProductElements, discarding modifiers. 
if isinstance(ele, finat.FlattenedDimensions): return expand_element(ele.product) elif isinstance(ele, (finat.HDivElement, finat.HCurlElement)): @@ -560,6 +561,7 @@ def expand_element(ele): def evaluate_dual(dual, element, key=None): + # Evaluate the action of a set of dual functionals on the basis functions of an element. keys = set(tuple(phi.get_point_dict().keys()) for phi in dual) pts = list(set(sum(keys, ()))) if key is None: @@ -615,7 +617,7 @@ def get_permutation_to_line_elements(finat_element): if expansion.space_dimension() != finat_element.space_dimension(): raise ValueError("Failed to decompose %s into tensor products" % finat_element) - unique_factors = [] + unique_factors = set() line_elements = [] terms = expansion.elements if hasattr(expansion, "elements") else [expansion] for term in terms: @@ -632,7 +634,7 @@ def get_permutation_to_line_elements(finat_element): n = f break if n is fiat_factors[i]: - unique_factors.append(n) + unique_factors.add(n) fiat_factors[i] = n line_elements.append(tuple(fiat_factors)) @@ -680,15 +682,8 @@ def get_permutation_to_line_elements(finat_element): @lru_cache(maxsize=10) def fiat_reference_prolongator(felem, celem, derivative=False): - from FIAT.reference_element import flatten_reference_cube - - ref_el = flatten_reference_cube(felem.get_reference_element()) - tdim = ref_el.get_spatial_dimension() - if derivative and tdim > 1: - raise NotImplementedError("Derivative prolongator is only available on the interval") - ckey = (felem.formdegree,) if derivative else (0,)*tdim - fkey = (celem.formdegree,) if derivative else (0,)*tdim - + ckey = (felem.formdegree,) if derivative else None + fkey = (celem.formdegree,) if derivative else None fdual = felem.dual_basis() cdual = celem.dual_basis() if fkey == ckey and compare_dual_basis(fdual, cdual): diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index 0b6c36f0b9..a7414ebb4c 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -47,6 +47,10 @@ def mixed_family(tp_mesh, request): def test_reconstruct_degree(tp_mesh, mixed_family): + """ Construct a complicated mixed element and ensure we may recover it by + p-refining or p-coarsening an element of the same family with different + degree. 
+ """ elist = [] Vfamily, Qfamily = mixed_family for degree in [7, 2, 31]: @@ -57,11 +61,15 @@ def test_reconstruct_degree(tp_mesh, mixed_family): Q = FunctionSpace(tp_mesh, Qfamily, degree-2) Z = MixedFunctionSpace([V, Q]) e = Z.ufl_element() + elist.append(e) assert e == PMGPC.reconstruct_degree(elist[0], degree) def test_prolong_de_rham(tp_mesh): + """ Interpolate a linear vector function between [H1]^d, HCurl and HDiv spaces + where it can be exactly represented + """ from firedrake.preconditioners.pmg import prolongation_matrix_matfree tdim = tp_mesh.topological_dimension() @@ -85,6 +93,10 @@ def test_prolong_de_rham(tp_mesh): def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): + """ Interpolate a low-order function to interior and facet high-order spaces + and ensure that the sum of the two high-order functions is equal to the + low-order function + """ from firedrake.preconditioners.pmg import prolongation_matrix_matfree degree = 5 @@ -291,7 +303,7 @@ def test_p_multigrid_mixed(mat_type): "ksp_max_it": 3, "pc_type": "jacobi"} - coarse = {"mat_type": "aij", + coarse = {"mat_type": "aij", # This circumvents the need for AssembledPC "ksp_type": "richardson", "ksp_max_it": 1, "ksp_norm_type": "unpreconditioned", diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index dc76294909..f2263a8f10 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -16,6 +16,7 @@ "pc_type": "cholesky", } +# FDM without static condensation fdmstar = { "pc_type": "python", "pc_python_type": "firedrake.P1PC", @@ -38,6 +39,7 @@ } } +# FDM with static condensation facetstar = { "pc_type": "python", "pc_python_type": "firedrake.FacetSplitPC", @@ -48,7 +50,7 @@ "facet_fdm_pc_fieldsplit_type": "symmetric_multiplicative", "facet_fdm_fieldsplit_0": { "ksp_type": "preonly", - "pc_type": "icc", + "pc_type": "icc", # this is exact for the sparse approximation used in FDM }, "facet_fdm_fieldsplit_1": { "ksp_type": "preonly", @@ -136,8 +138,8 @@ def test_p_independence_hgrad(mesh, variant): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant=variant) V = FunctionSpace(mesh, element) problem = build_riesz_map(V, grad) - for sp, max_it in zip(solvers, expected[:len(solvers)]): - assert solve_riesz_map(problem, sp) <= max_it + for sp, expected_it in zip(solvers, expected): + assert solve_riesz_map(problem, sp) <= expected_it @pytest.mark.skipcomplex @@ -149,8 +151,8 @@ def test_p_independence_hcurl(mesh): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) problem = build_riesz_map(V, curl) - for sp, max_it in zip(solvers, expected[:len(solvers)]): - assert solve_riesz_map(problem, sp) <= max_it + for sp, expected_it in zip(solvers, expected): + assert solve_riesz_map(problem, sp) <= expected_it @pytest.mark.skipcomplex @@ -162,8 +164,8 @@ def test_p_independence_hdiv(mesh): element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm") V = FunctionSpace(mesh, element) problem = build_riesz_map(V, div) - for sp, max_it in zip(solvers, expected[:len(solvers)]): - assert solve_riesz_map(problem, sp) <= max_it + for sp, expected_it in zip(solvers, expected): + assert solve_riesz_map(problem, sp) <= expected_it @pytest.mark.skipcomplex From d1ae080247a6298a2b163fd4ad7b57604ab97203 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 14:06:27 +0000 Subject: [PATCH 21/75] more comments --- firedrake/preconditioners/fdm.py | 29 
++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 35aed9fed4..21c2ac91c0 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -53,6 +53,8 @@ class FDMPC(PCBase): @staticmethod def load_set_values(triu=False): + # Compile the C function to insert sparse element matrices and store in + # class cache key = triu cache = FDMPC._c_code_cache try: @@ -104,7 +106,7 @@ def initialize(self, pc): if element == e_fdm: V_fdm, J_fdm, bcs_fdm = (V, J, bcs) else: - # Reconstruct forms with variant element + # Reconstruct Jacobian and bcs with variant element V_fdm = firedrake.FunctionSpace(V.mesh(), e_fdm) J_fdm = J(*[t.reconstruct(function_space=V_fdm) for t in J.arguments()], coefficients={}) bcs_fdm = [] @@ -238,7 +240,7 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): result = cell_to_local(cell_index, result=result) return lgmap.apply(result, result=result) - # Create data strctures needed for assembly + # Create data structures needed for assembly self.cell_to_global = dict() self.lgmaps = dict() bc_rows = dict() @@ -380,6 +382,16 @@ def destroy(self, pc): @PETSc.Log.EventDecorator("FDMSetValues") def set_values(self, A, Vrow, Vcol, addv, triu=False): + """ + Assemble the stiffness matrix in the FDM basis using sparse reference + tensors and diagonal mass matrices. + + :arg A: the :class:`PETSc.Mat` to assemble + :arg Vrow: the :class:`.FunctionSpace` test space + :arg Vcol: the :class:`.FunctionSpace` trial space + :arg addv: a `PETSc.Mat.InsertMode` + :arg triu: are we assembling only the upper triangular part? + """ def RtAP(R, A, P, result=None): RtAP.buff = A.matMult(P, result=RtAP.buff) @@ -858,7 +870,12 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): B00 = petsc_sparse(B00, comm=comm) B11 = petsc_sparse(B11, comm=comm) if tdim == 1: - B_blocks = [B11 if formdegree else B00] + if formdegree == 0: + B11.destroy() + return B00 + else: + B00.destroy() + return B11 elif tdim == 2: if formdegree == 0: B_blocks = [B00.kron(B00)] @@ -890,10 +907,8 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): result = PETSc.Mat().createAIJ((nrows, ncols), csr=(indptr, indices, data), comm=comm) for B in B_blocks: B.destroy() - if not (B00 is result): - B00.destroy() - if not (B11 is result): - B11.destroy() + B00.destroy() + B11.destroy() return result From 39283025028cf66e1cee151c9054efdb70588365 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 14:46:48 +0000 Subject: [PATCH 22/75] glonum_fun -> extrude_node_map, do not coarsen residual of linear p-MG --- firedrake/preconditioners/fdm.py | 66 ++++++++++++-------------------- firedrake/preconditioners/pmg.py | 17 ++++---- 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 21c2ac91c0..6ceade799e 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -53,8 +53,12 @@ class FDMPC(PCBase): @staticmethod def load_set_values(triu=False): - # Compile the C function to insert sparse element matrices and store in - # class cache + """ + Compile C code to insert sparse element matrices and store in class cache + :arg triu: are we inserting onto the upper triangular part of the matrix? 
+ + :returns: a python wrapper for the matrix insertion function + """ key = triu cache = FDMPC._c_code_cache try: @@ -247,7 +251,7 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): for Vsub in V: lgmap = Vsub.local_to_global_map([bc.reconstruct(V=Vsub, g=0) for bc in bcs]) bsize = Vsub.dof_dset.layout_vec.getBlockSize() - cell_to_local, nel = glonum_fun(Vsub.cell_node_map(), bsize=bsize) + cell_to_local, nel = extrude_node_map(Vsub.cell_node_map(), bsize=bsize) self.cell_to_global[Vsub] = partial(cell_to_global, lgmap, cell_to_local) self.lgmaps[Vsub] = lgmap @@ -257,7 +261,7 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): coefficients, assembly_callables = self.assemble_coef(J, form_compiler_parameters) coeffs = [coefficients.get(k) for k in ("beta", "alpha")] - cmaps = [glonum_fun(ck.cell_node_map())[0] for ck in coeffs] + cmaps = [extrude_node_map(ck.cell_node_map())[0] for ck in coeffs] @PETSc.Log.EventDecorator("FDMGetCoeffs") def get_coeffs(e, result=None): @@ -1005,8 +1009,8 @@ def diff_prolongator(Vf, Vc, fbcs=[], cbcs=[]): rmap = Vf.local_to_global_map(fbcs) cmap = Vc.local_to_global_map(cbcs) - rlocal, nel = glonum_fun(Vf.cell_node_map(), bsize=Vf.value_size) - clocal, nel = glonum_fun(Vc.cell_node_map(), bsize=Vc.value_size) + rlocal, nel = extrude_node_map(Vf.cell_node_map(), bsize=Vf.value_size) + clocal, nel = extrude_node_map(Vc.cell_node_map(), bsize=Vc.value_size) def cell_to_global(lgmap, cell_to_local, e, result=None): result = cell_to_local(e, result=result) @@ -1164,8 +1168,8 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): tdim = V.mesh().topological_dimension() shift = self.axes_shifts * bsize - index_coef, _ = glonum_fun((Gq or Bq).cell_node_map()) - index_bc, _ = glonum_fun(bcflags.cell_node_map()) + index_coef, _ = extrude_node_map((Gq or Bq).cell_node_map()) + index_bc, _ = extrude_node_map(bcflags.cell_node_map()) flag2id = numpy.kron(numpy.eye(tdim, tdim, dtype=PETSc.IntType), [[1], [2]]) # pshape is the shape of the DOFs in the tensor product @@ -1693,19 +1697,20 @@ def get_interior_facet_maps(V): @lru_cache(maxsize=20) -def glonum_fun(node_map, bsize=1): +def extrude_node_map(node_map, bsize=1): """ - Return a the local numbering given an non-extruded local map and the total number of entities. + Construct a (possibly vector-valued) cell to node map from an un-extruded scalar map. :arg node_map: a :class:`pyop2.Map` mapping entities to their local dofs, including ghost entities. 
+ :arg bsize: the block size - :returns: a 2-tuple with the map and the number of entities owned by this process + :returns: a 2-tuple with the map as function and the number of cells owned by this process """ nelv = node_map.values.shape[0] if node_map.offset is None: nel = nelv - def glonum(e, result=None): + def scalar_map(e, result=None): if result is None: result = numpy.copy(node_map.values_with_halo[e]) else: @@ -1718,14 +1723,14 @@ def glonum(e, result=None): nelz = layers[0, 1]-layers[0, 0]-1 nel = nelz*nelv - def _glonum(node_map, nelz, e, result=None): + def _scalar_map(node_map, nelz, e, result=None): if result is None: result = numpy.copy(node_map.values_with_halo[e // nelz]) else: numpy.copyto(result, node_map.values_with_halo[e // nelz]) result += (e % nelz)*node_map.offset return result - glonum = partial(_glonum, node_map, nelz) + scalar_map = partial(_scalar_map, node_map, nelz) else: nelz = layers[:, 1]-layers[:, 0]-1 @@ -1733,47 +1738,26 @@ def _glonum(node_map, nelz, e, result=None): to_base = numpy.repeat(numpy.arange(node_map.values_with_halo.shape[0], dtype=node_map.offset.dtype), nelz) to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz]) - def _glonum(node_map, to_base, to_layer, e, result=None): + def _scalar_map(node_map, to_base, to_layer, e, result=None): if result is None: result = numpy.copy(node_map.values_with_halo[to_base[e]]) else: numpy.copyto(result, node_map.values_with_halo[to_base[e]]) result += to_layer[e]*node_map.offset return result - glonum = partial(_glonum, node_map, to_base, to_layer) + scalar_map = partial(_scalar_map, node_map, to_base, to_layer) if bsize == 1: - return glonum, nel + return scalar_map, nel ibase = numpy.arange(bsize, dtype=node_map.values.dtype) - def vector_glonum(bsize, ibase, e, result=None): + def vector_map(bsize, ibase, e, result=None): index = None if result is not None: index = result[:, 0] - index = glonum(e, result=index) + index = scalar_map(e, result=index) index *= bsize return numpy.add.outer(index, ibase, out=result) - return partial(vector_glonum, bsize, ibase), nel - - -def glonum(node_map): - """ - Return an array with the node map. - - :arg node_map: a :class:`pyop2.Map` mapping entities to their nodes, including ghost entities. 
- - :returns: a :class:`numpy.ndarray` whose rows are the nodes for each cell - """ - if (node_map.offset is None) or (node_map.values_with_halo.size == 0): - return node_map.values_with_halo - else: - layers = node_map.iterset.layers_array - if layers.shape[0] == 1: - nelz = layers[0, 1]-layers[0, 0]-1 - to_layer = numpy.tile(numpy.arange(nelz, dtype=node_map.offset.dtype), len(node_map.values_with_halo)) - else: - nelz = layers[:, 1]-layers[:, 0]-1 - to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz]) - return numpy.repeat(node_map.values_with_halo, nelz, axis=0) + numpy.kron(to_layer.reshape((-1, 1)), node_map.offset) + return partial(vector_map, bsize, ibase), nel diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 6757864e7a..029fffad6d 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -98,7 +98,7 @@ def initialize(self, obj): pdm.setOptionsPrefix(options_prefix) ppc = self.configure_pmg(obj, pdm) - is_snes = isinstance(obj, PETSc.SNES) + self.is_snes = isinstance(obj, PETSc.SNES) copts = PETSc.Options(ppc.getOptionsPrefix() + ppc.getType() + "_coarse_") @@ -129,7 +129,7 @@ def initialize(self, obj): # Now overwrite some routines on the DM pdm.setRefine(None) pdm.setCoarsen(self.coarsen) - if is_snes: + if self.is_snes: pdm.setSNESFunction(_SNESContext.form_function) pdm.setSNESJacobian(_SNESContext.form_jacobian) pdm.setKSPComputeOperators(_SNESContext.compute_operators) @@ -201,12 +201,15 @@ def _coarsen_form(a): for f in a.integrals()]) return a - cF = _coarsen_form(fctx.F) cJ = _coarsen_form(fctx.J) cJp = _coarsen_form(fctx.Jp) + # This fixes a subtle bug where you are applying PMGPC on a mixed + # problem with geometric multigrid only on one block and an non-Lagrange element + # on the other block (gmg breaks for non-Lagrange elements) + cF = _coarsen_form(fctx.F) if self.is_snes else ufl.action(cJ, cu) + fcp = self.coarsen_quadrature(fproblem.form_compiler_parameters, fdeg, cdeg) cbcs = self.coarsen_bcs(fproblem.bcs, cV) - cF = self.coarsen_residual(cF, cJ, cu) # Coarsen the appctx: the user might want to provide solution-dependant expressions and forms cappctx = dict(fctx.appctx) @@ -460,9 +463,6 @@ def applyTranspose(self, pc, x, y): def coarsen_bc_value(self, bc, cV): return 0 - def coarsen_residual(self, Fc, Jc, uc): - return ufl.action(Jc, uc) - class PMGSNES(SNESBase, PMGBase): _prefix = "pfas_" @@ -515,9 +515,6 @@ def coarsen_bc_value(self, bc, cV): coarse.interpolate(bc._original_arg) return coarse - def coarsen_residual(self, Fc, Jc, uc): - return Fc - def prolongation_transfer_kernel_action(Vf, expr): from tsfc import compile_expression_dual_evaluation From 213a55b41d6d1443d103db4cfea493e925f18a77 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 15:08:46 +0000 Subject: [PATCH 23/75] update citations --- firedrake/preconditioners/fdm.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 6ceade799e..9345bb674c 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -13,17 +13,29 @@ import FIAT import finat -Citations().add("Brubeck2021", """ -@misc{Brubeck2021, +Citations().add("Brubeck2022a", """ +@article{Brubeck2022a, title={A scalable and robust vertex-star relaxation for high-order {FEM}}, author={Brubeck, Pablo D. and Farrell, Patrick E.}, + journal = {SIAM J. Sci. 
Comput.}, + volume = {44}, + number = {5}, + pages = {A2991-A3017}, + year = {2022}, + doi = {10.1137/21M1444187} +""") + +Citations().add("Brubeck2022b", """ +@misc{Brubeck2022b, + title={{Multigrid solvers for the de Rham complex with optimal complexity in polynomial degree}}, + author={Brubeck, Pablo D. and Farrell, Patrick E.}, archiveprefix = {arXiv}, - eprint = {2107.14758}, + eprint = {2211.14284}, primaryclass = {math.NA}, - year={2021} -} + year={2022} """) + __all__ = ("FDMPC", "PoissonFDMPC") @@ -44,8 +56,8 @@ class FDMPC(PCBase): """ _prefix = "fdm_" - _variant = "fdm" + _citation = "Brubeck2022b" _reference_tensor_cache = {} _coefficient_cache = {} @@ -71,8 +83,8 @@ def initialize(self, pc): from firedrake.assemble import allocate_matrix, assemble from firedrake.preconditioners.pmg import prolongation_matrix_matfree from firedrake.preconditioners.patch import bcdofs - Citations().register("Brubeck2021") + Citations().register(self._citation) self.comm = pc.comm Amat, Pmat = pc.getOperators() prefix = pc.getOptionsPrefix() @@ -396,7 +408,6 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): :arg addv: a `PETSc.Mat.InsertMode` :arg triu: are we assembling only the upper triangular part? """ - def RtAP(R, A, P, result=None): RtAP.buff = A.matMult(P, result=RtAP.buff) return R.transposeMatMult(RtAP.buff, result=result) @@ -1109,6 +1120,7 @@ class PoissonFDMPC(FDMPC): """ _variant = "fdm_ipdg" + _citation = "Brubeck2022a" def assemble_reference_tensor(self, V): from firedrake.preconditioners.pmg import get_permutation_to_line_elements From b6dff72ba09c79e036e0ebd1c0ef903b2a53c75b Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 15:19:23 +0000 Subject: [PATCH 24/75] add some comments --- firedrake/preconditioners/fdm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 9345bb674c..5616cf547c 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -418,7 +418,9 @@ def RtAP(R, A, P, result=None): if Vrow == Vcol: get_cindices = lambda e, result=None: result update_A = lambda Ae, rindices, cindices: set_values_csr(A, Ae, rindices, rindices, addv) + # moments of orthogonalized basis against basis tabulation and derivative tabulation rtensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) + # element matrix obtained via Equation (3.9) of Brubeck2022b assemble_element_mat = lambda De, result=None: De.PtAP(rtensor, result=result) condense_element_mat = self.get_static_condensation.get(Vrow) else: @@ -505,6 +507,8 @@ def update_De(data): def assemble_coef(self, J, form_compiler_parameters): """ Obtain coefficients as the diagonal of a weighted mass matrix in V^k x V^{k+1} + + See Section 3.2 of Brubeck2022b. 
""" from ufl.algorithms.ad import expand_derivatives from ufl.algorithms.expand_indices import expand_indices From 415881ce76b0f3693ce2d1dfe21d79a18cdb056c Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 17:02:42 +0000 Subject: [PATCH 25/75] comments explaining reference tensor and coefficients --- firedrake/preconditioners/fdm.py | 34 +++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 5616cf547c..330ea4c33e 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -506,15 +506,26 @@ def update_De(data): @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coef(self, J, form_compiler_parameters): """ - Obtain coefficients as the diagonal of a weighted mass matrix in V^k x V^{k+1} - + Obtain coefficients for the auxiliary operator as the diagonal of a + weighted mass matrix in broken(V^k) * broken(V^{k+1}). See Section 3.2 of Brubeck2022b. + + :arg J: the Jacobian bilinear :class:`ufl.Form`, + :form_compiler_parameters: a `dict` with tsfc parameters. + + :return: a 2-tuple with a `dict` with the zero-th order and second + order coefficients keyed on ``"beta"`` and ``"alpha"``, and a list of + assembly callables. """ from ufl.algorithms.ad import expand_derivatives from ufl.algorithms.expand_indices import expand_indices from firedrake.formmanipulation import ExtractSubBlock from firedrake.assemble import assemble + # Basic idea: take the original bilinear form and + # replace the exterior derivatives with arguments in broken(V^{k+1}). + # Then, replace the original arguments with arguments in broken(V^k). + # Where the broken spaces have L2-orthogonal FDM basis functions. index = len(J.arguments()[-1].function_space())-1 if index: splitter = ExtractSubBlock() @@ -529,6 +540,7 @@ def assemble_coef(self, J, form_compiler_parameters): e = unrestrict_element(e) sobolev = e.sobolev_space() + # Replacement rule for the exterior derivative = grad(arg) * eps map_grad = None if sobolev == ufl.H1: map_grad = lambda p: p @@ -544,6 +556,7 @@ def assemble_coef(self, J, form_compiler_parameters): else: map_grad = lambda p: p*(eps/2) + # Construct Z = broken(V^k) * broken(V^{k+1}) V = args_J[0].function_space() formdegree = V.finat_element.formdegree degree = e.degree() @@ -569,14 +582,16 @@ def assemble_coef(self, J, form_compiler_parameters): elements = list(map(ufl.BrokenElement, elements)) if V.shape: elements = [ufl.TensorElement(ele, shape=V.shape) for ele in elements] - Z = firedrake.FunctionSpace(mesh, ufl.MixedElement(elements)) + + # Transform the exterior derivative and the original arguments of J to arguments in Z args = (firedrake.TestFunctions(Z), firedrake.TrialFunctions(Z)) repargs = {t: v[0] for t, v in zip(args_J, args)} repgrad = {ufl.grad(t): map_grad(v[1]) for t, v in zip(args_J, args)} if map_grad else dict() Jcell = expand_indices(expand_derivatives(ufl.Form(J.integrals_by_type("cell")))) mixed_form = ufl.replace(ufl.replace(Jcell, repgrad), repargs) + # Return coefficients and assembly callables, and cache them class key = (mixed_form.signature(), mesh) block_diagonal = True try: @@ -601,6 +616,15 @@ def assemble_coef(self, J, form_compiler_parameters): @PETSc.Log.EventDecorator("FDMRefTensor") def assemble_reference_tensor(self, V): + """ + Return the reference tensor used in the diagonal factorization of the + sparse cell matrices. See Section 3.2 of Brubeck2022b. 
+ + :arg V: a :class:`.FunctionSpace` + + :return: a :class:`PETSc.Mat` with the moments of orthogonalized bases + against the basis and its exterior derivative. + """ tdim = V.mesh().topological_dimension() value_size = V.value_size formdegree = V.finat_element.formdegree @@ -612,12 +636,12 @@ def assemble_reference_tensor(self, V): if formdegree == tdim: degree = degree + 1 is_interior, is_facet = is_restricted(V.finat_element) - key = (degree, tdim, formdegree, V.value_size, is_interior, is_facet) + key = (degree, tdim, formdegree, value_size, is_interior, is_facet) cache = self._reference_tensor_cache try: return cache[key] except KeyError: - full_key = (degree, tdim, formdegree, V.value_size, False, False) + full_key = (degree, tdim, formdegree, value_size, False, False) if is_facet and full_key in cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) From 81764895aa23fa04cbf78db21114c11aac7b7321 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 17:53:33 +0000 Subject: [PATCH 26/75] deterministic sort keys of point_dicts --- firedrake/preconditioners/pmg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 029fffad6d..55e0264014 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -559,8 +559,8 @@ def expand_element(ele): def evaluate_dual(dual, element, key=None): # Evaluate the action of a set of dual functionals on the basis functions of an element. - keys = set(tuple(phi.get_point_dict().keys()) for phi in dual) - pts = list(set(sum(keys, ()))) + keys = list(dict.fromkeys(tuple(phi.get_point_dict().keys()) for phi in dual)) + pts = list(dict.fromkeys(sum(keys, ()))) if key is None: key = (0, ) * len(pts[0]) tab = element.tabulate(sum(key), pts)[key] From 42ed1bfc7f92a857bd3945f97c25c85fdca7f803 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 17:55:28 +0000 Subject: [PATCH 27/75] construct block diagonal mass matrix from a nest Mat --- firedrake/preconditioners/fdm.py | 42 +++++++++++++++----------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 330ea4c33e..9d13962782 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -877,7 +877,8 @@ def is_restricted(finat_element): def sort_interior_dofs(idofs, A): - # Permute `idofs` to have A[idofs, idofs] with contiguous 1x1, 2x2, 3x3, ... blocks + # Permute `idofs` to have A[idofs, idofs] with square blocks of + # increasing dimension along its diagonal. Aii = A.createSubMatrix(idofs, idofs) indptr, indices, _ = Aii.getValuesCSR() n = idofs.getSize() @@ -907,7 +908,7 @@ def kron3(A, B, C, scale=None): def mass_matrix(tdim, formdegree, B00, B11, comm=None): # Construct mass matrix on reference cell from 1D mass matrices B00 and B11. - # It can be applied with either broken or conforming test and trial spaces. + # The 1D matrices may come with different test and trial spaces. 
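+    # For a k-form in tdim dimensions the result is block diagonal, with one
+    # Kronecker-product block per k-form component, each combining k factors
+    # of B11 with (tdim - k) factors of B00.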
if comm is None: comm = PETSc.COMM_SELF B00 = petsc_sparse(B00, comm=comm) @@ -921,34 +922,31 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): return B11 elif tdim == 2: if formdegree == 0: - B_blocks = [B00.kron(B00)] + B_diag = [B00.kron(B00)] elif formdegree == 1: - B_blocks = [B00.kron(B11), B11.kron(B00)] + B_diag = [B00.kron(B11), B11.kron(B00)] else: - B_blocks = [B11.kron(B11)] + B_diag = [B11.kron(B11)] elif tdim == 3: if formdegree == 0: - B_blocks = [kron3(B00, B00, B00)] + B_diag = [kron3(B00, B00, B00)] elif formdegree == 1: - B_blocks = [kron3(B00, B00, B11), kron3(B00, B11, B00), kron3(B11, B00, B00)] + B_diag = [kron3(B00, B00, B11), kron3(B00, B11, B00), kron3(B11, B00, B00)] elif formdegree == 2: - B_blocks = [kron3(B00, B11, B11), kron3(B11, B00, B11), kron3(B11, B11, B00)] + B_diag = [kron3(B00, B11, B11), kron3(B11, B00, B11), kron3(B11, B11, B00)] else: - B_blocks = [kron3(B11, B11, B11)] + B_diag = [kron3(B11, B11, B11)] - if len(B_blocks) == 1: - result = B_blocks[0] + if len(B_diag) == 1: + result = B_diag[0] else: - nrows = sum(Bk.size[0] for Bk in B_blocks) - ncols = sum(Bk.size[1] for Bk in B_blocks) - csr_block = [Bk.getValuesCSR() for Bk in B_blocks] - ishift = numpy.cumsum([0] + [csr[0][-1] for csr in csr_block]) - jshift = numpy.cumsum([0] + [Bk.size[1] for Bk in B_blocks]) - indptr = numpy.concatenate([csr[0][bool(shift):]+shift for csr, shift in zip(csr_block, ishift[:-1])]) - indices = numpy.concatenate([csr[1]+shift for csr, shift in zip(csr_block, jshift[:-1])]) - data = numpy.concatenate([csr[2] for csr in csr_block]) - result = PETSc.Mat().createAIJ((nrows, ncols), csr=(indptr, indices, data), comm=comm) - for B in B_blocks: + n = len(B_diag) + B_zero = PETSc.Mat().createAIJ(B_diag[0].getSize(), nnz=(0, 0), comm=comm) + B_zero.assemble() + B_blocks = [[B_diag[i] if i == j else B_zero for j in range(n)] for i in range(n)] + result = block_mat(B_blocks) + B_zero.destroy() + for B in B_diag: B.destroy() B00.destroy() B11.destroy() @@ -958,7 +956,7 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): # Construct exterior derivative matrix on reference cell from 1D mass matrices A00 and A11, # and exterior derivative moments A10. - # It can be applied with either broken or conforming test and trial spaces. + # The 1D matrices may come with different test and trial spaces. if comm is None: comm = PETSc.COMM_SELF if formdegree == tdim: From 9c0ccbfddf51db31d9493ecb72edbb8606d429e0 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 16 Mar 2023 22:23:21 +0000 Subject: [PATCH 28/75] fix docstrings --- firedrake/preconditioners/fdm.py | 41 ++++++++++++++------------------ 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 9d13962782..3b7c775667 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -67,6 +67,7 @@ class FDMPC(PCBase): def load_set_values(triu=False): """ Compile C code to insert sparse element matrices and store in class cache + :arg triu: are we inserting onto the upper triangular part of the matrix? 
:returns: a python wrapper for the matrix insertion function @@ -210,7 +211,7 @@ def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type): :arg J: the Jacobian bilinear form :arg bcs: an iterable of boundary conditions on V :arg form_compiler_parameters: parameters to assemble diagonal factors - :pmat_type: the preconditioner `PETSc.Mat.Type` + :arg pmat_type: the preconditioner `PETSc.Mat.Type` :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable """ @@ -511,11 +512,11 @@ def assemble_coef(self, J, form_compiler_parameters): See Section 3.2 of Brubeck2022b. :arg J: the Jacobian bilinear :class:`ufl.Form`, - :form_compiler_parameters: a `dict` with tsfc parameters. + :arg form_compiler_parameters: a `dict` with tsfc parameters. - :return: a 2-tuple with a `dict` with the zero-th order and second - order coefficients keyed on ``"beta"`` and ``"alpha"``, and a list of - assembly callables. + :returns: a 2-tuple of a `dict` with the zero-th order and second + order coefficients keyed on ``"beta"`` and ``"alpha"``, + and a list of assembly callables. """ from ufl.algorithms.ad import expand_derivatives from ufl.algorithms.expand_indices import expand_indices @@ -622,8 +623,8 @@ def assemble_reference_tensor(self, V): :arg V: a :class:`.FunctionSpace` - :return: a :class:`PETSc.Mat` with the moments of orthogonalized bases - against the basis and its exterior derivative. + :returns: a :class:`PETSc.Mat` with the moments of orthogonalized bases + against the basis and its exterior derivative. """ tdim = V.mesh().topological_dimension() value_size = V.value_size @@ -911,16 +912,12 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): # The 1D matrices may come with different test and trial spaces. if comm is None: comm = PETSc.COMM_SELF + if tdim == 1: + return petsc_sparse(B11 if formdegree else B00, comm=comm) + B00 = petsc_sparse(B00, comm=comm) B11 = petsc_sparse(B11, comm=comm) - if tdim == 1: - if formdegree == 0: - B11.destroy() - return B00 - else: - B00.destroy() - return B11 - elif tdim == 2: + if tdim == 2: if formdegree == 0: B_diag = [B00.kron(B00)] elif formdegree == 1: @@ -937,6 +934,8 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): else: B_diag = [kron3(B11, B11, B11)] + B00.destroy() + B11.destroy() if len(B_diag) == 1: result = B_diag[0] else: @@ -948,8 +947,6 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): B_zero.destroy() for B in B_diag: B.destroy() - B00.destroy() - B11.destroy() return result @@ -965,15 +962,13 @@ def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): A_zero.assemble() return A_zero - A00 = petsc_sparse(A00, comm=comm) - A11 = petsc_sparse(A11, comm=comm) A10 = petsc_sparse(A10, comm=comm) if tdim == 1: - A00.destroy() - A11.destroy() - return A10 - elif tdim == 2: + + A00 = petsc_sparse(A00, comm=comm) + A11 = petsc_sparse(A11, comm=comm) + if tdim == 2: if formdegree == 0: A_blocks = [[A00.kron(A10)], [A10.kron(A00)]] elif formdegree == 1: From b954a8a3b4e588b97f244d5188cf28cf0c076868 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 20 Mar 2023 09:06:20 +0000 Subject: [PATCH 29/75] prolongation arguments ordered now as coarse, fine --- firedrake/preconditioners/fdm.py | 16 +++++++-------- firedrake/preconditioners/pmg.py | 34 ++++++++++++++++---------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 5616cf547c..07a20fa444 100644 --- a/firedrake/preconditioners/fdm.py +++ 
b/firedrake/preconditioners/fdm.py @@ -133,7 +133,7 @@ def initialize(self, pc): bcs_fdm.append(bc.reconstruct(V=W, g=0)) # Construct interpolation from original to variant spaces - self.fdm_interp = prolongation_matrix_matfree(V, V_fdm, [], bcs_fdm) + self.fdm_interp = prolongation_matrix_matfree(V_fdm, V, bcs_fdm, []) self.work_vec_x = Amat.createVecLeft() self.work_vec_y = Amat.createVecRight() if use_amat: @@ -162,7 +162,7 @@ def interp_nullspace(I, nsp): x.destroy() return PETSc.NullSpace().create(constant=False, vectors=vectors, comm=nsp.getComm()) - inject = prolongation_matrix_matfree(V_fdm, V, [], []) + inject = prolongation_matrix_matfree(V, V_fdm, [], []) Amat.setNullSpace(interp_nullspace(inject, omat.getNullSpace())) Amat.setTransposeNullSpace(interp_nullspace(inject, omat.getTransposeNullSpace())) Amat.setNearNullSpace(interp_nullspace(inject, omat.getNearNullSpace())) @@ -980,16 +980,16 @@ def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): return result -def diff_prolongator(Vf, Vc, fbcs=[], cbcs=[]): +def diff_prolongator(Vc, Vf, cbcs=[], fbcs=[]): """ - Magic. Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix. - Works for any basis. These are the same matrices one needs for HypreAMS and friends. + Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix. + Works for any tensor-product basis. These are the same matrices one needs for HypreAMS and friends. """ from tsfc.finatinterface import create_element from firedrake.preconditioners.pmg import fiat_reference_prolongator - ef = Vf.finat_element ec = Vc.finat_element + ef = Vf.finat_element if ef.formdegree - ec.formdegree != 1: raise ValueError("Expecting Vf = d(Vc)") @@ -1000,7 +1000,7 @@ def diff_prolongator(Vf, Vc, fbcs=[], cbcs=[]): degree = e0.degree() A11 = numpy.eye(degree, dtype=PETSc.RealType) A00 = numpy.eye(degree+1, dtype=PETSc.RealType) - A10 = fiat_reference_prolongator(e1, e0, derivative=True) + A10 = fiat_reference_prolongator(e0, e1, derivative=True) tdim = Vc.mesh().topological_dimension() Dhat = diff_matrix(tdim, ec.formdegree, A00, A11, A10) @@ -1720,7 +1720,7 @@ def extrude_node_map(node_map, bsize=1): :arg node_map: a :class:`pyop2.Map` mapping entities to their local dofs, including ghost entities. 
:arg bsize: the block size - :returns: a 2-tuple with the map as function and the number of cells owned by this process + :returns: a 2-tuple with the cell to node map and the number of cells owned by this process """ nelv = node_map.values.shape[0] if node_map.offset is None: diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 029fffad6d..84433142d0 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -346,7 +346,7 @@ def create_transfer(self, cctx, fctx, mat_type, cbcs, fbcs): fV = fctx.J.arguments()[0].function_space() cbcs = tuple(cctx._problem.bcs) if cbcs else tuple() fbcs = tuple(fctx._problem.bcs) if fbcs else tuple() - key = (fV, cV, cbcs, fbcs, mat_type) + key = (cV, fV, cbcs, fbcs, mat_type) try: return self._cache_transfer[key] except KeyError: @@ -356,7 +356,7 @@ def create_transfer(self, cctx, fctx, mat_type, cbcs, fbcs): construct_mat = prolongation_matrix_aij else: raise ValueError("Unknown matrix type") - return self._cache_transfer.setdefault(key, construct_mat(fV, cV, fbcs, cbcs)) + return self._cache_transfer.setdefault(key, construct_mat(cV, fV, cbcs, fbcs)) def create_interpolation(self, dmc, dmf): prefix = dmc.getOptionsPrefix() @@ -678,7 +678,7 @@ def get_permutation_to_line_elements(finat_element): @lru_cache(maxsize=10) -def fiat_reference_prolongator(felem, celem, derivative=False): +def fiat_reference_prolongator(celem, felem, derivative=False): ckey = (felem.formdegree,) if derivative else None fkey = (celem.formdegree,) if derivative else None fdual = felem.dual_basis() @@ -915,7 +915,7 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): fshapes.append((nscal,) + tuple(fshape)) cshapes.append((nscal,) + tuple(cshape)) - J = [fiat_reference_prolongator(fe, ce).T for fe, ce in zip(felem, celem)] + J = [fiat_reference_prolongator(ce, fe).T for fe, ce in zip(felem, celem)] if any(Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J): prolong_code.append(f""" for({IntType_c} i=0; i<{nscal*numpy.prod(fshape)}; i++) {t_out}[i+{fskip}] = 0.0E0; @@ -1130,13 +1130,13 @@ class StandaloneInterpolationMatrix(object): _cache_work = {} - def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs): - self.uf = self.work_function(Vf) + def __init__(self, Vc, Vf, Vc_bcs, Vf_bcs): self.uc = self.work_function(Vc) - self.Vf = self.uf.function_space() + self.uf = self.work_function(Vf) self.Vc = self.uc.function_space() - self.Vf_bcs = Vf_bcs + self.Vf = self.uf.function_space() self.Vc_bcs = Vc_bcs + self.Vf_bcs = Vf_bcs def work_function(self, V): if isinstance(V, firedrake.Function): @@ -1453,10 +1453,10 @@ def _weight(self): @cached_property def _standalones(self): standalones = [] - for (i, (uf_sub, uc_sub)) in enumerate(zip(self.uf.subfunctions, self.uc.subfunctions)): - Vf_sub_bcs = [bc for bc in self.Vf_bcs if bc.function_space().index == i] + for i, (uc_sub, uf_sub) in enumerate(zip(self.uc.subfunctions, self.uf.subfunctions)): Vc_sub_bcs = [bc for bc in self.Vc_bcs if bc.function_space().index == i] - standalone = StandaloneInterpolationMatrix(uf_sub, uc_sub, Vf_sub_bcs, Vc_sub_bcs) + Vf_sub_bcs = [bc for bc in self.Vf_bcs if bc.function_space().index == i] + standalone = StandaloneInterpolationMatrix(uc_sub, uf_sub, Vc_sub_bcs, Vf_sub_bcs) standalones.append(standalone) return standalones @@ -1477,11 +1477,11 @@ def getNestSubMatrix(self, i, j): return None -def prolongation_matrix_aij(Pk, P1, Pk_bcs=[], P1_bcs=[]): - if isinstance(Pk, firedrake.Function): - Pk = Pk.function_space() +def 
prolongation_matrix_aij(P1, Pk, P1_bcs=[], Pk_bcs=[]): if isinstance(P1, firedrake.Function): P1 = P1.function_space() + if isinstance(Pk, firedrake.Function): + Pk = Pk.function_space() sp = op2.Sparsity((Pk.dof_dset, P1.dof_dset), (Pk.cell_node_map(), @@ -1532,12 +1532,12 @@ def prolongation_matrix_aij(Pk, P1, Pk_bcs=[], P1_bcs=[]): return mat.handle -def prolongation_matrix_matfree(Vf, Vc, Vf_bcs=[], Vc_bcs=[]): +def prolongation_matrix_matfree(Vc, Vf, Vc_bcs=[], Vf_bcs=[]): fele = Vf.ufl_element() if isinstance(fele, ufl.MixedElement) and not isinstance(fele, (ufl.VectorElement, ufl.TensorElement)): - ctx = MixedInterpolationMatrix(Vf, Vc, Vf_bcs, Vc_bcs) + ctx = MixedInterpolationMatrix(Vc, Vf, Vc_bcs, Vf_bcs) else: - ctx = StandaloneInterpolationMatrix(Vf, Vc, Vf_bcs, Vc_bcs) + ctx = StandaloneInterpolationMatrix(Vc, Vf, Vc_bcs, Vf_bcs) sizes = (Vf.dof_dset.layout_vec.getSizes(), Vc.dof_dset.layout_vec.getSizes()) M_shll = PETSc.Mat().createPython(sizes, ctx, comm=Vf._comm) From 4b5f7b70eeafbc2c921b8f276329c1404f5f3686 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 20 Mar 2023 10:01:20 +0000 Subject: [PATCH 30/75] address some more review comments --- firedrake/preconditioners/fdm.py | 4 +-- firedrake/preconditioners/pmg.py | 51 ++++++++++++++++++++++------- tests/multigrid/test_p_multigrid.py | 4 +-- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 86d34123fc..ae1ef176c6 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -936,10 +936,10 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): B00.destroy() B11.destroy() - if len(B_diag) == 1: + n = len(B_diag) + if n == 1: result = B_diag[0] else: - n = len(B_diag) B_zero = PETSc.Mat().createAIJ(B_diag[0].getSize(), nnz=(0, 0), comm=comm) B_zero.assemble() B_blocks = [[B_diag[i] if i == j else B_zero for j in range(n)] for i in range(n)] diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index fbbade5c10..e98798b7f7 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -608,6 +608,19 @@ def compare_dual_basis(l1, l2): @lru_cache(maxsize=10) @PETSc.Log.EventDecorator("GetLineElements") def get_permutation_to_line_elements(finat_element): + """ + Find DOF permuation to factor out the EnrichedElement expansion into common + TensorProductElements. This routine exposes structure to e.g vectorize + prolongation of NCE or NCF accross vector components, by permuting all + components into a common TensorProductElement. + + This is temporary while we wait for dual evaluation of :class:`finat.EnrichedElement`. + + :returns: a 3-tuple of the DOF permuation, the unique terms in expansion as + a list of tuples of :class:`FIAT.FiniteElements`, and the cyclic + permuatations of the axes to form the element given by their shifts + in list of `int` tuples + """ from FIAT.reference_element import LINE expansion = expand_element(finat_element) @@ -714,7 +727,7 @@ def fiat_reference_prolongator(celem, felem, derivative=False): y is (mx*my*mz)-by-nel. Important notes: -The input data in x is destroyed in the process. +This routine is in-place: the input data in x and y are destroyed in the process. Need to allocate nel*max(mx, nx)*max(my, ny)*max(mz, nz) memory for both x and y. 
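+Each direction is contracted in turn with a dense matrix-matrix product (sum
+factorization), so for n nodes per direction in d dimensions the cost per cell
+is O(n^(d+1)) rather than the O(n^(2d)) of applying the assembled Kronecker product.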
*/ @@ -766,6 +779,10 @@ def fiat_reference_prolongator(celem, felem, derivative=False): PetscBLASInt nx, PetscBLASInt ny, PetscBLASInt nz, PetscBLASInt nel, PetscScalar *A1, PetscScalar *A2, PetscScalar *A3, PetscScalar *x, PetscScalar *y, PetscScalar *xwork, PetscScalar *ywork){ + /* + Same as kronmxv_inplace, but the work buffers allow the input data in x to + be kept untouched. + */ PetscScalar *ptr[2] = {xwork, ywork}; @@ -784,6 +801,10 @@ def fiat_reference_prolongator(celem, felem, derivative=False): static inline void permute_axis(PetscBLASInt axis, PetscBLASInt n0, PetscBLASInt n1, PetscBLASInt n2, PetscBLASInt n3, PetscScalar *x, PetscScalar *y){ + /* + Apply a cyclic permuation to a n0 x n1 x n2 x n3 array x, exponsing axis as + the fast direction. Write the result on y. + */ PetscBLASInt p = 0; PetscBLASInt s0, s1, s2, s3; @@ -805,6 +826,9 @@ def fiat_reference_prolongator(celem, felem, derivative=False): static inline void ipermute_axis(PetscBLASInt axis, PetscBLASInt n0, PetscBLASInt n1, PetscBLASInt n2, PetscBLASInt n3, PetscScalar *x, PetscScalar *y){ + /* + Apply the transpose of permute_axis, reading from y and adding to x. + */ PetscBLASInt p = 0; PetscBLASInt s0, s1, s2, s3; @@ -827,15 +851,15 @@ def fiat_reference_prolongator(celem, felem, derivative=False): @PETSc.Log.EventDecorator("MakeKronCode") -def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): +def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch): """ Return interpolation and restriction kernels between enriched tensor product elements """ operator_decl = [] prolong_code = [] restrict_code = [] - _, felems, fshifts = get_permutation_to_line_elements(Vf.finat_element) _, celems, cshifts = get_permutation_to_line_elements(Vc.finat_element) + _, felems, fshifts = get_permutation_to_line_elements(Vf.finat_element) shifts = fshifts in_place = False @@ -902,7 +926,7 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): fshapes = [] cshapes = [] has_code = False - for felem, celem, shift in zip(felems, celems, shifts): + for celem, felem, shift in zip(celems, felems, shifts): if len(felem) != len(celem): raise ValueError("Fine and coarse elements do not have the same number of factors") if len(felem) > 3: @@ -915,7 +939,7 @@ def make_kron_code(Vf, Vc, t_in, t_out, mat_name, scratch): fshapes.append((nscal,) + tuple(fshape)) cshapes.append((nscal,) + tuple(cshape)) - J = [fiat_reference_prolongator(ce, fe).T for fe, ce in zip(felem, celem)] + J = [fiat_reference_prolongator(ce, fe).T for ce, fe in zip(celem, felem)] if any(Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J): prolong_code.append(f""" for({IntType_c} i=0; i<{nscal*numpy.prod(fshape)}; i++) {t_out}[i+{fskip}] = 0.0E0; @@ -1022,7 +1046,7 @@ def cache_generate_code(kernel, comm): return code -def make_mapping_code(Q, fmapping, cmapping, t_in, t_out): +def make_mapping_code(Q, cmapping, fmapping, t_in, t_out): if fmapping == cmapping: return None A = get_piola_tensor(cmapping, Q.mesh(), inverse=False) @@ -1166,6 +1190,9 @@ def _weight(self): @cached_property def _kernels(self): try: + # We generate custom prolongation and restriction kernels mainly because: + # 1. Code generation for the transpose of prolongation is not readily available + # 2. 
Dual evaluation of EnrichedElement is not yet implemented in FInAT uf_map = get_permuted_map(self.Vf) uc_map = get_permuted_map(self.Vc) prolong_kernel, restrict_kernel, coefficients = self.make_blas_kernels(self.Vf, self.Vc) @@ -1174,6 +1201,8 @@ def _kernels(self): self.uc.dat(op2.READ, uc_map), self._weight.dat(op2.READ, uf_map)] except ValueError: + # The elements do not have the expected tensor product structure + # Fall back to aij kernels uf_map = self.Vf.cell_node_map() uc_map = self.Vc.cell_node_map() prolong_kernel, restrict_kernel, coefficients = self.make_kernels(self.Vf, self.Vc) @@ -1250,7 +1279,7 @@ def make_blas_kernels(Vf, Vc): if fmapping == cmapping: # interpolate on each direction via Kroncker product - operator_decl, prolong_code, restrict_code, shapes = make_kron_code(Vf, Vc, "t0", "t1", "J0", "t2") + operator_decl, prolong_code, restrict_code, shapes = make_kron_code(Vc, Vf, "t0", "t1", "J0", "t2") else: decl = [""]*4 prolong = [""]*5 @@ -1261,18 +1290,18 @@ def make_blas_kernels(Vf, Vc): if qelem.mapping() != "identity": qelem = qelem.reconstruct(mapping="identity") Qf = Vf if qelem == felem else firedrake.FunctionSpace(Vf.mesh(), qelem) - mapping_output = make_mapping_code(Qf, fmapping, cmapping, "t0", "t1") + mapping_output = make_mapping_code(Qf, cmapping, fmapping, "t0", "t1") in_place_mapping = True except Exception: qelem = ufl.FiniteElement("DQ", cell=felem.cell(), degree=PMGBase.max_degree(felem)) if felem.value_shape(): qelem = ufl.TensorElement(qelem, shape=felem.value_shape(), symmetry=felem.symmetry()) Qf = firedrake.FunctionSpace(Vf.mesh(), qelem) - mapping_output = make_mapping_code(Qf, fmapping, cmapping, "t0", "t1") + mapping_output = make_mapping_code(Qf, cmapping, fmapping, "t0", "t1") qshape = (Qf.value_size, Qf.finat_element.space_dimension()) # interpolate to embedding fine space - decl[0], prolong[0], restrict[0], shapes = make_kron_code(Qf, Vc, "t0", "t1", "J0", "t2") + decl[0], prolong[0], restrict[0], shapes = make_kron_code(Vc, Qf, "t0", "t1", "J0", "t2") if mapping_output is not None: # permute to FInAT ordering, and apply the mapping @@ -1281,7 +1310,7 @@ def make_blas_kernels(Vf, Vc): if not in_place_mapping: # permute to Kronecker-friendly ordering and interpolate to fine space decl[2], prolong[3], restrict[3] = make_permutation_code(Vf, qshape, shapes[0], "t1", "t0", "perm1") - decl[3], prolong[4], restrict[4], _shapes = make_kron_code(Vf, Qf, "t0", "t1", "J1", "t2") + decl[3], prolong[4], restrict[4], _shapes = make_kron_code(Qf, Vf, "t0", "t1", "J1", "t2") shapes.extend(_shapes) operator_decl = "".join(decl) diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index a7414ebb4c..c04913e7d4 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -87,7 +87,7 @@ def test_prolong_de_rham(tp_mesh): for u in us: for v in us: if u != v: - P = prolongation_matrix_matfree(v, u).getPythonContext() + P = prolongation_matrix_matfree(u, v).getPythonContext() P._prolong() assert norm(v-expr, "L2") < 1E-14 @@ -113,7 +113,7 @@ def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant): uc.dat.data[1::2] = 1.0 for v in [ui, uf]: - P = prolongation_matrix_matfree(v, uc).getPythonContext() + P = prolongation_matrix_matfree(uc, v).getPythonContext() P._prolong() assert norm(ui + uf - uc, "L2") < 2E-14 From dd7d85518acf356a44cc979cb2c7bae33c978b46 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 20 Mar 2023 10:08:36 +0000 Subject: [PATCH 31/75] change API of exterior 
derivative in hiptmair.py --- firedrake/preconditioners/hiptmair.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/firedrake/preconditioners/hiptmair.py b/firedrake/preconditioners/hiptmair.py index 317dc5614b..af1d1bab27 100644 --- a/firedrake/preconditioners/hiptmair.py +++ b/firedrake/preconditioners/hiptmair.py @@ -201,7 +201,7 @@ def coarsen(self, pc): if G_callback is None: interp_petscmat = chop(Interpolator(dminus(test), V, bcs=bcs + coarse_space_bcs).callable().handle) else: - interp_petscmat = G_callback(V, coarse_space, bcs, coarse_space_bcs) + interp_petscmat = G_callback(coarse_space, V, coarse_space_bcs, bcs) return coarse_operator, coarse_space_bcs, interp_petscmat From fa6da0e3d38f20b7802b00e77f6ce61ca080d733 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 20 Mar 2023 10:56:11 +0000 Subject: [PATCH 32/75] new prolongator API in hypre --- firedrake/preconditioners/hypre_ads.py | 4 ++-- firedrake/preconditioners/hypre_ams.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/firedrake/preconditioners/hypre_ads.py b/firedrake/preconditioners/hypre_ads.py index ca3728abdb..9cbc2537da 100644 --- a/firedrake/preconditioners/hypre_ads.py +++ b/firedrake/preconditioners/hypre_ads.py @@ -34,12 +34,12 @@ def initialize(self, obj): if G_callback is None: G = chop(Interpolator(grad(TestFunction(P1)), NC1).callable().handle) else: - G = G_callback(NC1, P1) + G = G_callback(P1, NC1) C_callback = appctx.get("get_curl", None) if C_callback is None: C = chop(Interpolator(curl(TestFunction(NC1)), V).callable().handle) else: - C = C_callback(V, NC1) + C = C_callback(NC1, V) pc = PETSc.PC().create(comm=obj.comm) pc.incrementTabLevel(1, parent=obj) diff --git a/firedrake/preconditioners/hypre_ams.py b/firedrake/preconditioners/hypre_ams.py index 8bfd14908e..a00334403b 100644 --- a/firedrake/preconditioners/hypre_ams.py +++ b/firedrake/preconditioners/hypre_ams.py @@ -54,7 +54,7 @@ def initialize(self, obj): if G_callback is None: G = chop(Interpolator(grad(TestFunction(P1)), V).callable().handle) else: - G = G_callback(V, P1) + G = G_callback(P1, V) pc = PETSc.PC().create(comm=obj.comm) pc.incrementTabLevel(1, parent=obj) From e7622d9dfccb482973f5ee69c693349daa034ea6 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 20 Mar 2023 17:28:15 +0000 Subject: [PATCH 33/75] typos --- firedrake/preconditioners/pmg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index e98798b7f7..4ed71d2d14 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -609,16 +609,16 @@ def compare_dual_basis(l1, l2): @PETSc.Log.EventDecorator("GetLineElements") def get_permutation_to_line_elements(finat_element): """ - Find DOF permuation to factor out the EnrichedElement expansion into common + Find DOF permutation to factor out the EnrichedElement expansion into common TensorProductElements. This routine exposes structure to e.g vectorize prolongation of NCE or NCF accross vector components, by permuting all components into a common TensorProductElement. This is temporary while we wait for dual evaluation of :class:`finat.EnrichedElement`. 
- :returns: a 3-tuple of the DOF permuation, the unique terms in expansion as + :returns: a 3-tuple of the DOF permutation, the unique terms in expansion as a list of tuples of :class:`FIAT.FiniteElements`, and the cyclic - permuatations of the axes to form the element given by their shifts + permutations of the axes to form the element given by their shifts in list of `int` tuples """ from FIAT.reference_element import LINE @@ -802,7 +802,7 @@ def fiat_reference_prolongator(celem, felem, derivative=False): PetscBLASInt n0, PetscBLASInt n1, PetscBLASInt n2, PetscBLASInt n3, PetscScalar *x, PetscScalar *y){ /* - Apply a cyclic permuation to a n0 x n1 x n2 x n3 array x, exponsing axis as + Apply a cyclic permutation to a n0 x n1 x n2 x n3 array x, exponsing axis as the fast direction. Write the result on y. */ From 823c306747e9ff3d3bdb1e3b902de30c698c3629 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 21 Mar 2023 10:45:00 +0000 Subject: [PATCH 34/75] dual evaluation direclty through FIAT --- firedrake/preconditioners/pmg.py | 51 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 4ed71d2d14..4c1592a082 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -557,20 +557,19 @@ def expand_element(ele): return ele -def evaluate_dual(dual, element, key=None): - # Evaluate the action of a set of dual functionals on the basis functions of an element. - keys = list(dict.fromkeys(tuple(phi.get_point_dict().keys()) for phi in dual)) - pts = list(dict.fromkeys(sum(keys, ()))) - if key is None: - key = (0, ) * len(pts[0]) - tab = element.tabulate(sum(key), pts)[key] - result = numpy.empty((len(dual), element.space_dimension()), dtype=tab.dtype) - zero = [(0.0, ())] - for k, phi in enumerate(dual): - wts = phi.get_point_dict() - wts = numpy.array([wts.get(pt, zero)[0][0] for pt in pts]) - result[k] = tab.dot(wts).T - return result +def evaluate_dual(source, target, alpha=None): + # Evaluate the action of a set of dual functionals of the target element + # on the (derivatives of the) basis functions of the source element. 
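+    # to_riesz expresses the target dual functionals against the polynomial
+    # expansion set underlying the source's nodal basis, get_coeffs gives the
+    # expansion coefficients of the source basis functions, and the dmats apply
+    # the derivatives requested through the multi-index alpha.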
+ primal = source.get_nodal_basis() + dual = target.get_dual_set() + A = dual.to_riesz(primal) + B = numpy.transpose(primal.get_coeffs()) + if alpha is not None: + dmats = primal.get_dmats() + for i in range(len(alpha)): + for j in range(alpha[i]): + B = numpy.dot(dmats[i], B) + return numpy.dot(A, B) def compare_element(e1, e2): @@ -578,7 +577,7 @@ def compare_element(e1, e2): return True if e1.space_dimension() != e2.space_dimension(): return False - B = evaluate_dual(e1.dual_basis(), e2) + B = evaluate_dual(e1, e2) numpy.fill_diagonal(B, numpy.diagonal(B)-1.0) return numpy.allclose(B, 0.0, rtol=1E-14, atol=1E-14) @@ -605,6 +604,17 @@ def compare_dual_basis(l1, l2): return all(compare_dual(b1, b2) for b1, b2 in zip(l1, l2)) +@lru_cache(maxsize=10) +def fiat_reference_prolongator(celem, felem, derivative=False): + ckey = (felem.formdegree,) if derivative else None + fkey = (celem.formdegree,) if derivative else None + fdual = felem.dual_basis() + cdual = celem.dual_basis() + if fkey == ckey and (celem is felem or compare_dual_basis(cdual, fdual)): + return numpy.array([]) + return evaluate_dual(celem, felem, alpha=ckey) + + @lru_cache(maxsize=10) @PETSc.Log.EventDecorator("GetLineElements") def get_permutation_to_line_elements(finat_element): @@ -690,17 +700,6 @@ def get_permutation_to_line_elements(finat_element): return dof_perm, unique_line_elements, shifts -@lru_cache(maxsize=10) -def fiat_reference_prolongator(celem, felem, derivative=False): - ckey = (felem.formdegree,) if derivative else None - fkey = (celem.formdegree,) if derivative else None - fdual = felem.dual_basis() - cdual = celem.dual_basis() - if fkey == ckey and compare_dual_basis(fdual, cdual): - return numpy.array([]) - return evaluate_dual(fdual, celem, ckey) - - # Common kernel to compute y = kron(A3, kron(A2, A1)) * x # Vector and tensor field generalization from Deville, Fischer, and Mund section 8.3.1. kronmxv_code = """ From 67d56e9f6da62c8721c1df3a6ad7a082b401382b Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 21 Mar 2023 10:56:13 +0000 Subject: [PATCH 35/75] test HiptmairPC on hexes --- firedrake/preconditioners/fdm.py | 2 +- tests/multigrid/test_hiptmair.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index ae1ef176c6..56a512d2d6 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -997,7 +997,7 @@ def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): return result -def diff_prolongator(Vc, Vf, cbcs=[], fbcs=[]): +def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): """ Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix. Works for any tensor-product basis. These are the same matrices one needs for HypreAMS and friends. 
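A minimal usage sketch of the renamed tabulate_exterior_derivative, mirroring the
test change below; `problem` and `parameters` are assumed to be set up as in that
test:

    from firedrake import LinearVariationalSolver
    from firedrake.preconditioners.fdm import tabulate_exterior_derivative

    # Hand the sparse gradient/curl tabulation to HypreAMS/HiptmairPC through the
    # application context; these are the keys those preconditioners look up.
    appctx = {"get_gradient": tabulate_exterior_derivative,
              "get_curl": tabulate_exterior_derivative}
    solver = LinearVariationalSolver(problem, solver_parameters=parameters,
                                     appctx=appctx)
    solver.solve()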
diff --git a/tests/multigrid/test_hiptmair.py b/tests/multigrid/test_hiptmair.py index d851e43e2f..b553356e5e 100644 --- a/tests/multigrid/test_hiptmair.py +++ b/tests/multigrid/test_hiptmair.py @@ -62,9 +62,14 @@ def run_riesz_map(V, mat_type): a = inner(d(u), d(v))*dx + inner(u, v)*dx L = inner(f, v)*dx bcs = [DirichletBC(V, u_exact, "on_boundary")] - + if V.mesh().ufl_cell().is_simplex(): + appctx = dict() + else: + from firedrake.preconditioners.fdm import tabulate_exterior_derivative + appctx = {"get_gradient": tabulate_exterior_derivative, + "get_curl": tabulate_exterior_derivative,} problem = LinearVariationalProblem(a, L, uh, bcs=bcs) - solver = LinearVariationalSolver(problem, solver_parameters=parameters) + solver = LinearVariationalSolver(problem, solver_parameters=parameters, appctx=appctx) solver.solve() its = solver.snes.ksp.getIterationNumber() return its @@ -72,7 +77,7 @@ def run_riesz_map(V, mat_type): @pytest.mark.skipcomplexnoslate @pytest.mark.parametrize(["family", "cell"], - [("N1curl", "tetrahedron")]) + [("N1curl", "tetrahedron"), ("NCE", "hexahedron")]) def test_hiptmair_hcurl(family, cell): mesh = mesh_hierarchy(cell)[-1] V = FunctionSpace(mesh, family, degree=1) @@ -82,7 +87,7 @@ def test_hiptmair_hcurl(family, cell): @pytest.mark.skipcomplexnoslate @pytest.mark.parametrize(["family", "cell"], - [("RT", "tetrahedron")]) + [("RT", "tetrahedron"), ("NCF", "hexahedron")]) def test_hiptmair_hdiv(family, cell): mesh = mesh_hierarchy(cell)[-1] V = FunctionSpace(mesh, family, degree=1) From fa966ff233984d303f1d69cd89b0b3a13438da23 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 21 Mar 2023 11:00:49 +0000 Subject: [PATCH 36/75] lint --- tests/multigrid/test_hiptmair.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/multigrid/test_hiptmair.py b/tests/multigrid/test_hiptmair.py index b553356e5e..b59022d9c5 100644 --- a/tests/multigrid/test_hiptmair.py +++ b/tests/multigrid/test_hiptmair.py @@ -67,7 +67,7 @@ def run_riesz_map(V, mat_type): else: from firedrake.preconditioners.fdm import tabulate_exterior_derivative appctx = {"get_gradient": tabulate_exterior_derivative, - "get_curl": tabulate_exterior_derivative,} + "get_curl": tabulate_exterior_derivative} problem = LinearVariationalProblem(a, L, uh, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters=parameters, appctx=appctx) solver.solve() From b108044d5ed3419b5b5e703eeecc4f5002c62388 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 21 Mar 2023 13:56:22 +0000 Subject: [PATCH 37/75] add option fdm_static_condensation --- firedrake/preconditioners/fdm.py | 18 ++++++++++-------- tests/regression/test_fdm.py | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 56a512d2d6..0887cb0e3d 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -93,6 +93,7 @@ def initialize(self, pc): options = PETSc.Options(options_prefix) use_amat = options.getBool("pc_use_amat", True) + use_static_condensation = options.getBool("static_condensation", False) pmat_type = options.getString("mat_type", PETSc.Mat.Type.AIJ) appctx = self.get_appctx(pc) @@ -176,7 +177,7 @@ def interp_nullspace(I, nsp): fcp=fcp, options_prefix=options_prefix) # Assemble the FDM preconditioner with sparse local matrices - Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type) + Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type, 
use_static_condensation) self._assemble_P() Pmat.setNullSpace(Amat.getNullSpace()) Pmat.setTransposeNullSpace(Amat.getTransposeNullSpace()) @@ -203,7 +204,7 @@ def interp_nullspace(I, nsp): fdmpc.setFromOptions() @PETSc.Log.EventDecorator("FDMPrealloc") - def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type): + def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type, use_static_condensation): """ Assemble the sparse preconditioner from diagonal mass matrices. @@ -212,6 +213,7 @@ def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type): :arg bcs: an iterable of boundary conditions on V :arg form_compiler_parameters: parameters to assemble diagonal factors :arg pmat_type: the preconditioner `PETSc.Mat.Type` + :arg use_static_condensation: are we assembling the statically-condensed Schur complement on facets? :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable """ @@ -241,7 +243,7 @@ def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type): self.reference_tensor_on_diag = dict() self.get_static_condensation = dict() - if Vfacet: + if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block self.reference_tensor_on_diag[Vfacet] = self.assemble_reference_tensor(Vbig) self.get_static_condensation[Vfacet] = lambda A: condense_element_mat(A, self.ises[0], self.ises[1], self.submats) @@ -618,7 +620,7 @@ def assemble_coef(self, J, form_compiler_parameters): @PETSc.Log.EventDecorator("FDMRefTensor") def assemble_reference_tensor(self, V): """ - Return the reference tensor used in the diagonal factorization of the + Return the reference tensor used in the diagonal factorisation of the sparse cell matrices. See Section 3.2 of Brubeck2022b. :arg V: a :class:`.FunctionSpace` @@ -713,14 +715,14 @@ def factor_interior_mat(A00): zlice = slice(0, nblocks) numpy.sqrt(data[zlice], out=data[zlice]) numpy.reciprocal(data[zlice], out=data[zlice]) - PETSc.Log.logFlops(2*nblocks) + flops = nblocks * 2 for k in range(2, degree[-1]+1): nblocks = numpy.count_nonzero(degree == k) zlice = slice(zlice.stop, zlice.stop + k*nblocks) data[zlice] = invchol(data[zlice].reshape((-1, k, k))).reshape((-1,)) - flops = ((k+1)**3 + 5*(k+1)-12)//3 + k**3 - PETSc.Log.logFlops(flops*nblocks) + flops += nblocks * (((k+1)**3 + 5*(k+1)-12)//3 + k**3) + PETSc.Log.logFlops(flops) A00.setValuesCSR(indptr, indices, data) A00.assemble() @@ -1607,7 +1609,7 @@ def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False): @lru_cache(maxsize=10) def fdm_setup_ipdg(fdm_element, eta): """ - Setup for the fast diagonalization method for the IP-DG formulation. + Setup for the fast diagonalisation method for the IP-DG formulation. Compute sparsified interval stiffness and mass matrices and tabulate the normal derivative of the shape functions. 
diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index f2263a8f10..1104c8bb91 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -45,6 +45,7 @@ "pc_python_type": "firedrake.FacetSplitPC", "facet_pc_type": "python", "facet_pc_python_type": "firedrake.FDMPC", + "facet_fdm_static_condensation": True, "facet_fdm_pc_use_amat": False, "facet_fdm_pc_type": "fieldsplit", "facet_fdm_pc_fieldsplit_type": "symmetric_multiplicative", From 4149dce826f5d28d75861dbc69afc5b62d5c0401 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 22 Mar 2023 10:59:27 +0000 Subject: [PATCH 38/75] more elegant caching, remove interpolation of nullspace, comments addressing the extension of static condensation to non-symmetric matrices and SLATE --- firedrake/preconditioners/fdm.py | 62 +++++++++++++------------------- firedrake/preconditioners/pmg.py | 16 ++++----- 2 files changed, 33 insertions(+), 45 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 0887cb0e3d..6c7838b29c 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -53,15 +53,19 @@ class FDMPC(PCBase): where alpha and beta are possibly tensor-valued. The sparse matrix is obtained by approximating (v, alpha * u) and (v, beta * u) as diagonal mass matrices. + + The PETSc options inspected by this class are: + - 'fdm_mat_type': can be either 'aij' or 'sbaij' + - 'fdm_static_condensation': are we assembling the Schur complement on facets? + + Static condensation is currently only implemented for the symmetric case, + use it at your own risk. """ _prefix = "fdm_" _variant = "fdm" _citation = "Brubeck2022b" - - _reference_tensor_cache = {} - _coefficient_cache = {} - _c_code_cache = {} + _cache = {} @staticmethod def load_set_values(triu=False): @@ -73,7 +77,7 @@ def load_set_values(triu=False): :returns: a python wrapper for the matrix insertion function """ key = triu - cache = FDMPC._c_code_cache + cache = FDMPC._cache.setdefault("load_set_values", {}) try: return cache[key] except KeyError: @@ -112,9 +116,13 @@ def initialize(self, pc): bcs = tuple(ctx._problem.bcs) mat_type = ctx.mat_type - if isinstance(J, firedrake.slate.Add): - J = J.children[0].form - assert type(J) == ufl.Form + # For static condensation with SLATE, we might extract the form on the + # interface-interface block like this: + # + # if isinstance(J, firedrake.slate.TensorBase) and use_static_condensation: + # J = J.children[0].form + if not isinstance(J, ufl.Form): + raise ValueError("Expecting a ufl.Form, not a %r" % type(J)) # Transform the problem into the space with FDM shape functions V = J.arguments()[-1].function_space() @@ -134,12 +142,15 @@ def initialize(self, pc): W = W.sub(index) bcs_fdm.append(bc.reconstruct(V=W, g=0)) - # Construct interpolation from original to variant spaces + # Create a new _SNESContext in the variant space + self._ctx_ref = self.new_snes_ctx(pc, J_fdm, bcs_fdm, mat_type, + fcp=fcp, options_prefix=options_prefix) + + # Construct interpolation from variant to original spaces self.fdm_interp = prolongation_matrix_matfree(V_fdm, V, bcs_fdm, []) self.work_vec_x = Amat.createVecLeft() self.work_vec_y = Amat.createVecRight() if use_amat: - omat = Amat self.A = allocate_matrix(J_fdm, bcs=bcs_fdm, form_compiler_parameters=fcp, mat_type=mat_type, options_prefix=options_prefix) self._assemble_A = partial(assemble, J_fdm, tensor=self.A, bcs=bcs_fdm, @@ -147,34 +158,10 @@ def initialize(self, pc): self._assemble_A() Amat = 
self.A.petscmat - def interp_nullspace(I, nsp): - if not nsp.handle: - return nsp - vectors = [] - for x in nsp.getVecs(): - y = I.createVecLeft() - I.mult(x, y) - vectors.append(y) - if nsp.hasConstant(): - y = I.createVecLeft() - x = I.createVecRight() - x.set(1.0E0) - I.mult(x, y) - vectors.append(y) - x.destroy() - return PETSc.NullSpace().create(constant=False, vectors=vectors, comm=nsp.getComm()) - - inject = prolongation_matrix_matfree(V, V_fdm, [], []) - Amat.setNullSpace(interp_nullspace(inject, omat.getNullSpace())) - Amat.setTransposeNullSpace(interp_nullspace(inject, omat.getTransposeNullSpace())) - Amat.setNearNullSpace(interp_nullspace(inject, omat.getNearNullSpace())) - if len(bcs) > 0: self.bc_nodes = numpy.unique(numpy.concatenate([bcdofs(bc, ghost=False) for bc in bcs])) else: self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType) - self._ctx_ref = self.new_snes_ctx(pc, J_fdm, bcs_fdm, mat_type, - fcp=fcp, options_prefix=options_prefix) # Assemble the FDM preconditioner with sparse local matrices Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type, use_static_condensation) @@ -596,9 +583,10 @@ def assemble_coef(self, J, form_compiler_parameters): # Return coefficients and assembly callables, and cache them class key = (mixed_form.signature(), mesh) + cache = self._cache.setdefault("coefficients", {}) block_diagonal = True try: - return self._coefficient_cache[key] + return cache[key] except KeyError: if not block_diagonal or not V.shape: tensor = firedrake.Function(Z) @@ -615,7 +603,7 @@ def assemble_coef(self, J, form_compiler_parameters): ctx = sub.getPythonContext() coefficients[name] = ctx._block_diagonal assembly_callables.append(ctx._assemble_block_diagonal) - return self._coefficient_cache.setdefault(key, (coefficients, assembly_callables)) + return cache.setdefault(key, (coefficients, assembly_callables)) @PETSc.Log.EventDecorator("FDMRefTensor") def assemble_reference_tensor(self, V): @@ -640,7 +628,7 @@ def assemble_reference_tensor(self, V): degree = degree + 1 is_interior, is_facet = is_restricted(V.finat_element) key = (degree, tdim, formdegree, value_size, is_interior, is_facet) - cache = self._reference_tensor_cache + cache = self._cache.setdefault("reference_tensor", {}) try: return cache[key] except KeyError: diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 4c1592a082..d63c006f79 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -49,8 +49,7 @@ class PMGBase(PCSNESBase): """ _prefix = "pmg_" - - _cache_transfer = {} + _cache = {} def coarsen_element(self, ele): """ @@ -340,15 +339,16 @@ def coarsen_bcs(self, fbcs, cV): raise NotImplementedError("Unsupported BC type, please get in touch if you need this") return cbcs - def create_transfer(self, cctx, fctx, mat_type, cbcs, fbcs): + def create_transfer(self, mat_type, cctx, fctx, cbcs, fbcs): # Create a transfer or retrieve it from the class cache cV = cctx.J.arguments()[0].function_space() fV = fctx.J.arguments()[0].function_space() cbcs = tuple(cctx._problem.bcs) if cbcs else tuple() fbcs = tuple(fctx._problem.bcs) if fbcs else tuple() - key = (cV, fV, cbcs, fbcs, mat_type) + key = (mat_type, cV, fV, cbcs, fbcs) + cache = self._cache.setdefault("transfer", {}) try: - return self._cache_transfer[key] + return cache[key] except KeyError: if mat_type == "matfree": construct_mat = prolongation_matrix_matfree @@ -356,19 +356,19 @@ def create_transfer(self, cctx, fctx, mat_type, cbcs, fbcs): construct_mat = 
prolongation_matrix_aij else: raise ValueError("Unknown matrix type") - return self._cache_transfer.setdefault(key, construct_mat(cV, fV, cbcs, fbcs)) + return cache.setdefault(key, construct_mat(cV, fV, cbcs, fbcs)) def create_interpolation(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - interpolate = self.create_transfer(get_appctx(dmc), get_appctx(dmf), mat_type, True, False) + interpolate = self.create_transfer(mat_type, get_appctx(dmc), get_appctx(dmf), True, False) rscale = interpolate.createVecRight() # only used as a workaround in the creation of coarse vecs return interpolate, rscale def create_injection(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - return self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False) + return self.create_transfer(mat_type, get_appctx(dmf), get_appctx(dmc), False, False) @staticmethod def max_degree(ele): From 43429f856690e8426d86dcb667a27377cf62b735 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 22 Mar 2023 13:16:49 +0000 Subject: [PATCH 39/75] create fewer intermidiate Mats --- firedrake/preconditioners/fdm.py | 102 +++++++++++++++---------------- 1 file changed, 50 insertions(+), 52 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 6c7838b29c..7db94e14e1 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -373,6 +373,8 @@ def view(self, pc, viewer=None): def destroy(self, pc): objs = [] + if hasattr(self, "A"): + objs.append(self.A) if hasattr(self, "pc"): objs.append(self.pc.getOperators()[-1]) objs.append(self.pc) @@ -666,12 +668,9 @@ def assemble_reference_tensor(self, V): A10 = numpy.linalg.solve(A11, A10) A11 = numpy.eye(A11.shape[0]) - Ihat = mass_matrix(tdim, formdegree, A00, A11) - Dhat = diff_matrix(tdim, formdegree, A00, A11, A10) - result = block_mat([[Ihat], [Dhat]]) - Ihat.destroy() - Dhat.destroy() - + B_blocks = mass_blocks(tdim, formdegree, A00, A11) + A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) + result = block_mat(B_blocks + A_blocks, destroy=True) if value_size != 1: eye = petsc_sparse(numpy.eye(value_size)) temp = result @@ -720,13 +719,14 @@ def condense_element_mat(A, i0, i1, submats): # Return the Schur complement associated to indices in i1, condensing i0 out isrows = [i0, i0, i1, i1] iscols = [i0, i1, i0, i1] + structure = PETSc.Mat.Structure.SUBSET if submats[6] else None submats[:4] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:4] if submats[0] else None) A00, A01, A10, A11 = submats[:4] factor_interior_mat(A00) submats[4] = A00.matMult(A01, result=submats[4]) submats[5] = A10.matTransposeMult(A00, result=submats[5]) submats[6] = submats[5].matMult(submats[4], result=submats[6]) - submats[6].aypx(-1.0, A11) + submats[6].aypx(-1.0, A11, structure=structure) return submats[6] @@ -735,12 +735,13 @@ def condense_element_pattern(A, i0, i1, submats): # Add zeroes on the statically condensed pattern so that you can run ICC(0) isrows = [i0, i0, i1] iscols = [i0, i1, i0] + structure = PETSc.Mat.Structure.SUBSET if submats[6] else None submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) A00, A01, A10 = submats[:3] submats[4] = A10.matTransposeMult(A00, result=submats[4]) submats[5] = A00.matMult(A01, result=submats[5]) submats[6] = 
submats[4].matMult(submats[5], result=submats[6]) - submats[6].aypx(0.0, A) + submats[6].aypx(0.0, A, structure=structure) return submats[6] @@ -823,30 +824,6 @@ def load_assemble_csr(comm, triu=False): restype=ctypes.c_int) -def petsc_sparse(A_numpy, rtol=1E-10, comm=None): - # Convert dense numpy matrix into a sparse PETSc matrix - Amax = max(A_numpy.min(), A_numpy.max(), key=abs) - atol = rtol*Amax - nnz = numpy.count_nonzero(abs(A_numpy) > atol, axis=1).astype(PETSc.IntType) - A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm) - for row, Arow in enumerate(A_numpy): - cols = numpy.argwhere(abs(Arow) > atol).astype(PETSc.IntType).flat - A.setValues(row, cols, Arow[cols], PETSc.InsertMode.INSERT) - A.assemble() - return A - - -def block_mat(A_blocks): - # Return a concrete Mat corresponding to a block matrix given as a list of lists - if len(A_blocks) == 1: - if len(A_blocks[0]) == 1: - return A_blocks[0][0] - - nest = PETSc.Mat().createNest(A_blocks, comm=A_blocks[0][0].getComm()) - # A nest Mat would not allow us to take matrix-matrix products - return nest.convert(mat_type=A_blocks[0][0].getType()) - - def is_restricted(finat_element): # Determine if an element is a restriction onto interior or facets is_interior = True @@ -888,6 +865,19 @@ def sort_interior_dofs(idofs, A): Aii.destroy() +def petsc_sparse(A_numpy, rtol=1E-10, comm=None): + # Convert dense numpy matrix into a sparse PETSc matrix + Amax = max(A_numpy.min(), A_numpy.max(), key=abs) + atol = rtol*Amax + nnz = numpy.count_nonzero(abs(A_numpy) > atol, axis=1).astype(PETSc.IntType) + A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm) + for row, Arow in enumerate(A_numpy): + cols = numpy.argwhere(abs(Arow) > atol).astype(PETSc.IntType).flat + A.setValues(row, cols, Arow[cols], PETSc.InsertMode.INSERT) + A.assemble() + return A + + def kron3(A, B, C, scale=None): temp = B.kron(C) if scale is not None: @@ -897,13 +887,30 @@ def kron3(A, B, C, scale=None): return result -def mass_matrix(tdim, formdegree, B00, B11, comm=None): - # Construct mass matrix on reference cell from 1D mass matrices B00 and B11. +def block_mat(A_blocks, destroy=False): + # Return a concrete Mat corresponding to a block matrix given as a list of lists + # Optionally, destroys the input Mats if a new Mat is created + if len(A_blocks) == 1: + if len(A_blocks[0]) == 1: + return A_blocks[0][0] + + result = PETSc.Mat().createNest(A_blocks, comm=A_blocks[0][0].getComm()) + # A nest Mat would not allow us to take matrix-matrix products + result = result.convert(mat_type=A_blocks[0][0].getType()) + if destroy: + for row in A_blocks: + for mat in row: + mat.destroy() + return result + + +def mass_blocks(tdim, formdegree, B00, B11, comm=None): + # Construct mass block matrix on reference cell from 1D mass matrices B00 and B11. # The 1D matrices may come with different test and trial spaces. 
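For reference, the algebra behind condense_element_mat above can be written in a few lines of dense numpy; this is an illustration only, with made-up block sizes and a random symmetric matrix, while the implementation keeps everything sparse, reuses the submats work matrices, and inverts the interior block through factor_interior_mat.

import numpy

rng = numpy.random.default_rng(0)
n0, n1 = 6, 4                                # hypothetical interior/facet block sizes
n = n0 + n1
A = rng.standard_normal((n, n))
A = A + A.T + 2.0 * n * numpy.eye(n)         # symmetric and diagonally dominant
i0 = numpy.arange(n0)                        # interior dofs to be eliminated
i1 = numpy.arange(n0, n)                     # facet dofs that are kept

A00 = A[numpy.ix_(i0, i0)]
A01 = A[numpy.ix_(i0, i1)]
A10 = A[numpy.ix_(i1, i0)]
A11 = A[numpy.ix_(i1, i1)]
S = A11 - A10 @ numpy.linalg.solve(A00, A01)   # Schur complement on the facet block

# Solving with S reproduces the facet part of the full solve
b = rng.standard_normal(n)
x = numpy.linalg.solve(A, b)
x1 = numpy.linalg.solve(S, b[i1] - A10 @ numpy.linalg.solve(A00, b[i0]))
assert numpy.allclose(x1, x[i1])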
if comm is None: comm = PETSc.COMM_SELF if tdim == 1: - return petsc_sparse(B11 if formdegree else B00, comm=comm) + return [[petsc_sparse(B11 if formdegree else B00, comm=comm)]] B00 = petsc_sparse(B00, comm=comm) B11 = petsc_sparse(B11, comm=comm) @@ -928,20 +935,15 @@ def mass_matrix(tdim, formdegree, B00, B11, comm=None): B11.destroy() n = len(B_diag) if n == 1: - result = B_diag[0] + return [B_diag] else: B_zero = PETSc.Mat().createAIJ(B_diag[0].getSize(), nnz=(0, 0), comm=comm) B_zero.assemble() - B_blocks = [[B_diag[i] if i == j else B_zero for j in range(n)] for i in range(n)] - result = block_mat(B_blocks) - B_zero.destroy() - for B in B_diag: - B.destroy() - return result + return [[B_diag[i] if i == j else B_zero for j in range(n)] for i in range(n)] -def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): - # Construct exterior derivative matrix on reference cell from 1D mass matrices A00 and A11, +def diff_blocks(tdim, formdegree, A00, A11, A10, comm=None): + # Construct exterior derivative block matrix on reference cell from 1D mass matrices A00 and A11, # and exterior derivative moments A10. # The 1D matrices may come with different test and trial spaces. if comm is None: @@ -950,11 +952,11 @@ def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): ncols = A10.shape[0]**tdim A_zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=comm) A_zero.assemble() - return A_zero + return [[A_zero]] A10 = petsc_sparse(A10, comm=comm) if tdim == 1: - return A10 + return [[A10]] A00 = petsc_sparse(A00, comm=comm) A11 = petsc_sparse(A11, comm=comm) @@ -980,11 +982,7 @@ def diff_matrix(tdim, formdegree, A00, A11, A10, comm=None): A00.destroy() A11.destroy() A10.destroy() - result = block_mat(A_blocks) - for A_row in A_blocks: - for A in A_row: - A.destroy() - return result + return A_blocks def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): @@ -1010,7 +1008,7 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): A10 = fiat_reference_prolongator(e0, e1, derivative=True) tdim = Vc.mesh().topological_dimension() - Dhat = diff_matrix(tdim, ec.formdegree, A00, A11, A10) + Dhat = block_mat(diff_blocks(tdim, ec.formdegree, A00, A11, A10), destroy=True) scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e fdofs = restricted_dofs(ef, create_element(unrestrict_element(scalar_element(Vf.ufl_element())))) From a7b1e952c710f8adbea5c427f428ee2be1eff02d Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 22 Mar 2023 18:26:42 +0000 Subject: [PATCH 40/75] move imports to the top --- firedrake/preconditioners/fdm.py | 63 +++++++++++++++----------------- firedrake/preconditioners/pmg.py | 15 ++++---- 2 files changed, 36 insertions(+), 42 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 7db94e14e1..e1a3a12f0c 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1,10 +1,21 @@ from functools import partial, lru_cache from itertools import product -from pyop2.sparsity import get_preallocation from firedrake.petsc import PETSc from firedrake.preconditioners.base import PCBase +from firedrake.preconditioners.patch import bcdofs +from firedrake.preconditioners.pmg import (prolongation_matrix_matfree, + fiat_reference_prolongator, + get_permutation_to_line_elements) from firedrake.preconditioners.facet_split import split_dofs, restricted_dofs +from firedrake.formmanipulation import ExtractSubBlock from firedrake_citations import Citations 
+from pyop2.compilation import load +from pyop2.utils import get_petsc_dir +from pyop2.sparsity import get_preallocation +from tsfc.finatinterface import create_element +from ufl.algorithms.ad import expand_derivatives +from ufl.algorithms.expand_indices import expand_indices + import firedrake.dmhooks as dmhooks import firedrake import ctypes @@ -85,10 +96,6 @@ def load_set_values(triu=False): @PETSc.Log.EventDecorator("FDMInit") def initialize(self, pc): - from firedrake.assemble import allocate_matrix, assemble - from firedrake.preconditioners.pmg import prolongation_matrix_matfree - from firedrake.preconditioners.patch import bcdofs - Citations().register(self._citation) self.comm = pc.comm Amat, Pmat = pc.getOperators() @@ -151,10 +158,12 @@ def initialize(self, pc): self.work_vec_x = Amat.createVecLeft() self.work_vec_y = Amat.createVecRight() if use_amat: + from firedrake.assemble import allocate_matrix, TwoFormAssembler self.A = allocate_matrix(J_fdm, bcs=bcs_fdm, form_compiler_parameters=fcp, mat_type=mat_type, options_prefix=options_prefix) - self._assemble_A = partial(assemble, J_fdm, tensor=self.A, bcs=bcs_fdm, - form_compiler_parameters=fcp, mat_type=mat_type) + self._assemble_A = TwoFormAssembler(J_fdm, tensor=self.A, bcs=bcs_fdm, + form_compiler_parameters=fcp, + mat_type=mat_type).assemble self._assemble_A() Amat = self.A.petscmat @@ -509,11 +518,6 @@ def assemble_coef(self, J, form_compiler_parameters): order coefficients keyed on ``"beta"`` and ``"alpha"``, and a list of assembly callables. """ - from ufl.algorithms.ad import expand_derivatives - from ufl.algorithms.expand_indices import expand_indices - from firedrake.formmanipulation import ExtractSubBlock - from firedrake.assemble import assemble - # Basic idea: take the original bilinear form and # replace the exterior derivatives with arguments in broken(V^{k+1}). # Then, replace the original arguments with arguments in broken(V^k). @@ -593,11 +597,11 @@ def assemble_coef(self, J, form_compiler_parameters): if not block_diagonal or not V.shape: tensor = firedrake.Function(Z) coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} - assembly_callables = [partial(assemble, mixed_form, tensor=tensor, diagonal=True, + assembly_callables = [partial(firedrake.assemble, mixed_form, tensor=tensor, diagonal=True, form_compiler_parameters=form_compiler_parameters)] else: - M = assemble(mixed_form, mat_type="matfree", - form_compiler_parameters=form_compiler_parameters) + M = firedrake.assemble(mixed_form, mat_type="matfree", + form_compiler_parameters=form_compiler_parameters) coefficients = dict() assembly_callables = [] for iset, name in zip(Z.dof_dset.field_ises, ("beta", "alpha")): @@ -747,8 +751,6 @@ def condense_element_pattern(A, i0, i1, submats): @PETSc.Log.EventDecorator("LoadCode") def load_c_code(code, name, **kwargs): - from pyop2.compilation import load - from pyop2.utils import get_petsc_dir cppargs = ["-I%s/include" % d for d in get_petsc_dir()] ldargs = (["-L%s/lib" % d for d in get_petsc_dir()] + ["-Wl,-rpath,%s/lib" % d for d in get_petsc_dir()] @@ -990,9 +992,6 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix. Works for any tensor-product basis. These are the same matrices one needs for HypreAMS and friends. 
""" - from tsfc.finatinterface import create_element - from firedrake.preconditioners.pmg import fiat_reference_prolongator - ec = Vc.finat_element ef = Vf.finat_element if ef.formdegree - ec.formdegree != 1: @@ -1132,7 +1131,6 @@ class PoissonFDMPC(FDMPC): _citation = "Brubeck2022a" def assemble_reference_tensor(self, V): - from firedrake.preconditioners.pmg import get_permutation_to_line_elements try: _, line_elements, shifts = get_permutation_to_line_elements(V.finat_element) except ValueError: @@ -1381,9 +1379,6 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_average=True): - from ufl import inner, diff - from ufl.algorithms.ad import expand_derivatives - coefficients = {} assembly_callables = [] @@ -1421,8 +1416,8 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av else: replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)} - alpha = expand_derivatives(sum([diff(diff(ufl.replace(i.integrand(), replace_grad), - ref_grad[0]), ref_grad[1]) for i in integrals_J])) + alpha = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad), + ref_grad[0]), ref_grad[1]) for i in integrals_J])) # get zero-th order coefficent ref_val = [ufl.variable(t) for t in args_J] @@ -1433,8 +1428,8 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av else: replace_val = {t: s for t, s in zip(args_J, ref_val)} - beta = expand_derivatives(sum([diff(diff(ufl.replace(i.integrand(), replace_val), - ref_val[0]), ref_val[1]) for i in integrals_J])) + beta = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_val), + ref_val[0]), ref_val[1]) for i in integrals_J])) if Piola: beta = ufl.replace(beta, {dummy_Piola: Piola}) @@ -1457,7 +1452,7 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av q = firedrake.TestFunction(Q) Gq = firedrake.Function(Q) coefficients["alpha"] = Gq - assembly_callables.append(partial(firedrake.assemble, inner(G, q)*dx, Gq)) + assembly_callables.append(partial(firedrake.assemble, ufl.inner(G, q)*dx, Gq)) # assemble zero-th order coefficient if not isinstance(beta, ufl.constantvalue.Zero): @@ -1472,7 +1467,7 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av q = firedrake.TestFunction(Q) Bq = firedrake.Function(Q) coefficients["beta"] = Bq - assembly_callables.append(partial(firedrake.assemble, inner(beta, q)*dx, Bq)) + assembly_callables.append(partial(firedrake.assemble, ufl.inner(beta, q)*dx, Bq)) if Piola: # make DGT functions with the second order coefficient @@ -1483,8 +1478,8 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av area = ufl.FacetArea(mesh) replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)} - alpha = expand_derivatives(sum([diff(diff(ufl.replace(i.integrand(), replace_grad), - ref_grad[0]), ref_grad[1]) for i in integrals_J])) + alpha = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad), + ref_grad[0]), ref_grad[1]) for i in integrals_J])) vol = abs(ufl.JacobianDeterminant(mesh)) G = vol * alpha G = ufl.as_tensor([[[G[i, k, j, k] for i in range(G.ufl_shape[0])] for j in range(G.ufl_shape[2])] for k in range(G.ufl_shape[3])]) @@ -1493,14 +1488,14 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av q = 
firedrake.TestFunction(Q) Gq_facet = firedrake.Function(Q) coefficients["Gq_facet"] = Gq_facet - assembly_callables.append(partial(firedrake.assemble, ((inner(q('+'), G('+')) + inner(q('-'), G('-')))/area)*dS_int, Gq_facet)) + assembly_callables.append(partial(firedrake.assemble, ((ufl.inner(q('+'), G('+')) + ufl.inner(q('-'), G('-')))/area)*dS_int, Gq_facet)) PT = Piola.T Q = firedrake.TensorFunctionSpace(mesh, ele, shape=PT.ufl_shape) q = firedrake.TestFunction(Q) PT_facet = firedrake.Function(Q) coefficients["PT_facet"] = PT_facet - assembly_callables.append(partial(firedrake.assemble, ((inner(q('+'), PT('+')) + inner(q('-'), PT('-')))/area)*dS_int, PT_facet)) + assembly_callables.append(partial(firedrake.assemble, ((ufl.inner(q('+'), PT('+')) + ufl.inner(q('-'), PT('-')))/area)*dS_int, PT_facet)) # make DGT functions with BC flags rvs = V.ufl_element().reference_value_shape() diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index d63c006f79..8446d5aa3e 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -1,14 +1,19 @@ from functools import partial, lru_cache from itertools import chain -from pyop2 import op2, PermutedMap +from firedrake.petsc import PETSc from firedrake.preconditioners.base import PCBase, SNESBase, PCSNESBase from firedrake.dmhooks import (attach_hooks, get_appctx, push_appctx, pop_appctx, add_hook, get_parent, push_parent, pop_parent, get_function_space, set_function_space) from firedrake.solving_utils import _SNESContext +from firedrake.nullspace import VectorSpaceBasis, MixedVectorSpaceBasis from firedrake.tsfc_interface import extract_numbered_coefficients from firedrake.utils import ScalarType_c, IntType_c, cached_property -from firedrake.petsc import PETSc +from pyop2 import op2, PermutedMap +from tsfc import compile_expression_dual_evaluation +from tsfc.finatinterface import create_element +from FIAT.reference_element import LINE + import firedrake import finat import ufl @@ -162,8 +167,6 @@ def destroy(self, obj): def coarsen(self, fdm, comm): # Coarsen the _SNESContext of a DM fdm # return the coarse DM cdm of the coarse _SNESContext - from firedrake.nullspace import VectorSpaceBasis, MixedVectorSpaceBasis - fctx = get_appctx(fdm) parent = get_parent(fdm) assert parent is not None @@ -517,8 +520,6 @@ def coarsen_bc_value(self, bc, cV): def prolongation_transfer_kernel_action(Vf, expr): - from tsfc import compile_expression_dual_evaluation - from tsfc.finatinterface import create_element to_element = create_element(Vf.ufl_element()) kernel = compile_expression_dual_evaluation(expr, to_element, Vf.ufl_element(), log=PETSc.Log.isActive()) coefficients = extract_numbered_coefficients(expr, kernel.coefficient_numbers) @@ -631,8 +632,6 @@ def get_permutation_to_line_elements(finat_element): permutations of the axes to form the element given by their shifts in list of `int` tuples """ - from FIAT.reference_element import LINE - expansion = expand_element(finat_element) if expansion.space_dimension() != finat_element.space_dimension(): raise ValueError("Failed to decompose %s into tensor products" % finat_element) From a2cd50c02dd02b56282d88fc188ced2f82d357d5 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 23 Mar 2023 09:27:49 +0000 Subject: [PATCH 41/75] use any instead of sum --- firedrake/preconditioners/fdm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index e1a3a12f0c..5f8e479b8f 100644 --- 
a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -833,8 +833,7 @@ def is_restricted(finat_element): tdim = finat_element.cell.get_spatial_dimension() entity_dofs = finat_element.entity_dofs() for edim in sorted(entity_dofs): - v = sum(list(entity_dofs[edim].values()), []) - if len(v): + if any(len(entity_dofs[edim][entity]) > 0 for entity in entity_dofs[edim]): try: edim = sum(edim) except TypeError: From b6f31530998173bd66e5d1519cbc2a7730e71981 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 23 Mar 2023 15:42:27 +0000 Subject: [PATCH 42/75] address some of Connor's comments format docstrings, dict() -> {} --- firedrake/preconditioners/fdm.py | 184 +++++++++++++++---------------- firedrake/preconditioners/pmg.py | 4 +- 2 files changed, 91 insertions(+), 97 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 5f8e479b8f..862b80038c 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -79,20 +79,20 @@ class FDMPC(PCBase): _cache = {} @staticmethod - def load_set_values(triu=False): + def setSubMatCSR(comm, triu=False): """ - Compile C code to insert sparse element matrices and store in class cache + Compile C code to insert sparse submatrices and store in class cache :arg triu: are we inserting onto the upper triangular part of the matrix? :returns: a python wrapper for the matrix insertion function """ + cache = FDMPC._cache.setdefault("setSubMatCSR", {}) key = triu - cache = FDMPC._cache.setdefault("load_set_values", {}) try: return cache[key] except KeyError: - return cache.setdefault(key, load_assemble_csr(PETSc.COMM_SELF, triu=triu)) + return cache.setdefault(key, load_setSubMatCSR(comm, triu)) @PETSc.Log.EventDecorator("FDMInit") def initialize(self, pc): @@ -123,8 +123,8 @@ def initialize(self, pc): bcs = tuple(ctx._problem.bcs) mat_type = ctx.mat_type - # For static condensation with SLATE, we might extract the form on the - # interface-interface block like this: + # TODO assemble Schur complements specified by a SLATE Tensor + # we might extract the form on the interface-interface block like this: # # if isinstance(J, firedrake.slate.TensorBase) and use_static_condensation: # J = J.children[0].form @@ -237,8 +237,8 @@ def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type, use_st self.ises = tuple(PETSc.IS().createGeneral(indices, comm=PETSc.COMM_SELF) for indices in (idofs, fdofs)) self.submats = [None for _ in range(7)] - self.reference_tensor_on_diag = dict() - self.get_static_condensation = dict() + self.reference_tensor_on_diag = {} + self.get_static_condensation = {} if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block self.reference_tensor_on_diag[Vfacet] = self.assemble_reference_tensor(Vbig) @@ -256,9 +256,9 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): return lgmap.apply(result, result=result) # Create data structures needed for assembly - self.cell_to_global = dict() - self.lgmaps = dict() - bc_rows = dict() + self.cell_to_global = {} + self.lgmaps = {} + bc_rows = {} for Vsub in V: lgmap = Vsub.local_to_global_map([bc.reconstruct(V=Vsub, g=0) for bc in bcs]) bsize = Vsub.dof_dset.layout_vec.getBlockSize() @@ -286,9 +286,9 @@ def get_coeffs(e, result=None): self.get_coeffs = get_coeffs self.nel = nel - self.work_mats = dict() + self.work_mats = {} - Pmats = dict() + Pmats = {} addv = PETSc.InsertMode.ADD_VALUES symmetric = 
pmat_type.endswith("sbaij") @@ -414,11 +414,11 @@ def RtAP(R, A, P, result=None): return R.transposeMatMult(RtAP.buff, result=result) RtAP.buff = None - set_values_csr = self.load_set_values(triu=triu) + set_submat = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) get_rindices = self.cell_to_global[Vrow] if Vrow == Vcol: get_cindices = lambda e, result=None: result - update_A = lambda Ae, rindices, cindices: set_values_csr(A, Ae, rindices, rindices, addv) + update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, rindices, addv) # moments of orthogonalized basis against basis tabulation and derivative tabulation rtensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) # element matrix obtained via Equation (3.9) of Brubeck2022b @@ -426,7 +426,7 @@ def RtAP(R, A, P, result=None): condense_element_mat = self.get_static_condensation.get(Vrow) else: get_cindices = self.cell_to_global[Vcol] - update_A = lambda Ae, rindices, cindices: set_values_csr(A, Ae, rindices, cindices, addv) + update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, cindices, addv) rtensor = self.assemble_reference_tensor(Vrow) ctensor = self.assemble_reference_tensor(Vcol) assemble_element_mat = lambda De, result=None: RtAP(rtensor, De, ctensor, result=result) @@ -583,7 +583,7 @@ def assemble_coef(self, J, form_compiler_parameters): # Transform the exterior derivative and the original arguments of J to arguments in Z args = (firedrake.TestFunctions(Z), firedrake.TrialFunctions(Z)) repargs = {t: v[0] for t, v in zip(args_J, args)} - repgrad = {ufl.grad(t): map_grad(v[1]) for t, v in zip(args_J, args)} if map_grad else dict() + repgrad = {ufl.grad(t): map_grad(v[1]) for t, v in zip(args_J, args)} if map_grad else {} Jcell = expand_indices(expand_derivatives(ufl.Form(J.integrals_by_type("cell")))) mixed_form = ufl.replace(ufl.replace(Jcell, repgrad), repargs) @@ -594,21 +594,21 @@ def assemble_coef(self, J, form_compiler_parameters): try: return cache[key] except KeyError: - if not block_diagonal or not V.shape: - tensor = firedrake.Function(Z) - coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} - assembly_callables = [partial(firedrake.assemble, mixed_form, tensor=tensor, diagonal=True, - form_compiler_parameters=form_compiler_parameters)] - else: + if block_diagonal and V.shape: M = firedrake.assemble(mixed_form, mat_type="matfree", form_compiler_parameters=form_compiler_parameters) - coefficients = dict() + coefficients = {} assembly_callables = [] for iset, name in zip(Z.dof_dset.field_ises, ("beta", "alpha")): sub = M.petscmat.createSubMatrix(iset, iset) ctx = sub.getPythonContext() coefficients[name] = ctx._block_diagonal assembly_callables.append(ctx._assemble_block_diagonal) + else: + tensor = firedrake.Function(Z) + coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} + assembly_callables = [partial(firedrake.assemble, mixed_form, tensor=tensor, diagonal=True, + form_compiler_parameters=form_compiler_parameters)] return cache.setdefault(key, (coefficients, assembly_callables)) @PETSc.Log.EventDecorator("FDMRefTensor") @@ -720,7 +720,7 @@ def factor_interior_mat(A00): @PETSc.Log.EventDecorator("FDMCondense") def condense_element_mat(A, i0, i1, submats): - # Return the Schur complement associated to indices in i1, condensing i0 out + """Return the Schur complement associated to indices in i1, condensing i0 out""" isrows = [i0, i0, i1, i1] iscols = [i0, i1, i0, i1] structure = PETSc.Mat.Structure.SUBSET if submats[6] else None @@ -736,7 
+736,7 @@ def condense_element_mat(A, i0, i1, submats): @PETSc.Log.EventDecorator("FDMCondense") def condense_element_pattern(A, i0, i1, submats): - # Add zeroes on the statically condensed pattern so that you can run ICC(0) + """Add zeroes on the statically condensed pattern so that you can run ICC(0)""" isrows = [i0, i0, i1] iscols = [i0, i1, i0] structure = PETSc.Mat.Structure.SUBSET if submats[6] else None @@ -760,7 +760,7 @@ def load_c_code(code, name, **kwargs): **kwargs) def get_pointer(obj): - if isinstance(obj, (PETSc.Mat, PETSc.Vec)): + if isinstance(obj, PETSc.Object): return obj.handle elif isinstance(obj, numpy.ndarray): return obj.ctypes.data @@ -772,9 +772,9 @@ def wrapper(*args): return wrapper -def load_assemble_csr(comm, triu=False): - # Insert one sparse matrix into another sparse matrix. - # Done in C for efficiency, since it loops over rows. +def load_setSubMatCSR(comm, triu=False): + """Insert one sparse matrix into another sparse matrix. + Done in C for efficiency, since it loops over rows.""" if triu: name = "setSubMatCSR_SBAIJ" select_cols = "icol < irow ? -1: icol" @@ -827,18 +827,14 @@ def load_assemble_csr(comm, triu=False): def is_restricted(finat_element): - # Determine if an element is a restriction onto interior or facets + """Determine if an element is a restriction onto interior or facets""" is_interior = True is_facet = True - tdim = finat_element.cell.get_spatial_dimension() + cell_dim = finat_element.cell.get_dimension() entity_dofs = finat_element.entity_dofs() - for edim in sorted(entity_dofs): - if any(len(entity_dofs[edim][entity]) > 0 for entity in entity_dofs[edim]): - try: - edim = sum(edim) - except TypeError: - pass - if edim == tdim: + for dim in sorted(entity_dofs): + if any(len(entity_dofs[dim][entity]) > 0 for entity in entity_dofs[dim]): + if dim == cell_dim: is_facet = False else: is_interior = False @@ -846,8 +842,8 @@ def is_restricted(finat_element): def sort_interior_dofs(idofs, A): - # Permute `idofs` to have A[idofs, idofs] with square blocks of - # increasing dimension along its diagonal. 
+ """Permute `idofs` to have A[idofs, idofs] with square blocks of + increasing dimension along its diagonal.""" Aii = A.createSubMatrix(idofs, idofs) indptr, indices, _ = Aii.getValuesCSR() n = idofs.getSize() @@ -867,19 +863,20 @@ def sort_interior_dofs(idofs, A): def petsc_sparse(A_numpy, rtol=1E-10, comm=None): - # Convert dense numpy matrix into a sparse PETSc matrix - Amax = max(A_numpy.min(), A_numpy.max(), key=abs) - atol = rtol*Amax - nnz = numpy.count_nonzero(abs(A_numpy) > atol, axis=1).astype(PETSc.IntType) + """Convert dense numpy matrix into a sparse PETSc matrix""" + atol = rtol * max(A_numpy.min(), A_numpy.max(), key=abs) + sparsity = abs(A_numpy) > atol + nnz = numpy.count_nonzero(sparsity, axis=1).astype(PETSc.IntType) A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm) - for row, Arow in enumerate(A_numpy): - cols = numpy.argwhere(abs(Arow) > atol).astype(PETSc.IntType).flat + for row, (Arow, Srow) in enumerate(zip(A_numpy, sparsity)): + cols = numpy.argwhere(Srow).astype(PETSc.IntType).flat A.setValues(row, cols, Arow[cols], PETSc.InsertMode.INSERT) A.assemble() return A def kron3(A, B, C, scale=None): + """Returns scale * kron(A, kron(B, C))""" temp = B.kron(C) if scale is not None: temp.scale(scale) @@ -889,8 +886,8 @@ def kron3(A, B, C, scale=None): def block_mat(A_blocks, destroy=False): - # Return a concrete Mat corresponding to a block matrix given as a list of lists - # Optionally, destroys the input Mats if a new Mat is created + """Return a concrete Mat corresponding to a block matrix given as a list of lists. + Optionally, destroys the input Mats if a new Mat is created.""" if len(A_blocks) == 1: if len(A_blocks[0]) == 1: return A_blocks[0][0] @@ -906,8 +903,8 @@ def block_mat(A_blocks, destroy=False): def mass_blocks(tdim, formdegree, B00, B11, comm=None): - # Construct mass block matrix on reference cell from 1D mass matrices B00 and B11. - # The 1D matrices may come with different test and trial spaces. + """Construct mass block matrix on reference cell from 1D mass matrices B00 and B11. + The 1D matrices may come with different test and trial spaces.""" if comm is None: comm = PETSc.COMM_SELF if tdim == 1: @@ -944,9 +941,9 @@ def mass_blocks(tdim, formdegree, B00, B11, comm=None): def diff_blocks(tdim, formdegree, A00, A11, A10, comm=None): - # Construct exterior derivative block matrix on reference cell from 1D mass matrices A00 and A11, - # and exterior derivative moments A10. - # The 1D matrices may come with different test and trial spaces. + """Construct exterior derivative block matrix on reference cell from 1D + mass matrices A00 and A11, and exterior derivative moments A10. 
+ The 1D matrices may come with different test and trial spaces.""" if comm is None: comm = PETSc.COMM_SELF if formdegree == tdim: @@ -1001,23 +998,24 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): e0, e1 = elements[::len(elements)-1] degree = e0.degree() + tdim = Vc.mesh().topological_dimension() A11 = numpy.eye(degree, dtype=PETSc.RealType) A00 = numpy.eye(degree+1, dtype=PETSc.RealType) A10 = fiat_reference_prolongator(e0, e1, derivative=True) - - tdim = Vc.mesh().topological_dimension() Dhat = block_mat(diff_blocks(tdim, ec.formdegree, A00, A11, A10), destroy=True) - scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e - fdofs = restricted_dofs(ef, create_element(unrestrict_element(scalar_element(Vf.ufl_element())))) - cdofs = restricted_dofs(ec, create_element(unrestrict_element(scalar_element(Vc.ufl_element())))) - fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) - cises = PETSc.IS().createGeneral(cdofs, comm=PETSc.COMM_SELF) - temp = Dhat - Dhat = temp.createSubMatrix(fises, cises) - fises.destroy() - cises.destroy() - temp.destroy() + if any(is_restricted(ec)) or any(is_restricted(ef)): + scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e + fdofs = restricted_dofs(ef, create_element(unrestrict_element(scalar_element(Vf.ufl_element())))) + cdofs = restricted_dofs(ec, create_element(unrestrict_element(scalar_element(Vc.ufl_element())))) + fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) + cises = PETSc.IS().createGeneral(cdofs, comm=PETSc.COMM_SELF) + temp = Dhat + Dhat = temp.createSubMatrix(fises, cises) + temp.destroy() + fises.destroy() + cises.destroy() + if Vf.value_size > 1: temp = Dhat eye = petsc_sparse(numpy.eye(Vf.value_size, dtype=PETSc.RealType)) @@ -1035,7 +1033,7 @@ def cell_to_global(lgmap, cell_to_local, e, result=None): return lgmap.apply(result, result=result) imode = PETSc.InsertMode.INSERT - update_Dmat = FDMPC.load_set_values() + update_Dmat = FDMPC.setSubMatCSR(PETSc.COMM_SELF, triu=False) sizes = tuple(V.dof_dset.layout_vec.getSizes() for V in (Vf, Vc)) block_size = Vf.dof_dset.layout_vec.getBlockSize() @@ -1068,7 +1066,8 @@ def cell_to_global(lgmap, cell_to_local, e, result=None): def unrestrict_element(ele): - # Get an element that might or might not be restricted and return the parent unrestricted element. + """Get an element that might or might not be restricted and + return the parent unrestricted element.""" if isinstance(ele, ufl.VectorElement): return type(ele)(unrestrict_element(ele._sub_element), dim=ele.num_sub_elements()) elif isinstance(ele, ufl.TensorElement): @@ -1136,7 +1135,7 @@ def assemble_reference_tensor(self, V): raise ValueError("FDMPC does not support the element %s" % V.ufl_element()) line_elements, = line_elements - self.axes_shifts, = shifts + axes_shifts, = shifts degree = max(e.degree() for e in line_elements) eta = float(self.appctx.get("eta", degree*(degree+1))) @@ -1151,7 +1150,7 @@ def assemble_reference_tensor(self, V): if not is_dg and e.degree() == degree: # do not apply SIPG along continuous directions Dfdm[0] = None - return Afdm, Dfdm, bdof + return Afdm, Dfdm, bdof, axes_shifts @PETSc.Log.EventDecorator("FDMSetValues") def set_values(self, A, Vrow, Vcol, addv, triu=False): @@ -1164,14 +1163,16 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): :arg addv: a `PETSc.Mat.InsertMode` :arg triu: are we assembling only the upper triangular part? 
""" - set_values_csr = self.load_set_values(triu=triu) - update_A = lambda A, Ae, rindices: set_values_csr(A, Ae, rindices, rindices, addv) + set_submat = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) + update_A = lambda A, Ae, rindices: set_submat(A, Ae, rindices, rindices, addv) condense_element_mat = lambda x: x get_rindices = self.cell_to_global[Vrow] - rtensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) - self.reference_tensor_on_diag[Vrow] = rtensor - Afdm, Dfdm, bdof = rtensor + try: + rtensor = self.reference_tensor_on_diag[Vrow] + except KeyError: + rtensor = self.reference_tensor_on_diag.setdefault(Vrow, self.assemble_reference_tensor(Vrow)) + Afdm, Dfdm, bdof, axes_shifts = rtensor Gq = self.coefficients.get("alpha") Bq = self.coefficients.get("beta") @@ -1184,7 +1185,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): ncomp = V.ufl_element().reference_value_size() sdim = (V.finat_element.space_dimension() * bsize) // ncomp # dimension of a single component tdim = V.mesh().topological_dimension() - shift = self.axes_shifts * bsize + shift = axes_shifts * bsize index_coef, _ = extrude_node_map((Gq or Bq).cell_node_map()) index_bc, _ = extrude_node_map(bcflags.cell_node_map()) @@ -1322,8 +1323,8 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): continue if PT_facet: - k0 = iord0[k] if shift != 1 else tdim-1-iord0[-k-1] - k1 = iord1[k] if shift != 1 else tdim-1-iord1[-k-1] + k0 = iord0[k] if shift[1] != 1 else tdim-1-iord0[-k-1] + k1 = iord1[k] if shift[1] != 1 else tdim-1-iord1[-k-1] Piola = Pfacet[[0, 1], [k0, k1]] mu = Gfacet[[0, 1], idir] else: @@ -1377,7 +1378,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): Ae.destroy() @PETSc.Log.EventDecorator("FDMCoefficients") - def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_average=True): + def assemble_coef(self, J, form_compiler_parameters): coefficients = {} assembly_callables = [] @@ -1396,12 +1397,8 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av quad_deg = (form_compiler_parameters or {}).get("degree", quad_deg) dx = firedrake.dx(degree=quad_deg) - if cell_average: - family = "Discontinuous Lagrange" if tdim == 1 else "DQ" - degree = 0 - else: - family = "Quadrature" - degree = quad_deg + family = "Discontinuous Lagrange" if tdim == 1 else "DQ" + degree = 0 # extract coefficients directly from the bilinear form integrals_J = J.integrals_by_type("cell") @@ -1432,18 +1429,15 @@ def assemble_coef(self, J, form_compiler_parameters, discard_mixed=True, cell_av if Piola: beta = ufl.replace(beta, {dummy_Piola: Piola}) + # discard mixed derivatives and mixed components G = alpha - if discard_mixed: - # discard mixed derivatives and mixed components - if len(G.ufl_shape) == 2: - G = ufl.diag_vector(G) - else: - Gshape = G.ufl_shape - Gshape = Gshape[:len(Gshape)//2] - G = ufl.as_tensor(numpy.reshape([G[i+i] for i in numpy.ndindex(Gshape)], (Gshape[0], -1))) - Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default", shape=G.ufl_shape) + if len(G.ufl_shape) == 2: + G = ufl.diag_vector(G) else: - Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default", shape=G.ufl_shape, symmetry=True) + Gshape = G.ufl_shape + Gshape = Gshape[:len(Gshape)//2] + G = ufl.as_tensor(numpy.reshape([G[i+i] for i in numpy.ndindex(Gshape)], (Gshape[0], -1))) + Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default", shape=G.ufl_shape) # assemble second 
order coefficient if not isinstance(alpha, ufl.constantvalue.Zero): diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 8446d5aa3e..1fb2e267da 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -1481,8 +1481,8 @@ def _weight(self): def _standalones(self): standalones = [] for i, (uc_sub, uf_sub) in enumerate(zip(self.uc.subfunctions, self.uf.subfunctions)): - Vc_sub_bcs = [bc for bc in self.Vc_bcs if bc.function_space().index == i] - Vf_sub_bcs = [bc for bc in self.Vf_bcs if bc.function_space().index == i] + Vc_sub_bcs = tuple(bc for bc in self.Vc_bcs if bc.function_space().index == i) + Vf_sub_bcs = tuple(bc for bc in self.Vf_bcs if bc.function_space().index == i) standalone = StandaloneInterpolationMatrix(uc_sub, uf_sub, Vc_sub_bcs, Vf_sub_bcs) standalones.append(standalone) return standalones From 1d754e53b8b5392c7558a3ffee66e2dd99752b7a Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 23 Mar 2023 16:05:20 +0000 Subject: [PATCH 43/75] more careful imports from firedrake --- firedrake/preconditioners/fdm.py | 69 +++++++++++++++++--------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 862b80038c..dcb6f9aedb 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -8,6 +8,10 @@ get_permutation_to_line_elements) from firedrake.preconditioners.facet_split import split_dofs, restricted_dofs from firedrake.formmanipulation import ExtractSubBlock +from firedrake.function import Function +from firedrake.functionspace import FunctionSpace +from firedrake.ufl_expr import TestFunction, TestFunctions, TrialFunctions + from firedrake_citations import Citations from pyop2.compilation import load from pyop2.utils import get_petsc_dir @@ -17,7 +21,6 @@ from ufl.algorithms.expand_indices import expand_indices import firedrake.dmhooks as dmhooks -import firedrake import ctypes import numpy import ufl @@ -140,7 +143,7 @@ def initialize(self, pc): V_fdm, J_fdm, bcs_fdm = (V, J, bcs) else: # Reconstruct Jacobian and bcs with variant element - V_fdm = firedrake.FunctionSpace(V.mesh(), e_fdm) + V_fdm = FunctionSpace(V.mesh(), e_fdm) J_fdm = J(*[t.reconstruct(function_space=V_fdm) for t in J.arguments()], coefficients={}) bcs_fdm = [] for bc in bcs: @@ -221,7 +224,7 @@ def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type, use_st elif len(ifacet) == 1: Vfacet = V[ifacet[0]] ebig, = set(unrestrict_element(Vsub.ufl_element()) for Vsub in V) - Vbig = firedrake.FunctionSpace(V.mesh(), ebig) + Vbig = FunctionSpace(V.mesh(), ebig) if len(V) > 1: dims = [Vsub.finat_element.space_dimension() for Vsub in V] assert sum(dims) == Vbig.finat_element.space_dimension() @@ -578,10 +581,10 @@ def assemble_coef(self, J, form_compiler_parameters): elements = list(map(ufl.BrokenElement, elements)) if V.shape: elements = [ufl.TensorElement(ele, shape=V.shape) for ele in elements] - Z = firedrake.FunctionSpace(mesh, ufl.MixedElement(elements)) + Z = FunctionSpace(mesh, ufl.MixedElement(elements)) # Transform the exterior derivative and the original arguments of J to arguments in Z - args = (firedrake.TestFunctions(Z), firedrake.TrialFunctions(Z)) + args = (TestFunctions(Z), TrialFunctions(Z)) repargs = {t: v[0] for t, v in zip(args_J, args)} repgrad = {ufl.grad(t): map_grad(v[1]) for t, v in zip(args_J, args)} if map_grad else {} Jcell = expand_indices(expand_derivatives(ufl.Form(J.integrals_by_type("cell")))) 
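The gradient-replacement trick above (and the ufl.diff variant used by PoissonFDMPC) can be exercised in isolation with plain UFL. The sketch below uses the same legacy UFL element classes as this patch, ignores the geometric factors (Finv, Piola) that the real coefficient assembly folds in, and the mesh, space and coefficient names are made up for the example.

import ufl
from ufl.algorithms.ad import expand_derivatives

cell = ufl.quadrilateral
mesh = ufl.Mesh(ufl.VectorElement("Lagrange", cell, 1))
V = ufl.FunctionSpace(mesh, ufl.FiniteElement("Lagrange", cell, 3))
Q0 = ufl.FunctionSpace(mesh, ufl.FiniteElement("DQ", cell, 0))

v = ufl.TestFunction(V)
u = ufl.TrialFunction(V)
mu = ufl.Coefficient(Q0)                       # made-up scalar diffusivity
J = mu * ufl.inner(ufl.grad(v), ufl.grad(u)) * ufl.dx

# Replace grad(v), grad(u) by variables and differentiate the integrand twice:
# the result is the second-order coefficient alpha (here mu times the identity).
args = J.arguments()
ref_grad = [ufl.variable(ufl.grad(t)) for t in args]
replace_grad = {ufl.grad(t): g for t, g in zip(args, ref_grad)}
integrand = ufl.replace(J.integrals()[0].integrand(), replace_grad)
alpha = expand_derivatives(ufl.diff(ufl.diff(integrand, ref_grad[0]), ref_grad[1]))
assert alpha.ufl_shape == (2, 2)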
@@ -594,9 +597,10 @@ def assemble_coef(self, J, form_compiler_parameters): try: return cache[key] except KeyError: + from firedrake.assemble import assemble if block_diagonal and V.shape: - M = firedrake.assemble(mixed_form, mat_type="matfree", - form_compiler_parameters=form_compiler_parameters) + M = assemble(mixed_form, mat_type="matfree", + form_compiler_parameters=form_compiler_parameters) coefficients = {} assembly_callables = [] for iset, name in zip(Z.dof_dset.field_ises, ("beta", "alpha")): @@ -605,9 +609,9 @@ def assemble_coef(self, J, form_compiler_parameters): coefficients[name] = ctx._block_diagonal assembly_callables.append(ctx._assemble_block_diagonal) else: - tensor = firedrake.Function(Z) + tensor = Function(Z) coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} - assembly_callables = [partial(firedrake.assemble, mixed_form, tensor=tensor, diagonal=True, + assembly_callables = [partial(assemble, mixed_form, tensor=tensor, diagonal=True, form_compiler_parameters=form_compiler_parameters)] return cache.setdefault(key, (coefficients, assembly_callables)) @@ -1379,6 +1383,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coef(self, J, form_compiler_parameters): + from firedrake.assemble import assemble coefficients = {} assembly_callables = [] @@ -1395,7 +1400,7 @@ def assemble_coef(self, J, form_compiler_parameters): pass quad_deg = 2*degree+1 quad_deg = (form_compiler_parameters or {}).get("degree", quad_deg) - dx = firedrake.dx(degree=quad_deg) + dx = ufl.dx(degree=quad_deg) family = "Discontinuous Lagrange" if tdim == 1 else "DQ" degree = 0 @@ -1441,11 +1446,11 @@ def assemble_coef(self, J, form_compiler_parameters): # assemble second order coefficient if not isinstance(alpha, ufl.constantvalue.Zero): - Q = firedrake.FunctionSpace(mesh, Qe) - q = firedrake.TestFunction(Q) - Gq = firedrake.Function(Q) + Q = FunctionSpace(mesh, Qe) + q = TestFunction(Q) + Gq = Function(Q) coefficients["alpha"] = Gq - assembly_callables.append(partial(firedrake.assemble, ufl.inner(G, q)*dx, Gq)) + assembly_callables.append(partial(assemble, ufl.inner(G, q)*dx, Gq)) # assemble zero-th order coefficient if not isinstance(beta, ufl.constantvalue.Zero): @@ -1456,17 +1461,17 @@ def assemble_coef(self, J, form_compiler_parameters): Qe = ufl.FiniteElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default") if shape: Qe = ufl.TensorElement(Qe, shape=shape) - Q = firedrake.FunctionSpace(mesh, Qe) - q = firedrake.TestFunction(Q) - Bq = firedrake.Function(Q) + Q = FunctionSpace(mesh, Qe) + q = TestFunction(Q) + Bq = Function(Q) coefficients["beta"] = Bq - assembly_callables.append(partial(firedrake.assemble, ufl.inner(beta, q)*dx, Bq)) + assembly_callables.append(partial(assemble, ufl.inner(beta, q)*dx, Bq)) if Piola: # make DGT functions with the second order coefficient # and the Piola tensor for each side of each facet extruded = mesh.cell_set._extruded - dS_int = firedrake.dS_h(degree=quad_deg) + firedrake.dS_v(degree=quad_deg) if extruded else firedrake.dS(degree=quad_deg) + dS_int = ufl.dS_h(degree=quad_deg) + ufl.dS_v(degree=quad_deg) if extruded else ufl.dS(degree=quad_deg) ele = ufl.BrokenElement(ufl.FiniteElement("DGT", mesh.ufl_cell(), 0)) area = ufl.FacetArea(mesh) @@ -1477,18 +1482,18 @@ def assemble_coef(self, J, form_compiler_parameters): G = vol * alpha G = ufl.as_tensor([[[G[i, k, j, k] for i in range(G.ufl_shape[0])] for j in range(G.ufl_shape[2])] for k in range(G.ufl_shape[3])]) - Q = 
firedrake.TensorFunctionSpace(mesh, ele, shape=G.ufl_shape) - q = firedrake.TestFunction(Q) - Gq_facet = firedrake.Function(Q) + Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=G.ufl_shape)) + q = TestFunction(Q) + Gq_facet = Function(Q) coefficients["Gq_facet"] = Gq_facet - assembly_callables.append(partial(firedrake.assemble, ((ufl.inner(q('+'), G('+')) + ufl.inner(q('-'), G('-')))/area)*dS_int, Gq_facet)) + assembly_callables.append(partial(assemble, ((ufl.inner(q('+'), G('+')) + ufl.inner(q('-'), G('-')))/area)*dS_int, Gq_facet)) PT = Piola.T - Q = firedrake.TensorFunctionSpace(mesh, ele, shape=PT.ufl_shape) - q = firedrake.TestFunction(Q) - PT_facet = firedrake.Function(Q) + Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=PT.ufl_shape)) + q = TestFunction(Q) + PT_facet = Function(Q) coefficients["PT_facet"] = PT_facet - assembly_callables.append(partial(firedrake.assemble, ((ufl.inner(q('+'), PT('+')) + ufl.inner(q('-'), PT('-')))/area)*dS_int, PT_facet)) + assembly_callables.append(partial(assemble, ((ufl.inner(q('+'), PT('+')) + ufl.inner(q('-'), PT('-')))/area)*dS_int, PT_facet)) # make DGT functions with BC flags rvs = V.ufl_element().reference_value_shape() @@ -1498,9 +1503,9 @@ def assemble_coef(self, J, form_compiler_parameters): Qe = ufl.FiniteElement(family, cell=cell, degree=degree) if rvs: Qe = ufl.TensorElement(Qe, shape=rvs) - Q = firedrake.FunctionSpace(mesh, Qe) - q = firedrake.TestFunction(Q) - bcflags = firedrake.Function(Q) + Q = FunctionSpace(mesh, Qe) + q = TestFunction(Q) + bcflags = Function(Q) ref_args = [ufl.variable(t) for t in args_J] replace_args = {t: s for t, s in zip(args_J, ref_args)} @@ -1520,7 +1525,7 @@ def assemble_coef(self, J, form_compiler_parameters): if len(forms): form = sum(forms) if len(form.arguments()) == 1: - assembly_callables.append(partial(firedrake.assemble, form, bcflags)) + assembly_callables.append(partial(assemble, form, bcflags)) coefficients["bcflags"] = bcflags # set arbitrary non-zero coefficients for preallocation @@ -1643,7 +1648,7 @@ def get_interior_facet_maps(V): local_facet_data_fun: maps interior facets to the local facet numbering in the two cells sharing it, nfacets: the total number of interior facets owned by this process """ - if isinstance(V, firedrake.Function): + if isinstance(V, Function): V = V.function_space() mesh = V.mesh() intfacets = mesh.interior_facets From a9dbd8a44eb28baf2c0a2036800ea438db74c4fc Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 23 Mar 2023 18:21:35 +0000 Subject: [PATCH 44/75] mantain old code --- firedrake/preconditioners/fdm.py | 79 ++++++++++++++------------------ 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index dcb6f9aedb..09c7cc730a 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -111,7 +111,7 @@ def initialize(self, pc): pmat_type = options.getString("mat_type", PETSc.Mat.Type.AIJ) appctx = self.get_appctx(pc) - fcp = appctx.get("form_compiler_parameters") + fcp = appctx.get("form_compiler_parameters") or {} self.appctx = appctx # Get original Jacobian form and bcs @@ -203,14 +203,14 @@ def initialize(self, pc): fdmpc.setFromOptions() @PETSc.Log.EventDecorator("FDMPrealloc") - def assemble_fdm_op(self, V, J, bcs, form_compiler_parameters, pmat_type, use_static_condensation): + def assemble_fdm_op(self, V, J, bcs, fcp, pmat_type, use_static_condensation): """ Assemble the sparse preconditioner from diagonal mass matrices. 
:arg V: the :class:`.FunctionSpace` of the form arguments :arg J: the Jacobian bilinear form :arg bcs: an iterable of boundary conditions on V - :arg form_compiler_parameters: parameters to assemble diagonal factors + :arg fcp: form compiler parameters to assemble coefficients :arg pmat_type: the preconditioner `PETSc.Mat.Type` :arg use_static_condensation: are we assembling the statically-condensed Schur complement on facets? @@ -273,7 +273,7 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): bdofs = numpy.nonzero(lgmap.indices[:own] < 0)[0].astype(PETSc.IntType) bc_rows[Vsub] = Vsub.dof_dset.lgmap.apply(bdofs, result=bdofs) - coefficients, assembly_callables = self.assemble_coef(J, form_compiler_parameters) + coefficients, assembly_callables = self.assemble_coefficients(J, fcp) coeffs = [coefficients.get(k) for k in ("beta", "alpha")] cmaps = [extrude_node_map(ck.cell_node_map())[0] for ck in coeffs] @@ -508,14 +508,14 @@ def update_De(data): RtAP.buff.destroy() @PETSc.Log.EventDecorator("FDMCoefficients") - def assemble_coef(self, J, form_compiler_parameters): + def assemble_coefficients(self, J, fcp): """ Obtain coefficients for the auxiliary operator as the diagonal of a weighted mass matrix in broken(V^k) * broken(V^{k+1}). See Section 3.2 of Brubeck2022b. :arg J: the Jacobian bilinear :class:`ufl.Form`, - :arg form_compiler_parameters: a `dict` with tsfc parameters. + :arg fcp: form compiler parameters to assemble the diagonal matrices. :returns: a 2-tuple of a `dict` with the zero-th order and second order coefficients keyed on ``"beta"`` and ``"alpha"``, @@ -600,7 +600,7 @@ def assemble_coef(self, J, form_compiler_parameters): from firedrake.assemble import assemble if block_diagonal and V.shape: M = assemble(mixed_form, mat_type="matfree", - form_compiler_parameters=form_compiler_parameters) + form_compiler_parameters=fcp) coefficients = {} assembly_callables = [] for iset, name in zip(Z.dof_dset.field_ises, ("beta", "alpha")): @@ -612,7 +612,7 @@ def assemble_coef(self, J, form_compiler_parameters): tensor = Function(Z) coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} assembly_callables = [partial(assemble, mixed_form, tensor=tensor, diagonal=True, - form_compiler_parameters=form_compiler_parameters)] + form_compiler_parameters=fcp)] return cache.setdefault(key, (coefficients, assembly_callables)) @PETSc.Log.EventDecorator("FDMRefTensor") @@ -1207,7 +1207,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): # assemble zero-th order term separately, including off-diagonals (mixed components) # I cannot do this for hdiv elements as off-diagonals are not sparse, this is because - # the FDM eigenbases for CG(k) and DG(k-1) are not orthogonal to each other + # the FDM eigenbases for CG(k) and CG(k-1) are not orthogonal to each other rindices = None use_diag_Bq = Bq is None or len(Bq.ufl_shape) != 2 or static_condensation if not use_diag_Bq: @@ -1246,10 +1246,10 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): # get second order coefficient on this cell if Gq is not None: - mue.flat[:] = numpy.sum(Gq.dat.data_ro[je], axis=0) + numpy.sum(Gq.dat.data_ro[je], axis=0, out=mue) # get zero-th order coefficient on this cell if Bq is not None: - bqe.flat[:] = numpy.sum(Bq.dat.data_ro[je], axis=0) + numpy.sum(Bq.dat.data_ro[je], axis=0, out=bqe) for k in range(ncomp): # permutation of axes with respect to the first vector component @@ -1382,7 +1382,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): Ae.destroy() 
@PETSc.Log.EventDecorator("FDMCoefficients") - def assemble_coef(self, J, form_compiler_parameters): + def assemble_coefficients(self, J, fcp): from firedrake.assemble import assemble coefficients = {} assembly_callables = [] @@ -1399,11 +1399,9 @@ def assemble_coef(self, J, form_compiler_parameters): except TypeError: pass quad_deg = 2*degree+1 - quad_deg = (form_compiler_parameters or {}).get("degree", quad_deg) - dx = ufl.dx(degree=quad_deg) - + quad_deg = fcp.get("degree", quad_deg) + dx = ufl.dx(degree=quad_deg, domain=mesh) family = "Discontinuous Lagrange" if tdim == 1 else "DQ" - degree = 0 # extract coefficients directly from the bilinear form integrals_J = J.integrals_by_type("cell") @@ -1435,22 +1433,19 @@ def assemble_coef(self, J, form_compiler_parameters): beta = ufl.replace(beta, {dummy_Piola: Piola}) # discard mixed derivatives and mixed components - G = alpha - if len(G.ufl_shape) == 2: - G = ufl.diag_vector(G) + if len(alpha.ufl_shape) == 2: + alpha = ufl.diag_vector(alpha) else: - Gshape = G.ufl_shape - Gshape = Gshape[:len(Gshape)//2] - G = ufl.as_tensor(numpy.reshape([G[i+i] for i in numpy.ndindex(Gshape)], (Gshape[0], -1))) - Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default", shape=G.ufl_shape) + ashape = alpha.ufl_shape + ashape = ashape[:len(ashape)//2] + alpha = ufl.as_tensor(numpy.reshape([alpha[i+i] for i in numpy.ndindex(ashape)], (ashape[0], -1))) + Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=0, shape=alpha.ufl_shape) # assemble second order coefficient if not isinstance(alpha, ufl.constantvalue.Zero): Q = FunctionSpace(mesh, Qe) - q = TestFunction(Q) - Gq = Function(Q) - coefficients["alpha"] = Gq - assembly_callables.append(partial(assemble, ufl.inner(G, q)*dx, Gq)) + tensor = coefficients.setdefault("alpha", Function(Q)) + assembly_callables.append(partial(assemble, ufl.inner(TestFunction(Q), alpha)*dx, tensor)) # assemble zero-th order coefficient if not isinstance(beta, ufl.constantvalue.Zero): @@ -1458,42 +1453,36 @@ def assemble_coef(self, J, form_compiler_parameters): # keep diagonal beta = ufl.diag_vector(beta) shape = beta.ufl_shape - Qe = ufl.FiniteElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default") + Qe = ufl.FiniteElement(family, mesh.ufl_cell(), degree=0) if shape: Qe = ufl.TensorElement(Qe, shape=shape) Q = FunctionSpace(mesh, Qe) - q = TestFunction(Q) - Bq = Function(Q) - coefficients["beta"] = Bq - assembly_callables.append(partial(assemble, ufl.inner(beta, q)*dx, Bq)) + tensor = coefficients.setdefault("beta", Function(Q)) + assembly_callables.append(partial(assemble, ufl.inner(TestFunction(Q), beta)*dx, tensor)) if Piola: # make DGT functions with the second order coefficient # and the Piola tensor for each side of each facet extruded = mesh.cell_set._extruded dS_int = ufl.dS_h(degree=quad_deg) + ufl.dS_v(degree=quad_deg) if extruded else ufl.dS(degree=quad_deg) - ele = ufl.BrokenElement(ufl.FiniteElement("DGT", mesh.ufl_cell(), 0)) - area = ufl.FacetArea(mesh) + ifacet_inner = lambda v, u: ((ufl.inner(v('+'), u('+')) + ufl.inner(v('-'), u('-')))/ufl.FacetArea(mesh))*dS_int replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)} alpha = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad), ref_grad[0]), ref_grad[1]) for i in integrals_J])) - vol = abs(ufl.JacobianDeterminant(mesh)) - G = vol * alpha + G = alpha G = ufl.as_tensor([[[G[i, k, j, k] for i in range(G.ufl_shape[0])] for j in range(G.ufl_shape[2])] for k in 
range(G.ufl_shape[3])]) + G = G * abs(ufl.JacobianDeterminant(mesh)) + ele = ufl.BrokenElement(ufl.FiniteElement("DGT", cell=mesh.ufl_cell(), degree=0)) Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=G.ufl_shape)) - q = TestFunction(Q) - Gq_facet = Function(Q) - coefficients["Gq_facet"] = Gq_facet - assembly_callables.append(partial(assemble, ((ufl.inner(q('+'), G('+')) + ufl.inner(q('-'), G('-')))/area)*dS_int, Gq_facet)) + tensor = coefficients.setdefault("Gq_facet", Function(Q)) + assembly_callables.append(partial(assemble, ifacet_inner(TestFunction(Q), G), tensor)) PT = Piola.T Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=PT.ufl_shape)) - q = TestFunction(Q) - PT_facet = Function(Q) - coefficients["PT_facet"] = PT_facet - assembly_callables.append(partial(assemble, ((ufl.inner(q('+'), PT('+')) + ufl.inner(q('-'), PT('-')))/area)*dS_int, PT_facet)) + tensor = coefficients.setdefault("PT_facet", Function(Q)) + assembly_callables.append(partial(assemble, ifacet_inner(TestFunction(Q), PT), tensor)) # make DGT functions with BC flags rvs = V.ufl_element().reference_value_shape() @@ -1525,8 +1514,8 @@ def assemble_coef(self, J, form_compiler_parameters): if len(forms): form = sum(forms) if len(form.arguments()) == 1: - assembly_callables.append(partial(assemble, form, bcflags)) coefficients["bcflags"] = bcflags + assembly_callables.append(partial(assemble, form, bcflags)) # set arbitrary non-zero coefficients for preallocation for coef in coefficients.values(): From ab144a25e72886dda15ff4c4af7a49412bce4492 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Sat, 25 Mar 2023 09:28:54 +0000 Subject: [PATCH 45/75] use OneFormAssembler, compute reference tensor via dual evaluation --- firedrake/preconditioners/fdm.py | 50 ++++++++++++++------------------ 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 09c7cc730a..7e410a6411 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -508,14 +508,15 @@ def update_De(data): RtAP.buff.destroy() @PETSc.Log.EventDecorator("FDMCoefficients") - def assemble_coefficients(self, J, fcp): + def assemble_coefficients(self, J, fcp, block_diagonal=True): """ Obtain coefficients for the auxiliary operator as the diagonal of a weighted mass matrix in broken(V^k) * broken(V^{k+1}). See Section 3.2 of Brubeck2022b. :arg J: the Jacobian bilinear :class:`ufl.Form`, - :arg fcp: form compiler parameters to assemble the diagonal matrices. + :arg fcp: form compiler parameters to assemble the diagonal of the mass matrices. + :arg block_diagonal: are we assembling the block diagonal of the mass matrices? 
:returns: a 2-tuple of a `dict` with the zero-th order and second order coefficients keyed on ``"beta"`` and ``"alpha"``, @@ -593,12 +594,11 @@ def assemble_coefficients(self, J, fcp): # Return coefficients and assembly callables, and cache them class key = (mixed_form.signature(), mesh) cache = self._cache.setdefault("coefficients", {}) - block_diagonal = True try: return cache[key] except KeyError: - from firedrake.assemble import assemble if block_diagonal and V.shape: + from firedrake.assemble import assemble M = assemble(mixed_form, mat_type="matfree", form_compiler_parameters=fcp) coefficients = {} @@ -609,10 +609,11 @@ def assemble_coefficients(self, J, fcp): coefficients[name] = ctx._block_diagonal assembly_callables.append(ctx._assemble_block_diagonal) else: + from firedrake.assemble import OneFormAssembler tensor = Function(Z) coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} - assembly_callables = [partial(assemble, mixed_form, tensor=tensor, diagonal=True, - form_compiler_parameters=fcp)] + assembly_callables = [OneFormAssembler(mixed_form, tensor=tensor, diagonal=True, + form_compiler_parameters=fcp).assemble] return cache.setdefault(key, (coefficients, assembly_callables)) @PETSc.Log.EventDecorator("FDMRefTensor") @@ -657,24 +658,10 @@ def assemble_reference_tensor(self, V): e1 = elements[-1] if elements[-1].formdegree == 1 else FIAT.FDMDiscontinuousLagrange(ref_el, degree-1) if is_interior: e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") - if hasattr(eq.dual, "rule"): - rule = eq.dual.rule - else: - rule = FIAT.quadrature.make_quadrature(ref_el, degree+1) - - pts = rule.get_points() - wts = rule.get_weights() - - phiq = eq.tabulate(0, pts) - phi1 = e1.tabulate(0, pts) - phi0 = e0.tabulate(1, pts) - moments = lambda v, u: numpy.dot(numpy.multiply(v, wts), u.T) - A00 = moments(phiq[(0, )], phi0[(0, )]) - A11 = moments(phi1[(0, )], phi1[(0, )]) - A10 = moments(phi1[(0, )], phi0[(1, )]) - A10 = numpy.linalg.solve(A11, A10) - A11 = numpy.eye(A11.shape[0]) + A00 = fiat_reference_prolongator(e0, eq) + A10 = fiat_reference_prolongator(e0, e1, derivative=True) + A11 = numpy.eye(e1.space_dimension(), dtype=A00.dtype) B_blocks = mass_blocks(tdim, formdegree, A00, A11) A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) @@ -1383,7 +1370,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coefficients(self, J, fcp): - from firedrake.assemble import assemble + from firedrake.assemble import OneFormAssembler coefficients = {} assembly_callables = [] @@ -1445,7 +1432,8 @@ def assemble_coefficients(self, J, fcp): if not isinstance(alpha, ufl.constantvalue.Zero): Q = FunctionSpace(mesh, Qe) tensor = coefficients.setdefault("alpha", Function(Q)) - assembly_callables.append(partial(assemble, ufl.inner(TestFunction(Q), alpha)*dx, tensor)) + assembly_callables.append(OneFormAssembler(ufl.inner(TestFunction(Q), alpha)*dx, tensor=tensor, + form_compiler_parameters=fcp).assemble) # assemble zero-th order coefficient if not isinstance(beta, ufl.constantvalue.Zero): @@ -1458,7 +1446,8 @@ def assemble_coefficients(self, J, fcp): Qe = ufl.TensorElement(Qe, shape=shape) Q = FunctionSpace(mesh, Qe) tensor = coefficients.setdefault("beta", Function(Q)) - assembly_callables.append(partial(assemble, ufl.inner(TestFunction(Q), beta)*dx, tensor)) + assembly_callables.append(OneFormAssembler(ufl.inner(TestFunction(Q), beta)*dx, tensor=tensor, + form_compiler_parameters=fcp).assemble) if Piola: # make DGT functions 
with the second order coefficient @@ -1477,12 +1466,14 @@ def assemble_coefficients(self, J, fcp): ele = ufl.BrokenElement(ufl.FiniteElement("DGT", cell=mesh.ufl_cell(), degree=0)) Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=G.ufl_shape)) tensor = coefficients.setdefault("Gq_facet", Function(Q)) - assembly_callables.append(partial(assemble, ifacet_inner(TestFunction(Q), G), tensor)) + assembly_callables.append(OneFormAssembler(ifacet_inner(TestFunction(Q), G), tensor=tensor, + form_compiler_parameters=fcp).assemble) PT = Piola.T Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=PT.ufl_shape)) tensor = coefficients.setdefault("PT_facet", Function(Q)) - assembly_callables.append(partial(assemble, ifacet_inner(TestFunction(Q), PT), tensor)) + assembly_callables.append(OneFormAssembler(ifacet_inner(TestFunction(Q), PT), tensor=tensor, + form_compiler_parameters=fcp).assemble) # make DGT functions with BC flags rvs = V.ufl_element().reference_value_shape() @@ -1515,7 +1506,8 @@ def assemble_coefficients(self, J, fcp): form = sum(forms) if len(form.arguments()) == 1: coefficients["bcflags"] = bcflags - assembly_callables.append(partial(assemble, form, bcflags)) + assembly_callables.append(OneFormAssembler(form, tensor=bcflags, + form_compiler_parameters=fcp).assemble) # set arbitrary non-zero coefficients for preallocation for coef in coefficients.values(): From bfcdbb885584f788f1c0efad05dcaf4c7ea6d565 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Sat, 25 Mar 2023 13:05:51 +0000 Subject: [PATCH 46/75] optimise assembly --- firedrake/preconditioners/fdm.py | 88 +++++++++++++++++++------------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 7e410a6411..8aa109c225 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -238,14 +238,17 @@ def assemble_fdm_op(self, V, J, bcs, fcp, pmat_type, use_static_condensation): dofs = numpy.arange(value_size * Vbig.finat_element.space_dimension(), dtype=fdofs.dtype) idofs = numpy.setdiff1d(dofs, fdofs, assume_unique=True) self.ises = tuple(PETSc.IS().createGeneral(indices, comm=PETSc.COMM_SELF) for indices in (idofs, fdofs)) - self.submats = [None for _ in range(7)] + self.submats = [None for _ in range(8)] self.reference_tensor_on_diag = {} self.get_static_condensation = {} if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block + diagonal_interior = Vfacet.finat_element.formdegree == 0 and value_size == 1 + factor = factor_diagonal_mat if diagonal_interior else factor_block_diagonal_mat self.reference_tensor_on_diag[Vfacet] = self.assemble_reference_tensor(Vbig) - self.get_static_condensation[Vfacet] = lambda A: condense_element_mat(A, self.ises[0], self.ises[1], self.submats) + self.get_static_condensation[Vfacet] = lambda A: condense_element_mat(A, self.ises[0], self.ises[1], + self.submats, factor) elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern @@ -272,23 +275,24 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): own = Vsub.dof_dset.layout_vec.getLocalSize() bdofs = numpy.nonzero(lgmap.indices[:own] < 0)[0].astype(PETSc.IntType) bc_rows[Vsub] = Vsub.dof_dset.lgmap.apply(bdofs, result=bdofs) + self.nel = nel coefficients, assembly_callables = self.assemble_coefficients(J, fcp) - coeffs = [coefficients.get(k) for k in ("beta", "alpha")] - cmaps = 
[extrude_node_map(ck.cell_node_map())[0] for ck in coeffs] + coeffs = [coefficients.get(name) for name in ("beta", "alpha")] + cdata = [c.dat.data_ro for c in coeffs] + cmaps = [extrude_node_map(c.cell_node_map())[0] for c in coeffs] + cindices = [cmap(0) if self.nel else None for cmap in cmaps] @PETSc.Log.EventDecorator("FDMGetCoeffs") def get_coeffs(e, result=None): # Get vector for betas and alphas on a cell - vals = [] - for k, (coeff, cmap) in enumerate(zip(coeffs, cmaps)): - get_coeffs.indices[k] = cmap(e, result=get_coeffs.indices[k]) - vals.append(coeff.dat.data_ro[get_coeffs.indices[k]]) - return numpy.concatenate(vals, out=result) - get_coeffs.indices = [None for _ in range(len(coeffs))] - self.get_coeffs = get_coeffs + if result is None: + return numpy.concatenate([c[cmap(e, result=idx)] for c, cmap, idx in zip(cdata, cmaps, cindices)], out=result) + numpy.take(cdata[0], cmaps[0](e, result=cindices[0]), axis=0, out=result[:cindices[0].size]) + numpy.take(cdata[1], cmaps[1](e, result=cindices[1]), axis=0, out=result[cindices[0].size:]) + return result - self.nel = nel + self.get_coeffs = get_coeffs self.work_mats = {} Pmats = {} @@ -446,28 +450,28 @@ def RtAP(R, A, P, result=None): if A.getType() != PETSc.Mat.Type.PREALLOCATOR: Ae = self.work_mats[Vrow, Vcol] De = self.work_mats[common_key] - data = self.work_csr[2] insert = PETSc.InsertMode.INSERT work_vec = De.getDiagonal() + data = self.work_csr[2] if len(data.shape) == 3: @PETSc.Log.EventDecorator("FDMUpdateDiag") - def update_De(data): + def update_De(): De.setValuesCSR(*self.work_csr, addv=insert) De.assemble() return De else: @PETSc.Log.EventDecorator("FDMUpdateDiag") - def update_De(data): - work_vec.setArray(data) + def update_De(): De.setDiagonal(work_vec, addv=insert) return De + data = work_vec.array_w # Core assembly loop for e in range(self.nel): rindices = get_rindices(e, result=rindices) cindices = get_cindices(e, result=cindices) data = self.get_coeffs(e, result=data) - Ae = assemble_element_mat(update_De(data), result=Ae) + Ae = assemble_element_mat(update_De(), result=Ae) update_A(condense_element_mat(Ae), rindices, cindices) work_vec.destroy() @@ -682,13 +686,26 @@ def assemble_reference_tensor(self, V): return cache.setdefault(key, result) -def factor_interior_mat(A00): +@PETSc.Log.EventDecorator("FDMFactor") +def factor_diagonal_mat(A, work_vec=None): """ - Used in static condensation. Take in A00 on a cell, return its Cholesky + Used in static condensation. Take in A on a cell, return its Cholesky + factorisation. + """ + work_vec = A.getDiagonal(result=work_vec) + work_vec.reciprocal() + work_vec.sqrtabs() + A.setDiagonal(work_vec) + + +@PETSc.Log.EventDecorator("FDMFactor") +def factor_block_diagonal_mat(A, work_vec=None): + """ + Used in static condensation. Take in A on a cell, return its Cholesky factorisation. Assumes that interior DOF have been reordered to make A00 block diagonal with blocks of increasing dimension. 
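+
+    Roughly, each diagonal block is overwritten in place by the inverse of its
+    Cholesky factor (dense sketch of the CSR loop below, small blocks assumed)::
+
+        block[:] = numpy.linalg.inv(numpy.linalg.cholesky(block))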
""" - indptr, indices, data = A00.getValuesCSR() + indptr, indices, data = A.getValuesCSR() degree = numpy.diff(indptr) # TODO handle non-symmetric case with LU, requires scipy @@ -703,26 +720,25 @@ def factor_interior_mat(A00): zlice = slice(zlice.stop, zlice.stop + k*nblocks) data[zlice] = invchol(data[zlice].reshape((-1, k, k))).reshape((-1,)) flops += nblocks * (((k+1)**3 + 5*(k+1)-12)//3 + k**3) - + A.setValuesCSR(indptr, indices, data) + A.assemble() PETSc.Log.logFlops(flops) - A00.setValuesCSR(indptr, indices, data) - A00.assemble() @PETSc.Log.EventDecorator("FDMCondense") -def condense_element_mat(A, i0, i1, submats): +def condense_element_mat(A, i0, i1, submats, factor): """Return the Schur complement associated to indices in i1, condensing i0 out""" isrows = [i0, i0, i1, i1] iscols = [i0, i1, i0, i1] - structure = PETSc.Mat.Structure.SUBSET if submats[6] else None + structure = PETSc.Mat.Structure.SUBSET if submats[7] else None submats[:4] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:4] if submats[0] else None) A00, A01, A10, A11 = submats[:4] - factor_interior_mat(A00) - submats[4] = A00.matMult(A01, result=submats[4]) - submats[5] = A10.matTransposeMult(A00, result=submats[5]) - submats[6] = submats[5].matMult(submats[4], result=submats[6]) - submats[6].aypx(-1.0, A11, structure=structure) - return submats[6] + factor(A00, submats[4]) + submats[5] = A00.matMult(A01, result=submats[5]) + submats[6] = A10.matTransposeMult(A00, result=submats[6]) + submats[7] = submats[6].matMult(submats[5], result=submats[7]) + submats[7].aypx(-1.0, A11, structure=structure) + return submats[7] @PETSc.Log.EventDecorator("FDMCondense") @@ -730,14 +746,14 @@ def condense_element_pattern(A, i0, i1, submats): """Add zeroes on the statically condensed pattern so that you can run ICC(0)""" isrows = [i0, i0, i1] iscols = [i0, i1, i0] - structure = PETSc.Mat.Structure.SUBSET if submats[6] else None + structure = PETSc.Mat.Structure.SUBSET if submats[7] else None submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) A00, A01, A10 = submats[:3] - submats[4] = A10.matTransposeMult(A00, result=submats[4]) submats[5] = A00.matMult(A01, result=submats[5]) - submats[6] = submats[4].matMult(submats[5], result=submats[6]) - submats[6].aypx(0.0, A, structure=structure) - return submats[6] + submats[6] = A10.matTransposeMult(A00, result=submats[6]) + submats[7] = submats[6].matMult(submats[5], result=submats[7]) + submats[7].aypx(0.0, A, structure=structure) + return submats[7] @PETSc.Log.EventDecorator("LoadCode") From 1ff89011da797ef6f06ca8f84b1a2731d26d76a5 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Sun, 26 Mar 2023 17:05:23 +0100 Subject: [PATCH 47/75] compute Schur compelment via block QR and block SVD --- firedrake/preconditioners/fdm.py | 175 ++++++++++++++++++++++++------- 1 file changed, 138 insertions(+), 37 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 8aa109c225..73e5dfc0e3 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -245,10 +245,10 @@ def assemble_fdm_op(self, V, J, bcs, fcp, pmat_type, use_static_condensation): if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block diagonal_interior = Vfacet.finat_element.formdegree == 0 and value_size == 1 - factor = factor_diagonal_mat if diagonal_interior else factor_block_diagonal_mat + get_schur = schur_complement_diagonal if 
diagonal_interior else schur_complement_block_qr self.reference_tensor_on_diag[Vfacet] = self.assemble_reference_tensor(Vbig) self.get_static_condensation[Vfacet] = lambda A: condense_element_mat(A, self.ises[0], self.ises[1], - self.submats, factor) + self.submats, get_schur) elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern @@ -426,7 +426,7 @@ def RtAP(R, A, P, result=None): if Vrow == Vcol: get_cindices = lambda e, result=None: result update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, rindices, addv) - # moments of orthogonalized basis against basis tabulation and derivative tabulation + # interpolator of basis and exterior derivative onto broken spaces rtensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) # element matrix obtained via Equation (3.9) of Brubeck2022b assemble_element_mat = lambda De, result=None: De.PtAP(rtensor, result=result) @@ -628,8 +628,8 @@ def assemble_reference_tensor(self, V): :arg V: a :class:`.FunctionSpace` - :returns: a :class:`PETSc.Mat` with the moments of orthogonalized bases - against the basis and its exterior derivative. + :returns: a :class:`PETSc.Mat` interpolating V^k * d(V^k) onto + broken(V^k) * broken(V^{k+1}) on the reference element. """ tdim = V.mesh().topological_dimension() value_size = V.value_size @@ -686,59 +686,160 @@ def assemble_reference_tensor(self, V): return cache.setdefault(key, result) -@PETSc.Log.EventDecorator("FDMFactor") -def factor_diagonal_mat(A, work_vec=None): +@PETSc.Log.EventDecorator("FDMGetSchur") +def schur_complement_diagonal(submats): """ - Used in static condensation. Take in A on a cell, return its Cholesky - factorisation. + Used in static condensation. Take in blocks A00, A01, A10, A11, + return the Schur complement A11 - A10 * inv(A00) * A01. + + Assumes A00 is diagonal. """ - work_vec = A.getDiagonal(result=work_vec) - work_vec.reciprocal() - work_vec.sqrtabs() - A.setDiagonal(work_vec) + structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + A00, A01, A10, A11 = submats[:4] + submats[4] = A00.getDiagonal(result=submats[4]) + submats[4].reciprocal() + submats[4].scale(-1) + A01.diagonalScale(L=submats[4]) + submats[-1] = A10.matMult(A01, result=submats[-1]) + submats[-1].axpy(1.0, A11, structure=structure) + return submats[-1] -@PETSc.Log.EventDecorator("FDMFactor") -def factor_block_diagonal_mat(A, work_vec=None): +@PETSc.Log.EventDecorator("FDMGetSchur") +def schur_complement_block_cholesky(submats): """ - Used in static condensation. Take in A on a cell, return its Cholesky - factorisation. Assumes that interior DOF have been reordered to make A00 + Used in static condensation. Take in blocks A00, A01, A10, A11, + return A11 - A10 * inv(A00) * A01. + + Assumes that interior DOFs have been reordered to make A00 block diagonal with blocks of increasing dimension. 
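+
+    A dense sketch of the update performed here (illustration only, small
+    symmetric positive-definite blocks assumed)::
+
+        R = numpy.linalg.inv(numpy.linalg.cholesky(A00))
+        S = A11 - (A10 @ R.T) @ (R @ A01)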
""" - indptr, indices, data = A.getValuesCSR() + structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + A00, A01, A10, A11 = submats[:4] + indptr, indices, R = A00.getValuesCSR() degree = numpy.diff(indptr) - # TODO handle non-symmetric case with LU, requires scipy - invchol = lambda X: numpy.linalg.inv(numpy.linalg.cholesky(X)) nblocks = numpy.count_nonzero(degree == 1) zlice = slice(0, nblocks) - numpy.sqrt(data[zlice], out=data[zlice]) - numpy.reciprocal(data[zlice], out=data[zlice]) - flops = nblocks * 2 + numpy.sqrt(R[zlice], out=R[zlice]) + numpy.reciprocal(R[zlice], out=R[zlice]) + flops = 2*nblocks for k in range(2, degree[-1]+1): nblocks = numpy.count_nonzero(degree == k) zlice = slice(zlice.stop, zlice.stop + k*nblocks) - data[zlice] = invchol(data[zlice].reshape((-1, k, k))).reshape((-1,)) - flops += nblocks * (((k+1)**3 + 5*(k+1)-12)//3 + k**3) - A.setValuesCSR(indptr, indices, data) - A.assemble() + A = R[zlice].reshape((-1, k, k)) + R[zlice] = numpy.linalg.inv(numpy.linalg.cholesky(A)).reshape((-1)) + flops += nblocks * ((k**3)//3 + k**3) + PETSc.Log.logFlops(flops) + A00.setValuesCSR(indptr, indices, R) + A00.assemble() + submats[4] = A10.matTransposeMult(A00, result=submats[4]) + submats[5] = A00.matMult(A01, result=submats[5]) + submats[-1] = submats[4].matMult(submats[5], result=submats[-1]) + submats[-1].aypx(-1.0, A11, structure=structure) + return submats[-1] + + +@PETSc.Log.EventDecorator("FDMGetSchur") +def schur_complement_block_qr(submats): + """ + Used in static condensation. Take in blocks A00, A01, A10, A11, + return A11 - A10 * inv(A00) * A01. + + Assumes that interior DOFs have been reordered to make A00 + block diagonal with blocks of increasing dimension. + """ + structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + A00, A01, A10, A11 = submats[:4] + indptr, indices, R = A00.getValuesCSR() + degree = numpy.diff(indptr) + Q = numpy.ones(R.shape, dtype=R.dtype) + + nblocks = numpy.count_nonzero(degree == 1) + zlice = slice(0, nblocks) + numpy.reciprocal(R[zlice], out=R[zlice]) + flops = nblocks + for k in range(2, degree[-1]+1): + nblocks = numpy.count_nonzero(degree == k) + zlice = slice(zlice.stop, zlice.stop + k*nblocks) + A = R[zlice].reshape((-1, k, k)) + q, r = numpy.linalg.qr(A, mode="complete") + R[zlice] = numpy.linalg.inv(r).reshape((-1,)) + Q[zlice] = q.reshape((-1,)) + flops += nblocks * ((4*k**3)//3 + k**3) + + PETSc.Log.logFlops(flops) + A00.setValuesCSR(indptr, indices, R) + A00.assemble() + submats[4] = A10.matMult(A00, result=submats[4]) + A00.setValuesCSR(indptr, indices, Q) + A00.assemble() + submats[5] = A00.transposeMatMult(A01, result=submats[5]) + submats[-1] = submats[4].matMult(submats[5], result=submats[-1]) + submats[-1].aypx(-1.0, A11, structure=structure) + return submats[-1] + + +@PETSc.Log.EventDecorator("FDMGetSchur") +def schur_complement_block_svd(submats): + """ + Used in static condensation. Take in blocks A00, A01, A10, A11, + return A11 - A10 * inv(A00) * A01. + + Assumes that interior DOFs have been reordered to make A00 + block diagonal with blocks of increasing dimension. 
+ """ + structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + A00, A01, A10, A11 = submats[:4] + indptr, indices, U = A00.getValuesCSR() + degree = numpy.diff(indptr) + V = numpy.ones(U.shape, dtype=U.dtype) + submats[4] = A00.getDiagonal(result=submats[4]) + D = submats[4] + + nblocks = numpy.count_nonzero(degree == 1) + bslice = slice(0, nblocks) + dslice = slice(0, nblocks) + numpy.sign(D.array_r[dslice], out=U[bslice]) + + flops = nblocks + for k in range(2, degree[-1]+1): + nblocks = numpy.count_nonzero(degree == k) + bslice = slice(bslice.stop, bslice.stop + k*nblocks) + dslice = slice(dslice.stop, dslice.stop + nblocks) + A = U[bslice].reshape((-1, k, k)) + + u, s, v = numpy.linalg.svd(A, full_matrices=False) + D.array_w[dslice] = s.reshape((-1,)) + U[bslice] = u.reshape((-1,)) + V[bslice] = v.reshape((-1,)) + flops += nblocks * ((4*k**3)//3 + 4*k**3) + + PETSc.Log.logFlops(flops) + + A00.setValuesCSR(indptr, indices, V) + A00.assemble() + D.sqrtabs() + D.reciprocal() + A00.diagonalScale(L=D) + submats[5] = A10.matTransposeMult(A00, result=submats[5]) + A00.setValuesCSR(indptr, indices, U) + A00.assemble() + A00.diagonalScale(R=D) + submats[6] = A00.transposeMatMult(A01, result=submats[6]) + submats[-1] = submats[5].matMult(submats[6], result=submats[-1]) + submats[-1].aypx(-1.0, A11, structure=structure) + return submats[-1] @PETSc.Log.EventDecorator("FDMCondense") -def condense_element_mat(A, i0, i1, submats, factor): +def condense_element_mat(A, i0, i1, submats, get_schur_complement): """Return the Schur complement associated to indices in i1, condensing i0 out""" isrows = [i0, i0, i1, i1] iscols = [i0, i1, i0, i1] - structure = PETSc.Mat.Structure.SUBSET if submats[7] else None submats[:4] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:4] if submats[0] else None) - A00, A01, A10, A11 = submats[:4] - factor(A00, submats[4]) - submats[5] = A00.matMult(A01, result=submats[5]) - submats[6] = A10.matTransposeMult(A00, result=submats[6]) - submats[7] = submats[6].matMult(submats[5], result=submats[7]) - submats[7].aypx(-1.0, A11, structure=structure) - return submats[7] + return get_schur_complement(submats) @PETSc.Log.EventDecorator("FDMCondense") @@ -750,7 +851,7 @@ def condense_element_pattern(A, i0, i1, submats): submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) A00, A01, A10 = submats[:3] submats[5] = A00.matMult(A01, result=submats[5]) - submats[6] = A10.matTransposeMult(A00, result=submats[6]) + submats[6] = A10.matMult(A00, result=submats[6]) submats[7] = submats[6].matMult(submats[5], result=submats[7]) submats[7].aypx(0.0, A, structure=structure) return submats[7] From dd7b2de8b4a50dffe96afb6e9941e84651405c77 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 28 Mar 2023 12:43:55 +0100 Subject: [PATCH 48/75] use triple matrix product --- firedrake/preconditioners/fdm.py | 123 ++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 45 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 73e5dfc0e3..271a392184 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -416,28 +416,24 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): :arg addv: a `PETSc.Mat.InsertMode` :arg triu: are we assembling only the upper triangular part? 
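+
+        On each cell the element matrix is formed as R^T * D * C, where D
+        carries the cell coefficients and R, C are the row and column
+        reference tensors (Equation (3.9) of Brubeck2022b); it is then
+        optionally condensed before being added into A.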
""" - def RtAP(R, A, P, result=None): - RtAP.buff = A.matMult(P, result=RtAP.buff) - return R.transposeMatMult(RtAP.buff, result=result) - RtAP.buff = None - set_submat = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) get_rindices = self.cell_to_global[Vrow] if Vrow == Vcol: + condense_element_mat = self.get_static_condensation.get(Vrow) get_cindices = lambda e, result=None: result update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, rindices, addv) - # interpolator of basis and exterior derivative onto broken spaces - rtensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) - # element matrix obtained via Equation (3.9) of Brubeck2022b - assemble_element_mat = lambda De, result=None: De.PtAP(rtensor, result=result) - condense_element_mat = self.get_static_condensation.get(Vrow) + # interpolators of basis and exterior derivative onto broken spaces + ctensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) + rtensor = PETSc.Mat().createTranspose(ctensor).convert(ctensor.getType()) else: + condense_element_mat = None get_cindices = self.cell_to_global[Vcol] update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, cindices, addv) - rtensor = self.assemble_reference_tensor(Vrow) ctensor = self.assemble_reference_tensor(Vcol) - assemble_element_mat = lambda De, result=None: RtAP(rtensor, De, ctensor, result=result) - condense_element_mat = None + rtensor = self.assemble_reference_tensor(Vrow, transpose=True) + + # element matrix obtained via Equation (3.9) of Brubeck2022b + assemble_element_mat = lambda De, result=None: rtensor.matMatMult(De, ctensor, result=result) do_sort = True if condense_element_mat is None: @@ -508,8 +504,8 @@ def update_De(): self.work_csr = (None, None, None) self.work_mats[common_key] = None self.work_mats[Vrow, Vcol] = None - if RtAP.buff: - RtAP.buff.destroy() + if Vcol == Vrow: + rtensor.destroy() @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coefficients(self, J, fcp, block_diagonal=True): @@ -621,7 +617,7 @@ def assemble_coefficients(self, J, fcp, block_diagonal=True): return cache.setdefault(key, (coefficients, assembly_callables)) @PETSc.Log.EventDecorator("FDMRefTensor") - def assemble_reference_tensor(self, V): + def assemble_reference_tensor(self, V, transpose=False): """ Return the reference tensor used in the diagonal factorisation of the sparse cell matrices. See Section 3.2 of Brubeck2022b. 
@@ -642,12 +638,17 @@ def assemble_reference_tensor(self, V): if formdegree == tdim: degree = degree + 1 is_interior, is_facet = is_restricted(V.finat_element) - key = (degree, tdim, formdegree, value_size, is_interior, is_facet) + key = (degree, tdim, formdegree, value_size, is_interior, is_facet, transpose) cache = self._cache.setdefault("reference_tensor", {}) try: return cache[key] except KeyError: - full_key = (degree, tdim, formdegree, value_size, False, False) + if transpose: + result = self.assemble_reference_tensor(V, transpose=False) + result = PETSc.Mat().createTranspose(result).convert(result.getType()) + return cache.setdefault(key, result) + + full_key = (degree, tdim, formdegree, value_size, False, False, False) if is_facet and full_key in cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) @@ -705,6 +706,40 @@ def schur_complement_diagonal(submats): return submats[-1] +@PETSc.Log.EventDecorator("FDMGetSchur") +def schur_complement_block_inv(submats): + """ + Used in static condensation. Take in blocks A00, A01, A10, A11, + return A11 - A10 * inv(A00) * A01. + + Assumes that interior DOFs have been reordered to make A00 + block diagonal with blocks of increasing dimension. + """ + structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + A00, A01, A10, A11 = submats[:4] + indptr, indices, R = A00.getValuesCSR() + degree = numpy.diff(indptr) + + nblocks = numpy.count_nonzero(degree == 1) + zlice = slice(0, nblocks) + numpy.reciprocal(R[zlice], out=R[zlice]) + flops = nblocks + for k in range(2, degree[-1]+1): + nblocks = numpy.count_nonzero(degree == k) + zlice = slice(zlice.stop, zlice.stop + k*nblocks) + A = R[zlice].reshape((-1, k, k)) + R[zlice] = numpy.linalg.inv(A).reshape((-1,)) + flops += nblocks * (k**3) + + PETSc.Log.logFlops(flops) + A00.setValuesCSR(indptr, indices, R) + A00.assemble() + A00.scale(-1.0) + submats[-1] = A10.matMatMult(A00, A01, result=submats[-1]) + submats[-1].axpy(1.0, A11, structure=structure) + return submats[-1] + + @PETSc.Log.EventDecorator("FDMGetSchur") def schur_complement_block_cholesky(submats): """ @@ -735,9 +770,9 @@ def schur_complement_block_cholesky(submats): A00.setValuesCSR(indptr, indices, R) A00.assemble() submats[4] = A10.matTransposeMult(A00, result=submats[4]) - submats[5] = A00.matMult(A01, result=submats[5]) - submats[-1] = submats[4].matMult(submats[5], result=submats[-1]) - submats[-1].aypx(-1.0, A11, structure=structure) + A00.scale(-1.0) + submats[-1] = submats[4].matMatMult(A00, A01, result=submats[-1]) + submats[-1].axpy(1.0, A11, structure=structure) return submats[-1] @@ -765,19 +800,19 @@ def schur_complement_block_qr(submats): zlice = slice(zlice.stop, zlice.stop + k*nblocks) A = R[zlice].reshape((-1, k, k)) q, r = numpy.linalg.qr(A, mode="complete") - R[zlice] = numpy.linalg.inv(r).reshape((-1,)) Q[zlice] = q.reshape((-1,)) + R[zlice] = numpy.linalg.inv(r).reshape((-1,)) flops += nblocks * ((4*k**3)//3 + k**3) PETSc.Log.logFlops(flops) - A00.setValuesCSR(indptr, indices, R) - A00.assemble() - submats[4] = A10.matMult(A00, result=submats[4]) A00.setValuesCSR(indptr, indices, Q) A00.assemble() - submats[5] = A00.transposeMatMult(A01, result=submats[5]) - submats[-1] = submats[4].matMult(submats[5], result=submats[-1]) - submats[-1].aypx(-1.0, A11, structure=structure) + submats[4] = A00.transposeMatMult(A01, result=submats[4]) + A00.setValuesCSR(indptr, indices, R) + A00.assemble() + A00.scale(-1.0) + submats[-1] = 
A10.matMatMult(A00, submats[4], result=submats[-1]) + submats[-1].axpy(1.0, A11, structure=structure) return submats[-1] @@ -812,24 +847,23 @@ def schur_complement_block_svd(submats): u, s, v = numpy.linalg.svd(A, full_matrices=False) D.array_w[dslice] = s.reshape((-1,)) - U[bslice] = u.reshape((-1,)) - V[bslice] = v.reshape((-1,)) + U[bslice] = numpy.transpose(u, axes=(0, 2, 1)).reshape((-1,)) + V[bslice] = numpy.transpose(v, axes=(0, 2, 1)).reshape((-1,)) flops += nblocks * ((4*k**3)//3 + 4*k**3) PETSc.Log.logFlops(flops) - - A00.setValuesCSR(indptr, indices, V) - A00.assemble() D.sqrtabs() D.reciprocal() - A00.diagonalScale(L=D) - submats[5] = A10.matTransposeMult(A00, result=submats[5]) - A00.setValuesCSR(indptr, indices, U) + A00.setValuesCSR(indptr, indices, V) A00.assemble() A00.diagonalScale(R=D) - submats[6] = A00.transposeMatMult(A01, result=submats[6]) - submats[-1] = submats[5].matMult(submats[6], result=submats[-1]) - submats[-1].aypx(-1.0, A11, structure=structure) + submats[5] = A10.matMult(A00, result=submats[5]) + D.scale(-1.0) + A00.setValuesCSR(indptr, indices, U) + A00.assemble() + A00.diagonalScale(L=D) + submats[-1] = submats[5].matMatMult(A00, A01, result=submats[-1]) + submats[-1].axpy(1.0, A11, structure=structure) return submats[-1] @@ -847,14 +881,13 @@ def condense_element_pattern(A, i0, i1, submats): """Add zeroes on the statically condensed pattern so that you can run ICC(0)""" isrows = [i0, i0, i1] iscols = [i0, i1, i0] - structure = PETSc.Mat.Structure.SUBSET if submats[7] else None + structure = PETSc.Mat.Structure.SUBSET if submats[3] else None submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) A00, A01, A10 = submats[:3] - submats[5] = A00.matMult(A01, result=submats[5]) - submats[6] = A10.matMult(A00, result=submats[6]) - submats[7] = submats[6].matMult(submats[5], result=submats[7]) - submats[7].aypx(0.0, A, structure=structure) - return submats[7] + A00.scale(0.0) + submats[3] = A10.matMatMult(A00, A01, result=submats[3]) + submats[3].axpy(1.0, A, structure=structure) + return submats[3] @PETSc.Log.EventDecorator("LoadCode") From 20f2db55a2b3070b3fd4f6eeaa834e920659d2cf Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 28 Mar 2023 17:52:31 +0100 Subject: [PATCH 49/75] remove unnecessary lru_caches in fdm.py --- firedrake/preconditioners/fdm.py | 51 +++++++++++++++----------------- firedrake/preconditioners/pmg.py | 17 +++++------ 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 271a392184..77c55cf1c1 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1,4 +1,4 @@ -from functools import partial, lru_cache +from functools import partial from itertools import product from firedrake.petsc import PETSc from firedrake.preconditioners.base import PCBase @@ -11,14 +11,13 @@ from firedrake.function import Function from firedrake.functionspace import FunctionSpace from firedrake.ufl_expr import TestFunction, TestFunctions, TrialFunctions - from firedrake_citations import Citations -from pyop2.compilation import load -from pyop2.utils import get_petsc_dir -from pyop2.sparsity import get_preallocation -from tsfc.finatinterface import create_element from ufl.algorithms.ad import expand_derivatives from ufl.algorithms.expand_indices import expand_indices +from tsfc.finatinterface import create_element +from pyop2.compilation import load +from pyop2.sparsity import get_preallocation 
+from pyop2.utils import get_petsc_dir import firedrake.dmhooks as dmhooks import ctypes @@ -464,8 +463,8 @@ def update_De(): # Core assembly loop for e in range(self.nel): - rindices = get_rindices(e, result=rindices) cindices = get_cindices(e, result=cindices) + rindices = get_rindices(e, result=rindices) data = self.get_coeffs(e, result=data) Ae = assemble_element_mat(update_De(), result=Ae) update_A(condense_element_mat(Ae), rindices, cindices) @@ -497,8 +496,8 @@ def update_De(): Se = condense_element_mat(Ae) for e in range(self.nel): - rindices = get_rindices(e, result=rindices) cindices = get_cindices(e, result=cindices) + rindices = get_rindices(e, result=rindices) update_A(Se, rindices, cindices) else: self.work_csr = (None, None, None) @@ -1286,8 +1285,14 @@ def assemble_reference_tensor(self, V): Afdm = [] # sparse interval mass and stiffness matrices for each direction Dfdm = [] # tabulation of normal derivatives at the boundary for each direction bdof = [] # indices of point evaluation dofs for each direction + cache = {} for e in line_elements: - Afdm[:0], Dfdm[:0], bdof[:0] = tuple(zip(fdm_setup_ipdg(e, eta))) + key = e.degree() + try: + rtensor = cache[key] + except KeyError: + rtensor = cache.setdefault(key, fdm_setup_ipdg(e, eta, comm=PETSc.COMM_SELF)) + Afdm[:0], Dfdm[:0], bdof[:0] = tuple(zip(rtensor)) if not is_dg and e.degree() == degree: # do not apply SIPG along continuous directions Dfdm[0] = None @@ -1436,8 +1441,8 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): eta = float(self.appctx.get("eta")) lgmap = self.lgmaps[V] - index_facet, local_facet_data, nfacets = get_interior_facet_maps(V) - index_coef, _, _ = get_interior_facet_maps(Gq_facet or Gq) + index_facet, local_facet_data, nfacets = extrude_interior_facet_maps(V) + index_coef, _, _ = extrude_interior_facet_maps(Gq_facet or Gq) rows = numpy.zeros((2, sdim), dtype=PETSc.IntType) for e in range(nfacets): @@ -1685,7 +1690,7 @@ def pull_axis(x, pshape, idir): return numpy.reshape(numpy.moveaxis(numpy.reshape(x.copy(), pshape), idir, 0), x.shape) -def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False): +def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False, comm=None): """ Create a SeqAIJ Mat from a dense matrix using the diagonal and a subset of rows and columns. If dense_indices is empty, then also include the off-diagonal corners of the matrix. @@ -1696,8 +1701,7 @@ def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False): nnz[dense_indices] = len(dense_indices) if block else n imode = PETSc.InsertMode.INSERT - A_petsc = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=PETSc.COMM_SELF) - + A_petsc = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm) idx = numpy.arange(n, dtype=PETSc.IntType) if block: values = A_numpy[dense_indices, :][:, dense_indices] @@ -1706,18 +1710,15 @@ def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False): for j in dense_indices: A_petsc.setValues(j, idx, A_numpy[j, :], imode) A_petsc.setValues(idx, j, A_numpy[:, j], imode) - if diag: idx = idx[:, None] values = A_numpy.diagonal()[:, None] A_petsc.setValuesRCV(idx, idx, values, imode) - A_petsc.assemble() return A_petsc -@lru_cache(maxsize=10) -def fdm_setup_ipdg(fdm_element, eta): +def fdm_setup_ipdg(fdm_element, eta, comm=None): """ Setup for the fast diagonalisation method for the IP-DG formulation. 
Compute sparsified interval stiffness and mass matrices @@ -1725,6 +1726,7 @@ def fdm_setup_ipdg(fdm_element, eta): :arg fdm_element: a :class:`FIAT.FDMElement` :arg eta: penalty coefficient as a `float` + :arg comm: a :class:`PETSc.Comm` :returns: 3-tuple of: Afdm: a list of :class:`PETSc.Mats` with the sparse interval matrices @@ -1735,10 +1737,7 @@ def fdm_setup_ipdg(fdm_element, eta): """ ref_el = fdm_element.get_reference_element() degree = fdm_element.degree() - if hasattr(fdm_element.dual, "rule"): - rule = fdm_element.dual.rule - else: - rule = FIAT.quadrature.make_quadrature(ref_el, degree+1) + rule = FIAT.quadrature.make_quadrature(ref_el, degree+1) edof = fdm_element.entity_dofs() bdof = edof[0][0] + edof[0][1] @@ -1753,7 +1752,7 @@ def fdm_setup_ipdg(fdm_element, eta): Dfacet = basis[(1,)] Dfacet[:, 0] = -Dfacet[:, 0] - Afdm = [numpy_to_petsc(Bhat, bdof, block=True)] + Afdm = [numpy_to_petsc(Bhat, bdof, block=True, comm=comm)] for bc in range(4): bcs = (bc % 2, bc//2) Abc = Ahat.copy() @@ -1763,12 +1762,11 @@ def fdm_setup_ipdg(fdm_element, eta): Abc[:, j] -= Dfacet[:, k] Abc[j, :] -= Dfacet[:, k] Abc[j, j] += eta - Afdm.append(numpy_to_petsc(Abc, bdof)) + Afdm.append(numpy_to_petsc(Abc, bdof, comm=comm)) return Afdm, Dfacet, bdof -@lru_cache(maxsize=10) -def get_interior_facet_maps(V): +def extrude_interior_facet_maps(V): """ Extrude V.interior_facet_node_map and V.mesh().interior_facets.local_facet_dat @@ -1841,7 +1839,6 @@ def get_interior_facet_maps(V): return facet_to_nodes_fun, local_facet_data_fun, nfacets -@lru_cache(maxsize=20) def extrude_node_map(node_map, bsize=1): """ Construct a (possibly vector-valued) cell to node map from an un-extruded scalar map. diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 1fb2e267da..c6b482b5ee 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -9,10 +9,9 @@ from firedrake.nullspace import VectorSpaceBasis, MixedVectorSpaceBasis from firedrake.tsfc_interface import extract_numbered_coefficients from firedrake.utils import ScalarType_c, IntType_c, cached_property -from pyop2 import op2, PermutedMap from tsfc import compile_expression_dual_evaluation from tsfc.finatinterface import create_element -from FIAT.reference_element import LINE +from pyop2 import op2 import firedrake import finat @@ -559,8 +558,9 @@ def expand_element(ele): def evaluate_dual(source, target, alpha=None): - # Evaluate the action of a set of dual functionals of the target element - # on the (derivatives of the) basis functions of the source element. 
+ """Evaluate the action of a set of dual functionals of the target element + on the (derivative of order alpha of the) basis functions of the source + element.""" primal = source.get_nodal_basis() dual = target.get_dual_set() A = dual.to_riesz(primal) @@ -642,7 +642,7 @@ def get_permutation_to_line_elements(finat_element): for term in terms: factors = term.factors if hasattr(term, "factors") else (term,) fiat_factors = [e.fiat_equivalent for e in reversed(factors)] - if any(e.get_reference_element().shape != LINE for e in fiat_factors): + if any(e.get_reference_element().get_spatial_dimension() != 1 for e in fiat_factors): raise ValueError("Failed to decompose %s into line elements" % fiat_factors) # use the same FIAT element if it appears multiple times in the expansion @@ -1142,7 +1142,7 @@ def get_permuted_map(V): indices, _, _ = get_permutation_to_line_elements(V.finat_element) if numpy.all(indices[:-1] < indices[1:]): return V.cell_node_map() - return PermutedMap(V.cell_node_map(), indices) + return op2.PermutedMap(V.cell_node_map(), indices) class StandaloneInterpolationMatrix(object): @@ -1237,7 +1237,6 @@ def view(self, mat, viewer=None): type(self).__name__) def getInfo(self, mat, info=None): - from mpi4py import MPI memory = self.uf.dat.nbytes + self.uc.dat.nbytes if self._weight is not None: memory += self._weight.dat.nbytes @@ -1246,10 +1245,10 @@ def getInfo(self, mat, info=None): if info == PETSc.Mat.InfoType.LOCAL: return {"memory": memory} elif info == PETSc.Mat.InfoType.GLOBAL_SUM: - gmem = mat.comm.tompi4py().allreduce(memory, op=MPI.SUM) + gmem = mat.comm.tompi4py().allreduce(memory, op=op2.MPI.SUM) return {"memory": gmem} elif info == PETSc.Mat.InfoType.GLOBAL_MAX: - gmem = mat.comm.tompi4py().allreduce(memory, op=MPI.MAX) + gmem = mat.comm.tompi4py().allreduce(memory, op=op2.MPI.MAX) return {"memory": gmem} else: raise ValueError("Unknown info type %s" % info) From cb9de37f10b63a3b0944eb03034dbf3eff4831c3 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 29 Mar 2023 09:46:22 +0100 Subject: [PATCH 50/75] remove repeated import --- firedrake/matrix_free/operators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/firedrake/matrix_free/operators.py b/firedrake/matrix_free/operators.py index 5e73f69785..b39c4d2e76 100644 --- a/firedrake/matrix_free/operators.py +++ b/firedrake/matrix_free/operators.py @@ -309,7 +309,6 @@ def view(self, mat, viewer=None): type(self).__name__) def getInfo(self, mat, info=None): - from mpi4py import MPI memory = self._x.dat.nbytes + self._y.dat.nbytes if hasattr(self, "_xbc"): memory += self._xbc.dat.nbytes From 24c8b80393828b8bcd8f815c6857ef6fa7e09ad6 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 29 Mar 2023 10:39:35 +0100 Subject: [PATCH 51/75] cleanup --- firedrake/preconditioners/fdm.py | 78 +++++++++++++------------------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 77c55cf1c1..e0b98f5da6 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -265,7 +265,7 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): self.lgmaps = {} bc_rows = {} for Vsub in V: - lgmap = Vsub.local_to_global_map([bc.reconstruct(V=Vsub, g=0) for bc in bcs]) + lgmap = Vsub.local_to_global_map([bc for bc in bcs if bc.function_space() == Vsub]) bsize = Vsub.dof_dset.layout_vec.getBlockSize() cell_to_local, nel = extrude_node_map(Vsub.cell_node_map(), bsize=bsize) self.cell_to_global[Vsub] = 
partial(cell_to_global, lgmap, cell_to_local) @@ -335,7 +335,7 @@ def get_coeffs(e, result=None): if len(V) == 1: Pmat = Pmats[V, V] else: - Pmat = PETSc.Mat().createNest([[Pmats[Vrow, Vcol] for Vcol in V] for Vrow in V], comm=V.comm) + Pmat = PETSc.Mat().createNest([[Pmats[Vrow, Vcol] for Vcol in V] for Vrow in V], comm=self.comm) @PETSc.Log.EventDecorator("FDMAssemble") def assemble_P(): @@ -1540,10 +1540,10 @@ def assemble_coefficients(self, J, fcp): degree = max(degree) except TypeError: pass - quad_deg = 2*degree+1 - quad_deg = fcp.get("degree", quad_deg) + quad_deg = fcp.get("degree", 2*degree+1) dx = ufl.dx(degree=quad_deg, domain=mesh) family = "Discontinuous Lagrange" if tdim == 1 else "DQ" + DG = ufl.FiniteElement(family, mesh.ufl_cell(), degree=0) # extract coefficients directly from the bilinear form integrals_J = J.integrals_by_type("cell") @@ -1556,24 +1556,8 @@ def assemble_coefficients(self, J, fcp): replace_grad = {ufl.grad(t): ufl.dot(Piola, ufl.dot(dt, Finv)) for t, dt in zip(args_J, ref_grad)} else: replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)} - alpha = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad), ref_grad[0]), ref_grad[1]) for i in integrals_J])) - - # get zero-th order coefficent - ref_val = [ufl.variable(t) for t in args_J] - if Piola: - dummy_element = ufl.TensorElement("DQ", cell=mesh.ufl_cell(), degree=1, shape=Piola.ufl_shape) - dummy_Piola = ufl.Coefficient(ufl.FunctionSpace(mesh, dummy_element)) - replace_val = {t: ufl.dot(dummy_Piola, s) for t, s in zip(args_J, ref_val)} - else: - replace_val = {t: s for t, s in zip(args_J, ref_val)} - - beta = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_val), - ref_val[0]), ref_val[1]) for i in integrals_J])) - if Piola: - beta = ufl.replace(beta, {dummy_Piola: Piola}) - # discard mixed derivatives and mixed components if len(alpha.ufl_shape) == 2: alpha = ufl.diag_vector(alpha) @@ -1581,35 +1565,46 @@ def assemble_coefficients(self, J, fcp): ashape = alpha.ufl_shape ashape = ashape[:len(ashape)//2] alpha = ufl.as_tensor(numpy.reshape([alpha[i+i] for i in numpy.ndindex(ashape)], (ashape[0], -1))) - Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=0, shape=alpha.ufl_shape) # assemble second order coefficient if not isinstance(alpha, ufl.constantvalue.Zero): - Q = FunctionSpace(mesh, Qe) + Q = FunctionSpace(mesh, ufl.TensorElement(DG, shape=alpha.ufl_shape)) tensor = coefficients.setdefault("alpha", Function(Q)) assembly_callables.append(OneFormAssembler(ufl.inner(TestFunction(Q), alpha)*dx, tensor=tensor, form_compiler_parameters=fcp).assemble) + # get zero-th order coefficent + ref_val = [ufl.variable(t) for t in args_J] + if Piola: + dummy_element = ufl.TensorElement(family, cell=mesh.ufl_cell(), degree=1, shape=Piola.ufl_shape) + dummy_Piola = ufl.Coefficient(ufl.FunctionSpace(mesh, dummy_element)) + replace_val = {t: ufl.dot(dummy_Piola, s) for t, s in zip(args_J, ref_val)} + else: + replace_val = {t: s for t, s in zip(args_J, ref_val)} + beta = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_val), + ref_val[0]), ref_val[1]) for i in integrals_J])) + if Piola: + beta = ufl.replace(beta, {dummy_Piola: Piola}) # assemble zero-th order coefficient if not isinstance(beta, ufl.constantvalue.Zero): if Piola: # keep diagonal beta = ufl.diag_vector(beta) - shape = beta.ufl_shape - Qe = ufl.FiniteElement(family, mesh.ufl_cell(), degree=0) - if shape: - Qe = ufl.TensorElement(Qe, 
shape=shape) - Q = FunctionSpace(mesh, Qe) + Q = FunctionSpace(mesh, ufl.TensorElement(DG, shape=beta.ufl_shape) if beta.ufl_shape else DG) tensor = coefficients.setdefault("beta", Function(Q)) assembly_callables.append(OneFormAssembler(ufl.inner(TestFunction(Q), beta)*dx, tensor=tensor, form_compiler_parameters=fcp).assemble) + family = "CG" if tdim == 1 else "DGT" + degree = 1 if tdim == 1 else 0 + DGT = ufl.BrokenElement(ufl.FiniteElement(family, cell=mesh.ufl_cell(), degree=degree)) if Piola: # make DGT functions with the second order coefficient # and the Piola tensor for each side of each facet extruded = mesh.cell_set._extruded dS_int = ufl.dS_h(degree=quad_deg) + ufl.dS_v(degree=quad_deg) if extruded else ufl.dS(degree=quad_deg) - ifacet_inner = lambda v, u: ((ufl.inner(v('+'), u('+')) + ufl.inner(v('-'), u('-')))/ufl.FacetArea(mesh))*dS_int + area = ufl.FacetArea(mesh) + ifacet_inner = lambda v, u: ((ufl.inner(v('+'), u('+')) + ufl.inner(v('-'), u('-')))/area)*dS_int replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)} alpha = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad), @@ -1618,29 +1613,20 @@ def assemble_coefficients(self, J, fcp): G = ufl.as_tensor([[[G[i, k, j, k] for i in range(G.ufl_shape[0])] for j in range(G.ufl_shape[2])] for k in range(G.ufl_shape[3])]) G = G * abs(ufl.JacobianDeterminant(mesh)) - ele = ufl.BrokenElement(ufl.FiniteElement("DGT", cell=mesh.ufl_cell(), degree=0)) - Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=G.ufl_shape)) + Q = FunctionSpace(mesh, ufl.TensorElement(DGT, shape=G.ufl_shape)) tensor = coefficients.setdefault("Gq_facet", Function(Q)) assembly_callables.append(OneFormAssembler(ifacet_inner(TestFunction(Q), G), tensor=tensor, form_compiler_parameters=fcp).assemble) - PT = Piola.T - Q = FunctionSpace(mesh, ufl.TensorElement(ele, shape=PT.ufl_shape)) + Q = FunctionSpace(mesh, ufl.TensorElement(DGT, shape=PT.ufl_shape)) tensor = coefficients.setdefault("PT_facet", Function(Q)) assembly_callables.append(OneFormAssembler(ifacet_inner(TestFunction(Q), PT), tensor=tensor, form_compiler_parameters=fcp).assemble) # make DGT functions with BC flags - rvs = V.ufl_element().reference_value_shape() - cell = mesh.ufl_cell() - family = "CG" if cell.topological_dimension() == 1 else "DGT" - degree = 1 if cell.topological_dimension() == 1 else 0 - Qe = ufl.FiniteElement(family, cell=cell, degree=degree) - if rvs: - Qe = ufl.TensorElement(Qe, shape=rvs) - Q = FunctionSpace(mesh, Qe) - q = TestFunction(Q) - bcflags = Function(Q) + shape = V.ufl_element().reference_value_shape() + Q = FunctionSpace(mesh, ufl.TensorElement(DGT, shape=shape) if shape else DGT) + test = TestFunction(Q) ref_args = [ufl.variable(t) for t in args_J] replace_args = {t: s for t, s in zip(args_J, ref_args)} @@ -1652,16 +1638,16 @@ def assemble_coefficients(self, J, fcp): if itype.startswith("exterior_facet"): beta = ufl.diff(ufl.diff(ufl.replace(it.integrand(), replace_args), ref_args[0]), ref_args[1]) beta = expand_derivatives(beta) - if rvs: + if beta.ufl_shape: beta = ufl.diag_vector(beta) ds_ext = ufl.Measure(itype, domain=mesh, subdomain_id=it.subdomain_id(), metadata=md) - forms.append(ufl.inner(q, beta)*ds_ext) + forms.append(ufl.inner(test, beta)*ds_ext) if len(forms): form = sum(forms) if len(form.arguments()) == 1: - coefficients["bcflags"] = bcflags - assembly_callables.append(OneFormAssembler(form, tensor=bcflags, + tensor = coefficients.setdefault("bcflags", Function(Q)) + 
assembly_callables.append(OneFormAssembler(form, tensor=tensor, form_compiler_parameters=fcp).assemble) # set arbitrary non-zero coefficients for preallocation From 1ace1b8d47faf03620c76b4b10559e22c6624da9 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 29 Mar 2023 10:58:24 +0100 Subject: [PATCH 52/75] use numpy.take in extrude_node_map --- firedrake/preconditioners/fdm.py | 41 ++++++++++---------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index e0b98f5da6..0601fe0a16 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1649,7 +1649,6 @@ def assemble_coefficients(self, J, fcp): tensor = coefficients.setdefault("bcflags", Function(Q)) assembly_callables.append(OneFormAssembler(form, tensor=tensor, form_compiler_parameters=fcp).assemble) - # set arbitrary non-zero coefficients for preallocation for coef in coefficients.values(): with coef.dat.vec as cvec: @@ -1834,52 +1833,37 @@ def extrude_node_map(node_map, bsize=1): :returns: a 2-tuple with the cell to node map and the number of cells owned by this process """ - nelv = node_map.values.shape[0] + nel = node_map.values.shape[0] if node_map.offset is None: - nel = nelv - def scalar_map(e, result=None): - if result is None: - result = numpy.copy(node_map.values_with_halo[e]) - else: - numpy.copyto(result, node_map.values_with_halo[e]) - return result - + return numpy.take(node_map.values_with_halo, e, axis=0, out=result) else: layers = node_map.iterset.layers_array if layers.shape[0] == 1: - nelz = layers[0, 1]-layers[0, 0]-1 - nel = nelz*nelv - def _scalar_map(node_map, nelz, e, result=None): - if result is None: - result = numpy.copy(node_map.values_with_halo[e // nelz]) - else: - numpy.copyto(result, node_map.values_with_halo[e // nelz]) + result = numpy.take(node_map.values_with_halo, e // nelz, axis=0, out=result) result += (e % nelz)*node_map.offset return result + + nelz = layers[0, 1]-layers[0, 0]-1 + nel *= nelz scalar_map = partial(_scalar_map, node_map, nelz) else: - nelz = layers[:, 1]-layers[:, 0]-1 - nel = sum(nelz[:nelv]) - to_base = numpy.repeat(numpy.arange(node_map.values_with_halo.shape[0], dtype=node_map.offset.dtype), nelz) - to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz]) - def _scalar_map(node_map, to_base, to_layer, e, result=None): - if result is None: - result = numpy.copy(node_map.values_with_halo[to_base[e]]) - else: - numpy.copyto(result, node_map.values_with_halo[to_base[e]]) + result = numpy.take(node_map.values_with_halo, to_base[e], axis=0, out=result) result += to_layer[e]*node_map.offset return result + + nelz = layers[:, 1]-layers[:, 0]-1 + nel = sum(nelz[:nel]) + to_base = numpy.repeat(numpy.arange(node_map.values_with_halo.shape[0], dtype=node_map.offset.dtype), nelz) + to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz]) scalar_map = partial(_scalar_map, node_map, to_base, to_layer) if bsize == 1: return scalar_map, nel - ibase = numpy.arange(bsize, dtype=node_map.values.dtype) - def vector_map(bsize, ibase, e, result=None): index = None if result is not None: @@ -1888,4 +1872,5 @@ def vector_map(bsize, ibase, e, result=None): index *= bsize return numpy.add.outer(index, ibase, out=result) + ibase = numpy.arange(bsize, dtype=node_map.values.dtype) return partial(vector_map, bsize, ibase), nel From 9f95e2b1bc3084a2110afbe8a676c00a47aeb059 Mon Sep 17 00:00:00 2001 From: Pablo 
Brubeck Date: Wed, 29 Mar 2023 12:01:08 +0100 Subject: [PATCH 53/75] optimise FDMGetIndices --- firedrake/preconditioners/fdm.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 0601fe0a16..2e6f8fb552 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1835,31 +1835,41 @@ def extrude_node_map(node_map, bsize=1): """ nel = node_map.values.shape[0] if node_map.offset is None: - def scalar_map(e, result=None): - return numpy.take(node_map.values_with_halo, e, axis=0, out=result) + def scalar_map(map_values, e, result=None): + if result is None: + result = numpy.empty_like(map_values[e]) + numpy.copyto(result, map_values[e]) + return result + + scalar_map = partial(_scalar_map, node_map.values_with_halo) else: layers = node_map.iterset.layers_array if layers.shape[0] == 1: - def _scalar_map(node_map, nelz, e, result=None): - result = numpy.take(node_map.values_with_halo, e // nelz, axis=0, out=result) - result += (e % nelz)*node_map.offset + def _scalar_map(map_values, offset, nelz, e, result=None): + if result is None: + result = numpy.empty_like(offset) + numpy.copyto(result, offset) + result *= (e % nelz) + result += map_values[e // nelz] return result nelz = layers[0, 1]-layers[0, 0]-1 nel *= nelz - scalar_map = partial(_scalar_map, node_map, nelz) - + scalar_map = partial(_scalar_map, node_map.values_with_halo, node_map.offset, nelz) else: - def _scalar_map(node_map, to_base, to_layer, e, result=None): - result = numpy.take(node_map.values_with_halo, to_base[e], axis=0, out=result) - result += to_layer[e]*node_map.offset + def _scalar_map(map_values, offset, to_base, to_layer, e, result=None): + if result is None: + result = numpy.empty_like(offset) + numpy.copyto(result, offset) + result *= to_layer[e] + result += map_values[to_base[e]] return result nelz = layers[:, 1]-layers[:, 0]-1 nel = sum(nelz[:nel]) to_base = numpy.repeat(numpy.arange(node_map.values_with_halo.shape[0], dtype=node_map.offset.dtype), nelz) to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz]) - scalar_map = partial(_scalar_map, node_map, to_base, to_layer) + scalar_map = partial(_scalar_map, node_map.values_with_halo, node_map.offset, to_base, to_layer) if bsize == 1: return scalar_map, nel From a42d76a6a8bed6b43a60bb77b4f1632cdefa3716 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 29 Mar 2023 12:12:31 +0100 Subject: [PATCH 54/75] fix typo --- firedrake/preconditioners/fdm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 2e6f8fb552..7a9a9d112c 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1835,7 +1835,7 @@ def extrude_node_map(node_map, bsize=1): """ nel = node_map.values.shape[0] if node_map.offset is None: - def scalar_map(map_values, e, result=None): + def _scalar_map(map_values, e, result=None): if result is None: result = numpy.empty_like(map_values[e]) numpy.copyto(result, map_values[e]) From da6899dd66f65367b021d05b539a5db92452016a Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Thu, 30 Mar 2023 17:17:10 +0100 Subject: [PATCH 55/75] fuse prealloaction and assembly loops, more elegant handling of element mass matrices --- firedrake/preconditioners/fdm.py | 277 ++++++++++++++----------------- firedrake/preconditioners/pmg.py | 41 +++-- 2 files changed, 149 
insertions(+), 169 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 7a9a9d112c..ef6a328a35 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -11,6 +11,7 @@ from firedrake.function import Function from firedrake.functionspace import FunctionSpace from firedrake.ufl_expr import TestFunction, TestFunctions, TrialFunctions +from firedrake.utils import cached_property from firedrake_citations import Citations from ufl.algorithms.ad import expand_derivatives from ufl.algorithms.expand_indices import expand_indices @@ -70,9 +71,6 @@ class FDMPC(PCBase): The PETSc options inspected by this class are: - 'fdm_mat_type': can be either 'aij' or 'sbaij' - 'fdm_static_condensation': are we assembling the Schur complement on facets? - - Static condensation is currently only implemented for the symmetric case, - use it at your own risk. """ _prefix = "fdm_" @@ -175,11 +173,11 @@ def initialize(self, pc): self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType) # Assemble the FDM preconditioner with sparse local matrices - Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type, use_static_condensation) - self._assemble_P() + Pmat, self._assemble_P = self.allocate_matrix(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type, use_static_condensation) Pmat.setNullSpace(Amat.getNullSpace()) Pmat.setTransposeNullSpace(Amat.getTransposeNullSpace()) Pmat.setNearNullSpace(Amat.getNearNullSpace()) + self._assemble_P() # Internally, we just set up a PC object that the user can configure # however from the PETSc command line. Since PC allows the user to specify @@ -202,9 +200,9 @@ def initialize(self, pc): fdmpc.setFromOptions() @PETSc.Log.EventDecorator("FDMPrealloc") - def assemble_fdm_op(self, V, J, bcs, fcp, pmat_type, use_static_condensation): + def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): """ - Assemble the sparse preconditioner from diagonal mass matrices. + Allocate the FDM sparse preconditioner. 
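For illustration only (not part of the patch series): the options listed in the class docstring above are read from the PETSc options database, so a solve could be driven roughly as follows. The variational problem (a, L, uh, bcs) and the inner "fdm_pc_type" choice are assumptions made for this sketch.

    from firedrake import solve

    solver_parameters = {
        "mat_type": "matfree",
        "ksp_type": "cg",
        "pc_type": "python",
        "pc_python_type": "firedrake.FDMPC",
        "fdm_mat_type": "aij",            # or "sbaij" for a symmetric (upper-triangular) Pmat
        "fdm_static_condensation": True,  # assemble the Schur complement on facets
        "fdm_pc_type": "cholesky",        # any PETSc PC may act on the assembled sparse operator
    }
    solve(a == L, uh, bcs=bcs, solver_parameters=solver_parameters)
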
:arg V: the :class:`.FunctionSpace` of the form arguments :arg J: the Jacobian bilinear form @@ -215,7 +213,7 @@ def assemble_fdm_op(self, V, J, bcs, fcp, pmat_type, use_static_condensation): :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable """ - ifacet, = numpy.nonzero([is_restricted(Vsub.finat_element)[1] for Vsub in V]) + ifacet = [i for i, Vsub in enumerate(V) if is_restricted(Vsub.finat_element)[1]] if len(ifacet) == 0: Vfacet = None Vbig = V @@ -237,22 +235,21 @@ def assemble_fdm_op(self, V, J, bcs, fcp, pmat_type, use_static_condensation): dofs = numpy.arange(value_size * Vbig.finat_element.space_dimension(), dtype=fdofs.dtype) idofs = numpy.setdiff1d(dofs, fdofs, assume_unique=True) self.ises = tuple(PETSc.IS().createGeneral(indices, comm=PETSc.COMM_SELF) for indices in (idofs, fdofs)) - self.submats = [None for _ in range(8)] + self.submats = [None for _ in range(7)] - self.reference_tensor_on_diag = {} + # Dictionary with the parent space and a method to form the Schur complement self.get_static_condensation = {} if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block diagonal_interior = Vfacet.finat_element.formdegree == 0 and value_size == 1 get_schur = schur_complement_diagonal if diagonal_interior else schur_complement_block_qr - self.reference_tensor_on_diag[Vfacet] = self.assemble_reference_tensor(Vbig) - self.get_static_condensation[Vfacet] = lambda A: condense_element_mat(A, self.ises[0], self.ises[1], - self.submats, get_schur) + self.get_static_condensation[Vfacet] = Vbig, lambda A: condense_element_mat(A, self.ises[0], self.ises[1], + self.submats, get_schur) elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern i1 = PETSc.IS().createGeneral(dofs, comm=PETSc.COMM_SELF) - self.get_static_condensation[V] = lambda Ae: condense_element_pattern(Ae, self.ises[0], i1, self.submats) + self.get_static_condensation[V] = Vbig, lambda Ae: condense_element_pattern(Ae, self.ises[0], i1, self.submats) @PETSc.Log.EventDecorator("FDMGetIndices") def cell_to_global(lgmap, cell_to_local, cell_index, result=None): @@ -391,8 +388,8 @@ def destroy(self, pc): if hasattr(self, "A"): objs.append(self.A) if hasattr(self, "pc"): - objs.append(self.pc.getOperators()[-1]) objs.append(self.pc) + objs.append(self.pc.getOperators()[-1]) if hasattr(self, "submats"): objs.extend(self.submats) if hasattr(self, "work_mats"): @@ -403,6 +400,24 @@ def destroy(self, pc): if hasattr(obj, "destroy"): obj.destroy() + @cached_property + def _element_mass_matrix(self): + data = self.get_coeffs(0) + data.fill(1.0E0) + shape = data.shape + (1,)*(3-len(data.shape)) + nrows = shape[0] * shape[1] + ai = numpy.arange(nrows+1, dtype=PETSc.IntType) + aj = numpy.tile(ai[:-1].reshape((-1, shape[1])), (1, shape[2])) + if shape[2] > 1: + ai *= shape[2] + data = numpy.tile(numpy.eye(shape[2], dtype=data.dtype), shape[:1] + (1,)*(len(shape)-1)) + Me = PETSc.Mat().createAIJ((nrows, nrows), bsize=shape[2], csr=(ai, aj, data), comm=PETSc.COMM_SELF) + return self.work_mats.setdefault("mass_matrix", Me) + + @cached_property + def _element_mass_diagonal(self): + return self.work_mats.setdefault("mass_diagonal", self._element_mass_matrix.getDiagonal()) + @PETSc.Log.EventDecorator("FDMSetValues") def set_values(self, A, Vrow, Vcol, addv, triu=False): """ @@ -415,96 +430,69 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): :arg addv: a 
`PETSc.Mat.InsertMode` :arg triu: are we assembling only the upper triangular part? """ + if self.nel == 0: + # This MPI rank does not own any elements, nothing to be done + return + + Vbig = None + condense_element_mat = lambda x: x set_submat = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) get_rindices = self.cell_to_global[Vrow] if Vrow == Vcol: - condense_element_mat = self.get_static_condensation.get(Vrow) get_cindices = lambda e, result=None: result update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, rindices, addv) - # interpolators of basis and exterior derivative onto broken spaces - ctensor = self.reference_tensor_on_diag.get(Vrow) or self.assemble_reference_tensor(Vrow) - rtensor = PETSc.Mat().createTranspose(ctensor).convert(ctensor.getType()) + Vbig, condense_element_mat = self.get_static_condensation.get(Vrow, (Vbig, condense_element_mat)) else: - condense_element_mat = None get_cindices = self.cell_to_global[Vcol] update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, cindices, addv) - ctensor = self.assemble_reference_tensor(Vcol) - rtensor = self.assemble_reference_tensor(Vrow, transpose=True) + Me = self._element_mass_matrix + # interpolation of basis and exterior derivative onto broken spaces + ctensor = self.assemble_reference_tensor(Vbig or Vcol) + rtensor = self.assemble_reference_tensor(Vbig or Vrow, transpose=True) # element matrix obtained via Equation (3.9) of Brubeck2022b - assemble_element_mat = lambda De, result=None: rtensor.matMatMult(De, ctensor, result=result) - - do_sort = True - if condense_element_mat is None: - condense_element_mat = lambda x: x - do_sort = False - - common_key = "coefs" - rindices = None - cindices = None - if A.getType() != PETSc.Mat.Type.PREALLOCATOR: + assemble_element_mat = partial(rtensor.matMatMult, Me, ctensor) + try: Ae = self.work_mats[Vrow, Vcol] - De = self.work_mats[common_key] - insert = PETSc.InsertMode.INSERT - work_vec = De.getDiagonal() - data = self.work_csr[2] - if len(data.shape) == 3: - @PETSc.Log.EventDecorator("FDMUpdateDiag") - def update_De(): - De.setValuesCSR(*self.work_csr, addv=insert) - De.assemble() - return De - else: - @PETSc.Log.EventDecorator("FDMUpdateDiag") - def update_De(): - De.setDiagonal(work_vec, addv=insert) - return De - data = work_vec.array_w + except KeyError: + Ae = self.work_mats.setdefault((Vrow, Vcol), assemble_element_mat()) - # Core assembly loop - for e in range(self.nel): - cindices = get_cindices(e, result=cindices) - rindices = get_rindices(e, result=rindices) - data = self.get_coeffs(e, result=data) - Ae = assemble_element_mat(update_De(), result=Ae) - update_A(condense_element_mat(Ae), rindices, cindices) - - work_vec.destroy() - - elif self.nel: - # Preallocation of the sparsity pattern - if common_key not in self.work_mats: - data = self.get_coeffs(0) - data.fill(1.0E0) - shape = data.shape + (1,)*(3-len(data.shape)) - nrows = shape[0] * shape[1] - ai = numpy.arange(nrows+1, dtype=PETSc.IntType) - aj = numpy.tile(ai[:-1].reshape((-1, shape[1])), (1, shape[2])) - if shape[2] > 1: - ai *= shape[2] - data = numpy.tile(numpy.eye(shape[2]), shape[:1] + (1,)*(len(shape)-1)) - - self.work_csr = (ai, aj, data) - De = PETSc.Mat().createAIJ((nrows, nrows), csr=self.work_csr, comm=PETSc.COMM_SELF) - self.work_mats[common_key] = De - - De = self.work_mats[common_key] - Ae = assemble_element_mat(De, result=None) - self.work_mats[Vrow, Vcol] = Ae - if do_sort: + insert = PETSc.InsertMode.INSERT + if A.getType() == PETSc.Mat.Type.PREALLOCATOR: + # Empty 
kernel for preallocation + if Vbig is not None: sort_interior_dofs(self.ises[0], Ae) Se = condense_element_mat(Ae) - - for e in range(self.nel): - cindices = get_cindices(e, result=cindices) - rindices = get_rindices(e, result=rindices) - update_A(Se, rindices, cindices) + element_kernel = lambda e, result=None: result + condense_element_mat = lambda Ae: Se + elif Me.getBlockSize() == 1: + # Kernel with diagonal mass matrix + diagonal = self._element_mass_diagonal + data = diagonal.array_w.reshape((-1,) + Vrow.shape) + + def element_kernel(e, result=None): + self.get_coeffs(e, result=data) + Me.setDiagonal(diagonal, addv=insert) + return assemble_element_mat(result=result) else: - self.work_csr = (None, None, None) - self.work_mats[common_key] = None - self.work_mats[Vrow, Vcol] = None - if Vcol == Vrow: - rtensor.destroy() + # Kernel with block diagonal mass matrix + ai, aj, data = Me.getValuesCSR() + data = data.reshape((-1,) + Vrow.shape * 2) + + def element_kernel(e, result=None): + self.get_coeffs(e, result=data) + Me.setValuesCSR(ai, aj, data, addv=insert) + Me.assemble() + return assemble_element_mat(result=result) + + cindices = None + rindices = None + # Core assembly loop + for e in range(self.nel): + cindices = get_cindices(e, result=cindices) + rindices = get_rindices(e, result=rindices) + Ae = element_kernel(e, result=Ae) + update_A(condense_element_mat(Ae), rindices, cindices) @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coefficients(self, J, fcp, block_diagonal=True): @@ -650,7 +638,7 @@ def assemble_reference_tensor(self, V, transpose=False): full_key = (degree, tdim, formdegree, value_size, False, False, False) if is_facet and full_key in cache: result = cache[full_key] - noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) + noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) result = result.createSubMatrix(noperm, self.ises[1]) noperm.destroy() return cache.setdefault(key, result) @@ -663,15 +651,19 @@ def assemble_reference_tensor(self, V, transpose=False): if is_interior: e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") - A00 = fiat_reference_prolongator(e0, eq) - A10 = fiat_reference_prolongator(e0, e1, derivative=True) - A11 = numpy.eye(e1.space_dimension(), dtype=A00.dtype) - + comm = PETSc.COMM_SELF + A00 = petsc_sparse(fiat_reference_prolongator(e0, eq), comm=comm) + A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=comm) + A11 = petsc_sparse(numpy.eye(e1.space_dimension(), dtype=PETSc.RealType), comm=comm) B_blocks = mass_blocks(tdim, formdegree, A00, A11) A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) - result = block_mat(B_blocks + A_blocks, destroy=True) + result = block_mat(B_blocks + A_blocks, destroy_blocks=True) + A00.destroy() + A10.destroy() + A11.destroy() + if value_size != 1: - eye = petsc_sparse(numpy.eye(value_size)) + eye = petsc_sparse(numpy.eye(value_size), comm=comm) temp = result result = temp.kron(eye) temp.destroy() @@ -679,7 +671,7 @@ def assemble_reference_tensor(self, V, transpose=False): if is_facet: cache[full_key] = result - noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.comm) + noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) result = result.createSubMatrix(noperm, self.ises[1]) noperm.destroy() @@ -1025,7 +1017,7 @@ def kron3(A, B, C, 
scale=None): return result -def block_mat(A_blocks, destroy=False): +def block_mat(A_blocks, destroy_blocks=False): """Return a concrete Mat corresponding to a block matrix given as a list of lists. Optionally, destroys the input Mats if a new Mat is created.""" if len(A_blocks) == 1: @@ -1035,24 +1027,19 @@ def block_mat(A_blocks, destroy=False): result = PETSc.Mat().createNest(A_blocks, comm=A_blocks[0][0].getComm()) # A nest Mat would not allow us to take matrix-matrix products result = result.convert(mat_type=A_blocks[0][0].getType()) - if destroy: + if destroy_blocks: for row in A_blocks: for mat in row: mat.destroy() return result -def mass_blocks(tdim, formdegree, B00, B11, comm=None): +def mass_blocks(tdim, formdegree, B00, B11): """Construct mass block matrix on reference cell from 1D mass matrices B00 and B11. The 1D matrices may come with different test and trial spaces.""" - if comm is None: - comm = PETSc.COMM_SELF if tdim == 1: - return [[petsc_sparse(B11 if formdegree else B00, comm=comm)]] - - B00 = petsc_sparse(B00, comm=comm) - B11 = petsc_sparse(B11, comm=comm) - if tdim == 2: + B_diag = [B11 if formdegree else B00] + elif tdim == 2: if formdegree == 0: B_diag = [B00.kron(B00)] elif formdegree == 1: @@ -1069,36 +1056,27 @@ def mass_blocks(tdim, formdegree, B00, B11, comm=None): else: B_diag = [kron3(B11, B11, B11)] - B00.destroy() - B11.destroy() n = len(B_diag) if n == 1: return [B_diag] else: - B_zero = PETSc.Mat().createAIJ(B_diag[0].getSize(), nnz=(0, 0), comm=comm) - B_zero.assemble() - return [[B_diag[i] if i == j else B_zero for j in range(n)] for i in range(n)] + zero = PETSc.Mat().createAIJ(B_diag[0].getSize(), nnz=(0, 0), comm=B_diag[0].getComm()) + zero.assemble() + return [[B_diag[i] if i == j else zero for j in range(n)] for i in range(n)] -def diff_blocks(tdim, formdegree, A00, A11, A10, comm=None): +def diff_blocks(tdim, formdegree, A00, A11, A10): """Construct exterior derivative block matrix on reference cell from 1D mass matrices A00 and A11, and exterior derivative moments A10. 
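As a side note (not part of the patch): for tdim=2 and formdegree=0 these helpers reduce to Kronecker products of the 1D factors, which a dense NumPy sketch makes explicit. B00, A00 and A10 below are arbitrary stand-ins for the sparse 1D matrices, and PETSc's A.kron(B) corresponds to numpy.kron(A, B).

    import numpy as np

    p = 3                                     # illustrative 1D polynomial degree
    B00 = np.eye(p + 1)                       # stand-in for the 1D mass matrix
    A00 = B00                                 # plays the same role in diff_blocks
    A10 = np.diff(np.eye(p + 1), axis=0)      # stand-in for the 1D derivative moments, shape (p, p+1)
    mass_2d = np.kron(B00, B00)               # mass_blocks(2, 0, ...): single diagonal block
    grad_2d = np.vstack([np.kron(A00, A10),   # diff_blocks(2, 0, ...): first gradient component
                         np.kron(A10, A00)])  # second gradient component
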
The 1D matrices may come with different test and trial spaces.""" - if comm is None: - comm = PETSc.COMM_SELF if formdegree == tdim: ncols = A10.shape[0]**tdim - A_zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=comm) - A_zero.assemble() - return [[A_zero]] - - A10 = petsc_sparse(A10, comm=comm) - if tdim == 1: - return [[A10]] - - A00 = petsc_sparse(A00, comm=comm) - A11 = petsc_sparse(A11, comm=comm) - if tdim == 2: + zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=A10.getComm()) + zero.assemble() + A_blocks = [[zero]] + elif tdim == 1: + A_blocks = [[A10]] + elif tdim == 2: if formdegree == 0: A_blocks = [[A00.kron(A10)], [A10.kron(A00)]] elif formdegree == 1: @@ -1109,25 +1087,23 @@ def diff_blocks(tdim, formdegree, A00, A11, A10, comm=None): A_blocks = [[kron3(A00, A00, A10)], [kron3(A00, A10, A00)], [kron3(A10, A00, A00)]] elif formdegree == 1: size = tuple(A11.getSize()[k] * A10.getSize()[k] * A00.getSize()[k] for k in range(2)) - A_zero = PETSc.Mat().createAIJ(size, nnz=(0, 0), comm=comm) - A_zero.assemble() - A_blocks = [[kron3(A00, A10, A11, scale=-1), kron3(A00, A11, A10), A_zero], - [kron3(A10, A00, A11, scale=-1), A_zero, kron3(A11, A00, A10)], - [A_zero, kron3(A10, A11, A00), kron3(A11, A10, A00, scale=-1)]] + zero = PETSc.Mat().createAIJ(size, nnz=(0, 0), comm=A10.getComm()) + zero.assemble() + A_blocks = [[kron3(A00, A10, A11, scale=-1), kron3(A00, A11, A10), zero], + [kron3(A10, A00, A11, scale=-1), zero, kron3(A11, A00, A10)], + [zero, kron3(A10, A11, A00), kron3(A11, A10, A00, scale=-1)]] elif formdegree == 2: A_blocks = [[kron3(A10, A11, A11, scale=-1), kron3(A11, A10, A11), kron3(A11, A11, A10)]] - - A00.destroy() - A11.destroy() - A10.destroy() return A_blocks -def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): +def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None): """ Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix. Works for any tensor-product basis. These are the same matrices one needs for HypreAMS and friends. 
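For context only (an assumption of this note, not something the patch itself does): the resulting discrete gradient is the operator Hypre's AMS expects, and through petsc4py it could be supplied along the following lines, with Vc an H1 space, Vf the matching H(curl) space, and the PC's operators still to be set elsewhere.

    from firedrake.petsc import PETSc
    from firedrake.preconditioners.fdm import tabulate_exterior_derivative

    G = tabulate_exterior_derivative(Vc, Vf)   # sparse discrete gradient: Vc -> Vf
    pc = PETSc.PC().create(comm=Vf.comm)
    pc.setType("hypre")
    pc.setHYPREType("ams")
    pc.setHYPREDiscreteGradient(G)
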
""" + if comm is None: + comm = Vf.comm ec = Vc.finat_element ef = Vf.finat_element if ef.formdegree - ec.formdegree != 1: @@ -1139,10 +1115,13 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): degree = e0.degree() tdim = Vc.mesh().topological_dimension() - A11 = numpy.eye(degree, dtype=PETSc.RealType) - A00 = numpy.eye(degree+1, dtype=PETSc.RealType) - A10 = fiat_reference_prolongator(e0, e1, derivative=True) - Dhat = block_mat(diff_blocks(tdim, ec.formdegree, A00, A11, A10), destroy=True) + A00 = petsc_sparse(numpy.eye(degree+1, dtype=PETSc.RealType), comm=PETSc.COMM_SELF) + A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=PETSc.COMM_SELF) + A11 = petsc_sparse(numpy.eye(degree, dtype=PETSc.RealType), comm=PETSc.COMM_SELF) + Dhat = block_mat(diff_blocks(tdim, ec.formdegree, A00, A11, A10), destroy_blocks=True) + A00.destroy() + A10.destroy() + A11.destroy() if any(is_restricted(ec)) or any(is_restricted(ef)): scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e @@ -1158,7 +1137,7 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[]): if Vf.value_size > 1: temp = Dhat - eye = petsc_sparse(numpy.eye(Vf.value_size, dtype=PETSc.RealType)) + eye = petsc_sparse(numpy.eye(Vf.value_size, dtype=PETSc.RealType), comm=PETSc.COMM_SELF) Dhat = temp.kron(eye) temp.destroy() eye.destroy() @@ -1177,7 +1156,7 @@ def cell_to_global(lgmap, cell_to_local, e, result=None): sizes = tuple(V.dof_dset.layout_vec.getSizes() for V in (Vf, Vc)) block_size = Vf.dof_dset.layout_vec.getBlockSize() - preallocator = PETSc.Mat().create(comm=Vf.comm) + preallocator = PETSc.Mat().create(comm=comm) preallocator.setType(PETSc.Mat.Type.PREALLOCATOR) preallocator.setSizes(sizes) preallocator.setUp() @@ -1192,7 +1171,7 @@ def cell_to_global(lgmap, cell_to_local, e, result=None): preallocator.assemble() nnz = get_preallocation(preallocator, sizes[0][0]) preallocator.destroy() - Dmat = PETSc.Mat().createAIJ(sizes, block_size, nnz=nnz, comm=Vf.comm) + Dmat = PETSc.Mat().createAIJ(sizes, block_size, nnz=nnz, comm=comm) Dmat.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True) for e in range(nel): @@ -1285,9 +1264,9 @@ def assemble_reference_tensor(self, V): Afdm = [] # sparse interval mass and stiffness matrices for each direction Dfdm = [] # tabulation of normal derivatives at the boundary for each direction bdof = [] # indices of point evaluation dofs for each direction - cache = {} + cache = self._cache.setdefault("ipdg_reference_tensor", {}) for e in line_elements: - key = e.degree() + key = (e.degree(), eta) try: rtensor = cache[key] except KeyError: @@ -1314,11 +1293,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): condense_element_mat = lambda x: x get_rindices = self.cell_to_global[Vrow] - try: - rtensor = self.reference_tensor_on_diag[Vrow] - except KeyError: - rtensor = self.reference_tensor_on_diag.setdefault(Vrow, self.assemble_reference_tensor(Vrow)) - Afdm, Dfdm, bdof, axes_shifts = rtensor + Afdm, Dfdm, bdof, axes_shifts = self.assemble_reference_tensor(Vrow) Gq = self.coefficients.get("alpha") Bq = self.coefficients.get("beta") diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index c6b482b5ee..0d3544f26c 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -203,7 +203,7 @@ def _coarsen_form(a): return a cJ = _coarsen_form(fctx.J) - cJp = _coarsen_form(fctx.Jp) + cJp = cJ if fctx.Jp is fctx.J else _coarsen_form(fctx.Jp) # This fixes 
a subtle bug where you are applying PMGPC on a mixed # problem with geometric multigrid only on one block and an non-Lagrange element # on the other block (gmg breaks for non-Lagrange elements) @@ -268,20 +268,20 @@ def _coarsen_form(a): if cu in cJ.coefficients(): # Only inject state if the coarse state is a dependency of the coarse Jacobian. - inject_petscmat = cdm.createInjection(fdm) + inject = cdm.createInjection(fdm) def inject_state(): with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf: - inject_petscmat.mult(xf, xc) + inject.mult(xf, xc) add_hook(parent, setup=inject_state, call_setup=True) # Coarsen the nullspace basis - def coarsen_nullspace(coarse_V, mat, fine_nullspace): + def coarsen_nullspace(coarse_V, interpolate, fine_nullspace): if isinstance(fine_nullspace, MixedVectorSpaceBasis): - if mat.type == 'python': - mat = mat.getPythonContext() - submats = [mat.getNestSubMatrix(i, i) for i in range(len(coarse_V))] + if interpolate.getType() == "python": + interpolate = interpolate.getPythonContext() + submats = [interpolate.getNestSubMatrix(i, i) for i in range(len(coarse_V))] coarse_bases = [] for fs, submat, basis in zip(coarse_V, submats, fine_nullspace._bases): if isinstance(basis, VectorSpaceBasis): @@ -294,10 +294,7 @@ def coarsen_nullspace(coarse_V, mat, fine_nullspace): for xf in fine_nullspace._petsc_vecs: wc = firedrake.Function(coarse_V) with wc.dat.vec_wo as xc: - if mat.getSize()[1] == xf.getSize(): - mat.mult(xf, xc) - else: - mat.multTranspose(xf, xc) + interpolate.multTranspose(xf, xc) coarse_vecs.append(wc) vsb = VectorSpaceBasis(coarse_vecs, constant=fine_nullspace._constant) vsb.orthonormalize() @@ -305,16 +302,24 @@ def coarsen_nullspace(coarse_V, mat, fine_nullspace): else: return fine_nullspace - if fctx._nullspace or fctx._near_nullspace or fctx._nullspace_T: - interp_petscmat, _ = cdm.createInterpolation(fdm) + interpolate = None + if fctx._nullspace or fctx._nullspace_T or fctx._near_nullspace: + interpolate, _ = cdm.createInterpolation(fdm) + cctx._nullspace = coarsen_nullspace(cV, interpolate, fctx._nullspace) + if fctx._nullspace_T is fctx._nullspace: + cctx._nullspace_T = cctx._nullspace else: - interp_petscmat = None - cctx._nullspace = coarsen_nullspace(cV, interp_petscmat, fctx._nullspace) + cctx._nullspace_T = coarsen_nullspace(cV, interpolate, fctx._nullspace_T) + if fctx._near_nullspace is fctx._nullspace: + cctx._near_nullspace = cctx._nullspace + elif fctx._near_nullspace is fctx._nullspace_T: + cctx._near_nullspace = cctx._nullspace_T + else: + cctx._near_nullspace = coarsen_nullspace(cV, interpolate, fctx._near_nullspace) + cctx.set_nullspace(cctx._nullspace, cV._ises, transpose=False, near=False) - cctx._near_nullspace = coarsen_nullspace(cV, interp_petscmat, fctx._near_nullspace) - cctx.set_nullspace(cctx._near_nullspace, cV._ises, transpose=False, near=True) - cctx._nullspace_T = coarsen_nullspace(cV, interp_petscmat, fctx._nullspace_T) cctx.set_nullspace(cctx._nullspace_T, cV._ises, transpose=True, near=False) + cctx.set_nullspace(cctx._near_nullspace, cV._ises, transpose=False, near=True) return cdm def coarsen_quadrature(self, metadata, fdeg, cdeg): From 031e95017468a34280d9f2167dc74bdd34ee52f5 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 3 Apr 2023 07:54:00 +0100 Subject: [PATCH 56/75] homogenize IPDG 3D tests --- firedrake/preconditioners/fdm.py | 238 ++++++++++++++++--------------- tests/regression/test_fdm.py | 71 +++++---- 2 files changed, 163 insertions(+), 146 deletions(-) diff --git a/firedrake/preconditioners/fdm.py 
b/firedrake/preconditioners/fdm.py index ef6a328a35..058d4ddeac 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -234,22 +234,25 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): fdofs = numpy.add.outer(value_size * fdofs, numpy.arange(value_size, dtype=fdofs.dtype)) dofs = numpy.arange(value_size * Vbig.finat_element.space_dimension(), dtype=fdofs.dtype) idofs = numpy.setdiff1d(dofs, fdofs, assume_unique=True) - self.ises = tuple(PETSc.IS().createGeneral(indices, comm=PETSc.COMM_SELF) for indices in (idofs, fdofs)) - self.submats = [None for _ in range(7)] + self.ises = [PETSc.IS().createGeneral(indices, comm=PETSc.COMM_SELF) for indices in (idofs, fdofs)] + self.submats = [None for _ in range(6)] # Dictionary with the parent space and a method to form the Schur complement self.get_static_condensation = {} if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block - diagonal_interior = Vfacet.finat_element.formdegree == 0 and value_size == 1 - get_schur = schur_complement_diagonal if diagonal_interior else schur_complement_block_qr - self.get_static_condensation[Vfacet] = Vbig, lambda A: condense_element_mat(A, self.ises[0], self.ises[1], - self.submats, get_schur) - + if Vfacet.finat_element.formdegree == 0 and value_size == 1: + default_schur = schur_complement_diagonal + elif pmat_type.endswith("sbaij"): + default_schur = schur_complement_block_cholesky + else: + default_schur = schur_complement_block_qr + self.get_static_condensation[Vfacet] = Vbig, partial(condense_element_mat, default_schur, + self.ises[0], self.ises[1], self.submats) elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern - i1 = PETSc.IS().createGeneral(dofs, comm=PETSc.COMM_SELF) - self.get_static_condensation[V] = Vbig, lambda Ae: condense_element_pattern(Ae, self.ises[0], i1, self.submats) + self.ises.append(PETSc.IS().createGeneral(dofs, comm=PETSc.COMM_SELF)) + self.get_static_condensation[V] = Vbig, partial(condense_element_pattern, self.ises[0], self.ises[2], self.submats) @PETSc.Log.EventDecorator("FDMGetIndices") def cell_to_global(lgmap, cell_to_local, cell_index, result=None): @@ -269,7 +272,7 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): self.lgmaps[Vsub] = lgmap own = Vsub.dof_dset.layout_vec.getLocalSize() - bdofs = numpy.nonzero(lgmap.indices[:own] < 0)[0].astype(PETSc.IntType) + bdofs = numpy.flatnonzero(lgmap.indices[:own] < 0).astype(PETSc.IntType) bc_rows[Vsub] = Vsub.dof_dset.lgmap.apply(bdofs, result=bdofs) self.nel = nel @@ -393,11 +396,11 @@ def destroy(self, pc): if hasattr(self, "submats"): objs.extend(self.submats) if hasattr(self, "work_mats"): - objs.extend(list(self.work_mats.values())) + objs.extend(self.work_mats.values()) if hasattr(self, "ises"): objs.extend(self.ises) for obj in objs: - if hasattr(obj, "destroy"): + if isinstance(obj, PETSc.Object): obj.destroy() @cached_property @@ -434,8 +437,11 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): # This MPI rank does not own any elements, nothing to be done return + def get_key(*args): + return tuple(map(lambda V: V.ufl_element() if V else None, args)) + Vbig = None - condense_element_mat = lambda x: x + condense_element_mat = lambda Ae, result=None: Ae set_submat = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) get_rindices = self.cell_to_global[Vrow] if Vrow == Vcol: @@ -447,24 +453,31 @@ 
def set_values(self, A, Vrow, Vcol, addv, triu=False): update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, cindices, addv) Me = self._element_mass_matrix - # interpolation of basis and exterior derivative onto broken spaces + # Interpolation of basis and exterior derivative onto broken spaces ctensor = self.assemble_reference_tensor(Vbig or Vcol) rtensor = self.assemble_reference_tensor(Vbig or Vrow, transpose=True) - # element matrix obtained via Equation (3.9) of Brubeck2022b + # Element matrix obtained via Equation (3.9) of Brubeck2022b assemble_element_mat = partial(rtensor.matMatMult, Me, ctensor) + # Preallocate the element matrix + key = get_key(Vbig or Vrow, Vbig or Vcol, None) try: - Ae = self.work_mats[Vrow, Vcol] + Ae = self.work_mats[key] except KeyError: - Ae = self.work_mats.setdefault((Vrow, Vcol), assemble_element_mat()) + Ae = self.work_mats.setdefault(key, assemble_element_mat()) + # Preallocate the element Schur complement + key = get_key(Vrow, Vcol, Vbig) + try: + Se = self.work_mats[key] + except KeyError: + sort_interior_dofs(self.ises[0], Ae) + Se = self.work_mats.setdefault(key, condense_element_mat(Ae)) insert = PETSc.InsertMode.INSERT if A.getType() == PETSc.Mat.Type.PREALLOCATOR: # Empty kernel for preallocation - if Vbig is not None: - sort_interior_dofs(self.ises[0], Ae) - Se = condense_element_mat(Ae) element_kernel = lambda e, result=None: result - condense_element_mat = lambda Ae: Se + condense_element_mat = lambda Ae, result=None: result + elif Me.getBlockSize() == 1: # Kernel with diagonal mass matrix diagonal = self._element_mass_diagonal @@ -492,7 +505,8 @@ def element_kernel(e, result=None): cindices = get_cindices(e, result=cindices) rindices = get_rindices(e, result=rindices) Ae = element_kernel(e, result=Ae) - update_A(condense_element_mat(Ae), rindices, cindices) + Se = condense_element_mat(Ae, result=Se) + update_A(Se, rindices, cindices) @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coefficients(self, J, fcp, block_diagonal=True): @@ -578,30 +592,25 @@ def assemble_coefficients(self, J, fcp, block_diagonal=True): Jcell = expand_indices(expand_derivatives(ufl.Form(J.integrals_by_type("cell")))) mixed_form = ufl.replace(ufl.replace(Jcell, repgrad), repargs) - # Return coefficients and assembly callables, and cache them class - key = (mixed_form.signature(), mesh) - cache = self._cache.setdefault("coefficients", {}) - try: - return cache[key] - except KeyError: - if block_diagonal and V.shape: - from firedrake.assemble import assemble - M = assemble(mixed_form, mat_type="matfree", - form_compiler_parameters=fcp) - coefficients = {} - assembly_callables = [] - for iset, name in zip(Z.dof_dset.field_ises, ("beta", "alpha")): - sub = M.petscmat.createSubMatrix(iset, iset) - ctx = sub.getPythonContext() - coefficients[name] = ctx._block_diagonal - assembly_callables.append(ctx._assemble_block_diagonal) - else: - from firedrake.assemble import OneFormAssembler - tensor = Function(Z) - coefficients = {"beta": tensor.sub(0), "alpha": tensor.sub(1)} - assembly_callables = [OneFormAssembler(mixed_form, tensor=tensor, diagonal=True, - form_compiler_parameters=fcp).assemble] - return cache.setdefault(key, (coefficients, assembly_callables)) + # Return coefficients and assembly callables + coefficients = {} + assembly_callables = [] + if block_diagonal and V.shape: + from firedrake.assemble import assemble + M = assemble(mixed_form, mat_type="matfree", form_compiler_parameters=fcp) + for iset, name in zip(Z.dof_dset.field_ises, 
("beta", "alpha")): + sub = M.petscmat.createSubMatrix(iset, iset) + ctx = sub.getPythonContext() + coefficients[name] = ctx._block_diagonal + assembly_callables.append(ctx._assemble_block_diagonal) + else: + from firedrake.assemble import OneFormAssembler + tensor = Function(Z) + coefficients["beta"] = tensor.subfunctions[0] + coefficients["alpha"] = tensor.subfunctions[1] + assembly_callables.append(OneFormAssembler(mixed_form, tensor=tensor, diagonal=True, + form_compiler_parameters=fcp).assemble) + return coefficients, assembly_callables @PETSc.Log.EventDecorator("FDMRefTensor") def assemble_reference_tensor(self, V, transpose=False): @@ -651,10 +660,9 @@ def assemble_reference_tensor(self, V, transpose=False): if is_interior: e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") - comm = PETSc.COMM_SELF - A00 = petsc_sparse(fiat_reference_prolongator(e0, eq), comm=comm) - A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=comm) - A11 = petsc_sparse(numpy.eye(e1.space_dimension(), dtype=PETSc.RealType), comm=comm) + A00 = petsc_sparse(fiat_reference_prolongator(e0, eq), comm=PETSc.COMM_SELF) + A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=PETSc.COMM_SELF) + A11 = petsc_sparse(numpy.eye(e1.space_dimension(), dtype=PETSc.RealType), comm=PETSc.COMM_SELF) B_blocks = mass_blocks(tdim, formdegree, A00, A11) A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) result = block_mat(B_blocks + A_blocks, destroy_blocks=True) @@ -663,7 +671,7 @@ def assemble_reference_tensor(self, V, transpose=False): A11.destroy() if value_size != 1: - eye = petsc_sparse(numpy.eye(value_size), comm=comm) + eye = petsc_sparse(numpy.eye(value_size), comm=result.getComm()) temp = result result = temp.kron(eye) temp.destroy() @@ -679,26 +687,26 @@ def assemble_reference_tensor(self, V, transpose=False): @PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_diagonal(submats): +def schur_complement_diagonal(submats, result=None): """ Used in static condensation. Take in blocks A00, A01, A10, A11, return the Schur complement A11 - A10 * inv(A00) * A01. Assumes A00 is diagonal. """ - structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + structure = PETSc.Mat.Structure.SUBSET if result else None A00, A01, A10, A11 = submats[:4] submats[4] = A00.getDiagonal(result=submats[4]) submats[4].reciprocal() submats[4].scale(-1) A01.diagonalScale(L=submats[4]) - submats[-1] = A10.matMult(A01, result=submats[-1]) - submats[-1].axpy(1.0, A11, structure=structure) - return submats[-1] + result = A10.matMult(A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result @PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_inv(submats): +def schur_complement_block_inv(submats, result=None): """ Used in static condensation. Take in blocks A00, A01, A10, A11, return A11 - A10 * inv(A00) * A01. @@ -706,17 +714,16 @@ def schur_complement_block_inv(submats): Assumes that interior DOFs have been reordered to make A00 block diagonal with blocks of increasing dimension. 
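A dense NumPy analogue of the quantity every schur_complement_* variant returns may help fix ideas; the sizes below are arbitrary and the SPD matrix is a stand-in, whereas the patch computes the same complement block-wise and in place on PETSc CSR data.

    import numpy as np

    rng = np.random.default_rng(0)
    ni, nf = 6, 4                                # interior and facet dofs of one cell
    A = rng.standard_normal((ni + nf, ni + nf))
    A = A @ A.T + (ni + nf) * np.eye(ni + nf)    # symmetric positive definite stand-in
    A00, A01 = A[:ni, :ni], A[:ni, ni:]
    A10, A11 = A[ni:, :ni], A[ni:, ni:]
    S = A11 - A10 @ np.linalg.solve(A00, A01)    # Schur complement onto the facet dofs
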
""" - structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + structure = PETSc.Mat.Structure.SUBSET if result else None A00, A01, A10, A11 = submats[:4] indptr, indices, R = A00.getValuesCSR() - degree = numpy.diff(indptr) - - nblocks = numpy.count_nonzero(degree == 1) + degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) + istart = degree[0] == 1 + nblocks = counts[0] if istart else 0 zlice = slice(0, nblocks) numpy.reciprocal(R[zlice], out=R[zlice]) flops = nblocks - for k in range(2, degree[-1]+1): - nblocks = numpy.count_nonzero(degree == k) + for k, nblocks in zip(degree[istart:], counts[istart:]): zlice = slice(zlice.stop, zlice.stop + k*nblocks) A = R[zlice].reshape((-1, k, k)) R[zlice] = numpy.linalg.inv(A).reshape((-1,)) @@ -726,13 +733,13 @@ def schur_complement_block_inv(submats): A00.setValuesCSR(indptr, indices, R) A00.assemble() A00.scale(-1.0) - submats[-1] = A10.matMatMult(A00, A01, result=submats[-1]) - submats[-1].axpy(1.0, A11, structure=structure) - return submats[-1] + result = A10.matMatMult(A00, A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result @PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_cholesky(submats): +def schur_complement_block_cholesky(submats, result=None): """ Used in static condensation. Take in blocks A00, A01, A10, A11, return A11 - A10 * inv(A00) * A01. @@ -740,18 +747,17 @@ def schur_complement_block_cholesky(submats): Assumes that interior DOFs have been reordered to make A00 block diagonal with blocks of increasing dimension. """ - structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + structure = PETSc.Mat.Structure.SUBSET if result else None A00, A01, A10, A11 = submats[:4] indptr, indices, R = A00.getValuesCSR() - degree = numpy.diff(indptr) - - nblocks = numpy.count_nonzero(degree == 1) + degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) + istart = degree[0] == 1 + nblocks = counts[0] if istart else 0 zlice = slice(0, nblocks) numpy.sqrt(R[zlice], out=R[zlice]) numpy.reciprocal(R[zlice], out=R[zlice]) flops = 2*nblocks - for k in range(2, degree[-1]+1): - nblocks = numpy.count_nonzero(degree == k) + for k, nblocks in zip(degree[istart:], counts[istart:]): zlice = slice(zlice.stop, zlice.stop + k*nblocks) A = R[zlice].reshape((-1, k, k)) R[zlice] = numpy.linalg.inv(numpy.linalg.cholesky(A)).reshape((-1)) @@ -762,13 +768,13 @@ def schur_complement_block_cholesky(submats): A00.assemble() submats[4] = A10.matTransposeMult(A00, result=submats[4]) A00.scale(-1.0) - submats[-1] = submats[4].matMatMult(A00, A01, result=submats[-1]) - submats[-1].axpy(1.0, A11, structure=structure) - return submats[-1] + result = submats[4].matMatMult(A00, A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result @PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_qr(submats): +def schur_complement_block_qr(submats, result=None): """ Used in static condensation. Take in blocks A00, A01, A10, A11, return A11 - A10 * inv(A00) * A01. @@ -776,18 +782,18 @@ def schur_complement_block_qr(submats): Assumes that interior DOFs have been reordered to make A00 block diagonal with blocks of increasing dimension. 
""" - structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + structure = PETSc.Mat.Structure.SUBSET if result else None A00, A01, A10, A11 = submats[:4] indptr, indices, R = A00.getValuesCSR() - degree = numpy.diff(indptr) Q = numpy.ones(R.shape, dtype=R.dtype) - nblocks = numpy.count_nonzero(degree == 1) + degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) + istart = degree[0] == 1 + nblocks = counts[0] if istart else 0 zlice = slice(0, nblocks) numpy.reciprocal(R[zlice], out=R[zlice]) flops = nblocks - for k in range(2, degree[-1]+1): - nblocks = numpy.count_nonzero(degree == k) + for k, nblocks in zip(degree[istart:], counts[istart:]): zlice = slice(zlice.stop, zlice.stop + k*nblocks) A = R[zlice].reshape((-1, k, k)) q, r = numpy.linalg.qr(A, mode="complete") @@ -802,13 +808,13 @@ def schur_complement_block_qr(submats): A00.setValuesCSR(indptr, indices, R) A00.assemble() A00.scale(-1.0) - submats[-1] = A10.matMatMult(A00, submats[4], result=submats[-1]) - submats[-1].axpy(1.0, A11, structure=structure) - return submats[-1] + result = A10.matMatMult(A00, submats[4], result=result) + result.axpy(1.0, A11, structure=structure) + return result @PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_svd(submats): +def schur_complement_block_svd(submats, result=None): """ Used in static condensation. Take in blocks A00, A01, A10, A11, return A11 - A10 * inv(A00) * A01. @@ -816,22 +822,21 @@ def schur_complement_block_svd(submats): Assumes that interior DOFs have been reordered to make A00 block diagonal with blocks of increasing dimension. """ - structure = PETSc.Mat.Structure.SUBSET if submats[-1] else None + structure = PETSc.Mat.Structure.SUBSET if result else None A00, A01, A10, A11 = submats[:4] indptr, indices, U = A00.getValuesCSR() - degree = numpy.diff(indptr) V = numpy.ones(U.shape, dtype=U.dtype) submats[4] = A00.getDiagonal(result=submats[4]) D = submats[4] - nblocks = numpy.count_nonzero(degree == 1) + degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) + istart = degree[0] == 1 + nblocks = counts[0] if istart else 0 bslice = slice(0, nblocks) dslice = slice(0, nblocks) numpy.sign(D.array_r[dslice], out=U[bslice]) - flops = nblocks - for k in range(2, degree[-1]+1): - nblocks = numpy.count_nonzero(degree == k) + for k, nblocks in zip(degree[istart:], counts[istart:]): bslice = slice(bslice.stop, bslice.stop + k*nblocks) dslice = slice(dslice.stop, dslice.stop + nblocks) A = U[bslice].reshape((-1, k, k)) @@ -853,32 +858,32 @@ def schur_complement_block_svd(submats): A00.setValuesCSR(indptr, indices, U) A00.assemble() A00.diagonalScale(L=D) - submats[-1] = submats[5].matMatMult(A00, A01, result=submats[-1]) - submats[-1].axpy(1.0, A11, structure=structure) - return submats[-1] + result = submats[5].matMatMult(A00, A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result @PETSc.Log.EventDecorator("FDMCondense") -def condense_element_mat(A, i0, i1, submats, get_schur_complement): +def condense_element_mat(get_schur_complement, i0, i1, submats, A, result=None): """Return the Schur complement associated to indices in i1, condensing i0 out""" isrows = [i0, i0, i1, i1] iscols = [i0, i1, i0, i1] submats[:4] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:4] if submats[0] else None) - return get_schur_complement(submats) + return get_schur_complement(submats, result=result) @PETSc.Log.EventDecorator("FDMCondense") -def condense_element_pattern(A, i0, i1, submats): +def 
condense_element_pattern(i0, i1, submats, A, result=None): """Add zeroes on the statically condensed pattern so that you can run ICC(0)""" + structure = PETSc.Mat.Structure.SUBSET if result else None isrows = [i0, i0, i1] iscols = [i0, i1, i0] - structure = PETSc.Mat.Structure.SUBSET if submats[3] else None submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) A00, A01, A10 = submats[:3] A00.scale(0.0) - submats[3] = A10.matMatMult(A00, A01, result=submats[3]) - submats[3].axpy(1.0, A, structure=structure) - return submats[3] + result = A10.matMatMult(A00, A01, result=result) + result.axpy(1.0, A, structure=structure) + return result @PETSc.Log.EventDecorator("LoadCode") @@ -900,7 +905,7 @@ def get_pointer(obj): @PETSc.Log.EventDecorator(name) def wrapper(*args): - return funptr(*list(map(get_pointer, args))) + return funptr(*map(get_pointer, args)) return wrapper @@ -978,17 +983,18 @@ def sort_interior_dofs(idofs, A): increasing dimension along its diagonal.""" Aii = A.createSubMatrix(idofs, idofs) indptr, indices, _ = Aii.getValuesCSR() - n = idofs.getSize() - visit = numpy.zeros((n, ), dtype=bool) + degree = numpy.diff(indptr) perm = [] - degree = 0 - while not visit.all(): - degree += 1 - for i in range(n): - if not visit[i]: - neigh = indices[slice(*indptr[i:i+2])] - if len(neigh) == degree: - visit[neigh] = True + for k in sorted(numpy.unique(degree)): + if k == 1: + neigh = numpy.flatnonzero(degree == k) + degree[neigh] = 0 + perm.extend(neigh) + else: + for i in range(len(degree)): + if degree[i] == k: + neigh = indices[slice(*indptr[i:i+2])] + degree[neigh] = 0 perm.extend(neigh) idofs.setIndices(idofs.getIndices()[perm]) Aii.destroy() @@ -1127,9 +1133,9 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None): scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e fdofs = restricted_dofs(ef, create_element(unrestrict_element(scalar_element(Vf.ufl_element())))) cdofs = restricted_dofs(ec, create_element(unrestrict_element(scalar_element(Vc.ufl_element())))) - fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) - cises = PETSc.IS().createGeneral(cdofs, comm=PETSc.COMM_SELF) temp = Dhat + fises = PETSc.IS().createGeneral(fdofs, comm=temp.getComm()) + cises = PETSc.IS().createGeneral(cdofs, comm=temp.getComm()) Dhat = temp.createSubMatrix(fises, cises) temp.destroy() fises.destroy() @@ -1137,7 +1143,7 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None): if Vf.value_size > 1: temp = Dhat - eye = petsc_sparse(numpy.eye(Vf.value_size, dtype=PETSc.RealType), comm=PETSc.COMM_SELF) + eye = petsc_sparse(numpy.eye(Vf.value_size, dtype=PETSc.RealType), comm=temp.getComm()) Dhat = temp.kron(eye) temp.destroy() eye.destroy() diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 1104c8bb91..42b4319eab 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -226,20 +226,26 @@ def test_ipdg_direct_solver(fs): x = SpatialCoordinate(mesh) gdim = mesh.geometric_dimension() ncomp = fs.ufl_element().value_size() - u_exact = dot(x, x) - if ncomp: - u_exact = as_vector([u_exact + Constant(k) for k in range(ncomp)]) + + homogenize = gdim > 2 + if homogenize: + rg = RandomGenerator(PCG64(seed=123456789)) + uh = rg.uniform(fs, -1, 1) + u_exact = zero(uh.ufl_shape) + u_bc = 0 + else: + uh = Function(fs) + u_exact = dot(x, x) + if ncomp: + u_exact = as_vector([u_exact + Constant(k) for k in range(ncomp)]) + u_bc = 
u_exact degree = fs.ufl_element().degree() try: - degree, = set(degree) + degree = max(degree) except TypeError: pass - quad_degree = 2*(degree+1)-1 - uh = Function(fs) - u = TrialFunction(fs) - v = TestFunction(fs) # problem coefficients A1 = diag(Constant(range(1, gdim+1))) @@ -247,19 +253,13 @@ def test_ipdg_direct_solver(fs): alpha = lambda grad_u: dot(dot(A2, grad_u), A1) beta = diag(Constant(range(2, ncomp+2))) - n = FacetNormal(mesh) - f_exact = alpha(grad(u_exact)) - B = dot(beta, u_exact) - div(f_exact) - T = dot(f_exact, n) - extruded = mesh.cell_set._extruded subs = (1,) if gdim > 1: subs += (3,) if extruded: subs += ("top",) - - bcs = [DirichletBC(fs, u_exact, sub) for sub in subs] + bcs = [DirichletBC(fs, u_bc, sub) for sub in subs] dirichlet_ids = subs if "on_boundary" in dirichlet_ids: @@ -287,24 +287,31 @@ def test_ipdg_direct_solver(fs): ds_Dir = sum(ds_Dir, ds(tuple())) ds_Neu = sum(ds_Neu, ds(tuple())) + n = FacetNormal(mesh) + h = CellVolume(mesh) / FacetArea(mesh) eta = Constant((degree+1)**2) - h = CellVolume(mesh)/FacetArea(mesh) - penalty = eta/h + penalty = eta / h - outer_jump = lambda w, n: outer(w("+"), n("+")) + outer(w("-"), n("-")) - num_flux = lambda w: alpha(avg(penalty/2) * outer_jump(w, n)) - num_flux_b = lambda w: alpha((penalty/2) * outer(w, n)) + num_flux = lambda u: avg(penalty) * avg(outer(u, n)) + num_flux_b = lambda u: (penalty/2) * outer(u, n) + alpha_inner = lambda v, u: inner(v, alpha(u)) - a = (inner(v, dot(beta, u)) * dxq - + inner(grad(v), alpha(grad(u))) * dxq - + inner(outer_jump(v, n), num_flux(u)-avg(alpha(grad(u)))) * dS_int - + inner(outer_jump(u, n), num_flux(v)-avg(alpha(grad(v)))) * dS_int - + inner(outer(v, n), num_flux_b(u)-alpha(grad(u))) * ds_Dir - + inner(outer(u, n), num_flux_b(v)-alpha(grad(v))) * ds_Dir) + a_int = lambda v, u: alpha_inner(2 * avg(outer(v, n)), num_flux(u) - avg(grad(u))) * dS_int + a_Dir = lambda v, u: alpha_inner(outer(v, n), num_flux_b(u) - grad(u)) * ds_Dir - L = (inner(v, B)*dxq - + inner(v, T)*ds_Neu - + inner(outer(u_exact, n), 2*num_flux_b(v)-alpha(grad(v))) * ds_Dir) + u = TrialFunction(fs) + v = TestFunction(fs) + a = ((inner(v, dot(beta, u)) + alpha_inner(grad(v), grad(u))) * dxq + + a_int(v, u) + a_int(u, v) + a_Dir(v, u) + a_Dir(u, v)) + + if homogenize: + L = 0 + else: + f_exact = alpha(grad(u_exact)) + B = dot(beta, u_exact) - div(f_exact) + T = dot(f_exact, n) + L = (inner(v, B)*dxq + inner(v, T)*ds_Neu + + alpha_inner(outer(u_exact, n), 2*num_flux_b(v) - grad(v)) * ds_Dir) problem = LinearVariationalProblem(a, L, uh, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters={ @@ -324,4 +331,8 @@ def test_ipdg_direct_solver(fs): solver.solve() assert solver.snes.ksp.getIterationNumber() == 1 - assert norm(u_exact-uh, "H1") < 1.0E-8 + if homogenize: + with uh.dat.vec_ro as uvec: + assert uvec.norm() < 1E-8 + else: + assert norm(u_exact-uh, "H1") < 1.0E-8 From 24dbf065fb7eb4a2c2a0ca1615c351239254cbe2 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 3 Apr 2023 08:56:02 +0100 Subject: [PATCH 57/75] cast bool to int --- firedrake/preconditioners/fdm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 058d4ddeac..922d3eb23e 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -718,7 +718,7 @@ def schur_complement_block_inv(submats, result=None): A00, A01, A10, A11 = submats[:4] indptr, indices, R = A00.getValuesCSR() degree, counts = 
numpy.unique(numpy.diff(indptr), return_counts=True) - istart = degree[0] == 1 + istart = int(degree[0] == 1) nblocks = counts[0] if istart else 0 zlice = slice(0, nblocks) numpy.reciprocal(R[zlice], out=R[zlice]) @@ -751,7 +751,7 @@ def schur_complement_block_cholesky(submats, result=None): A00, A01, A10, A11 = submats[:4] indptr, indices, R = A00.getValuesCSR() degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) - istart = degree[0] == 1 + istart = int(degree[0] == 1) nblocks = counts[0] if istart else 0 zlice = slice(0, nblocks) numpy.sqrt(R[zlice], out=R[zlice]) @@ -788,7 +788,7 @@ def schur_complement_block_qr(submats, result=None): Q = numpy.ones(R.shape, dtype=R.dtype) degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) - istart = degree[0] == 1 + istart = int(degree[0] == 1) nblocks = counts[0] if istart else 0 zlice = slice(0, nblocks) numpy.reciprocal(R[zlice], out=R[zlice]) @@ -830,7 +830,7 @@ def schur_complement_block_svd(submats, result=None): D = submats[4] degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) - istart = degree[0] == 1 + istart = int(degree[0] == 1) nblocks = counts[0] if istart else 0 bslice = slice(0, nblocks) dslice = slice(0, nblocks) From 2747d63798d74199c40c79b2781fc7e40e78d971 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 3 Apr 2023 14:26:02 +0100 Subject: [PATCH 58/75] SchurComplementBuilder class --- firedrake/preconditioners/fdm.py | 471 +++++++++++++++---------------- tests/regression/test_fdm.py | 15 +- 2 files changed, 240 insertions(+), 246 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 922d3eb23e..5b4d4e8c61 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -232,27 +232,24 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): value_size = Vbig.value_size if value_size != 1: fdofs = numpy.add.outer(value_size * fdofs, numpy.arange(value_size, dtype=fdofs.dtype)) + self.fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) dofs = numpy.arange(value_size * Vbig.finat_element.space_dimension(), dtype=fdofs.dtype) idofs = numpy.setdiff1d(dofs, fdofs, assume_unique=True) - self.ises = [PETSc.IS().createGeneral(indices, comm=PETSc.COMM_SELF) for indices in (idofs, fdofs)] - self.submats = [None for _ in range(6)] # Dictionary with the parent space and a method to form the Schur complement self.get_static_condensation = {} if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block if Vfacet.finat_element.formdegree == 0 and value_size == 1: - default_schur = schur_complement_diagonal + sc_builder = SchurComplementDiagonal elif pmat_type.endswith("sbaij"): - default_schur = schur_complement_block_cholesky + sc_builder = SchurComplementBlockCholesky else: - default_schur = schur_complement_block_qr - self.get_static_condensation[Vfacet] = Vbig, partial(condense_element_mat, default_schur, - self.ises[0], self.ises[1], self.submats) + sc_builder = SchurComplementBlockQR + self.get_static_condensation[Vfacet] = Vbig, sc_builder(idofs, fdofs, PETSc.COMM_SELF).condense elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern - self.ises.append(PETSc.IS().createGeneral(dofs, comm=PETSc.COMM_SELF)) - self.get_static_condensation[V] = Vbig, partial(condense_element_pattern, self.ises[0], self.ises[2], self.submats) + 
self.get_static_condensation[V] = Vbig, SchurComplementPattern(idofs, dofs, PETSc.COMM_SELF).condense @PETSc.Log.EventDecorator("FDMGetIndices") def cell_to_global(lgmap, cell_to_local, cell_index, result=None): @@ -387,21 +384,14 @@ def view(self, pc, viewer=None): self.pc.view(viewer) def destroy(self, pc): - objs = [] if hasattr(self, "A"): - objs.append(self.A) + self.A.petscmat.destroy() if hasattr(self, "pc"): - objs.append(self.pc) - objs.append(self.pc.getOperators()[-1]) - if hasattr(self, "submats"): - objs.extend(self.submats) + self.pc.getOperators()[-1].destroy() + self.pc.destroy() if hasattr(self, "work_mats"): - objs.extend(self.work_mats.values()) - if hasattr(self, "ises"): - objs.extend(self.ises) - for obj in objs: - if isinstance(obj, PETSc.Object): - obj.destroy() + for mat in self.work_mats.values(): + mat.destroy() @cached_property def _element_mass_matrix(self): @@ -469,7 +459,6 @@ def get_key(*args): try: Se = self.work_mats[key] except KeyError: - sort_interior_dofs(self.ises[0], Ae) Se = self.work_mats.setdefault(key, condense_element_mat(Ae)) insert = PETSc.InsertMode.INSERT @@ -648,7 +637,7 @@ def assemble_reference_tensor(self, V, transpose=False): if is_facet and full_key in cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) - result = result.createSubMatrix(noperm, self.ises[1]) + result = result.createSubMatrix(noperm, self.fises) noperm.destroy() return cache.setdefault(key, result) @@ -686,204 +675,232 @@ def assemble_reference_tensor(self, V, transpose=False): return cache.setdefault(key, result) -@PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_diagonal(submats, result=None): - """ - Used in static condensation. Take in blocks A00, A01, A10, A11, - return the Schur complement A11 - A10 * inv(A00) * A01. - - Assumes A00 is diagonal. - """ - structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = submats[:4] - submats[4] = A00.getDiagonal(result=submats[4]) - submats[4].reciprocal() - submats[4].scale(-1) - A01.diagonalScale(L=submats[4]) - result = A10.matMult(A01, result=result) - result.axpy(1.0, A11, structure=structure) - return result - - -@PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_inv(submats, result=None): - """ - Used in static condensation. Take in blocks A00, A01, A10, A11, - return A11 - A10 * inv(A00) * A01. - - Assumes that interior DOFs have been reordered to make A00 - block diagonal with blocks of increasing dimension. - """ - structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = submats[:4] - indptr, indices, R = A00.getValuesCSR() - degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) - istart = int(degree[0] == 1) - nblocks = counts[0] if istart else 0 - zlice = slice(0, nblocks) - numpy.reciprocal(R[zlice], out=R[zlice]) - flops = nblocks - for k, nblocks in zip(degree[istart:], counts[istart:]): - zlice = slice(zlice.stop, zlice.stop + k*nblocks) - A = R[zlice].reshape((-1, k, k)) - R[zlice] = numpy.linalg.inv(A).reshape((-1,)) - flops += nblocks * (k**3) - - PETSc.Log.logFlops(flops) - A00.setValuesCSR(indptr, indices, R) - A00.assemble() - A00.scale(-1.0) - result = A10.matMatMult(A00, A01, result=result) - result.axpy(1.0, A11, structure=structure) - return result - - -@PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_cholesky(submats, result=None): - """ - Used in static condensation. 
Take in blocks A00, A01, A10, A11, - return A11 - A10 * inv(A00) * A01. - - Assumes that interior DOFs have been reordered to make A00 - block diagonal with blocks of increasing dimension. - """ - structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = submats[:4] - indptr, indices, R = A00.getValuesCSR() - degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) - istart = int(degree[0] == 1) - nblocks = counts[0] if istart else 0 - zlice = slice(0, nblocks) - numpy.sqrt(R[zlice], out=R[zlice]) - numpy.reciprocal(R[zlice], out=R[zlice]) - flops = 2*nblocks - for k, nblocks in zip(degree[istart:], counts[istart:]): - zlice = slice(zlice.stop, zlice.stop + k*nblocks) - A = R[zlice].reshape((-1, k, k)) - R[zlice] = numpy.linalg.inv(numpy.linalg.cholesky(A)).reshape((-1)) - flops += nblocks * ((k**3)//3 + k**3) - - PETSc.Log.logFlops(flops) - A00.setValuesCSR(indptr, indices, R) - A00.assemble() - submats[4] = A10.matTransposeMult(A00, result=submats[4]) - A00.scale(-1.0) - result = submats[4].matMatMult(A00, A01, result=result) - result.axpy(1.0, A11, structure=structure) - return result - - -@PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_qr(submats, result=None): +class SchurComplementBuilder(object): """ - Used in static condensation. Take in blocks A00, A01, A10, A11, - return A11 - A10 * inv(A00) * A01. - - Assumes that interior DOFs have been reordered to make A00 - block diagonal with blocks of increasing dimension. - """ - structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = submats[:4] - indptr, indices, R = A00.getValuesCSR() - Q = numpy.ones(R.shape, dtype=R.dtype) - - degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) - istart = int(degree[0] == 1) - nblocks = counts[0] if istart else 0 - zlice = slice(0, nblocks) - numpy.reciprocal(R[zlice], out=R[zlice]) - flops = nblocks - for k, nblocks in zip(degree[istart:], counts[istart:]): - zlice = slice(zlice.stop, zlice.stop + k*nblocks) - A = R[zlice].reshape((-1, k, k)) - q, r = numpy.linalg.qr(A, mode="complete") - Q[zlice] = q.reshape((-1,)) - R[zlice] = numpy.linalg.inv(r).reshape((-1,)) - flops += nblocks * ((4*k**3)//3 + k**3) - - PETSc.Log.logFlops(flops) - A00.setValuesCSR(indptr, indices, Q) - A00.assemble() - submats[4] = A00.transposeMatMult(A01, result=submats[4]) - A00.setValuesCSR(indptr, indices, R) - A00.assemble() - A00.scale(-1.0) - result = A10.matMatMult(A00, submats[4], result=result) - result.axpy(1.0, A11, structure=structure) - return result - - -@PETSc.Log.EventDecorator("FDMGetSchur") -def schur_complement_block_svd(submats, result=None): + Class to build element Schur complement. """ - Used in static condensation. Take in blocks A00, A01, A10, A11, - return A11 - A10 * inv(A00) * A01. - Assumes that interior DOFs have been reordered to make A00 - block diagonal with blocks of increasing dimension. 
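# [Editorial aside, not part of the patch] Each of these removed routines, and
# the SchurComplementBuilder kernels that replace them in this commit, computes
# the same element Schur complement A11 - A10 * inv(A00) * A01 by a different
# factorization of A00. A dense numpy reference, assuming the interior dofs
# are ordered first (hypothetical helper, for illustration only):
import numpy as np

def dense_condense(A, ni):
    A00, A01 = A[:ni, :ni], A[:ni, ni:]
    A10, A11 = A[ni:, :ni], A[ni:, ni:]
    return A11 - A10 @ np.linalg.solve(A00, A01)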
- """ - structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = submats[:4] - indptr, indices, U = A00.getValuesCSR() - V = numpy.ones(U.shape, dtype=U.dtype) - submats[4] = A00.getDiagonal(result=submats[4]) - D = submats[4] - - degree, counts = numpy.unique(numpy.diff(indptr), return_counts=True) - istart = int(degree[0] == 1) - nblocks = counts[0] if istart else 0 - bslice = slice(0, nblocks) - dslice = slice(0, nblocks) - numpy.sign(D.array_r[dslice], out=U[bslice]) - flops = nblocks - for k, nblocks in zip(degree[istart:], counts[istart:]): - bslice = slice(bslice.stop, bslice.stop + k*nblocks) - dslice = slice(dslice.stop, dslice.stop + nblocks) - A = U[bslice].reshape((-1, k, k)) - - u, s, v = numpy.linalg.svd(A, full_matrices=False) - D.array_w[dslice] = s.reshape((-1,)) - U[bslice] = numpy.transpose(u, axes=(0, 2, 1)).reshape((-1,)) - V[bslice] = numpy.transpose(v, axes=(0, 2, 1)).reshape((-1,)) - flops += nblocks * ((4*k**3)//3 + 4*k**3) - - PETSc.Log.logFlops(flops) - D.sqrtabs() - D.reciprocal() - A00.setValuesCSR(indptr, indices, V) - A00.assemble() - A00.diagonalScale(R=D) - submats[5] = A10.matMult(A00, result=submats[5]) - D.scale(-1.0) - A00.setValuesCSR(indptr, indices, U) - A00.assemble() - A00.diagonalScale(L=D) - result = submats[5].matMatMult(A00, A01, result=result) - result.axpy(1.0, A11, structure=structure) - return result + def __init__(self, idofs, fdofs, comm): + i0 = PETSc.IS().createGeneral(idofs, comm=comm) + i1 = PETSc.IS().createGeneral(fdofs, comm=comm) + self.ises = (i0, i1) + self.isrows = [i0, i0, i1, i1] + self.iscols = [i0, i1, i0, i1] + self.work = [None for _ in range(2)] + self.submats = [] + self.slices = {} + + def __del__(self): + self.ises[0].destroy() + self.ises[1].destroy() + for mat in self.submats: + if isinstance(mat, PETSc.Object): + mat.destroy() + for obj in self.work: + if isinstance(obj, PETSc.Object): + obj.destroy() + def sort_interior_dofs(self, idofs, A): + """Permute `idofs` to have A[idofs, idofs] with square blocks of + increasing dimension along its diagonal.""" + Aii = A.createSubMatrix(idofs, idofs) + indptr, indices, _ = Aii.getValuesCSR() + degree = numpy.diff(indptr) + + perm = list(numpy.flatnonzero(degree == 1)) + degree[perm] = 0 + + iend = len(perm) + if iend: + self.slices[1] = slice(0, iend) + + for k in sorted(numpy.unique(degree)): + if k > 1: + nblocks = 0 + for i in numpy.flatnonzero(degree == k): + if degree[i] == k: + block = indices[slice(*indptr[i:i+2])] + degree[block] = 0 + perm.extend(block) + nblocks += 1 + + istart = iend + iend += k * k * nblocks + self.slices[k] = slice(istart, iend) + + idofs.setIndices(idofs.getIndices()[perm]) + Aii.destroy() + + def get_blocks(self, A): + if len(self.submats) == 0: + self.sort_interior_dofs(self.ises[0], A) + self.submats = A.createSubMatrices(self.isrows, iscols=self.iscols, submats=self.submats or None) + return self.submats + + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, A, result=None): + return result + + +class SchurComplementDiagonal(SchurComplementBuilder): + + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, A, result=None): + structure = PETSc.Mat.Structure.SUBSET if result else None + A00, A01, A10, A11 = self.get_blocks(A) + self.work[0] = A00.getDiagonal(result=self.work[0]) + self.work[0].reciprocal() + self.work[0].scale(-1) + A01.diagonalScale(L=self.work[0]) + result = A10.matMult(A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result + + +class 
SchurComplementPattern(SchurComplementBuilder): + + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, A, result=None): + structure = PETSc.Mat.Structure.SUBSET if result else None + if result is None: + A00, A01, A10, _ = self.get_blocks(A) + result = A10.matMatMult(A00, A01, result=result) + result.aypx(0.0, A, structure=structure) + return result + + +class SchurComplementBlockCholesky(SchurComplementBuilder): + + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, A, result=None): + structure = PETSc.Mat.Structure.SUBSET if result else None + A00, A01, A10, A11 = self.get_blocks(A) + indptr, indices, R = A00.getValuesCSR() + + flops = 0 + for k in sorted(self.slices): + zlice = self.slices[k] + if k == 1: + numpy.sqrt(R[zlice], out=R[zlice]) + numpy.reciprocal(R[zlice], out=R[zlice]) + flops += 2 * (zlice.stop - zlice.start) + else: + A = R[zlice].reshape((-1, k, k)) + R[zlice] = numpy.linalg.inv(numpy.linalg.cholesky(A)).reshape((-1)) + flops += A.shape[0] * ((k**3)//3 + k**3) + + PETSc.Log.logFlops(flops) + A00.setValuesCSR(indptr, indices, R) + A00.assemble() + self.work[0] = A10.matTransposeMult(A00, result=self.work[0]) + A00.scale(-1.0) + result = self.work[0].matMatMult(A00, A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result + + +class SchurComplementBlockQR(SchurComplementBuilder): + + @PETSc.Log.EventDecorator("FDMGetSchur") + def condense(self, A, result=None): + structure = PETSc.Mat.Structure.SUBSET if result else None + A00, A01, A10, A11 = self.get_blocks(A) + indptr, indices, R = A00.getValuesCSR() + Q = numpy.ones(R.shape, dtype=R.dtype) + + flops = 0 + for k in sorted(self.slices): + zlice = self.slices[k] + if k == 1: + numpy.reciprocal(R[zlice], out=R[zlice]) + flops += zlice.stop - zlice.start + else: + A = R[zlice].reshape((-1, k, k)) + q, r = numpy.linalg.qr(A, mode="complete") + Q[zlice] = q.reshape((-1,)) + R[zlice] = numpy.linalg.inv(r).reshape((-1,)) + flops += A.shape[0] * ((4*k**3)//3 + k**3) + + PETSc.Log.logFlops(flops) + A00.setValuesCSR(indptr, indices, Q) + A00.assemble() + self.work[0] = A00.transposeMatMult(A01, result=self.work[0]) + A00.setValuesCSR(indptr, indices, R) + A00.assemble() + A00.scale(-1.0) + result = A10.matMatMult(A00, self.work[0], result=result) + result.axpy(1.0, A11, structure=structure) + return result + + +class SchurComplementBlockSVD(SchurComplementBuilder): + + @PETSc.Log.EventDecorator("FDMGetSchur") + def condense(self, A, result=None): + structure = PETSc.Mat.Structure.SUBSET if result else None + A00, A01, A10, A11 = self.get_blocks(A) + indptr, indices, U = A00.getValuesCSR() + V = numpy.ones(U.shape, dtype=U.dtype) + self.work[0] = A00.getDiagonal(result=self.work[0]) + D = self.work[0] + dslice = self.slices.get(1, slice(0, 0)) + flops = 0 + for k in sorted(self.slices): + bslice = self.slices[k] + if k == 1: + numpy.sign(D.array_r[bslice], out=U[bslice]) + flops += bslice.stop - bslice.start + else: + A = U[bslice].reshape((-1, k, k)) + u, s, v = numpy.linalg.svd(A, full_matrices=False) + dslice = slice(dslice.stop, dslice.stop + k * A.shape[0]) + D.array_w[dslice] = s.reshape((-1,)) + U[bslice] = numpy.transpose(u, axes=(0, 2, 1)).reshape((-1,)) + V[bslice] = numpy.transpose(v, axes=(0, 2, 1)).reshape((-1,)) + flops += A.shape[0] * ((4*k**3)//3 + 4*k**3) + + PETSc.Log.logFlops(flops) + D.sqrtabs() + D.reciprocal() + A00.setValuesCSR(indptr, indices, V) + A00.assemble() + A00.diagonalScale(R=D) + self.work[1] = A10.matMult(A00, result=self.work[1]) + D.scale(-1.0) + 
A00.setValuesCSR(indptr, indices, U) + A00.assemble() + A00.diagonalScale(L=D) + result = self.work[1].matMatMult(A00, A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result + + +class SchurComplementBlockInverse(SchurComplementBuilder): + + @PETSc.Log.EventDecorator("FDMGetSchur") + def condense(self, A, result=None): + structure = PETSc.Mat.Structure.SUBSET if result else None + A00, A01, A10, A11 = self.get_blocks(A) + indptr, indices, R = A00.getValuesCSR() + + flops = 0 + for k in sorted(self.slices): + zlice = self.slices[k] + if k == 1: + numpy.reciprocal(R[zlice], out=R[zlice]) + flops += zlice.stop - zlice.start + else: + A = R[zlice].reshape((-1, k, k)) + R[zlice] = numpy.linalg.inv(A).reshape((-1,)) + flops += A.shape[0] * (k**3) -@PETSc.Log.EventDecorator("FDMCondense") -def condense_element_mat(get_schur_complement, i0, i1, submats, A, result=None): - """Return the Schur complement associated to indices in i1, condensing i0 out""" - isrows = [i0, i0, i1, i1] - iscols = [i0, i1, i0, i1] - submats[:4] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:4] if submats[0] else None) - return get_schur_complement(submats, result=result) - - -@PETSc.Log.EventDecorator("FDMCondense") -def condense_element_pattern(i0, i1, submats, A, result=None): - """Add zeroes on the statically condensed pattern so that you can run ICC(0)""" - structure = PETSc.Mat.Structure.SUBSET if result else None - isrows = [i0, i0, i1] - iscols = [i0, i1, i0] - submats[:3] = A.createSubMatrices(isrows, iscols=iscols, submats=submats[:3] if submats[0] else None) - A00, A01, A10 = submats[:3] - A00.scale(0.0) - result = A10.matMatMult(A00, A01, result=result) - result.axpy(1.0, A, structure=structure) - return result + PETSc.Log.logFlops(flops) + A00.setValuesCSR(indptr, indices, R) + A00.assemble() + A00.scale(-1.0) + result = A10.matMatMult(A00, A01, result=result) + result.axpy(1.0, A11, structure=structure) + return result @PETSc.Log.EventDecorator("LoadCode") @@ -978,28 +995,6 @@ def is_restricted(finat_element): return is_interior, is_facet -def sort_interior_dofs(idofs, A): - """Permute `idofs` to have A[idofs, idofs] with square blocks of - increasing dimension along its diagonal.""" - Aii = A.createSubMatrix(idofs, idofs) - indptr, indices, _ = Aii.getValuesCSR() - degree = numpy.diff(indptr) - perm = [] - for k in sorted(numpy.unique(degree)): - if k == 1: - neigh = numpy.flatnonzero(degree == k) - degree[neigh] = 0 - perm.extend(neigh) - else: - for i in range(len(degree)): - if degree[i] == k: - neigh = indices[slice(*indptr[i:i+2])] - degree[neigh] = 0 - perm.extend(neigh) - idofs.setIndices(idofs.getIndices()[perm]) - Aii.destroy() - - def petsc_sparse(A_numpy, rtol=1E-10, comm=None): """Convert dense numpy matrix into a sparse PETSc matrix""" atol = rtol * max(A_numpy.min(), A_numpy.max(), key=abs) diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py index 42b4319eab..3ad7db838e 100644 --- a/tests/regression/test_fdm.py +++ b/tests/regression/test_fdm.py @@ -294,15 +294,14 @@ def test_ipdg_direct_solver(fs): num_flux = lambda u: avg(penalty) * avg(outer(u, n)) num_flux_b = lambda u: (penalty/2) * outer(u, n) - alpha_inner = lambda v, u: inner(v, alpha(u)) + a_int = lambda v, u: inner(2 * avg(outer(v, n)), alpha(num_flux(u) - avg(grad(u)))) + a_Dir = lambda v, u: inner(outer(v, n), alpha(num_flux_b(u) - grad(u))) - a_int = lambda v, u: alpha_inner(2 * avg(outer(v, n)), num_flux(u) - avg(grad(u))) * dS_int - a_Dir = lambda v, u: 
alpha_inner(outer(v, n), num_flux_b(u) - grad(u)) * ds_Dir - - u = TrialFunction(fs) v = TestFunction(fs) - a = ((inner(v, dot(beta, u)) + alpha_inner(grad(v), grad(u))) * dxq - + a_int(v, u) + a_int(u, v) + a_Dir(v, u) + a_Dir(u, v)) + u = TrialFunction(fs) + a = ((inner(v, dot(beta, u)) + inner(grad(v), alpha(grad(u)))) * dxq + + (a_int(v, u) + a_int(u, v)) * dS_int + + (a_Dir(v, u) + a_Dir(u, v)) * ds_Dir) if homogenize: L = 0 @@ -311,7 +310,7 @@ def test_ipdg_direct_solver(fs): B = dot(beta, u_exact) - div(f_exact) T = dot(f_exact, n) L = (inner(v, B)*dxq + inner(v, T)*ds_Neu - + alpha_inner(outer(u_exact, n), 2*num_flux_b(v) - grad(v)) * ds_Dir) + + inner(outer(u_exact, n), alpha(2*num_flux_b(v) - grad(v))) * ds_Dir) problem = LinearVariationalProblem(a, L, uh, bcs=bcs) solver = LinearVariationalSolver(problem, solver_parameters={ From 2ad64a5c75bf91a519f41f7b8eb2d0edade3394d Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 3 Apr 2023 14:37:46 +0100 Subject: [PATCH 59/75] small change --- firedrake/preconditioners/fdm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 5b4d4e8c61..549250eb5d 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -669,7 +669,7 @@ def assemble_reference_tensor(self, V, transpose=False): if is_facet: cache[full_key] = result noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) - result = result.createSubMatrix(noperm, self.ises[1]) + result = result.createSubMatrix(noperm, self.fises) noperm.destroy() return cache.setdefault(key, result) From 6fce996257a5942a2621521fdb17c7828d4517e2 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 4 Apr 2023 18:14:27 +0100 Subject: [PATCH 60/75] style --- firedrake/preconditioners/fdm.py | 210 ++++++++++++++++--------------- 1 file changed, 106 insertions(+), 104 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 549250eb5d..646feb7dbb 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -246,10 +246,10 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): sc_builder = SchurComplementBlockCholesky else: sc_builder = SchurComplementBlockQR - self.get_static_condensation[Vfacet] = Vbig, sc_builder(idofs, fdofs, PETSc.COMM_SELF).condense + self.get_static_condensation[Vfacet] = Vbig, sc_builder(idofs, fdofs).condense elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern - self.get_static_condensation[V] = Vbig, SchurComplementPattern(idofs, dofs, PETSc.COMM_SELF).condense + self.get_static_condensation[V] = Vbig, SchurComplementPattern(idofs, dofs).condense @PETSc.Log.EventDecorator("FDMGetIndices") def cell_to_global(lgmap, cell_to_local, cell_index, result=None): @@ -432,15 +432,8 @@ def get_key(*args): Vbig = None condense_element_mat = lambda Ae, result=None: Ae - set_submat = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) - get_rindices = self.cell_to_global[Vrow] if Vrow == Vcol: - get_cindices = lambda e, result=None: result - update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, rindices, addv) Vbig, condense_element_mat = self.get_static_condensation.get(Vrow, (Vbig, condense_element_mat)) - else: - get_cindices = self.cell_to_global[Vcol] - update_A = lambda Ae, rindices, cindices: set_submat(A, Ae, rindices, 
cindices, addv) Me = self._element_mass_matrix # Interpolation of basis and exterior derivative onto broken spaces @@ -461,6 +454,16 @@ def get_key(*args): except KeyError: Se = self.work_mats.setdefault(key, condense_element_mat(Ae)) + get_rindices = self.cell_to_global[Vrow] + rindices = numpy.empty(Se.getSize()[:1], dtype=PETSc.IntType) + if Vrow == Vcol: + get_cindices = lambda e, result=None: result + cindices = rindices + else: + get_cindices = self.cell_to_global[Vcol] + cindices = numpy.empty(Se.getSize()[1:], dtype=PETSc.IntType) + + setSubMatCSR = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) insert = PETSc.InsertMode.INSERT if A.getType() == PETSc.Mat.Type.PREALLOCATOR: # Empty kernel for preallocation @@ -487,15 +490,13 @@ def element_kernel(e, result=None): Me.assemble() return assemble_element_mat(result=result) - cindices = None - rindices = None # Core assembly loop for e in range(self.nel): - cindices = get_cindices(e, result=cindices) - rindices = get_rindices(e, result=rindices) + get_rindices(e, result=rindices) + get_cindices(e, result=cindices) Ae = element_kernel(e, result=Ae) Se = condense_element_mat(Ae, result=Se) - update_A(Se, rindices, cindices) + setSubMatCSR(A, Se, rindices, cindices, addv) @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coefficients(self, J, fcp, block_diagonal=True): @@ -677,63 +678,71 @@ def assemble_reference_tensor(self, V, transpose=False): class SchurComplementBuilder(object): """ - Class to build element Schur complement. + Class to build Schur complement matrices that reuses work matrices and the + symbolic factorization of the interior block. """ - def __init__(self, idofs, fdofs, comm): - i0 = PETSc.IS().createGeneral(idofs, comm=comm) - i1 = PETSc.IS().createGeneral(fdofs, comm=comm) - self.ises = (i0, i1) - self.isrows = [i0, i0, i1, i1] - self.iscols = [i0, i1, i0, i1] - self.work = [None for _ in range(2)] - self.submats = [] + def __init__(self, idofs, fdofs): + self.idofs = idofs + self.fdofs = fdofs self.slices = {} + self.ises = tuple() + self.isrows = [] + self.iscols = [] + self.submats = [] + self.work = [None for _ in range(2)] def __del__(self): - self.ises[0].destroy() - self.ises[1].destroy() - for mat in self.submats: - if isinstance(mat, PETSc.Object): - mat.destroy() + self.reset() + + def reset(self): + for obj in self.ises: + if isinstance(obj, PETSc.Object): + obj.destroy() + for obj in self.submats: + if isinstance(obj, PETSc.Object): + obj.destroy() for obj in self.work: if isinstance(obj, PETSc.Object): obj.destroy() + self.submats = [] + self.work = [None for _ in range(2)] - def sort_interior_dofs(self, idofs, A): - """Permute `idofs` to have A[idofs, idofs] with square blocks of - increasing dimension along its diagonal.""" - Aii = A.createSubMatrix(idofs, idofs) - indptr, indices, _ = Aii.getValuesCSR() + def sort_interior_dofs(self, i0, A): + """Permute `i0` to have A[i0, i0] with square blocks of + increasing dimension along its diagonal. 
Add slices with the extents + of each set of blocks in the CSR representation of A.""" + A00 = A.createSubMatrix(i0, i0) + indptr, indices, _ = A00.getValuesCSR() degree = numpy.diff(indptr) - - perm = list(numpy.flatnonzero(degree == 1)) - degree[perm] = 0 - - iend = len(perm) - if iend: - self.slices[1] = slice(0, iend) - - for k in sorted(numpy.unique(degree)): + perm = numpy.argsort(degree) + icur = 0 + istart = 0 + self.slices[1] = slice(0, 0) + unique_degree, counts = numpy.unique(degree, return_counts=True) + for k, kdofs in sorted(zip(unique_degree, counts)): if k > 1: - nblocks = 0 - for i in numpy.flatnonzero(degree == k): - if degree[i] == k: - block = indices[slice(*indptr[i:i+2])] - degree[block] = 0 - perm.extend(block) - nblocks += 1 - - istart = iend - iend += k * k * nblocks - self.slices[k] = slice(istart, iend) - - idofs.setIndices(idofs.getIndices()[perm]) - Aii.destroy() + neigh = numpy.empty((kdofs, k), dtype=indices.dtype) + for row in range(kdofs): + i = perm[icur+row] + neigh[row] = indices[slice(*indptr[i:i+2])] + perm[icur:icur+kdofs] = list(dict.fromkeys(neigh.flat)) + + self.slices[k] = slice(istart, istart + k * kdofs) + istart += k * kdofs + icur += kdofs + i0.setIndices(i0.getIndices()[perm]) + A00.destroy() def get_blocks(self, A): if len(self.submats) == 0: - self.sort_interior_dofs(self.ises[0], A) + comm = A.getComm() + i0 = PETSc.IS().createGeneral(self.idofs, comm=comm) + i1 = PETSc.IS().createGeneral(self.fdofs, comm=comm) + self.sort_interior_dofs(i0, A) + self.isrows = [i0, i0, i1, i1] + self.iscols = [i0, i1, i0, i1] + self.ises = (i0, i1) self.submats = A.createSubMatrices(self.isrows, iscols=self.iscols, submats=self.submats or None) return self.submats @@ -777,17 +786,15 @@ def condense(self, A, result=None): A00, A01, A10, A11 = self.get_blocks(A) indptr, indices, R = A00.getValuesCSR() - flops = 0 - for k in sorted(self.slices): + zlice = self.slices[1] + numpy.sqrt(R[zlice], out=R[zlice]) + numpy.reciprocal(R[zlice], out=R[zlice]) + flops = 2 * (zlice.stop - zlice.start) + for k in sorted(degree for degree in self.slices if degree > 1): zlice = self.slices[k] - if k == 1: - numpy.sqrt(R[zlice], out=R[zlice]) - numpy.reciprocal(R[zlice], out=R[zlice]) - flops += 2 * (zlice.stop - zlice.start) - else: - A = R[zlice].reshape((-1, k, k)) - R[zlice] = numpy.linalg.inv(numpy.linalg.cholesky(A)).reshape((-1)) - flops += A.shape[0] * ((k**3)//3 + k**3) + A = R[zlice].reshape((-1, k, k)) + R[zlice] = numpy.linalg.inv(numpy.linalg.cholesky(A)).reshape((-1)) + flops += A.shape[0] * ((k**3)//3 + k**3) PETSc.Log.logFlops(flops) A00.setValuesCSR(indptr, indices, R) @@ -808,18 +815,16 @@ def condense(self, A, result=None): indptr, indices, R = A00.getValuesCSR() Q = numpy.ones(R.shape, dtype=R.dtype) - flops = 0 - for k in sorted(self.slices): + zlice = self.slices[1] + numpy.reciprocal(R[zlice], out=R[zlice]) + flops = zlice.stop - zlice.start + for k in sorted(degree for degree in self.slices if degree > 1): zlice = self.slices[k] - if k == 1: - numpy.reciprocal(R[zlice], out=R[zlice]) - flops += zlice.stop - zlice.start - else: - A = R[zlice].reshape((-1, k, k)) - q, r = numpy.linalg.qr(A, mode="complete") - Q[zlice] = q.reshape((-1,)) - R[zlice] = numpy.linalg.inv(r).reshape((-1,)) - flops += A.shape[0] * ((4*k**3)//3 + k**3) + A = R[zlice].reshape((-1, k, k)) + q, r = numpy.linalg.qr(A, mode="complete") + Q[zlice] = q.reshape((-1,)) + R[zlice] = numpy.linalg.inv(r).reshape((-1,)) + flops += A.shape[0] * ((4*k**3)//3 + k**3) PETSc.Log.logFlops(flops) 
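# [Editorial aside, not part of the patch] The self.slices dict filled by
# sort_interior_dofs maps each block size k to the extent of its values inside
# the CSR array of A00, so the loops above can factorize all k-by-k blocks in
# one vectorized call. Toy illustration with one 1x1 and two 2x2 blocks:
import numpy as np
degree = np.array([1, 2, 2, 2, 2])                  # nonzeros per row of A00
sizes, counts = np.unique(degree, return_counts=True)
slices, start = {}, 0
for k, c in zip(sizes, counts):
    slices[k] = slice(start, start + k * c)         # k*c CSR values per size k
    start += k * c
assert slices == {1: slice(0, 1), 2: slice(1, 9)}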
A00.setValuesCSR(indptr, indices, Q) @@ -843,21 +848,18 @@ def condense(self, A, result=None): V = numpy.ones(U.shape, dtype=U.dtype) self.work[0] = A00.getDiagonal(result=self.work[0]) D = self.work[0] - dslice = self.slices.get(1, slice(0, 0)) - flops = 0 - for k in sorted(self.slices): + dslice = self.slices[1] + numpy.sign(D.array_r[dslice], out=U[dslice]) + flops = dslice.stop - dslice.start + for k in sorted(degree for degree in self.slices if degree > 1): bslice = self.slices[k] - if k == 1: - numpy.sign(D.array_r[bslice], out=U[bslice]) - flops += bslice.stop - bslice.start - else: - A = U[bslice].reshape((-1, k, k)) - u, s, v = numpy.linalg.svd(A, full_matrices=False) - dslice = slice(dslice.stop, dslice.stop + k * A.shape[0]) - D.array_w[dslice] = s.reshape((-1,)) - U[bslice] = numpy.transpose(u, axes=(0, 2, 1)).reshape((-1,)) - V[bslice] = numpy.transpose(v, axes=(0, 2, 1)).reshape((-1,)) - flops += A.shape[0] * ((4*k**3)//3 + 4*k**3) + A = U[bslice].reshape((-1, k, k)) + u, s, v = numpy.linalg.svd(A, full_matrices=False) + dslice = slice(dslice.stop, dslice.stop + k * A.shape[0]) + D.array_w[dslice] = s.reshape((-1,)) + U[bslice] = numpy.transpose(u, axes=(0, 2, 1)).reshape((-1,)) + V[bslice] = numpy.transpose(v, axes=(0, 2, 1)).reshape((-1,)) + flops += A.shape[0] * ((4*k**3)//3 + 4*k**3) PETSc.Log.logFlops(flops) D.sqrtabs() @@ -883,16 +885,14 @@ def condense(self, A, result=None): A00, A01, A10, A11 = self.get_blocks(A) indptr, indices, R = A00.getValuesCSR() - flops = 0 - for k in sorted(self.slices): + zlice = self.slices[1] + numpy.reciprocal(R[zlice], out=R[zlice]) + flops = zlice.stop - zlice.start + for k in sorted(degree for degree in self.slices if degree > 1): zlice = self.slices[k] - if k == 1: - numpy.reciprocal(R[zlice], out=R[zlice]) - flops += zlice.stop - zlice.start - else: - A = R[zlice].reshape((-1, k, k)) - R[zlice] = numpy.linalg.inv(A).reshape((-1,)) - flops += A.shape[0] * (k**3) + A = R[zlice].reshape((-1, k, k)) + R[zlice] = numpy.linalg.inv(A).reshape((-1,)) + flops += A.shape[0] * (k**3) PETSc.Log.logFlops(flops) A00.setValuesCSR(indptr, indices, R) @@ -1001,9 +1001,11 @@ def petsc_sparse(A_numpy, rtol=1E-10, comm=None): sparsity = abs(A_numpy) > atol nnz = numpy.count_nonzero(sparsity, axis=1).astype(PETSc.IntType) A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm) - for row, (Arow, Srow) in enumerate(zip(A_numpy, sparsity)): - cols = numpy.argwhere(Srow).astype(PETSc.IntType).flat - A.setValues(row, cols, Arow[cols], PETSc.InsertMode.INSERT) + rows, cols = numpy.nonzero(sparsity) + rows = rows.astype(PETSc.IntType) + cols = cols.astype(PETSc.IntType) + vals = A_numpy[sparsity] + A.setValuesRCV(rows[:, None], cols[:, None], vals[:, None], PETSc.InsertMode.INSERT) A.assemble() return A From e5dfbc80de87191088f811d4281417e0c172d0e7 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 5 Apr 2023 16:06:35 +0100 Subject: [PATCH 61/75] cleanup --- firedrake/preconditioners/fdm.py | 57 +++++---- firedrake/preconditioners/pmg.py | 199 +++++++++++++------------------ 2 files changed, 111 insertions(+), 145 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 646feb7dbb..f861132fd1 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -258,9 +258,10 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): return lgmap.apply(result, result=result) # Create data structures needed for assembly + bc_rows = {} + bc_vals = {} self.cell_to_global = {} 
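# [Editorial aside, not part of the patch] Standalone, slightly simplified
# version of the vectorized setValuesRCV insertion adopted in petsc_sparse in
# the previous hunk: all nonzeros of a dense array are inserted in one call
# instead of row by row. The helper name and the plain tolerance are
# illustrative assumptions.
import numpy as np
from petsc4py import PETSc

def dense_to_aij(B, tol=1e-12):
    mask = abs(B) > tol
    nnz = np.count_nonzero(mask, axis=1).astype(PETSc.IntType)
    A = PETSc.Mat().createAIJ(B.shape, nnz=(nnz, 0), comm=PETSc.COMM_SELF)
    rows, cols = np.nonzero(mask)
    A.setValuesRCV(rows.astype(PETSc.IntType)[:, None],
                   cols.astype(PETSc.IntType)[:, None],
                   B[mask][:, None], PETSc.InsertMode.INSERT)
    A.assemble()
    return A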
self.lgmaps = {} - bc_rows = {} for Vsub in V: lgmap = Vsub.local_to_global_map([bc for bc in bcs if bc.function_space() == Vsub]) bsize = Vsub.dof_dset.layout_vec.getBlockSize() @@ -269,8 +270,9 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): self.lgmaps[Vsub] = lgmap own = Vsub.dof_dset.layout_vec.getLocalSize() - bdofs = numpy.flatnonzero(lgmap.indices[:own] < 0).astype(PETSc.IntType) + bdofs = numpy.flatnonzero(lgmap.indices[:own] < 0).astype(PETSc.IntType)[:, None] bc_rows[Vsub] = Vsub.dof_dset.lgmap.apply(bdofs, result=bdofs) + bc_vals[Vsub] = numpy.ones(bdofs.shape, dtype=PETSc.RealType) self.nel = nel coefficients, assembly_callables = self.assemble_coefficients(J, fcp) @@ -342,11 +344,11 @@ def assemble_P(): P = Pmats[Vrow, Vcol] if P.getType().endswith("aij"): P.zeroEntries() - if Vrow == Vcol and len(bc_rows[Vrow]) > 0: - rows = bc_rows[Vrow][:, None] - vals = numpy.ones(rows.shape, dtype=PETSc.RealType) - P.setValuesRCV(rows, rows, vals, addv) self.set_values(P, Vrow, Vcol, addv) + for Vrow in Vsort: + rows = bc_rows[Vrow] + if len(rows) > 0: + Pmats[Vrow, Vrow].setValuesRCV(rows, rows, bc_vals[Vrow], addv) Pmat.assemble() return Pmat, assemble_P @@ -1328,8 +1330,8 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): # assemble zero-th order term separately, including off-diagonals (mixed components) # I cannot do this for hdiv elements as off-diagonals are not sparse, this is because # the FDM eigenbases for CG(k) and CG(k-1) are not orthogonal to each other - rindices = None use_diag_Bq = Bq is None or len(Bq.ufl_shape) != 2 or static_condensation + rindices = None if not use_diag_Bq: bshape = Bq.ufl_shape # Be = Bhat kron ... kron Bhat @@ -1351,60 +1353,57 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): Bq = None # assemble the second order term and the zero-th order term if any, - # discarding mixed derivatives and mixed componentsget_weak_bc_flags(J) - mue = numpy.zeros((ncomp, tdim), dtype=PETSc.RealType) - bqe = numpy.zeros((ncomp,), dtype=PETSc.RealType) - + # discarding mixed derivatives and mixed components + ae = numpy.zeros((ncomp, tdim), dtype=PETSc.RealType) + be = numpy.zeros((ncomp,), dtype=PETSc.RealType) + je = None for e in range(self.nel): - je = index_coef(e) + je = index_coef(e, result=je) bce = bcflags.dat.data_ro_with_halos[index_bc(e)] > 1E-8 + # get coefficients on this cell + if Gq is not None: + numpy.sum(Gq.dat.data_ro[je], axis=0, out=ae) + if Bq is not None: + numpy.sum(Bq.dat.data_ro[je], axis=0, out=be) rindices = get_rindices(e, result=rindices) rows = numpy.reshape(rindices, (-1, bsize)) rows = numpy.transpose(rows) rows = numpy.reshape(rows, (ncomp, -1)) - - # get second order coefficient on this cell - if Gq is not None: - numpy.sum(Gq.dat.data_ro[je], axis=0, out=mue) - # get zero-th order coefficient on this cell - if Bq is not None: - numpy.sum(Bq.dat.data_ro[je], axis=0, out=bqe) - + # for each component: compute the stiffness matrix Ae for k in range(ncomp): # permutation of axes with respect to the first vector component axes = numpy.roll(numpy.arange(tdim), -shift[k]) - # for each component: compute the stiffness matrix Ae bck = bce[:, k] if len(bce.shape) == 2 else bce fbc = numpy.dot(bck, flag2id) if Gq is not None: - # Ae = mue[k][0] Ahat + bqe[k] Bhat + # Ae = ae[k][0] Ahat + be[k] Bhat Be = Afdm[axes[0]][0].copy() Ae = Afdm[axes[0]][1+fbc[0]].copy() - Ae.scale(mue[k][0]) + Ae.scale(ae[k][0]) if Bq is not None: - Ae.axpy(bqe[k], Be) + Ae.axpy(be[k], Be) if tdim > 1: - # Ae = Ae kron Bhat + 
mue[k][1] Bhat kron Ahat + # Ae = Ae kron Bhat + ae[k][1] Bhat kron Ahat Ae = Ae.kron(Afdm[axes[1]][0]) if Gq is not None: - Ae.axpy(mue[k][1], Be.kron(Afdm[axes[1]][1+fbc[1]])) + Ae.axpy(ae[k][1], Be.kron(Afdm[axes[1]][1+fbc[1]])) if tdim > 2: - # Ae = Ae kron Bhat + mue[k][2] Bhat kron Bhat kron Ahat + # Ae = Ae kron Bhat + ae[k][2] Bhat kron Bhat kron Ahat Be = Be.kron(Afdm[axes[1]][0]) Ae = Ae.kron(Afdm[axes[2]][0]) if Gq is not None: - Ae.axpy(mue[k][2], Be.kron(Afdm[axes[2]][1+fbc[2]])) + Ae.axpy(ae[k][2], Be.kron(Afdm[axes[2]][1+fbc[2]])) Be.destroy() elif Bq is not None: Ae = Afdm[axes[0]][0] for m in range(1, tdim): Ae = Ae.kron(Afdm[axes[m]][0]) - Ae.scale(bqe[k]) + Ae.scale(be[k]) Ae = condense_element_mat(Ae) update_A(A, Ae, rows[k].astype(PETSc.IntType)) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 0d3544f26c..8719c76ece 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -1,16 +1,16 @@ from functools import partial, lru_cache from itertools import chain -from firedrake.petsc import PETSc -from firedrake.preconditioners.base import PCBase, SNESBase, PCSNESBase from firedrake.dmhooks import (attach_hooks, get_appctx, push_appctx, pop_appctx, add_hook, get_parent, push_parent, pop_parent, get_function_space, set_function_space) -from firedrake.solving_utils import _SNESContext +from firedrake.petsc import PETSc +from firedrake.preconditioners.base import PCBase, SNESBase, PCSNESBase from firedrake.nullspace import VectorSpaceBasis, MixedVectorSpaceBasis +from firedrake.solving_utils import _SNESContext from firedrake.tsfc_interface import extract_numbered_coefficients from firedrake.utils import ScalarType_c, IntType_c, cached_property -from tsfc import compile_expression_dual_evaluation from tsfc.finatinterface import create_element +from tsfc import compile_expression_dual_evaluation from pyop2 import op2 import firedrake @@ -36,7 +36,8 @@ class PMGBase(PCSNESBase): Other PETSc options inspected by this class are: - 'pmg_mg_coarse_degree': polynomial degree of the coarse level - - 'pmg_mg_coarse_mat_type': can be either 'aij' or 'matfree' + - 'pmg_mg_coarse_mat_type': can be either a `PETSc.Mat.Type`, or 'matfree' + - 'pmg_mg_coarse_pmat_type': can be either a `PETSc.Mat.Type`, or 'matfree' - 'pmg_mg_coarse_form_compiler_mode': can be 'spectral' (default), 'vanilla', 'coffee', or 'tensor' - 'pmg_mg_levels_transfer_mat_type': can be either 'aij' or 'matfree' @@ -90,6 +91,8 @@ def initialize(self, obj): raise ValueError("No context found.") if not isinstance(ctx, _SNESContext): raise ValueError("Don't know how to get form from %r" % ctx) + fcp = ctx._problem.form_compiler_parameters + mode = fcp.get("mode", "spectral") if fcp is not None else "spectral" test, trial = ctx.J.arguments() if test.function_space() != trial.function_space(): @@ -103,11 +106,8 @@ def initialize(self, obj): ppc = self.configure_pmg(obj, pdm) self.is_snes = isinstance(obj, PETSc.SNES) - copts = PETSc.Options(ppc.getOptionsPrefix() + ppc.getType() + "_coarse_") - # Get the coarse degree from PETSc options - fcp = ctx._problem.form_compiler_parameters - mode = fcp.get("mode", "spectral") if fcp is not None else "spectral" + copts = PETSc.Options(ppc.getOptionsPrefix() + ppc.getType() + "_coarse_") self.coarse_degree = copts.getInt("degree", default=1) self.coarse_mat_type = copts.getString("mat_type", default=ctx.mat_type) self.coarse_pmat_type = copts.getString("pmat_type", default=self.coarse_mat_type) @@ -234,11 +234,7 @@ def 
_coarsen_form(a): except ValueError: mat_type = self.coarse_mat_type pmat_type = self.coarse_pmat_type - if fcp is None: - fcp = dict() - elif fcp is fproblem.form_compiler_parameters: - fcp = dict(fcp) - fcp["mode"] = self.coarse_form_compiler_mode + fcp = dict(fcp or {}, mode=self.coarse_form_compiler_mode) # Coarsen the problem and the _SNESContext cproblem = firedrake.NonlinearVariationalProblem(cF, cu, bcs=cbcs, J=cJ, Jp=cJp, @@ -276,63 +272,17 @@ def inject_state(): add_hook(parent, setup=inject_state, call_setup=True) - # Coarsen the nullspace basis - def coarsen_nullspace(coarse_V, interpolate, fine_nullspace): - if isinstance(fine_nullspace, MixedVectorSpaceBasis): - if interpolate.getType() == "python": - interpolate = interpolate.getPythonContext() - submats = [interpolate.getNestSubMatrix(i, i) for i in range(len(coarse_V))] - coarse_bases = [] - for fs, submat, basis in zip(coarse_V, submats, fine_nullspace._bases): - if isinstance(basis, VectorSpaceBasis): - coarse_bases.append(coarsen_nullspace(fs, submat, basis)) - else: - coarse_bases.append(coarse_V.sub(basis.index)) - return MixedVectorSpaceBasis(coarse_V, coarse_bases) - elif isinstance(fine_nullspace, VectorSpaceBasis): - coarse_vecs = [] - for xf in fine_nullspace._petsc_vecs: - wc = firedrake.Function(coarse_V) - with wc.dat.vec_wo as xc: - interpolate.multTranspose(xf, xc) - coarse_vecs.append(wc) - vsb = VectorSpaceBasis(coarse_vecs, constant=fine_nullspace._constant) - vsb.orthonormalize() - return vsb - else: - return fine_nullspace - interpolate = None if fctx._nullspace or fctx._nullspace_T or fctx._near_nullspace: interpolate, _ = cdm.createInterpolation(fdm) - cctx._nullspace = coarsen_nullspace(cV, interpolate, fctx._nullspace) - if fctx._nullspace_T is fctx._nullspace: - cctx._nullspace_T = cctx._nullspace - else: - cctx._nullspace_T = coarsen_nullspace(cV, interpolate, fctx._nullspace_T) - if fctx._near_nullspace is fctx._nullspace: - cctx._near_nullspace = cctx._nullspace - elif fctx._near_nullspace is fctx._nullspace_T: - cctx._near_nullspace = cctx._nullspace_T - else: - cctx._near_nullspace = coarsen_nullspace(cV, interpolate, fctx._near_nullspace) - + cctx._nullspace = self.coarsen_nullspace(cV, interpolate, fctx._nullspace) + cctx._nullspace_T = self.coarsen_nullspace(cV, interpolate, fctx._nullspace_T) + cctx._near_nullspace = self.coarsen_nullspace(cV, interpolate, fctx._near_nullspace) cctx.set_nullspace(cctx._nullspace, cV._ises, transpose=False, near=False) cctx.set_nullspace(cctx._nullspace_T, cV._ises, transpose=True, near=False) cctx.set_nullspace(cctx._near_nullspace, cV._ises, transpose=False, near=True) return cdm - def coarsen_quadrature(self, metadata, fdeg, cdeg): - if isinstance(metadata, dict): - # Coarsen the quadrature degree in a dictionary - # preserving the ratio of quadrature nodes to interpolation nodes (qdeg+1)//(fdeg+1) - qdeg = metadata.get("quadrature_degree", None) - if qdeg is not None: - cmd = dict(metadata) - cmd["quadrature_degree"] = max(2*cdeg+1, ((qdeg+1)*(cdeg+1)+fdeg)//(fdeg+1)-1) - return cmd - return metadata - def coarsen_bcs(self, fbcs, cV): cbcs = [] for bc in fbcs: @@ -346,13 +296,55 @@ def coarsen_bcs(self, fbcs, cV): raise NotImplementedError("Unsupported BC type, please get in touch if you need this") return cbcs + def coarsen_quadrature(self, metadata, fdeg, cdeg): + """Coarsen the quadrature degree in a dictionary preserving the ratio of + quadrature nodes to interpolation nodes (qdeg+1)//(fdeg+1).""" + try: + qdeg = metadata["quadrature_degree"] + 
coarse_qdeg = max(2*cdeg+1, ((qdeg+1)*(cdeg+1)+fdeg)//(fdeg+1)-1) + return dict(metadata, quadrature_degree=coarse_qdeg) + except (KeyError, TypeError): + return metadata + + def coarsen_nullspace(self, coarse_V, interpolate, fine_nullspace): + """Coarsen a nullspace or retrieve it from class cache""" + cache = self._cache.setdefault("nullspace", {}) + key = (coarse_V.ufl_element(), fine_nullspace) + try: + return cache[key] + except KeyError: + if isinstance(fine_nullspace, MixedVectorSpaceBasis): + if interpolate.getType() == "python": + interpolate = interpolate.getPythonContext() + submats = [interpolate.getNestSubMatrix(i, i) for i in range(len(coarse_V))] + coarse_bases = [] + for fs, submat, basis in zip(coarse_V, submats, fine_nullspace._bases): + if isinstance(basis, VectorSpaceBasis): + coarse_bases.append(self.coarsen_nullspace(fs, submat, basis)) + else: + coarse_bases.append(coarse_V.sub(basis.index)) + coarse_nullspace = MixedVectorSpaceBasis(coarse_V, coarse_bases) + elif isinstance(fine_nullspace, VectorSpaceBasis): + coarse_vecs = [] + for xf in fine_nullspace._petsc_vecs: + wc = firedrake.Function(coarse_V) + with wc.dat.vec_wo as xc: + # the nullspace basis is in the dual of V + interpolate.multTranspose(xf, xc) + coarse_vecs.append(wc) + coarse_nullspace = VectorSpaceBasis(coarse_vecs, constant=fine_nullspace._constant) + coarse_nullspace.orthonormalize() + else: + return fine_nullspace + return cache.setdefault(key, coarse_nullspace) + def create_transfer(self, mat_type, cctx, fctx, cbcs, fbcs): - # Create a transfer or retrieve it from the class cache + """Create a transfer or retrieve it from class cache""" cV = cctx.J.arguments()[0].function_space() fV = fctx.J.arguments()[0].function_space() cbcs = tuple(cctx._problem.bcs) if cbcs else tuple() fbcs = tuple(fctx._problem.bcs) if fbcs else tuple() - key = (mat_type, cV, fV, cbcs, fbcs) + key = (mat_type, fV.mesh(), cV.ufl_element(), fV.ufl_element(), cbcs, fbcs) cache = self._cache.setdefault("transfer", {}) try: return cache[key] @@ -379,9 +371,7 @@ def create_injection(self, dmc, dmf): @staticmethod def max_degree(ele): - """ - Return the maximum degree of a :class:`ufl.FiniteElement` - """ + """Return the maximum degree of a :class:`ufl.FiniteElement`""" if isinstance(ele, (ufl.VectorElement, ufl.TensorElement)): return PMGBase.max_degree(ele._sub_element) elif isinstance(ele, (ufl.MixedElement, ufl.TensorProductElement)): @@ -537,7 +527,8 @@ def prolongation_transfer_kernel_action(Vf, expr): def expand_element(ele): - # Expand a FiniteElement as an EnrichedElement of TensorProductElements, discarding modifiers. + """Expand a FiniteElement as an EnrichedElement of TensorProductElements, + discarding modifiers.""" if isinstance(ele, finat.FlattenedDimensions): return expand_element(ele.product) elif isinstance(ele, (finat.HDivElement, finat.HCurlElement)): @@ -554,7 +545,7 @@ def expand_element(ele): new_terms = [] for f in e.elements if isinstance(e, finat.EnrichedElement) else [e]: f_factors = tuple(f.factors) if isinstance(f, finat.TensorProductElement) else (f,) - new_terms.extend([t_factors + f_factors for t_factors in terms]) + new_terms.extend(t_factors + f_factors for t_factors in terms) terms = new_terms terms = list(map(finat.TensorProductElement, terms)) return finat.EnrichedElement(terms) @@ -579,6 +570,8 @@ def evaluate_dual(source, target, alpha=None): def compare_element(e1, e2): + """Numerically compare two :class:`FIAT.elements`. 
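# [Editorial aside, not part of the patch] Worked example of the quadrature
# coarsening rule in coarsen_quadrature above: for fdeg=7, cdeg=1, qdeg=15,
#   coarse_qdeg = max(2*1+1, ((15+1)*(1+1) + 7)//(7+1) - 1) = max(3, 3) = 3,
# so the ratio of quadrature nodes to interpolation nodes, (q+1)//(p+1) = 2,
# is preserved on the coarse level.
fdeg, cdeg, qdeg = 7, 1, 15
assert max(2*cdeg+1, ((qdeg+1)*(cdeg+1)+fdeg)//(fdeg+1)-1) == 3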
+ Equality is satisfied if e2.dual_basis(e1.primal_basis) == identity.""" if e1 is e2: return True if e1.space_dimension() != e2.space_dimension(): @@ -588,37 +581,10 @@ def compare_element(e1, e2): return numpy.allclose(B, 0.0, rtol=1E-14, atol=1E-14) -def compare_dual(b1, b2): - p1 = b1.get_point_dict() - p2 = b2.get_point_dict() - if len(p1) != len(p2): - return False - - k1 = numpy.array(list(p1.keys())) - k2 = numpy.array(list(p2.keys())) - if not numpy.allclose(k1, k2, rtol=1E-16, atol=1E-16): - return False - - k1 = numpy.array([p1[k][0][0] for k in p1]) - k2 = numpy.array([p2[k][0][0] for k in p2]) - return numpy.allclose(k1, k2, rtol=1E-16, atol=1E-16) - - -def compare_dual_basis(l1, l2): - if len(l1) != len(l2): - return False - return all(compare_dual(b1, b2) for b1, b2 in zip(l1, l2)) - - @lru_cache(maxsize=10) def fiat_reference_prolongator(celem, felem, derivative=False): - ckey = (felem.formdegree,) if derivative else None - fkey = (celem.formdegree,) if derivative else None - fdual = felem.dual_basis() - cdual = celem.dual_basis() - if fkey == ckey and (celem is felem or compare_dual_basis(cdual, fdual)): - return numpy.array([]) - return evaluate_dual(celem, felem, alpha=ckey) + alpha = (1,) if derivative else None + return evaluate_dual(celem, felem, alpha=alpha) @lru_cache(maxsize=10) @@ -704,6 +670,17 @@ def get_permutation_to_line_elements(finat_element): return dof_perm, unique_line_elements, shifts +def get_permuted_map(V): + """ + Return a PermutedMap with the same tensor product shape for + every component of H(div) or H(curl) tensor product elements + """ + indices, _, _ = get_permutation_to_line_elements(V.finat_element) + if numpy.all(indices[:-1] < indices[1:]): + return V.cell_node_map() + return op2.PermutedMap(V.cell_node_map(), indices) + + # Common kernel to compute y = kron(A3, kron(A2, A1)) * x # Vector and tensor field generalization from Deville, Fischer, and Mund section 8.3.1. kronmxv_code = """ @@ -929,6 +906,7 @@ def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch): fshapes = [] cshapes = [] has_code = False + identity_filter = lambda A: numpy.array([]) if A.shape[0] == A.shape[1] and numpy.allclose(A, numpy.eye(A.shape[0])) else A for celem, felem, shift in zip(celems, felems, shifts): if len(felem) != len(celem): raise ValueError("Fine and coarse elements do not have the same number of factors") @@ -942,7 +920,7 @@ def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch): fshapes.append((nscal,) + tuple(fshape)) cshapes.append((nscal,) + tuple(cshape)) - J = [fiat_reference_prolongator(ce, fe).T for ce, fe in zip(celem, felem)] + J = [identity_filter(fiat_reference_prolongator(ce, fe)).T for ce, fe in zip(celem, felem)] if any(Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J): prolong_code.append(f""" for({IntType_c} i=0; i<{nscal*numpy.prod(fshape)}; i++) {t_out}[i+{fskip}] = 0.0E0; @@ -1139,17 +1117,6 @@ def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name): return decl, prolong, restrict -def get_permuted_map(V): - """ - Return a PermutedMap with the same tensor product shape for - every component of H(div) or H(curl) tensor product elements - """ - indices, _, _ = get_permutation_to_line_elements(V.finat_element) - if numpy.all(indices[:-1] < indices[1:]): - return V.cell_node_map() - return op2.PermutedMap(V.cell_node_map(), indices) - - class StandaloneInterpolationMatrix(object): """ Interpolation matrix for a single standalone space. 
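# [Editorial aside, not part of the patch] Sketch of the identity_filter added
# to make_kron_code above: a tabulated reference prolongator that is
# numerically the identity is replaced by an empty array, so the generated
# kernel can skip that contraction. Standalone form with an extra shape guard:
import numpy as np

def identity_filter(A):
    square = A.ndim == 2 and A.shape[0] == A.shape[1]
    return np.array([]) if square and np.allclose(A, np.eye(A.shape[0])) else A

assert identity_filter(np.eye(3)).size == 0              # identity -> skipped
assert identity_filter(np.ones((2, 3))).shape == (2, 3)  # kept as-is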
@@ -1168,11 +1135,11 @@ def __init__(self, Vc, Vf, Vc_bcs, Vf_bcs): def work_function(self, V): if isinstance(V, firedrake.Function): return V - else: - try: - return self._cache_work[V] - except KeyError: - return self._cache_work.setdefault(V, firedrake.Function(V)) + key = (V.ufl_element(), V.mesh()) + try: + return self._cache_work[key] + except KeyError: + return self._cache_work.setdefault(key, firedrake.Function(V)) @cached_property def _weight(self): From 6b701743d928df6e74764745127b7f702f39d367 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 5 Apr 2023 21:21:47 +0100 Subject: [PATCH 62/75] list of assembly callables for matrix blocks --- firedrake/preconditioners/fdm.py | 78 ++++++++++++++------------------ 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index f861132fd1..f0655ec48f 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -173,7 +173,7 @@ def initialize(self, pc): self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType) # Assemble the FDM preconditioner with sparse local matrices - Pmat, self._assemble_P = self.allocate_matrix(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type, use_static_condensation) + Pmat, self.assembly_callables = self.allocate_matrix(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type, use_static_condensation) Pmat.setNullSpace(Amat.getNullSpace()) Pmat.setTransposeNullSpace(Amat.getTransposeNullSpace()) Pmat.setNearNullSpace(Amat.getNearNullSpace()) @@ -211,7 +211,7 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): :arg pmat_type: the preconditioner `PETSc.Mat.Type` :arg use_static_condensation: are we assembling the statically-condensed Schur complement on facets? - :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable + :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and a list of assembly callables """ ifacet = [i for i, Vsub in enumerate(V) if is_restricted(Vsub.finat_element)[1]] if len(ifacet) == 0: @@ -258,8 +258,6 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): return lgmap.apply(result, result=result) # Create data structures needed for assembly - bc_rows = {} - bc_vals = {} self.cell_to_global = {} self.lgmaps = {} for Vsub in V: @@ -268,11 +266,6 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): cell_to_local, nel = extrude_node_map(Vsub.cell_node_map(), bsize=bsize) self.cell_to_global[Vsub] = partial(cell_to_global, lgmap, cell_to_local) self.lgmaps[Vsub] = lgmap - - own = Vsub.dof_dset.layout_vec.getLocalSize() - bdofs = numpy.flatnonzero(lgmap.indices[:own] < 0).astype(PETSc.IntType)[:, None] - bc_rows[Vsub] = Vsub.dof_dset.lgmap.apply(bdofs, result=bdofs) - bc_vals[Vsub] = numpy.ones(bdofs.shape, dtype=PETSc.RealType) self.nel = nel coefficients, assembly_callables = self.assemble_coefficients(J, fcp) @@ -329,6 +322,16 @@ def get_coeffs(e, result=None): if ptype.endswith("sbaij"): P.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True) P.setUp() + # append callables to zero entries, insert element matrices, and apply BCs + assembly_callables.append(P.zeroEntries) + assembly_callables.append(partial(self.set_values, P, Vrow, Vcol, addv)) + if on_diag: + own = Vrow.dof_dset.layout_vec.getLocalSize() + bdofs = numpy.flatnonzero(self.lgmaps[Vrow].indices[:own] < 0).astype(PETSc.IntType)[:, None] + Vrow.dof_dset.lgmap.apply(bdofs, result=bdofs) + if len(bdofs) > 0: + vals = numpy.ones(bdofs.shape, dtype=PETSc.RealType) 
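# [Editorial aside, not part of the patch] This commit records the per-block
# assembly work as a list of zero-argument callables that _assemble_P replays
# on every update. Self-contained toy of the same deferred-assembly pattern,
# with a list standing in for the PETSc matrix operations:
from functools import partial

log = []
assembly_callables = [partial(log.append, "zeroEntries"),
                      partial(log.append, "set_values"),
                      partial(log.append, "assemble")]

def assemble_all():
    for thunk in assembly_callables:
        thunk()

assemble_all()
assert log == ["zeroEntries", "set_values", "assemble"]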
+ assembly_callables.append(partial(P.setValuesRCV, bdofs, bdofs, vals, addv)) Pmats[Vrow, Vcol] = P if len(V) == 1: @@ -336,22 +339,13 @@ def get_coeffs(e, result=None): else: Pmat = PETSc.Mat().createNest([[Pmats[Vrow, Vcol] for Vcol in V] for Vrow in V], comm=self.comm) - @PETSc.Log.EventDecorator("FDMAssemble") - def assemble_P(): - for _assemble in assembly_callables: - _assemble() - for Vrow, Vcol in product(Vsort, Vsort): - P = Pmats[Vrow, Vcol] - if P.getType().endswith("aij"): - P.zeroEntries() - self.set_values(P, Vrow, Vcol, addv) - for Vrow in Vsort: - rows = bc_rows[Vrow] - if len(rows) > 0: - Pmats[Vrow, Vrow].setValuesRCV(rows, rows, bc_vals[Vrow], addv) - Pmat.assemble() - - return Pmat, assemble_P + assembly_callables.append(Pmat.assemble) + return Pmat, assembly_callables + + @PETSc.Log.EventDecorator("FDMAssemble") + def _assemble_P(self): + for _assemble in self.assembly_callables: + _assemble() @PETSc.Log.EventDecorator("FDMUpdate") def update(self, pc): @@ -551,12 +545,11 @@ def assemble_coefficients(self, J, fcp, block_diagonal=True): # Construct Z = broken(V^k) * broken(V^{k+1}) V = args_J[0].function_space() - formdegree = V.finat_element.formdegree - degree = e.degree() - try: - degree = max(degree) - except TypeError: - pass + fe = V.finat_element + formdegree = fe.formdegree + degree = fe.degree + if type(degree) != int: + degree, = set(degree) qdeg = degree if formdegree == tdim: qfam = "DG" if tdim == 1 else "DQ" @@ -615,18 +608,17 @@ def assemble_reference_tensor(self, V, transpose=False): :returns: a :class:`PETSc.Mat` interpolating V^k * d(V^k) onto broken(V^k) * broken(V^{k+1}) on the reference element. """ - tdim = V.mesh().topological_dimension() value_size = V.value_size - formdegree = V.finat_element.formdegree - degree = V.finat_element.degree - try: - degree = max(degree) - except TypeError: - pass + fe = V.finat_element + tdim = fe.cell.get_spatial_dimension() + formdegree = fe.formdegree + degree = fe.degree + if type(degree) != int: + degree, = set(degree) if formdegree == tdim: degree = degree + 1 - is_interior, is_facet = is_restricted(V.finat_element) - key = (degree, tdim, formdegree, value_size, is_interior, is_facet, transpose) + is_interior, is_facet = is_restricted(fe) + key = (value_size, tdim, degree, formdegree, is_interior, is_facet, transpose) cache = self._cache.setdefault("reference_tensor", {}) try: return cache[key] @@ -636,7 +628,7 @@ def assemble_reference_tensor(self, V, transpose=False): result = PETSc.Mat().createTranspose(result).convert(result.getType()) return cache.setdefault(key, result) - full_key = (degree, tdim, formdegree, value_size, False, False, False) + full_key = key[:-3] + (False,) * 3 if is_facet and full_key in cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) @@ -644,7 +636,7 @@ def assemble_reference_tensor(self, V, transpose=False): noperm.destroy() return cache.setdefault(key, result) - elements = sorted(get_base_elements(V.finat_element), key=lambda e: e.formdegree) + elements = sorted(get_base_elements(fe), key=lambda e: e.formdegree) ref_el = elements[0].get_reference_element() eq = FIAT.FDMQuadrature(ref_el, degree) e0 = elements[0] if elements[0].formdegree == 0 else FIAT.FDMLagrange(ref_el, degree) @@ -1217,7 +1209,7 @@ def get_base_elements(e): return sum(list(map(get_base_elements, e.elements)), []) elif isinstance(e, finat.TensorProductElement): return sum(list(map(get_base_elements, e.factors)), []) - 
elif isinstance(e, finat.cube.FlattenedDimensions): + elif isinstance(e, finat.FlattenedDimensions): return get_base_elements(e.product) elif isinstance(e, (finat.HCurlElement, finat.HDivElement)): return get_base_elements(e.wrappee) From 63b82ec40223c0193eeea4b4a61ddf0cba271fb2 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Sat, 8 Apr 2023 11:52:09 +0100 Subject: [PATCH 63/75] use ElementKernel and SparseAssembler classes --- firedrake/preconditioners/fdm.py | 715 ++++++++++++++++--------------- 1 file changed, 379 insertions(+), 336 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index f0655ec48f..56eb9defad 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -21,11 +21,12 @@ from pyop2.utils import get_petsc_dir import firedrake.dmhooks as dmhooks -import ctypes -import numpy import ufl import FIAT import finat +import numpy +import ctypes +import operator Citations().add("Brubeck2022a", """ @article{Brubeck2022a, @@ -78,22 +79,6 @@ class FDMPC(PCBase): _citation = "Brubeck2022b" _cache = {} - @staticmethod - def setSubMatCSR(comm, triu=False): - """ - Compile C code to insert sparse submatrices and store in class cache - - :arg triu: are we inserting onto the upper triangular part of the matrix? - - :returns: a python wrapper for the matrix insertion function - """ - cache = FDMPC._cache.setdefault("setSubMatCSR", {}) - key = triu - try: - return cache[key] - except KeyError: - return cache.setdefault(key, load_setSubMatCSR(comm, triu)) - @PETSc.Log.EventDecorator("FDMInit") def initialize(self, pc): Citations().register(self._citation) @@ -236,8 +221,9 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): dofs = numpy.arange(value_size * Vbig.finat_element.space_dimension(), dtype=fdofs.dtype) idofs = numpy.setdiff1d(dofs, fdofs, assume_unique=True) - # Dictionary with the parent space and a method to form the Schur complement - self.get_static_condensation = {} + # Dictionaries with the parent space and kernel to compute the Schur complement + self.parent_space = {} + self.schur_kernel = {} if Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block if Vfacet.finat_element.formdegree == 0 and value_size == 1: @@ -246,45 +232,18 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): sc_builder = SchurComplementBlockCholesky else: sc_builder = SchurComplementBlockQR - self.get_static_condensation[Vfacet] = Vbig, sc_builder(idofs, fdofs).condense + self.schur_kernel[Vfacet] = partial(sc_builder, idofs, fdofs) + self.parent_space[Vfacet] = Vbig + elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern - self.get_static_condensation[V] = Vbig, SchurComplementPattern(idofs, dofs).condense - - @PETSc.Log.EventDecorator("FDMGetIndices") - def cell_to_global(lgmap, cell_to_local, cell_index, result=None): - # Be careful not to create new arrays - result = cell_to_local(cell_index, result=result) - return lgmap.apply(result, result=result) + self.schur_kernel[V] = partial(SchurComplementKernel, idofs, dofs) + self.parent_space[V] = V # Create data structures needed for assembly - self.cell_to_global = {} - self.lgmaps = {} - for Vsub in V: - lgmap = Vsub.local_to_global_map([bc for bc in bcs if bc.function_space() == Vsub]) - bsize = Vsub.dof_dset.layout_vec.getBlockSize() - cell_to_local, nel = 
extrude_node_map(Vsub.cell_node_map(), bsize=bsize) - self.cell_to_global[Vsub] = partial(cell_to_global, lgmap, cell_to_local) - self.lgmaps[Vsub] = lgmap - self.nel = nel - - coefficients, assembly_callables = self.assemble_coefficients(J, fcp) - coeffs = [coefficients.get(name) for name in ("beta", "alpha")] - cdata = [c.dat.data_ro for c in coeffs] - cmaps = [extrude_node_map(c.cell_node_map())[0] for c in coeffs] - cindices = [cmap(0) if self.nel else None for cmap in cmaps] - - @PETSc.Log.EventDecorator("FDMGetCoeffs") - def get_coeffs(e, result=None): - # Get vector for betas and alphas on a cell - if result is None: - return numpy.concatenate([c[cmap(e, result=idx)] for c, cmap, idx in zip(cdata, cmaps, cindices)], out=result) - numpy.take(cdata[0], cmaps[0](e, result=cindices[0]), axis=0, out=result[:cindices[0].size]) - numpy.take(cdata[1], cmaps[1](e, result=cindices[1]), axis=0, out=result[cindices[0].size:]) - return result - - self.get_coeffs = get_coeffs - self.work_mats = {} + self.lgmaps = {Vsub: Vsub.local_to_global_map([bc for bc in bcs if bc.function_space() == Vsub]) for Vsub in V} + self.coefficients, assembly_callables = self.assemble_coefficients(J, fcp) + self.assemblers = {} Pmats = {} addv = PETSc.InsertMode.ADD_VALUES @@ -308,6 +267,7 @@ def get_coeffs(e, result=None): preallocator.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, False) preallocator.setUp() self.set_values(preallocator, Vrow, Vcol, addv, triu=triu) + preallocator.assemble() d_nnz, o_nnz = get_preallocation(preallocator, sizes[0][0]) preallocator.destroy() @@ -385,114 +345,6 @@ def destroy(self, pc): if hasattr(self, "pc"): self.pc.getOperators()[-1].destroy() self.pc.destroy() - if hasattr(self, "work_mats"): - for mat in self.work_mats.values(): - mat.destroy() - - @cached_property - def _element_mass_matrix(self): - data = self.get_coeffs(0) - data.fill(1.0E0) - shape = data.shape + (1,)*(3-len(data.shape)) - nrows = shape[0] * shape[1] - ai = numpy.arange(nrows+1, dtype=PETSc.IntType) - aj = numpy.tile(ai[:-1].reshape((-1, shape[1])), (1, shape[2])) - if shape[2] > 1: - ai *= shape[2] - data = numpy.tile(numpy.eye(shape[2], dtype=data.dtype), shape[:1] + (1,)*(len(shape)-1)) - Me = PETSc.Mat().createAIJ((nrows, nrows), bsize=shape[2], csr=(ai, aj, data), comm=PETSc.COMM_SELF) - return self.work_mats.setdefault("mass_matrix", Me) - - @cached_property - def _element_mass_diagonal(self): - return self.work_mats.setdefault("mass_diagonal", self._element_mass_matrix.getDiagonal()) - - @PETSc.Log.EventDecorator("FDMSetValues") - def set_values(self, A, Vrow, Vcol, addv, triu=False): - """ - Assemble the stiffness matrix in the FDM basis using sparse reference - tensors and diagonal mass matrices. - - :arg A: the :class:`PETSc.Mat` to assemble - :arg Vrow: the :class:`.FunctionSpace` test space - :arg Vcol: the :class:`.FunctionSpace` trial space - :arg addv: a `PETSc.Mat.InsertMode` - :arg triu: are we assembling only the upper triangular part? 
- """ - if self.nel == 0: - # This MPI rank does not own any elements, nothing to be done - return - - def get_key(*args): - return tuple(map(lambda V: V.ufl_element() if V else None, args)) - - Vbig = None - condense_element_mat = lambda Ae, result=None: Ae - if Vrow == Vcol: - Vbig, condense_element_mat = self.get_static_condensation.get(Vrow, (Vbig, condense_element_mat)) - - Me = self._element_mass_matrix - # Interpolation of basis and exterior derivative onto broken spaces - ctensor = self.assemble_reference_tensor(Vbig or Vcol) - rtensor = self.assemble_reference_tensor(Vbig or Vrow, transpose=True) - # Element matrix obtained via Equation (3.9) of Brubeck2022b - assemble_element_mat = partial(rtensor.matMatMult, Me, ctensor) - # Preallocate the element matrix - key = get_key(Vbig or Vrow, Vbig or Vcol, None) - try: - Ae = self.work_mats[key] - except KeyError: - Ae = self.work_mats.setdefault(key, assemble_element_mat()) - # Preallocate the element Schur complement - key = get_key(Vrow, Vcol, Vbig) - try: - Se = self.work_mats[key] - except KeyError: - Se = self.work_mats.setdefault(key, condense_element_mat(Ae)) - - get_rindices = self.cell_to_global[Vrow] - rindices = numpy.empty(Se.getSize()[:1], dtype=PETSc.IntType) - if Vrow == Vcol: - get_cindices = lambda e, result=None: result - cindices = rindices - else: - get_cindices = self.cell_to_global[Vcol] - cindices = numpy.empty(Se.getSize()[1:], dtype=PETSc.IntType) - - setSubMatCSR = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) - insert = PETSc.InsertMode.INSERT - if A.getType() == PETSc.Mat.Type.PREALLOCATOR: - # Empty kernel for preallocation - element_kernel = lambda e, result=None: result - condense_element_mat = lambda Ae, result=None: result - - elif Me.getBlockSize() == 1: - # Kernel with diagonal mass matrix - diagonal = self._element_mass_diagonal - data = diagonal.array_w.reshape((-1,) + Vrow.shape) - - def element_kernel(e, result=None): - self.get_coeffs(e, result=data) - Me.setDiagonal(diagonal, addv=insert) - return assemble_element_mat(result=result) - else: - # Kernel with block diagonal mass matrix - ai, aj, data = Me.getValuesCSR() - data = data.reshape((-1,) + Vrow.shape * 2) - - def element_kernel(e, result=None): - self.get_coeffs(e, result=data) - Me.setValuesCSR(ai, aj, data, addv=insert) - Me.assemble() - return assemble_element_mat(result=result) - - # Core assembly loop - for e in range(self.nel): - get_rindices(e, result=rindices) - get_cindices(e, result=cindices) - Ae = element_kernel(e, result=Ae) - Se = condense_element_mat(Ae, result=Se) - setSubMatCSR(A, Se, rindices, cindices, addv) @PETSc.Log.EventDecorator("FDMCoefficients") def assemble_coefficients(self, J, fcp, block_diagonal=True): @@ -623,88 +475,306 @@ def assemble_reference_tensor(self, V, transpose=False): try: return cache[key] except KeyError: - if transpose: - result = self.assemble_reference_tensor(V, transpose=False) - result = PETSc.Mat().createTranspose(result).convert(result.getType()) - return cache.setdefault(key, result) - - full_key = key[:-3] + (False,) * 3 - if is_facet and full_key in cache: - result = cache[full_key] - noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) - result = result.createSubMatrix(noperm, self.fises) - noperm.destroy() - return cache.setdefault(key, result) - - elements = sorted(get_base_elements(fe), key=lambda e: e.formdegree) - ref_el = elements[0].get_reference_element() - eq = FIAT.FDMQuadrature(ref_el, degree) - e0 = elements[0] 
if elements[0].formdegree == 0 else FIAT.FDMLagrange(ref_el, degree) - e1 = elements[-1] if elements[-1].formdegree == 1 else FIAT.FDMDiscontinuousLagrange(ref_el, degree-1) - if is_interior: - e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") - - A00 = petsc_sparse(fiat_reference_prolongator(e0, eq), comm=PETSc.COMM_SELF) - A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=PETSc.COMM_SELF) - A11 = petsc_sparse(numpy.eye(e1.space_dimension(), dtype=PETSc.RealType), comm=PETSc.COMM_SELF) - B_blocks = mass_blocks(tdim, formdegree, A00, A11) - A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) - result = block_mat(B_blocks + A_blocks, destroy_blocks=True) - A00.destroy() - A10.destroy() - A11.destroy() - - if value_size != 1: - eye = petsc_sparse(numpy.eye(value_size), comm=result.getComm()) - temp = result - result = temp.kron(eye) - temp.destroy() - eye.destroy() - - if is_facet: - cache[full_key] = result - noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) - result = result.createSubMatrix(noperm, self.fises) - noperm.destroy() + pass + if transpose: + result = self.assemble_reference_tensor(V, transpose=False) + result = PETSc.Mat().createTranspose(result).convert(result.getType()) + return cache.setdefault(key, result) + full_key = key[:-3] + (False,) * 3 + if is_facet and full_key in cache: + result = cache[full_key] + noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) + result = result.createSubMatrix(noperm, self.fises) + noperm.destroy() return cache.setdefault(key, result) + elements = sorted(get_base_elements(fe), key=lambda e: e.formdegree) + ref_el = elements[0].get_reference_element() + eq = FIAT.FDMQuadrature(ref_el, degree) + e0 = elements[0] if elements[0].formdegree == 0 else FIAT.FDMLagrange(ref_el, degree) + e1 = elements[-1] if elements[-1].formdegree == 1 else FIAT.FDMDiscontinuousLagrange(ref_el, degree-1) + if is_interior: + e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") + + A00 = petsc_sparse(fiat_reference_prolongator(e0, eq), comm=PETSc.COMM_SELF) + A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=PETSc.COMM_SELF) + A11 = petsc_sparse(numpy.eye(e1.space_dimension(), dtype=PETSc.RealType), comm=PETSc.COMM_SELF) + B_blocks = mass_blocks(tdim, formdegree, A00, A11) + A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) + result = block_mat(B_blocks + A_blocks, destroy_blocks=True) + A00.destroy() + A10.destroy() + A11.destroy() + + if value_size != 1: + eye = petsc_sparse(numpy.eye(value_size), comm=result.getComm()) + temp = result + result = temp.kron(eye) + temp.destroy() + eye.destroy() + + if is_facet: + cache[full_key] = result + noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) + result = result.createSubMatrix(noperm, self.fises) + noperm.destroy() + + return cache.setdefault(key, result) + + @cached_property + def _element_mass_matrix(self): + Z = [self.coefficients[name].function_space() for name in ("beta", "alpha")] + shape = (sum(V.finat_element.space_dimension() for V in Z),) + Z[0].shape + data = numpy.ones(shape, dtype=PETSc.RealType) + shape += (1,) * (3-len(shape)) + nrows = shape[0] * shape[1] + ai = numpy.arange(nrows+1, dtype=PETSc.IntType) + aj = numpy.tile(ai[:-1].reshape((-1, shape[1])), (1, shape[2])) + if shape[2] > 1: + ai *= shape[2] + data = 
numpy.tile(numpy.eye(shape[2], dtype=data.dtype), shape[:1] + (1,)*(len(shape)-1)) + return PETSc.Mat().createAIJ((nrows, nrows), csr=(ai, aj, data), comm=PETSc.COMM_SELF) + + @PETSc.Log.EventDecorator("FDMSetValues") + def set_values(self, A, Vrow, Vcol, addv, triu=False): + """ + Assemble the stiffness matrix in the FDM basis using sparse reference + tensors and diagonal mass matrices. + + :arg A: the :class:`PETSc.Mat` to assemble + :arg Vrow: the :class:`.FunctionSpace` test space + :arg Vcol: the :class:`.FunctionSpace` trial space + :arg addv: a `PETSc.Mat.InsertMode` + :arg triu: are we assembling only the upper triangular part? + """ + key = (Vrow.ufl_element(), Vcol.ufl_element()) + try: + assembler = self.assemblers[key] + except KeyError: + Vbig = None + if Vrow == Vcol: + Vbig = self.parent_space.get(Vrow) + + beta = self.coefficients["beta"] + alpha = self.coefficients["alpha"] + # Interpolation of basis and exterior derivative onto broken spaces + ctensor = self.assemble_reference_tensor(Vbig or Vcol) + rtensor = self.assemble_reference_tensor(Vbig or Vrow, transpose=True) + element_kernel = TripleProductKernel(rtensor, self._element_mass_matrix, ctensor, beta, alpha) + if Vbig is not None: + element_kernel = self.schur_kernel[Vrow](element_kernel) + + assembler = SparseAssembler(element_kernel, Vrow, Vcol, self.lgmaps[Vrow], self.lgmaps[Vcol]) + self.assemblers.setdefault(key, assembler) + assembler.assemble(A, addv=addv, triu=triu) + + +class SparseAssembler(object): + + _cache = {} -class SchurComplementBuilder(object): + @staticmethod + def setSubMatCSR(comm, triu=False): + """ + Compile C code to insert sparse submatrices and store in class cache + + :arg triu: are we inserting onto the upper triangular part of the matrix? + + :returns: a python wrapper for the matrix insertion function + """ + cache = SparseAssembler._cache.setdefault("setSubMatCSR", {}) + key = triu + try: + return cache[key] + except KeyError: + return cache.setdefault(key, load_setSubMatCSR(comm, triu)) + + def __init__(self, kernel, Vrow, Vcol, rmap, cmap): + self.kernel = kernel + m, n = kernel.result.getSize() + + spaces = [Vrow] + row_shape = tuple() if Vrow.value_size == 1 else (Vrow.value_size,) + map_rows = (self.map_block_indices, rmap) if row_shape else (rmap.apply,) + rows = numpy.empty((m, ), dtype=PETSc.IntType).reshape((-1,) + row_shape) + + self.bc_nodes = None + if Vcol == Vrow: + cols = rows + map_cols = (lambda *x, result=None: result, ) + # own = Vrow.dof_dset.layout_vec.getLocalSize() + # bc_nodes = numpy.flatnonzero(rmap.indices[:own] < 0).astype(PETSc.IntType) + # if len(bc_nodes) > 0: + # bc_nodes = Vrow.dof_dset.lgmap.apply(bc_nodes, result=bc_nodes) + # self.bc_nodes = bc_nodes[:, None] + else: + spaces.append(Vcol) + col_shape = tuple() if Vcol.value_size == 1 else (Vcol.value_size,) + map_cols = (self.map_block_indices, cmap) if col_shape else (cmap.apply, ) + cols = numpy.empty((n, ), dtype=PETSc.IntType).reshape((-1,) + col_shape) + + spaces.extend(c.function_space() for c in kernel.coefficients) + self.indices = tuple(numpy.empty((V.finat_element.space_dimension(),), dtype=PETSc.IntType) for V in spaces) + self.map_rows = partial(*map_rows, self.indices[spaces.index(Vrow)], result=rows) + self.map_cols = partial(*map_cols, self.indices[spaces.index(Vcol)], result=cols) + self.kernel_args = self.indices[1+spaces.index(Vcol):] + + integral_type = kernel.integral_type + if integral_type == "cell": + get_map = operator.methodcaller("cell_node_map") + elif integral_type == 
"interior_facet": + get_map = operator.methodcaller("interior_facet_node_map") + else: + raise NotImplementedError("Only for cell or interior facet integrals") + self.node_maps = tuple(map(get_map, spaces)) + node_map = self.node_maps[0] + self.nel = node_map.values.shape[0] + if node_map.offset is None: + layers = None + else: + layers = node_map.iterset.layers_array + layers = layers[:, 1]-layers[:, 0]-1 + if layers.shape[0] != self.nel: + layers = numpy.repeat(layers, self.nel) + self.layers = layers + + def map_block_indices(self, lgmap, indices, result=None): + bsize = result.shape[1] + numpy.copyto(result[:, 0], indices) + result[:, 0] *= bsize + numpy.add.outer(result[:, 0], numpy.arange(1, bsize, dtype=indices.dtype), out=result[:, 1:]) + return lgmap.apply(result, result=result) + + def set_indices(self, e): + for index, node_map in zip(self.indices, self.node_maps): + numpy.copyto(index, node_map.values_with_halo[e]) + + def add_offsets(self): + for index, node_map in zip(self.indices, self.node_maps): + index += node_map.offset + + def assemble(self, A, addv=None, triu=False): + if A.getType() == PETSc.Mat.Type.PREALLOCATOR: + kernel = lambda *args, result=None: result + else: + kernel = self.kernel + triu = False + if self.bc_nodes is not None: + vals = numpy.ones(self.bc_nodes.shape, dtype=PETSc.RealType) + A.setValuesRCV(self.bc_nodes, self.bc_nodes, vals, addv) + result = self.kernel.result + insert = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) + + # Core assembly loop + if self.layers is None: + for e in range(self.nel): + self.set_indices(e) + insert(A, kernel(*self.kernel_args, result=result), + self.map_rows(), self.map_cols(), addv) + else: + for e in range(self.nel): + self.set_indices(e) + for _ in range(self.layers[e]): + insert(A, kernel(*self.kernel_args, result=result), + self.map_rows(), self.map_cols(), addv) + self.add_offsets() + + +class ElementKernel(object): """ - Class to build Schur complement matrices that reuses work matrices and the - symbolic factorization of the interior block. + A constant element kernel """ + def __init__(self, A, *coefficients): + self.result = A + self.coefficients = coefficients + self.integral_type = "cell" - def __init__(self, idofs, fdofs): - self.idofs = idofs - self.fdofs = fdofs - self.slices = {} - self.ises = tuple() - self.isrows = [] - self.iscols = [] - self.submats = [] - self.work = [None for _ in range(2)] + def __call__(self, *args, result=None): + return result or self.result def __del__(self): - self.reset() + self.destroy() - def reset(self): - for obj in self.ises: - if isinstance(obj, PETSc.Object): - obj.destroy() - for obj in self.submats: - if isinstance(obj, PETSc.Object): - obj.destroy() - for obj in self.work: + def destroy(self): + pass + + +class TripleProductKernel(ElementKernel): + """ + An element kernel to compute a triple matrix product A * B * C Where A and + C are constant matrices and B is a block diagonal matrix with entries given + by coefficients. + See Equation (3.9) of Brubeck2022b. 
+ """ + def __init__(self, A, B, C, *coefficients): + self.work = None + V = coefficients[0].function_space() + dshape = (-1, ) + coefficients[0].dat.data_ro.shape[1:] + if V.value_size == 1: + self.work = B.getDiagonal() + self.update = partial(B.setDiagonal, self.work) + self.data = self.work.array_w.reshape(dshape) + else: + indptr, indices, data = B.getValuesCSR() + self.data = data.reshape(dshape) + self.update = lambda *args: (B.setValuesCSR(indptr, indices, self.data), B.assemble()) + + stops = numpy.cumsum([0] + [c.function_space().finat_element.space_dimension() for c in coefficients]) + self.slices = [slice(*stops[k:k+2]) for k in range(len(stops)-1)] + self.product = partial(A.matMatMult, B, C) + super().__init__(self.product(), *coefficients) + + def __call__(self, *indices, result=None): + for c, i, z in zip(self.coefficients, indices, self.slices): + numpy.take(c.dat.data_ro, i, axis=0, out=self.data[z]) + self.update() + return self.product(result=result) + + def destroy(self): + self.result.destroy() + if isinstance(self.work, PETSc.Object): + self.work.destroy() + + +class SchurComplementKernel(ElementKernel): + """ + An element kernel to compute Schur complements that reuses work matrices and the + symbolic factorization of the interior block. + """ + def __init__(self, idofs, fdofs, kernel): + self.kernel = kernel + self.A = kernel.result + comm = self.A.getComm() + i0, i1 = tuple(PETSc.IS().createGeneral(i, comm=comm) for i in (idofs, fdofs)) + self.slices = self.sort_interior_dofs(i0, self.A) + self.isrows = [i0, i0, i1, i1] + self.iscols = [i0, i1, i0, i1] + self.ises = (i0, i1) + self.work = [None for _ in range(2)] + self.submats = [] + super().__init__(self.condense(), *kernel.coefficients) + + def __call__(self, *args, result=None): + self.kernel(*args, result=self.A) + return self.condense(result=result) + + def destroy(self): + self.kernel.destroy() + self.result.destroy() + objs = [] + objs.extend(self.ises) + objs.extend(self.work) + objs.extend(self.submats) + for obj in objs: if isinstance(obj, PETSc.Object): obj.destroy() - self.submats = [] - self.work = [None for _ in range(2)] def sort_interior_dofs(self, i0, A): """Permute `i0` to have A[i0, i0] with square blocks of - increasing dimension along its diagonal. Add slices with the extents + increasing dimension along its diagonal. 
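        The grouping can be pictured on a small matrix: rows are keyed by
        their number of nonzeros, so 1x1 blocks come first, then 2x2 blocks,
        and so on.  A rough numpy sketch of the idea (the routine itself also
        keeps coupled rows adjacent within each group):

            import numpy
            A = numpy.array([[2., 0., 1., 0.],
                             [0., 3., 0., 0.],
                             [1., 0., 2., 0.],
                             [0., 0., 0., 4.]])
            degree = numpy.count_nonzero(A, axis=1)
            perm = numpy.argsort(degree, kind="stable")
            # A[perm][:, perm] is block diagonal: two 1x1 blocks, then a 2x2 block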
+ + Return a dict of slices with the extents of each set of blocks in the CSR representation of A.""" A00 = A.createSubMatrix(i0, i0) indptr, indices, _ = A00.getValuesCSR() @@ -712,7 +782,7 @@ def sort_interior_dofs(self, i0, A): perm = numpy.argsort(degree) icur = 0 istart = 0 - self.slices[1] = slice(0, 0) + slices = {1: slice(0, 0)} unique_degree, counts = numpy.unique(degree, return_counts=True) for k, kdofs in sorted(zip(unique_degree, counts)): if k > 1: @@ -722,35 +792,34 @@ def sort_interior_dofs(self, i0, A): neigh[row] = indices[slice(*indptr[i:i+2])] perm[icur:icur+kdofs] = list(dict.fromkeys(neigh.flat)) - self.slices[k] = slice(istart, istart + k * kdofs) + slices[k] = slice(istart, istart + k * kdofs) istart += k * kdofs icur += kdofs i0.setIndices(i0.getIndices()[perm]) A00.destroy() + return slices - def get_blocks(self, A): - if len(self.submats) == 0: - comm = A.getComm() - i0 = PETSc.IS().createGeneral(self.idofs, comm=comm) - i1 = PETSc.IS().createGeneral(self.fdofs, comm=comm) - self.sort_interior_dofs(i0, A) - self.isrows = [i0, i0, i1, i1] - self.iscols = [i0, i1, i0, i1] - self.ises = (i0, i1) - self.submats = A.createSubMatrices(self.isrows, iscols=self.iscols, submats=self.submats or None) + def get_blocks(self): + self.submats = self.A.createSubMatrices(self.isrows, self.iscols, submats=self.submats or None) return self.submats @PETSc.Log.EventDecorator("FDMCondense") - def condense(self, A, result=None): + def condense(self, result=None): + """By default pad with zeros the statically condensed pattern""" + structure = PETSc.Mat.Structure.SUBSET if result else None + if result is None: + A00, A01, A10, _ = self.get_blocks() + result = A10.matMatMult(A00, A01, result=result) + result.aypx(0.0, self.A, structure=structure) return result -class SchurComplementDiagonal(SchurComplementBuilder): +class SchurComplementDiagonal(SchurComplementKernel): @PETSc.Log.EventDecorator("FDMCondense") - def condense(self, A, result=None): + def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks(A) + A00, A01, A10, A11 = self.get_blocks() self.work[0] = A00.getDiagonal(result=self.work[0]) self.work[0].reciprocal() self.work[0].scale(-1) @@ -760,24 +829,12 @@ def condense(self, A, result=None): return result -class SchurComplementPattern(SchurComplementBuilder): - - @PETSc.Log.EventDecorator("FDMCondense") - def condense(self, A, result=None): - structure = PETSc.Mat.Structure.SUBSET if result else None - if result is None: - A00, A01, A10, _ = self.get_blocks(A) - result = A10.matMatMult(A00, A01, result=result) - result.aypx(0.0, A, structure=structure) - return result - - -class SchurComplementBlockCholesky(SchurComplementBuilder): +class SchurComplementBlockCholesky(SchurComplementKernel): @PETSc.Log.EventDecorator("FDMCondense") - def condense(self, A, result=None): + def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks(A) + A00, A01, A10, A11 = self.get_blocks() indptr, indices, R = A00.getValuesCSR() zlice = self.slices[1] @@ -785,9 +842,10 @@ def condense(self, A, result=None): numpy.reciprocal(R[zlice], out=R[zlice]) flops = 2 * (zlice.stop - zlice.start) for k in sorted(degree for degree in self.slices if degree > 1): - zlice = self.slices[k] - A = R[zlice].reshape((-1, k, k)) - R[zlice] = numpy.linalg.inv(numpy.linalg.cholesky(A)).reshape((-1)) + Rk = R[self.slices[k]] + A = Rk.reshape((-1, k, k)) + rinv = 
numpy.linalg.inv(numpy.linalg.cholesky(A)) + numpy.copyto(Rk, rinv.flat) flops += A.shape[0] * ((k**3)//3 + k**3) PETSc.Log.logFlops(flops) @@ -800,12 +858,12 @@ def condense(self, A, result=None): return result -class SchurComplementBlockQR(SchurComplementBuilder): +class SchurComplementBlockQR(SchurComplementKernel): - @PETSc.Log.EventDecorator("FDMGetSchur") - def condense(self, A, result=None): + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks(A) + A00, A01, A10, A11 = self.get_blocks() indptr, indices, R = A00.getValuesCSR() Q = numpy.ones(R.shape, dtype=R.dtype) @@ -816,8 +874,9 @@ def condense(self, A, result=None): zlice = self.slices[k] A = R[zlice].reshape((-1, k, k)) q, r = numpy.linalg.qr(A, mode="complete") - Q[zlice] = q.reshape((-1,)) - R[zlice] = numpy.linalg.inv(r).reshape((-1,)) + numpy.copyto(Q[zlice], q.flat) + rinv = numpy.linalg.inv(r) + numpy.copyto(R[zlice], rinv.flat) flops += A.shape[0] * ((4*k**3)//3 + k**3) PETSc.Log.logFlops(flops) @@ -832,12 +891,12 @@ def condense(self, A, result=None): return result -class SchurComplementBlockSVD(SchurComplementBuilder): +class SchurComplementBlockSVD(SchurComplementKernel): - @PETSc.Log.EventDecorator("FDMGetSchur") - def condense(self, A, result=None): + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks(A) + A00, A01, A10, A11 = self.get_blocks() indptr, indices, U = A00.getValuesCSR() V = numpy.ones(U.shape, dtype=U.dtype) self.work[0] = A00.getDiagonal(result=self.work[0]) @@ -850,9 +909,9 @@ def condense(self, A, result=None): A = U[bslice].reshape((-1, k, k)) u, s, v = numpy.linalg.svd(A, full_matrices=False) dslice = slice(dslice.stop, dslice.stop + k * A.shape[0]) - D.array_w[dslice] = s.reshape((-1,)) - U[bslice] = numpy.transpose(u, axes=(0, 2, 1)).reshape((-1,)) - V[bslice] = numpy.transpose(v, axes=(0, 2, 1)).reshape((-1,)) + numpy.copyto(D.array_w[dslice], s.flat) + numpy.copyto(U[bslice], numpy.transpose(u, axes=(0, 2, 1)).flat) + numpy.copyto(V[bslice], numpy.transpose(v, axes=(0, 2, 1)).flat) flops += A.shape[0] * ((4*k**3)//3 + 4*k**3) PETSc.Log.logFlops(flops) @@ -871,21 +930,22 @@ def condense(self, A, result=None): return result -class SchurComplementBlockInverse(SchurComplementBuilder): +class SchurComplementBlockInverse(SchurComplementKernel): - @PETSc.Log.EventDecorator("FDMGetSchur") - def condense(self, A, result=None): + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks(A) + A00, A01, A10, A11 = self.get_blocks() indptr, indices, R = A00.getValuesCSR() zlice = self.slices[1] numpy.reciprocal(R[zlice], out=R[zlice]) flops = zlice.stop - zlice.start for k in sorted(degree for degree in self.slices if degree > 1): - zlice = self.slices[k] - A = R[zlice].reshape((-1, k, k)) - R[zlice] = numpy.linalg.inv(A).reshape((-1,)) + Rk = R[self.slices[k]] + A = Rk.reshape((-1, k, k)) + rinv = numpy.linalg.inv(A) + numpy.copyto(Rk, rinv.flat) flops += A.shape[0] * (k**3) PETSc.Log.logFlops(flops) @@ -899,25 +959,12 @@ def condense(self, A, result=None): @PETSc.Log.EventDecorator("LoadCode") def load_c_code(code, name, **kwargs): - cppargs = ["-I%s/include" % d for d in get_petsc_dir()] - ldargs = (["-L%s/lib" % d for d in get_petsc_dir()] - + 
["-Wl,-rpath,%s/lib" % d for d in get_petsc_dir()] + petsc_dir = get_petsc_dir() + cppargs = ["-I%s/include" % d for d in petsc_dir] + ldargs = (["-L%s/lib" % d for d in petsc_dir] + + ["-Wl,-rpath,%s/lib" % d for d in petsc_dir] + ["-lpetsc", "-lm"]) - funptr = load(code, "c", name, - cppargs=cppargs, ldargs=ldargs, - **kwargs) - - def get_pointer(obj): - if isinstance(obj, PETSc.Object): - return obj.handle - elif isinstance(obj, numpy.ndarray): - return obj.ctypes.data - return obj - - @PETSc.Log.EventDecorator(name) - def wrapper(*args): - return funptr(*map(get_pointer, args)) - return wrapper + return load(code, "c", name, cppargs=cppargs, ldargs=ldargs, **kwargs) def load_setSubMatCSR(comm, triu=False): @@ -925,10 +972,10 @@ def load_setSubMatCSR(comm, triu=False): Done in C for efficiency, since it loops over rows.""" if triu: name = "setSubMatCSR_SBAIJ" - select_cols = "icol < irow ? -1: icol" + select_cols = "icol -= (icol < irow) * (1 + icol);" else: name = "setSubMatCSR_AIJ" - select_cols = "icol" + select_cols = "" code = f""" #include @@ -959,7 +1006,8 @@ def load_setSubMatCSR(comm, triu=False): irow = rindices[i]; for (PetscInt j = 0; j < ncols; j++) {{ icol = cindices[cols[j]]; - indices[j] = {select_cols}; + {select_cols} + indices[j] = icol; }} ierr = MatSetValues(A, 1, &irow, ncols, indices, vals, addv);CHKERRQ(ierr); ierr = MatRestoreRow(B, i, &ncols, &cols, &vals);CHKERRQ(ierr); @@ -970,8 +1018,14 @@ def load_setSubMatCSR(comm, triu=False): """ argtypes = [ctypes.c_voidp, ctypes.c_voidp, ctypes.c_voidp, ctypes.c_voidp, ctypes.c_int] - return load_c_code(code, name, comm=comm, argtypes=argtypes, - restype=ctypes.c_int) + funptr = load_c_code(code, name, comm=comm, argtypes=argtypes, + restype=ctypes.c_int) + + @PETSc.Log.EventDecorator(name) + def wrapper(A, B, rows, cols, addv): + return funptr(A.handle, B.handle, rows.ctypes.data, cols.ctypes.data, addv) + + return wrapper def is_restricted(finat_element): @@ -1139,18 +1193,6 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None): temp.destroy() eye.destroy() - rmap = Vf.local_to_global_map(fbcs) - cmap = Vc.local_to_global_map(cbcs) - rlocal, nel = extrude_node_map(Vf.cell_node_map(), bsize=Vf.value_size) - clocal, nel = extrude_node_map(Vc.cell_node_map(), bsize=Vc.value_size) - - def cell_to_global(lgmap, cell_to_local, e, result=None): - result = cell_to_local(e, result=result) - return lgmap.apply(result, result=result) - - imode = PETSc.InsertMode.INSERT - update_Dmat = FDMPC.setSubMatCSR(PETSc.COMM_SELF, triu=False) - sizes = tuple(V.dof_dset.layout_vec.getSizes() for V in (Vf, Vc)) block_size = Vf.dof_dset.layout_vec.getBlockSize() preallocator = PETSc.Mat().create(comm=comm) @@ -1158,24 +1200,19 @@ def cell_to_global(lgmap, cell_to_local, e, result=None): preallocator.setSizes(sizes) preallocator.setUp() - rindices = None - cindices = None - for e in range(nel): - rindices = cell_to_global(rmap, rlocal, e, result=rindices) - cindices = cell_to_global(cmap, clocal, e, result=cindices) - update_Dmat(preallocator, Dhat, rindices, cindices, imode) - + insert = PETSc.InsertMode.INSERT + rmap = Vf.local_to_global_map(fbcs) + cmap = Vc.local_to_global_map(cbcs) + assembler = SparseAssembler(ElementKernel(Dhat), Vf, Vc, rmap, cmap) + assembler.assemble(preallocator, addv=insert) preallocator.assemble() + nnz = get_preallocation(preallocator, sizes[0][0]) preallocator.destroy() Dmat = PETSc.Mat().createAIJ(sizes, block_size, nnz=nnz, comm=comm) 
Dmat.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True) - for e in range(nel): - rindices = cell_to_global(rmap, rlocal, e, result=rindices) - cindices = cell_to_global(cmap, clocal, e, result=cindices) - update_Dmat(Dmat, Dhat, rindices, cindices, imode) - + assembler.assemble(Dmat, addv=insert) Dmat.assemble() Dhat.destroy() return Dmat @@ -1285,11 +1322,18 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): :arg addv: a `PETSc.Mat.InsertMode` :arg triu: are we assembling only the upper triangular part? """ - set_submat = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) + set_submat = SparseAssembler.setSubMatCSR(PETSc.COMM_SELF, triu=triu) update_A = lambda A, Ae, rindices: set_submat(A, Ae, rindices, rindices, addv) condense_element_mat = lambda x: x - get_rindices = self.cell_to_global[Vrow] + def cell_to_global(lgmap, cell_to_local, cell_index, result=None): + # Be careful not to create new arrays + result = cell_to_local(cell_index, result=result) + return lgmap.apply(result, result=result) + + bsize = Vrow.dof_dset.layout_vec.getBlockSize() + cell_to_local, nel = extrude_node_map(Vrow.cell_node_map(), bsize=bsize) + get_rindices = partial(cell_to_global, self.lgmaps[Vrow], cell_to_local) Afdm, Dfdm, bdof, axes_shifts = self.assemble_reference_tensor(Vrow) Gq = self.coefficients.get("alpha") @@ -1333,7 +1377,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): aptr = numpy.arange(0, (bshape[0]+1)*bshape[1], bshape[1], dtype=PETSc.IntType) aidx = numpy.tile(numpy.arange(bshape[1], dtype=PETSc.IntType), bshape[0]) - for e in range(self.nel): + for e in range(nel): # Ae = Be kron Bq[e] adata = numpy.sum(Bq.dat.data_ro[index_coef(e)], axis=0) Ae = PETSc.Mat().createAIJWithArrays(bshape, (aptr, aidx, adata), comm=PETSc.COMM_SELF) @@ -1349,7 +1393,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): ae = numpy.zeros((ncomp, tdim), dtype=PETSc.RealType) be = numpy.zeros((ncomp,), dtype=PETSc.RealType) je = None - for e in range(self.nel): + for e in range(nel): je = index_coef(e, result=je) bce = bcflags.dat.data_ro_with_halos[index_bc(e)] > 1E-8 # get coefficients on this cell @@ -1622,7 +1666,6 @@ def assemble_coefficients(self, J, fcp): for coef in coefficients.values(): with coef.dat.vec as cvec: cvec.set(1.0E0) - self.coefficients = coefficients return coefficients, assembly_callables From 49302eb5eb34401bc61bf9795c5fb32a225171a5 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 10 Apr 2023 16:07:09 +0100 Subject: [PATCH 64/75] avoid createSubMatrices in static condensation --- firedrake/preconditioners/fdm.py | 218 ++++++++++++++++--------------- 1 file changed, 112 insertions(+), 106 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 56eb9defad..d5583ed177 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -218,27 +218,24 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): if value_size != 1: fdofs = numpy.add.outer(value_size * fdofs, numpy.arange(value_size, dtype=fdofs.dtype)) self.fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) - dofs = numpy.arange(value_size * Vbig.finat_element.space_dimension(), dtype=fdofs.dtype) - idofs = numpy.setdiff1d(dofs, fdofs, assume_unique=True) - # Dictionaries with the parent space and kernel to compute the Schur complement - self.parent_space = {} + # Dictionaries with the complement space and kernel to compute the Schur complement + self.complement_space = {} self.schur_kernel = {} if 
Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block + self.complement_space[Vfacet] = FunctionSpace(V.mesh(), restrict_element(ebig, "interior")) if Vfacet.finat_element.formdegree == 0 and value_size == 1: - sc_builder = SchurComplementDiagonal + self.schur_kernel[Vfacet] = SchurComplementDiagonal elif pmat_type.endswith("sbaij"): - sc_builder = SchurComplementBlockCholesky + self.schur_kernel[Vfacet] = SchurComplementBlockCholesky else: - sc_builder = SchurComplementBlockQR - self.schur_kernel[Vfacet] = partial(sc_builder, idofs, fdofs) - self.parent_space[Vfacet] = Vbig + self.schur_kernel[Vfacet] = SchurComplementBlockQR elif len(fdofs) and V.finat_element.formdegree == 0: # If we are in H(grad), we just pad with zeros on the statically-condensed pattern - self.schur_kernel[V] = partial(SchurComplementKernel, idofs, dofs) - self.parent_space[V] = V + self.complement_space[V] = FunctionSpace(V.mesh(), restrict_element(V.ufl_element(), "interior")) + self.schur_kernel[V] = SchurComplementKernel # Create data structures needed for assembly self.lgmaps = {Vsub: Vsub.local_to_global_map([bc for bc in bcs if bc.function_space() == Vsub]) for Vsub in V} @@ -450,7 +447,7 @@ def assemble_coefficients(self, J, fcp, block_diagonal=True): return coefficients, assembly_callables @PETSc.Log.EventDecorator("FDMRefTensor") - def assemble_reference_tensor(self, V, transpose=False): + def assemble_reference_tensor(self, V, transpose=False, sort_interior=False): """ Return the reference tensor used in the diagonal factorisation of the sparse cell matrices. See Section 3.2 of Brubeck2022b. @@ -470,18 +467,38 @@ def assemble_reference_tensor(self, V, transpose=False): if formdegree == tdim: degree = degree + 1 is_interior, is_facet = is_restricted(fe) - key = (value_size, tdim, degree, formdegree, is_interior, is_facet, transpose) + key = (value_size, tdim, degree, formdegree, is_interior, is_facet, transpose, sort_interior) cache = self._cache.setdefault("reference_tensor", {}) try: return cache[key] except KeyError: pass + if transpose: - result = self.assemble_reference_tensor(V, transpose=False) + result = self.assemble_reference_tensor(V, transpose=False, sort_interior=sort_interior) result = PETSc.Mat().createTranspose(result).convert(result.getType()) return cache.setdefault(key, result) - full_key = key[:-3] + (False,) * 3 + if sort_interior: + assert is_interior and not is_facet and not transpose + result = self.assemble_reference_tensor(V, transpose=transpose, sort_interior=False) + # Compute the stiffness matrix on the interior of a cell + A00 = self._element_mass_matrix.PtAP(result) + indptr, indices, _ = A00.getValuesCSR() + degree = numpy.diff(indptr) + # Sort DOFs to make A00 block diagonal with blocks of increasing dimension along the diagonal + perm = numpy.array(list(dict.fromkeys(indices)), dtype=indices.dtype) + perm = perm[numpy.argsort(degree[perm], kind='stable')] + A00.destroy() + + perm = PETSc.IS().createGeneral(perm, comm=result.getComm()) + noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) + result = result.createSubMatrix(noperm, perm) + noperm.destroy() + perm.destroy() + return cache.setdefault(key, result) + + full_key = key[:-4] + (False,) * 4 if is_facet and full_key in cache: result = cache[full_key] noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) @@ -552,18 +569,25 @@ def 
set_values(self, A, Vrow, Vcol, addv, triu=False): try: assembler = self.assemblers[key] except KeyError: - Vbig = None - if Vrow == Vcol: - Vbig = self.parent_space.get(Vrow) - - beta = self.coefficients["beta"] - alpha = self.coefficients["alpha"] # Interpolation of basis and exterior derivative onto broken spaces - ctensor = self.assemble_reference_tensor(Vbig or Vcol) - rtensor = self.assemble_reference_tensor(Vbig or Vrow, transpose=True) - element_kernel = TripleProductKernel(rtensor, self._element_mass_matrix, ctensor, beta, alpha) - if Vbig is not None: - element_kernel = self.schur_kernel[Vrow](element_kernel) + C1 = self.assemble_reference_tensor(Vcol) + R1 = self.assemble_reference_tensor(Vrow, transpose=True) + M = self._element_mass_matrix + # Element stiffness matrix = R1 * M * C1, see Equation (3.9) of Brubeck2022b + element_kernel = TripleProductKernel(R1, M, C1, self.coefficients["beta"], self.coefficients["alpha"]) + + schur_kernel = None + if Vrow == Vcol: + schur_kernel = self.schur_kernel.get(Vrow) + if schur_kernel is not None: + V0 = self.complement_space[Vrow] + C0 = self.assemble_reference_tensor(V0, sort_interior=True) + R0 = self.assemble_reference_tensor(V0, sort_interior=True, transpose=True) + # Only the facet block updates the coefficients in M + element_kernel = schur_kernel(element_kernel, + TripleProductKernel(R1, M, C0), + TripleProductKernel(R0, M, C1), + TripleProductKernel(R0, M, C0)) assembler = SparseAssembler(element_kernel, Vrow, Vcol, self.lgmaps[Vrow], self.lgmaps[Vcol]) self.assemblers.setdefault(key, assembler) @@ -618,7 +642,7 @@ def __init__(self, kernel, Vrow, Vcol, rmap, cmap): self.indices = tuple(numpy.empty((V.finat_element.space_dimension(),), dtype=PETSc.IntType) for V in spaces) self.map_rows = partial(*map_rows, self.indices[spaces.index(Vrow)], result=rows) self.map_cols = partial(*map_cols, self.indices[spaces.index(Vcol)], result=cols) - self.kernel_args = self.indices[1+spaces.index(Vcol):] + self.kernel_args = self.indices[-len(kernel.coefficients):] integral_type = kernel.integral_type if integral_type == "cell": @@ -702,26 +726,31 @@ def destroy(self): class TripleProductKernel(ElementKernel): """ - An element kernel to compute a triple matrix product A * B * C Where A and + An element kernel to compute a triple matrix product A * B * C, where A and C are constant matrices and B is a block diagonal matrix with entries given by coefficients. - See Equation (3.9) of Brubeck2022b. 
""" def __init__(self, A, B, C, *coefficients): self.work = None - V = coefficients[0].function_space() - dshape = (-1, ) + coefficients[0].dat.data_ro.shape[1:] - if V.value_size == 1: - self.work = B.getDiagonal() - self.update = partial(B.setDiagonal, self.work) - self.data = self.work.array_w.reshape(dshape) + if len(coefficients) == 0: + self.data = numpy.array([]) + self.update = lambda *args: args else: - indptr, indices, data = B.getValuesCSR() - self.data = data.reshape(dshape) - self.update = lambda *args: (B.setValuesCSR(indptr, indices, self.data), B.assemble()) + V = coefficients[0].function_space() + dshape = (-1, ) + coefficients[0].dat.data_ro.shape[1:] + if V.value_size == 1: + self.work = B.getDiagonal() + self.data = self.work.array_w.reshape(dshape) + self.update = partial(B.setDiagonal, self.work) + else: + indptr, indices, data = B.getValuesCSR() + self.data = data.reshape(dshape) + self.update = lambda *args: (B.setValuesCSR(indptr, indices, self.data), B.assemble()) + + stops = numpy.zeros((len(coefficients) + 1,), dtype=PETSc.IntType) + numpy.cumsum([c.function_space().finat_element.space_dimension() for c in coefficients], out=stops[1:]) + self.slices = [slice(*stops[k:k+2]) for k in range(len(coefficients))] - stops = numpy.cumsum([0] + [c.function_space().finat_element.space_dimension() for c in coefficients]) - self.slices = [slice(*stops[k:k+2]) for k in range(len(stops)-1)] self.product = partial(A.matMatMult, B, C) super().__init__(self.product(), *coefficients) @@ -742,75 +771,48 @@ class SchurComplementKernel(ElementKernel): An element kernel to compute Schur complements that reuses work matrices and the symbolic factorization of the interior block. """ - def __init__(self, idofs, fdofs, kernel): - self.kernel = kernel - self.A = kernel.result - comm = self.A.getComm() - i0, i1 = tuple(PETSc.IS().createGeneral(i, comm=comm) for i in (idofs, fdofs)) - self.slices = self.sort_interior_dofs(i0, self.A) - self.isrows = [i0, i0, i1, i1] - self.iscols = [i0, i1, i0, i1] - self.ises = (i0, i1) + def __init__(self, *kernels): + self.children = kernels + self.submats = [k.result for k in self.children] + + # Create dict of slices with the extents of the diagonal blocks + A00 = self.submats[-1] + degree = numpy.diff(A00.getValuesCSR()[0]) + istart = 0 + self.slices = {1: slice(0, 0)} + unique_degree, counts = numpy.unique(degree, return_counts=True) + for k, kdofs in sorted(zip(unique_degree, counts)): + self.slices[k] = slice(istart, istart + k * kdofs) + istart += k * kdofs + self.work = [None for _ in range(2)] - self.submats = [] - super().__init__(self.condense(), *kernel.coefficients) + coefficients = [] + for k in self.children: + coefficients.extend(k.coefficients) + coefficients = list(dict.fromkeys(coefficients)) + super().__init__(self.condense(), *coefficients) def __call__(self, *args, result=None): - self.kernel(*args, result=self.A) + for k in self.children: + k(*args, result=k.result) return self.condense(result=result) def destroy(self): - self.kernel.destroy() + for k in self.children: + k.destroy() self.result.destroy() - objs = [] - objs.extend(self.ises) - objs.extend(self.work) - objs.extend(self.submats) - for obj in objs: + for obj in self.work: if isinstance(obj, PETSc.Object): obj.destroy() - def sort_interior_dofs(self, i0, A): - """Permute `i0` to have A[i0, i0] with square blocks of - increasing dimension along its diagonal. 
- - Return a dict of slices with the extents - of each set of blocks in the CSR representation of A.""" - A00 = A.createSubMatrix(i0, i0) - indptr, indices, _ = A00.getValuesCSR() - degree = numpy.diff(indptr) - perm = numpy.argsort(degree) - icur = 0 - istart = 0 - slices = {1: slice(0, 0)} - unique_degree, counts = numpy.unique(degree, return_counts=True) - for k, kdofs in sorted(zip(unique_degree, counts)): - if k > 1: - neigh = numpy.empty((kdofs, k), dtype=indices.dtype) - for row in range(kdofs): - i = perm[icur+row] - neigh[row] = indices[slice(*indptr[i:i+2])] - perm[icur:icur+kdofs] = list(dict.fromkeys(neigh.flat)) - - slices[k] = slice(istart, istart + k * kdofs) - istart += k * kdofs - icur += kdofs - i0.setIndices(i0.getIndices()[perm]) - A00.destroy() - return slices - - def get_blocks(self): - self.submats = self.A.createSubMatrices(self.isrows, self.iscols, submats=self.submats or None) - return self.submats - @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): """By default pad with zeros the statically condensed pattern""" structure = PETSc.Mat.Structure.SUBSET if result else None if result is None: - A00, A01, A10, _ = self.get_blocks() + _, A10, A01, A00 = self.submats result = A10.matMatMult(A00, A01, result=result) - result.aypx(0.0, self.A, structure=structure) + result.aypx(0.0, self.submats[0], structure=structure) return result @@ -819,7 +821,7 @@ class SchurComplementDiagonal(SchurComplementKernel): @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks() + A11, A10, A01, A00 = self.submats self.work[0] = A00.getDiagonal(result=self.work[0]) self.work[0].reciprocal() self.work[0].scale(-1) @@ -834,7 +836,7 @@ class SchurComplementBlockCholesky(SchurComplementKernel): @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks() + A11, A10, A01, A00 = self.submats indptr, indices, R = A00.getValuesCSR() zlice = self.slices[1] @@ -863,7 +865,7 @@ class SchurComplementBlockQR(SchurComplementKernel): @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks() + A11, A10, A01, A00 = self.submats indptr, indices, R = A00.getValuesCSR() Q = numpy.ones(R.shape, dtype=R.dtype) @@ -896,7 +898,7 @@ class SchurComplementBlockSVD(SchurComplementKernel): @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks() + A11, A10, A01, A00 = self.submats indptr, indices, U = A00.getValuesCSR() V = numpy.ones(U.shape, dtype=U.dtype) self.work[0] = A00.getDiagonal(result=self.work[0]) @@ -935,7 +937,7 @@ class SchurComplementBlockInverse(SchurComplementKernel): @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A00, A01, A10, A11 = self.get_blocks() + A11, A10, A01, A00 = self.submats indptr, indices, R = A00.getValuesCSR() zlice = self.slices[1] @@ -1218,6 +1220,18 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None): return Dmat +def restrict_element(ele, restriction_domain): + """Get an element that is not restricted and return the restricted element.""" + if isinstance(ele, ufl.VectorElement): + return 
type(ele)(restrict_element(ele._sub_element, restriction_domain), dim=ele.num_sub_elements()) + elif isinstance(ele, ufl.TensorElement): + return type(ele)(restrict_element(ele._sub_element, restriction_domain), shape=ele._shape, symmetry=ele.symmetry()) + elif isinstance(ele, ufl.MixedElement): + return type(ele)(*(restrict_element(e, restriction_domain) for e in ele.sub_elements())) + else: + return ele[restriction_domain] + + def unrestrict_element(ele): """Get an element that might or might not be restricted and return the parent unrestricted element.""" @@ -1225,18 +1239,10 @@ def unrestrict_element(ele): return type(ele)(unrestrict_element(ele._sub_element), dim=ele.num_sub_elements()) elif isinstance(ele, ufl.TensorElement): return type(ele)(unrestrict_element(ele._sub_element), shape=ele._shape, symmetry=ele.symmetry()) - elif isinstance(ele, ufl.EnrichedElement): - return type(ele)(*list(dict.fromkeys(unrestrict_element(e) for e in ele._elements))) - elif isinstance(ele, ufl.TensorProductElement): - return type(ele)(*(unrestrict_element(e) for e in ele.sub_elements()), cell=ele.cell()) elif isinstance(ele, ufl.MixedElement): return type(ele)(*(unrestrict_element(e) for e in ele.sub_elements())) - elif isinstance(ele, ufl.WithMapping): - return type(ele)(unrestrict_element(ele.wrapee), ele.mapping()) elif isinstance(ele, ufl.RestrictedElement): return unrestrict_element(ele._element) - elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement)): - return type(ele)(unrestrict_element(ele._element)) else: return ele From ce7cae1f2a3da111eca447645a53a4646cf3a9f9 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Mon, 17 Apr 2023 17:45:46 +0100 Subject: [PATCH 65/75] extract finite elements from coefficient spaces, exploit symmetry in SchurComplementCholesky --- firedrake/preconditioners/fdm.py | 246 ++++++++++++++++++------------- firedrake/preconditioners/pmg.py | 16 +- 2 files changed, 151 insertions(+), 111 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index d5583ed177..c7d031079f 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -4,7 +4,7 @@ from firedrake.preconditioners.base import PCBase from firedrake.preconditioners.patch import bcdofs from firedrake.preconditioners.pmg import (prolongation_matrix_matfree, - fiat_reference_prolongator, + evaluate_dual, get_permutation_to_line_elements) from firedrake.preconditioners.facet_split import split_dofs, restricted_dofs from firedrake.formmanipulation import ExtractSubBlock @@ -193,15 +193,17 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): :arg J: the Jacobian bilinear form :arg bcs: an iterable of boundary conditions on V :arg fcp: form compiler parameters to assemble coefficients - :arg pmat_type: the preconditioner `PETSc.Mat.Type` + :arg pmat_type: the `PETSc.Mat.Type` for the blocks in the diagonal :arg use_static_condensation: are we assembling the statically-condensed Schur complement on facets? 
:returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and a list of assembly callables """ + symmetric = pmat_type.endswith("sbaij") ifacet = [i for i, Vsub in enumerate(V) if is_restricted(Vsub.finat_element)[1]] if len(ifacet) == 0: Vfacet = None Vbig = V + ebig = V.ufl_element() _, fdofs = split_dofs(V.finat_element) elif len(ifacet) == 1: Vfacet = V[ifacet[0]] @@ -213,30 +215,27 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): fdofs = restricted_dofs(Vfacet.finat_element, Vbig.finat_element) else: raise ValueError("Expecting at most one FunctionSpace restricted onto facets.") + self.embedding_element = ebig - value_size = Vbig.value_size - if value_size != 1: - fdofs = numpy.add.outer(value_size * fdofs, numpy.arange(value_size, dtype=fdofs.dtype)) - self.fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) + if Vbig.value_size == 1: + self.fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF) + else: + self.fises = PETSc.IS().createBlock(Vbig.value_size, fdofs, comm=PETSc.COMM_SELF) - # Dictionaries with the complement space and kernel to compute the Schur complement - self.complement_space = {} + # Dictionary with kernel to compute the Schur complement self.schur_kernel = {} - if Vfacet and use_static_condensation: + if V == Vbig and Vbig.finat_element.formdegree == 0: + # If we are in H(grad), we just pad with zeros on the statically-condensed pattern + self.schur_kernel[V] = SchurComplementPattern + elif Vfacet and use_static_condensation: # If we are in a facet space, we build the Schur complement on its diagonal block - self.complement_space[Vfacet] = FunctionSpace(V.mesh(), restrict_element(ebig, "interior")) - if Vfacet.finat_element.formdegree == 0 and value_size == 1: + if Vfacet.finat_element.formdegree == 0 and Vfacet.value_size == 1: self.schur_kernel[Vfacet] = SchurComplementDiagonal - elif pmat_type.endswith("sbaij"): + elif symmetric: self.schur_kernel[Vfacet] = SchurComplementBlockCholesky else: self.schur_kernel[Vfacet] = SchurComplementBlockQR - elif len(fdofs) and V.finat_element.formdegree == 0: - # If we are in H(grad), we just pad with zeros on the statically-condensed pattern - self.complement_space[V] = FunctionSpace(V.mesh(), restrict_element(V.ufl_element(), "interior")) - self.schur_kernel[V] = SchurComplementKernel - # Create data structures needed for assembly self.lgmaps = {Vsub: Vsub.local_to_global_map([bc for bc in bcs if bc.function_space() == Vsub]) for Vsub in V} self.coefficients, assembly_callables = self.assemble_coefficients(J, fcp) @@ -244,8 +243,6 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): Pmats = {} addv = PETSc.InsertMode.ADD_VALUES - symmetric = pmat_type.endswith("sbaij") - # Store only off-diagonal blocks with more columns than rows to save memory Vsort = sorted(V, key=lambda Vsub: Vsub.dim()) # Loop over all pairs of subspaces @@ -276,6 +273,8 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): P.setSizes(sizes) P.setPreallocationNNZ((d_nnz, o_nnz)) P.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True) + if on_diag: + P.setOption(PETSc.Mat.Option.STRUCTURALLY_SYMMETRIC, True) if ptype.endswith("sbaij"): P.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True) P.setUp() @@ -295,7 +294,6 @@ def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation): Pmat = Pmats[V, V] else: Pmat = PETSc.Mat().createNest([[Pmats[Vrow, Vcol] for Vcol in V] for Vrow in V], comm=self.comm) - 
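        # Hedged, self-contained sketch (toy sizes, serial comm) of how the
        # blocks are glued together: each stored block is an AIJ matrix, the
        # comment above about keeping only the wider off-diagonal block can be
        # honoured with a virtual transpose, and the result is a MATNEST.
        # P00/P01/P10/P11/nest are illustrative names, not module attributes.
        from petsc4py import PETSc
        P00 = PETSc.Mat().createAIJ(((2, 2), (2, 2)), nnz=2, comm=PETSc.COMM_SELF)
        P11 = PETSc.Mat().createAIJ(((3, 3), (3, 3)), nnz=3, comm=PETSc.COMM_SELF)
        P01 = PETSc.Mat().createAIJ(((2, 2), (3, 3)), nnz=3, comm=PETSc.COMM_SELF)
        for block in (P00, P11, P01):
            block.assemble()
        P10 = PETSc.Mat().createTranspose(P01)   # reuse P01's values as its transpose
        nest = PETSc.Mat().createNest([[P00, P01], [P10, P11]], comm=PETSc.COMM_SELF)
        nest.assemble()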
assembly_callables.append(Pmat.assemble) return Pmat, assembly_callables @@ -358,6 +356,8 @@ def assemble_coefficients(self, J, fcp, block_diagonal=True): order coefficients keyed on ``"beta"`` and ``"alpha"``, and a list of assembly callables. """ + coefficients = {} + assembly_callables = [] # Basic idea: take the original bilinear form and # replace the exterior derivatives with arguments in broken(V^{k+1}). # Then, replace the original arguments with arguments in broken(V^k). @@ -427,8 +427,6 @@ def assemble_coefficients(self, J, fcp, block_diagonal=True): mixed_form = ufl.replace(ufl.replace(Jcell, repgrad), repargs) # Return coefficients and assembly callables - coefficients = {} - assembly_callables = [] if block_diagonal and V.shape: from firedrake.assemble import assemble M = assemble(mixed_form, mat_type="matfree", form_compiler_parameters=fcp) @@ -481,42 +479,54 @@ def assemble_reference_tensor(self, V, transpose=False, sort_interior=False): if sort_interior: assert is_interior and not is_facet and not transpose - result = self.assemble_reference_tensor(V, transpose=transpose, sort_interior=False) - # Compute the stiffness matrix on the interior of a cell - A00 = self._element_mass_matrix.PtAP(result) - indptr, indices, _ = A00.getValuesCSR() - degree = numpy.diff(indptr) # Sort DOFs to make A00 block diagonal with blocks of increasing dimension along the diagonal - perm = numpy.array(list(dict.fromkeys(indices)), dtype=indices.dtype) - perm = perm[numpy.argsort(degree[perm], kind='stable')] - A00.destroy() - - perm = PETSc.IS().createGeneral(perm, comm=result.getComm()) - noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) - result = result.createSubMatrix(noperm, perm) - noperm.destroy() - perm.destroy() + result = self.assemble_reference_tensor(V, transpose=transpose, sort_interior=False) + if formdegree != 0: + # Compute the stiffness matrix on the interior of a cell + A00 = self._element_mass_matrix.PtAP(result) + indptr, indices, _ = A00.getValuesCSR() + degree = numpy.diff(indptr) + # Sort by blocks + uniq, u_index = numpy.unique(indices, return_index=True) + perm = uniq[u_index.argsort(kind='stable')] + # Sort by degree + degree = degree[perm] + perm = perm[degree.argsort(kind='stable')] + A00.destroy() + + iscol = PETSc.IS().createGeneral(perm, comm=result.getComm()) + result = get_submat(result, iscol=iscol) + iscol.destroy() return cache.setdefault(key, result) full_key = key[:-4] + (False,) * 4 if is_facet and full_key in cache: - result = cache[full_key] - noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) - result = result.createSubMatrix(noperm, self.fises) - noperm.destroy() + result = get_submat(cache[full_key], iscol=self.fises) return cache.setdefault(key, result) + # Get CG(k) and DG(k-1) 1D elements from V elements = sorted(get_base_elements(fe), key=lambda e: e.formdegree) - ref_el = elements[0].get_reference_element() - eq = FIAT.FDMQuadrature(ref_el, degree) - e0 = elements[0] if elements[0].formdegree == 0 else FIAT.FDMLagrange(ref_el, degree) - e1 = elements[-1] if elements[-1].formdegree == 1 else FIAT.FDMDiscontinuousLagrange(ref_el, degree-1) - if is_interior: + e0, e1 = elements[::len(elements)-1] + e0 = elements[0] if elements[0].formdegree == 0 else None + e1 = elements[-1] if elements[-1].formdegree == 1 else None + if e0 and is_interior: e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") - A00 = 
petsc_sparse(fiat_reference_prolongator(e0, eq), comm=PETSc.COMM_SELF) - A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=PETSc.COMM_SELF) - A11 = petsc_sparse(numpy.eye(e1.space_dimension(), dtype=PETSc.RealType), comm=PETSc.COMM_SELF) + # Get broken(CG(k)) and DG(k-1) 1D elements from the coefficient spaces + Q0 = self.coefficients["beta"].function_space().finat_element.element + elements = sorted(get_base_elements(Q0), key=lambda e: e.formdegree) + q0 = elements[0] if elements[0].formdegree == 0 else None + q1 = elements[-1] + if q1.formdegree != 1: + Q1 = self.coefficients["alpha"].function_space().finat_element.element + q1 = sorted(get_base_elements(Q1), key=lambda e: e.formdegree)[-1] + + # Interpolate V * d(V) -> space(beta) * space(alpha) + comm = PETSc.COMM_SELF + zero = PETSc.Mat() + A00 = petsc_sparse(evaluate_dual(e0, q0), comm=comm) if e0 and q0 else zero + A11 = petsc_sparse(evaluate_dual(e1, q1), comm=comm) if e1 else zero + A10 = petsc_sparse(evaluate_dual(e0, q1, alpha=(1,)), comm=comm) if e0 else zero B_blocks = mass_blocks(tdim, formdegree, A00, A11) A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) result = block_mat(B_blocks + A_blocks, destroy_blocks=True) @@ -533,9 +543,7 @@ def assemble_reference_tensor(self, V, transpose=False, sort_interior=False): if is_facet: cache[full_key] = result - noperm = PETSc.IS().createGeneral(numpy.arange(result.getSize()[0], dtype=PETSc.IntType), comm=result.getComm()) - result = result.createSubMatrix(noperm, self.fises) - noperm.destroy() + result = get_submat(cache[full_key], iscol=self.fises) return cache.setdefault(key, result) @@ -580,7 +588,7 @@ def set_values(self, A, Vrow, Vcol, addv, triu=False): if Vrow == Vcol: schur_kernel = self.schur_kernel.get(Vrow) if schur_kernel is not None: - V0 = self.complement_space[Vrow] + V0 = FunctionSpace(Vrow.mesh(), restrict_element(self.embedding_element, "interior")) C0 = self.assemble_reference_tensor(V0, sort_interior=True) R0 = self.assemble_reference_tensor(V0, sort_interior=True, transpose=True) # Only the facet block updates the coefficients in M @@ -622,36 +630,32 @@ def __init__(self, kernel, Vrow, Vcol, rmap, cmap): row_shape = tuple() if Vrow.value_size == 1 else (Vrow.value_size,) map_rows = (self.map_block_indices, rmap) if row_shape else (rmap.apply,) rows = numpy.empty((m, ), dtype=PETSc.IntType).reshape((-1,) + row_shape) - - self.bc_nodes = None if Vcol == Vrow: cols = rows - map_cols = (lambda *x, result=None: result, ) - # own = Vrow.dof_dset.layout_vec.getLocalSize() - # bc_nodes = numpy.flatnonzero(rmap.indices[:own] < 0).astype(PETSc.IntType) - # if len(bc_nodes) > 0: - # bc_nodes = Vrow.dof_dset.lgmap.apply(bc_nodes, result=bc_nodes) - # self.bc_nodes = bc_nodes[:, None] + map_cols = (lambda *args, result=None: result, ) else: spaces.append(Vcol) col_shape = tuple() if Vcol.value_size == 1 else (Vcol.value_size,) map_cols = (self.map_block_indices, cmap) if col_shape else (cmap.apply, ) cols = numpy.empty((n, ), dtype=PETSc.IntType).reshape((-1,) + col_shape) - spaces.extend(c.function_space() for c in kernel.coefficients) - self.indices = tuple(numpy.empty((V.finat_element.space_dimension(),), dtype=PETSc.IntType) for V in spaces) - self.map_rows = partial(*map_rows, self.indices[spaces.index(Vrow)], result=rows) - self.map_cols = partial(*map_cols, self.indices[spaces.index(Vcol)], result=cols) - self.kernel_args = self.indices[-len(kernel.coefficients):] integral_type = kernel.integral_type - if integral_type == "cell": + if 
integral_type in ["cell", "interior_facet_horiz"]: get_map = operator.methodcaller("cell_node_map") - elif integral_type == "interior_facet": + elif integral_type in ["interior_facet", "interior_facet_vert"]: get_map = operator.methodcaller("interior_facet_node_map") else: raise NotImplementedError("Only for cell or interior facet integrals") self.node_maps = tuple(map(get_map, spaces)) + + ncell = 2 if integral_type.startswith("interior_facet") else 1 + self.indices = tuple(numpy.empty((V.finat_element.space_dimension() * ncell,), dtype=PETSc.IntType) for V in spaces) + self.map_rows = partial(*map_rows, self.indices[spaces.index(Vrow)], result=rows) + self.map_cols = partial(*map_cols, self.indices[spaces.index(Vcol)], result=cols) + self.kernel_args = self.indices[-len(kernel.coefficients):] + self.set_indices = self.copy_indices + node_map = self.node_maps[0] self.nel = node_map.values.shape[0] if node_map.offset is None: @@ -659,18 +663,27 @@ def __init__(self, kernel, Vrow, Vcol, rmap, cmap): else: layers = node_map.iterset.layers_array layers = layers[:, 1]-layers[:, 0]-1 + if integral_type.endswith("horiz"): + layers -= 1 + self.set_indices = self.copy_indices_horiz if layers.shape[0] != self.nel: layers = numpy.repeat(layers, self.nel) self.layers = layers def map_block_indices(self, lgmap, indices, result=None): - bsize = result.shape[1] + bsize = result.shape[-1] numpy.copyto(result[:, 0], indices) result[:, 0] *= bsize numpy.add.outer(result[:, 0], numpy.arange(1, bsize, dtype=indices.dtype), out=result[:, 1:]) return lgmap.apply(result, result=result) - def set_indices(self, e): + def copy_indices_horiz(self, e): + for index, node_map in zip(self.indices, self.node_maps): + index = index.reshape((2, -1)) + numpy.copyto(index, node_map.values_with_halo[e]) + index[1] += node_map.offset + + def copy_indices(self, e): for index, node_map in zip(self.indices, self.node_maps): numpy.copyto(index, node_map.values_with_halo[e]) @@ -683,10 +696,6 @@ def assemble(self, A, addv=None, triu=False): kernel = lambda *args, result=None: result else: kernel = self.kernel - triu = False - if self.bc_nodes is not None: - vals = numpy.ones(self.bc_nodes.shape, dtype=PETSc.RealType) - A.setValuesRCV(self.bc_nodes, self.bc_nodes, vals, addv) result = self.kernel.result insert = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu) @@ -736,9 +745,8 @@ def __init__(self, A, B, C, *coefficients): self.data = numpy.array([]) self.update = lambda *args: args else: - V = coefficients[0].function_space() dshape = (-1, ) + coefficients[0].dat.data_ro.shape[1:] - if V.value_size == 1: + if numpy.prod(dshape[1:]) == 1: self.work = B.getDiagonal() self.data = self.work.array_w.reshape(dshape) self.update = partial(B.setDiagonal, self.work) @@ -773,7 +781,7 @@ class SchurComplementKernel(ElementKernel): """ def __init__(self, *kernels): self.children = kernels - self.submats = [k.result for k in self.children] + self.submats = [k.result for k in kernels] # Create dict of slices with the extents of the diagonal blocks A00 = self.submats[-1] @@ -785,6 +793,8 @@ def __init__(self, *kernels): self.slices[k] = slice(istart, istart + k * kdofs) istart += k * kdofs + self.blocks = sorted(degree for degree in self.slices if degree > 1) + self.work = [None for _ in range(2)] coefficients = [] for k in self.children: @@ -805,6 +815,18 @@ def destroy(self): if isinstance(obj, PETSc.Object): obj.destroy() + @PETSc.Log.EventDecorator("FDMCondense") + def condense(self, result=None): + return result + + +class 
SchurComplementPattern(SchurComplementKernel): + + def __call__(self, *args, result=None): + k = self.children[0] + k(*args, result=k.result) + return self.condense(result=result) + @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): """By default pad with zeros the statically condensed pattern""" @@ -833,17 +855,21 @@ def condense(self, result=None): class SchurComplementBlockCholesky(SchurComplementKernel): + def __init__(self, K11, K10, K01, K00): + # assume that K10 = K01^T + super().__init__(K11, K01, K00) + @PETSc.Log.EventDecorator("FDMCondense") def condense(self, result=None): structure = PETSc.Mat.Structure.SUBSET if result else None - A11, A10, A01, A00 = self.submats + A11, A01, A00 = self.submats indptr, indices, R = A00.getValuesCSR() zlice = self.slices[1] numpy.sqrt(R[zlice], out=R[zlice]) numpy.reciprocal(R[zlice], out=R[zlice]) flops = 2 * (zlice.stop - zlice.start) - for k in sorted(degree for degree in self.slices if degree > 1): + for k in self.blocks: Rk = R[self.slices[k]] A = Rk.reshape((-1, k, k)) rinv = numpy.linalg.inv(numpy.linalg.cholesky(A)) @@ -853,10 +879,9 @@ def condense(self, result=None): PETSc.Log.logFlops(flops) A00.setValuesCSR(indptr, indices, R) A00.assemble() - self.work[0] = A10.matTransposeMult(A00, result=self.work[0]) - A00.scale(-1.0) - result = self.work[0].matMatMult(A00, A01, result=result) - result.axpy(1.0, A11, structure=structure) + self.work[0] = A00.matMult(A01, result=self.work[0]) + result = self.work[0].transposeMatMult(self.work[0], result=result) + result.aypx(-1.0, A11, structure=structure) return result @@ -872,7 +897,7 @@ def condense(self, result=None): zlice = self.slices[1] numpy.reciprocal(R[zlice], out=R[zlice]) flops = zlice.stop - zlice.start - for k in sorted(degree for degree in self.slices if degree > 1): + for k in self.blocks: zlice = self.slices[k] A = R[zlice].reshape((-1, k, k)) q, r = numpy.linalg.qr(A, mode="complete") @@ -906,7 +931,7 @@ def condense(self, result=None): dslice = self.slices[1] numpy.sign(D.array_r[dslice], out=U[dslice]) flops = dslice.stop - dslice.start - for k in sorted(degree for degree in self.slices if degree > 1): + for k in self.blocks: bslice = self.slices[k] A = U[bslice].reshape((-1, k, k)) u, s, v = numpy.linalg.svd(A, full_matrices=False) @@ -943,7 +968,7 @@ def condense(self, result=None): zlice = self.slices[1] numpy.reciprocal(R[zlice], out=R[zlice]) flops = zlice.stop - zlice.start - for k in sorted(degree for degree in self.slices if degree > 1): + for k in self.blocks: Rk = R[self.slices[k]] A = Rk.reshape((-1, k, k)) rinv = numpy.linalg.inv(A) @@ -1070,6 +1095,25 @@ def kron3(A, B, C, scale=None): return result +def get_submat(A, isrow=None, iscol=None): + """Return the sub matrix A[isrow, iscol]""" + needs_rows = isrow is None + needs_cols = iscol is None + if needs_rows and needs_cols: + return A + size = A.getSize() + if needs_rows: + isrow = PETSc.IS().createStride(size[0], step=1, comm=A.getComm()) + if needs_cols: + iscol = PETSc.IS().createStride(size[1], step=1, comm=A.getComm()) + submat = A.createSubMatrix(isrow, iscol) + if needs_rows: + isrow.destroy() + if needs_cols: + iscol.destroy() + return submat + + def block_mat(A_blocks, destroy_blocks=False): """Return a concrete Mat corresponding to a block matrix given as a list of lists.
Optionally, destroys the input Mats if a new Mat is created.""" @@ -1113,8 +1157,7 @@ def mass_blocks(tdim, formdegree, B00, B11): if n == 1: return [B_diag] else: - zero = PETSc.Mat().createAIJ(B_diag[0].getSize(), nnz=(0, 0), comm=B_diag[0].getComm()) - zero.assemble() + zero = PETSc.Mat() return [[B_diag[i] if i == j else zero for j in range(n)] for i in range(n)] @@ -1139,9 +1182,7 @@ def diff_blocks(tdim, formdegree, A00, A11, A10): if formdegree == 0: A_blocks = [[kron3(A00, A00, A10)], [kron3(A00, A10, A00)], [kron3(A10, A00, A00)]] elif formdegree == 1: - size = tuple(A11.getSize()[k] * A10.getSize()[k] * A00.getSize()[k] for k in range(2)) - zero = PETSc.Mat().createAIJ(size, nnz=(0, 0), comm=A10.getComm()) - zero.assemble() + zero = PETSc.Mat() A_blocks = [[kron3(A00, A10, A11, scale=-1), kron3(A00, A11, A10), zero], [kron3(A10, A00, A11, scale=-1), zero, kron3(A11, A00, A10)], [zero, kron3(A10, A11, A00), kron3(A11, A10, A00, scale=-1)]] @@ -1162,15 +1203,20 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None): if ef.formdegree - ec.formdegree != 1: raise ValueError("Expecting Vf = d(Vc)") - elements = list(set(get_base_elements(ec) + get_base_elements(ef))) - elements = sorted(elements, key=lambda e: e.formdegree) - e0, e1 = elements[::len(elements)-1] + elements = sorted(get_base_elements(ec), key=lambda e: e.formdegree) + c0, c1 = elements[::len(elements)-1] + elements = sorted(get_base_elements(ef), key=lambda e: e.formdegree) + f0, f1 = elements[::len(elements)-1] + if f0.formdegree != 0: + f0 = None + if c1.formdegree != 1: + c1 = None - degree = e0.degree() tdim = Vc.mesh().topological_dimension() - A00 = petsc_sparse(numpy.eye(degree+1, dtype=PETSc.RealType), comm=PETSc.COMM_SELF) - A10 = petsc_sparse(fiat_reference_prolongator(e0, e1, derivative=True), comm=PETSc.COMM_SELF) - A11 = petsc_sparse(numpy.eye(degree, dtype=PETSc.RealType), comm=PETSc.COMM_SELF) + zero = PETSc.Mat() + A00 = petsc_sparse(evaluate_dual(c0, f0), comm=PETSc.COMM_SELF) if f0 else zero + A11 = petsc_sparse(evaluate_dual(c1, f1), comm=PETSc.COMM_SELF) if c1 else zero + A10 = petsc_sparse(evaluate_dual(c0, f1, alpha=(1,)), comm=PETSc.COMM_SELF) Dhat = block_mat(diff_blocks(tdim, ec.formdegree, A00, A11, A10), destroy_blocks=True) A00.destroy() A10.destroy() @@ -1404,9 +1450,9 @@ def cell_to_global(lgmap, cell_to_local, cell_index, result=None): bce = bcflags.dat.data_ro_with_halos[index_bc(e)] > 1E-8 # get coefficients on this cell if Gq is not None: - numpy.sum(Gq.dat.data_ro[je], axis=0, out=ae) + ae[:] = numpy.sum(Gq.dat.data_ro[je], axis=0) if Bq is not None: - numpy.sum(Bq.dat.data_ro[je], axis=0, out=be) + be[:] = numpy.sum(Bq.dat.data_ro[je], axis=0) rindices = get_rindices(e, result=rindices) rows = numpy.reshape(rindices, (-1, bsize)) @@ -1662,10 +1708,10 @@ def assemble_coefficients(self, J, fcp): ds_ext = ufl.Measure(itype, domain=mesh, subdomain_id=it.subdomain_id(), metadata=md) forms.append(ufl.inner(test, beta)*ds_ext) + tensor = coefficients.setdefault("bcflags", Function(Q)) if len(forms): form = sum(forms) if len(form.arguments()) == 1: - tensor = coefficients.setdefault("bcflags", Function(Q)) assembly_callables.append(OneFormAssembler(form, tensor=tensor, form_compiler_parameters=fcp).assemble) # set arbitrary non-zero coefficients for preallocation diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 8719c76ece..e497d32340 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -553,7 +553,8 @@ 
def expand_element(ele): return ele -def evaluate_dual(source, target, alpha=None): +@lru_cache(maxsize=10) +def evaluate_dual(source, target, alpha=tuple()): """Evaluate the action of a set of dual functionals of the target element on the (derivative of order alpha of the) basis functions of the source element.""" @@ -561,7 +562,7 @@ def evaluate_dual(source, target, alpha=None): dual = target.get_dual_set() A = dual.to_riesz(primal) B = numpy.transpose(primal.get_coeffs()) - if alpha is not None: + if sum(alpha): dmats = primal.get_dmats() for i in range(len(alpha)): for j in range(alpha[i]): @@ -577,14 +578,7 @@ def compare_element(e1, e2): if e1.space_dimension() != e2.space_dimension(): return False B = evaluate_dual(e1, e2) - numpy.fill_diagonal(B, numpy.diagonal(B)-1.0) - return numpy.allclose(B, 0.0, rtol=1E-14, atol=1E-14) - - -@lru_cache(maxsize=10) -def fiat_reference_prolongator(celem, felem, derivative=False): - alpha = (1,) if derivative else None - return evaluate_dual(celem, felem, alpha=alpha) + return numpy.allclose(B, numpy.eye(B.shape[0]), rtol=1E-14, atol=1E-14) @lru_cache(maxsize=10) @@ -920,7 +914,7 @@ def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch): fshapes.append((nscal,) + tuple(fshape)) cshapes.append((nscal,) + tuple(cshape)) - J = [identity_filter(fiat_reference_prolongator(ce, fe)).T for ce, fe in zip(celem, felem)] + J = [identity_filter(evaluate_dual(ce, fe)).T for ce, fe in zip(celem, felem)] if any(Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J): prolong_code.append(f""" for({IntType_c} i=0; i<{nscal*numpy.prod(fshape)}; i++) {t_out}[i+{fskip}] = 0.0E0; From 65690d062364e116dafafba0aff4a3c6baf3e3c6 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Fri, 21 Apr 2023 12:01:59 +0100 Subject: [PATCH 66/75] Fix caching --- firedrake/preconditioners/fdm.py | 99 ++++++++++++++++---------------- firedrake/preconditioners/pmg.py | 76 +++++++++++++++--------- 2 files changed, 98 insertions(+), 77 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index c7d031079f..66a9696628 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -494,57 +494,53 @@ def assemble_reference_tensor(self, V, transpose=False, sort_interior=False): perm = perm[degree.argsort(kind='stable')] A00.destroy() - iscol = PETSc.IS().createGeneral(perm, comm=result.getComm()) - result = get_submat(result, iscol=iscol) - iscol.destroy() + isperm = PETSc.IS().createGeneral(perm, comm=result.getComm()) + result = get_submat(result, iscol=isperm, permute=True) + isperm.destroy() return cache.setdefault(key, result) - full_key = key[:-4] + (False,) * 4 - if is_facet and full_key in cache: - result = get_submat(cache[full_key], iscol=self.fises) - return cache.setdefault(key, result) - - # Get CG(k) and DG(k-1) 1D elements from V - elements = sorted(get_base_elements(fe), key=lambda e: e.formdegree) - e0, e1 = elements[::len(elements)-1] - e0 = elements[0] if elements[0].formdegree == 0 else None - e1 = elements[-1] if elements[-1].formdegree == 1 else None - if e0 and is_interior: - e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") - - # Get broken(CG(k)) and DG(k-1) 1D elements from the coefficient spaces - Q0 = self.coefficients["beta"].function_space().finat_element.element - elements = sorted(get_base_elements(Q0), key=lambda e: e.formdegree) - q0 = elements[0] if elements[0].formdegree == 0 else None - q1 = elements[-1] - if q1.formdegree != 1: - Q1 = 
self.coefficients["alpha"].function_space().finat_element.element - q1 = sorted(get_base_elements(Q1), key=lambda e: e.formdegree)[-1] - - # Interpolate V * d(V) -> space(beta) * space(alpha) - comm = PETSc.COMM_SELF - zero = PETSc.Mat() - A00 = petsc_sparse(evaluate_dual(e0, q0), comm=comm) if e0 and q0 else zero - A11 = petsc_sparse(evaluate_dual(e1, q1), comm=comm) if e1 else zero - A10 = petsc_sparse(evaluate_dual(e0, q1, alpha=(1,)), comm=comm) if e0 else zero - B_blocks = mass_blocks(tdim, formdegree, A00, A11) - A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) - result = block_mat(B_blocks + A_blocks, destroy_blocks=True) - A00.destroy() - A10.destroy() - A11.destroy() - - if value_size != 1: - eye = petsc_sparse(numpy.eye(value_size), comm=result.getComm()) - temp = result - result = temp.kron(eye) - temp.destroy() - eye.destroy() + short_key = key[:-3] + (False,) * 3 + try: + result = cache[short_key] + except KeyError: + # Get CG(k) and DG(k-1) 1D elements from V + elements = sorted(get_base_elements(fe), key=lambda e: e.formdegree) + e0 = elements[0] if elements[0].formdegree == 0 else None + e1 = elements[-1] if elements[-1].formdegree == 1 else None + if e0 and is_interior: + e0 = FIAT.RestrictedElement(e0, restriction_domain="interior") + + # Get broken(CG(k)) and DG(k-1) 1D elements from the coefficient spaces + Q0 = self.coefficients["beta"].function_space().finat_element.element + elements = sorted(get_base_elements(Q0), key=lambda e: e.formdegree) + q0 = elements[0] if elements[0].formdegree == 0 else None + q1 = elements[-1] + if q1.formdegree != 1: + Q1 = self.coefficients["alpha"].function_space().finat_element.element + q1 = sorted(get_base_elements(Q1), key=lambda e: e.formdegree)[-1] + + # Interpolate V * d(V) -> space(beta) * space(alpha) + comm = PETSc.COMM_SELF + zero = PETSc.Mat() + A00 = petsc_sparse(evaluate_dual(e0, q0), comm=comm) if e0 and q0 else zero + A11 = petsc_sparse(evaluate_dual(e1, q1), comm=comm) if e1 else zero + A10 = petsc_sparse(evaluate_dual(e0, q1, alpha=(1,)), comm=comm) if e0 else zero + B_blocks = mass_blocks(tdim, formdegree, A00, A11) + A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10) + result = block_mat(B_blocks + A_blocks, destroy_blocks=True) + A00.destroy() + A10.destroy() + A11.destroy() + if value_size != 1: + eye = petsc_sparse(numpy.eye(value_size), comm=result.getComm()) + temp = result + result = temp.kron(eye) + temp.destroy() + eye.destroy() if is_facet: - cache[full_key] = result - result = get_submat(cache[full_key], iscol=self.fises) - + cache[short_key] = result + result = get_submat(result, iscol=self.fises) return cache.setdefault(key, result) @cached_property @@ -1095,7 +1091,7 @@ def kron3(A, B, C, scale=None): return result -def get_submat(A, isrow=None, iscol=None): +def get_submat(A, isrow=None, iscol=None, permute=False): """Return the sub matrix A[isrow, iscol]""" needs_rows = isrow is None needs_cols = iscol is None @@ -1106,7 +1102,10 @@ def get_submat(A, isrow=None, iscol=None): isrow = PETSc.IS().createStride(size[0], step=1, comm=A.getComm()) if needs_cols: iscol = PETSc.IS().createStride(size[1], step=1, comm=A.getComm()) - submat = A.createSubMatrix(isrow, iscol) + if permute: + submat = A.permute(isrow, iscol) + else: + submat = A.createSubMatrix(isrow, iscol) if needs_rows: isrow.destroy() if needs_cols: @@ -1335,7 +1334,7 @@ class PoissonFDMPC(FDMPC): def assemble_reference_tensor(self, V): try: - _, line_elements, shifts = get_permutation_to_line_elements(V.finat_element) + _, 
line_elements, shifts = get_permutation_to_line_elements(V) except ValueError: raise ValueError("FDMPC does not support the element %s" % V.ufl_element()) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index e497d32340..b441ebde16 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -1,4 +1,4 @@ -from functools import partial, lru_cache +from functools import partial from itertools import chain from firedrake.dmhooks import (attach_hooks, get_appctx, push_appctx, pop_appctx, add_hook, get_parent, push_parent, pop_parent, @@ -12,9 +12,11 @@ from tsfc.finatinterface import create_element from tsfc import compile_expression_dual_evaluation from pyop2 import op2 +from pyop2.caching import cached import firedrake import finat +import FIAT import ufl import loopy import numpy @@ -553,7 +555,35 @@ def expand_element(ele): return ele -@lru_cache(maxsize=10) +def hash_fiat_element(element): + """FIAT elements are not hashable, + this is not the best way to create a hash""" + restriction = None + e = element + if isinstance(e, FIAT.DiscontinuousElement): + # this hash does not care about inter-element continuity + e = e._element + if isinstance(e, FIAT.RestrictedElement): + restriction = tuple(e._indices) + e = e._element + if len(restriction) == e.space_dimension(): + restriction = None + family = e.__class__.__name__ + degree = e.order + return (family, element.ref_el, degree, restriction) + + +def generate_key_evaluate_dual(source, target, alpha=tuple()): + return hash_fiat_element(source) + hash_fiat_element(target) + (alpha,) + + +def get_readonly_view(arr): + result = arr.view() + result.flags.writeable = False + return result + + +@cached({}, key=generate_key_evaluate_dual) def evaluate_dual(source, target, alpha=tuple()): """Evaluate the action of a set of dual functionals of the target element on the (derivative of order alpha of the) basis functions of the source @@ -562,14 +592,15 @@ def evaluate_dual(source, target, alpha=tuple()): dual = target.get_dual_set() A = dual.to_riesz(primal) B = numpy.transpose(primal.get_coeffs()) - if sum(alpha): + if sum(alpha) != 0: dmats = primal.get_dmats() for i in range(len(alpha)): for j in range(alpha[i]): B = numpy.dot(dmats[i], B) - return numpy.dot(A, B) + return get_readonly_view(numpy.dot(A, B)) +@cached({}, key=generate_key_evaluate_dual) def compare_element(e1, e2): """Numerically compare two :class:`FIAT.elements`. Equality is satisfied if e2.dual_basis(e1.primal_basis) == identity.""" @@ -581,9 +612,9 @@ def compare_element(e1, e2): return numpy.allclose(B, numpy.eye(B.shape[0]), rtol=1E-14, atol=1E-14) -@lru_cache(maxsize=10) +@cached({}, key=lambda V: V.ufl_element()) @PETSc.Log.EventDecorator("GetLineElements") -def get_permutation_to_line_elements(finat_element): +def get_permutation_to_line_elements(V): """ Find DOF permutation to factor out the EnrichedElement expansion into common TensorProductElements. This routine exposes structure to e.g vectorize @@ -592,35 +623,26 @@ def get_permutation_to_line_elements(finat_element): This is temporary while we wait for dual evaluation of :class:`finat.EnrichedElement`. 
+ :arg V: a :class:`.FunctionSpace` + :returns: a 3-tuple of the DOF permutation, the unique terms in expansion as a list of tuples of :class:`FIAT.FiniteElements`, and the cyclic permutations of the axes to form the element given by their shifts in list of `int` tuples """ + finat_element = V.finat_element expansion = expand_element(finat_element) if expansion.space_dimension() != finat_element.space_dimension(): - raise ValueError("Failed to decompose %s into tensor products" % finat_element) + raise ValueError("Failed to decompose %s into tensor products" % V.ufl_element()) - unique_factors = set() line_elements = [] terms = expansion.elements if hasattr(expansion, "elements") else [expansion] for term in terms: factors = term.factors if hasattr(term, "factors") else (term,) - fiat_factors = [e.fiat_equivalent for e in reversed(factors)] + fiat_factors = tuple(e.fiat_equivalent for e in reversed(factors)) if any(e.get_reference_element().get_spatial_dimension() != 1 for e in fiat_factors): - raise ValueError("Failed to decompose %s into line elements" % fiat_factors) - - # use the same FIAT element if it appears multiple times in the expansion - for i in range(len(fiat_factors)): - n = fiat_factors[i] - for f in unique_factors: - if compare_element(n, f): - n = f - break - if n is fiat_factors[i]: - unique_factors.add(n) - fiat_factors[i] = n - line_elements.append(tuple(fiat_factors)) + raise ValueError("Failed to decompose %s into line elements" % V.ufl_element()) + line_elements.append(fiat_factors) shapes = [tuple(e.space_dimension() for e in factors) for factors in line_elements] sizes = list(map(numpy.prod, shapes)) @@ -660,7 +682,7 @@ def get_permutation_to_line_elements(finat_element): shifts.append(axes_shifts) - dof_perm = numpy.concatenate(dof_perm) + dof_perm = get_readonly_view(numpy.concatenate(dof_perm)) return dof_perm, unique_line_elements, shifts @@ -669,7 +691,7 @@ def get_permuted_map(V): Return a PermutedMap with the same tensor product shape for every component of H(div) or H(curl) tensor product elements """ - indices, _, _ = get_permutation_to_line_elements(V.finat_element) + indices, _, _ = get_permutation_to_line_elements(V) if numpy.all(indices[:-1] < indices[1:]): return V.cell_node_map() return op2.PermutedMap(V.cell_node_map(), indices) @@ -832,8 +854,8 @@ def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch): operator_decl = [] prolong_code = [] restrict_code = [] - _, celems, cshifts = get_permutation_to_line_elements(Vc.finat_element) - _, felems, fshifts = get_permutation_to_line_elements(Vf.finat_element) + _, celems, cshifts = get_permutation_to_line_elements(Vc) + _, felems, fshifts = get_permutation_to_line_elements(Vf) shifts = fshifts in_place = False @@ -1065,7 +1087,7 @@ def make_mapping_code(Q, cmapping, fmapping, t_in, t_out): def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name): - _, _, shifts = get_permutation_to_line_elements(V.finat_element) + _, _, shifts = get_permutation_to_line_elements(V) shift = shifts[0] if shift != (0,): ndof = numpy.prod(vshape) From 8954fdf7ed2c8aa097737fcdc96cc237d819436e Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Fri, 21 Apr 2023 16:14:59 +0100 Subject: [PATCH 67/75] lint --- firedrake/preconditioners/fdm.py | 30 ++++++++++++------------------ firedrake/preconditioners/pmg.py | 4 ++-- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 66a9696628..f47861a7a7 100644 --- 
a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1,5 +1,5 @@ from functools import partial -from itertools import product +from itertools import chain, product from firedrake.petsc import PETSc from firedrake.preconditioners.base import PCBase from firedrake.preconditioners.patch import bcdofs @@ -126,7 +126,7 @@ def initialize(self, pc): else: # Reconstruct Jacobian and bcs with variant element V_fdm = FunctionSpace(V.mesh(), e_fdm) - J_fdm = J(*[t.reconstruct(function_space=V_fdm) for t in J.arguments()], coefficients={}) + J_fdm = J(*(t.reconstruct(function_space=V_fdm) for t in J.arguments()), coefficients={}) bcs_fdm = [] for bc in bcs: W = V_fdm @@ -1053,16 +1053,10 @@ def wrapper(A, B, rows, cols, addv): def is_restricted(finat_element): """Determine if an element is a restriction onto interior or facets""" - is_interior = True - is_facet = True - cell_dim = finat_element.cell.get_dimension() - entity_dofs = finat_element.entity_dofs() - for dim in sorted(entity_dofs): - if any(len(entity_dofs[dim][entity]) > 0 for entity in entity_dofs[dim]): - if dim == cell_dim: - is_facet = False - else: - is_interior = False + tdim = finat_element.cell.get_dimension() + idofs = len(finat_element.entity_dofs()[tdim][0]) + is_interior = idofs == finat_element.space_dimension() + is_facet = idofs == 0 return is_interior, is_facet @@ -1294,9 +1288,9 @@ def unrestrict_element(ele): def get_base_elements(e): if isinstance(e, finat.EnrichedElement): - return sum(list(map(get_base_elements, e.elements)), []) + return list(chain.from_iterable(map(get_base_elements, e.elements))) elif isinstance(e, finat.TensorProductElement): - return sum(list(map(get_base_elements, e.factors)), []) + return list(chain.from_iterable(map(get_base_elements, e.factors))) elif isinstance(e, finat.FlattenedDimensions): return get_base_elements(e.product) elif isinstance(e, (finat.HCurlElement, finat.HDivElement)): @@ -1645,8 +1639,8 @@ def assemble_coefficients(self, J, fcp): replace_val = {t: ufl.dot(dummy_Piola, s) for t, s in zip(args_J, ref_val)} else: replace_val = {t: s for t, s in zip(args_J, ref_val)} - beta = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_val), - ref_val[0]), ref_val[1]) for i in integrals_J])) + beta = expand_derivatives(sum(ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_val), + ref_val[0]), ref_val[1]) for i in integrals_J)) if Piola: beta = ufl.replace(beta, {dummy_Piola: Piola}) # assemble zero-th order coefficient @@ -1671,8 +1665,8 @@ def assemble_coefficients(self, J, fcp): ifacet_inner = lambda v, u: ((ufl.inner(v('+'), u('+')) + ufl.inner(v('-'), u('-')))/area)*dS_int replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)} - alpha = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad), - ref_grad[0]), ref_grad[1]) for i in integrals_J])) + alpha = expand_derivatives(sum(ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad), + ref_grad[0]), ref_grad[1]) for i in integrals_J)) G = alpha G = ufl.as_tensor([[[G[i, k, j, k] for i in range(G.ufl_shape[0])] for j in range(G.ufl_shape[2])] for k in range(G.ufl_shape[3])]) G = G * abs(ufl.JacobianDeterminant(mesh)) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index b441ebde16..e13f97b416 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -945,7 +945,7 @@ def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch): for({IntType_c} i=0; 
i<{nscal*numpy.prod(cshape)}; i++) {t_in}[i+{cskip}] = 0.0E0; """) else: - Jsize = numpy.cumsum([Jlen]+[Jk.size for Jk in J]) + Jsize = numpy.cumsum([Jlen] + [Jk.size for Jk in J]) Jptrs = ["%s+%d" % (mat_name, Jsize[k]) if J[k].size else "NULL" for k in range(len(J))] Jmats.extend(J) Jlen = Jsize[-1] @@ -987,7 +987,7 @@ def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch): cskip += nscal*numpy.prod(cshape) # Pass the 1D interpolators as a hexadecimal string - Jdata = ", ".join(map(float.hex, chain(*[Jk.flat for Jk in Jmats]))) + Jdata = ", ".join(map(float.hex, chain.from_iterable(Jk.flat for Jk in Jmats))) operator_decl.append(f""" PetscScalar {mat_name}[{Jlen}] = {{ {Jdata} }}; """) From 56e2f8b601d52bc4d2d40c8343b830ca2beaa363 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 26 Apr 2023 15:54:15 +0100 Subject: [PATCH 68/75] do not create rscale Vec --- firedrake/preconditioners/pmg.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index e13f97b416..e16040302d 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -362,9 +362,7 @@ def create_transfer(self, mat_type, cctx, fctx, cbcs, fbcs): def create_interpolation(self, dmc, dmf): prefix = dmc.getOptionsPrefix() mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree") - interpolate = self.create_transfer(mat_type, get_appctx(dmc), get_appctx(dmf), True, False) - rscale = interpolate.createVecRight() # only used as a workaround in the creation of coarse vecs - return interpolate, rscale + return self.create_transfer(mat_type, get_appctx(dmc), get_appctx(dmf), True, False), None def create_injection(self, dmc, dmf): prefix = dmc.getOptionsPrefix() From 649ceed1ad0dbddc0977a2a2813bd001dabcd141 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 26 Apr 2023 16:38:01 +0100 Subject: [PATCH 69/75] do not change venv name --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ac716d2358..73ed727455 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,7 +48,7 @@ jobs: - name: Build Firedrake run: | cd .. - ./firedrake/scripts/firedrake-install $COMPLEX --venv-name build --tinyasm --disable-ssh --minimal-petsc --slepc --documentation-dependencies --install thetis --install gusto --install icepack --install irksome --install femlium --no-package-manager --package-branch tsfc pbrubeck/fdm-discontinuous || (cat firedrake-install.log && /bin/false) + ./firedrake/scripts/firedrake-install $COMPLEX --venv-name firedrake_venv --tinyasm --netgen --disable-ssh --minimal-petsc --slepc --documentation-dependencies --install thetis --install gusto --install icepack --install irksome --install femlium --no-package-manager --package-branch tsfc pbrubeck/fdm-discontinuous || (cat firedrake-install.log && /bin/false) - name: Install test dependencies run: | . 
../firedrake_venv/bin/activate From 0c3f5dd76350515cc8a973a22a36361856e2bc79 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 26 Apr 2023 16:39:39 +0100 Subject: [PATCH 70/75] define H(d) --- firedrake/preconditioners/fdm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index f47861a7a7..706ab73492 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -57,9 +57,9 @@ class FDMPC(PCBase): """ A preconditioner for tensor-product elements that changes the shape - functions so that the H(d) Riesz map is sparse on Cartesian cells, - and assembles a global sparse matrix on which other preconditioners, - such as `ASMStarPC`, can be applied. + functions so that the H(d) (d in {grad, curl, div}) Riesz map is sparse on + Cartesian cells, and assembles a global sparse matrix on which other + preconditioners, such as `ASMStarPC`, can be applied. Here we assume that the volume integrals in the Jacobian can be expressed as: From 35ba3e629a4c6b018be0bb4dfb73736e796469e9 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Sun, 30 Apr 2023 12:05:44 +0100 Subject: [PATCH 71/75] use weakref for coarsening and transfer operators --- firedrake/preconditioners/pmg.py | 79 ++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index e16040302d..6769dfaa22 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -22,6 +22,7 @@ import numpy import os import tempfile +import weakref __all__ = ("PMGPC", "PMGSNES") @@ -56,7 +57,8 @@ class PMGBase(PCSNESBase): """ _prefix = "pmg_" - _cache = {} + _coarsen_cache = weakref.WeakKeyDictionary() + _transfer_cache = weakref.WeakKeyDictionary() def coarsen_element(self, ele): """ @@ -277,27 +279,14 @@ def inject_state(): interpolate = None if fctx._nullspace or fctx._nullspace_T or fctx._near_nullspace: interpolate, _ = cdm.createInterpolation(fdm) - cctx._nullspace = self.coarsen_nullspace(cV, interpolate, fctx._nullspace) - cctx._nullspace_T = self.coarsen_nullspace(cV, interpolate, fctx._nullspace_T) - cctx._near_nullspace = self.coarsen_nullspace(cV, interpolate, fctx._near_nullspace) + cctx._nullspace = self.coarsen_nullspace(fctx._nullspace, cV, interpolate) + cctx._nullspace_T = self.coarsen_nullspace(fctx._nullspace_T, cV, interpolate) + cctx._near_nullspace = self.coarsen_nullspace(fctx._near_nullspace, cV, interpolate) cctx.set_nullspace(cctx._nullspace, cV._ises, transpose=False, near=False) cctx.set_nullspace(cctx._nullspace_T, cV._ises, transpose=True, near=False) cctx.set_nullspace(cctx._near_nullspace, cV._ises, transpose=False, near=True) return cdm - def coarsen_bcs(self, fbcs, cV): - cbcs = [] - for bc in fbcs: - cV_ = cV - for index in bc._indices: - cV_ = cV_.sub(index) - cbc_value = self.coarsen_bc_value(bc, cV_) - if isinstance(bc, firedrake.DirichletBC): - cbcs.append(bc.reconstruct(V=cV_, g=cbc_value)) - else: - raise NotImplementedError("Unsupported BC type, please get in touch if you need this") - return cbcs - def coarsen_quadrature(self, metadata, fdeg, cdeg): """Coarsen the quadrature degree in a dictionary preserving the ratio of quadrature nodes to interpolation nodes (qdeg+1)//(fdeg+1).""" @@ -308,28 +297,50 @@ def coarsen_quadrature(self, metadata, fdeg, cdeg): except (KeyError, TypeError): return metadata - def coarsen_nullspace(self, coarse_V, interpolate, fine_nullspace): 
- """Coarsen a nullspace or retrieve it from class cache""" - cache = self._cache.setdefault("nullspace", {}) - key = (coarse_V.ufl_element(), fine_nullspace) + def coarsen_bcs(self, fbcs, cV): + """Coarsen a list of bcs""" + cbcs = [] + for bc in fbcs: + cache = self._coarsen_cache.setdefault(bc, {}) + key = (cV.ufl_element(), self.is_snes) + try: + coarse_bc = cache[key] + except KeyError: + cV_ = cV + for index in bc._indices: + cV_ = cV_.sub(index) + cbc_value = self.coarsen_bc_value(bc, cV_) + if isinstance(bc, firedrake.DirichletBC): + coarse_bc = cache.setdefault(key, bc.reconstruct(V=cV_, g=cbc_value)) + else: + raise NotImplementedError("Unsupported BC type, please get in touch if you need this") + cbcs.append(coarse_bc) + return cbcs + + def coarsen_nullspace(self, fine_nullspace, cV, interpolate): + """Coarsen a nullspace""" + if fine_nullspace is None: + return fine_nullspace + cache = self._coarsen_cache.setdefault(fine_nullspace, {}) + key = cV.ufl_element() try: return cache[key] except KeyError: if isinstance(fine_nullspace, MixedVectorSpaceBasis): if interpolate.getType() == "python": interpolate = interpolate.getPythonContext() - submats = [interpolate.getNestSubMatrix(i, i) for i in range(len(coarse_V))] + submats = [interpolate.getNestSubMatrix(i, i) for i in range(len(cV))] coarse_bases = [] - for fs, submat, basis in zip(coarse_V, submats, fine_nullspace._bases): + for fs, submat, basis in zip(cV, submats, fine_nullspace._bases): if isinstance(basis, VectorSpaceBasis): - coarse_bases.append(self.coarsen_nullspace(fs, submat, basis)) + coarse_bases.append(self.coarsen_nullspace(basis, fs, submat)) else: - coarse_bases.append(coarse_V.sub(basis.index)) - coarse_nullspace = MixedVectorSpaceBasis(coarse_V, coarse_bases) + coarse_bases.append(cV.sub(basis.index)) + coarse_nullspace = MixedVectorSpaceBasis(cV, coarse_bases) elif isinstance(fine_nullspace, VectorSpaceBasis): coarse_vecs = [] for xf in fine_nullspace._petsc_vecs: - wc = firedrake.Function(coarse_V) + wc = firedrake.Function(cV) with wc.dat.vec_wo as xc: # the nullspace basis is in the dual of V interpolate.multTranspose(xf, xc) @@ -341,13 +352,9 @@ def coarsen_nullspace(self, coarse_V, interpolate, fine_nullspace): return cache.setdefault(key, coarse_nullspace) def create_transfer(self, mat_type, cctx, fctx, cbcs, fbcs): - """Create a transfer or retrieve it from class cache""" - cV = cctx.J.arguments()[0].function_space() - fV = fctx.J.arguments()[0].function_space() - cbcs = tuple(cctx._problem.bcs) if cbcs else tuple() - fbcs = tuple(fctx._problem.bcs) if fbcs else tuple() - key = (mat_type, fV.mesh(), cV.ufl_element(), fV.ufl_element(), cbcs, fbcs) - cache = self._cache.setdefault("transfer", {}) + """Create a transfer operator""" + cache = self._transfer_cache.setdefault(fctx, {}) + key = (mat_type, cctx, cbcs, fbcs) try: return cache[key] except KeyError: @@ -357,6 +364,10 @@ def create_transfer(self, mat_type, cctx, fctx, cbcs, fbcs): construct_mat = prolongation_matrix_aij else: raise ValueError("Unknown matrix type") + cV = cctx.J.arguments()[0].function_space() + fV = fctx.J.arguments()[0].function_space() + cbcs = tuple(cctx._problem.bcs) if cbcs else tuple() + fbcs = tuple(fctx._problem.bcs) if fbcs else tuple() return cache.setdefault(key, construct_mat(cV, fV, cbcs, fbcs)) def create_interpolation(self, dmc, dmf): From 3c5569990ebd6606adb0e34487ce800b90a3bd46 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Tue, 2 May 2023 17:23:56 +0100 Subject: [PATCH 72/75] fix typo in absolute tolerance 
--- firedrake/preconditioners/fdm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py index 706ab73492..6aba95b301 100644 --- a/firedrake/preconditioners/fdm.py +++ b/firedrake/preconditioners/fdm.py @@ -1062,7 +1062,7 @@ def is_restricted(finat_element): def petsc_sparse(A_numpy, rtol=1E-10, comm=None): """Convert dense numpy matrix into a sparse PETSc matrix""" - atol = rtol * max(A_numpy.min(), A_numpy.max(), key=abs) + atol = rtol * abs(max(A_numpy.min(), A_numpy.max(), key=abs)) sparsity = abs(A_numpy) > atol nnz = numpy.count_nonzero(sparsity, axis=1).astype(PETSc.IntType) A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm) From 8792bc305acd15a8026442129111c7716cba95da Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 3 May 2023 17:09:04 +0100 Subject: [PATCH 73/75] test that weakref caches are parallel safe --- firedrake/preconditioners/pmg.py | 1 + tests/multigrid/test_p_multigrid.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 6769dfaa22..7b87338ca6 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -57,6 +57,7 @@ class PMGBase(PCSNESBase): """ _prefix = "pmg_" + # This is parallel safe because the keys are ids of collective objects _coarsen_cache = weakref.WeakKeyDictionary() _transfer_cache = weakref.WeakKeyDictionary() diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index c04913e7d4..abb77e450a 100644 --- a/tests/multigrid/test_p_multigrid.py +++ b/tests/multigrid/test_p_multigrid.py @@ -334,10 +334,14 @@ def test_p_multigrid_mixed(mat_type): ppc = solver.snes.ksp.pc.getPythonContext().ppc assert ppc.getMGLevels() == 3 - level = solver._ctx + # test that nullspace component is zero assert abs(assemble(z[1]*dx)) < 1E-12 + # test that we converge to the exact solution assert norm(z-z_exact, "H1") < 1E-12 + + # test that we have coarsened the nullspace correctly ctx_levels = 0 + level = solver._ctx while level is not None: nsp = level._nullspace assert isinstance(nsp, MixedVectorSpaceBasis) @@ -348,6 +352,13 @@ def test_p_multigrid_mixed(mat_type): ctx_levels += 1 assert ctx_levels == 3 + # test that caches are parallel safe + dummy_eq = type(object).__eq__ + for cache in (PMGPC._coarsen_cache, PMGPC._transfer_cache): + assert len(cache) > 0 + for k in cache: + assert type(k).__eq__ is dummy_eq + def test_p_fas_scalar(): mat_type = "matfree" From 7d40f36e56e7221944e959cad7025f781c1ecbc5 Mon Sep 17 00:00:00 2001 From: Pablo Brubeck Date: Wed, 3 May 2023 17:10:57 +0100 Subject: [PATCH 74/75] small typo --- firedrake/preconditioners/pmg.py | 2 +- tests/multigrid/test_p_multigrid.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py index 7b87338ca6..ff302a94a2 100644 --- a/firedrake/preconditioners/pmg.py +++ b/firedrake/preconditioners/pmg.py @@ -57,7 +57,7 @@ class PMGBase(PCSNESBase): """ _prefix = "pmg_" - # This is parallel safe because the keys are ids of collective objects + # This is parallel-safe because the keys are ids of collective objects _coarsen_cache = weakref.WeakKeyDictionary() _transfer_cache = weakref.WeakKeyDictionary() diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py index abb77e450a..3954536f1f 100644 --- a/tests/multigrid/test_p_multigrid.py +++ 
b/tests/multigrid/test_p_multigrid.py @@ -352,7 +352,7 @@ def test_p_multigrid_mixed(mat_type): ctx_levels += 1 assert ctx_levels == 3 - # test that caches are parallel safe + # test that caches are parallel-safe dummy_eq = type(object).__eq__ for cache in (PMGPC._coarsen_cache, PMGPC._transfer_cache): assert len(cache) > 0 From da35c07a50313dedfbb636c13ee9361878fa64da Mon Sep 17 00:00:00 2001 From: "David A. Ham" Date: Wed, 10 May 2023 17:03:09 +0100 Subject: [PATCH 75/75] Drop build changes --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 73ed727455..4aca285fff 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,7 +48,7 @@ jobs: - name: Build Firedrake run: | cd .. - ./firedrake/scripts/firedrake-install $COMPLEX --venv-name firedrake_venv --tinyasm --netgen --disable-ssh --minimal-petsc --slepc --documentation-dependencies --install thetis --install gusto --install icepack --install irksome --install femlium --no-package-manager --package-branch tsfc pbrubeck/fdm-discontinuous || (cat firedrake-install.log && /bin/false) + ./firedrake/scripts/firedrake-install $COMPLEX --venv-name firedrake_venv --tinyasm --netgen --disable-ssh --minimal-petsc --slepc --documentation-dependencies --install thetis --install gusto --install icepack --install irksome --install femlium --no-package-manager || (cat firedrake-install.log && /bin/false) - name: Install test dependencies run: | . ../firedrake_venv/bin/activate
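For context at the end of the series, a minimal usage sketch of the preconditioner these patches extend; the hexahedral mesh, the NCE space, and the options under the inner "fdm" prefix are illustrative guesses rather than anything taken from the diffs above:

    from firedrake import *

    # H(curl) Riesz map on a Cartesian (hexahedral) mesh, the setting FDMPC targets
    mesh = UnitCubeMesh(4, 4, 4, hexahedral=True)
    V = FunctionSpace(mesh, "NCE", 3)
    u, v = TrialFunction(V), TestFunction(V)
    a = inner(curl(v), curl(u))*dx + inner(v, u)*dx
    L = inner(v, Constant((1.0, 0.0, 0.0)))*dx

    uh = Function(V)
    solve(a == L, uh, solver_parameters={
        "mat_type": "matfree",
        "ksp_type": "cg",
        "pc_type": "python",
        "pc_python_type": "firedrake.FDMPC",
        # inner solver applied to the sparse matrix assembled by the preconditioner
        # (these inner option names are assumptions)
        "fdm": {"pc_type": "jacobi"},
    })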