diff --git a/firedrake/matrix_free/operators.py b/firedrake/matrix_free/operators.py
index 93d86f00d9..90f4d08611 100644
--- a/firedrake/matrix_free/operators.py
+++ b/firedrake/matrix_free/operators.py
@@ -339,7 +339,6 @@ def view(self, mat, viewer=None):
                            type(self).__name__)
 
     def getInfo(self, mat, info=None):
-        from mpi4py import MPI
         memory = self._x.dat.nbytes + self._y.dat.nbytes
         if hasattr(self, "_xbc"):
             memory += self._xbc.dat.nbytes
diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py
index 826e1f4447..6aba95b301 100644
--- a/firedrake/preconditioners/fdm.py
+++ b/firedrake/preconditioners/fdm.py
@@ -1,132 +1,168 @@
-from functools import lru_cache, partial
+from functools import partial
+from itertools import chain, product
 from firedrake.petsc import PETSc
 from firedrake.preconditioners.base import PCBase
+from firedrake.preconditioners.patch import bcdofs
+from firedrake.preconditioners.pmg import (prolongation_matrix_matfree,
+                                           evaluate_dual,
+                                           get_permutation_to_line_elements)
+from firedrake.preconditioners.facet_split import split_dofs, restricted_dofs
+from firedrake.formmanipulation import ExtractSubBlock
+from firedrake.function import Function
+from firedrake.functionspace import FunctionSpace
+from firedrake.ufl_expr import TestFunction, TestFunctions, TrialFunctions
+from firedrake.utils import cached_property
+from firedrake_citations import Citations
+from ufl.algorithms.ad import expand_derivatives
+from ufl.algorithms.expand_indices import expand_indices
+from tsfc.finatinterface import create_element
+from pyop2.compilation import load
+from pyop2.sparsity import get_preallocation
+from pyop2.utils import get_petsc_dir
+
 import firedrake.dmhooks as dmhooks
-import firedrake
-import numpy
 import ufl
-from firedrake_citations import Citations
+import FIAT
+import finat
+import numpy
+import ctypes
+import operator
 
-Citations().add("Brubeck2021", """
-@misc{Brubeck2021,
+Citations().add("Brubeck2022a", """
+@article{Brubeck2022a,
   title={A scalable and robust vertex-star relaxation for high-order {FEM}},
   author={Brubeck, Pablo D. and Farrell, Patrick E.},
+  journal = {SIAM J. Sci. Comput.},
+  volume = {44},
+  number = {5},
+  pages = {A2991-A3017},
+  year = {2022},
+  doi = {10.1137/21M1444187}
+""")
+
+Citations().add("Brubeck2022b", """
+@misc{Brubeck2022b,
+  title={{Multigrid solvers for the de Rham complex with optimal complexity in polynomial degree}},
+  author={Brubeck, Pablo D. and Farrell, Patrick E.},
   archiveprefix = {arXiv},
-  eprint = {2107.14758},
+  eprint = {2211.14284},
   primaryclass = {math.NA},
-  year={2021}
-}
+  year={2022}
 """)
 
-__all__ = ("FDMPC",)
+
+__all__ = ("FDMPC", "PoissonFDMPC")
 
 
 class FDMPC(PCBase):
     """
     A preconditioner for tensor-product elements that changes the shape
-    functions so that the H^1 Riesz map is diagonalized in the interior of a
-    Cartesian cell, and assembles a global sparse matrix on which other
+    functions so that the H(d) (d in {grad, curl, div}) Riesz map is sparse on
+    Cartesian cells, and assembles a global sparse matrix on which other
     preconditioners, such as `ASMStarPC`, can be applied.
 
     Here we assume that the volume integrals in the Jacobian can be expressed as:
 
-    inner(grad(v), alpha(grad(u)))*dx + inner(v, beta(u))*dx
+    inner(d(v), alpha * d(u))*dx + inner(v, beta * u)*dx
 
-    where alpha and beta are linear functions (tensor contractions).
-    The sparse matrix is obtained by approximating alpha and beta by cell-wise
-    constants and discarding the coefficients in alpha that couple together
-    mixed derivatives and mixed components.
+    where alpha and beta are possibly tensor-valued.  The sparse matrix is
+    obtained by approximating (v, alpha * u) and (v, beta * u) as diagonal mass
+    matrices.
 
-    For spaces that are not H^1-conforming, this preconditioner will use
-    the symmetric interior-penalty DG method. The penalty coefficient can be
-    provided in the application context, keyed on ``"eta"``.
+    The PETSc options inspected by this class are:
+    - 'fdm_mat_type': can be either 'aij' or 'sbaij'
+    - 'fdm_static_condensation': are we assembling the Schur complement on facets?
     """
 
     _prefix = "fdm_"
+    _variant = "fdm"
+    _citation = "Brubeck2022b"
+    _cache = {}
 
     @PETSc.Log.EventDecorator("FDMInit")
     def initialize(self, pc):
-        from firedrake.assemble import allocate_matrix, assemble
-        from firedrake.preconditioners.pmg import prolongation_matrix_matfree
-        from firedrake.preconditioners.patch import bcdofs
-        Citations().register("Brubeck2021")
-
+        Citations().register(self._citation)
         self.comm = pc.comm
-
+        Amat, Pmat = pc.getOperators()
         prefix = pc.getOptionsPrefix()
         options_prefix = prefix + self._prefix
+        options = PETSc.Options(options_prefix)
+
+        use_amat = options.getBool("pc_use_amat", True)
+        use_static_condensation = options.getBool("static_condensation", False)
+        pmat_type = options.getString("mat_type", PETSc.Mat.Type.AIJ)
 
         appctx = self.get_appctx(pc)
-        fcp = appctx.get("form_compiler_parameters")
+        fcp = appctx.get("form_compiler_parameters") or {}
+        self.appctx = appctx
 
         # Get original Jacobian form and bcs
-        octx = dmhooks.get_appctx(pc.getDM())
-        mat_type = octx.mat_type
-        oproblem = octx._problem
-        J = oproblem.J
-        bcs = tuple(oproblem.bcs)
+        if Pmat.getType() == "python":
+            ctx = Pmat.getPythonContext()
+            J = ctx.a
+            bcs = tuple(ctx.bcs)
+            mat_type = "matfree"
+        else:
+            ctx = dmhooks.get_appctx(pc.getDM())
+            J = ctx.Jp or ctx.J
+            bcs = tuple(ctx._problem.bcs)
+            mat_type = ctx.mat_type
+
+        # TODO assemble Schur complements specified by a SLATE Tensor
+        # we might extract the form on the interface-interface block like this:
+        #
+        # if isinstance(J, firedrake.slate.TensorBase) and use_static_condensation:
+        #     J = J.children[0].form
+        if not isinstance(J, ufl.Form):
+            raise ValueError("Expecting a ufl.Form, not a %r" % type(J))
 
         # Transform the problem into the space with FDM shape functions
-        V = J.arguments()[0].function_space()
+        V = J.arguments()[-1].function_space()
         element = V.ufl_element()
-        e_fdm = element.reconstruct(variant="fdm")
-
-        def interp_nullspace(I, nsp):
-            if not nsp:
-                return nsp
-            vectors = []
-            for x in nsp.getVecs():
-                y = I.createVecLeft()
-                I.mult(x, y)
-                vectors.append(y)
-            if nsp.hasConstant():
-                y = I.createVecLeft()
-                x = I.createVecRight()
-                x.set(1.0E0)
-                I.mult(x, y)
-                vectors.append(y)
-                x.destroy()
-            return PETSc.NullSpace().create(constant=False, vectors=vectors, comm=nsp.getComm())
-
-        # Matrix-free assembly of the transformed Jacobian
+        e_fdm = element.reconstruct(variant=self._variant)
+
         if element == e_fdm:
             V_fdm, J_fdm, bcs_fdm = (V, J, bcs)
-            Amat, _ = pc.getOperators()
-            self._ctx_ref = octx
         else:
-            V_fdm = firedrake.FunctionSpace(V.mesh(), e_fdm)
-            J_fdm = ufl.replace(J, {t: t.reconstruct(function_space=V_fdm) for t in J.arguments()})
-            bcs_fdm = tuple(bc.reconstruct(V=V_fdm) for bc in bcs)
-            self.fdm_interp = prolongation_matrix_matfree(V, V_fdm, [], bcs_fdm)
-            self.A = allocate_matrix(J_fdm, bcs=bcs_fdm, form_compiler_parameters=fcp, mat_type=mat_type,
-                                     options_prefix=options_prefix)
-            self._assemble_A = partial(assemble, J_fdm, tensor=self.A, bcs=bcs_fdm,
-                                       form_compiler_parameters=fcp, mat_type=mat_type)
-            self._assemble_A()
-            Amat = self.A.petscmat
-
-            omat, _ = pc.getOperators()
-            inject = prolongation_matrix_matfree(V_fdm, V, [], [])
-            Amat.setNullSpace(interp_nullspace(inject, omat.getNullSpace()))
-            Amat.setTransposeNullSpace(interp_nullspace(inject, omat.getTransposeNullSpace()))
-            Amat.setNearNullSpace(interp_nullspace(inject, omat.getNearNullSpace()))
-            self.work_vec_x = Amat.createVecLeft()
-            self.work_vec_y = Amat.createVecRight()
-
+            # Reconstruct Jacobian and bcs with variant element
+            V_fdm = FunctionSpace(V.mesh(), e_fdm)
+            J_fdm = J(*(t.reconstruct(function_space=V_fdm) for t in J.arguments()), coefficients={})
+            bcs_fdm = []
+            for bc in bcs:
+                W = V_fdm
+                for index in bc._indices:
+                    W = W.sub(index)
+                bcs_fdm.append(bc.reconstruct(V=W, g=0))
+
+            # Create a new _SNESContext in the variant space
             self._ctx_ref = self.new_snes_ctx(pc, J_fdm, bcs_fdm, mat_type,
                                               fcp=fcp, options_prefix=options_prefix)
 
-        if len(bcs) > 0:
-            self.bc_nodes = numpy.unique(numpy.concatenate([bcdofs(bc, ghost=False) for bc in bcs]))
-        else:
-            self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType)
+            # Construct interpolation from variant to original spaces
+            self.fdm_interp = prolongation_matrix_matfree(V_fdm, V, bcs_fdm, [])
+            self.work_vec_x = Amat.createVecLeft()
+            self.work_vec_y = Amat.createVecRight()
+            if use_amat:
+                from firedrake.assemble import allocate_matrix, TwoFormAssembler
+                self.A = allocate_matrix(J_fdm, bcs=bcs_fdm, form_compiler_parameters=fcp,
+                                         mat_type=mat_type, options_prefix=options_prefix)
+                self._assemble_A = TwoFormAssembler(J_fdm, tensor=self.A, bcs=bcs_fdm,
+                                                    form_compiler_parameters=fcp,
+                                                    mat_type=mat_type).assemble
+                self._assemble_A()
+                Amat = self.A.petscmat
+
+            if len(bcs) > 0:
+                self.bc_nodes = numpy.unique(numpy.concatenate([bcdofs(bc, ghost=False) for bc in bcs]))
+            else:
+                self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType)
 
         # Assemble the FDM preconditioner with sparse local matrices
-        Pmat, self._assemble_P = self.assemble_fdm_op(V_fdm, J_fdm, bcs_fdm, appctx)
-        self._assemble_P()
+        Pmat, self.assembly_callables = self.allocate_matrix(V_fdm, J_fdm, bcs_fdm, fcp, pmat_type, use_static_condensation)
         Pmat.setNullSpace(Amat.getNullSpace())
         Pmat.setTransposeNullSpace(Amat.getTransposeNullSpace())
         Pmat.setNearNullSpace(Amat.getNearNullSpace())
+        self._assemble_P()
 
         # Internally, we just set up a PC object that the user can configure
         # however from the PETSc command line.  Since PC allows the user to specify
@@ -136,18 +172,136 @@ def interp_nullspace(I, nsp):
 
         # We set a DM and an appropriate SNESContext on the constructed PC so one
         # can do e.g. multigrid or patch solves.
-        fdm_dm = V_fdm.dm
-        self._dm = fdm_dm
-
-        fdmpc.setDM(fdm_dm)
+        self._dm = V_fdm.dm
+        fdmpc.setDM(self._dm)
         fdmpc.setOptionsPrefix(options_prefix)
         fdmpc.setOperators(A=Amat, P=Pmat)
-        fdmpc.setUseAmat(True)
+        fdmpc.setUseAmat(use_amat)
         self.pc = fdmpc
-
-        with dmhooks.add_hooks(fdm_dm, self, appctx=self._ctx_ref, save=False):
+        if hasattr(self, "_ctx_ref"):
+            with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref, save=False):
+                fdmpc.setFromOptions()
+        else:
             fdmpc.setFromOptions()
 
+    @PETSc.Log.EventDecorator("FDMPrealloc")
+    def allocate_matrix(self, V, J, bcs, fcp, pmat_type, use_static_condensation):
+        """
+        Allocate the FDM sparse preconditioner.
+
+        :arg V: the :class:`.FunctionSpace` of the form arguments
+        :arg J: the Jacobian bilinear form
+        :arg bcs: an iterable of boundary conditions on V
+        :arg fcp: form compiler parameters to assemble coefficients
+        :arg pmat_type: the `PETSc.Mat.Type` for the blocks in the diagonal
+        :arg use_static_condensation: are we assembling the statically-condensed Schur complement on facets?
+
+        :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and a list of assembly callables
+        """
+        symmetric = pmat_type.endswith("sbaij")
+        ifacet = [i for i, Vsub in enumerate(V) if is_restricted(Vsub.finat_element)[1]]
+        if len(ifacet) == 0:
+            Vfacet = None
+            Vbig = V
+            ebig = V.ufl_element()
+            _, fdofs = split_dofs(V.finat_element)
+        elif len(ifacet) == 1:
+            Vfacet = V[ifacet[0]]
+            ebig, = set(unrestrict_element(Vsub.ufl_element()) for Vsub in V)
+            Vbig = FunctionSpace(V.mesh(), ebig)
+            if len(V) > 1:
+                dims = [Vsub.finat_element.space_dimension() for Vsub in V]
+                assert sum(dims) == Vbig.finat_element.space_dimension()
+            fdofs = restricted_dofs(Vfacet.finat_element, Vbig.finat_element)
+        else:
+            raise ValueError("Expecting at most one FunctionSpace restricted onto facets.")
+        self.embedding_element = ebig
+
+        if Vbig.value_size == 1:
+            self.fises = PETSc.IS().createGeneral(fdofs, comm=PETSc.COMM_SELF)
+        else:
+            self.fises = PETSc.IS().createBlock(Vbig.value_size, fdofs, comm=PETSc.COMM_SELF)
+
+        # Dictionary with kernel to compute the Schur complement
+        self.schur_kernel = {}
+        if V == Vbig and Vbig.finat_element.formdegree == 0:
+            # If we are in H(grad), we just pad with zeros on the statically-condensed pattern
+            self.schur_kernel[V] = SchurComplementPattern
+        elif Vfacet and use_static_condensation:
+            # If we are in a facet space, we build the Schur complement on its diagonal block
+            if Vfacet.finat_element.formdegree == 0 and Vfacet.value_size == 1:
+                self.schur_kernel[Vfacet] = SchurComplementDiagonal
+            elif symmetric:
+                self.schur_kernel[Vfacet] = SchurComplementBlockCholesky
+            else:
+                self.schur_kernel[Vfacet] = SchurComplementBlockQR
+
+        # Create data structures needed for assembly
+        self.lgmaps = {Vsub: Vsub.local_to_global_map([bc for bc in bcs if bc.function_space() == Vsub]) for Vsub in V}
+        self.coefficients, assembly_callables = self.assemble_coefficients(J, fcp)
+        self.assemblers = {}
+
+        Pmats = {}
+        addv = PETSc.InsertMode.ADD_VALUES
+        # Store only off-diagonal blocks with more columns than rows to save memory
+        Vsort = sorted(V, key=lambda Vsub: Vsub.dim())
+        # Loop over all pairs of subspaces
+        for Vrow, Vcol in product(Vsort, Vsort):
+            if symmetric and (Vcol, Vrow) in Pmats:
+                P = PETSc.Mat().createTranspose(Pmats[Vcol, Vrow])
+            else:
+                on_diag = Vrow == Vcol
+                triu = on_diag and symmetric
+                ptype = pmat_type if on_diag else PETSc.Mat.Type.AIJ
+                sizes = tuple(Vsub.dof_dset.layout_vec.getSizes() for Vsub in (Vrow, Vcol))
+
+                preallocator = PETSc.Mat().create(comm=self.comm)
+                preallocator.setType(PETSc.Mat.Type.PREALLOCATOR)
+                preallocator.setSizes(sizes)
+                preallocator.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, False)
+                preallocator.setUp()
+                self.set_values(preallocator, Vrow, Vcol, addv, triu=triu)
+
+                preallocator.assemble()
+                d_nnz, o_nnz = get_preallocation(preallocator, sizes[0][0])
+                preallocator.destroy()
+                if on_diag:
+                    numpy.maximum(d_nnz, 1, out=d_nnz)
+
+                P = PETSc.Mat().create(comm=self.comm)
+                P.setType(ptype)
+                P.setSizes(sizes)
+                P.setPreallocationNNZ((d_nnz, o_nnz))
+                P.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True)
+                if on_diag:
+                    P.setOption(PETSc.Mat.Option.STRUCTURALLY_SYMMETRIC, True)
+                if ptype.endswith("sbaij"):
+                    P.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True)
+                P.setUp()
+                # append callables to zero entries, insert element matrices, and apply BCs
+                assembly_callables.append(P.zeroEntries)
+                assembly_callables.append(partial(self.set_values, P, Vrow, Vcol, addv))
+                if on_diag:
+                    own = Vrow.dof_dset.layout_vec.getLocalSize()
+                    bdofs = numpy.flatnonzero(self.lgmaps[Vrow].indices[:own] < 0).astype(PETSc.IntType)[:, None]
+                    Vrow.dof_dset.lgmap.apply(bdofs, result=bdofs)
+                    if len(bdofs) > 0:
+                        vals = numpy.ones(bdofs.shape, dtype=PETSc.RealType)
+                        assembly_callables.append(partial(P.setValuesRCV, bdofs, bdofs, vals, addv))
+            Pmats[Vrow, Vcol] = P
+
+        if len(V) == 1:
+            Pmat = Pmats[V, V]
+        else:
+            Pmat = PETSc.Mat().createNest([[Pmats[Vrow, Vcol] for Vcol in V] for Vrow in V], comm=self.comm)
+        assembly_callables.append(Pmat.assemble)
+        return Pmat, assembly_callables
+
+    @PETSc.Log.EventDecorator("FDMAssemble")
+    def _assemble_P(self):
+        for _assemble in self.assembly_callables:
+            _assemble()
+
     @PETSc.Log.EventDecorator("FDMUpdate")
     def update(self, pc):
         if hasattr(self, "A"):
@@ -155,26 +309,24 @@ def update(self, pc):
         self._assemble_P()
 
     def apply(self, pc, x, y):
-        dm = self._dm
-        with dmhooks.add_hooks(dm, self, appctx=self._ctx_ref):
-            if hasattr(self, "fdm_interp"):
-                self.fdm_interp.multTranspose(x, self.work_vec_x)
+        if hasattr(self, "fdm_interp"):
+            self.fdm_interp.multTranspose(x, self.work_vec_x)
+            with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref):
                 self.pc.apply(self.work_vec_x, self.work_vec_y)
-                self.fdm_interp.mult(self.work_vec_y, y)
-                y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes]
-            else:
-                self.pc.apply(x, y)
+            self.fdm_interp.mult(self.work_vec_y, y)
+            y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes]
+        else:
+            self.pc.apply(x, y)
 
     def applyTranspose(self, pc, x, y):
-        dm = self._dm
-        with dmhooks.add_hooks(dm, self, appctx=self._ctx_ref):
-            if hasattr(self, "fdm_interp"):
-                self.fdm_interp.multTranspose(x, self.work_vec_y)
+        if hasattr(self, "fdm_interp"):
+            self.fdm_interp.multTranspose(x, self.work_vec_y)
+            with dmhooks.add_hooks(self._dm, self, appctx=self._ctx_ref):
                 self.pc.applyTranspose(self.work_vec_y, self.work_vec_x)
-                self.fdm_interp.mult(self.work_vec_x, y)
-                y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes]
-            else:
-                self.pc.applyTranspose(x, y)
+            self.fdm_interp.mult(self.work_vec_x, y)
+            y.array_w[self.bc_nodes] = x.array_r[self.bc_nodes]
+        else:
+            self.pc.applyTranspose(x, y)
 
     def view(self, pc, viewer=None):
         super(FDMPC, self).view(pc, viewer)
@@ -182,119 +334,1090 @@ def view(self, pc, viewer=None):
             viewer.printfASCII("PC to apply inverse\n")
             self.pc.view(viewer)
 
-    def assemble_fdm_op(self, V, J, bcs, appctx):
+    def destroy(self, pc):
+        if hasattr(self, "A"):
+            self.A.petscmat.destroy()
+        if hasattr(self, "pc"):
+            self.pc.getOperators()[-1].destroy()
+            self.pc.destroy()
+
+    @PETSc.Log.EventDecorator("FDMCoefficients")
+    def assemble_coefficients(self, J, fcp, block_diagonal=True):
         """
-        Assemble the sparse preconditioner with cell-wise constant coefficients.
+        Obtain coefficients for the auxiliary operator as the diagonal of a
+        weighted mass matrix in broken(V^k) * broken(V^{k+1}).
+        See Section 3.2 of Brubeck2022b.
 
-        :arg V: the :class:`~.FunctionSpace` of the form arguments
-        :arg J: the Jacobian bilinear form
-        :arg bcs: an iterable of boundary conditions on V
-        :arg appctx: the application context
+        :arg J: the Jacobian bilinear :class:`ufl.Form`,
+        :arg fcp: form compiler parameters to assemble the diagonal of the mass matrices.
+        :arg block_diagonal: are we assembling the block diagonal of the mass matrices?
+
+        :returns: a 2-tuple of a `dict` with the zero-th order and second
+                  order coefficients keyed on ``"beta"`` and ``"alpha"``,
+                  and a list of assembly callables.
+        """
+        coefficients = {}
+        assembly_callables = []
+        # Basic idea: take the original bilinear form and
+        # replace the exterior derivatives with arguments in broken(V^{k+1}).
+        # Then, replace the original arguments with arguments in broken(V^k).
+        # Where the broken spaces have L2-orthogonal FDM basis functions.
+        index = len(J.arguments()[-1].function_space())-1
+        if index:
+            splitter = ExtractSubBlock()
+            J = splitter.split(J, argument_indices=(index, index))
+
+        args_J = J.arguments()
+        e = args_J[0].ufl_element()
+        mesh = args_J[0].function_space().mesh()
+        tdim = mesh.topological_dimension()
+        if isinstance(e, (ufl.VectorElement, ufl.TensorElement)):
+            e = e._sub_element
+        e = unrestrict_element(e)
+        sobolev = e.sobolev_space()
+
+        # Replacement rule for the exterior derivative = grad(arg) * eps
+        map_grad = None
+        if sobolev == ufl.H1:
+            map_grad = lambda p: p
+        elif sobolev in [ufl.HCurl, ufl.HDiv]:
+            u = ufl.Coefficient(ufl.FunctionSpace(mesh, e))
+            du = ufl.variable(ufl.grad(u))
+            dku = ufl.div(u) if sobolev == ufl.HDiv else ufl.curl(u)
+            eps = expand_derivatives(ufl.diff(ufl.replace(expand_derivatives(dku), {ufl.grad(u): du}), du))
+            if sobolev == ufl.HDiv:
+                map_grad = lambda p: ufl.outer(p, eps/tdim)
+            elif len(eps.ufl_shape) == 3:
+                map_grad = lambda p: ufl.dot(p, eps/2)
+            else:
+                map_grad = lambda p: p*(eps/2)
+
+        # Construct Z = broken(V^k) * broken(V^{k+1})
+        V = args_J[0].function_space()
+        fe = V.finat_element
+        formdegree = fe.formdegree
+        degree = fe.degree
+        if type(degree) != int:
+            degree, = set(degree)
+        qdeg = degree
+        if formdegree == tdim:
+            qfam = "DG" if tdim == 1 else "DQ"
+            qdeg = 0
+        elif formdegree == 0:
+            qfam = "DG" if tdim == 1 else "RTCE" if tdim == 2 else "NCE"
+        elif formdegree == 1 and tdim == 3:
+            qfam = "NCF"
+        else:
+            qfam = "DQ L2"
+            qdeg = degree - 1
+
+        qvariant = "fdm_quadrature"
+        elements = [e.reconstruct(variant=qvariant),
+                    ufl.FiniteElement(qfam, cell=mesh.ufl_cell(), degree=qdeg, variant=qvariant)]
+        elements = list(map(ufl.BrokenElement, elements))
+        if V.shape:
+            elements = [ufl.TensorElement(ele, shape=V.shape) for ele in elements]
+        Z = FunctionSpace(mesh, ufl.MixedElement(elements))
+
+        # Transform the exterior derivative and the original arguments of J to arguments in Z
+        args = (TestFunctions(Z), TrialFunctions(Z))
+        repargs = {t: v[0] for t, v in zip(args_J, args)}
+        repgrad = {ufl.grad(t): map_grad(v[1]) for t, v in zip(args_J, args)} if map_grad else {}
+        Jcell = expand_indices(expand_derivatives(ufl.Form(J.integrals_by_type("cell"))))
+        mixed_form = ufl.replace(ufl.replace(Jcell, repgrad), repargs)
+
+        # Return coefficients and assembly callables
+        if block_diagonal and V.shape:
+            from firedrake.assemble import assemble
+            M = assemble(mixed_form, mat_type="matfree", form_compiler_parameters=fcp)
+            for iset, name in zip(Z.dof_dset.field_ises, ("beta", "alpha")):
+                sub = M.petscmat.createSubMatrix(iset, iset)
+                ctx = sub.getPythonContext()
+                coefficients[name] = ctx._block_diagonal
+                assembly_callables.append(ctx._assemble_block_diagonal)
+        else:
+            from firedrake.assemble import OneFormAssembler
+            tensor = Function(Z)
+            coefficients["beta"] = tensor.subfunctions[0]
+            coefficients["alpha"] = tensor.subfunctions[1]
+            assembly_callables.append(OneFormAssembler(mixed_form, tensor=tensor, diagonal=True,
+                                                       form_compiler_parameters=fcp).assemble)
+        return coefficients, assembly_callables
 
-        :returns: 2-tuple with the preconditioner :class:`PETSc.Mat` and its assembly callable
+    @PETSc.Log.EventDecorator("FDMRefTensor")
+    def assemble_reference_tensor(self, V, transpose=False, sort_interior=False):
         """
-        from pyop2.sparsity import get_preallocation
-        from firedrake.preconditioners.pmg import get_line_elements
+        Return the reference tensor used in the diagonal factorisation of the
+        sparse cell matrices.  See Section 3.2 of Brubeck2022b.
+
+        :arg V: a :class:`.FunctionSpace`
+
+        :returns: a :class:`PETSc.Mat` interpolating V^k * d(V^k) onto
+                  broken(V^k) * broken(V^{k+1}) on the reference element.
+        """
+        value_size = V.value_size
+        fe = V.finat_element
+        tdim = fe.cell.get_spatial_dimension()
+        formdegree = fe.formdegree
+        degree = fe.degree
+        if type(degree) != int:
+            degree, = set(degree)
+        if formdegree == tdim:
+            degree = degree + 1
+        is_interior, is_facet = is_restricted(fe)
+        key = (value_size, tdim, degree, formdegree, is_interior, is_facet, transpose, sort_interior)
+        cache = self._cache.setdefault("reference_tensor", {})
+        try:
+            return cache[key]
+        except KeyError:
+            pass
+
+        if transpose:
+            result = self.assemble_reference_tensor(V, transpose=False, sort_interior=sort_interior)
+            result = PETSc.Mat().createTranspose(result).convert(result.getType())
+            return cache.setdefault(key, result)
+
+        if sort_interior:
+            assert is_interior and not is_facet and not transpose
+            # Sort DOFs to make A00 block diagonal with blocks of increasing dimension along the diagonal
+            result = self.assemble_reference_tensor(V, transpose=transpose, sort_interior=False)
+            if formdegree != 0:
+                # Compute the stiffness matrix on the interior of a cell
+                A00 = self._element_mass_matrix.PtAP(result)
+                indptr, indices, _ = A00.getValuesCSR()
+                degree = numpy.diff(indptr)
+                # Sort by blocks
+                uniq, u_index = numpy.unique(indices, return_index=True)
+                perm = uniq[u_index.argsort(kind='stable')]
+                # Sort by degree
+                degree = degree[perm]
+                perm = perm[degree.argsort(kind='stable')]
+                A00.destroy()
+
+                isperm = PETSc.IS().createGeneral(perm, comm=result.getComm())
+                result = get_submat(result, iscol=isperm, permute=True)
+                isperm.destroy()
+            return cache.setdefault(key, result)
+
+        short_key = key[:-3] + (False,) * 3
+        try:
+            result = cache[short_key]
+        except KeyError:
+            # Get CG(k) and DG(k-1) 1D elements from V
+            elements = sorted(get_base_elements(fe), key=lambda e: e.formdegree)
+            e0 = elements[0] if elements[0].formdegree == 0 else None
+            e1 = elements[-1] if elements[-1].formdegree == 1 else None
+            if e0 and is_interior:
+                e0 = FIAT.RestrictedElement(e0, restriction_domain="interior")
+
+            # Get broken(CG(k)) and DG(k-1) 1D elements from the coefficient spaces
+            Q0 = self.coefficients["beta"].function_space().finat_element.element
+            elements = sorted(get_base_elements(Q0), key=lambda e: e.formdegree)
+            q0 = elements[0] if elements[0].formdegree == 0 else None
+            q1 = elements[-1]
+            if q1.formdegree != 1:
+                Q1 = self.coefficients["alpha"].function_space().finat_element.element
+                q1 = sorted(get_base_elements(Q1), key=lambda e: e.formdegree)[-1]
+
+            # Interpolate V * d(V) -> space(beta) * space(alpha)
+            comm = PETSc.COMM_SELF
+            zero = PETSc.Mat()
+            A00 = petsc_sparse(evaluate_dual(e0, q0), comm=comm) if e0 and q0 else zero
+            A11 = petsc_sparse(evaluate_dual(e1, q1), comm=comm) if e1 else zero
+            A10 = petsc_sparse(evaluate_dual(e0, q1, alpha=(1,)), comm=comm) if e0 else zero
+            B_blocks = mass_blocks(tdim, formdegree, A00, A11)
+            A_blocks = diff_blocks(tdim, formdegree, A00, A11, A10)
+            result = block_mat(B_blocks + A_blocks, destroy_blocks=True)
+            A00.destroy()
+            A10.destroy()
+            A11.destroy()
+            if value_size != 1:
+                eye = petsc_sparse(numpy.eye(value_size), comm=result.getComm())
+                temp = result
+                result = temp.kron(eye)
+                temp.destroy()
+                eye.destroy()
+
+        if is_facet:
+            cache[short_key] = result
+            result = get_submat(result, iscol=self.fises)
+        return cache.setdefault(key, result)
+
+    @cached_property
+    def _element_mass_matrix(self):
+        Z = [self.coefficients[name].function_space() for name in ("beta", "alpha")]
+        shape = (sum(V.finat_element.space_dimension() for V in Z),) + Z[0].shape
+        data = numpy.ones(shape, dtype=PETSc.RealType)
+        shape += (1,) * (3-len(shape))
+        nrows = shape[0] * shape[1]
+        ai = numpy.arange(nrows+1, dtype=PETSc.IntType)
+        aj = numpy.tile(ai[:-1].reshape((-1, shape[1])), (1, shape[2]))
+        if shape[2] > 1:
+            ai *= shape[2]
+            data = numpy.tile(numpy.eye(shape[2], dtype=data.dtype), shape[:1] + (1,)*(len(shape)-1))
+        return PETSc.Mat().createAIJ((nrows, nrows), csr=(ai, aj, data), comm=PETSc.COMM_SELF)
+
+    @PETSc.Log.EventDecorator("FDMSetValues")
+    def set_values(self, A, Vrow, Vcol, addv, triu=False):
+        """
+        Assemble the stiffness matrix in the FDM basis using sparse reference
+        tensors and diagonal mass matrices.
+
+        :arg A: the :class:`PETSc.Mat` to assemble
+        :arg Vrow: the :class:`.FunctionSpace` test space
+        :arg Vcol: the :class:`.FunctionSpace` trial space
+        :arg addv: a `PETSc.Mat.InsertMode`
+        :arg triu: are we assembling only the upper triangular part?
+        """
+        key = (Vrow.ufl_element(), Vcol.ufl_element())
+        try:
+            assembler = self.assemblers[key]
+        except KeyError:
+            # Interpolation of basis and exterior derivative onto broken spaces
+            C1 = self.assemble_reference_tensor(Vcol)
+            R1 = self.assemble_reference_tensor(Vrow, transpose=True)
+            M = self._element_mass_matrix
+            # Element stiffness matrix = R1 * M * C1, see Equation (3.9) of Brubeck2022b
+            element_kernel = TripleProductKernel(R1, M, C1, self.coefficients["beta"], self.coefficients["alpha"])
+
+            schur_kernel = None
+            if Vrow == Vcol:
+                schur_kernel = self.schur_kernel.get(Vrow)
+            if schur_kernel is not None:
+                V0 = FunctionSpace(Vrow.mesh(), restrict_element(self.embedding_element, "interior"))
+                C0 = self.assemble_reference_tensor(V0, sort_interior=True)
+                R0 = self.assemble_reference_tensor(V0, sort_interior=True, transpose=True)
+                # Only the facet block updates the coefficients in M
+                element_kernel = schur_kernel(element_kernel,
+                                              TripleProductKernel(R1, M, C0),
+                                              TripleProductKernel(R0, M, C1),
+                                              TripleProductKernel(R0, M, C0))
+
+            assembler = SparseAssembler(element_kernel, Vrow, Vcol, self.lgmaps[Vrow], self.lgmaps[Vcol])
+            self.assemblers.setdefault(key, assembler)
+        assembler.assemble(A, addv=addv, triu=triu)
+
+
+class SparseAssembler(object):
+
+    _cache = {}
+
+    @staticmethod
+    def setSubMatCSR(comm, triu=False):
+        """
+        Compile C code to insert sparse submatrices and store in class cache
+
+        :arg triu: are we inserting onto the upper triangular part of the matrix?
+
+        :returns: a python wrapper for the matrix insertion function
+        """
+        cache = SparseAssembler._cache.setdefault("setSubMatCSR", {})
+        key = triu
+        try:
+            return cache[key]
+        except KeyError:
+            return cache.setdefault(key, load_setSubMatCSR(comm, triu))
+
+    def __init__(self, kernel, Vrow, Vcol, rmap, cmap):
+        self.kernel = kernel
+        m, n = kernel.result.getSize()
+
+        spaces = [Vrow]
+        row_shape = tuple() if Vrow.value_size == 1 else (Vrow.value_size,)
+        map_rows = (self.map_block_indices, rmap) if row_shape else (rmap.apply,)
+        rows = numpy.empty((m, ), dtype=PETSc.IntType).reshape((-1,) + row_shape)
+        if Vcol == Vrow:
+            cols = rows
+            map_cols = (lambda *args, result=None: result, )
+        else:
+            spaces.append(Vcol)
+            col_shape = tuple() if Vcol.value_size == 1 else (Vcol.value_size,)
+            map_cols = (self.map_block_indices, cmap) if col_shape else (cmap.apply, )
+            cols = numpy.empty((n, ), dtype=PETSc.IntType).reshape((-1,) + col_shape)
+        spaces.extend(c.function_space() for c in kernel.coefficients)
+
+        integral_type = kernel.integral_type
+        if integral_type in ["cell", "interior_facet_horiz"]:
+            get_map = operator.methodcaller("cell_node_map")
+        elif integral_type in ["interior_facet", "interior_facet_vert"]:
+            get_map = operator.methodcaller("interior_facet_node_map")
+        else:
+            raise NotImplementedError("Only for cell or interior facet integrals")
+        self.node_maps = tuple(map(get_map, spaces))
+
+        ncell = 2 if integral_type.startswith("interior_facet") else 1
+        self.indices = tuple(numpy.empty((V.finat_element.space_dimension() * ncell,), dtype=PETSc.IntType) for V in spaces)
+        self.map_rows = partial(*map_rows, self.indices[spaces.index(Vrow)], result=rows)
+        self.map_cols = partial(*map_cols, self.indices[spaces.index(Vcol)], result=cols)
+        self.kernel_args = self.indices[-len(kernel.coefficients):]
+        self.set_indices = self.copy_indices
+
+        node_map = self.node_maps[0]
+        self.nel = node_map.values.shape[0]
+        if node_map.offset is None:
+            layers = None
+        else:
+            layers = node_map.iterset.layers_array
+            layers = layers[:, 1]-layers[:, 0]-1
+            if integral_type.endswith("horiz"):
+                layers -= 1
+                self.set_indices = self.copy_indices_horiz
+            if layers.shape[0] != self.nel:
+                layers = numpy.repeat(layers, self.nel)
+        self.layers = layers
+
+    def map_block_indices(self, lgmap, indices, result=None):
+        bsize = result.shape[-1]
+        numpy.copyto(result[:, 0], indices)
+        result[:, 0] *= bsize
+        numpy.add.outer(result[:, 0], numpy.arange(1, bsize, dtype=indices.dtype), out=result[:, 1:])
+        return lgmap.apply(result, result=result)
+
+    def copy_indices_horiz(self, e):
+        for index, node_map in zip(self.indices, self.node_maps):
+            index = index.reshape((2, -1))
+            numpy.copyto(index, node_map.values_with_halo[e])
+            index[1] += node_map.offset
+
+    def copy_indices(self, e):
+        for index, node_map in zip(self.indices, self.node_maps):
+            numpy.copyto(index, node_map.values_with_halo[e])
+
+    def add_offsets(self):
+        for index, node_map in zip(self.indices, self.node_maps):
+            index += node_map.offset
+
+    def assemble(self, A, addv=None, triu=False):
+        if A.getType() == PETSc.Mat.Type.PREALLOCATOR:
+            kernel = lambda *args, result=None: result
+        else:
+            kernel = self.kernel
+        result = self.kernel.result
+        insert = self.setSubMatCSR(PETSc.COMM_SELF, triu=triu)
+
+        # Core assembly loop
+        if self.layers is None:
+            for e in range(self.nel):
+                self.set_indices(e)
+                insert(A, kernel(*self.kernel_args, result=result),
+                       self.map_rows(), self.map_cols(), addv)
+        else:
+            for e in range(self.nel):
+                self.set_indices(e)
+                for _ in range(self.layers[e]):
+                    insert(A, kernel(*self.kernel_args, result=result),
+                           self.map_rows(), self.map_cols(), addv)
+                    self.add_offsets()
+
+
+class ElementKernel(object):
+    """
+    A constant element kernel
+    """
+    def __init__(self, A, *coefficients):
+        self.result = A
+        self.coefficients = coefficients
+        self.integral_type = "cell"
+
+    def __call__(self, *args, result=None):
+        return result or self.result
+
+    def __del__(self):
+        self.destroy()
+
+    def destroy(self):
+        pass
+
+
+class TripleProductKernel(ElementKernel):
+    """
+    An element kernel to compute a triple matrix product A * B * C, where A and
+    C are constant matrices and B is a block diagonal matrix with entries given
+    by coefficients.
+    """
+    def __init__(self, A, B, C, *coefficients):
+        self.work = None
+        if len(coefficients) == 0:
+            self.data = numpy.array([])
+            self.update = lambda *args: args
+        else:
+            dshape = (-1, ) + coefficients[0].dat.data_ro.shape[1:]
+            if numpy.prod(dshape[1:]) == 1:
+                self.work = B.getDiagonal()
+                self.data = self.work.array_w.reshape(dshape)
+                self.update = partial(B.setDiagonal, self.work)
+            else:
+                indptr, indices, data = B.getValuesCSR()
+                self.data = data.reshape(dshape)
+                self.update = lambda *args: (B.setValuesCSR(indptr, indices, self.data), B.assemble())
+
+        stops = numpy.zeros((len(coefficients) + 1,), dtype=PETSc.IntType)
+        numpy.cumsum([c.function_space().finat_element.space_dimension() for c in coefficients], out=stops[1:])
+        self.slices = [slice(*stops[k:k+2]) for k in range(len(coefficients))]
+
+        self.product = partial(A.matMatMult, B, C)
+        super().__init__(self.product(), *coefficients)
+
+    def __call__(self, *indices, result=None):
+        for c, i, z in zip(self.coefficients, indices, self.slices):
+            numpy.take(c.dat.data_ro, i, axis=0, out=self.data[z])
+        self.update()
+        return self.product(result=result)
+
+    def destroy(self):
+        self.result.destroy()
+        if isinstance(self.work, PETSc.Object):
+            self.work.destroy()
+
+
+class SchurComplementKernel(ElementKernel):
+    """
+    An element kernel to compute Schur complements that reuses work matrices and the
+    symbolic factorization of the interior block.
+    """
+    def __init__(self, *kernels):
+        self.children = kernels
+        self.submats = [k.result for k in kernels]
+
+        # Create dict of slices with the extents of the diagonal blocks
+        A00 = self.submats[-1]
+        degree = numpy.diff(A00.getValuesCSR()[0])
+        istart = 0
+        self.slices = {1: slice(0, 0)}
+        unique_degree, counts = numpy.unique(degree, return_counts=True)
+        for k, kdofs in sorted(zip(unique_degree, counts)):
+            self.slices[k] = slice(istart, istart + k * kdofs)
+            istart += k * kdofs
+
+        self.blocks = sorted(degree for degree in self.slices if degree > 1)
+
+        self.work = [None for _ in range(2)]
+        coefficients = []
+        for k in self.children:
+            coefficients.extend(k.coefficients)
+        coefficients = list(dict.fromkeys(coefficients))
+        super().__init__(self.condense(), *coefficients)
+
+    def __call__(self, *args, result=None):
+        for k in self.children:
+            k(*args, result=k.result)
+        return self.condense(result=result)
+
+    def destroy(self):
+        for k in self.children:
+            k.destroy()
+        self.result.destroy()
+        for obj in self.work:
+            if isinstance(obj, PETSc.Object):
+                obj.destroy()
+
+    @PETSc.Log.EventDecorator("FDMCondense")
+    def condense(self, result=None):
+        return result
+
+
+class SchurComplementPattern(SchurComplementKernel):
+
+    def __call__(self, *args, result=None):
+        k = self.children[0]
+        k(*args, result=k.result)
+        return self.condense(result=result)
+
+    @PETSc.Log.EventDecorator("FDMCondense")
+    def condense(self, result=None):
+        """By default pad with zeros the statically condensed pattern"""
+        structure = PETSc.Mat.Structure.SUBSET if result else None
+        if result is None:
+            _, A10, A01, A00 = self.submats
+            result = A10.matMatMult(A00, A01, result=result)
+        result.aypx(0.0, self.submats[0], structure=structure)
+        return result
+
+
+class SchurComplementDiagonal(SchurComplementKernel):
+
+    @PETSc.Log.EventDecorator("FDMCondense")
+    def condense(self, result=None):
+        structure = PETSc.Mat.Structure.SUBSET if result else None
+        A11, A10, A01, A00 = self.submats
+        self.work[0] = A00.getDiagonal(result=self.work[0])
+        self.work[0].reciprocal()
+        self.work[0].scale(-1)
+        A01.diagonalScale(L=self.work[0])
+        result = A10.matMult(A01, result=result)
+        result.axpy(1.0, A11, structure=structure)
+        return result
+
+
+class SchurComplementBlockCholesky(SchurComplementKernel):
+
+    def __init__(self, K11, K10, K01, K00):
+        # asssume that K10 = K01^T
+        super().__init__(K11, K01, K00)
+
+    @PETSc.Log.EventDecorator("FDMCondense")
+    def condense(self, result=None):
+        structure = PETSc.Mat.Structure.SUBSET if result else None
+        A11, A01, A00 = self.submats
+        indptr, indices, R = A00.getValuesCSR()
+
+        zlice = self.slices[1]
+        numpy.sqrt(R[zlice], out=R[zlice])
+        numpy.reciprocal(R[zlice], out=R[zlice])
+        flops = 2 * (zlice.stop - zlice.start)
+        for k in self.blocks:
+            Rk = R[self.slices[k]]
+            A = Rk.reshape((-1, k, k))
+            rinv = numpy.linalg.inv(numpy.linalg.cholesky(A))
+            numpy.copyto(Rk, rinv.flat)
+            flops += A.shape[0] * ((k**3)//3 + k**3)
+
+        PETSc.Log.logFlops(flops)
+        A00.setValuesCSR(indptr, indices, R)
+        A00.assemble()
+        self.work[0] = A00.matMult(A01, result=self.work[0])
+        result = self.work[0].transposeMatMult(self.work[0], result=result)
+        result.aypx(-1.0, A11, structure=structure)
+        return result
+
+
+class SchurComplementBlockQR(SchurComplementKernel):
+
+    @PETSc.Log.EventDecorator("FDMCondense")
+    def condense(self, result=None):
+        structure = PETSc.Mat.Structure.SUBSET if result else None
+        A11, A10, A01, A00 = self.submats
+        indptr, indices, R = A00.getValuesCSR()
+        Q = numpy.ones(R.shape, dtype=R.dtype)
+
+        zlice = self.slices[1]
+        numpy.reciprocal(R[zlice], out=R[zlice])
+        flops = zlice.stop - zlice.start
+        for k in self.blocks:
+            zlice = self.slices[k]
+            A = R[zlice].reshape((-1, k, k))
+            q, r = numpy.linalg.qr(A, mode="complete")
+            numpy.copyto(Q[zlice], q.flat)
+            rinv = numpy.linalg.inv(r)
+            numpy.copyto(R[zlice], rinv.flat)
+            flops += A.shape[0] * ((4*k**3)//3 + k**3)
+
+        PETSc.Log.logFlops(flops)
+        A00.setValuesCSR(indptr, indices, Q)
+        A00.assemble()
+        self.work[0] = A00.transposeMatMult(A01, result=self.work[0])
+        A00.setValuesCSR(indptr, indices, R)
+        A00.assemble()
+        A00.scale(-1.0)
+        result = A10.matMatMult(A00, self.work[0], result=result)
+        result.axpy(1.0, A11, structure=structure)
+        return result
+
+
+class SchurComplementBlockSVD(SchurComplementKernel):
+
+    @PETSc.Log.EventDecorator("FDMCondense")
+    def condense(self, result=None):
+        structure = PETSc.Mat.Structure.SUBSET if result else None
+        A11, A10, A01, A00 = self.submats
+        indptr, indices, U = A00.getValuesCSR()
+        V = numpy.ones(U.shape, dtype=U.dtype)
+        self.work[0] = A00.getDiagonal(result=self.work[0])
+        D = self.work[0]
+        dslice = self.slices[1]
+        numpy.sign(D.array_r[dslice], out=U[dslice])
+        flops = dslice.stop - dslice.start
+        for k in self.blocks:
+            bslice = self.slices[k]
+            A = U[bslice].reshape((-1, k, k))
+            u, s, v = numpy.linalg.svd(A, full_matrices=False)
+            dslice = slice(dslice.stop, dslice.stop + k * A.shape[0])
+            numpy.copyto(D.array_w[dslice], s.flat)
+            numpy.copyto(U[bslice], numpy.transpose(u, axes=(0, 2, 1)).flat)
+            numpy.copyto(V[bslice], numpy.transpose(v, axes=(0, 2, 1)).flat)
+            flops += A.shape[0] * ((4*k**3)//3 + 4*k**3)
+
+        PETSc.Log.logFlops(flops)
+        D.sqrtabs()
+        D.reciprocal()
+        A00.setValuesCSR(indptr, indices, V)
+        A00.assemble()
+        A00.diagonalScale(R=D)
+        self.work[1] = A10.matMult(A00, result=self.work[1])
+        D.scale(-1.0)
+        A00.setValuesCSR(indptr, indices, U)
+        A00.assemble()
+        A00.diagonalScale(L=D)
+        result = self.work[1].matMatMult(A00, A01, result=result)
+        result.axpy(1.0, A11, structure=structure)
+        return result
+
+
+class SchurComplementBlockInverse(SchurComplementKernel):
+
+    @PETSc.Log.EventDecorator("FDMCondense")
+    def condense(self, result=None):
+        structure = PETSc.Mat.Structure.SUBSET if result else None
+        A11, A10, A01, A00 = self.submats
+        indptr, indices, R = A00.getValuesCSR()
+
+        zlice = self.slices[1]
+        numpy.reciprocal(R[zlice], out=R[zlice])
+        flops = zlice.stop - zlice.start
+        for k in self.blocks:
+            Rk = R[self.slices[k]]
+            A = Rk.reshape((-1, k, k))
+            rinv = numpy.linalg.inv(A)
+            numpy.copyto(Rk, rinv.flat)
+            flops += A.shape[0] * (k**3)
+
+        PETSc.Log.logFlops(flops)
+        A00.setValuesCSR(indptr, indices, R)
+        A00.assemble()
+        A00.scale(-1.0)
+        result = A10.matMatMult(A00, A01, result=result)
+        result.axpy(1.0, A11, structure=structure)
+        return result
+
+
+@PETSc.Log.EventDecorator("LoadCode")
+def load_c_code(code, name, **kwargs):
+    petsc_dir = get_petsc_dir()
+    cppargs = ["-I%s/include" % d for d in petsc_dir]
+    ldargs = (["-L%s/lib" % d for d in petsc_dir]
+              + ["-Wl,-rpath,%s/lib" % d for d in petsc_dir]
+              + ["-lpetsc", "-lm"])
+    return load(code, "c", name, cppargs=cppargs, ldargs=ldargs, **kwargs)
+
+
+def load_setSubMatCSR(comm, triu=False):
+    """Insert one sparse matrix into another sparse matrix.
+       Done in C for efficiency, since it loops over rows."""
+    if triu:
+        name = "setSubMatCSR_SBAIJ"
+        select_cols = "icol -= (icol < irow) * (1 + icol);"
+    else:
+        name = "setSubMatCSR_AIJ"
+        select_cols = ""
+    code = f"""
+#include <petsc.h>
+
+PetscErrorCode {name}(Mat A,
+                      Mat B,
+                      PetscInt *rindices,
+                      PetscInt *cindices,
+                      InsertMode addv)
+{{
+    PetscInt ncols, irow, icol;
+    PetscInt *cols, *indices;
+    PetscScalar *vals;
+
+    PetscInt m, n;
+    PetscErrorCode ierr;
+    PetscFunctionBeginUser;
+    MatGetSize(B, &m, NULL);
+
+    n = 0;
+    for (PetscInt i = 0; i < m; i++) {{
+        ierr = MatGetRow(B, i, &ncols, NULL, NULL);CHKERRQ(ierr);
+        n = ncols > n ? ncols : n;
+        ierr = MatRestoreRow(B, i, &ncols, NULL, NULL);CHKERRQ(ierr);
+    }}
+    PetscMalloc1(n, &indices);
+    for (PetscInt i = 0; i < m; i++) {{
+        ierr = MatGetRow(B, i, &ncols, &cols, &vals);CHKERRQ(ierr);
+        irow = rindices[i];
+        for (PetscInt j = 0; j < ncols; j++) {{
+            icol = cindices[cols[j]];
+            {select_cols}
+            indices[j] = icol;
+        }}
+        ierr = MatSetValues(A, 1, &irow, ncols, indices, vals, addv);CHKERRQ(ierr);
+        ierr = MatRestoreRow(B, i, &ncols, &cols, &vals);CHKERRQ(ierr);
+    }}
+    PetscFree(indices);
+    PetscFunctionReturn(0);
+}}
+"""
+    argtypes = [ctypes.c_voidp, ctypes.c_voidp,
+                ctypes.c_voidp, ctypes.c_voidp, ctypes.c_int]
+    funptr = load_c_code(code, name, comm=comm, argtypes=argtypes,
+                         restype=ctypes.c_int)
+
+    @PETSc.Log.EventDecorator(name)
+    def wrapper(A, B, rows, cols, addv):
+        return funptr(A.handle, B.handle, rows.ctypes.data, cols.ctypes.data, addv)
+
+    return wrapper
+
+
+def is_restricted(finat_element):
+    """Determine if an element is a restriction onto interior or facets"""
+    tdim = finat_element.cell.get_dimension()
+    idofs = len(finat_element.entity_dofs()[tdim][0])
+    is_interior = idofs == finat_element.space_dimension()
+    is_facet = idofs == 0
+    return is_interior, is_facet
+
+
+def petsc_sparse(A_numpy, rtol=1E-10, comm=None):
+    """Convert dense numpy matrix into a sparse PETSc matrix"""
+    atol = rtol * abs(max(A_numpy.min(), A_numpy.max(), key=abs))
+    sparsity = abs(A_numpy) > atol
+    nnz = numpy.count_nonzero(sparsity, axis=1).astype(PETSc.IntType)
+    A = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm)
+    rows, cols = numpy.nonzero(sparsity)
+    rows = rows.astype(PETSc.IntType)
+    cols = cols.astype(PETSc.IntType)
+    vals = A_numpy[sparsity]
+    A.setValuesRCV(rows[:, None], cols[:, None], vals[:, None], PETSc.InsertMode.INSERT)
+    A.assemble()
+    return A
+
+
+def kron3(A, B, C, scale=None):
+    """Returns scale * kron(A, kron(B, C))"""
+    temp = B.kron(C)
+    if scale is not None:
+        temp.scale(scale)
+    result = A.kron(temp)
+    temp.destroy()
+    return result
+
+
+def get_submat(A, isrow=None, iscol=None, permute=False):
+    """Return the sub matrix A[isrow, iscol]"""
+    needs_rows = isrow is None
+    needs_cols = iscol is None
+    if needs_rows and needs_cols:
+        return A
+    size = A.getSize()
+    if needs_rows:
+        isrow = PETSc.IS().createStride(size[0], step=1, comm=A.getComm())
+    if needs_cols:
+        iscol = PETSc.IS().createStride(size[1], step=1, comm=A.getComm())
+    if permute:
+        submat = A.permute(isrow, iscol)
+    else:
+        submat = A.createSubMatrix(isrow, iscol)
+    if needs_rows:
+        isrow.destroy()
+    if needs_cols:
+        iscol.destroy()
+    return submat
+
+
+def block_mat(A_blocks, destroy_blocks=False):
+    """Return a concrete Mat corresponding to a block matrix given as a list of lists.
+       Optionally, destroys the input Mats if a new Mat is created."""
+    if len(A_blocks) == 1:
+        if len(A_blocks[0]) == 1:
+            return A_blocks[0][0]
+
+    result = PETSc.Mat().createNest(A_blocks, comm=A_blocks[0][0].getComm())
+    # A nest Mat would not allow us to take matrix-matrix products
+    result = result.convert(mat_type=A_blocks[0][0].getType())
+    if destroy_blocks:
+        for row in A_blocks:
+            for mat in row:
+                mat.destroy()
+    return result
+
+
+def mass_blocks(tdim, formdegree, B00, B11):
+    """Construct mass block matrix on reference cell from 1D mass matrices B00 and B11.
+       The 1D matrices may come with different test and trial spaces."""
+    if tdim == 1:
+        B_diag = [B11 if formdegree else B00]
+    elif tdim == 2:
+        if formdegree == 0:
+            B_diag = [B00.kron(B00)]
+        elif formdegree == 1:
+            B_diag = [B00.kron(B11), B11.kron(B00)]
+        else:
+            B_diag = [B11.kron(B11)]
+    elif tdim == 3:
+        if formdegree == 0:
+            B_diag = [kron3(B00, B00, B00)]
+        elif formdegree == 1:
+            B_diag = [kron3(B00, B00, B11), kron3(B00, B11, B00), kron3(B11, B00, B00)]
+        elif formdegree == 2:
+            B_diag = [kron3(B00, B11, B11), kron3(B11, B00, B11), kron3(B11, B11, B00)]
+        else:
+            B_diag = [kron3(B11, B11, B11)]
+
+    n = len(B_diag)
+    if n == 1:
+        return [B_diag]
+    else:
+        zero = PETSc.Mat()
+        return [[B_diag[i] if i == j else zero for j in range(n)] for i in range(n)]
+
+
+def diff_blocks(tdim, formdegree, A00, A11, A10):
+    """Construct exterior derivative block matrix on reference cell from 1D
+       mass matrices A00 and A11, and exterior derivative moments A10.
+       The 1D matrices may come with different test and trial spaces."""
+    if formdegree == tdim:
+        ncols = A10.shape[0]**tdim
+        zero = PETSc.Mat().createAIJ((1, ncols), nnz=(0, 0), comm=A10.getComm())
+        zero.assemble()
+        A_blocks = [[zero]]
+    elif tdim == 1:
+        A_blocks = [[A10]]
+    elif tdim == 2:
+        if formdegree == 0:
+            A_blocks = [[A00.kron(A10)], [A10.kron(A00)]]
+        elif formdegree == 1:
+            A_blocks = [[A10.kron(A11), A11.kron(A10)]]
+            A_blocks[-1][-1].scale(-1)
+    elif tdim == 3:
+        if formdegree == 0:
+            A_blocks = [[kron3(A00, A00, A10)], [kron3(A00, A10, A00)], [kron3(A10, A00, A00)]]
+        elif formdegree == 1:
+            zero = PETSc.Mat()
+            A_blocks = [[kron3(A00, A10, A11, scale=-1), kron3(A00, A11, A10), zero],
+                        [kron3(A10, A00, A11, scale=-1), zero, kron3(A11, A00, A10)],
+                        [zero, kron3(A10, A11, A00), kron3(A11, A10, A00, scale=-1)]]
+        elif formdegree == 2:
+            A_blocks = [[kron3(A10, A11, A11, scale=-1), kron3(A11, A10, A11), kron3(A11, A11, A10)]]
+    return A_blocks
+
+
+def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None):
+    """
+    Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix.
+    Works for any tensor-product basis. These are the same matrices one needs for HypreAMS and friends.
+    """
+    if comm is None:
+        comm = Vf.comm
+    ec = Vc.finat_element
+    ef = Vf.finat_element
+    if ef.formdegree - ec.formdegree != 1:
+        raise ValueError("Expecting Vf = d(Vc)")
+
+    elements = sorted(get_base_elements(ec), key=lambda e: e.formdegree)
+    c0, c1 = elements[::len(elements)-1]
+    elements = sorted(get_base_elements(ef), key=lambda e: e.formdegree)
+    f0, f1 = elements[::len(elements)-1]
+    if f0.formdegree != 0:
+        f0 = None
+    if c1.formdegree != 1:
+        c1 = None
+
+    tdim = Vc.mesh().topological_dimension()
+    zero = PETSc.Mat()
+    A00 = petsc_sparse(evaluate_dual(c0, f0), comm=PETSc.COMM_SELF) if f0 else zero
+    A11 = petsc_sparse(evaluate_dual(c1, f1), comm=PETSc.COMM_SELF) if c1 else zero
+    A10 = petsc_sparse(evaluate_dual(c0, f1, alpha=(1,)), comm=PETSc.COMM_SELF)
+    Dhat = block_mat(diff_blocks(tdim, ec.formdegree, A00, A11, A10), destroy_blocks=True)
+    A00.destroy()
+    A10.destroy()
+    A11.destroy()
+
+    if any(is_restricted(ec)) or any(is_restricted(ef)):
+        scalar_element = lambda e: e._sub_element if isinstance(e, (ufl.TensorElement, ufl.VectorElement)) else e
+        fdofs = restricted_dofs(ef, create_element(unrestrict_element(scalar_element(Vf.ufl_element()))))
+        cdofs = restricted_dofs(ec, create_element(unrestrict_element(scalar_element(Vc.ufl_element()))))
+        temp = Dhat
+        fises = PETSc.IS().createGeneral(fdofs, comm=temp.getComm())
+        cises = PETSc.IS().createGeneral(cdofs, comm=temp.getComm())
+        Dhat = temp.createSubMatrix(fises, cises)
+        temp.destroy()
+        fises.destroy()
+        cises.destroy()
+
+    if Vf.value_size > 1:
+        temp = Dhat
+        eye = petsc_sparse(numpy.eye(Vf.value_size, dtype=PETSc.RealType), comm=temp.getComm())
+        Dhat = temp.kron(eye)
+        temp.destroy()
+        eye.destroy()
+
+    sizes = tuple(V.dof_dset.layout_vec.getSizes() for V in (Vf, Vc))
+    block_size = Vf.dof_dset.layout_vec.getBlockSize()
+    preallocator = PETSc.Mat().create(comm=comm)
+    preallocator.setType(PETSc.Mat.Type.PREALLOCATOR)
+    preallocator.setSizes(sizes)
+    preallocator.setUp()
+
+    insert = PETSc.InsertMode.INSERT
+    rmap = Vf.local_to_global_map(fbcs)
+    cmap = Vc.local_to_global_map(cbcs)
+    assembler = SparseAssembler(ElementKernel(Dhat), Vf, Vc, rmap, cmap)
+    assembler.assemble(preallocator, addv=insert)
+    preallocator.assemble()
+
+    nnz = get_preallocation(preallocator, sizes[0][0])
+    preallocator.destroy()
+    Dmat = PETSc.Mat().createAIJ(sizes, block_size, nnz=nnz, comm=comm)
+    Dmat.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True)
+
+    assembler.assemble(Dmat, addv=insert)
+    Dmat.assemble()
+    Dhat.destroy()
+    return Dmat
+
+
+def restrict_element(ele, restriction_domain):
+    """Get an element that is not restricted and return the restricted element."""
+    if isinstance(ele, ufl.VectorElement):
+        return type(ele)(restrict_element(ele._sub_element, restriction_domain), dim=ele.num_sub_elements())
+    elif isinstance(ele, ufl.TensorElement):
+        return type(ele)(restrict_element(ele._sub_element, restriction_domain), shape=ele._shape, symmetry=ele.symmetry())
+    elif isinstance(ele, ufl.MixedElement):
+        return type(ele)(*(restrict_element(e, restriction_domain) for e in ele.sub_elements()))
+    else:
+        return ele[restriction_domain]
+
+
+def unrestrict_element(ele):
+    """Get an element that might or might not be restricted and
+       return the parent unrestricted element."""
+    if isinstance(ele, ufl.VectorElement):
+        return type(ele)(unrestrict_element(ele._sub_element), dim=ele.num_sub_elements())
+    elif isinstance(ele, ufl.TensorElement):
+        return type(ele)(unrestrict_element(ele._sub_element), shape=ele._shape, symmetry=ele.symmetry())
+    elif isinstance(ele, ufl.MixedElement):
+        return type(ele)(*(unrestrict_element(e) for e in ele.sub_elements()))
+    elif isinstance(ele, ufl.RestrictedElement):
+        return unrestrict_element(ele._element)
+    else:
+        return ele
+
+
+def get_base_elements(e):
+    if isinstance(e, finat.EnrichedElement):
+        return list(chain.from_iterable(map(get_base_elements, e.elements)))
+    elif isinstance(e, finat.TensorProductElement):
+        return list(chain.from_iterable(map(get_base_elements, e.factors)))
+    elif isinstance(e, finat.FlattenedDimensions):
+        return get_base_elements(e.product)
+    elif isinstance(e, (finat.HCurlElement, finat.HDivElement)):
+        return get_base_elements(e.wrappee)
+    elif isinstance(e, finat.finiteelementbase.FiniteElementBase):
+        return get_base_elements(e.fiat_equivalent)
+    elif isinstance(e, FIAT.RestrictedElement):
+        return get_base_elements(e._element)
+    return [e]
+
+
+class PoissonFDMPC(FDMPC):
+    """
+    A preconditioner for tensor-product elements that changes the shape
+    functions so that the H^1 Riesz map is sparse in the interior of a
+    Cartesian cell, and assembles a global sparse matrix on which other
+    preconditioners, such as `ASMStarPC`, can be applied.
+
+    Here we assume that the volume integrals in the Jacobian can be expressed as:
+
+    inner(grad(v), alpha(grad(u)))*dx + inner(v, beta(u))*dx
+
+    where alpha and beta are possibly tensor-valued.
+    The sparse matrix is obtained by approximating alpha and beta by cell-wise
+    constants and discarding the coefficients in alpha that couple together
+    mixed derivatives and mixed components.
+
+    For spaces that are not H^1-conforming, this preconditioner will use
+    the symmetric interior-penalty DG method. The penalty coefficient can be
+    provided in the application context, keyed on ``"eta"``.
+    """
+
+    _variant = "fdm_ipdg"
+    _citation = "Brubeck2022a"
+
+    def assemble_reference_tensor(self, V):
         try:
-            line_elements = get_line_elements(V)
+            _, line_elements, shifts = get_permutation_to_line_elements(V)
         except ValueError:
             raise ValueError("FDMPC does not support the element %s" % V.ufl_element())
 
+        line_elements, = line_elements
+        axes_shifts, = shifts
+
         degree = max(e.degree() for e in line_elements)
-        eta = float(appctx.get("eta", (degree+1)**2))
-        quad_degree = 2*degree+1
+        eta = float(self.appctx.get("eta", degree*(degree+1)))
         element = V.finat_element
         is_dg = element.entity_dofs() == element.entity_closure_dofs()
 
         Afdm = []  # sparse interval mass and stiffness matrices for each direction
         Dfdm = []  # tabulation of normal derivatives at the boundary for each direction
         bdof = []  # indices of point evaluation dofs for each direction
+        cache = self._cache.setdefault("ipdg_reference_tensor", {})
         for e in line_elements:
-            Afdm[:0], Dfdm[:0], bdof[:0] = tuple(zip(fdm_setup_ipdg(e, eta)))
-            if not (e.formdegree or is_dg):
+            key = (e.degree(), eta)
+            try:
+                rtensor = cache[key]
+            except KeyError:
+                rtensor = cache.setdefault(key, fdm_setup_ipdg(e, eta, comm=PETSc.COMM_SELF))
+            Afdm[:0], Dfdm[:0], bdof[:0] = tuple(zip(rtensor))
+            if not is_dg and e.degree() == degree:
+                # do not apply SIPG along continuous directions
                 Dfdm[0] = None
+        return Afdm, Dfdm, bdof, axes_shifts
 
-        # coefficients w.r.t. the reference values
-        coefficients, self.assembly_callables = self.assemble_coef(J, quad_degree)
-        # set arbitrary non-zero coefficients for preallocation
-        for coef in coefficients.values():
-            with coef.dat.vec as cvec:
-                cvec.set(1.0E0)
-
-        bcflags = get_weak_bc_flags(J)
-
-        # preallocate by calling the assembly routine on a PREALLOCATOR Mat
-        sizes = (V.dof_dset.layout_vec.getSizes(),)*2
-        block_size = V.dof_dset.layout_vec.getBlockSize()
-        prealloc = PETSc.Mat().create(comm=self.comm)
-        prealloc.setType(PETSc.Mat.Type.PREALLOCATOR)
-        prealloc.setSizes(sizes)
-        prealloc.setUp()
-        self.assemble_kron(prealloc, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags)
-        nnz = get_preallocation(prealloc, block_size * V.dof_dset.set.size)
-        Pmat = PETSc.Mat().createAIJ(sizes, block_size, nnz=nnz, comm=self.comm)
-        Pmat.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True)
-        assemble_P = partial(self.assemble_kron, Pmat, V, bcs, eta,
-                             coefficients, Afdm, Dfdm, bdof, bcflags)
-        prealloc.destroy()
-        return Pmat, assemble_P
-
-    def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags):
+    @PETSc.Log.EventDecorator("FDMSetValues")
+    def set_values(self, A, Vrow, Vcol, addv, triu=False):
         """
         Assemble the stiffness matrix in the FDM basis using Kronecker products of interval matrices
 
         :arg A: the :class:`PETSc.Mat` to assemble
-        :arg V: the :class:`~.FunctionSpace` of the form arguments
-        :arg bcs: an iterable of :class:`~.DirichletBC` s
-        :arg eta: a ``float`` penalty parameter for the symmetric interior penalty method
-        :arg coefficients: a ``dict`` mapping strings to :class:`firedrake.function.Function` s with the form coefficients
-        :arg Afdm: the list with sparse interval matrices
-        :arg Dfdm: the list with normal derivatives matrices
-        :arg bcflags: the :class:`numpy.ndarray` with BC facet flags returned by ``get_weak_bc_flags``
+        :arg Vrow: the :class:`.FunctionSpace` test space
+        :arg Vcol: the :class:`.FunctionSpace` trial space
+        :arg addv: a `PETSc.Mat.InsertMode`
+        :arg triu: are we assembling only the upper triangular part?
         """
-        from firedrake.preconditioners.pmg import get_axes_shift
-        Gq = coefficients.get("Gq")
-        Bq = coefficients.get("Bq")
-        Gq_facet = coefficients.get("Gq_facet")
-        PT_facet = coefficients.get("PT_facet")
-
-        imode = PETSc.InsertMode.ADD_VALUES
-        lgmap = V.local_to_global_map(bcs)
-
+        set_submat = SparseAssembler.setSubMatCSR(PETSc.COMM_SELF, triu=triu)
+        update_A = lambda A, Ae, rindices: set_submat(A, Ae, rindices, rindices, addv)
+        condense_element_mat = lambda x: x
+
+        def cell_to_global(lgmap, cell_to_local, cell_index, result=None):
+            # Be careful not to create new arrays
+            result = cell_to_local(cell_index, result=result)
+            return lgmap.apply(result, result=result)
+
+        bsize = Vrow.dof_dset.layout_vec.getBlockSize()
+        cell_to_local, nel = extrude_node_map(Vrow.cell_node_map(), bsize=bsize)
+        get_rindices = partial(cell_to_global, self.lgmaps[Vrow], cell_to_local)
+        Afdm, Dfdm, bdof, axes_shifts = self.assemble_reference_tensor(Vrow)
+
+        Gq = self.coefficients.get("alpha")
+        Bq = self.coefficients.get("beta")
+        bcflags = self.coefficients.get("bcflags")
+        Gq_facet = self.coefficients.get("Gq_facet")
+        PT_facet = self.coefficients.get("PT_facet")
+
+        V = Vrow
         bsize = V.value_size
         ncomp = V.ufl_element().reference_value_size()
         sdim = (V.finat_element.space_dimension() * bsize) // ncomp  # dimension of a single component
-        ndim = V.ufl_domain().topological_dimension()
-        shift = get_axes_shift(V.finat_element) % ndim
+        tdim = V.mesh().topological_dimension()
+        shift = axes_shifts * bsize
 
-        index_cell, nel = glonum_fun(V.cell_node_map())
-        index_coef, _ = glonum_fun(Gq.cell_node_map())
-        flag2id = numpy.kron(numpy.eye(ndim, ndim, dtype=PETSc.IntType), [[1], [2]])
+        index_coef, _ = extrude_node_map((Gq or Bq).cell_node_map())
+        index_bc, _ = extrude_node_map(bcflags.cell_node_map())
+        flag2id = numpy.kron(numpy.eye(tdim, tdim, dtype=PETSc.IntType), [[1], [2]])
 
         # pshape is the shape of the DOFs in the tensor product
         pshape = tuple(Ak[0].size[0] for Ak in Afdm)
-        if shift:
-            assert ncomp == ndim
-            pshape = [tuple(numpy.roll(pshape, -shift*k)) for k in range(ncomp)]
-
-        if A.getType() != PETSc.Mat.Type.PREALLOCATOR:
-            A.zeroEntries()
-            for assemble_coef in self.assembly_callables:
-                assemble_coef()
+        static_condensation = False
+        if sdim != numpy.prod(pshape):
+            static_condensation = True
 
-        # insert the identity in the Dirichlet rows and columns
-        for row in V.dof_dset.lgmap.indices[lgmap.indices < 0]:
-            A.setValue(row, row, 1.0E0, imode)
+        if set(shift) != {0}:
+            assert ncomp == tdim
+            pshape = [tuple(numpy.roll(pshape, -shift[k])) for k in range(ncomp)]
 
         # assemble zero-th order term separately, including off-diagonals (mixed components)
         # I cannot do this for hdiv elements as off-diagonals are not sparse, this is because
-        # the FDM eigenbases for GLL(N) and GLL(N-1) are not orthogonal to each other
-        use_diag_Bq = Bq is None or len(Bq.ufl_shape) != 2
+        # the FDM eigenbases for CG(k) and CG(k-1) are not orthogonal to each other
+        use_diag_Bq = Bq is None or len(Bq.ufl_shape) != 2 or static_condensation
+        rindices = None
         if not use_diag_Bq:
             bshape = Bq.ufl_shape
             # Be = Bhat kron ... kron Bhat
             Be = Afdm[0][0].copy()
-            for k in range(1, ndim):
+            for k in range(1, tdim):
                 Be = Be.kron(Afdm[k][0])
 
             aptr = numpy.arange(0, (bshape[0]+1)*bshape[1], bshape[1], dtype=PETSc.IntType)
@@ -304,64 +1427,80 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags)
                 adata = numpy.sum(Bq.dat.data_ro[index_coef(e)], axis=0)
                 Ae = PETSc.Mat().createAIJWithArrays(bshape, (aptr, aidx, adata), comm=PETSc.COMM_SELF)
                 Ae = Be.kron(Ae)
-
-                ie = index_cell(e)
-                ie = numpy.repeat(ie*bsize, bsize) + numpy.tile(numpy.arange(bsize, dtype=ie.dtype), len(ie))
-                rows = lgmap.apply(ie)
-                set_submat_csr(A, Ae, rows, imode)
+                rindices = get_rindices(e, result=rindices)
+                update_A(A, Ae, rindices)
                 Ae.destroy()
             Be.destroy()
             Bq = None
 
         # assemble the second order term and the zero-th order term if any,
         # discarding mixed derivatives and mixed components
+        ae = numpy.zeros((ncomp, tdim), dtype=PETSc.RealType)
+        be = numpy.zeros((ncomp,), dtype=PETSc.RealType)
+        je = None
         for e in range(nel):
-            ie = numpy.reshape(index_cell(e), (ncomp//bsize, -1))
-            je = index_coef(e)
-            bce = bcflags[e]
-
-            # get second order coefficient on this cell
-            mue = numpy.atleast_1d(numpy.sum(Gq.dat.data_ro[je], axis=0))
+            je = index_coef(e, result=je)
+            bce = bcflags.dat.data_ro_with_halos[index_bc(e)] > 1E-8
+            # get coefficients on this cell
+            if Gq is not None:
+                ae[:] = numpy.sum(Gq.dat.data_ro[je], axis=0)
             if Bq is not None:
-                # get zero-th order coefficient on this cell
-                bqe = numpy.atleast_1d(numpy.sum(Bq.dat.data_ro[je], axis=0))
+                be[:] = numpy.sum(Bq.dat.data_ro[je], axis=0)
 
+            rindices = get_rindices(e, result=rindices)
+            rows = numpy.reshape(rindices, (-1, bsize))
+            rows = numpy.transpose(rows)
+            rows = numpy.reshape(rows, (ncomp, -1))
+            # for each component: compute the stiffness matrix Ae
             for k in range(ncomp):
                 # permutation of axes with respect to the first vector component
-                axes = numpy.roll(numpy.arange(ndim), -shift*k)
-                # for each component: compute the stiffness matrix Ae
-                muk = mue[k] if len(mue.shape) == 2 else mue
+                axes = numpy.roll(numpy.arange(tdim), -shift[k])
                 bck = bce[:, k] if len(bce.shape) == 2 else bce
                 fbc = numpy.dot(bck, flag2id)
 
-                # Ae = mue[k][0] Ahat + bqe[k] Bhat
-                Be = Afdm[axes[0]][0].copy()
-                Ae = Afdm[axes[0]][1+fbc[0]].copy()
-                Ae.scale(muk[0])
-                if Bq is not None:
-                    Ae.axpy(bqe[k], Be)
-
-                if ndim > 1:
-                    # Ae = Ae kron Bhat + mue[k][1] Bhat kron Ahat
-                    Ae = Ae.kron(Afdm[axes[1]][0])
-                    Ae.axpy(muk[1], Be.kron(Afdm[axes[1]][1+fbc[1]]))
-                    if ndim > 2:
-                        # Ae = Ae kron Bhat + mue[k][2] Bhat kron Bhat kron Ahat
-                        Be = Be.kron(Afdm[axes[1]][0])
-                        Ae = Ae.kron(Afdm[axes[2]][0])
-                        Ae.axpy(muk[2], Be.kron(Afdm[axes[2]][1+fbc[2]]))
-
-                rows = lgmap.apply(ie[0]*bsize+k if bsize == ncomp else ie[k])
-                set_submat_csr(A, Ae, rows, imode)
+                if Gq is not None:
+                    # Ae = ae[k][0] Ahat + be[k] Bhat
+                    Be = Afdm[axes[0]][0].copy()
+                    Ae = Afdm[axes[0]][1+fbc[0]].copy()
+                    Ae.scale(ae[k][0])
+                    if Bq is not None:
+                        Ae.axpy(be[k], Be)
+
+                    if tdim > 1:
+                        # Ae = Ae kron Bhat + ae[k][1] Bhat kron Ahat
+                        Ae = Ae.kron(Afdm[axes[1]][0])
+                        if Gq is not None:
+                            Ae.axpy(ae[k][1], Be.kron(Afdm[axes[1]][1+fbc[1]]))
+
+                        if tdim > 2:
+                            # Ae = Ae kron Bhat + ae[k][2] Bhat kron Bhat kron Ahat
+                            Be = Be.kron(Afdm[axes[1]][0])
+                            Ae = Ae.kron(Afdm[axes[2]][0])
+                            if Gq is not None:
+                                Ae.axpy(ae[k][2], Be.kron(Afdm[axes[2]][1+fbc[2]]))
+                    Be.destroy()
+
+                elif Bq is not None:
+                    Ae = Afdm[axes[0]][0]
+                    for m in range(1, tdim):
+                        Ae = Ae.kron(Afdm[axes[m]][0])
+                    Ae.scale(be[k])
+
+                Ae = condense_element_mat(Ae)
+                update_A(A, Ae, rows[k].astype(PETSc.IntType))
                 Ae.destroy()
-                Be.destroy()
 
         # assemble SIPG interior facet terms if the normal derivatives have been set up
         if any(Dk is not None for Dk in Dfdm):
-            if ndim < V.ufl_domain().geometric_dimension():
+            if static_condensation:
+                raise NotImplementedError("Static condensation for SIPG not implemented")
+            if tdim < V.mesh().geometric_dimension():
                 raise NotImplementedError("SIPG on immersed meshes is not implemented")
-            index_facet, local_facet_data, nfacets = get_interior_facet_maps(V)
-            index_coef, _, _ = get_interior_facet_maps(Gq_facet or Gq)
+            eta = float(self.appctx.get("eta"))
+
+            lgmap = self.lgmaps[V]
+            index_facet, local_facet_data, nfacets = extrude_interior_facet_maps(V)
+            index_coef, _, _ = extrude_interior_facet_maps(Gq_facet or Gq)
             rows = numpy.zeros((2, sdim), dtype=PETSc.IntType)
 
             for e in range(nfacets):
@@ -373,8 +1512,8 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags)
 
                 if PT_facet:
                     icell = numpy.reshape(lgmap.apply(ie), (2, ncomp, -1))
-                    iord0 = numpy.insert(numpy.delete(numpy.arange(ndim), idir[0]), 0, idir[0])
-                    iord1 = numpy.insert(numpy.delete(numpy.arange(ndim), idir[1]), 0, idir[1])
+                    iord0 = numpy.insert(numpy.delete(numpy.arange(tdim), idir[0]), 0, idir[0])
+                    iord1 = numpy.insert(numpy.delete(numpy.arange(tdim), idir[1]), 0, idir[1])
                     je = je[[0, 1], lfd]
                     Pfacet = PT_facet.dat.data_ro_with_halos[je]
                     Gfacet = Gq_facet.dat.data_ro_with_halos[je]
@@ -382,14 +1521,14 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags)
                     Gfacet = numpy.sum(Gq.dat.data_ro_with_halos[je], axis=1)
 
                 for k in range(ncomp):
-                    axes = numpy.roll(numpy.arange(ndim), -shift*k)
+                    axes = numpy.roll(numpy.arange(tdim), -shift[k])
                     Dfacet = Dfdm[axes[0]]
                     if Dfacet is None:
                         continue
 
                     if PT_facet:
-                        k0 = iord0[k] if shift != 1 else ndim-1-iord0[-k-1]
-                        k1 = iord1[k] if shift != 1 else ndim-1-iord1[-k-1]
+                        k0 = iord0[k] if shift[1] != 1 else tdim-1-iord0[-k-1]
+                        k1 = iord1[k] if shift[1] != 1 else tdim-1-iord1[-k-1]
                         Piola = Pfacet[[0, 1], [k0, k1]]
                         mu = Gfacet[[0, 1], idir]
                     else:
@@ -424,10 +1563,10 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags)
                             Adense[ii, j0:j1] -= smu[j] * Dfacet[:, jface % 2]
 
                     Ae = numpy_to_petsc(Adense, dense_indices, diag=False)
-                    if ndim > 1:
+                    if tdim > 1:
                         # assume that the mesh is oriented
                         Ae = Ae.kron(Afdm[axes[1]][0])
-                        if ndim > 2:
+                        if tdim > 2:
                             Ae = Ae.kron(Afdm[axes[2]][0])
 
                     if bsize == ncomp:
@@ -439,42 +1578,32 @@ def assemble_kron(self, A, V, bcs, eta, coefficients, Afdm, Dfdm, bdof, bcflags)
                         rows[0] = pull_axis(icell[0][k0], pshape[k0], idir[0])
                         rows[1] = pull_axis(icell[1][k1], pshape[k1], idir[1])
 
-                    set_submat_csr(A, Ae, rows, imode)
+                    update_A(A, Ae, rows)
                     Ae.destroy()
-        A.assemble()
-
-    def assemble_coef(self, J, quad_deg, discard_mixed=True, cell_average=True):
-        """
-        Return the coefficients of the Jacobian form arguments and their gradient with respect to the reference coordinates.
 
-        :arg J: the Jacobian bilinear form
-        :arg quad_deg: the quadrature degree used for the coefficients
-        :arg discard_mixed: discard entries in second order coefficient with mixed derivatives and mixed components
-        :arg cell_average: to return the coefficients as DG_0 Functions
-
-        :returns: a 2-tuple of
-            coefficients: a dictionary mapping strings to :class:`firedrake.function.Function` s with the coefficients of the form,
-            assembly_callables: a list of assembly callables for each coefficient of the form
-        """
-        from ufl import inner, diff
-        from ufl.algorithms.ad import expand_derivatives
+    @PETSc.Log.EventDecorator("FDMCoefficients")
+    def assemble_coefficients(self, J, fcp):
+        from firedrake.assemble import OneFormAssembler
         coefficients = {}
         assembly_callables = []
 
-        mesh = J.ufl_domain()
+        args_J = J.arguments()
+        V = args_J[-1].function_space()
+        mesh = V.mesh()
         tdim = mesh.topological_dimension()
         Finv = ufl.JacobianInverse(mesh)
-        dx = firedrake.dx(degree=quad_deg)
 
-        if cell_average:
-            family = "Discontinuous Lagrange" if tdim == 1 else "DQ"
-            degree = 0
-        else:
-            family = "Quadrature"
-            degree = quad_deg
+        degree = V.ufl_element().degree()
+        try:
+            degree = max(degree)
+        except TypeError:
+            pass
+        quad_deg = fcp.get("degree", 2*degree+1)
+        dx = ufl.dx(degree=quad_deg, domain=mesh)
+        family = "Discontinuous Lagrange" if tdim == 1 else "DQ"
+        DG = ufl.FiniteElement(family, mesh.ufl_cell(), degree=0)
 
         # extract coefficients directly from the bilinear form
-        args_J = J.arguments()
         integrals_J = J.integrals_by_type("cell")
         mapping = args_J[0].ufl_element().mapping().lower()
         Piola = get_piola_tensor(mapping, mesh)
@@ -485,86 +1614,103 @@ def assemble_coef(self, J, quad_deg, discard_mixed=True, cell_average=True):
             replace_grad = {ufl.grad(t): ufl.dot(Piola, ufl.dot(dt, Finv)) for t, dt in zip(args_J, ref_grad)}
         else:
             replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)}
+        alpha = expand_derivatives(sum([ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad),
+                                                 ref_grad[0]), ref_grad[1]) for i in integrals_J]))
+        # discard mixed derivatives and mixed components
+        if len(alpha.ufl_shape) == 2:
+            alpha = ufl.diag_vector(alpha)
+        else:
+            ashape = alpha.ufl_shape
+            ashape = ashape[:len(ashape)//2]
+            alpha = ufl.as_tensor(numpy.reshape([alpha[i+i] for i in numpy.ndindex(ashape)], (ashape[0], -1)))
 
-        alpha = expand_derivatives(sum([diff(diff(ufl.replace(i.integrand(), replace_grad),
-                                             ref_grad[0]), ref_grad[1]) for i in integrals_J]))
+        # assemble second order coefficient
+        if not isinstance(alpha, ufl.constantvalue.Zero):
+            Q = FunctionSpace(mesh, ufl.TensorElement(DG, shape=alpha.ufl_shape))
+            tensor = coefficients.setdefault("alpha", Function(Q))
+            assembly_callables.append(OneFormAssembler(ufl.inner(TestFunction(Q), alpha)*dx, tensor=tensor,
+                                                       form_compiler_parameters=fcp).assemble)
 
         # get zero-th order coefficent
         ref_val = [ufl.variable(t) for t in args_J]
         if Piola:
-            dummy_element = ufl.TensorElement("DQ", cell=mesh.ufl_cell(), degree=1, shape=Piola.ufl_shape)
+            dummy_element = ufl.TensorElement(family, cell=mesh.ufl_cell(), degree=1, shape=Piola.ufl_shape)
             dummy_Piola = ufl.Coefficient(ufl.FunctionSpace(mesh, dummy_element))
             replace_val = {t: ufl.dot(dummy_Piola, s) for t, s in zip(args_J, ref_val)}
         else:
             replace_val = {t: s for t, s in zip(args_J, ref_val)}
-
-        beta = expand_derivatives(sum([diff(diff(ufl.replace(i.integrand(), replace_val),
-                                            ref_val[0]), ref_val[1]) for i in integrals_J]))
+        beta = expand_derivatives(sum(ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_val),
+                                               ref_val[0]), ref_val[1]) for i in integrals_J))
         if Piola:
             beta = ufl.replace(beta, {dummy_Piola: Piola})
-
-        G = alpha
-        if discard_mixed:
-            # discard mixed derivatives and mixed components
-            if len(G.ufl_shape) == 2:
-                G = ufl.diag_vector(G)
-            else:
-                Gshape = G.ufl_shape
-                Gshape = Gshape[:len(Gshape)//2]
-                G = ufl.as_tensor(numpy.reshape([G[i+i] for i in numpy.ndindex(Gshape)], (Gshape[0], -1)))
-            Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default", shape=G.ufl_shape)
-        else:
-            Qe = ufl.TensorElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default", shape=G.ufl_shape, symmetry=True)
-
-        # assemble second order coefficient
-        Q = firedrake.FunctionSpace(mesh, Qe)
-        q = firedrake.TestFunction(Q)
-        Gq = firedrake.Function(Q)
-        coefficients["Gq"] = Gq
-        assembly_callables.append(partial(firedrake.assemble, inner(G, q)*dx, Gq))
-
         # assemble zero-th order coefficient
         if not isinstance(beta, ufl.constantvalue.Zero):
             if Piola:
                 # keep diagonal
                 beta = ufl.diag_vector(beta)
-            shape = beta.ufl_shape
-            Qe = ufl.FiniteElement(family, mesh.ufl_cell(), degree=degree, quad_scheme="default")
-            if shape:
-                Qe = ufl.TensorElement(Qe, shape=shape)
-            Q = firedrake.FunctionSpace(mesh, Qe)
-            q = firedrake.TestFunction(Q)
-            Bq = firedrake.Function(Q)
-            coefficients["Bq"] = Bq
-            assembly_callables.append(partial(firedrake.assemble, inner(beta, q)*dx, Bq))
-
+            Q = FunctionSpace(mesh, ufl.TensorElement(DG, shape=beta.ufl_shape) if beta.ufl_shape else DG)
+            tensor = coefficients.setdefault("beta", Function(Q))
+            assembly_callables.append(OneFormAssembler(ufl.inner(TestFunction(Q), beta)*dx, tensor=tensor,
+                                                       form_compiler_parameters=fcp).assemble)
+
+        family = "CG" if tdim == 1 else "DGT"
+        degree = 1 if tdim == 1 else 0
+        DGT = ufl.BrokenElement(ufl.FiniteElement(family, cell=mesh.ufl_cell(), degree=degree))
         if Piola:
             # make DGT functions with the second order coefficient
             # and the Piola tensor for each side of each facet
             extruded = mesh.cell_set._extruded
-            dS_int = firedrake.dS_h(degree=quad_deg) + firedrake.dS_v(degree=quad_deg) if extruded else firedrake.dS(degree=quad_deg)
-            ele = ufl.BrokenElement(ufl.FiniteElement("DGT", mesh.ufl_cell(), 0))
+            dS_int = ufl.dS_h(degree=quad_deg) + ufl.dS_v(degree=quad_deg) if extruded else ufl.dS(degree=quad_deg)
             area = ufl.FacetArea(mesh)
+            ifacet_inner = lambda v, u: ((ufl.inner(v('+'), u('+')) + ufl.inner(v('-'), u('-')))/area)*dS_int
 
             replace_grad = {ufl.grad(t): ufl.dot(dt, Finv) for t, dt in zip(args_J, ref_grad)}
-            alpha = expand_derivatives(sum([diff(diff(ufl.replace(i.integrand(), replace_grad),
-                                                 ref_grad[0]), ref_grad[1]) for i in integrals_J]))
-            vol = abs(ufl.JacobianDeterminant(mesh))
-            G = vol * alpha
+            alpha = expand_derivatives(sum(ufl.diff(ufl.diff(ufl.replace(i.integrand(), replace_grad),
+                                                    ref_grad[0]), ref_grad[1]) for i in integrals_J))
+            G = alpha
             G = ufl.as_tensor([[[G[i, k, j, k] for i in range(G.ufl_shape[0])] for j in range(G.ufl_shape[2])] for k in range(G.ufl_shape[3])])
+            G = G * abs(ufl.JacobianDeterminant(mesh))
 
-            Q = firedrake.TensorFunctionSpace(mesh, ele, shape=G.ufl_shape)
-            q = firedrake.TestFunction(Q)
-            Gq_facet = firedrake.Function(Q)
-            coefficients["Gq_facet"] = Gq_facet
-            assembly_callables.append(partial(firedrake.assemble, ((inner(q('+'), G('+')) + inner(q('-'), G('-')))/area)*dS_int, Gq_facet))
-
+            Q = FunctionSpace(mesh, ufl.TensorElement(DGT, shape=G.ufl_shape))
+            tensor = coefficients.setdefault("Gq_facet", Function(Q))
+            assembly_callables.append(OneFormAssembler(ifacet_inner(TestFunction(Q), G), tensor=tensor,
+                                                       form_compiler_parameters=fcp).assemble)
             PT = Piola.T
-            Q = firedrake.TensorFunctionSpace(mesh, ele, shape=PT.ufl_shape)
-            q = firedrake.TestFunction(Q)
-            PT_facet = firedrake.Function(Q)
-            coefficients["PT_facet"] = PT_facet
-            assembly_callables.append(partial(firedrake.assemble, ((inner(q('+'), PT('+')) + inner(q('-'), PT('-')))/area)*dS_int, PT_facet))
+            Q = FunctionSpace(mesh, ufl.TensorElement(DGT, shape=PT.ufl_shape))
+            tensor = coefficients.setdefault("PT_facet", Function(Q))
+            assembly_callables.append(OneFormAssembler(ifacet_inner(TestFunction(Q), PT), tensor=tensor,
+                                                       form_compiler_parameters=fcp).assemble)
+
+        # make DGT functions with BC flags
+        shape = V.ufl_element().reference_value_shape()
+        Q = FunctionSpace(mesh, ufl.TensorElement(DGT, shape=shape) if shape else DGT)
+        test = TestFunction(Q)
+
+        ref_args = [ufl.variable(t) for t in args_J]
+        replace_args = {t: s for t, s in zip(args_J, ref_args)}
+
+        forms = []
+        md = {"quadrature_degree": 0}
+        for it in J.integrals():
+            itype = it.integral_type()
+            if itype.startswith("exterior_facet"):
+                beta = ufl.diff(ufl.diff(ufl.replace(it.integrand(), replace_args), ref_args[0]), ref_args[1])
+                beta = expand_derivatives(beta)
+                if beta.ufl_shape:
+                    beta = ufl.diag_vector(beta)
+                ds_ext = ufl.Measure(itype, domain=mesh, subdomain_id=it.subdomain_id(), metadata=md)
+                forms.append(ufl.inner(test, beta)*ds_ext)
+
+        tensor = coefficients.setdefault("bcflags", Function(Q))
+        if len(forms):
+            form = sum(forms)
+            if len(form.arguments()) == 1:
+                assembly_callables.append(OneFormAssembler(form, tensor=tensor,
+                                                           form_compiler_parameters=fcp).assemble)
+        # set arbitrary non-zero coefficients for preallocation
+        for coef in coefficients.values():
+            with coef.dat.vec as cvec:
+                cvec.set(1.0E0)
         return coefficients, assembly_callables
 
 
@@ -586,16 +1732,7 @@ def pull_axis(x, pshape, idir):
     return numpy.reshape(numpy.moveaxis(numpy.reshape(x.copy(), pshape), idir, 0), x.shape)
 
 
-def set_submat_csr(A_global, A_local, global_indices, imode):
-    """insert values from A_local to A_global on the diagonal block with indices global_indices"""
-    indptr, indices, data = A_local.getValuesCSR()
-    for i, row in enumerate(global_indices.flat):
-        i0 = indptr[i]
-        i1 = indptr[i+1]
-        A_global.setValues(row, global_indices.flat[indices[i0:i1]], data[i0:i1], imode)
-
-
-def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False):
+def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False, comm=None):
     """
     Create a SeqAIJ Mat from a dense matrix using the diagonal and a subset of rows and columns.
     If dense_indices is empty, then also include the off-diagonal corners of the matrix.
@@ -606,8 +1743,7 @@ def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False):
     nnz[dense_indices] = len(dense_indices) if block else n
 
     imode = PETSc.InsertMode.INSERT
-    A_petsc = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=PETSc.COMM_SELF)
-
+    A_petsc = PETSc.Mat().createAIJ(A_numpy.shape, nnz=(nnz, 0), comm=comm)
     idx = numpy.arange(n, dtype=PETSc.IntType)
     if block:
         values = A_numpy[dense_indices, :][:, dense_indices]
@@ -616,39 +1752,36 @@ def numpy_to_petsc(A_numpy, dense_indices, diag=True, block=False):
         for j in dense_indices:
             A_petsc.setValues(j, idx, A_numpy[j, :], imode)
             A_petsc.setValues(idx, j, A_numpy[:, j], imode)
-
     if diag:
         idx = idx[:, None]
         values = A_numpy.diagonal()[:, None]
         A_petsc.setValuesRCV(idx, idx, values, imode)
-
     A_petsc.assemble()
     return A_petsc
 
 
-@lru_cache(maxsize=10)
-def fdm_setup_ipdg(fdm_element, eta):
+def fdm_setup_ipdg(fdm_element, eta, comm=None):
     """
-    Setup for the fast diagonalization method for the IP-DG formulation.
+    Setup for the fast diagonalisation method for the IP-DG formulation.
     Compute sparsified interval stiffness and mass matrices
     and tabulate the normal derivative of the shape functions.
 
     :arg fdm_element: a :class:`FIAT.FDMElement`
     :arg eta: penalty coefficient as a `float`
+    :arg comm: a :class:`PETSc.Comm`
 
     :returns: 3-tuple of:
         Afdm: a list of :class:`PETSc.Mats` with the sparse interval matrices
         Bhat, and bcs(Ahat) for every combination of either natural or weak
         Dirichlet BCs on each endpoint.
         Dfdm: the tabulation of the normal derivatives of the Dirichlet eigenfunctions.
-        bdof: the indices of PointEvaluation dofs.
+        bdof: the indices of the vertex degrees of freedom.
     """
-    from FIAT.quadrature import GaussLegendreQuadratureLineRule
-    from FIAT.functional import PointEvaluation
     ref_el = fdm_element.get_reference_element()
     degree = fdm_element.degree()
-    rule = GaussLegendreQuadratureLineRule(ref_el, degree+1)
-    bdof = [k for k, f in enumerate(fdm_element.dual_basis()) if isinstance(f, PointEvaluation)]
+    rule = FIAT.quadrature.make_quadrature(ref_el, degree+1)
+    edof = fdm_element.entity_dofs()
+    bdof = edof[0][0] + edof[0][1]
 
     phi = fdm_element.tabulate(1, rule.get_points())
     Jhat = phi[(0, )]
@@ -661,7 +1794,7 @@ def fdm_setup_ipdg(fdm_element, eta):
     Dfacet = basis[(1,)]
     Dfacet[:, 0] = -Dfacet[:, 0]
 
-    Afdm = [numpy_to_petsc(Bhat, bdof, block=True)]
+    Afdm = [numpy_to_petsc(Bhat, bdof, block=True, comm=comm)]
     for bc in range(4):
         bcs = (bc % 2, bc//2)
         Abc = Ahat.copy()
@@ -671,23 +1804,24 @@ def fdm_setup_ipdg(fdm_element, eta):
                 Abc[:, j] -= Dfacet[:, k]
                 Abc[j, :] -= Dfacet[:, k]
                 Abc[j, j] += eta
-        Afdm.append(numpy_to_petsc(Abc, bdof))
+        Afdm.append(numpy_to_petsc(Abc, bdof, comm=comm))
     return Afdm, Dfacet, bdof
 
 
-@lru_cache(maxsize=10)
-def get_interior_facet_maps(V):
+def extrude_interior_facet_maps(V):
     """
-    Extrude V.interior_facet_node_map and V.ufl_domain().interior_facets.local_facet_dat
+    Extrude V.interior_facet_node_map and V.mesh().interior_facets.local_facet_dat
 
-    :arg V: a :class:`~.FunctionSpace`
+    :arg V: a :class:`.FunctionSpace`
 
     :returns: the 3-tuple of
         facet_to_nodes_fun: maps interior facets to the nodes of the two cells sharing it,
         local_facet_data_fun: maps interior facets to the local facet numbering in the two cells sharing it,
         nfacets: the total number of interior facets owned by this process
     """
-    mesh = V.ufl_domain()
+    if isinstance(V, Function):
+        V = V.function_space()
+    mesh = V.mesh()
     intfacets = mesh.interior_facets
     facet_to_cells = intfacets.facet_cell_map.values
     local_facet_data = intfacets.local_facet_dat.data_ro
@@ -747,90 +1881,63 @@ def get_interior_facet_maps(V):
     return facet_to_nodes_fun, local_facet_data_fun, nfacets
 
 
-@lru_cache(maxsize=10)
-def glonum_fun(node_map):
+def extrude_node_map(node_map, bsize=1):
     """
-    Return a function that maps each topological entity to its nodes and the total number of entities.
+    Construct a (possibly vector-valued) cell to node map from an un-extruded scalar map.
 
-    :arg node_map: a :class:`pyop2.Map` mapping entities to their nodes, including ghost entities.
+    :arg node_map: a :class:`pyop2.Map` mapping entities to their local dofs, including ghost entities.
+    :arg bsize: the block size
 
-    :returns: a 2-tuple with the map and the number of cells owned by this process
+    :returns: a 2-tuple with the cell to node map and the number of cells owned by this process
     """
-    nelv = node_map.values.shape[0]
+    nel = node_map.values.shape[0]
     if node_map.offset is None:
-        return lambda e: node_map.values_with_halo[e], nelv
+        def _scalar_map(map_values, e, result=None):
+            if result is None:
+                result = numpy.empty_like(map_values[e])
+            numpy.copyto(result, map_values[e])
+            return result
+
+        scalar_map = partial(_scalar_map, node_map.values_with_halo)
     else:
         layers = node_map.iterset.layers_array
         if layers.shape[0] == 1:
+            def _scalar_map(map_values, offset, nelz, e, result=None):
+                if result is None:
+                    result = numpy.empty_like(offset)
+                numpy.copyto(result, offset)
+                result *= (e % nelz)
+                result += map_values[e // nelz]
+                return result
+
             nelz = layers[0, 1]-layers[0, 0]-1
-            nel = nelz*nelv
-            return lambda e: node_map.values_with_halo[e//nelz] + (e % nelz)*node_map.offset, nel
+            nel *= nelz
+            scalar_map = partial(_scalar_map, node_map.values_with_halo, node_map.offset, nelz)
         else:
+            def _scalar_map(map_values, offset, to_base, to_layer, e, result=None):
+                if result is None:
+                    result = numpy.empty_like(offset)
+                numpy.copyto(result, offset)
+                result *= to_layer[e]
+                result += map_values[to_base[e]]
+                return result
+
             nelz = layers[:, 1]-layers[:, 0]-1
-            nel = sum(nelz[:nelv])
+            nel = sum(nelz[:nel])
             to_base = numpy.repeat(numpy.arange(node_map.values_with_halo.shape[0], dtype=node_map.offset.dtype), nelz)
             to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz])
-            return lambda e: node_map.values_with_halo[to_base[e]] + to_layer[e]*node_map.offset, nel
-
+            scalar_map = partial(_scalar_map, node_map.values_with_halo, node_map.offset, to_base, to_layer)
 
-def glonum(node_map):
-    """
-    Return an array with the nodes of each topological entity of a certain kind.
+    if bsize == 1:
+        return scalar_map, nel
 
-    :arg node_map: a :class:`pyop2.Map` mapping entities to their nodes, including ghost entities.
+    def vector_map(bsize, ibase, e, result=None):
+        index = None
+        if result is not None:
+            index = result[:, 0]
+        index = scalar_map(e, result=index)
+        index *= bsize
+        return numpy.add.outer(index, ibase, out=result)
 
-    :returns: a :class:`numpy.ndarray` whose rows are the nodes for each cell
-    """
-    if (node_map.offset is None) or (node_map.values_with_halo.size == 0):
-        return node_map.values_with_halo
-    else:
-        layers = node_map.iterset.layers_array
-        if layers.shape[0] == 1:
-            nelz = layers[0, 1]-layers[0, 0]-1
-            to_layer = numpy.tile(numpy.arange(nelz, dtype=node_map.offset.dtype), len(node_map.values_with_halo))
-        else:
-            nelz = layers[:, 1]-layers[:, 0]-1
-            to_layer = numpy.concatenate([numpy.arange(nz, dtype=node_map.offset.dtype) for nz in nelz])
-        return numpy.repeat(node_map.values_with_halo, nelz, axis=0) + numpy.kron(to_layer.reshape((-1, 1)), node_map.offset)
-
-
-def get_weak_bc_flags(J):
-    """
-    Return flags indicating whether the zero-th order coefficient on each facet of every cell is non-zero
-    """
-    from ufl.algorithms.ad import expand_derivatives
-    mesh = J.ufl_domain()
-    args_J = J.arguments()
-    V = args_J[0].function_space()
-    rvs = V.ufl_element().reference_value_shape()
-    cell = mesh.ufl_cell()
-    family = "CG" if cell.topological_dimension() == 1 else "DGT"
-    degree = 1 if cell.topological_dimension() == 1 else 0
-    Qe = ufl.FiniteElement(family, cell=cell, degree=degree)
-    if rvs:
-        Qe = ufl.TensorElement(Qe, shape=rvs)
-    Q = firedrake.FunctionSpace(mesh, Qe)
-    q = firedrake.TestFunction(Q)
-
-    ref_args = [ufl.variable(t) for t in args_J]
-    replace_args = {t: s for t, s in zip(args_J, ref_args)}
-
-    forms = []
-    md = {"quadrature_degree": 0}
-    for it in J.integrals():
-        itype = it.integral_type()
-        if itype.startswith("exterior_facet"):
-            beta = ufl.diff(ufl.diff(ufl.replace(it.integrand(), replace_args), ref_args[0]), ref_args[1])
-            beta = expand_derivatives(beta)
-            if rvs:
-                beta = ufl.diag_vector(beta)
-            ds_ext = ufl.Measure(itype, domain=mesh, subdomain_id=it.subdomain_id(), metadata=md)
-            forms.append(ufl.inner(q, beta)*ds_ext)
-
-    tol = 1E-8
-    if len(forms):
-        bq = firedrake.assemble(sum(forms))
-        fbc = bq.dat.data_with_halos[glonum(Q.cell_node_map())]
-        return (abs(fbc) > tol).astype(PETSc.IntType)
-    else:
-        return numpy.zeros(glonum(Q.cell_node_map()).shape, dtype=PETSc.IntType)
+    ibase = numpy.arange(bsize, dtype=node_map.values.dtype)
+    return partial(vector_map, bsize, ibase), nel
diff --git a/firedrake/preconditioners/gtmg.py b/firedrake/preconditioners/gtmg.py
index 35f1f1e570..d11cc548c1 100644
--- a/firedrake/preconditioners/gtmg.py
+++ b/firedrake/preconditioners/gtmg.py
@@ -29,7 +29,7 @@ def initialize(self, pc):
         if ctx is None:
             raise ValueError("No context found.")
         if not isinstance(ctx, _SNESContext):
-            raise ValueError("Don't know how to get form from %r", ctx)
+            raise ValueError("Don't know how to get form from %r" % ctx)
 
         prefix = pc.getOptionsPrefix()
         options_prefix = prefix + self._prefix
@@ -41,7 +41,7 @@ def initialize(self, pc):
             if ictx is None:
                 raise ValueError("No context found on matrix")
             if not isinstance(ictx, ImplicitMatrixContext):
-                raise ValueError("Don't know how to get form from %r", ictx)
+                raise ValueError("Don't know how to get form from %r" % ictx)
 
             fine_operator = ictx.a
             fine_bcs = ictx.row_bcs
@@ -70,7 +70,7 @@ def initialize(self, pc):
             fine_petscmat.setTransposeNullSpace(fine_transpose_nullspace)
 
         # Handle the coarse operator
-        coarse_options_prefix = options_prefix + "mg_coarse"
+        coarse_options_prefix = options_prefix + "mg_coarse_"
         coarse_mat_type = opts.getString(coarse_options_prefix + "mat_type",
                                          parameters["default_matrix_type"])
 
diff --git a/firedrake/preconditioners/hiptmair.py b/firedrake/preconditioners/hiptmair.py
index 317dc5614b..af1d1bab27 100644
--- a/firedrake/preconditioners/hiptmair.py
+++ b/firedrake/preconditioners/hiptmair.py
@@ -201,7 +201,7 @@ def coarsen(self, pc):
         if G_callback is None:
             interp_petscmat = chop(Interpolator(dminus(test), V, bcs=bcs + coarse_space_bcs).callable().handle)
         else:
-            interp_petscmat = G_callback(V, coarse_space, bcs, coarse_space_bcs)
+            interp_petscmat = G_callback(coarse_space, V, coarse_space_bcs, bcs)
 
         return coarse_operator, coarse_space_bcs, interp_petscmat
 
diff --git a/firedrake/preconditioners/hypre_ads.py b/firedrake/preconditioners/hypre_ads.py
index ca3728abdb..9cbc2537da 100644
--- a/firedrake/preconditioners/hypre_ads.py
+++ b/firedrake/preconditioners/hypre_ads.py
@@ -34,12 +34,12 @@ def initialize(self, obj):
         if G_callback is None:
             G = chop(Interpolator(grad(TestFunction(P1)), NC1).callable().handle)
         else:
-            G = G_callback(NC1, P1)
+            G = G_callback(P1, NC1)
         C_callback = appctx.get("get_curl", None)
         if C_callback is None:
             C = chop(Interpolator(curl(TestFunction(NC1)), V).callable().handle)
         else:
-            C = C_callback(V, NC1)
+            C = C_callback(NC1, V)
 
         pc = PETSc.PC().create(comm=obj.comm)
         pc.incrementTabLevel(1, parent=obj)
diff --git a/firedrake/preconditioners/hypre_ams.py b/firedrake/preconditioners/hypre_ams.py
index 8bfd14908e..a00334403b 100644
--- a/firedrake/preconditioners/hypre_ams.py
+++ b/firedrake/preconditioners/hypre_ams.py
@@ -54,7 +54,7 @@ def initialize(self, obj):
         if G_callback is None:
             G = chop(Interpolator(grad(TestFunction(P1)), V).callable().handle)
         else:
-            G = G_callback(V, P1)
+            G = G_callback(P1, V)
 
         pc = PETSc.PC().create(comm=obj.comm)
         pc.incrementTabLevel(1, parent=obj)
diff --git a/firedrake/preconditioners/patch.py b/firedrake/preconditioners/patch.py
index 08f538b876..f3e7e8123e 100644
--- a/firedrake/preconditioners/patch.py
+++ b/firedrake/preconditioners/patch.py
@@ -748,14 +748,14 @@ def initialize(self, obj):
         if ctx is None:
             raise ValueError("No context found on form")
         if not isinstance(ctx, _SNESContext):
-            raise ValueError("Don't know how to get form from %r", ctx)
+            raise ValueError("Don't know how to get form from %r" % ctx)
 
         if P.getType() == "python":
             ictx = P.getPythonContext()
             if ictx is None:
                 raise ValueError("No context found on matrix")
             if not isinstance(ictx, ImplicitMatrixContext):
-                raise ValueError("Don't know how to get form from %r", ictx)
+                raise ValueError("Don't know how to get form from %r" % ictx)
             J = ictx.a
             bcs = ictx.row_bcs
             if bcs != ictx.col_bcs:
diff --git a/firedrake/preconditioners/pmg.py b/firedrake/preconditioners/pmg.py
index 823849729c..ff302a94a2 100644
--- a/firedrake/preconditioners/pmg.py
+++ b/firedrake/preconditioners/pmg.py
@@ -1,20 +1,28 @@
-from functools import partial, lru_cache
+from functools import partial
 from itertools import chain
-from pyop2 import op2, PermutedMap
-from firedrake.preconditioners.base import PCBase, SNESBase, PCSNESBase
 from firedrake.dmhooks import (attach_hooks, get_appctx, push_appctx, pop_appctx,
                                add_hook, get_parent, push_parent, pop_parent,
                                get_function_space, set_function_space)
+from firedrake.petsc import PETSc
+from firedrake.preconditioners.base import PCBase, SNESBase, PCSNESBase
+from firedrake.nullspace import VectorSpaceBasis, MixedVectorSpaceBasis
 from firedrake.solving_utils import _SNESContext
 from firedrake.tsfc_interface import extract_numbered_coefficients
-from firedrake.utils import ScalarType_c, IntType_c
-from firedrake.petsc import PETSc
+from firedrake.utils import ScalarType_c, IntType_c, cached_property
+from tsfc.finatinterface import create_element
+from tsfc import compile_expression_dual_evaluation
+from pyop2 import op2
+from pyop2.caching import cached
+
 import firedrake
+import finat
+import FIAT
 import ufl
 import loopy
 import numpy
 import os
 import tempfile
+import weakref
 
 __all__ = ("PMGPC", "PMGSNES")
 
@@ -30,9 +38,10 @@ class PMGBase(PCSNESBase):
     or any other solver in firedrake may be applied to the coarse problem.
 
     Other PETSc options inspected by this class are:
-    - 'pmg_coarse_degree': polynomial degree of the coarse level
-    - 'pmg_coarse_mat_type': can be either 'aij' or 'matfree'
-    - 'pmg_coarse_form_compiler_mode': can be 'spectral' (default), 'vanilla', 'coffee', or 'tensor'
+    - 'pmg_mg_coarse_degree': polynomial degree of the coarse level
+    - 'pmg_mg_coarse_mat_type': can be either a `PETSc.Mat.Type`, or 'matfree'
+    - 'pmg_mg_coarse_pmat_type': can be either a `PETSc.Mat.Type`, or 'matfree'
+    - 'pmg_mg_coarse_form_compiler_mode': can be 'spectral' (default), 'vanilla', 'coffee', or 'tensor'
     - 'pmg_mg_levels_transfer_mat_type': can be either 'aij' or 'matfree'
 
     The p-coarsening is implemented in the `coarsen_element` routine.
@@ -48,6 +57,9 @@ class PMGBase(PCSNESBase):
     """
 
     _prefix = "pmg_"
+    # This is parallel-safe because the keys are ids of a collective objects
+    _coarsen_cache = weakref.WeakKeyDictionary()
+    _transfer_cache = weakref.WeakKeyDictionary()
 
     def coarsen_element(self, ele):
         """
@@ -72,32 +84,39 @@ def coarsen_form(self, form, fine_to_coarse_map):
         """
         return ufl.replace(form, fine_to_coarse_map)
 
-    def initialize(self, pc):
+    def initialize(self, obj):
         # Make a new DM.
         # Hook up a (new) coarsen routine on that DM.
-        # Make a new PC, of type MG.
-        # Assign the DM to that PC.
+        # Make a new PC, of type MG (or SNES of type FAS).
+        # Assign the DM to that PC (or SNES).
 
-        odm = pc.getDM()
+        odm = obj.getDM()
         ctx = get_appctx(odm)
+        if ctx is None:
+            raise ValueError("No context found.")
+        if not isinstance(ctx, _SNESContext):
+            raise ValueError("Don't know how to get form from %r" % ctx)
+        fcp = ctx._problem.form_compiler_parameters
+        mode = fcp.get("mode", "spectral") if fcp is not None else "spectral"
 
         test, trial = ctx.J.arguments()
         if test.function_space() != trial.function_space():
             raise NotImplementedError("test and trial spaces must be the same")
 
-        prefix = pc.getOptionsPrefix()
+        prefix = obj.getOptionsPrefix()
         options_prefix = prefix + self._prefix
-        opts = PETSc.Options(options_prefix)
-        pdm = PETSc.DMShell().create(comm=pc.comm)
+        pdm = PETSc.DMShell().create(comm=obj.comm)
         pdm.setOptionsPrefix(options_prefix)
 
+        ppc = self.configure_pmg(obj, pdm)
+        self.is_snes = isinstance(obj, PETSc.SNES)
+
         # Get the coarse degree from PETSc options
-        fcp = ctx._problem.form_compiler_parameters
-        mode = fcp.get("mode", "spectral") if fcp is not None else "spectral"
-        self.coarse_degree = opts.getInt("coarse_degree", default=1)
-        self.coarse_mat_type = opts.getString("coarse_mat_type", default=ctx.mat_type)
-        self.coarse_pmat_type = opts.getString("coarse_pmat_type", default=self.coarse_mat_type)
-        self.coarse_form_compiler_mode = opts.getString("coarse_form_compiler_mode", default=mode)
+        copts = PETSc.Options(ppc.getOptionsPrefix() + ppc.getType() + "_coarse_")
+        self.coarse_degree = copts.getInt("degree", default=1)
+        self.coarse_mat_type = copts.getString("mat_type", default=ctx.mat_type)
+        self.coarse_pmat_type = copts.getString("pmat_type", default=self.coarse_mat_type)
+        self.coarse_form_compiler_mode = copts.getString("form_compiler_mode", default=mode)
 
         # Construct a list with the elements we'll be using
         V = test.function_space()
@@ -118,12 +137,10 @@ def initialize(self, pc):
         # Now overwrite some routines on the DM
         pdm.setRefine(None)
         pdm.setCoarsen(self.coarsen)
-        pdm.setCreateInterpolation(self.create_interpolation)
-        # We need this for p-FAS
-        pdm.setCreateInjection(self.create_injection)
-        pdm.setSNESJacobian(_SNESContext.form_jacobian)
-        pdm.setSNESFunction(_SNESContext.form_function)
-        pdm.setKSPComputeOperators(_SNESContext.compute_operators)
+        if self.is_snes:
+            pdm.setSNESFunction(_SNESContext.form_function)
+            pdm.setSNESJacobian(_SNESContext.form_jacobian)
+            pdm.setKSPComputeOperators(_SNESContext.compute_operators)
 
         set_function_space(pdm, get_function_space(odm))
 
@@ -132,28 +149,28 @@ def initialize(self, pc):
         add_hook(parent, setup=partial(push_parent, pdm, parent), teardown=partial(pop_parent, pdm, parent), call_setup=True)
         add_hook(parent, setup=partial(push_appctx, pdm, ctx), teardown=partial(pop_appctx, pdm, ctx), call_setup=True)
 
-        self.ppc = self.configure_pmg(pc, pdm)
-        self.ppc.setFromOptions()
-        self.ppc.setUp()
+        ppc.incrementTabLevel(1, parent=obj)
+        ppc.setFromOptions()
+        ppc.setUp()
+        self.ppc = ppc
 
-    def update(self, pc):
-        pass
+    def update(self, obj):
+        self.ppc.setUp()
 
-    def view(self, pc, viewer=None):
+    def view(self, obj, viewer=None):
         if viewer is None:
             viewer = PETSc.Viewer.STDOUT
         viewer.printfASCII("p-multigrid PC\n")
-        self.ppc.view(viewer)
+        if hasattr(self, "ppc"):
+            self.ppc.view(viewer=viewer)
 
-    def destroy(self, pc):
+    def destroy(self, obj):
         if hasattr(self, "ppc"):
             self.ppc.destroy()
 
     def coarsen(self, fdm, comm):
         # Coarsen the _SNESContext of a DM fdm
         # return the coarse DM cdm of the coarse _SNESContext
-        from firedrake.nullspace import VectorSpaceBasis, MixedVectorSpaceBasis
-
         fctx = get_appctx(fdm)
         parent = get_parent(fdm)
         assert parent is not None
@@ -190,9 +207,13 @@ def _coarsen_form(a):
                              for f in a.integrals()])
             return a
 
-        cF = _coarsen_form(fctx.F)
         cJ = _coarsen_form(fctx.J)
-        cJp = _coarsen_form(fctx.Jp)
+        cJp = cJ if fctx.Jp is fctx.J else _coarsen_form(fctx.Jp)
+        # This fixes a subtle bug where you are applying PMGPC on a mixed
+        # problem with geometric multigrid only on one block and an non-Lagrange element
+        # on the other block (gmg breaks for non-Lagrange elements)
+        cF = _coarsen_form(fctx.F) if self.is_snes else ufl.action(cJ, cu)
+
         fcp = self.coarsen_quadrature(fproblem.form_compiler_parameters, fdeg, cdeg)
         cbcs = self.coarsen_bcs(fproblem.bcs, cV)
 
@@ -218,11 +239,7 @@ def _coarsen_form(a):
         except ValueError:
             mat_type = self.coarse_mat_type
             pmat_type = self.coarse_pmat_type
-            if fcp is None:
-                fcp = dict()
-            elif fcp is fproblem.form_compiler_parameters:
-                fcp = dict(fcp)
-            fcp["mode"] = self.coarse_form_compiler_mode
+            fcp = dict(fcp or {}, mode=self.coarse_form_compiler_mode)
 
         # Coarsen the problem and the _SNESContext
         cproblem = firedrake.NonlinearVariationalProblem(cF, cu, bcs=cbcs, J=cJ, Jp=cJp,
@@ -250,113 +267,123 @@ def _coarsen_form(a):
         cdm.setCreateInterpolation(self.create_interpolation)
         cdm.setCreateInjection(self.create_injection)
 
-        # injection of the initial state
-        def inject_state(mat):
-            with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf:
-                mat.multTranspose(xf, xc)
+        if cu in cJ.coefficients():
+            # Only inject state if the coarse state is a dependency of the coarse Jacobian.
+            inject = cdm.createInjection(fdm)
+
+            def inject_state():
+                with cu.dat.vec_wo as xc, fu.dat.vec_ro as xf:
+                    inject.mult(xf, xc)
+
+            add_hook(parent, setup=inject_state, call_setup=True)
+
+        interpolate = None
+        if fctx._nullspace or fctx._nullspace_T or fctx._near_nullspace:
+            interpolate, _ = cdm.createInterpolation(fdm)
+        cctx._nullspace = self.coarsen_nullspace(fctx._nullspace, cV, interpolate)
+        cctx._nullspace_T = self.coarsen_nullspace(fctx._nullspace_T, cV, interpolate)
+        cctx._near_nullspace = self.coarsen_nullspace(fctx._near_nullspace, cV, interpolate)
+        cctx.set_nullspace(cctx._nullspace, cV._ises, transpose=False, near=False)
+        cctx.set_nullspace(cctx._nullspace_T, cV._ises, transpose=True, near=False)
+        cctx.set_nullspace(cctx._near_nullspace, cV._ises, transpose=False, near=True)
+        return cdm
+
+    def coarsen_quadrature(self, metadata, fdeg, cdeg):
+        """Coarsen the quadrature degree in a dictionary preserving the ratio of
+           quadrature nodes to interpolation nodes (qdeg+1)//(fdeg+1)."""
+        try:
+            qdeg = metadata["quadrature_degree"]
+            coarse_qdeg = max(2*cdeg+1, ((qdeg+1)*(cdeg+1)+fdeg)//(fdeg+1)-1)
+            return dict(metadata, quadrature_degree=coarse_qdeg)
+        except (KeyError, TypeError):
+            return metadata
 
-        injection = self.create_injection(cdm, fdm)
-        add_hook(parent, setup=partial(inject_state, injection), call_setup=True)
+    def coarsen_bcs(self, fbcs, cV):
+        """Coarsen a list of bcs"""
+        cbcs = []
+        for bc in fbcs:
+            cache = self._coarsen_cache.setdefault(bc, {})
+            key = (cV.ufl_element(), self.is_snes)
+            try:
+                coarse_bc = cache[key]
+            except KeyError:
+                cV_ = cV
+                for index in bc._indices:
+                    cV_ = cV_.sub(index)
+                cbc_value = self.coarsen_bc_value(bc, cV_)
+                if isinstance(bc, firedrake.DirichletBC):
+                    coarse_bc = cache.setdefault(key, bc.reconstruct(V=cV_, g=cbc_value))
+                else:
+                    raise NotImplementedError("Unsupported BC type, please get in touch if you need this")
+            cbcs.append(coarse_bc)
+        return cbcs
 
-        # coarsen the nullspace basis
-        def coarsen_nullspace(coarse_V, mat, fine_nullspace):
+    def coarsen_nullspace(self, fine_nullspace, cV, interpolate):
+        """Coarsen a nullspace"""
+        if fine_nullspace is None:
+            return fine_nullspace
+        cache = self._coarsen_cache.setdefault(fine_nullspace, {})
+        key = cV.ufl_element()
+        try:
+            return cache[key]
+        except KeyError:
             if isinstance(fine_nullspace, MixedVectorSpaceBasis):
-                if mat.type == 'python':
-                    mat = mat.getPythonContext()
-                submats = [mat.getNestSubMatrix(i, i) for i in range(len(coarse_V))]
+                if interpolate.getType() == "python":
+                    interpolate = interpolate.getPythonContext()
+                submats = [interpolate.getNestSubMatrix(i, i) for i in range(len(cV))]
                 coarse_bases = []
-                for fs, submat, basis in zip(coarse_V, submats, fine_nullspace._bases):
+                for fs, submat, basis in zip(cV, submats, fine_nullspace._bases):
                     if isinstance(basis, VectorSpaceBasis):
-                        coarse_bases.append(coarsen_nullspace(fs, submat, basis))
+                        coarse_bases.append(self.coarsen_nullspace(basis, fs, submat))
                     else:
-                        coarse_bases.append(coarse_V.sub(basis.index))
-                return MixedVectorSpaceBasis(coarse_V, coarse_bases)
+                        coarse_bases.append(cV.sub(basis.index))
+                coarse_nullspace = MixedVectorSpaceBasis(cV, coarse_bases)
             elif isinstance(fine_nullspace, VectorSpaceBasis):
                 coarse_vecs = []
                 for xf in fine_nullspace._petsc_vecs:
-                    wc = firedrake.Function(coarse_V)
+                    wc = firedrake.Function(cV)
                     with wc.dat.vec_wo as xc:
-                        if mat.getSize()[1] == xf.getSize():
-                            mat.mult(xf, xc)
-                        else:
-                            mat.multTranspose(xf, xc)
+                        # the nullspace basis is in the dual of V
+                        interpolate.multTranspose(xf, xc)
                     coarse_vecs.append(wc)
-                vsb = VectorSpaceBasis(coarse_vecs, constant=fine_nullspace._constant)
-                vsb.orthonormalize()
-                return vsb
+                coarse_nullspace = VectorSpaceBasis(coarse_vecs, constant=fine_nullspace._constant)
+                coarse_nullspace.orthonormalize()
             else:
                 return fine_nullspace
+            return cache.setdefault(key, coarse_nullspace)
 
-        I, _ = self.create_interpolation(cdm, fdm)
-        ises = cV._ises
-        cctx._nullspace = coarsen_nullspace(cV, I, fctx._nullspace)
-        cctx.set_nullspace(cctx._nullspace, ises, transpose=False, near=False)
-        cctx._nullspace_T = coarsen_nullspace(cV, I, fctx._nullspace_T)
-        cctx.set_nullspace(cctx._nullspace_T, ises, transpose=True, near=False)
-        cctx._near_nullspace = coarsen_nullspace(cV, injection, fctx._near_nullspace)
-        cctx.set_nullspace(cctx._near_nullspace, ises, transpose=False, near=True)
-        return cdm
-
-    def coarsen_quadrature(self, metadata, fdeg, cdeg):
-        if isinstance(metadata, dict):
-            # Coarsen the quadrature degree in a dictionary
-            # such that the ratio of quadrature nodes to interpolation nodes (qdeg+1)//(fdeg+1) is preserved
-            qdeg = metadata.get("quadrature_degree", None)
-            if qdeg is not None:
-                cmd = dict(metadata)
-                cmd["quadrature_degree"] = max(2*cdeg+1, ((qdeg+1)*(cdeg+1)+fdeg)//(fdeg+1)-1)
-                return cmd
-        return metadata
-
-    def coarsen_bcs(self, fbcs, cV):
-        cbcs = []
-        for bc in fbcs:
-            cV_ = cV
-            for index in bc._indices:
-                cV_ = cV_.sub(index)
-            cbc_value = self.coarsen_bc_value(bc, cV_)
-            if type(bc) == firedrake.DirichletBC:
-                cbcs.append(firedrake.DirichletBC(cV_, cbc_value,
-                                                  bc.sub_domain))
+    def create_transfer(self, mat_type, cctx, fctx, cbcs, fbcs):
+        """Create a transfer operator"""
+        cache = self._transfer_cache.setdefault(fctx, {})
+        key = (mat_type, cctx, cbcs, fbcs)
+        try:
+            return cache[key]
+        except KeyError:
+            if mat_type == "matfree":
+                construct_mat = prolongation_matrix_matfree
+            elif mat_type == "aij":
+                construct_mat = prolongation_matrix_aij
             else:
-                raise NotImplementedError("Unsupported BC type, please get in touch if you need this")
-        return cbcs
-
-    @staticmethod
-    @lru_cache(maxsize=20)
-    def create_transfer(cctx, fctx, mat_type, cbcs, fbcs, inject):
-        cbcs = cctx._problem.bcs if cbcs else []
-        fbcs = fctx._problem.bcs if fbcs else []
-        if inject:
-            cV = cctx._problem.u
-            fV = fctx._problem.u
-        else:
+                raise ValueError("Unknown matrix type")
             cV = cctx.J.arguments()[0].function_space()
             fV = fctx.J.arguments()[0].function_space()
-
-        if mat_type == "matfree":
-            return prolongation_matrix_matfree(fV, cV, fbcs, cbcs)
-        elif mat_type == "aij":
-            return prolongation_matrix_aij(fV, cV, fbcs, cbcs)
-        else:
-            raise ValueError("Unknown matrix type")
+            cbcs = tuple(cctx._problem.bcs) if cbcs else tuple()
+            fbcs = tuple(fctx._problem.bcs) if fbcs else tuple()
+            return cache.setdefault(key, construct_mat(cV, fV, cbcs, fbcs))
 
     def create_interpolation(self, dmc, dmf):
         prefix = dmc.getOptionsPrefix()
         mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree")
-        return self.create_transfer(get_appctx(dmc), get_appctx(dmf), mat_type, True, False, False), None
+        return self.create_transfer(mat_type, get_appctx(dmc), get_appctx(dmf), True, False), None
 
     def create_injection(self, dmc, dmf):
         prefix = dmc.getOptionsPrefix()
         mat_type = PETSc.Options(prefix).getString("mg_levels_transfer_mat_type", default="matfree")
-        I = self.create_transfer(get_appctx(dmf), get_appctx(dmc), mat_type, False, False, True)
-        return PETSc.Mat().createTranspose(I)
+        return self.create_transfer(mat_type, get_appctx(dmf), get_appctx(dmc), False, False)
 
     @staticmethod
     def max_degree(ele):
-        """
-        Return the maximum degree of a :class:`ufl.FiniteElement`
-        """
+        """Return the maximum degree of a :class:`ufl.FiniteElement`"""
         if isinstance(ele, (ufl.VectorElement, ufl.TensorElement)):
             return PMGBase.max_degree(ele._sub_element)
         elif isinstance(ele, (ufl.MixedElement, ufl.TensorProductElement)):
@@ -381,31 +408,33 @@ def reconstruct_degree(ele, degree):
 
         By default, reconstructed EnrichedElements, TensorProductElements,
         and MixedElements will have the degree of the sub-elements shifted
-        by the same amount so that the maximum degree is N.
-        This is useful to coarsen spaces like NCF(N) x DQ(N-1).
+        by the same amount so that the maximum degree is `degree`.
+        This is useful to coarsen spaces like NCF(k) x DQ(k-1).
 
         :arg ele: a :class:`ufl.FiniteElement` to reconstruct,
-        :arg N: an integer degree.
+        :arg degree: an integer degree.
 
         :returns: the reconstructed element
         """
         if isinstance(ele, ufl.VectorElement):
             return type(ele)(PMGBase.reconstruct_degree(ele._sub_element, degree), dim=ele.num_sub_elements())
         elif isinstance(ele, ufl.TensorElement):
-            return type(ele)(PMGBase.reconstruct_degree(ele._sub_element, degree), shape=ele.value_shape(), symmetry=ele.symmetry())
+            return type(ele)(PMGBase.reconstruct_degree(ele._sub_element, degree), shape=ele._shape, symmetry=ele.symmetry())
         elif isinstance(ele, ufl.EnrichedElement):
-            shift = degree-PMGBase.max_degree(ele)
-            return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele._elements))
+            shift = degree - PMGBase.max_degree(ele)
+            return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e) + shift) for e in ele._elements))
         elif isinstance(ele, ufl.TensorProductElement):
-            shift = degree-PMGBase.max_degree(ele)
-            return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele.sub_elements()), cell=ele.cell())
+            shift = degree - PMGBase.max_degree(ele)
+            return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e) + shift) for e in ele.sub_elements()), cell=ele.cell())
         elif isinstance(ele, ufl.MixedElement):
-            shift = degree-PMGBase.max_degree(ele)
-            return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e)+shift) for e in ele.sub_elements()))
+            shift = degree - PMGBase.max_degree(ele)
+            return type(ele)(*(PMGBase.reconstruct_degree(e, PMGBase.max_degree(e) + shift) for e in ele.sub_elements()))
         elif isinstance(ele, ufl.WithMapping):
             return type(ele)(PMGBase.reconstruct_degree(ele.wrapee, degree), ele.mapping())
-        elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement, ufl.RestrictedElement)):
+        elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement)):
             return type(ele)(PMGBase.reconstruct_degree(ele._element, degree))
+        elif isinstance(ele, ufl.RestrictedElement):
+            return type(ele)(PMGBase.reconstruct_degree(ele._element, degree), restriction_domain=ele._restriction_domain)
         else:
             return ele.reconstruct(degree=degree)
 
@@ -420,7 +449,6 @@ def configure_pmg(self, pc, pdm):
         ppc.setType("mg")
         ppc.setOperators(*pc.getOperators())
         ppc.setDM(pdm)
-        ppc.incrementTabLevel(1, parent=pc)
 
         # PETSc unfortunately requires us to make an ugly hack.
         # We would like to use GMG for the coarse solve, at least
@@ -431,8 +459,7 @@ def configure_pmg(self, pc, pdm):
         # for the user, if they haven't already; I don't know any
         # other way to get PETSc to know this at the right time.
         opts = PETSc.Options(pc.getOptionsPrefix() + "pmg_")
-        if "mg_coarse_pc_mg_levels" not in opts:
-            opts["mg_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1
+        opts["mg_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1
 
         return ppc
 
@@ -443,7 +470,7 @@ def applyTranspose(self, pc, x, y):
         return self.ppc.applyTranspose(x, y)
 
     def coarsen_bc_value(self, bc, cV):
-        return firedrake.zero(cV.shape)
+        return 0
 
 
 class PMGSNES(SNESBase, PMGBase):
@@ -455,7 +482,8 @@ def configure_pmg(self, snes, pdm):
         psnes.setOptionsPrefix(snes.getOptionsPrefix() + "pfas_")
         psnes.setType("fas")
         psnes.setDM(pdm)
-        psnes.incrementTabLevel(1, parent=snes)
+        psnes.setTolerances(max_it=1)
+        psnes.setConvergenceTest("skip")
 
         (f, residual) = snes.getFunction()
         assert residual is not None
@@ -463,8 +491,7 @@ def configure_pmg(self, snes, pdm):
         psnes.setFunction(fun, f.duplicate(), args=args, kargs=kargs)
 
         pdm.setGlobalVector(f.duplicate())
-        self.dummy = f.duplicate()
-        psnes.setSolution(f.duplicate())
+        psnes.setSolution(snes.getSolution())
 
         # PETSc unfortunately requires us to make an ugly hack.
         # We would like to use GMG for the coarse solve, at least
@@ -475,10 +502,8 @@ def configure_pmg(self, snes, pdm):
         # for the user, if they haven't already; I don't know any
         # other way to get PETSc to know this at the right time.
         opts = PETSc.Options(snes.getOptionsPrefix() + "pfas_")
-        if "fas_coarse_pc_mg_levels" not in opts:
-            opts["fas_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1
-        if "fas_coarse_snes_fas_levels" not in opts:
-            opts["fas_coarse_snes_fas_levels"] = odm.getRefineLevel() + 1
+        opts["fas_coarse_pc_mg_levels"] = odm.getRefineLevel() + 1
+        opts["fas_coarse_snes_fas_levels"] = odm.getRefineLevel() + 1
 
         return psnes
 
@@ -486,7 +511,7 @@ def step(self, snes, x, f, y):
         ctx = get_appctx(snes.dm)
         push_appctx(self.ppc.dm, ctx)
         x.copy(y)
-        self.ppc.solve(snes.vec_rhs or self.dummy, y)
+        self.ppc.solve(snes.vec_rhs or None, y)
         y.aypx(-1, x)
         snes.setConvergedReason(self.ppc.getConvergedReason())
         pop_appctx(self.ppc.dm)
@@ -501,13 +526,11 @@ def coarsen_bc_value(self, bc, cV):
 
 
 def prolongation_transfer_kernel_action(Vf, expr):
-    from tsfc import compile_expression_dual_evaluation
-    from tsfc.finatinterface import create_element
     to_element = create_element(Vf.ufl_element())
     kernel = compile_expression_dual_evaluation(expr, to_element, Vf.ufl_element(), log=PETSc.Log.isActive())
     coefficients = extract_numbered_coefficients(expr, kernel.coefficient_numbers)
     if kernel.needs_external_coords:
-        coefficients = [Vf.ufl_domain().coordinates] + coefficients
+        coefficients = [Vf.mesh().coordinates] + coefficients
 
     return op2.Kernel(kernel.ast, kernel.name,
                       requires_zeroed_output_arguments=True,
@@ -515,92 +538,173 @@ def prolongation_transfer_kernel_action(Vf, expr):
                       events=(kernel.event,)), coefficients
 
 
-@lru_cache(maxsize=10)
 def expand_element(ele):
-    """
-    Expand a FiniteElement as an EnrichedElement of TensorProductElements, discarding modifiers.
-    """
-    if ele.cell() == ufl.quadrilateral:
-        quadrilateral_tpc = ufl.TensorProductCell(ufl.interval, ufl.interval)
-        return expand_element(ele.reconstruct(cell=quadrilateral_tpc))
-    elif ele.cell() == ufl.hexahedron:
-        hexahedron_tpc = ufl.TensorProductCell(ufl.quadrilateral, ufl.interval)
-        return expand_element(ele.reconstruct(cell=hexahedron_tpc))
-    elif isinstance(ele, (ufl.TensorElement, ufl.VectorElement)):
-        return expand_element(ele._sub_element)
-    elif isinstance(ele, (ufl.HDivElement, ufl.HCurlElement, ufl.BrokenElement, ufl.RestrictedElement)):
-        return expand_element(ele._element)
-    elif isinstance(ele, ufl.WithMapping):
-        return expand_element(ele.wrapee)
-    elif isinstance(ele, ufl.MixedElement):
-        return ufl.MixedElement(*[expand_element(e) for e in ele.sub_elements()])
-    elif isinstance(ele, ufl.EnrichedElement):
-        terms = []
-        for e in ele._elements:
-            ee = expand_element(e)
-            if isinstance(ee, ufl.EnrichedElement):
-                terms.extend(ee._elements)
-            else:
-                terms.append(ee)
-        return ufl.EnrichedElement(*terms)
-    elif isinstance(ele, ufl.TensorProductElement):
-        factors = [expand_element(e) for e in ele.sub_elements()]
+    """Expand a FiniteElement as an EnrichedElement of TensorProductElements,
+       discarding modifiers."""
+    if isinstance(ele, finat.FlattenedDimensions):
+        return expand_element(ele.product)
+    elif isinstance(ele, (finat.HDivElement, finat.HCurlElement)):
+        return expand_element(ele.wrappee)
+    elif isinstance(ele, finat.DiscontinuousElement):
+        return expand_element(ele.element)
+    elif isinstance(ele, finat.EnrichedElement):
+        terms = list(map(expand_element, ele.elements))
+        return finat.EnrichedElement(terms)
+    elif isinstance(ele, finat.TensorProductElement):
+        factors = list(map(expand_element, ele.factors))
         terms = [tuple()]
         for e in factors:
             new_terms = []
-            for f in e._elements if isinstance(e, ufl.EnrichedElement) else [e]:
-                f_factors = f.sub_elements() if isinstance(f, ufl.TensorProductElement) else (f,)
-                new_terms.extend([t_factors + f_factors for t_factors in terms])
+            for f in e.elements if isinstance(e, finat.EnrichedElement) else [e]:
+                f_factors = tuple(f.factors) if isinstance(f, finat.TensorProductElement) else (f,)
+                new_terms.extend(t_factors + f_factors for t_factors in terms)
             terms = new_terms
-        if len(terms) == 1:
-            return ufl.TensorProductElement(*terms[0])
-        else:
-            return ufl.EnrichedElement(*[ufl.TensorProductElement(*k) for k in terms])
+        terms = list(map(finat.TensorProductElement, terms))
+        return finat.EnrichedElement(terms)
     else:
         return ele
 
 
-def get_line_elements(V):
-    from FIAT.reference_element import LINE
-    from tsfc.finatinterface import create_element
-    ele = V.ufl_element()
-    if isinstance(ele, ufl.MixedElement) and not isinstance(ele, (ufl.TensorElement, ufl.VectorElement)):
-        raise ValueError("MixedElements are not decomposed into tensor products")
-    rvs = ele.reference_value_size()
-    ele = expand_element(ele)
-    if isinstance(ele, ufl.EnrichedElement):
-        ele = ele._elements[0]
-
-    finat_ele = create_element(ele)
-    if rvs*finat_ele.space_dimension() != V.value_size*V.finat_element.space_dimension():
-        raise ValueError("Failed to decompose %s into a single tensor product" % V.ufl_element())
-    factors = finat_ele.factors if hasattr(finat_ele, "factors") else (finat_ele,)
+def hash_fiat_element(element):
+    """FIAT elements are not hashable,
+       this is not the best way to create a hash"""
+    restriction = None
+    e = element
+    if isinstance(e, FIAT.DiscontinuousElement):
+        # this hash does not care about inter-element continuity
+        e = e._element
+    if isinstance(e, FIAT.RestrictedElement):
+        restriction = tuple(e._indices)
+        e = e._element
+        if len(restriction) == e.space_dimension():
+            restriction = None
+    family = e.__class__.__name__
+    degree = e.order
+    return (family, element.ref_el, degree, restriction)
+
+
+def generate_key_evaluate_dual(source, target, alpha=tuple()):
+    return hash_fiat_element(source) + hash_fiat_element(target) + (alpha,)
+
+
+def get_readonly_view(arr):
+    result = arr.view()
+    result.flags.writeable = False
+    return result
+
+
+@cached({}, key=generate_key_evaluate_dual)
+def evaluate_dual(source, target, alpha=tuple()):
+    """Evaluate the action of a set of dual functionals of the target element
+       on the (derivative of order alpha of the) basis functions of the source
+       element."""
+    primal = source.get_nodal_basis()
+    dual = target.get_dual_set()
+    A = dual.to_riesz(primal)
+    B = numpy.transpose(primal.get_coeffs())
+    if sum(alpha) != 0:
+        dmats = primal.get_dmats()
+        for i in range(len(alpha)):
+            for j in range(alpha[i]):
+                B = numpy.dot(dmats[i], B)
+    return get_readonly_view(numpy.dot(A, B))
+
+
+@cached({}, key=generate_key_evaluate_dual)
+def compare_element(e1, e2):
+    """Numerically compare two :class:`FIAT.elements`.
+       Equality is satisfied if e2.dual_basis(e1.primal_basis) == identity."""
+    if e1 is e2:
+        return True
+    if e1.space_dimension() != e2.space_dimension():
+        return False
+    B = evaluate_dual(e1, e2)
+    return numpy.allclose(B, numpy.eye(B.shape[0]), rtol=1E-14, atol=1E-14)
+
+
+@cached({}, key=lambda V: V.ufl_element())
+@PETSc.Log.EventDecorator("GetLineElements")
+def get_permutation_to_line_elements(V):
+    """
+    Find DOF permutation to factor out the EnrichedElement expansion into common
+    TensorProductElements. This routine exposes structure to e.g vectorize
+    prolongation of NCE or NCF accross vector components, by permuting all
+    components into a common TensorProductElement.
+
+    This is temporary while we wait for dual evaluation of :class:`finat.EnrichedElement`.
+
+    :arg V: a :class:`.FunctionSpace`
+
+    :returns: a 3-tuple of the DOF permutation, the unique terms in expansion as
+              a list of tuples of :class:`FIAT.FiniteElements`, and the cyclic
+              permutations of the axes to form the element given by their shifts
+              in list of `int` tuples
+    """
+    finat_element = V.finat_element
+    expansion = expand_element(finat_element)
+    if expansion.space_dimension() != finat_element.space_dimension():
+        raise ValueError("Failed to decompose %s into tensor products" % V.ufl_element())
+
     line_elements = []
-    for e in reversed(factors):
-        fiat_ele = e.fiat_equivalent
-        if fiat_ele.get_reference_element().shape != LINE:
-            raise ValueError("Expecting %s to be on the interval" % fiat_ele)
-        line_elements.append(fiat_ele)
-    return line_elements
-
-
-@lru_cache(maxsize=10)
-def get_line_interpolator(felem, celem):
-    from FIAT import functional, make_quadrature
-    fdual = felem.dual_basis()
-    cdual = celem.dual_basis()
-    if len(fdual) == len(cdual):
-        if all(f.get_point_dict() == c.get_point_dict() for f, c in zip(fdual, cdual)):
-            return numpy.array([])
-
-    if all(isinstance(phi, functional.PointEvaluation) for phi in fdual):
-        pts = [list(phi.get_point_dict().keys())[0] for phi in fdual]
-        return celem.tabulate(0, pts)[(0,)]
-    else:
-        pts = make_quadrature(felem.get_reference_element(),
-                              felem.space_dimension()).get_points()
-        return numpy.dot(celem.tabulate(0, pts)[(0,)],
-                         numpy.linalg.inv(felem.tabulate(0, pts)[(0,)]))
+    terms = expansion.elements if hasattr(expansion, "elements") else [expansion]
+    for term in terms:
+        factors = term.factors if hasattr(term, "factors") else (term,)
+        fiat_factors = tuple(e.fiat_equivalent for e in reversed(factors))
+        if any(e.get_reference_element().get_spatial_dimension() != 1 for e in fiat_factors):
+            raise ValueError("Failed to decompose %s into line elements" % V.ufl_element())
+        line_elements.append(fiat_factors)
+
+    shapes = [tuple(e.space_dimension() for e in factors) for factors in line_elements]
+    sizes = list(map(numpy.prod, shapes))
+    dof_ranges = numpy.cumsum([0] + sizes)
+
+    dof_perm = []
+    unique_line_elements = []
+    shifts = []
+
+    visit = [False for e in line_elements]
+    while False in visit:
+        base = line_elements[visit.index(False)]
+        tdim = len(base)
+        pshape = tuple(e.space_dimension() for e in base)
+        unique_line_elements.append(base)
+
+        axes_shifts = tuple()
+        for shift in range(tdim):
+            if finat_element.formdegree != 2:
+                shift = (tdim - shift) % tdim
+
+            perm = base[shift:] + base[:shift]
+            for i, term in enumerate(line_elements):
+                if not visit[i]:
+                    is_perm = all(e1.space_dimension() == e2.space_dimension()
+                                  for e1, e2 in zip(perm, term))
+                    if is_perm:
+                        is_perm = all(compare_element(e1, e2) for e1, e2 in zip(perm, term))
+
+                    if is_perm:
+                        axes_shifts += ((tdim - shift) % tdim, )
+                        dofs = numpy.arange(*dof_ranges[i:i+2], dtype=PETSc.IntType).reshape(pshape)
+                        dofs = numpy.transpose(dofs, axes=numpy.roll(numpy.arange(tdim), -shift))
+                        assert dofs.shape == shapes[i]
+                        dof_perm.append(dofs.flat)
+                        visit[i] = True
+
+        shifts.append(axes_shifts)
+
+    dof_perm = get_readonly_view(numpy.concatenate(dof_perm))
+    return dof_perm, unique_line_elements, shifts
+
+
+def get_permuted_map(V):
+    """
+    Return a PermutedMap with the same tensor product shape for
+    every component of H(div) or H(curl) tensor product elements
+    """
+    indices, _, _ = get_permutation_to_line_elements(V)
+    if numpy.all(indices[:-1] < indices[1:]):
+        return V.cell_node_map()
+    return op2.PermutedMap(V.cell_node_map(), indices)
 
 
 # Common kernel to compute y = kron(A3, kron(A2, A1)) * x
@@ -609,7 +713,7 @@ def get_line_interpolator(felem, celem):
 #include <petscsys.h>
 #include <petscblaslapack.h>
 
-static inline void kronmxv(PetscBLASInt tflag,
+static inline void kronmxv_inplace(PetscBLASInt tflag,
     PetscBLASInt mx, PetscBLASInt my, PetscBLASInt mz,
     PetscBLASInt nx, PetscBLASInt ny, PetscBLASInt nz, PetscBLASInt nel,
     PetscScalar *A1, PetscScalar *A2, PetscScalar *A3,
@@ -629,7 +733,7 @@ def get_line_interpolator(felem, celem):
 y is (mx*my*mz)-by-nel.
 
 Important notes:
-The input data in x is destroyed in the process.
+This routine is in-place: the input data in x and y are destroyed in the process.
 Need to allocate nel*max(mx, nx)*max(my, ny)*max(mz, nz) memory for both x and y.
 */
 
@@ -675,42 +779,238 @@ def get_line_interpolator(felem, celem):
 *y = ptr[ires];
 return;
 }
-"""
 
+static inline void kronmxv(PetscBLASInt tflag,
+    PetscBLASInt mx, PetscBLASInt my, PetscBLASInt mz,
+    PetscBLASInt nx, PetscBLASInt ny, PetscBLASInt nz, PetscBLASInt nel,
+    PetscScalar *A1, PetscScalar *A2, PetscScalar *A3,
+    PetscScalar *x, PetscScalar *y, PetscScalar *xwork, PetscScalar *ywork){
+    /*
+    Same as kronmxv_inplace, but the work buffers allow the input data in x to
+    be kept untouched.
+    */
 
-def make_kron_code(Vf, Vc, t_in, t_out, mat_name):
-    nscal = Vf.ufl_element().reference_value_size()
-    felems = get_line_elements(Vf)
-    celems = get_line_elements(Vc)
-    if len(felems) != len(celems):
-        raise ValueError("Fine and coarse elements do not have the same number of factors")
-    if len(felems) > 3:
-        raise ValueError("More than three factors are not supported")
+    PetscScalar *ptr[2] = {xwork, ywork};
 
-    # Declare array shapes to be used as literals inside the kernels
-    fshape = [e.space_dimension() for e in felems]
-    cshape = [e.space_dimension() for e in celems]
-    shapes = [(nscal,) + tuple(fshape), (nscal,) + tuple(cshape)]
+    if(ptr[0] != x)
+        for(PetscBLASInt j=0; j<nx*ny*nz*nel; j++)
+            ptr[0][j] = x[j];
 
-    # Pass the 1D interpolators as a hexadecimal string
-    J = [get_line_interpolator(fe, ce) for fe, ce in zip(felems, celems)]
-    Jdata = ", ".join(map(float.hex, chain(*[Jk.flat for Jk in J])))
-    Jsize = numpy.cumsum([0]+[Jk.size for Jk in J])
-    Jptrs = ["%s+%d" % (mat_name, Jsize[k]) if J[k].size else "NULL" for k in range(len(J))]
-
-    # The Kronecker product routines assume 3D shapes, so in 1D and 2D we pass NULL instead of J
-    Jargs = ", ".join(Jptrs+["NULL"]*(3-len(Jptrs)))
-    fargs = ", ".join(map(str, fshape+[1]*(3-len(fshape))))
-    cargs = ", ".join(map(str, cshape+[1]*(3-len(cshape))))
-    operator_decl = f"""
-            PetscScalar {mat_name}[{Jsize[-1]}] = {{ {Jdata} }};
-    """
-    prolong_code = f"""
-            kronmxv(0, {fargs}, {cargs}, {nscal}, {Jargs}, &{t_in}, &{t_out});
+    kronmxv_inplace(tflag, mx, my, mz, nx, ny, nz, nel, A1, A2, A3, &ptr[0], &ptr[1]);
+
+    if(ptr[1] != y)
+        for(PetscBLASInt j=0; j<mx*my*mz*nel; j++)
+            y[j] = ptr[1][j];
+    return;
+}
+
+static inline void permute_axis(PetscBLASInt axis,
+    PetscBLASInt n0, PetscBLASInt n1, PetscBLASInt n2, PetscBLASInt n3,
+    PetscScalar *x, PetscScalar *y){
+    /*
+    Apply a cyclic permutation to a n0 x n1 x n2 x n3 array x, exponsing axis as
+    the fast direction.  Write the result on y.
+    */
+
+    PetscBLASInt p = 0;
+    PetscBLASInt s0, s1, s2, s3;
+    if (axis == 0){
+        s0 = 1; s1 = s0*n0; s2 = s1*n1; s3 = s2*n2;
+    }else if(axis == 1){
+        s1 = 1; s2 = s1*n1; s0 = s2*n2; s3 = s0*n0;
+    }else if(axis == 2){
+        s2 = 1; s0 = s2*n2; s1 = s0*n0; s3 = s1*n1;
+    }
+    for(PetscBLASInt i3=0; i3<n3; i3++)
+        for(PetscBLASInt i2=0; i2<n2; i2++)
+            for(PetscBLASInt i1=0; i1<n1; i1++)
+                for(PetscBLASInt i0=0; i0<n0; i0++)
+                    y[s0*i0 + s1*i1 + s2*i2 + s3*i3] = x[p++];
+    return;
+}
+
+static inline void ipermute_axis(PetscBLASInt axis,
+    PetscBLASInt n0, PetscBLASInt n1, PetscBLASInt n2, PetscBLASInt n3,
+    PetscScalar *x, PetscScalar *y){
+    /*
+    Apply the transpose of permute_axis, reading from y and adding to x.
+    */
+
+    PetscBLASInt p = 0;
+    PetscBLASInt s0, s1, s2, s3;
+    if (axis == 0){
+        s0 = 1; s1 = s0*n0; s2 = s1*n1; s3 = s2*n2;
+    }else if(axis == 1){
+        s1 = 1; s2 = s1*n1; s0 = s2*n2; s3 = s0*n0;
+    }else if(axis == 2){
+        s2 = 1; s0 = s2*n2; s1 = s0*n0; s3 = s1*n1;
+    }
+
+    for(PetscBLASInt i3=0; i3<n3; i3++)
+        for(PetscBLASInt i2=0; i2<n2; i2++)
+            for(PetscBLASInt i1=0; i1<n1; i1++)
+                for(PetscBLASInt i0=0; i0<n0; i0++)
+                    x[p++] += y[s0*i0 + s1*i1 + s2*i2 + s3*i3];
+    return;
+}
+"""
+
+
+@PETSc.Log.EventDecorator("MakeKronCode")
+def make_kron_code(Vc, Vf, t_in, t_out, mat_name, scratch):
     """
-    restrict_code = f"""
-            kronmxv(1, {cargs}, {fargs}, {nscal}, {Jargs}, &{t_out}, &{t_in});
+    Return interpolation and restriction kernels between enriched tensor product elements
     """
+    operator_decl = []
+    prolong_code = []
+    restrict_code = []
+    _, celems, cshifts = get_permutation_to_line_elements(Vc)
+    _, felems, fshifts = get_permutation_to_line_elements(Vf)
+
+    shifts = fshifts
+    in_place = False
+    if len(felems) == len(celems):
+        in_place = all((len(fs)*Vf.value_size == len(cs)*Vc.value_size) for fs, cs in zip(fshifts, cshifts))
+        psize = Vf.value_size
+
+    if not in_place:
+        if len(celems) == 1:
+            psize = Vc.value_size
+            pelem = celems[0]
+            perm_name = "perm_%s" % t_in
+            celems = celems*len(felems)
+
+        elif len(felems) == 1:
+            shifts = cshifts
+            psize = Vf.value_size
+            pelem = felems[0]
+            perm_name = "perm_%s" % t_out
+            felems = felems*len(celems)
+        else:
+            raise ValueError("Cannot assign fine to coarse DOFs")
+
+        if set(cshifts) == set(fshifts):
+            csize = Vc.value_size * Vc.finat_element.space_dimension()
+            prolong_code.append(f"""
+            for({IntType_c} j=1; j<{len(fshifts)}; j++)
+                for({IntType_c} i=0; i<{csize}; i++)
+                    {t_in}[j*{csize} + i] = {t_in}[i];
+            """)
+            restrict_code.append(f"""
+            for({IntType_c} j=1; j<{len(fshifts)}; j++)
+                for({IntType_c} i=0; i<{csize}; i++)
+                    {t_in}[i] += {t_in}[j*{csize} + i];
+            """)
+
+        elif pelem == celems[0]:
+            for k in range(len(shifts)):
+                if Vc.value_size*len(shifts[k]) < Vf.value_size:
+                    shifts[k] = shifts[k]*(Vf.value_size//Vc.value_size)
+
+            pshape = [e.space_dimension() for e in pelem]
+            pargs = ", ".join(map(str, pshape+[1]*(3-len(pshape))))
+            pstride = psize * numpy.prod(pshape)
+
+            perm = sum(shifts, tuple())
+            perm_data = ", ".join(map(str, perm))
+            operator_decl.append(f"""
+                PetscBLASInt {perm_name}[{len(perm)}] = {{ {perm_data} }};
+            """)
+            prolong_code.append(f"""
+            for({IntType_c} j=1; j<{len(perm)}; j++)
+                permute_axis({perm_name}[j], {pargs}, {psize}, {t_in}, {t_in}+j*{pstride});
+            """)
+            restrict_code.append(f"""
+            for({IntType_c} j=1; j<{len(perm)}; j++)
+                ipermute_axis({perm_name}[j], {pargs}, {psize}, {t_in}, {t_in}+j*{pstride});
+            """)
+
+    fskip = 0
+    cskip = 0
+    Jlen = 0
+    Jmats = []
+    fshapes = []
+    cshapes = []
+    has_code = False
+    identity_filter = lambda A: numpy.array([]) if A.shape[0] == A.shape[1] and numpy.allclose(A, numpy.eye(A.shape[0])) else A
+    for celem, felem, shift in zip(celems, felems, shifts):
+        if len(felem) != len(celem):
+            raise ValueError("Fine and coarse elements do not have the same number of factors")
+        if len(felem) > 3:
+            raise ValueError("More than three factors are not supported")
+
+        # Declare array shapes to be used as literals inside the kernels
+        nscal = psize*len(shift)
+        fshape = [e.space_dimension() for e in felem]
+        cshape = [e.space_dimension() for e in celem]
+        fshapes.append((nscal,) + tuple(fshape))
+        cshapes.append((nscal,) + tuple(cshape))
+
+        J = [identity_filter(evaluate_dual(ce, fe)).T for ce, fe in zip(celem, felem)]
+        if any(Jk.size and numpy.isclose(Jk, 0.0E0).all() for Jk in J):
+            prolong_code.append(f"""
+            for({IntType_c} i=0; i<{nscal*numpy.prod(fshape)}; i++) {t_out}[i+{fskip}] = 0.0E0;
+            """)
+            restrict_code.append(f"""
+            for({IntType_c} i=0; i<{nscal*numpy.prod(cshape)}; i++) {t_in}[i+{cskip}] = 0.0E0;
+            """)
+        else:
+            Jsize = numpy.cumsum([Jlen] + [Jk.size for Jk in J])
+            Jptrs = ["%s+%d" % (mat_name, Jsize[k]) if J[k].size else "NULL" for k in range(len(J))]
+            Jmats.extend(J)
+            Jlen = Jsize[-1]
+
+            # The Kronecker product routines assume 3D shapes, so in 1D and 2D we pass NULL instead of J
+            Jargs = ", ".join(Jptrs+["NULL"]*(3-len(Jptrs)))
+            fargs = ", ".join(map(str, fshape+[1]*(3-len(fshape))))
+            cargs = ", ".join(map(str, cshape+[1]*(3-len(cshape))))
+            if in_place:
+                prolong_code.append(f"""
+            kronmxv_inplace(0, {fargs}, {cargs}, {nscal}, {Jargs}, &{t_in}, &{t_out});
+                """)
+                restrict_code.append(f"""
+            kronmxv_inplace(1, {cargs}, {fargs}, {nscal}, {Jargs}, &{t_out}, &{t_in});
+                """)
+            elif shifts == fshifts:
+                if has_code and psize > 1:
+                    raise ValueError("Single tensor product to many tensor products not implemented for vectors")
+                # Single tensor product to many
+                prolong_code.append(f"""
+            kronmxv(0, {fargs}, {cargs}, {nscal}, {Jargs}, {t_in}+{cskip}, {t_out}+{fskip}, {scratch}, {t_out}+{fskip});
+                """)
+                restrict_code.append(f"""
+            kronmxv(1, {cargs}, {fargs}, {nscal}, {Jargs}, {t_out}+{fskip}, {t_in}+{cskip}, {t_out}+{fskip}, {scratch});
+                """)
+            else:
+                # Many tensor products to single tensor product
+                if has_code:
+                    raise ValueError("Many tensor products to single tensor product not implemented")
+                fskip = 0
+                prolong_code.append(f"""
+            kronmxv(0, {fargs}, {cargs}, {nscal}, {Jargs}, {t_in}+{cskip}, {t_out}+{fskip}, {t_in}+{cskip}, {t_out}+{fskip});
+                """)
+                restrict_code.append(f"""
+            kronmxv(1, {cargs}, {fargs}, {nscal}, {Jargs}, {t_out}+{fskip}, {t_in}+{cskip}, {t_out}+{fskip}, {t_in}+{cskip});
+                """)
+            has_code = True
+        fskip += nscal*numpy.prod(fshape)
+        cskip += nscal*numpy.prod(cshape)
+
+    # Pass the 1D interpolators as a hexadecimal string
+    Jdata = ", ".join(map(float.hex, chain.from_iterable(Jk.flat for Jk in Jmats)))
+    operator_decl.append(f"""
+            PetscScalar {mat_name}[{Jlen}] = {{ {Jdata} }};
+    """)
+
+    operator_decl = "".join(operator_decl)
+    prolong_code = "".join(prolong_code)
+    restrict_code = "".join(reversed(restrict_code))
+    shapes = [tuple(map(max, zip(*fshapes))), tuple(map(max, zip(*cshapes)))]
+
+    if fskip > numpy.prod(shapes[0]):
+        shapes[0] = (fskip, 1, 1, 1)
+    if cskip > numpy.prod(shapes[1]):
+        shapes[1] = (cskip, 1, 1, 1)
     return operator_decl, prolong_code, restrict_code, shapes
 
 
@@ -753,10 +1053,11 @@ def cache_generate_code(kernel, comm):
     return code
 
 
-def make_mapping_code(Q, fmapping, cmapping, t_in, t_out):
-    domain = Q.ufl_domain()
-    A = get_piola_tensor(cmapping, domain, inverse=False)
-    B = get_piola_tensor(fmapping, domain, inverse=True)
+def make_mapping_code(Q, cmapping, fmapping, t_in, t_out):
+    if fmapping == cmapping:
+        return None
+    A = get_piola_tensor(cmapping, Q.mesh(), inverse=False)
+    B = get_piola_tensor(fmapping, Q.mesh(), inverse=True)
     tensor = A
     if B:
         tensor = ufl.dot(B, tensor) if tensor else B
@@ -795,28 +1096,20 @@ def make_mapping_code(Q, fmapping, cmapping, t_in, t_out):
     return coef_decl, prolong_code, restrict_code, mapping_code, coefficients
 
 
-def get_axes_shift(ele):
-    """Return the form degree of a FInAT element after discarding modifiers"""
-    if hasattr(ele, "element"):
-        return get_axes_shift(ele.element)
-    else:
-        return ele.formdegree
-
-
 def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name):
-    shift = get_axes_shift(V.finat_element)
-    tdim = V.mesh().topological_dimension()
-    if shift % tdim:
+    _, _, shifts = get_permutation_to_line_elements(V)
+    shift = shifts[0]
+    if shift != (0,):
         ndof = numpy.prod(vshape)
         permutation = numpy.reshape(numpy.arange(ndof), pshape)
-        axes = numpy.arange(tdim)
+        axes = numpy.arange(len(shift))
         for k in range(permutation.shape[0]):
-            permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, -shift*k)), pshape[1:])
+            permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, -shift[k])), pshape[1:])
         nflip = 0
         mapping = V.ufl_element().mapping().lower()
         if mapping == "contravariant piola":
             # flip the sign of the first component
-            nflip = ndof//tdim
+            nflip = ndof//len(shift)
         elif mapping == "covariant piola":
             # flip the order of reference components
             permutation = numpy.flip(permutation, axis=0)
@@ -850,60 +1143,65 @@ def make_permutation_code(V, vshape, pshape, t_in, t_out, array_name):
     return decl, prolong, restrict
 
 
-def get_permuted_map(V):
-    """
-    Return a PermutedMap with the same tensor product shape for
-    every component of H(div) or H(curl) tensor product elements
-    """
-    shift = get_axes_shift(V.finat_element)
-    if shift % V.mesh().topological_dimension() == 0:
-        return V.cell_node_map()
-
-    elements = get_line_elements(V)
-    axes = numpy.arange(len(elements))
-    pshape = [-1] + [e.space_dimension() for e in elements]
-    permutation = numpy.reshape(numpy.arange(V.finat_element.space_dimension()), pshape)
-    for k in range(permutation.shape[0]):
-        permutation[k] = numpy.reshape(numpy.transpose(permutation[k], axes=numpy.roll(axes, shift*k)), pshape[1:])
-
-    permutation = numpy.reshape(permutation, (-1,))
-    return PermutedMap(V.cell_node_map(), permutation)
-
-
 class StandaloneInterpolationMatrix(object):
     """
     Interpolation matrix for a single standalone space.
     """
-    def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs):
-        self.Vf_bcs = Vf_bcs
+
+    _cache_work = {}
+
+    def __init__(self, Vc, Vf, Vc_bcs, Vf_bcs):
+        self.uc = self.work_function(Vc)
+        self.uf = self.work_function(Vf)
+        self.Vc = self.uc.function_space()
+        self.Vf = self.uf.function_space()
         self.Vc_bcs = Vc_bcs
-        if isinstance(Vf, firedrake.Function):
-            self.uf = Vf
-            Vf = Vf.function_space()
-        else:
-            self.uf = firedrake.Function(Vf)
-        if isinstance(Vc, firedrake.Function):
-            self.uc = Vc
-            Vc = Vc.function_space()
-        else:
-            self.uc = firedrake.Function(Vc)
+        self.Vf_bcs = Vf_bcs
 
-        self.weight = self.multiplicity(Vf)
-        with self.weight.dat.vec as w:
+    def work_function(self, V):
+        if isinstance(V, firedrake.Function):
+            return V
+        key = (V.ufl_element(), V.mesh())
+        try:
+            return self._cache_work[key]
+        except KeyError:
+            return self._cache_work.setdefault(key, firedrake.Function(V))
+
+    @cached_property
+    def _weight(self):
+        weight = firedrake.Function(self.Vf)
+        size = self.Vf.finat_element.space_dimension() * self.Vf.value_size
+        kernel_code = f"""
+        void weight(PetscScalar *restrict w){{
+            for(PetscInt i=0; i<{size}; i++) w[i] += 1.0;
+            return;
+        }}
+        """
+        kernel = op2.Kernel(kernel_code, "weight", requires_zeroed_output_arguments=True)
+        op2.par_loop(kernel, weight.cell_set, weight.dat(op2.INC, weight.cell_node_map()))
+        with weight.dat.vec as w:
             w.reciprocal()
+        return weight
 
+    @cached_property
+    def _kernels(self):
         try:
-            uf_map = get_permuted_map(Vf)
-            uc_map = get_permuted_map(Vc)
-            prolong_kernel, restrict_kernel, coefficients = self.make_blas_kernels(Vf, Vc)
+            # We generate custom prolongation and restriction kernels mainly because:
+            # 1. Code generation for the transpose of prolongation is not readily available
+            # 2. Dual evaluation of EnrichedElement is not yet implemented in FInAT
+            uf_map = get_permuted_map(self.Vf)
+            uc_map = get_permuted_map(self.Vc)
+            prolong_kernel, restrict_kernel, coefficients = self.make_blas_kernels(self.Vf, self.Vc)
             prolong_args = [prolong_kernel, self.uf.cell_set,
                             self.uf.dat(op2.INC, uf_map),
                             self.uc.dat(op2.READ, uc_map),
-                            self.weight.dat(op2.READ, uf_map)]
+                            self._weight.dat(op2.READ, uf_map)]
         except ValueError:
-            uf_map = Vf.cell_node_map()
-            uc_map = Vc.cell_node_map()
-            prolong_kernel, restrict_kernel, coefficients = self.make_kernels(Vf, Vc)
+            # The elements do not have the expected tensor product structure
+            # Fall back to aij kernels
+            uf_map = self.Vf.cell_node_map()
+            uc_map = self.Vc.cell_node_map()
+            prolong_kernel, restrict_kernel, coefficients = self.make_kernels(self.Vf, self.Vc)
             prolong_args = [prolong_kernel, self.uf.cell_set,
                             self.uf.dat(op2.WRITE, uf_map),
                             self.uc.dat(op2.READ, uc_map)]
@@ -911,10 +1209,47 @@ def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs):
         restrict_args = [restrict_kernel, self.uf.cell_set,
                          self.uc.dat(op2.INC, uc_map),
                          self.uf.dat(op2.READ, uf_map),
-                         self.weight.dat(op2.READ, uf_map)]
+                         self._weight.dat(op2.READ, uf_map)]
         coefficient_args = [c.dat(op2.READ, c.cell_node_map()) for c in coefficients]
-        self._prolong = partial(op2.par_loop, *prolong_args, *coefficient_args)
-        self._restrict = partial(op2.par_loop, *restrict_args, *coefficient_args)
+        prolong = op2.ParLoop(*prolong_args, *coefficient_args)
+        restrict = op2.ParLoop(*restrict_args, *coefficient_args)
+        return prolong, restrict
+
+    def _prolong(self):
+        with self.uf.dat.vec_wo as uf:
+            uf.set(0.0E0)
+        self._kernels[0]()
+
+    def _restrict(self):
+        with self.uc.dat.vec_wo as uc:
+            uc.set(0.0E0)
+        self._kernels[1]()
+
+    def view(self, mat, viewer=None):
+        if viewer is None:
+            return
+        typ = viewer.getType()
+        if typ != PETSc.Viewer.Type.ASCII:
+            return
+        viewer.printfASCII("Firedrake matrix-free prolongator %s\n" %
+                           type(self).__name__)
+
+    def getInfo(self, mat, info=None):
+        memory = self.uf.dat.nbytes + self.uc.dat.nbytes
+        if self._weight is not None:
+            memory += self._weight.dat.nbytes
+        if info is None:
+            info = PETSc.Mat.InfoType.GLOBAL_SUM
+        if info == PETSc.Mat.InfoType.LOCAL:
+            return {"memory": memory}
+        elif info == PETSc.Mat.InfoType.GLOBAL_SUM:
+            gmem = mat.comm.tompi4py().allreduce(memory, op=op2.MPI.SUM)
+            return {"memory": gmem}
+        elif info == PETSc.Mat.InfoType.GLOBAL_MAX:
+            gmem = mat.comm.tompi4py().allreduce(memory, op=op2.MPI.MAX)
+            return {"memory": gmem}
+        else:
+            raise ValueError("Unknown info type %s" % info)
 
     @staticmethod
     def make_blas_kernels(Vf, Vc):
@@ -922,8 +1257,8 @@ def make_blas_kernels(Vf, Vc):
         Interpolation and restriction kernels between CG / DG
         tensor product spaces on quads and hexes.
 
-        Works by tabulating the coarse 1D Lagrange basis
-        functions as the (fdegree+1)-by-(cdegree+1) matrix Jhat,
+        Works by tabulating the coarse 1D basis functions
+        as the (fdegree+1)-by-(cdegree+1) matrix Jhat,
         and using the fact that the 2D / 3D tabulation is the
         tensor product J = kron(Jhat, kron(Jhat, Jhat))
         """
@@ -936,43 +1271,47 @@ def make_blas_kernels(Vf, Vc):
         coefficients = []
         mapping_code = ""
         coef_decl = ""
+
         if fmapping == cmapping:
             # interpolate on each direction via Kroncker product
-            operator_decl, prolong_code, restrict_code, shapes = make_kron_code(Vf, Vc, "t0", "t1", "J0")
+            operator_decl, prolong_code, restrict_code, shapes = make_kron_code(Vc, Vf, "t0", "t1", "J0", "t2")
         else:
             decl = [""]*4
             prolong = [""]*5
             restrict = [""]*5
             # get embedding element for Vf with identity mapping and collocated vector component DOFs
             try:
-                Q = Vf if fmapping == "identity" else firedrake.FunctionSpace(Vf.ufl_domain(),
-                                                                              felem.reconstruct(mapping="identity"))
-                mapping_output = make_mapping_code(Q, fmapping, cmapping, "t0", "t1")
+                qelem = felem
+                if qelem.mapping() != "identity":
+                    qelem = qelem.reconstruct(mapping="identity")
+                Qf = Vf if qelem == felem else firedrake.FunctionSpace(Vf.mesh(), qelem)
+                mapping_output = make_mapping_code(Qf, cmapping, fmapping, "t0", "t1")
                 in_place_mapping = True
             except Exception:
-                Qe = ufl.FiniteElement("DQ", cell=felem.cell(), degree=PMGBase.max_degree(felem))
+                qelem = ufl.FiniteElement("DQ", cell=felem.cell(), degree=PMGBase.max_degree(felem))
                 if felem.value_shape():
-                    Qe = ufl.TensorElement(Qe, shape=felem.value_shape(), symmetry=felem.symmetry())
-                Q = firedrake.FunctionSpace(Vf.ufl_domain(), Qe)
-                mapping_output = make_mapping_code(Q, fmapping, cmapping, "t0", "t1")
-
-            qshape = (Q.value_size, Q.finat_element.space_dimension())
-            # interpolate to embedding fine space, permute to FInAT ordering, and apply the mapping
-            decl[0], prolong[0], restrict[0], shapes = make_kron_code(Q, Vc, "t0", "t1", "J0")
-            decl[1], restrict[1], prolong[1] = make_permutation_code(Vc, qshape, shapes[0], "t0", "t1", "perm0")
-            coef_decl, prolong[2], restrict[2], mapping_code, coefficients = mapping_output
-
-            if not in_place_mapping:
-                # permute to Kronecker-friendly ordering and interpolate to fine space
-                decl[2], prolong[3], restrict[3] = make_permutation_code(Vf, qshape, shapes[0], "t1", "t0", "perm1")
-                decl[3], prolong[4], restrict[4], _shapes = make_kron_code(Vf, Q, "t0", "t1", "J1")
-                shapes.extend(_shapes)
+                    qelem = ufl.TensorElement(qelem, shape=felem.value_shape(), symmetry=felem.symmetry())
+                Qf = firedrake.FunctionSpace(Vf.mesh(), qelem)
+                mapping_output = make_mapping_code(Qf, cmapping, fmapping, "t0", "t1")
+
+            qshape = (Qf.value_size, Qf.finat_element.space_dimension())
+            # interpolate to embedding fine space
+            decl[0], prolong[0], restrict[0], shapes = make_kron_code(Vc, Qf, "t0", "t1", "J0", "t2")
+
+            if mapping_output is not None:
+                # permute to FInAT ordering, and apply the mapping
+                decl[1], restrict[1], prolong[1] = make_permutation_code(Vc, qshape, shapes[0], "t0", "t1", "perm0")
+                coef_decl, prolong[2], restrict[2], mapping_code, coefficients = mapping_output
+                if not in_place_mapping:
+                    # permute to Kronecker-friendly ordering and interpolate to fine space
+                    decl[2], prolong[3], restrict[3] = make_permutation_code(Vf, qshape, shapes[0], "t1", "t0", "perm1")
+                    decl[3], prolong[4], restrict[4], _shapes = make_kron_code(Qf, Vf, "t0", "t1", "J1", "t2")
+                    shapes.extend(_shapes)
 
             operator_decl = "".join(decl)
             prolong_code = "".join(prolong)
             restrict_code = "".join(reversed(restrict))
 
-        lwork = numpy.prod([max(*dims) for dims in zip(*shapes)])
         # FInAT elements order the component DOFs related to the same node contiguously.
         # We transpose before and after the multiplication times J to have each component
         # stored contiguously as a scalar field, thus reducing the number of dgemm calls.
@@ -982,6 +1321,10 @@ def make_blas_kernels(Vf, Vc):
 
         fshape = (Vf.value_size, Vf.finat_element.space_dimension())
         cshape = (Vc.value_size, Vc.finat_element.space_dimension())
+
+        lwork = numpy.prod([max(*dims) for dims in zip(*shapes)])
+        lwork = max(lwork, max(numpy.prod(fshape), numpy.prod(cshape)))
+
         if cshape[0] == 1:
             coarse_read = f"""for({IntType_c} i=0; i<{numpy.prod(cshape)}; i++) t0[i] = x[i];"""
             coarse_write = f"""for({IntType_c} i=0; i<{numpy.prod(cshape)}; i++) x[i] += t0[i];"""
@@ -1017,9 +1360,10 @@ def make_blas_kernels(Vf, Vc):
 
         void prolongation(PetscScalar *restrict y, const PetscScalar *restrict x,
                           const PetscScalar *restrict w{coef_decl}){{
-            PetscScalar work[2][{lwork}];
+            PetscScalar work[3][{lwork}] = {{0.0E0}};
             PetscScalar *t0 = work[0];
             PetscScalar *t1 = work[1];
+            PetscScalar *t2 = work[2];
             {operator_decl}
             {coarse_read}
             {prolong_code}
@@ -1029,9 +1373,10 @@ def make_blas_kernels(Vf, Vc):
 
         void restriction(PetscScalar *restrict x, const PetscScalar *restrict y,
                          const PetscScalar *restrict w{coef_decl}){{
-            PetscScalar work[2][{lwork}];
+            PetscScalar work[3][{lwork}] = {{0.0E0}};
             PetscScalar *t0 = work[0];
             PetscScalar *t1 = work[1];
+            PetscScalar *t2 = work[2];
             {operator_decl}
             {fine_read}
             {restrict_code}
@@ -1058,11 +1403,10 @@ def make_kernels(self, Vf, Vc):
         # A local matrix is generated each time the kernel is executed.
         element_kernel = loopy.generate_code_v2(matrix_kernel.code).device_code()
         element_kernel = element_kernel.replace("void expression_kernel", "static void expression_kernel")
-        dimc = Vc.finat_element.space_dimension() * Vc.value_size
-        dimf = Vf.finat_element.space_dimension() * Vf.value_size
-
         coef_args = "".join([", c%d" % i for i in range(len(coefficients))])
         coef_decl = "".join([", const %s *restrict c%d" % (ScalarType_c, i) for i in range(len(coefficients))])
+        dimc = Vc.finat_element.space_dimension() * Vc.value_size
+        dimf = Vf.finat_element.space_dimension() * Vf.value_size
         restrict_code = f"""
         {element_kernel}
 
@@ -1078,22 +1422,6 @@ def make_kernels(self, Vf, Vc):
         restrict_kernel = op2.Kernel(restrict_code, "restriction", requires_zeroed_output_arguments=True)
         return prolong_kernel, restrict_kernel, coefficients
 
-    @staticmethod
-    def multiplicity(V):
-        # Lawrence's magic code for calculating dof multiplicities
-        shapes = (V.finat_element.space_dimension(),
-                  numpy.prod(V.shape))
-        domain = "{[i,j]: 0 <= i < %d and 0 <= j < %d}" % shapes
-        instructions = """
-        for i, j
-            w[i,j] = w[i,j] + 1
-        end
-        """
-        weight = firedrake.Function(V)
-        firedrake.par_loop((domain, instructions), firedrake.dx,
-                           {"w": (weight, op2.INC)})
-        return weight
-
     def multTranspose(self, mat, rf, rc):
         """
         Implement restriction: restrict residual on fine grid rf to coarse grid rc.
@@ -1103,8 +1431,6 @@ def multTranspose(self, mat, rf, rc):
         for bc in self.Vf_bcs:
             bc.zero(self.uf)
 
-        with self.uc.dat.vec_wo as uc:
-            uc.set(0.0E0)
         self._restrict()
 
         for bc in self.Vc_bcs:
@@ -1121,8 +1447,6 @@ def mult(self, mat, xc, xf, inc=False):
         for bc in self.Vc_bcs:
             bc.zero(self.uc)
 
-        with self.uf.dat.vec_wo as uf:
-            uf.set(0.0E0)
         self._prolong()
 
         for bc in self.Vf_bcs:
@@ -1146,25 +1470,29 @@ class MixedInterpolationMatrix(StandaloneInterpolationMatrix):
     """
     Interpolation matrix for a mixed finite element space.
     """
-    def __init__(self, Vf, Vc, Vf_bcs, Vc_bcs):
-        self.Vf_bcs = Vf_bcs
-        self.Vc_bcs = Vc_bcs
-        self.uf = Vf if isinstance(Vf, firedrake.Function) else firedrake.Function(Vf)
-        self.uc = Vc if isinstance(Vc, firedrake.Function) else firedrake.Function(Vc)
-
-        self.standalones = []
-        for (i, (uf_sub, uc_sub)) in enumerate(zip(self.uf.subfunctions, self.uc.subfunctions)):
-            Vf_sub_bcs = [bc for bc in Vf_bcs if bc.function_space().index == i]
-            Vc_sub_bcs = [bc for bc in Vc_bcs if bc.function_space().index == i]
-            standalone = StandaloneInterpolationMatrix(uf_sub, uc_sub, Vf_sub_bcs, Vc_sub_bcs)
-            self.standalones.append(standalone)
+    @cached_property
+    def _weight(self):
+        return None
 
-        self._prolong = lambda: [standalone._prolong() for standalone in self.standalones]
-        self._restrict = lambda: [standalone._restrict() for standalone in self.standalones]
+    @cached_property
+    def _standalones(self):
+        standalones = []
+        for i, (uc_sub, uf_sub) in enumerate(zip(self.uc.subfunctions, self.uf.subfunctions)):
+            Vc_sub_bcs = tuple(bc for bc in self.Vc_bcs if bc.function_space().index == i)
+            Vf_sub_bcs = tuple(bc for bc in self.Vf_bcs if bc.function_space().index == i)
+            standalone = StandaloneInterpolationMatrix(uc_sub, uf_sub, Vc_sub_bcs, Vf_sub_bcs)
+            standalones.append(standalone)
+        return standalones
+
+    @cached_property
+    def _kernels(self):
+        prolong = lambda: [s._prolong() for s in self._standalones]
+        restrict = lambda: [s._restrict() for s in self._standalones]
+        return prolong, restrict
 
     def getNestSubMatrix(self, i, j):
         if i == j:
-            s = self.standalones[i]
+            s = self._standalones[i]
             sizes = (s.uf.dof_dset.layout_vec.getSizes(), s.uc.dof_dset.layout_vec.getSizes())
             M_shll = PETSc.Mat().createPython(sizes, s, comm=s.uf._comm)
             M_shll.setUp()
@@ -1173,17 +1501,17 @@ def getNestSubMatrix(self, i, j):
             return None
 
 
-def prolongation_matrix_aij(Pk, P1, Pk_bcs=[], P1_bcs=[]):
-    if isinstance(Pk, firedrake.Function):
-        Pk = Pk.function_space()
+def prolongation_matrix_aij(P1, Pk, P1_bcs=[], Pk_bcs=[]):
     if isinstance(P1, firedrake.Function):
         P1 = P1.function_space()
+    if isinstance(Pk, firedrake.Function):
+        Pk = Pk.function_space()
     sp = op2.Sparsity((Pk.dof_dset,
                        P1.dof_dset),
                       (Pk.cell_node_map(),
                        P1.cell_node_map()))
     mat = op2.Mat(sp, PETSc.ScalarType)
-    mesh = Pk.ufl_domain()
+    mesh = Pk.mesh()
 
     fele = Pk.ufl_element()
     if isinstance(fele, ufl.MixedElement) and not isinstance(fele, (ufl.VectorElement, ufl.TensorElement)):
@@ -1228,15 +1556,14 @@ def prolongation_matrix_aij(Pk, P1, Pk_bcs=[], P1_bcs=[]):
     return mat.handle
 
 
-def prolongation_matrix_matfree(Vf, Vc, Vf_bcs=[], Vc_bcs=[]):
+def prolongation_matrix_matfree(Vc, Vf, Vc_bcs=[], Vf_bcs=[]):
     fele = Vf.ufl_element()
     if isinstance(fele, ufl.MixedElement) and not isinstance(fele, (ufl.VectorElement, ufl.TensorElement)):
-        ctx = MixedInterpolationMatrix(Vf, Vc, Vf_bcs, Vc_bcs)
+        ctx = MixedInterpolationMatrix(Vc, Vf, Vc_bcs, Vf_bcs)
     else:
-        ctx = StandaloneInterpolationMatrix(Vf, Vc, Vf_bcs, Vc_bcs)
+        ctx = StandaloneInterpolationMatrix(Vc, Vf, Vc_bcs, Vf_bcs)
 
     sizes = (Vf.dof_dset.layout_vec.getSizes(), Vc.dof_dset.layout_vec.getSizes())
     M_shll = PETSc.Mat().createPython(sizes, ctx, comm=Vf._comm)
     M_shll.setUp()
-
     return M_shll
diff --git a/tests/multigrid/test_hiptmair.py b/tests/multigrid/test_hiptmair.py
index d851e43e2f..b59022d9c5 100644
--- a/tests/multigrid/test_hiptmair.py
+++ b/tests/multigrid/test_hiptmair.py
@@ -62,9 +62,14 @@ def run_riesz_map(V, mat_type):
     a = inner(d(u), d(v))*dx + inner(u, v)*dx
     L = inner(f, v)*dx
     bcs = [DirichletBC(V, u_exact, "on_boundary")]
-
+    if V.mesh().ufl_cell().is_simplex():
+        appctx = dict()
+    else:
+        from firedrake.preconditioners.fdm import tabulate_exterior_derivative
+        appctx = {"get_gradient": tabulate_exterior_derivative,
+                  "get_curl": tabulate_exterior_derivative}
     problem = LinearVariationalProblem(a, L, uh, bcs=bcs)
-    solver = LinearVariationalSolver(problem, solver_parameters=parameters)
+    solver = LinearVariationalSolver(problem, solver_parameters=parameters, appctx=appctx)
     solver.solve()
     its = solver.snes.ksp.getIterationNumber()
     return its
@@ -72,7 +77,7 @@ def run_riesz_map(V, mat_type):
 
 @pytest.mark.skipcomplexnoslate
 @pytest.mark.parametrize(["family", "cell"],
-                         [("N1curl", "tetrahedron")])
+                         [("N1curl", "tetrahedron"), ("NCE", "hexahedron")])
 def test_hiptmair_hcurl(family, cell):
     mesh = mesh_hierarchy(cell)[-1]
     V = FunctionSpace(mesh, family, degree=1)
@@ -82,7 +87,7 @@ def test_hiptmair_hcurl(family, cell):
 
 @pytest.mark.skipcomplexnoslate
 @pytest.mark.parametrize(["family", "cell"],
-                         [("RT", "tetrahedron")])
+                         [("RT", "tetrahedron"), ("NCF", "hexahedron")])
 def test_hiptmair_hdiv(family, cell):
     mesh = mesh_hierarchy(cell)[-1]
     V = FunctionSpace(mesh, family, degree=1)
diff --git a/tests/multigrid/test_p_multigrid.py b/tests/multigrid/test_p_multigrid.py
index 17e4e33296..3954536f1f 100644
--- a/tests/multigrid/test_p_multigrid.py
+++ b/tests/multigrid/test_p_multigrid.py
@@ -2,58 +2,121 @@
 from firedrake import *
 
 
-def test_reconstruct_degree():
-    meshes = [UnitSquareMesh(1, 1, quadrilateral=True)]
-    meshes.append(ExtrudedMesh(meshes[0], layers=1))
-    for mesh in meshes:
-        ndim = mesh.topological_dimension()
-        elist = []
-        for degree in [7, 2, 31]:
-            V = VectorFunctionSpace(mesh, "Q", degree)
-            Q = FunctionSpace(mesh, "DQ", degree-2)
-            Z = MixedFunctionSpace([V, Q])
-            e = Z.ufl_element()
-            elist.append(e)
-            assert e == PMGPC.reconstruct_degree(elist[0], degree)
-
-        elist = []
-        for degree in [7, 2, 31]:
-            V = FunctionSpace(mesh, "NCF" if ndim == 3 else "RTCF", degree)
-            Q = FunctionSpace(mesh, "DQ", degree-1)
-            Z = MixedFunctionSpace([V, Q])
-            e = Z.ufl_element()
-            elist.append(e)
-            assert e == PMGPC.reconstruct_degree(elist[0], degree)
-
-
-def test_prolongation_matrix_matfree():
+@pytest.fixture(params=[2, 3],
+                ids=["Rectangle", "Box"])
+def tp_mesh(request):
+    nx = 4
+    distribution = {"overlap_type": (DistributedMeshOverlapType.VERTEX, 1)}
+    m = UnitSquareMesh(nx, nx, quadrilateral=True, distribution_parameters=distribution)
+    if request.param == 3:
+        m = ExtrudedMesh(m, nx)
+
+    x = SpatialCoordinate(m)
+    xnew = as_vector([acos(1-2*xj)/pi for xj in x])
+    m.coordinates.interpolate(xnew)
+    return m
+
+
+@pytest.fixture(params=[0, 1, 2],
+                ids=["H1", "HCurl", "HDiv"])
+def tp_family(tp_mesh, request):
+    tdim = tp_mesh.topological_dimension()
+    if tdim == 3:
+        families = ["Q", "NCE", "NCF"]
+    else:
+        families = ["Q", "RTCE", "RTCF"]
+    return families[request.param]
+
+
+@pytest.fixture(params=[None, "hierarchical", "fdm"],
+                ids=["spectral", "hierarchical", "fdm"])
+def variant(request):
+    return request.param
+
+
+@pytest.fixture(params=[0, 1],
+                ids=["CG-DG", "HDiv-DG"])
+def mixed_family(tp_mesh, request):
+    if request.param == 0:
+        Vfamily = "Q"
+    else:
+        tdim = tp_mesh.topological_dimension()
+        Vfamily = "NCF" if tdim == 3 else "RTCF"
+    Qfamily = "DQ"
+    return Vfamily, Qfamily
+
+
+def test_reconstruct_degree(tp_mesh, mixed_family):
+    """ Construct a complicated mixed element and ensure we may recover it by
+        p-refining or p-coarsening an element of the same family with different
+        degree.
+    """
+    elist = []
+    Vfamily, Qfamily = mixed_family
+    for degree in [7, 2, 31]:
+        if Vfamily in ["NCF", "RTCF"]:
+            V = FunctionSpace(tp_mesh, Vfamily, degree)
+        else:
+            V = VectorFunctionSpace(tp_mesh, Vfamily, degree)
+        Q = FunctionSpace(tp_mesh, Qfamily, degree-2)
+        Z = MixedFunctionSpace([V, Q])
+        e = Z.ufl_element()
+
+        elist.append(e)
+        assert e == PMGPC.reconstruct_degree(elist[0], degree)
+
+
+def test_prolong_de_rham(tp_mesh):
+    """ Interpolate a linear vector function between [H1]^d, HCurl and HDiv spaces
+        where it can be exactly represented
+    """
+    from firedrake.preconditioners.pmg import prolongation_matrix_matfree
+
+    tdim = tp_mesh.topological_dimension()
+    b = Constant(list(range(tdim)))
+    mat = diag(Constant([tdim+1]*tdim)) + Constant([[-1]*tdim]*tdim)
+    expr = dot(mat, SpatialCoordinate(tp_mesh)) + b
+
+    cell = tp_mesh.ufl_cell()
+    elems = [VectorElement(FiniteElement("Q", cell=cell, degree=2)),
+             FiniteElement("NCE" if tdim == 3 else "RTCE", cell=cell, degree=2),
+             FiniteElement("NCF" if tdim == 3 else "RTCF", cell=cell, degree=2)]
+    fs = [FunctionSpace(tp_mesh, e) for e in elems]
+    us = [Function(V) for V in fs]
+    us[0].interpolate(expr)
+    for u in us:
+        for v in us:
+            if u != v:
+                P = prolongation_matrix_matfree(u, v).getPythonContext()
+                P._prolong()
+                assert norm(v-expr, "L2") < 1E-14
+
+
+def test_prolong_low_order_to_restricted(tp_mesh, tp_family, variant):
+    """ Interpolate a low-order function to interior and facet high-order spaces
+        and ensure that the sum of the two high-order functions is equal to the
+        low-order function
+    """
     from firedrake.preconditioners.pmg import prolongation_matrix_matfree
 
-    tol = 1E-14
-    meshes = [UnitSquareMesh(3, 2, quadrilateral=True)]
-    meshes.append(ExtrudedMesh(meshes[0], layers=2))
-    for mesh in meshes:
-        ndim = mesh.topological_dimension()
-        b = Constant(list(range(ndim)))
-        mat = diag(Constant([ndim+1]*ndim)) + Constant([[-1]*ndim]*ndim)
-        expr = dot(mat, SpatialCoordinate(mesh)) + b
-
-        variant = None
-        cell = mesh.ufl_cell()
-        elems = []
-        elems.append(VectorElement(FiniteElement("Q", cell=cell, degree=3, variant=variant)))
-        elems.append(FiniteElement("NCF" if ndim == 3 else "RTCF", cell=cell, degree=2, variant=variant))
-        elems.append(FiniteElement("NCE" if ndim == 3 else "RTCE", cell=cell, degree=2, variant=variant))
-        fs = [FunctionSpace(mesh, e) for e in elems]
-        us = [Function(V) for V in fs]
-        us[0].interpolate(expr)
-        for u in us:
-            for v in us:
-                if u != v:
-                    v.assign(0)
-                    P = prolongation_matrix_matfree(v, u).getPythonContext()
-                    P._prolong()
-                    assert norm(v-expr, "L2") < tol
+    degree = 5
+    cell = tp_mesh.ufl_cell()
+    element = FiniteElement(tp_family, cell=cell, degree=degree, variant=variant)
+    Vi = FunctionSpace(tp_mesh, RestrictedElement(element, restriction_domain="interior"))
+    Vf = FunctionSpace(tp_mesh, RestrictedElement(element, restriction_domain="facet"))
+    Vc = FunctionSpace(tp_mesh, tp_family, degree=1)
+
+    ui = Function(Vi)
+    uf = Function(Vf)
+    uc = Function(Vc)
+    uc.dat.data[0::2] = 0.0
+    uc.dat.data[1::2] = 1.0
+
+    for v in [ui, uf]:
+        P = prolongation_matrix_matfree(uc, v).getPythonContext()
+        P._prolong()
+
+    assert norm(ui + uf - uc, "L2") < 2E-14
 
 
 @pytest.fixture(params=["triangles", "quadrilaterals"], scope="module")
@@ -240,7 +303,8 @@ def test_p_multigrid_mixed(mat_type):
              "ksp_max_it": 3,
              "pc_type": "jacobi"}
 
-    coarse = {"ksp_type": "richardson",
+    coarse = {"mat_type": "aij",  # This circumvents the need for AssembledPC
+              "ksp_type": "richardson",
               "ksp_max_it": 1,
               "ksp_norm_type": "unpreconditioned",
               "ksp_monitor": None,
@@ -255,7 +319,7 @@ def test_p_multigrid_mixed(mat_type):
           "ksp_monitor_true_residual": None,
           "pc_type": "python",
           "pc_python_type": "firedrake.PMGPC",
-          # "mat_type": mat_type,  # FIXME bug with mat-free jacobi on MixedFunctionSpace
+          "mat_type": mat_type,
           "pmg_pc_mg_type": "multiplicative",
           "pmg_mg_levels": relax,
           "pmg_mg_coarse": coarse}
@@ -270,10 +334,14 @@ def test_p_multigrid_mixed(mat_type):
     ppc = solver.snes.ksp.pc.getPythonContext().ppc
     assert ppc.getMGLevels() == 3
 
-    level = solver._ctx
+    # test that nullspace component is zero
     assert abs(assemble(z[1]*dx)) < 1E-12
+    # test that we converge to the exact solution
     assert norm(z-z_exact, "H1") < 1E-12
+
+    # test that we have coarsened the nullspace correctly
     ctx_levels = 0
+    level = solver._ctx
     while level is not None:
         nsp = level._nullspace
         assert isinstance(nsp, MixedVectorSpaceBasis)
@@ -284,6 +352,13 @@ def test_p_multigrid_mixed(mat_type):
         ctx_levels += 1
     assert ctx_levels == 3
 
+    # test that caches are parallel-safe
+    dummy_eq = type(object).__eq__
+    for cache in (PMGPC._coarsen_cache, PMGPC._transfer_cache):
+        assert len(cache) > 0
+        for k in cache:
+            assert type(k).__eq__ is dummy_eq
+
 
 def test_p_fas_scalar():
     mat_type = "matfree"
@@ -313,6 +388,7 @@ def test_p_fas_scalar():
     atol = rtol * Fnorm
 
     coarse = {
+        "mat_type": "aij",
         "ksp_type": "preonly",
         "ksp_norm_type": None,
         "pc_type": "cholesky"}
@@ -321,7 +397,6 @@ def test_p_fas_scalar():
         "ksp_type": "chebyshev",
         "ksp_monitor_true_residual": None,
         "ksp_norm_type": "unpreconditioned",
-        "ksp_max_it": 3,
         "pc_type": "jacobi"}
 
     pmg = {
@@ -340,7 +415,7 @@ def test_p_fas_scalar():
         "pmg_mg_coarse": coarse}
 
     pfas = {
-        "mat_type": "aij",
+        "mat_type": mat_type,
         "snes_monitor": None,
         "snes_converged_reason": None,
         "snes_atol": atol,
@@ -364,23 +439,23 @@ def test_p_fas_scalar():
 @pytest.mark.skipcomplex
 def test_p_fas_nonlinear_scalar():
     mat_type = "matfree"
-    N = 4
-    dxq = dx(degree=3*N+2)  # here we also test coarsening of quadrature degree
+    degree = 4
+    dxq = dx(degree=3*degree+2)  # here we also test coarsening of quadrature degree
 
     mesh = UnitSquareMesh(4, 4, quadrilateral=True)
-    V = FunctionSpace(mesh, "CG", N)
+    V = FunctionSpace(mesh, "CG", degree)
     u = Function(V)
     f = Constant(1)
     bcs = DirichletBC(V, 0, "on_boundary")
 
     # Regularized p-Laplacian
     p = 5
-    eps = 1
+    eps = Constant(1)
     y = eps + inner(grad(u), grad(u))
     E = (1/p)*(y**(p/2))*dxq - inner(f, u)*dxq
     F = derivative(E, u, TestFunction(V))
 
-    fcp = {"quadrature_degree": 3*N+2}
+    fcp = {"quadrature_degree": 3*degree+2}
     problem = NonlinearVariationalProblem(F, u, bcs, form_compiler_parameters=fcp)
 
     # Due to the convoluted nature of the nested iteration
@@ -391,7 +466,7 @@ def test_p_fas_nonlinear_scalar():
 
     rtol = 1E-8
     atol = rtol * Fnorm
-
+    rtol = 0.0
     newton = {
         "mat_type": "aij",
         "snes_monitor": None,
@@ -399,7 +474,7 @@ def test_p_fas_nonlinear_scalar():
         "snes_type": "newtonls",
         "snes_max_it": 20,
         "snes_atol": atol,
-        "snes_rtol": 1E-50}
+        "snes_rtol": rtol}
 
     coarse = {
         "ksp_type": "preonly",
@@ -414,7 +489,7 @@ def test_p_fas_nonlinear_scalar():
 
     pmg = {
         "ksp_atol": atol*1E-1,
-        "ksp_rtol": 1E-50,
+        "ksp_rtol": rtol,
         "ksp_type": "cg",
         "ksp_converged_reason": None,
         "ksp_monitor_true_residual": None,
@@ -434,7 +509,7 @@ def test_p_fas_nonlinear_scalar():
         "snes_monitor": None,
         "snes_converged_reason": None,
         "snes_atol": atol,
-        "snes_rtol": 1E-50,
+        "snes_rtol": rtol,
         "snes_type": "python",
         "snes_python_type": "firedrake.PMGSNES",
         "pfas_snes_fas_type": "kaskade",
@@ -458,7 +533,7 @@ def check_coarsen_quadrature(solver):
             Nq, = Nq
             Nl = p.u.ufl_element().degree()
             try:
-                Nl, = set(Nl)
+                Nl = max(Nl)
             except TypeError:
                 pass
             assert Nq == 3*Nl+2
diff --git a/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py b/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py
index f82acdf62e..b9eb6e830a 100644
--- a/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py
+++ b/tests/multigrid/test_poisson_p1pcmg_extruded_serendipity.py
@@ -10,12 +10,12 @@ def run_poisson():
                    "ksp_monitor": None,
                    "pc_type": "python",
                    "pc_python_type": "firedrake.P1PC",
-                   "pmg_coarse_degree": coarse_deg,
                    "pmg_mg_levels": {
                        "ksp_type": "chebyshev",
                        "ksp_max_it": 2,
                        "pc_type": "jacobi"},
                    "pmg_mg_coarse": {
+                       "degree": coarse_deg,
                        "ksp_type": "preonly",
                        "pc_type": "lu",
                        "pc_factor_mat_solver_type": "mumps"
diff --git a/tests/regression/test_fdm.py b/tests/regression/test_fdm.py
index 34e6469396..3ad7db838e 100644
--- a/tests/regression/test_fdm.py
+++ b/tests/regression/test_fdm.py
@@ -1,30 +1,33 @@
 import pytest
 from firedrake import *
 
-
-fdmstar = {
+ksp = {
     "mat_type": "matfree",
     "ksp_type": "cg",
     "ksp_atol": 0.0E0,
     "ksp_rtol": 1.0E-8,
-    "ksp_norm_type": "unpreconditioned",
-    "ksp_monitor_true_residual": None,
-    "ksp_converged_reason": None,
+    "ksp_norm_type": "natural",
+    "ksp_monitor": None,
+}
+
+coarse = {
+    "mat_type": "aij",
+    "ksp_type": "preonly",
+    "pc_type": "cholesky",
+}
+
+# FDM without static condensation
+fdmstar = {
     "pc_type": "python",
     "pc_python_type": "firedrake.P1PC",
-    "pmg_coarse_mat_type": "aij",
-    "pmg_mg_coarse": {
-        "ksp_type": "preonly",
-        "pc_type": "cholesky",
-    },
+    "pmg_mg_coarse": coarse,
     "pmg_mg_levels": {
+        "ksp_max_it": 1,
         "ksp_type": "chebyshev",
         "ksp_norm_type": "none",
         "esteig_ksp_type": "cg",
         "esteig_ksp_norm_type": "natural",
-        "ksp_chebyshev_esteig": "0.75,0.25,0.0,1.0",
-        "ksp_chebyshev_esteig_noisy": True,
-        "ksp_chebyshev_esteig_steps": 8,
+        "ksp_chebyshev_esteig": "0.5,0.5,0.0,1.0",
         "pc_type": "python",
         "pc_python_type": "firedrake.FDMPC",
         "fdm": {
@@ -32,12 +35,80 @@
             "pc_python_type": "firedrake.ASMExtrudedStarPC",
             "pc_star_mat_ordering_type": "nd",
             "pc_star_sub_sub_pc_type": "cholesky",
-            "pc_star_sub_sub_pc_factor_mat_solver_type": "petsc",
-            "pc_star_sub_sub_pc_factor_mat_ordering_type": "natural",
         }
     }
 }
 
+# FDM with static condensation
+facetstar = {
+    "pc_type": "python",
+    "pc_python_type": "firedrake.FacetSplitPC",
+    "facet_pc_type": "python",
+    "facet_pc_python_type": "firedrake.FDMPC",
+    "facet_fdm_static_condensation": True,
+    "facet_fdm_pc_use_amat": False,
+    "facet_fdm_pc_type": "fieldsplit",
+    "facet_fdm_pc_fieldsplit_type": "symmetric_multiplicative",
+    "facet_fdm_fieldsplit_0": {
+        "ksp_type": "preonly",
+        "pc_type": "icc",  # this is exact for the sparse approximation used in FDM
+    },
+    "facet_fdm_fieldsplit_1": {
+        "ksp_type": "preonly",
+        "pc_type": "python",
+        "pc_python_type": "firedrake.P1PC",
+        "pmg_mg_coarse": coarse,
+        "pmg_mg_levels": {
+            "ksp_max_it": 1,
+            "ksp_type": "chebyshev",
+            "ksp_norm_type": "none",
+            "esteig_ksp_type": "cg",
+            "esteig_ksp_norm_type": "natural",
+            "ksp_chebyshev_esteig": "0.5,0.5,0.0,1.0",
+            "pc_type": "python",
+            "pc_python_type": "firedrake.ASMExtrudedStarPC",
+            "pc_star_mat_ordering_type": "nd",
+            "pc_star_sub_sub_pc_type": "cholesky",
+        }
+    }
+}
+
+fdmstar.update(ksp)
+facetstar.update(ksp)
+
+
+def build_riesz_map(V, d):
+    beta = Constant(1E-4)
+    subs = [(1, 3)]
+    if V.mesh().cell_set._extruded:
+        subs += ["top"]
+
+    x = SpatialCoordinate(V.mesh())
+    x -= Constant([0.5]*len(x))
+    if V.ufl_element().value_shape() == ():
+        u_exact = exp(-10*dot(x, x))
+        u_bc = u_exact
+    else:
+        A = Constant([[-1.]*len(x)]*len(x)) + diag(Constant([len(x)]*len(x)))
+        u_exact = dot(A, x) * exp(-10*dot(x, x))
+        u_bc = Function(V)
+        u_bc.project(u_exact, solver_parameters={"mat_type": "matfree", "pc_type": "jacobi"})
+
+    bcs = [DirichletBC(V, u_bc, sub) for sub in subs]
+
+    uh = Function(V)
+    test = TestFunction(V)
+    trial = TrialFunction(V)
+    a = lambda v, u: inner(v, beta*u)*dx + inner(d(v), d(u))*dx
+    return LinearVariationalProblem(a(test, trial), a(test, u_exact), uh, bcs=bcs)
+
+
+def solve_riesz_map(problem, solver_parameters):
+    problem.u.assign(0)
+    solver = LinearVariationalSolver(problem, solver_parameters=solver_parameters)
+    solver.solve()
+    return solver.snes.ksp.getIterationNumber()
+
 
 @pytest.fixture(params=[2, 3],
                 ids=["Rectangle", "Box"])
@@ -54,67 +125,65 @@ def mesh(request):
     return m
 
 
-@pytest.fixture
-def expected(mesh):
-    if mesh.topological_dimension() == 2:
-        return [5, 5, 5]
-    elif mesh.topological_dimension() == 3:
-        return [8, 8, 8]
-
-
 @pytest.fixture(params=[None, "fdm"], ids=["spectral", "fdm"])
 def variant(request):
     return request.param
 
 
 @pytest.mark.skipcomplex
-def test_p_independence(mesh, expected, variant):
-    nits = []
-    for p in range(3, 6):
-        e = FiniteElement("Lagrange", cell=mesh.ufl_cell(), degree=p, variant=variant)
-        V = FunctionSpace(mesh, e)
-        u = TrialFunction(V)
-        v = TestFunction(V)
-
-        ndim = mesh.geometric_dimension()
-        x = SpatialCoordinate(mesh)
-        x -= Constant([0.5]*ndim)
-        u_exact = dot(x, x)
-        f_exact = grad(u_exact)
-        B = -div(f_exact)
+def test_p_independence_hgrad(mesh, variant):
+    family = "Lagrange"
+    expected = [16, 12] if mesh.topological_dimension() == 3 else [9, 7]
+    solvers = [fdmstar] if variant is None else [fdmstar, facetstar]
+    for degree in range(3, 6):
+        element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant=variant)
+        V = FunctionSpace(mesh, element)
+        problem = build_riesz_map(V, grad)
+        for sp, expected_it in zip(solvers, expected):
+            assert solve_riesz_map(problem, sp) <= expected_it
 
-        a = inner(grad(v), grad(u))*dx
-        L = inner(v, B)*dx
 
-        subs = ("on_boundary",)
-        if mesh.cell_set._extruded:
-            subs += ("top", "bottom")
-        bcs = [DirichletBC(V, u_exact, sub) for sub in subs]
+@pytest.mark.skipcomplex
+def test_p_independence_hcurl(mesh):
+    family = "NCE" if mesh.topological_dimension() == 3 else "RTCE"
+    expected = [13, 10] if mesh.topological_dimension() == 3 else [6, 6]
+    solvers = [fdmstar, facetstar]
+    for degree in range(3, 6):
+        element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm")
+        V = FunctionSpace(mesh, element)
+        problem = build_riesz_map(V, curl)
+        for sp, expected_it in zip(solvers, expected):
+            assert solve_riesz_map(problem, sp) <= expected_it
 
-        uh = Function(V)
-        problem = LinearVariationalProblem(a, L, uh, bcs=bcs)
-        solver = LinearVariationalSolver(problem, solver_parameters=fdmstar)
-        solver.solve()
-        nits.append(solver.snes.ksp.getIterationNumber())
-    assert norm(u_exact-uh, "H1") < 2.0E-7
-    assert nits <= expected
+
+@pytest.mark.skipcomplex
+def test_p_independence_hdiv(mesh):
+    family = "NCF" if mesh.topological_dimension() == 3 else "RTCF"
+    expected = [6, 6]
+    solvers = [fdmstar, facetstar]
+    for degree in range(3, 6):
+        element = FiniteElement(family, cell=mesh.ufl_cell(), degree=degree, variant="fdm")
+        V = FunctionSpace(mesh, element)
+        problem = build_riesz_map(V, div)
+        for sp, expected_it in zip(solvers, expected):
+            assert solve_riesz_map(problem, sp) <= expected_it
 
 
 @pytest.mark.skipcomplex
 def test_variable_coefficient(mesh):
-    ndim = mesh.geometric_dimension()
+    gdim = mesh.geometric_dimension()
     k = 4
     V = FunctionSpace(mesh, "Lagrange", k)
     u = TrialFunction(V)
     v = TestFunction(V)
     x = SpatialCoordinate(mesh)
-    x -= Constant([0.5]*ndim)
+    x -= Constant([0.5]*len(x))
 
     # variable coefficients
-    alphas = [0.1+10*dot(x, x)]*ndim
+    alphas = [0.1+10*dot(x, x)]*gdim
     alphas[0] = 1+10*exp(-dot(x, x))
     alpha = diag(as_vector(alphas))
-    beta = ((10*cos(3*pi*x[0]) + 20*sin(2*pi*x[1]))*cos(pi*x[ndim-1]))**2
+    beta = ((10*cos(3*pi*x[0]) + 20*sin(2*pi*x[1]))*cos(pi*x[gdim-1]))**2
 
     a = (inner(grad(v), dot(alpha, grad(u))) + inner(v, beta*u))*dx(degree=3*k+2)
     L = inner(v, Constant(1))*dx
@@ -128,68 +197,69 @@ def test_variable_coefficient(mesh):
     problem = LinearVariationalProblem(a, L, uh, bcs=bcs)
     solver = LinearVariationalSolver(problem, solver_parameters=fdmstar)
     solver.solve()
-    assert solver.snes.ksp.getIterationNumber() <= 14
+    expected = 23 if gdim == 3 else 14
+    assert solver.snes.ksp.getIterationNumber() <= expected
 
 
 @pytest.fixture(params=["cg", "dg", "rt"],
                 ids=["cg", "dg", "rt"])
 def fs(request, mesh):
     degree = 3
-    ndim = mesh.topological_dimension()
+    tdim = mesh.topological_dimension()
     cell = mesh.ufl_cell()
     element = request.param
-    variant = None
+    variant = "fdm_ipdg"
     if element == "rt":
-        family = "RTCF" if ndim == 2 else "NCF"
+        family = "RTCF" if tdim == 2 else "NCF"
         return FunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant))
     else:
-        if ndim == 1:
+        if tdim == 1:
             family = "DG" if element == "dg" else "CG"
         else:
             family = "DQ" if element == "dg" else "Q"
-        return VectorFunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant), dim=5-ndim)
+        return VectorFunctionSpace(mesh, FiniteElement(family, cell, degree=degree, variant=variant), dim=5-tdim)
 
 
 @pytest.mark.skipcomplex
-def test_direct_solver(fs):
+def test_ipdg_direct_solver(fs):
     mesh = fs.mesh()
     x = SpatialCoordinate(mesh)
-    ndim = mesh.geometric_dimension()
+    gdim = mesh.geometric_dimension()
     ncomp = fs.ufl_element().value_size()
-    u_exact = dot(x, x)
-    if ncomp:
-        u_exact = as_vector([u_exact + Constant(k) for k in range(ncomp)])
 
-    N = fs.ufl_element().degree()
+    homogenize = gdim > 2
+    if homogenize:
+        rg = RandomGenerator(PCG64(seed=123456789))
+        uh = rg.uniform(fs, -1, 1)
+        u_exact = zero(uh.ufl_shape)
+        u_bc = 0
+    else:
+        uh = Function(fs)
+        u_exact = dot(x, x)
+        if ncomp:
+            u_exact = as_vector([u_exact + Constant(k) for k in range(ncomp)])
+        u_bc = u_exact
+
+    degree = fs.ufl_element().degree()
     try:
-        N, = set(N)
+        degree = max(degree)
     except TypeError:
         pass
-
-    quad_degree = 2*(N+1)-1
-    uh = Function(fs)
-    u = TrialFunction(fs)
-    v = TestFunction(fs)
+    quad_degree = 2*(degree+1)-1
 
     # problem coefficients
-    A1 = diag(Constant(range(1, ndim+1)))
+    A1 = diag(Constant(range(1, gdim+1)))
     A2 = diag(Constant(range(1, ncomp+1)))
     alpha = lambda grad_u: dot(dot(A2, grad_u), A1)
     beta = diag(Constant(range(2, ncomp+2)))
 
-    n = FacetNormal(mesh)
-    f_exact = alpha(grad(u_exact))
-    B = dot(beta, u_exact) - div(f_exact)
-    T = dot(f_exact, n)
-
     extruded = mesh.cell_set._extruded
     subs = (1,)
-    if ndim > 1:
+    if gdim > 1:
         subs += (3,)
     if extruded:
         subs += ("top",)
-
-    bcs = [DirichletBC(fs, u_exact, sub) for sub in subs]
+    bcs = [DirichletBC(fs, u_bc, sub) for sub in subs]
 
     dirichlet_ids = subs
     if "on_boundary" in dirichlet_ids:
@@ -217,24 +287,30 @@ def test_direct_solver(fs):
 
     ds_Dir = sum(ds_Dir, ds(tuple()))
     ds_Neu = sum(ds_Neu, ds(tuple()))
-    eta = Constant((N+1)**2)
-    h = CellVolume(mesh)/FacetArea(mesh)
-    penalty = eta/h
+    n = FacetNormal(mesh)
+    h = CellVolume(mesh) / FacetArea(mesh)
+    eta = Constant((degree+1)**2)
+    penalty = eta / h
 
-    outer_jump = lambda w, n: outer(w('+'), n('+')) + outer(w('-'), n('-'))
-    num_flux = lambda w: alpha(avg(penalty/2) * outer_jump(w, n))
-    num_flux_b = lambda w: alpha((penalty/2) * outer(w, n))
+    num_flux = lambda u: avg(penalty) * avg(outer(u, n))
+    num_flux_b = lambda u: (penalty/2) * outer(u, n)
+    a_int = lambda v, u: inner(2 * avg(outer(v, n)), alpha(num_flux(u) - avg(grad(u))))
+    a_Dir = lambda v, u: inner(outer(v, n), alpha(num_flux_b(u) - grad(u)))
 
-    a = (inner(v, dot(beta, u)) * dxq
-         + inner(grad(v), alpha(grad(u))) * dxq
-         + inner(outer_jump(v, n), num_flux(u)-avg(alpha(grad(u)))) * dS_int
-         + inner(outer_jump(u, n), num_flux(v)-avg(alpha(grad(v)))) * dS_int
-         + inner(outer(v, n), num_flux_b(u)-alpha(grad(u))) * ds_Dir
-         + inner(outer(u, n), num_flux_b(v)-alpha(grad(v))) * ds_Dir)
+    v = TestFunction(fs)
+    u = TrialFunction(fs)
+    a = ((inner(v, dot(beta, u)) + inner(grad(v), alpha(grad(u)))) * dxq
+         + (a_int(v, u) + a_int(u, v)) * dS_int
+         + (a_Dir(v, u) + a_Dir(u, v)) * ds_Dir)
 
-    L = (inner(v, B)*dxq
-         + inner(v, T)*ds_Neu
-         + inner(outer(u_exact, n), 2*num_flux_b(v)-alpha(grad(v))) * ds_Dir)
+    if homogenize:
+        L = 0
+    else:
+        f_exact = alpha(grad(u_exact))
+        B = dot(beta, u_exact) - div(f_exact)
+        T = dot(f_exact, n)
+        L = (inner(v, B)*dxq + inner(v, T)*ds_Neu
+             + inner(outer(u_exact, n), alpha(2*num_flux_b(v) - grad(v))) * ds_Dir)
 
     problem = LinearVariationalProblem(a, L, uh, bcs=bcs)
     solver = LinearVariationalSolver(problem, solver_parameters={
@@ -246,7 +322,7 @@ def test_direct_solver(fs):
         "ksp_monitor": None,
         "ksp_norm_type": "unpreconditioned",
         "pc_type": "python",
-        "pc_python_type": "firedrake.FDMPC",
+        "pc_python_type": "firedrake.PoissonFDMPC",
         "fdm_pc_type": "cholesky",
         "fdm_pc_factor_mat_solver_type": "mumps",
         "fdm_pc_factor_mat_ordering_type": "nd",
@@ -254,4 +330,8 @@ def test_direct_solver(fs):
     solver.solve()
 
     assert solver.snes.ksp.getIterationNumber() == 1
-    assert norm(u_exact-uh, "H1") < 1.0E-8
+    if homogenize:
+        with uh.dat.vec_ro as uvec:
+            assert uvec.norm() < 1E-8
+    else:
+        assert norm(u_exact-uh, "H1") < 1.0E-8