diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml
index 08f36d4c9..13303e6cb 100644
--- a/.pylintrc-local.yml
+++ b/.pylintrc-local.yml
@@ -1,14 +1,6 @@
 - arg: ignore
   val:
     - mappers
-    - gas_dynamics
-    - burgers.py
-    - diffusion.py
-    - dt_finding.py
-    - nd_calculus.py
-    - pml.py
-    - poisson.py
-    - second_order.py
 - arg: ignored-modules
   val:
   - sympy
diff --git a/doc/conf.py b/doc/conf.py
index 0cd5ba6d6..1ff3ca070 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -35,6 +35,7 @@ def get_version():
     "https://documen.tician.de/arraycontext/": None,
     "https://documen.tician.de/meshmode/": None,
     "https://documen.tician.de/loopy/": None,
+    "https://mpi4py.readthedocs.io/en/stable": None,
     }
 
 # index-page demo uses pyopencl via plot_directive
diff --git a/examples/advection/var-velocity.py b/examples/advection/var-velocity.py
index de1b45354..fdf2bd9ed 100644
--- a/examples/advection/var-velocity.py
+++ b/examples/advection/var-velocity.py
@@ -31,6 +31,7 @@
 
 from grudge.array_context import PyOpenCLArrayContext
 
+from grudge.grudge_array_context import GrudgeArrayContext
 from meshmode.dof_array import flatten
 from meshmode.mesh import BTAG_ALL
 
@@ -100,6 +101,7 @@ def main(ctx_factory, dim=2, order=4, use_quad=False, visualize=False,
         flux_type="upwind"):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+    #actx = GrudgeArrayContext(queue)
     actx = PyOpenCLArrayContext(
         queue,
         allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
diff --git a/examples/advection/weak.py b/examples/advection/weak.py
index 3470fdd60..2c5a1333f 100644
--- a/examples/advection/weak.py
+++ b/examples/advection/weak.py
@@ -99,7 +99,7 @@ def __call__(self, evt, basename, overwrite=True):
 def main(ctx_factory, dim=2, order=4, visualize=False):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
-    actx = PyOpenCLArrayContext(
+    actx = GrudgeArrayContext(
         queue,
         allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
         force_device_scalars=True,
diff --git a/examples/geometry.py b/examples/geometry.py
index 442bbcfff..dfa39e048 100644
--- a/examples/geometry.py
+++ b/examples/geometry.py
@@ -32,12 +32,14 @@
 
 from grudge.array_context import PyOpenCLArrayContext
 
+from grudge.grudge_array_context import GrudgeArrayContext
 from grudge import DiscretizationCollection, shortcuts
 
 
 def main(write_output=True):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
+    #actx = GrudgeArrayContext(queue)
     actx = PyOpenCLArrayContext(
         queue,
         allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
diff --git a/examples/hello-grudge.py b/examples/hello-grudge.py
index cfb724115..017430d86 100644
--- a/examples/hello-grudge.py
+++ b/examples/hello-grudge.py
@@ -14,7 +14,7 @@
 import grudge.op as op
 from meshmode.mesh.generation import generate_box_mesh
 from meshmode.array_context import PyOpenCLArrayContext
-from grudge.dof_desc import DTAG_BOUNDARY, FACE_RESTR_INTERIOR
+from grudge.dof_desc import BoundaryDomainTag, FACE_RESTR_INTERIOR
 
 
 ctx = cl.create_some_context()
@@ -51,8 +51,8 @@ def flux(dcoll, u_tpair):
 
 
 vol_discr = dcoll.discr_from_dd("vol")
-left_bndry = DTAG_BOUNDARY("left")
-right_bndry = DTAG_BOUNDARY("right")
+left_bndry = BoundaryDomainTag("left")
+right_bndry = BoundaryDomainTag("right")
 
 x_vol = actx.thaw(dcoll.nodes())
 x_bndry = actx.thaw(dcoll.discr_from_dd(left_bndry).nodes())
diff --git a/examples/maxwell/cavities.py b/examples/maxwell/cavities.py
index 3d581c18a..23870121e 100644
--- a/examples/maxwell/cavities.py
+++ b/examples/maxwell/cavities.py
@@ -44,6 +44,7 @@
 def main(ctx_factory, dim=3, order=4, visualize=False):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
+
     actx = PyOpenCLArrayContext(
         queue,
         allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
diff --git a/examples/wave/hjson/diff_2_axis.hjson b/examples/wave/hjson/diff_2_axis.hjson
new file mode 100644
index 000000000..7f600edf8
--- /dev/null
+++ b/examples/wave/hjson/diff_2_axis.hjson
@@ -0,0 +1,317 @@
+{
+  72a3ce98-5d21-48bf-b402-6ee96bafd1b6:
+  {
+    FP64:
+    {
+      10:
+      [
+        [
+          tag_inames
+          [
+            imatrix: ilp
+          ]
+        ]
+        [
+          split_iname
+          [
+            iel
+            176
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            16
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            10
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            10
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            10
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      6:
+      [
+        [
+          tag_inames
+          [
+            imatrix: ilp
+          ]
+        ]
+        [
+          split_iname
+          [
+            iel
+            320
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            6
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            2
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            6
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      15:
+      [
+        [
+          tag_inames
+          [
+            imatrix: ilp
+          ]
+        ]
+        [
+          split_iname
+          [
+            iel
+            32
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            15
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            15
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            1
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/examples/wave/hjson/diff_3_axis.hjson b/examples/wave/hjson/diff_3_axis.hjson
new file mode 100644
index 000000000..3069f2033
--- /dev/null
+++ b/examples/wave/hjson/diff_3_axis.hjson
@@ -0,0 +1,317 @@
+{
+  72a3ce98-5d21-48bf-b402-6ee96bafd1b6:
+  {
+    FP64:
+    {
+      20:
+      [
+        [
+          tag_inames
+          [
+            imatrix: ilp
+          ]
+        ]
+        [
+          split_iname
+          [
+            iel
+            80
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            16
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            20
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            20
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            2
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      10:
+      [
+        [
+          tag_inames
+          [
+            imatrix: ilp
+          ]
+        ]
+        [
+          split_iname
+          [
+            iel
+            128
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            16
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            10
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            5
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            10
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      35:
+      [
+        [
+          tag_inames
+          [
+            imatrix: ilp
+          ]
+        ]
+        [
+          split_iname
+          [
+            iel
+            32
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            35
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            7
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            1
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/examples/wave/hjson/elwise_linear.hjson b/examples/wave/hjson/elwise_linear.hjson
new file mode 100644
index 000000000..b48537c2a
--- /dev/null
+++ b/examples/wave/hjson/elwise_linear.hjson
@@ -0,0 +1,299 @@
+{
+  72a3ce98-5d21-48bf-b402-6ee96bafd1b6:
+  {
+    FP64:
+    {
+      20:
+      [
+        [
+          split_iname
+          [
+            iel
+            16
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            8
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            4
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            4
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            2
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      10:
+      [
+        [
+          split_iname
+          [
+            iel
+            32
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            10
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            2
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            1
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      35:
+      [
+        [
+          split_iname
+          [
+            iel
+            32
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            35
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            7
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            1
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/examples/wave/hjson/face_mass.hjson b/examples/wave/hjson/face_mass.hjson
new file mode 100644
index 000000000..06a0186ce
--- /dev/null
+++ b/examples/wave/hjson/face_mass.hjson
@@ -0,0 +1,299 @@
+{
+  72a3ce98-5d21-48bf-b402-6ee96bafd1b6:
+  {
+    FP64:
+    {
+      20:
+      [
+        [
+          split_iname
+          [
+            iel
+            128
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            16
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            20
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            20
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            f,j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            N1,N0,N2
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            1
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      10:
+      [
+        [
+          split_iname
+          [
+            iel
+            32
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            10
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            10
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            f,j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            N1,N0,N2
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            6
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+      35:
+      [
+        [
+          split_iname
+          [
+            iel
+            32
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            7
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            7
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            vec
+            f,j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            N1,N0,N2
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            1
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/examples/wave/hjson/nodes.hjson b/examples/wave/hjson/nodes.hjson
new file mode 100644
index 000000000..8c2e180c3
--- /dev/null
+++ b/examples/wave/hjson/nodes.hjson
@@ -0,0 +1,106 @@
+{
+  72a3ce98-5d21-48bf-b402-6ee96bafd1b6:
+  {
+    FP64:
+    {
+      35: []
+      15:
+      [
+        [
+          split_iname
+          [
+            iel
+            32
+          ]
+          {
+            outer_tag: g.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            iel_inner
+            32
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.0
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof
+            15
+          ]
+          {
+            outer_tag: g.1
+            slabs:
+            [
+              0
+              0
+            ]
+          }
+        ]
+        [
+          split_iname
+          [
+            idof_inner
+            15
+          ]
+          {
+            outer_tag: ilp
+            inner_tag: l.1
+            slabs:
+            [
+              0
+              1
+            ]
+          }
+        ]
+        [
+          add_prefetch
+          [
+            nodes
+            j,iel_inner_outer,iel_inner_inner
+          ]
+          {
+            temporary_name: vecf
+            default_tag: l.auto
+          }
+        ]
+        [
+          tag_array_axes
+          [
+            vecf
+            f,f
+          ]
+        ]
+        [
+          split_iname
+          [
+            j
+            3
+          ]
+          {
+            outer_tag: for
+            inner_tag: for
+          }
+        ]
+        [
+          add_inames_for_unused_hw_axes
+        ]
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/examples/wave/var-propagation-speed.py b/examples/wave/var-propagation-speed.py
index 9929f6dbf..0e48a7405 100644
--- a/examples/wave/var-propagation-speed.py
+++ b/examples/wave/var-propagation-speed.py
@@ -35,7 +35,8 @@
 
 from pytools.obj_array import flat_obj_array
 
+from grudge.grudge_array_context import GrudgeArrayContext
 import grudge.op as op
 
 import logging
 logger = logging.getLogger(__name__)
@@ -44,12 +45,12 @@
 def main(ctx_factory, dim=2, order=4, visualize=False):
     cl_ctx = ctx_factory()
     queue = cl.CommandQueue(cl_ctx)
-    actx = PyOpenCLArrayContext(
+    actx = GrudgeArrayContext(
         queue,
         allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
         force_device_scalars=True,
     )
 
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dim,
diff --git a/examples/wave/wave-min-mpi.py b/examples/wave/wave-min-mpi.py
index 6c56353bd..1afcef10e 100644
--- a/examples/wave/wave-min-mpi.py
+++ b/examples/wave/wave-min-mpi.py
@@ -26,6 +26,7 @@
 
 import numpy as np
 import pyopencl as cl
+from grudge.grudge_array_context import GrudgeArrayContext
 import pyopencl.tools as cl_tools
 
 from grudge.array_context import MPIPyOpenCLArrayContext
@@ -43,10 +44,6 @@
 logger = logging.getLogger(__name__)
 
 
-class WaveTag:
-    pass
-
-
 def main(ctx_factory, dim=2, order=4, visualize=False):
     comm = MPI.COMM_WORLD
     num_parts = comm.Get_size()
diff --git a/examples/wave/wave-op-mpi.py b/examples/wave/wave-op-mpi.py
index 8c23336d0..c84ce2e5f 100644
--- a/examples/wave/wave-op-mpi.py
+++ b/examples/wave/wave-op-mpi.py
@@ -29,6 +29,9 @@
 import pyopencl as cl
 import pyopencl.tools as cl_tools
 
+from grudge.array_context import PyOpenCLArrayContext, PytatoPyOpenCLArrayContext
+from grudge.grudge_array_context import (AutotuningArrayContext, 
+    GrudgeArrayContext, ParameterFixingPyOpenCLArrayContext)
 from arraycontext import (
     with_container_arithmetic,
     dataclass_array_container
@@ -181,7 +184,22 @@ def bump(actx, dcoll, t=0):
 def main(ctx_factory, dim=2, order=3,
          visualize=False, lazy=False, use_quad=False, use_nonaffine_mesh=False):
     cl_ctx = ctx_factory()
-    queue = cl.CommandQueue(cl_ctx)
+    queue = cl.CommandQueue(cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    if lazy:
+        actx = PytatoPyOpenCLArrayContext(queue)
+    else:
+        #actx = ParameterFixingPyOpenCLArrayContext(
+        actx = AutotuningArrayContext(
+        #actx = GrudgeArrayContext(
+            queue,
+            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
+            force_device_scalars=True,
+        )
+        #actx = PyOpenCLArrayContext(
+        #    queue,
+        #    allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
+        #    force_device_scalars=True,
+        #)
 
     comm = MPI.COMM_WORLD
     num_parts = comm.Get_size()
@@ -198,7 +216,9 @@ def main(ctx_factory, dim=2, order=3,
     from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
     mesh_dist = MPIMeshDistributor(comm)
 
-    nel_1d = 16
+    order=2
+    dim = 3
+    nel_1d = 2**5
 
     if mesh_dist.is_mananger_rank():
         if use_nonaffine_mesh:
@@ -271,6 +291,8 @@ def rhs(t, w):
     t = 0
     t_final = 3
     istep = 0
+    end_step = 10
+
     while t < t_final:
         start = time.time()
 
diff --git a/examples/wave/wave-op-mpi.py.old b/examples/wave/wave-op-mpi.py.old
new file mode 100644
index 000000000..ad024ec8c
--- /dev/null
+++ b/examples/wave/wave-op-mpi.py.old
@@ -0,0 +1,251 @@
+"""Minimal example of a grudge driver."""
+
+__copyright__ = """
+Copyright (C) 2020 Andreas Kloeckner
+Copyright (C) 2021 University of Illinois Board of Trustees
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import numpy.linalg as la  # noqa
+import pyopencl as cl
+import pyopencl.tools as cl_tools
+
+from arraycontext import thaw
+from grudge.array_context import PyOpenCLArrayContext
+
+from grudge.grudge_array_context import GrudgeArrayContext
+from pytools.obj_array import flat_obj_array
+
+from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
+
+from grudge.discretization import DiscretizationCollection
+from grudge.shortcuts import make_visualizer
+
+import grudge.op as op
+
+import logging
+logger = logging.getLogger(__name__)
+
+from mpi4py import MPI
+
+
+# {{{ wave equation bits
+
+def wave_flux(dcoll, c, w_tpair):
+    """Return the upwind weak-form wave flux of *w_tpair* on ``"all_faces"``.
+
+    Entry 0 of *w_tpair* is the scalar ``u``; the remaining entries are ``v``.
+    """
+    u_tp, v_tp = w_tpair[0], w_tpair[1:]
+    actx = u_tp.int.array_context
+    nhat = thaw(dcoll.normal(w_tpair.dd), actx)
+
+    # central part built from the face averages
+    central = flat_obj_array(np.dot(v_tp.avg, nhat), nhat*u_tp.avg)
+
+    # upwind penalty built from the exterior-minus-interior jumps
+    jump_v = np.dot(nhat, v_tp.ext-v_tp.int)
+    penalty = flat_obj_array(
+            0.5*(u_tp.ext-u_tp.int), 0.5*nhat*jump_v)
+
+    total = central + penalty
+    return op.project(dcoll, w_tpair.dd, "all_faces", c*total)
+
+
+def wave_operator(dcoll, c, w):
+    """Evaluate the wave right-hand side for the state *w* with speed *c*.
+
+    ``w[0]`` is the scalar component and ``w[1:]`` the vector component;
+    on :data:`BTAG_ALL` the exterior state is ``(-u, v)``.
+    """
+    u, v = w[0], w[1:]
+
+    # boundary traces: the interior state and its exterior partner
+    bdry_u = op.project(dcoll, "vol", BTAG_ALL, u)
+    bdry_v = op.project(dcoll, "vol", BTAG_ALL, v)
+    interior_state = flat_obj_array(bdry_u, bdry_v)
+    exterior_state = flat_obj_array(-bdry_u, bdry_v)
+
+    # volume terms
+    volume_part = flat_obj_array(
+        -c*op.weak_local_div(dcoll, v),
+        -c*op.weak_local_grad(dcoll, u))
+
+    # surface terms: boundary flux first, then the interior-face fluxes
+    bdry_tpair = op.bdry_trace_pair(
+        dcoll, BTAG_ALL, interior=interior_state, exterior=exterior_state)
+    bdry_flux = wave_flux(dcoll, c=c, w_tpair=bdry_tpair)
+    interior_flux = sum(
+        wave_flux(dcoll, c=c, w_tpair=tpair)
+        for tpair in op.interior_trace_pairs(dcoll, w))
+    surface_part = op.face_mass(dcoll, bdry_flux + interior_flux)
+
+    return op.inverse_mass(
+        dcoll,
+        volume_part + surface_part)
+
+# }}}
+
+
+def rk4_step(y, t, h, f):
+    """Advance *y* by one classical RK4 step of size *h* using RHS *f(t, y)*."""
+    t_mid, s1 = t+h/2, f(t, y)
+    s2 = f(t_mid, y + h/2*s1)
+    s3, t_end = f(t_mid, y + h/2*s2), t+h
+    return y + h/6*(s1 + 2*s2 + 2*s3 + f(t_end, y + h*s3))
+
+
+def estimate_rk4_timestep(actx, dcoll, c):
+    """Return the ``"vol"`` nodal minimum of the lengthscales divided by *c*."""
+    from grudge.dt_utils import characteristic_lengthscales
+
+    return op.nodal_min(
+        dcoll, "vol", characteristic_lengthscales(actx, dcoll) / c)
+
+
+def bump(actx, dcoll, t=0):
+    """Return a Gaussian bump on the nodes of *dcoll*, scaled by ``cos(3*t)``.
+
+    The bump has width 0.05 and is centered at ``(0.2, 0.35, 0.1)``
+    truncated to ``dcoll.dim`` entries.
+    """
+    omega, width = 3, 0.05
+    center = np.array([0.2, 0.35, 0.1])[:dcoll.dim]
+
+    x = thaw(dcoll.nodes(), actx)
+    offsets = flat_obj_array(
+        [x[axis] - center[axis] for axis in range(dcoll.dim)])
+
+    dist_sq = np.dot(offsets, offsets)
+    amplitude = np.cos(omega*t)
+    return amplitude * actx.np.exp(-dist_sq / width**2)
+
+
+def main(ctx_factory, dim=2, order=3, visualize=False):
+    """Run the MPI wave-equation example until ``t`` reaches ``t_final``."""
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+    actx = PyOpenCLArrayContext(
+        queue,
+        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
+        force_device_scalars=True,
+    )
+
+    comm = MPI.COMM_WORLD
+    num_parts = comm.Get_size()
+
+    from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
+    mesh_dist = MPIMeshDistributor(comm)
+
+    nel_1d = 16
+    # The manager rank builds, partitions and sends the mesh; others receive.
+    if mesh_dist.is_mananger_rank():
+        from meshmode.mesh.generation import generate_regular_rect_mesh
+        mesh = generate_regular_rect_mesh(
+                a=(-0.5,)*dim,
+                b=(0.5,)*dim,
+                nelements_per_axis=(nel_1d,)*dim)
+
+        logger.info("%d elements", mesh.nelements)
+
+        part_per_element = get_partition_by_pymetis(mesh, num_parts)
+
+        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)
+
+        del mesh
+
+    else:
+        local_mesh = mesh_dist.receive_mesh_part()
+
+    dcoll = DiscretizationCollection(actx, local_mesh, order=order,
+                    mpi_communicator=comm)
+
+    fields = flat_obj_array(
+            bump(actx, dcoll),
+            [dcoll.zeros(actx) for i in range(dcoll.dim)]
+            )
+
+    c = 1
+    dt = 0.45 * estimate_rk4_timestep(actx, dcoll, c)
+
+    vis = make_visualizer(dcoll)
+
+    def rhs(t, w):
+        return wave_operator(dcoll, c=c, w=w)
+
+    if comm.rank == 0:
+        logger.info("dt = %g", dt)
+
+    t = 0
+    t_final = 3
+    istep = 0
+    while t < t_final:
+        fields = rk4_step(fields, t, dt, rhs)
+
+        l2norm = op.norm(dcoll, fields[0], 2)
+        # Every 10th step: more norms, a log line on rank 0, optional VTK output.
+        if istep % 10 == 0:
+            linfnorm = op.norm(dcoll, fields[0], np.inf)
+            nodalmax = op.nodal_max(dcoll, "vol", fields[0])
+            nodalmin = op.nodal_min(dcoll, "vol", fields[0])
+            if comm.rank == 0:
+                logger.info(f"step: {istep} t: {t} "
+                            f"L2: {l2norm} "
+                            f"Linf: {linfnorm} "
+                            f"sol max: {nodalmax} "
+                            f"sol min: {nodalmin}")
+            if visualize:
+                vis.write_parallel_vtk_file(
+                    comm,
+                    f"fld-wave-eager-mpi-{{rank:03d}}-{istep:04d}.vtu",
+                    [
+                        ("u", fields[0]),
+                        ("v", fields[1:]),
+                    ]
+                )
+
+        t += dt
+        istep += 1
+
+        # NOTE: This check is here to ensure the solution stays bounded over
+        # the time interval specified; it runs after every step.
+        assert l2norm < 1
+
+
+if __name__ == "__main__":
+    # Script entry point: read the command-line options, then run the driver.
+    import argparse
+
+    cli = argparse.ArgumentParser()
+    for flag, default in (("--dim", 2), ("--order", 3)):
+        cli.add_argument(flag, default=default, type=int)
+    cli.add_argument("--visualize", action="store_true")
+    options = cli.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+    main(
+        cl.create_some_context,
+        dim=options.dim, order=options.order, visualize=options.visualize)
+
+# vim: foldmethod=marker
diff --git a/examples/wave/wave-op-var-velocity.py b/examples/wave/wave-op-var-velocity.py
index 43c72eff9..3b7915934 100644
--- a/examples/wave/wave-op-var-velocity.py
+++ b/examples/wave/wave-op-var-velocity.py
@@ -31,6 +31,7 @@
 
 from grudge.array_context import PyOpenCLArrayContext
 
+from grudge.grudge_array_context import GrudgeArrayContext, AutoTuningArrayContext
 from pytools.obj_array import flat_obj_array
 
 from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
@@ -150,8 +151,8 @@ def bump(actx, dcoll, t=0, width=0.05, center=None):
 
 def main(ctx_factory, dim=2, order=3, visualize=False):
     cl_ctx = ctx_factory()
-    queue = cl.CommandQueue(cl_ctx)
-    actx = PyOpenCLArrayContext(
+    queue = cl.CommandQueue(cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    actx = GrudgeArrayContext(
         queue,
         allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
         force_device_scalars=True,
diff --git a/examples/wave/wave-op.py b/examples/wave/wave-op.py
new file mode 100644
index 000000000..43622cac0
--- /dev/null
+++ b/examples/wave/wave-op.py
@@ -0,0 +1,402 @@
+__copyright__ = "Copyright (C) 2020 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import numpy as np
+import numpy.linalg as la  # noqa
+import pyopencl as cl
+
+from pytools.obj_array import flat_obj_array
+
+from grudge.grudge_array_context import GrudgeArrayContext, AutoTuningArrayContext
+from meshmode.array_context import PyOpenCLArrayContext  # noqa F401
+from meshmode.dof_array import thaw
+
+from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa
+
+from grudge.discretization import DiscretizationCollection
+import grudge.op as op
+from grudge.shortcuts import make_visualizer
+from grudge.symbolic.primitives import TracePair
+from time import time
+
+# {{{ wave equation bits
+
+def wave_flux(dcoll, c, w_tpair):
+    """Return the upwind weak-form flux of *w_tpair*, projected to all faces.
+
+    Entry 0 of *w_tpair* is the scalar ``u``; the remaining entries are ``v``.
+    """
+    u_tp, v_tp = w_tpair[0], w_tpair[1:]
+    nhat = thaw(u_tp.int.array_context, op.normal(dcoll, w_tpair.dd))
+
+    # central contribution from the face averages
+    central = flat_obj_array(np.dot(v_tp.avg, nhat), nhat*u_tp.avg)
+
+    # upwind penalty from the exterior-minus-interior jumps
+    penalty = flat_obj_array(
+            0.5*(u_tp.ext-u_tp.int),
+            0.5*nhat*np.dot(nhat, v_tp.ext-v_tp.int))
+
+    flux = central + penalty
+    return op.project(dcoll, w_tpair.dd, "all_faces", c*flux)
+
+#'''
+def wave_operator(discr, c, w):
+    """Evaluate the wave right-hand side for state *w* with speed *c*.
+
+    Unrolled form of the compact expression kept in the "Original version"
+    string below: each intermediate is named and released with ``del``
+    once it is no longer needed. Afterwards the shapes and an estimated
+    total size (in GB, at 8 bytes per entry) of the arrays held in
+    ``discr._discr_scoped_subexpr_name_to_value`` are printed. On a
+    :class:`pyopencl.MemoryError`, ``Array.alloc_dict`` is printed and the
+    process exits.
+    """
+    from pyopencl import MemoryError
+    from pyopencl.array import Array
+    try:
+        u = w[0]
+        v = w[1:]
+
+        dir_u = op.project(discr, "vol", BTAG_ALL, u)
+        dir_v = op.project(discr, "vol", BTAG_ALL, v)
+        dir_bval = flat_obj_array(dir_u, dir_v)
+        neg_dir_u = -dir_u; del dir_u
+        dir_bc = flat_obj_array(neg_dir_u, dir_v)
+        div = op.weak_local_div(discr,v)
+
+        neg_c_div = (-c)*div; del div
+
+        grad = op.weak_local_grad(discr,u)
+
+        neg_c_grad = (-c)*grad; del grad
+        obj_array = flat_obj_array(neg_c_div, neg_c_grad)
+
+        trace_pair1 = op.interior_trace_pair(discr, w)
+        wave_flux1 = wave_flux(discr, c=c, w_tpair=trace_pair1)
+        del trace_pair1
+
+        trace_pair2 = TracePair(BTAG_ALL, interior=dir_bval, exterior=dir_bc)
+        wave_flux2 = wave_flux(discr, c=c, w_tpair=trace_pair2)
+        del trace_pair2
+        del dir_bc
+        del neg_dir_u
+        del dir_v
+        del dir_bval
+
+        wave_flux_sum = wave_flux1 + wave_flux2
+        """
+        print("####################")
+        print(type(wave_flux_sum))
+        for entry in wave_flux_sum:
+            print(type(entry))
+            print(entry._data.shape)
+        """
+
+        del wave_flux1
+        del wave_flux2
+
+        face_mass = op.face_mass(discr, wave_flux_sum)
+        del wave_flux_sum
+
+        inverse_arg = obj_array + face_mass
+        """
+        print("@@@@@@@@@@@@@@@@@@@@@")
+        print(type(inverse_arg))
+        for entry in inverse_arg:
+            print(type(entry))
+            print(type(entry._data))
+            print(len(entry._data))
+            print(entry._data[0].shape)
+        exit()
+        """
+
+        del obj_array
+        del face_mass
+        del neg_c_div
+        del neg_c_grad
+
+        result = op.inverse_mass(discr,inverse_arg)
+        del inverse_arg
+
+        """
+        # Original version
+        dir_u = discr.project("vol", BTAG_ALL, u)
+        dir_v = discr.project("vol", BTAG_ALL, v)
+        dir_bval = flat_obj_array(dir_u, dir_v)
+        dir_bc = flat_obj_array(-dir_u, dir_v)
+     
+        return (
+                discr.inverse_mass(
+                    flat_obj_array(
+                        -c*discr.weak_div(v),
+                        -c*discr.weak_grad(u)
+                        )
+                    +  # noqa: W504
+                    discr.face_mass(
+                        wave_flux(discr, c=c, w_tpair=op.interior_trace_pair(discr, w))
+                        + wave_flux(discr, c=c, w_tpair=TracePair(
+                            BTAG_ALL, interior=dir_bval, exterior=dir_bc))
+                        ))
+                    )
+        """
+
+        # Report what the discretization collection still holds on to.
+        scoped = discr._discr_scoped_subexpr_name_to_value
+        print(len(scoped.items()))
+        print(scoped.keys())
+        total_bytes = 0
+        for value in scoped.values():
+            if isinstance(value._data, tuple):
+                for entry in value._data:
+                    print(entry.shape)
+                    total_bytes += entry.shape[0]*entry.shape[1]*8
+            else:
+                print(value._data.shape)
+                total_bytes += value._data.shape[0]*value._data.shape[1]*8
+        print(total_bytes / 1e9)
+
+    except MemoryError:
+        for key, value in Array.alloc_dict.items():
+            print("{} {}".format(key, value[1]/1e9))
+            for entry in value[0]:
+                print(entry)
+            print()
+        exit()
+
+
+    return (result)
+#'''
+
+"""
+def wave_operator(dcoll, c, w):
+    u = w[0]
+    v = w[1:]
+
+    dir_u = op.project(dcoll, "vol", BTAG_ALL, u)
+    dir_v = op.project(dcoll, "vol", BTAG_ALL, v)
+    dir_bval = flat_obj_array(dir_u, dir_v)
+    dir_bc = flat_obj_array(-dir_u, dir_v)
+
+    return (
+            op.inverse_mass(dcoll,
+                flat_obj_array(
+                    -c*op.weak_local_div(dcoll, v),
+                    -c*op.weak_local_grad(dcoll, u)
+                    )
+                +  # noqa: W504
+                op.face_mass(dcoll,
+                    wave_flux(dcoll, c=c, w_tpair=op.interior_trace_pair(dcoll, w))
+                    + wave_flux(dcoll, c=c, w_tpair=TracePair(
+                        BTAG_ALL, interior=dir_bval, exterior=dir_bc))
+                    ))
+                )
+"""
+# }}}
+
+
+def rk4_step(y, t, h, f):
+    """Advance *y* from time *t* by one classical RK4 step of size *h*.
+
+    *f(t, y)* evaluates the right-hand side. The stage slopes are
+    accumulated into a running weighted sum and released with ``del`` as
+    soon as they are no longer needed.
+    """
+    k1 = f(t, y)
+    k_sum = k1
+    y_stage = y + (h/2)*k1
+    del k1
+
+    # Stage 2 is evaluated at y + h/2*k1; y must enter the stage state
+    # only once.
+    k2 = f(t+h/2, y_stage)
+    k_sum = k_sum + 2*k2
+    y_stage = y + (h/2)*k2
+    del k2
+
+    k3 = f(t+h/2, y_stage)
+    k_sum = k_sum + 2*k3
+    y_stage = y + h*k3
+    del k3
+
+    k4 = f(t+h, y_stage)
+    del y_stage
+    k_sum = k_sum + k4
+    del k4
+
+    # k_sum == k1 + 2*k2 + 2*k3 + k4
+    return y + (h/6)*k_sum
+
+
+def bump(actx, dcoll, t=0):
+    """Evaluate ``cos(3*t) * exp(-|x - x0|**2 / 0.05**2)`` on the nodes.
+
+    ``x0`` is ``(0.2, 0.35, 0.1)`` cut down to ``dcoll.dim`` components.
+    """
+    x0 = np.array([0.2, 0.35, 0.1])[:dcoll.dim]
+    width = 0.05
+    omega = 3
+
+    x = thaw(actx, dcoll.nodes())
+    delta = flat_obj_array([x[d] - x0[d] for d in range(dcoll.dim)])
+
+    exponent = -np.dot(delta, delta) / width**2
+    time_factor = np.cos(omega*t)
+
+    return time_factor * actx.np.exp(exponent)
+
+
+def main():
+    """Time 21 steps of the wave example and print the per-point step time."""
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    from pyopencl.tools import ImmediateAllocator
+    actx = AutoTuningArrayContext(queue, allocator=ImmediateAllocator(queue))
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+
+    dim = 3
+    order = 2
+
+    #nel_1d = 2**5
+    #mesh = generate_regular_rect_mesh(
+    #        coord_dtype=np.float64,
+    #        a=(-0.5,)*dim,
+    #        b=(0.5,)*dim,
+    #        nelements_per_axis=(nel_1d,)*dim)
+    #print(mesh.nelements)
+
+    #target_num_points = 11010048
+    #target_num_points = 9000000
+    target_num_points = 6000000 # Order fails assertion with more than this
+    # order -> points per element (NOTE(review): looks like 3D simplex counts; confirm)
+    order_points_mapping = {2:10, 3:20, 4:35, 5:56, 6:84, 7:120}
+
+    # Grow nel_1d until the point estimate reaches the target; keep the previous mesh.
+    cur_points = 0
+    cur_points_old = 0
+    nel_1d = 0
+    mesh_old = None
+    mesh = None
+    while cur_points < target_num_points:
+        print(cur_points)
+        nel_1d += 1
+        mesh_old = mesh
+        mesh = generate_regular_rect_mesh(
+                coord_dtype=np.float64,
+                a=(-0.5,)*dim,
+                b=(0.5,)*dim,
+                nelements_per_axis=(nel_1d,)*dim)
+        cur_points_old = cur_points
+        cur_points = order_points_mapping[order]*mesh.nelements
+
+    # Pick whichever of the last two meshes is closer to the target point count
+    if (target_num_points - cur_points_old) < (cur_points - target_num_points):
+        mesh = mesh_old
+        nel_1d -= 1
+
+    print(mesh.nelements)
+    #exit()
+
+    #nel_1d = #2**5 # Order 6 runs out of memory with 2**5
+
+    #for nel_1d in 2**np.arange(6,dtype=np.int32):
+    #from meshmode.mesh.generation import generate_regular_rect_mesh
+    #mesh = generate_regular_rect_mesh(
+    #        coord_dtype=np.float64,
+    #        a=(-0.5,)*dim,
+    #        b=(0.5,)*dim,
+    #        nelements_per_axis=(nel_1d,)*dim)
+
+        #print("%d elements" % mesh.nelements)
+    #print(mesh.nelements*np.array([10,20,35,56,84,120]))
+
+    #exit()
+
+    if dim == 2:
+        # no deep meaning here, just a fudge factor
+        dt = 0.7/(nel_1d*order**2)
+    elif dim == 3:
+        # no deep meaning here, just a fudge factor
+        dt = 0.45/(nel_1d*order**2)
+    else:
+        raise ValueError("don't have a stable time step guesstimate")
+
+
+    dcoll = DiscretizationCollection(actx, mesh, order=order)
+
+    fields = flat_obj_array(
+            bump(actx, dcoll),
+            [dcoll.zeros(actx) for i in range(dcoll.dim)]
+            )
+
+    vis = make_visualizer(dcoll)
+
+    for field in fields:
+        print(field[0][0].shape)
+
+    def rhs(t, w):
+        return wave_operator(dcoll, c=1, w=w)
+
+    t = 0
+    t_final = (21)*dt
+    istep = 0
+    start = time()
+    nsteps = 0
+
+    nelements, ndofs = fields[0][0].shape
+    npts = nelements*ndofs
+    print(npts)
+    # t is recomputed as istep*dt after each step, so the loop takes 21 steps.
+
+    while t < t_final:
+
+        print(f"===========TIME STEP {istep}===========")
+        fields = rk4_step(fields, t, dt, rhs)
+
+        if istep % 100 == 0:
+            print(f"step: {istep} t: {t} L2: {op.norm(dcoll, fields[0], 2)} "
+                  f"sol max: {op.nodal_max(dcoll, 'vol', fields[0])}")
+            vis.write_vtk_file("fld-wave-eager-%04d.vtu" % istep,
+                    [
+                        ("u", fields[0]),
+                        ("v", fields[1:]),
+                        ])
+
+        print(f"===========END TIME STEP {istep}===========")
+        istep += 1
+        t = istep*dt
+        nsteps += 1
+
+        # Should compare against base version at some point
+        #assert op.norm(dcoll, fields[0], 2) < 1
+    end = time()
+    diff = end - start
+    nelements, ndofs = fields[0][0].shape
+    npts = nelements*ndofs
+    time_per_timestep_per_point = diff / nsteps / npts
+    print(f"AVERAGE STEP TIME PER POINT: {time_per_timestep_per_point}")
+
+if __name__ == "__main__":  # run the benchmark only when executed as a script
+    main()
+
+# vim: foldmethod=marker
diff --git a/grudge/__init__.py b/grudge/__init__.py
index aad8dbd1c..fa4a2b3b7 100644
--- a/grudge/__init__.py
+++ b/grudge/__init__.py
@@ -20,8 +20,9 @@
 THE SOFTWARE.
 """
 
-from grudge.discretization import DiscretizationCollection
+from grudge.discretization import (
+        DiscretizationCollection, make_discretization_collection)
 
 __all__ = [
-    "DiscretizationCollection"
+    "DiscretizationCollection", "make_discretization_collection"
 ]
diff --git a/grudge/discretization.py b/grudge/discretization.py
index 43bd24226..9cb9f5a3f 100644
--- a/grudge/discretization.py
+++ b/grudge/discretization.py
@@ -1,7 +1,13 @@
 """
-.. currentmodule:: grudge
 
+.. autoclass:: DiscretizationTag
+
+.. currentmodule:: grudge
 .. autoclass:: DiscretizationCollection
+.. autofunction:: make_discretization_collection
+
+.. currentmodule:: grudge.discretization
+.. autoclass:: PartID
 """
 
 __copyright__ = """
@@ -29,30 +35,174 @@
 THE SOFTWARE.
 """
 
-from pytools import memoize_method
+from typing import Sequence, Mapping, Optional, Union, Tuple, TYPE_CHECKING, Any
+
+from pytools import memoize_method, single_valued
+
+from dataclasses import dataclass, replace
 
 from grudge.dof_desc import (
-    DD_VOLUME,
-    DISCR_TAG_BASE,
-    DISCR_TAG_MODAL,
-    DTAG_BOUNDARY,
-    DOFDesc,
-    as_dofdesc
+        VTAG_ALL,
+        DD_VOLUME_ALL,
+        DISCR_TAG_BASE,
+        DISCR_TAG_MODAL,
+        VolumeDomainTag, BoundaryDomainTag,
+        DOFDesc,
+        VolumeTag, DomainTag,
+        DiscretizationTag,
+        as_dofdesc,
+        ConvertibleToDOFDesc
 )
 
 import numpy as np  # noqa: F401
 
 from arraycontext import ArrayContext
 
+from meshmode.discretization import ElementGroupFactory, Discretization
 from meshmode.discretization.connection import (
     FACE_RESTR_INTERIOR,
     FACE_RESTR_ALL,
-    make_face_restriction
+    make_face_restriction,
+    DiscretizationConnection
 )
 from meshmode.mesh import Mesh, BTAG_PARTITION
+from meshmode.dof_array import DOFArray
 
 from warnings import warn
 
+if TYPE_CHECKING:
+    import mpi4py.MPI
+
+
+@dataclass(frozen=True)
+class PartID:
+    """Immutable identifier for one piece of a partitioned mesh.
+
+    .. attribute:: volume_tag
+
+        The tag of the volume that the part belongs to.
+
+    .. attribute:: rank
+
+        The MPI rank of the part, or *None* (the default) when no rank
+        is given.
+    """
+    volume_tag: VolumeTag
+    rank: Optional[int] = None
+
+
+# {{{ part ID normalization
+
+def _normalize_mesh_part_ids(
+        mesh: Mesh,
+        volume_tags: Sequence[VolumeTag],
+        mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None):
+    """Convert a mesh's configuration-dependent "part ID" into a fixed type.
+
+    Returns a copy of *mesh* in which every inter-part adjacency group
+    carries a :class:`PartID`. What is accepted as input depends on the
+    configuration ("single volume" means :data:`VTAG_ALL` is in
+    *volume_tags*):
+
+    - multiple volumes, with MPI: only :class:`PartID`;
+    - multiple volumes, no MPI: :class:`PartID` or a tag in *volume_tags*;
+    - single volume, with MPI: :class:`PartID` or an integral rank;
+    - single volume, no MPI: nothing, conversion always fails.
+
+    :raises TypeError: if a part identifier cannot be converted.
+    """
+    from numbers import Integral
+    from meshmode.mesh import InterPartAdjacencyGroup
+
+    # Classify the configuration once; the converter branches on it.
+    single_volume = VTAG_ALL in volume_tags
+    distributed = mpi_communicator is not None
+
+    def as_part_id(mesh_part_id):
+        if distributed or not single_volume:
+            if isinstance(mesh_part_id, PartID):
+                return mesh_part_id
+
+            if single_volume:
+                # single volume with MPI: plain ranks are accepted
+                if isinstance(mesh_part_id, Integral):
+                    return PartID(VTAG_ALL, int(mesh_part_id))
+            elif not distributed:
+                # multiple volumes without MPI: volume tags are accepted
+                if mesh_part_id in volume_tags:
+                    return PartID(mesh_part_id)
+
+        raise TypeError(f"Unable to convert {mesh_part_id} to PartID.")
+
+    def normalized(fagrp):
+        # Only inter-part groups carry a part ID; others pass through.
+        if not isinstance(fagrp, InterPartAdjacencyGroup):
+            return fagrp
+
+        part_id = as_part_id(fagrp.part_id)
+        return replace(
+            fagrp,
+            boundary_tag=BTAG_PARTITION(part_id),
+            part_id=part_id,
+        )
+
+    # Rebuild the nested lists group by group, keeping their order.
+    new_facial_adjacency_groups = [
+        [normalized(fagrp) for fagrp in grp_list]
+        for grp_list in mesh.facial_adjacency_groups
+    ]
+
+    return mesh.copy(facial_adjacency_groups=new_facial_adjacency_groups)
+
+# }}}
+
+
+# {{{ discr_tag_to_group_factory normalization
+
+def _normalize_discr_tag_to_group_factory(
+        dim: int,
+        discr_tag_to_group_factory: Optional[
+            Mapping[DiscretizationTag, ElementGroupFactory]],
+        order: Optional[int]
+        ) -> Mapping[DiscretizationTag, ElementGroupFactory]:
+    """Return a complete tag-to-group-factory mapping as a new :class:`dict`.
+
+    If *order* is given, a default simplex group factory of that order is
+    stored under :data:`DISCR_TAG_BASE`. A missing :data:`DISCR_TAG_MODAL`
+    entry is generated from the base factory. The input is not modified.
+
+    :raises TypeError: if both *discr_tag_to_group_factory* and *order*
+        are *None*.
+    :raises ValueError: if *order* is given and *discr_tag_to_group_factory*
+        already has a :data:`DISCR_TAG_BASE` entry.
+    """
+    from meshmode.discretization.poly_element import (
+            default_simplex_group_factory as make_simplex_factory)
+
+    if discr_tag_to_group_factory is None and order is None:
+        raise TypeError(
+            "one of 'order' and 'discr_tag_to_group_factory' must be given")
+
+    factories = ({} if discr_tag_to_group_factory is None
+                 else dict(discr_tag_to_group_factory))
+
+    if order is not None:
+        if DISCR_TAG_BASE in factories:
+            raise ValueError(
+                "if 'order' is given, 'discr_tag_to_group_factory' must "
+                "not have a key of DISCR_TAG_BASE")
+        factories[DISCR_TAG_BASE] = make_simplex_factory(
+            base_dim=dim, order=order)
+
+    # Modal discr should always come from the base discretization
+    if DISCR_TAG_MODAL not in factories:
+        factories[DISCR_TAG_MODAL] = _generate_modal_group_factory(
+            factories[DISCR_TAG_BASE])
+
+    return factories
+
+# }}}
+
 
 class DiscretizationCollection:
     """A collection of discretizations, defined on the same underlying
@@ -60,11 +210,13 @@ class DiscretizationCollection:
     (volume, interior facets, boundaries) and associated element
     groups.
 
-    .. automethod:: __init__
+    .. note::
+
+        Do not call the constructor directly. Use
+        :func:`make_discretization_collection` instead.
 
     .. autoattribute:: dim
     .. autoattribute:: ambient_dim
-    .. autoattribute:: mesh
     .. autoattribute:: real_dtype
     .. autoattribute:: complex_dtype
 
@@ -84,11 +236,16 @@ class DiscretizationCollection:
 
     # {{{ constructor
 
-    def __init__(self, array_context: ArrayContext, mesh: Mesh,
-                 order=None,
-                 discr_tag_to_group_factory=None, mpi_communicator=None,
-                 # FIXME: `quad_tag_to_group_factory` is deprecated
-                 quad_tag_to_group_factory=None):
+    def __init__(self, array_context: ArrayContext,
+            volume_discrs: Union[Mesh, Mapping[VolumeTag, Discretization]],
+            order: Optional[int] = None,
+            discr_tag_to_group_factory: Optional[
+                Mapping[DiscretizationTag, ElementGroupFactory]] = None,
+            mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None,
+            inter_part_connections: Optional[
+                Mapping[Tuple[PartID, PartID],
+                    DiscretizationConnection]] = None,
+            ) -> None:
         """
         :arg discr_tag_to_group_factory: A mapping from discretization tags
             (typically one of: :class:`grudge.dof_desc.DISCR_TAG_BASE`,
@@ -101,63 +258,8 @@ def __init__(self, array_context: ArrayContext, mesh: Mesh,
             discretization.
         """
 
-        if (quad_tag_to_group_factory is not None
-                and discr_tag_to_group_factory is not None):
-            raise ValueError(
-                "Both `quad_tag_to_group_factory` and `discr_tag_to_group_factory` "
-                "are specified. Use `discr_tag_to_group_factory` instead."
-            )
-
-        # FIXME: `quad_tag_to_group_factory` is deprecated
-        if (quad_tag_to_group_factory is not None
-                and discr_tag_to_group_factory is None):
-            warn("`quad_tag_to_group_factory` is a deprecated kwarg and will "
-                 "be dropped in version 2022.x. Use `discr_tag_to_group_factory` "
-                 "instead.",
-                 DeprecationWarning, stacklevel=2)
-            discr_tag_to_group_factory = quad_tag_to_group_factory
-
         self._setup_actx = array_context.clone()
 
-        from meshmode.discretization.poly_element import \
-                default_simplex_group_factory
-
-        if discr_tag_to_group_factory is None:
-            if order is None:
-                raise TypeError(
-                    "one of 'order' and 'discr_tag_to_group_factory' must be given"
-                )
-
-            discr_tag_to_group_factory = {
-                    DISCR_TAG_BASE: default_simplex_group_factory(
-                        base_dim=mesh.dim, order=order)}
-        else:
-            if order is not None:
-                discr_tag_to_group_factory = discr_tag_to_group_factory.copy()
-                if DISCR_TAG_BASE in discr_tag_to_group_factory:
-                    raise ValueError(
-                        "if 'order' is given, 'discr_tag_to_group_factory' must "
-                        "not have a key of DISCR_TAG_BASE"
-                    )
-
-                discr_tag_to_group_factory[DISCR_TAG_BASE] = \
-                        default_simplex_group_factory(base_dim=mesh.dim, order=order)
-
-        # Modal discr should always come from the base discretization
-        discr_tag_to_group_factory[DISCR_TAG_MODAL] = \
-            _generate_modal_group_factory(
-                discr_tag_to_group_factory[DISCR_TAG_BASE]
-            )
-
-        self.discr_tag_to_group_factory = discr_tag_to_group_factory
-
-        from meshmode.discretization import Discretization
-
-        self._volume_discr = Discretization(
-            array_context, mesh,
-            self.group_factory_for_discretization_tag(DISCR_TAG_BASE)
-        )
-
         # {{{ process mpi_communicator argument
 
         if mpi_communicator is not None:
@@ -181,9 +283,60 @@ def __init__(self, array_context: ArrayContext, mesh: Mesh,
 
         # }}}
 
-        self._dist_boundary_connections = \
-                self._set_up_distributed_communication(
-                        mpi_communicator, array_context)
+        from meshmode.discretization import Discretization
+
+        if isinstance(volume_discrs, Mesh):
+            # {{{ deprecated backward compatibility yuck
+
+            warn("Calling the DiscretizationCollection constructor directly "
+                    "is deprecated, call make_discretization_collection "
+                    "instead. This will stop working in 2023.",
+                    DeprecationWarning, stacklevel=2)
+
+            mesh = volume_discrs
+
+            mesh = _normalize_mesh_part_ids(
+                mesh, [VTAG_ALL], mpi_communicator=mpi_communicator)
+
+            discr_tag_to_group_factory = _normalize_discr_tag_to_group_factory(
+                    dim=mesh.dim,
+                    discr_tag_to_group_factory=discr_tag_to_group_factory,
+                    order=order)
+            self._discr_tag_to_group_factory = discr_tag_to_group_factory
+
+            volume_discr = Discretization(
+                        array_context, mesh,
+                        self.group_factory_for_discretization_tag(DISCR_TAG_BASE))
+            volume_discrs = {VTAG_ALL: volume_discr}
+
+            del mesh
+
+            if inter_part_connections is not None:
+                raise TypeError("may not pass inter_part_connections when "
+                        "DiscretizationCollection constructor is called in "
+                        "legacy mode")
+
+            self._inter_part_connections = \
+                    _set_up_inter_part_connections(
+                            array_context=self._setup_actx,
+                            mpi_communicator=mpi_communicator,
+                            volume_discrs=volume_discrs,
+                            base_group_factory=(
+                                discr_tag_to_group_factory[DISCR_TAG_BASE]))
+
+            # }}}
+        else:
+            assert discr_tag_to_group_factory is not None
+            self._discr_tag_to_group_factory = discr_tag_to_group_factory
+
+            if inter_part_connections is None:
+                raise TypeError("inter_part_connections must be passed when "
+                        "DiscretizationCollection constructor is called in "
+                        "'modern' mode")
+
+            self._inter_part_connections = inter_part_connections
+
+        self._volume_discrs = volume_discrs
 
     # }}}
 
@@ -196,16 +349,6 @@ def mpi_communicator(self):
 
         return self._mpi_communicator
 
-    @property
-    def quad_tag_to_group_factory(self):
-        warn("`DiscretizationCollection.quad_tag_to_group_factory` "
-             "is deprecated and will go away in 2022. Use "
-             "`DiscretizationCollection.discr_tag_to_group_factory` "
-             "instead.",
-             DeprecationWarning, stacklevel=2)
-
-        return self.discr_tag_to_group_factory
-
     def get_management_rank_index(self):
         return 0
 
@@ -216,86 +359,12 @@ def is_management_rank(self):
             return self.mpi_communicator.Get_rank() \
                     == self.get_management_rank_index()
 
-    # {{{ distributed
-
-    def _set_up_distributed_communication(self, mpi_communicator, array_context):
-        from_dd = DOFDesc("vol", DISCR_TAG_BASE)
-
-        boundary_connections = {}
-
-        from meshmode.distributed import get_connected_partitions
-        connected_parts = get_connected_partitions(self._volume_discr.mesh)
-
-        if connected_parts:
-            if mpi_communicator is None:
-                raise RuntimeError("must supply an MPI communicator when using a "
-                    "distributed mesh")
-
-            grp_factory = \
-                self.group_factory_for_discretization_tag(DISCR_TAG_BASE)
-
-            local_boundary_connections = {}
-            for i_remote_part in connected_parts:
-                local_boundary_connections[i_remote_part] = self.connection_from_dds(
-                        from_dd, DOFDesc(BTAG_PARTITION(i_remote_part),
-                        DISCR_TAG_BASE))
-
-            from meshmode.distributed import MPIBoundaryCommSetupHelper
-            with MPIBoundaryCommSetupHelper(mpi_communicator, array_context,
-                    local_boundary_connections, grp_factory) as bdry_setup_helper:
-                while True:
-                    conns = bdry_setup_helper.complete_some()
-                    if not conns:
-                        break
-                    for i_remote_part, conn in conns.items():
-                        boundary_connections[i_remote_part] = conn
-
-        return boundary_connections
-
-    def get_distributed_boundary_swap_connection(self, dd):
-        warn("`DiscretizationCollection.get_distributed_boundary_swap_connection` "
-             "is deprecated and will go away in 2022. Use "
-             "`DiscretizationCollection.distributed_boundary_swap_connection` "
-             "instead.",
-             DeprecationWarning, stacklevel=2)
-        return self.distributed_boundary_swap_connection(dd)
-
-    def distributed_boundary_swap_connection(self, dd):
-        """Provides a mapping from the base volume discretization
-        to the exterior boundary restriction on a parallel boundary
-        partition described by *dd*. This connection is used to
-        communicate across element boundaries in different parallel
-        partitions during distributed runs.
-
-        :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value
-            convertible to one. The domain tag must be a subclass
-            of :class:`grudge.dof_desc.DTAG_BOUNDARY` with an
-            associated :class:`meshmode.mesh.BTAG_PARTITION`
-            corresponding to a particular communication rank.
-        """
-        if dd.discretization_tag is not DISCR_TAG_BASE:
-            # FIXME
-            raise NotImplementedError(
-                "Distributed communication with discretization tag "
-                f"{dd.discretization_tag} is not implemented."
-            )
-
-        assert isinstance(dd.domain_tag, DTAG_BOUNDARY)
-        assert isinstance(dd.domain_tag.tag, BTAG_PARTITION)
-
-        return self._dist_boundary_connections[dd.domain_tag.tag.part_nr]
-
-    # }}}
-
     # {{{ discr_from_dd
 
     @memoize_method
-    def discr_from_dd(self, dd):
+    def discr_from_dd(self, dd: "ConvertibleToDOFDesc") -> Discretization:
         """Provides a :class:`meshmode.discretization.Discretization`
         object from *dd*.
-
-        :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value
-            convertible to one.
         """
         dd = as_dofdesc(dd)
 
@@ -305,45 +374,43 @@ def discr_from_dd(self, dd):
             return self._modal_discr(dd.domain_tag)
 
         if dd.is_volume():
-            if discr_tag is not DISCR_TAG_BASE:
-                return self._discr_tag_volume_discr(discr_tag)
-            return self._volume_discr
+            return self._volume_discr_from_dd(dd)
 
         if discr_tag is not DISCR_TAG_BASE:
-            no_quad_discr = self.discr_from_dd(DOFDesc(dd.domain_tag))
+            base_discr = self.discr_from_dd(dd.with_discr_tag(DISCR_TAG_BASE))
 
             from meshmode.discretization import Discretization
             return Discretization(
                 self._setup_actx,
-                no_quad_discr.mesh,
+                base_discr.mesh,
                 self.group_factory_for_discretization_tag(discr_tag)
             )
 
         assert discr_tag is DISCR_TAG_BASE
 
-        if dd.domain_tag is FACE_RESTR_ALL:
-            return self._all_faces_volume_connection().to_discr
-        elif dd.domain_tag is FACE_RESTR_INTERIOR:
-            return self._interior_faces_connection().to_discr
-        elif dd.is_boundary_or_partition_interface():
-            return self._boundary_connection(dd.domain_tag.tag).to_discr
+        if isinstance(dd.domain_tag, BoundaryDomainTag):
+            if dd.domain_tag.tag in [FACE_RESTR_ALL, FACE_RESTR_INTERIOR]:
+                return self._faces_connection(dd.domain_tag).to_discr
+            else:
+                return self._boundary_connection(dd.domain_tag).to_discr
         else:
-            raise ValueError("DOF desc tag not understood: " + str(dd))
+            raise ValueError(f"DOF desc not understood: {dd}")
 
     # }}}
 
     # {{{ _base_to_geoderiv_connection
 
     @memoize_method
-    def _has_affine_groups(self):
+    def _has_affine_groups(self, domain_tag: DomainTag) -> bool:
         from modepy.shapes import Simplex
+        discr = self.discr_from_dd(DOFDesc(domain_tag, DISCR_TAG_BASE))
         return any(
                 megrp.is_affine
                 and issubclass(megrp._modepy_shape_cls, Simplex)
-                for megrp in self._volume_discr.mesh.groups)
+                for megrp in discr.mesh.groups)
 
     @memoize_method
-    def _base_to_geoderiv_connection(self, dd: DOFDesc):
+    def _base_to_geoderiv_connection(self, dd: DOFDesc) -> DiscretizationConnection:
         r"""The "geometry derivatives" discretization for a given *dd* is
         typically identical to the one returned by :meth:`discr_from_dd`,
         however for affinely-mapped simplicial elements, it will use a
@@ -356,7 +423,7 @@ def _base_to_geoderiv_connection(self, dd: DOFDesc):
         :mod:`grudge`.
         """
         base_discr = self.discr_from_dd(dd)
-        if not self._has_affine_groups():
+        if not self._has_affine_groups(dd.domain_tag):
             # no benefit to having another discretization that takes
             # advantage of affine-ness
             from meshmode.discretization.connection import \
@@ -393,7 +460,9 @@ def geo_group_factory(megrp, index):
     # {{{ connection_from_dds
 
     @memoize_method
-    def connection_from_dds(self, from_dd, to_dd):
+    def connection_from_dds(
+            self, from_dd: "ConvertibleToDOFDesc", to_dd: "ConvertibleToDOFDesc"
+            ) -> DiscretizationConnection:
         """Provides a mapping (connection) from one discretization to
         another, e.g. from the volume to the boundary, or from the
         base to the an overintegrated quadrature discretization, or from
@@ -425,12 +494,15 @@ def connection_from_dds(self, from_dd, to_dd):
         assert (to_discr_tag is not DISCR_TAG_MODAL
                     and from_discr_tag is not DISCR_TAG_MODAL)
 
-        if (not from_dd.is_volume()
+        if (isinstance(from_dd.domain_tag, BoundaryDomainTag)
                 and from_discr_tag == to_discr_tag
-                and to_dd.domain_tag is FACE_RESTR_ALL):
+                and isinstance(to_dd.domain_tag, BoundaryDomainTag)
+                and to_dd.domain_tag.tag is FACE_RESTR_ALL):
             faces_conn = self.connection_from_dds(
-                    DOFDesc("vol"),
-                    DOFDesc(from_dd.domain_tag))
+                    DOFDesc(
+                        VolumeDomainTag(from_dd.domain_tag.volume_tag),
+                        DISCR_TAG_BASE),
+                    from_dd.with_discr_tag(DISCR_TAG_BASE))
 
             from meshmode.discretization.connection import \
                     make_face_to_all_faces_embedding
@@ -448,7 +520,7 @@ def connection_from_dds(self, from_dd, to_dd):
 
             from meshmode.discretization.connection import \
                     ChainedDiscretizationConnection
-            intermediate_dd = DOFDesc(to_dd.domain_tag)
+            intermediate_dd = to_dd.with_discr_tag(DISCR_TAG_BASE)
             return ChainedDiscretizationConnection(
                     [
                         # first change domain
@@ -482,73 +554,79 @@ def connection_from_dds(self, from_dd, to_dd):
         # }}}
 
         if from_discr_tag is not DISCR_TAG_BASE:
-            raise ValueError("cannot interpolate *from* a "
-                    "(non-interpolatory) quadrature grid")
+            raise ValueError("cannot get a connection *from* a "
+                    f"(non-interpolatory) quadrature grid: '{from_dd}'")
 
         assert to_discr_tag is DISCR_TAG_BASE
 
-        if from_dd.is_volume():
-            if to_dd.domain_tag is FACE_RESTR_ALL:
-                return self._all_faces_volume_connection()
-            if to_dd.domain_tag is FACE_RESTR_INTERIOR:
-                return self._interior_faces_connection()
-            elif to_dd.is_boundary_or_partition_interface():
-                assert from_discr_tag is DISCR_TAG_BASE
-                return self._boundary_connection(to_dd.domain_tag.tag)
+        if isinstance(from_dd.domain_tag, VolumeDomainTag):
+            if isinstance(to_dd.domain_tag, BoundaryDomainTag):
+                if to_dd.domain_tag.volume_tag != from_dd.domain_tag.tag:
+                    raise ValueError("cannot get a connection from one volume "
+                            f"('{from_dd.domain_tag.tag}') "
+                            "to the boundary of another volume "
+                            f"('{to_dd.domain_tag.volume_tag}') ")
+                if to_dd.domain_tag.tag in [FACE_RESTR_ALL, FACE_RESTR_INTERIOR]:
+                    return self._faces_connection(to_dd.domain_tag)
+                elif isinstance(to_dd.domain_tag, BoundaryDomainTag):
+                    assert from_discr_tag is DISCR_TAG_BASE
+                    return self._boundary_connection(to_dd.domain_tag)
             elif to_dd.is_volume():
+                if to_dd.domain_tag != from_dd.domain_tag:
+                    raise ValueError("cannot get a connection between "
+                            "volumes of different tags: requested "
+                            f"'{from_dd.domain_tag}' -> '{to_dd.domain_tag}'")
+
                 from meshmode.discretization.connection import \
                         make_same_mesh_connection
-                to_discr = self._discr_tag_volume_discr(to_discr_tag)
-                from_discr = self._volume_discr
-                return make_same_mesh_connection(self._setup_actx, to_discr,
-                            from_discr)
+                return make_same_mesh_connection(
+                        self._setup_actx,
+                        self._volume_discr_from_dd(to_dd),
+                        self._volume_discr_from_dd(from_dd))
 
             else:
-                raise ValueError("cannot interpolate from volume to: " + str(to_dd))
+                raise ValueError(
+                        f"cannot get a connection from volume to: '{to_dd}'")
 
         else:
-            raise ValueError("cannot interpolate from: " + str(from_dd))
+            raise ValueError(f"cannot get a connection from: '{from_dd}'")
 
     # }}}
 
     # {{{ group_factory_for_discretization_tag
 
-    def group_factory_for_quadrature_tag(self, discretization_tag):
-        warn("`DiscretizationCollection.group_factory_for_quadrature_tag` "
-             "is deprecated and will go away in 2022. Use "
-             "`DiscretizationCollection.group_factory_for_discretization_tag` "
-             "instead.",
-             DeprecationWarning, stacklevel=2)
-
-        return self.group_factory_for_discretization_tag(discretization_tag)
-
     def group_factory_for_discretization_tag(self, discretization_tag):
-        """
-        OK to override in user code to control mode/node choice.
-        """
         if discretization_tag is None:
             discretization_tag = DISCR_TAG_BASE
 
-        return self.discr_tag_to_group_factory[discretization_tag]
+        return self._discr_tag_to_group_factory[discretization_tag]
 
     # }}}
 
+    # {{{ (internal) discretization getters
+
     @memoize_method
-    def _discr_tag_volume_discr(self, discretization_tag):
-        assert discretization_tag is not None
+    def _volume_discr_from_dd(self, dd: DOFDesc) -> Discretization:
+        assert isinstance(dd.domain_tag, VolumeDomainTag)
+
+        try:
+            base_volume_discr = self._volume_discrs[dd.domain_tag.tag]
+        except KeyError:
+            raise ValueError("a volume discretization with volume tag "
+                    f"'{dd.domain_tag.tag}' is not known")
 
         # Refuse to re-make the volume discretization
-        if discretization_tag is DISCR_TAG_BASE:
-            return self._volume_discr
+        if dd.discretization_tag is DISCR_TAG_BASE:
+            return base_volume_discr
 
         from meshmode.discretization import Discretization
         return Discretization(
-            self._setup_actx, self._volume_discr.mesh,
-            self.group_factory_for_discretization_tag(discretization_tag)
+            self._setup_actx, base_volume_discr.mesh,
+            self.group_factory_for_discretization_tag(dd.discretization_tag)
         )
 
     @memoize_method
-    def _modal_discr(self, domain_tag):
+    def _modal_discr(self, domain_tag) -> Discretization:
         from meshmode.discretization import Discretization
 
         discr_base = self.discr_from_dd(DOFDesc(domain_tag, DISCR_TAG_BASE))
@@ -557,10 +635,12 @@ def _modal_discr(self, domain_tag):
             self.group_factory_for_discretization_tag(DISCR_TAG_MODAL)
         )
 
+    # }}}
+
     # {{{ connection factories: modal<->nodal
 
     @memoize_method
-    def _modal_to_nodal_connection(self, to_dd):
+    def _modal_to_nodal_connection(self, to_dd: DOFDesc) -> DiscretizationConnection:
         """
         :arg to_dd: a :class:`grudge.dof_desc.DOFDesc`
             describing the dofs corresponding to the
@@ -575,7 +655,8 @@ def _modal_to_nodal_connection(self, to_dd):
         )
 
     @memoize_method
-    def _nodal_to_modal_connection(self, from_dd):
+    def _nodal_to_modal_connection(
+            self, from_dd: DOFDesc) -> DiscretizationConnection:
         """
         :arg from_dd: a :class:`grudge.dof_desc.DOFDesc`
             describing the dofs corresponding to the
@@ -594,25 +675,31 @@ def _nodal_to_modal_connection(self, from_dd):
     # {{{ connection factories: boundary
 
     @memoize_method
-    def _boundary_connection(self, boundary_tag):
+    def _boundary_connection(
+            self, domain_tag: BoundaryDomainTag) -> DiscretizationConnection:
         return make_face_restriction(
-            self._setup_actx,
-            self._volume_discr,
-            self.group_factory_for_discretization_tag(DISCR_TAG_BASE),
-            boundary_tag=boundary_tag
-        )
+                self._setup_actx,
+                self._volume_discr_from_dd(
+                    DOFDesc(VolumeDomainTag(domain_tag.volume_tag), DISCR_TAG_BASE)),
+                self.group_factory_for_discretization_tag(DISCR_TAG_BASE),
+                boundary_tag=domain_tag.tag)
 
     # }}}
 
-    # {{{ connection factories: interior faces
+    # {{{ connection factories: faces
 
     @memoize_method
-    def _interior_faces_connection(self):
+    def _faces_connection(
+            self, domain_tag: BoundaryDomainTag) -> DiscretizationConnection:
+        assert domain_tag.tag in [FACE_RESTR_INTERIOR, FACE_RESTR_ALL]
+
         return make_face_restriction(
             self._setup_actx,
-            self._volume_discr,
+            self._volume_discr_from_dd(
+                DOFDesc(
+                    VolumeDomainTag(domain_tag.volume_tag), DISCR_TAG_BASE)),
             self.group_factory_for_discretization_tag(DISCR_TAG_BASE),
-            FACE_RESTR_INTERIOR,
+            domain_tag.tag,
 
             # FIXME: This will need to change as soon as we support
             # pyramids or other elements with non-identical face
@@ -621,7 +708,8 @@ def _interior_faces_connection(self):
         )
 
     @memoize_method
-    def opposite_face_connection(self):
+    def opposite_face_connection(
+            self, domain_tag: BoundaryDomainTag) -> DiscretizationConnection:
         """Provides a mapping from the base volume discretization
         to the exterior boundary restriction on a neighboring element.
         This does not take into account parallel partitions.
@@ -629,93 +717,78 @@ def opposite_face_connection(self):
         from meshmode.discretization.connection import \
                 make_opposite_face_connection
 
+        assert domain_tag.tag is FACE_RESTR_INTERIOR
+
         return make_opposite_face_connection(
                 self._setup_actx,
-                self._interior_faces_connection())
+                self._faces_connection(domain_tag))
 
     # }}}
 
-    # {{{ connection factories: all-faces
-
-    @memoize_method
-    def _all_faces_volume_connection(self):
-        return make_face_restriction(
-            self._setup_actx,
-            self._volume_discr,
-            self.group_factory_for_discretization_tag(DISCR_TAG_BASE),
-            FACE_RESTR_ALL,
-
-            # FIXME: This will need to change as soon as we support
-            # pyramids or other elements with non-identical face
-            # types.
-            per_face_groups=False
-        )
-
-    # }}}
+    # {{{ properties
 
     @property
-    def dim(self):
+    def dim(self) -> int:
         """Return the topological dimension."""
-        return self._volume_discr.dim
+        return single_valued(discr.dim for discr in self._volume_discrs.values())
 
     @property
-    def ambient_dim(self):
+    def ambient_dim(self) -> int:
         """Return the dimension of the ambient space."""
-        return self._volume_discr.ambient_dim
+        return single_valued(
+                discr.ambient_dim for discr in self._volume_discrs.values())
 
     @property
-    def real_dtype(self):
+    def real_dtype(self) -> "np.dtype[Any]":
         """Return the data type used for real-valued arithmetic."""
-        return self._volume_discr.real_dtype
+        return single_valued(
+                discr.real_dtype for discr in self._volume_discrs.values())
 
     @property
-    def complex_dtype(self):
+    def complex_dtype(self) -> "np.dtype[Any]":
         """Return the data type used for complex-valued arithmetic."""
-        return self._volume_discr.complex_dtype
+        return single_valued(
+                discr.complex_dtype for discr in self._volume_discrs.values())
 
-    @property
-    def mesh(self):
-        """Return the :class:`meshmode.mesh.Mesh` over which the discretization
-        collection is built.
-        """
-        return self._volume_discr.mesh
+    # }}}
+
+    # {{{ array creators
 
-    def empty(self, array_context: ArrayContext, dtype=None):
+    def empty(self, array_context: ArrayContext, dtype=None,
+            *, dd: Optional[DOFDesc] = None) -> DOFArray:
         """Return an empty :class:`~meshmode.dof_array.DOFArray` defined at
-        the volume nodes: :class:`grudge.dof_desc.DD_VOLUME`.
+        the volume nodes: :class:`grudge.dof_desc.DD_VOLUME_ALL`.
 
         :arg array_context: an :class:`~arraycontext.context.ArrayContext`.
         :arg dtype: type special value 'c' will result in a
             vector of dtype :attr:`complex_dtype`. If
             *None* (the default), a real vector will be returned.
         """
-        return self._volume_discr.empty(array_context, dtype)
+        if dd is None:
+            dd = DD_VOLUME_ALL
+        return self.discr_from_dd(dd).empty(array_context, dtype)
 
-    def zeros(self, array_context: ArrayContext, dtype=None):
+    def zeros(self, array_context: ArrayContext, dtype=None,
+            *, dd: Optional[DOFDesc] = None) -> DOFArray:
         """Return a zero-initialized :class:`~meshmode.dof_array.DOFArray`
-        defined at the volume nodes, :class:`grudge.dof_desc.DD_VOLUME`.
+        defined at the volume nodes, :class:`grudge.dof_desc.DD_VOLUME_ALL`.
 
         :arg array_context: an :class:`~arraycontext.context.ArrayContext`.
         :arg dtype: type special value 'c' will result in a
             vector of dtype :attr:`complex_dtype`. If
             *None* (the default), a real vector will be returned.
         """
-        return self._volume_discr.zeros(array_context, dtype)
+        if dd is None:
+            dd = DD_VOLUME_ALL
+
+        return self.discr_from_dd(dd).zeros(array_context, dtype)
 
     def is_volume_where(self, where):
         return where is None or as_dofdesc(where).is_volume()
 
-    @property
-    def order(self):
-        warn("DiscretizationCollection.order is deprecated, "
-                "consider using the orders of element groups instead. "
-                "'order' will go away in 2021.",
-                DeprecationWarning, stacklevel=2)
-
-        from pytools import single_valued
-        return single_valued(egrp.order for egrp in self._volume_discr.groups)
+    # }}}
 
-    # {{{ Discretization-specific geometric properties
+    # {{{ discretization-specific geometric fields
 
     def nodes(self, dd=None):
         r"""Return the nodes of a discretization specified by *dd*.
@@ -725,7 +798,7 @@ def nodes(self, dd=None):
         :returns: an object array of frozen :class:`~meshmode.dof_array.DOFArray`\ s
         """
         if dd is None:
-            dd = DD_VOLUME
+            dd = DD_VOLUME_ALL
         return self.discr_from_dd(dd).nodes()
 
     def normal(self, dd):
@@ -741,14 +814,106 @@ def normal(self, dd):
     # }}}
 
 
-class DGDiscretizationWithBoundaries(DiscretizationCollection):
-    def __init__(self, *args, **kwargs):
-        warn("DGDiscretizationWithBoundaries is deprecated and will go away "
-                "in 2022. Use DiscretizationCollection instead.",
-                DeprecationWarning, stacklevel=2)
+# {{{ distributed/multi-volume setup
+
+def _set_up_inter_part_connections(
+        array_context: ArrayContext,
+        mpi_communicator: Optional["mpi4py.MPI.Intracomm"],
+        volume_discrs: Mapping[VolumeTag, Discretization],
+        base_group_factory: ElementGroupFactory,
+        ) -> Mapping[
+                Tuple[PartID, PartID],
+                DiscretizationConnection]:
+
+    from meshmode.distributed import (get_connected_parts,
+            make_remote_group_infos, InterRankBoundaryInfo,
+            MPIBoundaryCommSetupHelper)
+
+    rank = mpi_communicator.Get_rank() if mpi_communicator is not None else None
+
+    # Save boundary restrictions as they're created to avoid potentially creating
+    # them twice in the loop below
+    cached_part_bdry_restrictions: Mapping[
+        Tuple[PartID, PartID],
+        DiscretizationConnection] = {}
+
+    def get_part_bdry_restriction(self_part_id, other_part_id):
+        cached_result = cached_part_bdry_restrictions.get(
+            (self_part_id, other_part_id), None)
+        if cached_result is not None:
+            return cached_result
+        return cached_part_bdry_restrictions.setdefault(
+            (self_part_id, other_part_id),
+            make_face_restriction(
+                array_context, volume_discrs[self_part_id.volume_tag],
+                base_group_factory,
+                boundary_tag=BTAG_PARTITION(other_part_id)))
+
+    inter_part_conns: Mapping[
+            Tuple[PartID, PartID],
+            DiscretizationConnection] = {}
+
+    irbis = []
+
+    for vtag, volume_discr in volume_discrs.items():
+        part_id = PartID(vtag, rank)
+        connected_part_ids = get_connected_parts(volume_discr.mesh)
+        for connected_part_id in connected_part_ids:
+            bdry_restr = get_part_bdry_restriction(
+                self_part_id=part_id, other_part_id=connected_part_id)
+
+            if connected_part_id.rank == rank:
+                # {{{ rank-local interface between multiple volumes
+
+                connected_bdry_restr = get_part_bdry_restriction(
+                    self_part_id=connected_part_id, other_part_id=part_id)
+
+                from meshmode.discretization.connection import \
+                        make_partition_connection
+                inter_part_conns[connected_part_id, part_id] = \
+                    make_partition_connection(
+                        array_context,
+                        local_bdry_conn=bdry_restr,
+                        remote_bdry_discr=connected_bdry_restr.to_discr,
+                        remote_group_infos=make_remote_group_infos(
+                            array_context, part_id, connected_bdry_restr))
+
+                # }}}
+            else:
+                # {{{ cross-rank interface
+
+                if mpi_communicator is None:
+                    raise RuntimeError("must supply an MPI communicator "
+                        "when using a distributed mesh")
+
+                irbis.append(
+                        InterRankBoundaryInfo(
+                            local_part_id=part_id,
+                            remote_part_id=connected_part_id,
+                            remote_rank=connected_part_id.rank,
+                            local_boundary_connection=bdry_restr))
+
+                # }}}
+
+    if irbis:
+        assert mpi_communicator is not None
+
+        with MPIBoundaryCommSetupHelper(mpi_communicator, array_context,
+                irbis, base_group_factory) as bdry_setup_helper:
+            while True:
+                conns = bdry_setup_helper.complete_some()
+                if not conns:
+                    # We're done.
+                    break
+
+                inter_part_conns.update(conns)
 
-        super().__init__(*args, **kwargs)
+    return inter_part_conns
 
+# }}}
+
+
+# {{{ modal group factory
 
 def _generate_modal_group_factory(nodal_group_factory):
     from meshmode.discretization.poly_element import (
@@ -769,4 +934,101 @@ def _generate_modal_group_factory(nodal_group_factory):
             f"Unknown mesh element group: {mesh_group_cls}"
         )
 
+# }}}
+
+
+# {{{ make_discretization_collection
+
+MeshOrDiscr = Union[Mesh, Discretization]
+
+
+def make_discretization_collection(
+        array_context: ArrayContext,
+        volumes: Union[
+            MeshOrDiscr,
+            Mapping[VolumeTag, MeshOrDiscr]],
+        order: Optional[int] = None,
+        discr_tag_to_group_factory: Optional[
+            Mapping[DiscretizationTag, ElementGroupFactory]] = None,
+        _result_type: type = DiscretizationCollection
+        ) -> DiscretizationCollection:
+    """
+    :arg discr_tag_to_group_factory: A mapping from discretization tags
+        (typically one of: :class:`~grudge.dof_desc.DISCR_TAG_BASE`,
+        :class:`~grudge.dof_desc.DISCR_TAG_MODAL`, or
+        :class:`~grudge.dof_desc.DISCR_TAG_QUAD`) to a
+        :class:`~meshmode.discretization.ElementGroupFactory`
+        indicating with which type of discretization the operations are
+        to be carried out, or *None* to indicate that operations with this
+        discretization tag should be carried out with the standard volume
+        discretization.
+
+    .. note::
+
+        If passing a :class:`~meshmode.discretization.Discretization` in
+        *volumes*, it must be nodal and unisolvent, consistent with
+        :class:`~grudge.dof_desc.DISCR_TAG_BASE`.
+
+    .. note::
+
+        To use the resulting :class:`DiscretizationCollection` in a
+        distributed-memory manner, the *array_context* passed in
+        must be one of the distributed-memory array contexts
+        from :mod:`grudge.array_context`. Unlike the (now-deprecated,
+        for direct use) constructor of :class:`DiscretizationCollection`,
+        this function no longer accepts a separate MPI communicator.
+
+    .. note::
+
+        If the resulting :class:`DiscretizationCollection` is distributed
+        across multiple ranks, then this is an MPI-collective operation,
+        i.e. all ranks in the communicator must enter this function at the same
+        time.
+    """
+
+    if isinstance(volumes, (Mesh, Discretization)):
+        volumes = {VTAG_ALL: volumes}
+
+    from pytools import single_valued, is_single_valued
+
+    assert len(volumes) > 0
+    assert is_single_valued(mesh_or_discr.ambient_dim
+            for mesh_or_discr in volumes.values())
+
+    discr_tag_to_group_factory = _normalize_discr_tag_to_group_factory(
+            dim=single_valued(
+                mesh_or_discr.dim for mesh_or_discr in volumes.values()),
+            discr_tag_to_group_factory=discr_tag_to_group_factory,
+            order=order)
+
+    del order
+
+    mpi_communicator = getattr(array_context, "mpi_communicator", None)
+
+    if any(
+            isinstance(mesh_or_discr, Discretization)
+            for mesh_or_discr in volumes.values()):
+        raise NotImplementedError("Doesn't work at the moment")
+
+    volume_discrs = {
+        vtag: Discretization(
+            array_context,
+            _normalize_mesh_part_ids(
+                mesh, volumes.keys(), mpi_communicator=mpi_communicator),
+            discr_tag_to_group_factory[DISCR_TAG_BASE])
+        for vtag, mesh in volumes.items()}
+
+    return _result_type(
+            array_context=array_context,
+            volume_discrs=volume_discrs,
+            discr_tag_to_group_factory=discr_tag_to_group_factory,
+            inter_part_connections=_set_up_inter_part_connections(
+                array_context=array_context,
+                mpi_communicator=mpi_communicator,
+                volume_discrs=volume_discrs,
+                base_group_factory=discr_tag_to_group_factory[DISCR_TAG_BASE]))
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/grudge/dof_desc.py b/grudge/dof_desc.py
index 267e4f56e..cf285a30e 100644
--- a/grudge/dof_desc.py
+++ b/grudge/dof_desc.py
@@ -1,4 +1,55 @@
-"""Degree of freedom (DOF) descriptions"""
+"""
+Volume tags
+-----------
+
+.. autoclass:: VolumeTag
+.. autoclass:: VTAG_ALL
+
+:mod:`grudge`-specific boundary tags
+------------------------------------
+
+Domain tags
+-----------
+
+A domain tag identifies a geometric part (or whole) of the domain described
+by a :class:`grudge.DiscretizationCollection`. This can be a volume or a boundary.
+
+.. autoclass:: DTAG_SCALAR
+.. autoclass:: DTAG_VOLUME_ALL
+.. autoclass:: VolumeDomainTag
+.. autoclass:: BoundaryDomainTag
+
+Discretization tags
+-------------------
+
+A discretization tag serves as a symbolic identifier of the manner in which
+meaning is assigned to degrees of freedom.
+
+.. autoclass:: DISCR_TAG_BASE
+.. autoclass:: DISCR_TAG_QUAD
+.. autoclass:: DISCR_TAG_MODAL
+
+DOF Descriptor
+--------------
+
+.. autoclass:: DOFDesc
+.. autofunction:: as_dofdesc
+
+Shortcuts
+---------
+
+.. data:: DD_SCALAR
+.. data:: DD_VOLUME_ALL
+.. data:: DD_VOLUME_ALL_MODAL
+
+Internal things that are visible due to type annotations
+--------------------------------------------------------
+
+.. class:: _DiscretizationTag
+.. class:: ConvertibleToDOFDesc
+
+    Anything that is convertible to a :class:`DOFDesc` via :func:`as_dofdesc`.
+"""
 
 __copyright__ = """
 Copyright (C) 2008 Andreas Kloeckner
@@ -25,31 +76,18 @@
 THE SOFTWARE.
 """
 
-from meshmode.discretization.connection import \
-    FACE_RESTR_INTERIOR, FACE_RESTR_ALL
-from meshmode.mesh import \
-    BTAG_PARTITION, BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE
-from warnings import warn
 import sys
+from warnings import warn
+from typing import Hashable, Union, Type, Optional, Any, Tuple
+from dataclasses import dataclass, replace
 
+from meshmode.discretization.connection import (
+    FACE_RESTR_INTERIOR, FACE_RESTR_ALL)
+from meshmode.mesh import (
+    BTAG_PARTITION, BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE, BoundaryTag)
 
-__doc__ = """
-.. autoclass:: DTAG_SCALAR
-.. autoclass:: DTAG_VOLUME_ALL
-.. autoclass:: DTAG_BOUNDARY
-
-.. autoclass:: DISCR_TAG_BASE
-.. autoclass:: DISCR_TAG_QUAD
-.. autoclass:: DISCR_TAG_MODAL
-
-.. autoclass:: DOFDesc
-.. autofunction:: as_dofdesc
-
-.. data:: DD_SCALAR
-.. data:: DD_VOLUME
-.. data:: DD_VOLUME_MODAL
-"""
 
+# {{{ _to_identifier
 
 def _to_identifier(name: str) -> str:
     if not name.isidentifier():
@@ -57,71 +95,91 @@ def _to_identifier(name: str) -> str:
     else:
         return name
 
+# }}}
+
+
+# {{{ volume tags
 
-# {{{ DOF description
+class VTAG_ALL:  # noqa: N801
+    pass
 
-class DTAG_SCALAR:  # noqa: N801
+
+VolumeTag = Hashable
+
+# }}}
+
+
+# {{{ domain tag
+
+@dataclass(frozen=True, eq=True)
+class ScalarDomainTag:  # noqa: N801
     """A domain tag denoting scalar values."""
 
 
-class DTAG_VOLUME_ALL:  # noqa: N801
-    """
-    A domain tag denoting values defined
-    in all cell volumes.
+DTAG_SCALAR = ScalarDomainTag()
+
+
+@dataclass(frozen=True, eq=True, init=True)
+class VolumeDomainTag:
+    """A domain tag referring to a volume identified by the
+    volume tag :attr:`tag`. These volume identifiers are only used
+    when the :class:`~grudge.discretization.DiscretizationCollection` contains
+    more than one volume.
+
+    .. attribute:: tag
+
+    .. automethod:: __init__
     """
+    tag: VolumeTag
 
 
-class DTAG_BOUNDARY:  # noqa: N801
-    """A domain tag describing the values on element
-    boundaries which are adjacent to elements
-    of another :class:`~meshmode.mesh.Mesh`.
+DTAG_VOLUME_ALL = VolumeDomainTag(VTAG_ALL)
+
+
+@dataclass(frozen=True, eq=True, init=True)
+class BoundaryDomainTag:
+    """A domain tag referring to a boundary identified by the
+    boundary tag :attr:`tag`.
 
     .. attribute:: tag
+    .. attribute:: volume_tag
 
     .. automethod:: __init__
-    .. automethod:: __eq__
-    .. automethod:: __ne__
-    .. automethod:: __hash__
     """
+    tag: BoundaryTag
+    volume_tag: VolumeTag = VTAG_ALL
 
-    def __init__(self, tag):
-        """
-        :arg tag: One of the following:
-            :class:`~meshmode.mesh.BTAG_ALL`,
-            :class:`~meshmode.mesh.BTAG_NONE`,
-            :class:`~meshmode.mesh.BTAG_REALLY_ALL`,
-            :class:`~meshmode.mesh.BTAG_PARTITION`.
-        """
-        self.tag = tag
 
-    def __eq__(self, other):
-        return isinstance(other, DTAG_BOUNDARY) and self.tag == other.tag
+DomainTag = Union[ScalarDomainTag, VolumeDomainTag, BoundaryDomainTag]
 
-    def __ne__(self, other):
-        return not self.__eq__(other)
+# }}}
+
+
+# {{{ discretization tag
+
+class _DiscretizationTag:  # noqa: N801
+    pass
 
-    def __hash__(self):
-        return hash(type(self)) ^ hash(self.tag)
 
-    def __repr__(self):
-        return "<{}({})>".format(type(self).__name__, repr(self.tag))
+DiscretizationTag = Type[_DiscretizationTag]
 
 
-class DISCR_TAG_BASE:  # noqa: N801
+class DISCR_TAG_BASE(_DiscretizationTag):  # noqa: N801
     """A discretization tag indicating the use of a
-    basic discretization grid. This tag is used
+    nodal and unisolvent discretization. This tag is used
     to distinguish the base discretization from quadrature
     (e.g. overintegration) or modal (:class:`DISCR_TAG_MODAL`)
     discretizations.
     """
 
 
-class DISCR_TAG_QUAD:  # noqa: N801
-    """A discretization tag indicating the use of a
-    quadrature discretization grid. This tag is used
-    to distinguish the quadrature discretization
-    (e.g. overintegration) from modal (:class:`DISCR_TAG_MODAL`)
-    or base (:class:`DISCR_TAG_BASE`) discretizations.
+class DISCR_TAG_QUAD(_DiscretizationTag):  # noqa: N801
+    """A discretization tag indicating the use of a quadrature discretization
+    grid, which typically affords higher quadrature accuracy (e.g. for
+    nonlinear terms) at the expense of unisolvency. This tag is used to
+    distinguish the quadrature discretization (e.g. overintegration) from modal
+    (:class:`DISCR_TAG_MODAL`) or base (:class:`DISCR_TAG_BASE`)
+    discretizations.
 
     For working with multiple quadrature grids, it is
     recommended to create appropriate subclasses of
@@ -135,20 +193,22 @@ class CustomQuadTag(DISCR_TAG_QUAD):
             "A custom quadrature discretization tag."
 
         dd = DOFDesc(DTAG_VOLUME_ALL, CustomQuadTag)
-
     """
 
 
-class DISCR_TAG_MODAL:  # noqa: N801
-    """A discretization tag indicating the use of a
-    basic discretization grid with modal degrees of
-    freedom. This tag is used to distinguish the
-    modal discretization from the base (nodal)
-    discretization (e.g. :class:`DISCR_TAG_BASE`) or
+class DISCR_TAG_MODAL(_DiscretizationTag):  # noqa: N801
+    """A discretization tag indicating the use of unisolvent modal degrees of
+    freedom. This tag is used to distinguish the modal discretization from the
+    base (nodal) discretization (e.g. :class:`DISCR_TAG_BASE`) or
     discretizations on quadrature grids (:class:`DISCR_TAG_QUAD`).
     """
 
+# }}}
+
+
+# {{{ DOF descriptor
 
+@dataclass(frozen=True, eq=True)
 class DOFDesc:
     """Describes the meaning of degrees of freedom.
 
@@ -165,8 +225,9 @@ class DOFDesc:
 
     .. automethod:: uses_quadrature
 
+    .. automethod:: with_domain_tag
     .. automethod:: with_discr_tag
-    .. automethod:: with_dtag
+    .. automethod:: trace
 
     .. automethod:: __eq__
     .. automethod:: __ne__
@@ -174,159 +235,87 @@ class DOFDesc:
     .. automethod:: as_identifier
     """
 
-    def __init__(self, domain_tag, discretization_tag=None,
-                 # FIXME: `quadrature_tag` is deprecated
-                 quadrature_tag=None):
-        """
-        :arg domain_tag: One of the following:
-            :class:`DTAG_SCALAR` (or the string ``"scalar"``),
-            :class:`DTAG_VOLUME_ALL` (or the string ``"vol"``)
-            for the default volume discretization,
-            :data:`~meshmode.discretization.connection.FACE_RESTR_ALL`
-            (or the string ``"all_faces"``), or
-            :data:`~meshmode.discretization.connection.FACE_RESTR_INTERIOR`
-            (or the string ``"int_faces"``), or one of
-            :class:`~meshmode.mesh.BTAG_ALL`,
-            :class:`~meshmode.mesh.BTAG_NONE`,
-            :class:`~meshmode.mesh.BTAG_REALLY_ALL`,
-            :class:`~meshmode.mesh.BTAG_PARTITION`,
-            or *None* to indicate that the geometry is not yet known.
-
-        :arg discretization_tag:
-            *None* or :class:`DISCR_TAG_BASE` to indicate the use of the basic
-            discretization grid, :class:`DISCR_TAG_MODAL` to indicate a
-            modal discretization, or :class:`DISCR_TAG_QUAD` to indicate
-            the use of a quadrature grid.
-        """
+    domain_tag: DomainTag
+    discretization_tag: DiscretizationTag
 
-        if domain_tag is None:
-            pass
-        elif domain_tag in [DTAG_SCALAR, "scalar"]:
-            domain_tag = DTAG_SCALAR
-        elif domain_tag in [DTAG_VOLUME_ALL, "vol"]:
-            domain_tag = DTAG_VOLUME_ALL
-        elif domain_tag in [FACE_RESTR_ALL, "all_faces"]:
-            domain_tag = FACE_RESTR_ALL
-        elif domain_tag in [FACE_RESTR_INTERIOR, "int_faces"]:
-            domain_tag = FACE_RESTR_INTERIOR
-        elif isinstance(domain_tag, BTAG_PARTITION):
-            domain_tag = DTAG_BOUNDARY(domain_tag)
-        elif domain_tag in [BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE]:
-            domain_tag = DTAG_BOUNDARY(domain_tag)
-        elif isinstance(domain_tag, DTAG_BOUNDARY):
-            pass
-        else:
-            raise ValueError("domain tag not understood: %s" % domain_tag)
+    def __init__(self, domain_tag: Any,
+            discretization_tag: Optional[type[DiscretizationTag]] = None):
 
-        if (quadrature_tag is not None and discretization_tag is not None):
-            raise ValueError(
-                "Both `quadrature_tag` and `discretization_tag` are specified. "
-                "Use `discretization_tag` instead."
-            )
-
-        # FIXME: `quadrature_tag` is deprecated
-        if (quadrature_tag is not None and discretization_tag is None):
-            warn("`quadrature_tag` is a deprecated kwarg and will be dropped "
-                 "in version 2022.x. Use `discretization_tag` instead.",
-                 DeprecationWarning, stacklevel=2)
-            discretization_tag = quadrature_tag
+        if (
+                not (isinstance(domain_tag,
+                    (ScalarDomainTag, BoundaryDomainTag, VolumeDomainTag)))
+                or discretization_tag is None
+                or (
+                    not isinstance(discretization_tag, type)
+                    or not issubclass(discretization_tag, _DiscretizationTag))):
+            warn("Sloppy construction of DOFDesc is deprecated. "
+                    "This will stop working in 2023. "
+                    "Call as_dofdesc instead, with the same arguments. ",
+                    DeprecationWarning, stacklevel=2)
 
-        if domain_tag is DTAG_SCALAR and discretization_tag is not None:
-            raise ValueError("cannot have nontrivial discretization tag on scalar")
+            domain_tag, discretization_tag = _normalize_domain_and_discr_tag(
+                    domain_tag, discretization_tag)
 
-        if discretization_tag is None:
-            discretization_tag = DISCR_TAG_BASE
-
-        # FIXME: String tags are deprecated
-        if isinstance(discretization_tag, str):
-            warn("Support for string values of `discretization_tag` will "
-                 "be dropped in version 2022.x. Use one of the `DISCR_TAG_` "
-                 "tags instead.",
-                 DeprecationWarning, stacklevel=2)
-
-        self.domain_tag = domain_tag
-        self.discretization_tag = discretization_tag
-
-    @property
-    def quadrature_tag(self):
-        warn("`DOFDesc.quadrature_tag` is deprecated and will be dropped "
-             "in version 2022.x. Use `DOFDesc.discretization_tag` instead.",
-             DeprecationWarning, stacklevel=2)
-        return self.discretization_tag
+        object.__setattr__(self, "domain_tag", domain_tag)
+        object.__setattr__(self, "discretization_tag", discretization_tag)
 
-    def is_scalar(self):
-        return self.domain_tag is DTAG_SCALAR
+    def is_scalar(self) -> bool:
+        return isinstance(self.domain_tag, ScalarDomainTag)
 
-    def is_discretized(self):
+    def is_discretized(self) -> bool:
         return not self.is_scalar()
 
-    def is_volume(self):
-        return self.domain_tag is DTAG_VOLUME_ALL
+    def is_volume(self) -> bool:
+        return isinstance(self.domain_tag, VolumeDomainTag)
 
-    def is_boundary_or_partition_interface(self):
-        return isinstance(self.domain_tag, DTAG_BOUNDARY)
-
-    def is_trace(self):
-        return (self.is_boundary_or_partition_interface()
-                or self.domain_tag in [
+    def is_boundary_or_partition_interface(self) -> bool:
+        return (isinstance(self.domain_tag, BoundaryDomainTag)
+                and self.domain_tag.tag not in [
                     FACE_RESTR_ALL,
                     FACE_RESTR_INTERIOR])
 
-    def uses_quadrature(self):
+    def is_trace(self) -> bool:
+        return isinstance(self.domain_tag, BoundaryDomainTag)
+
+    def uses_quadrature(self) -> bool:
         # FIXME: String tags are deprecated
-        # Check for string first, otherwise
-        # `issubclass` will raise an exception whenever
-        # its first argument is not a class.
-        # This can go away once support for strings is dropped
-        # completely.
         if isinstance(self.discretization_tag, str):
             # All strings are interpreted as quadrature-related tags
             return True
-        elif issubclass(self.discretization_tag, DISCR_TAG_QUAD):
-            return True
-        elif issubclass(self.discretization_tag,
-                        (DISCR_TAG_BASE, DISCR_TAG_MODAL)):
-            return False
-        else:
-            raise ValueError(
-                f"Unsure how to interpret tag: {self.discretization_tag}"
-            )
-
-    def with_qtag(self, discr_tag):
-        warn("`DOFDesc.with_qtag` is deprecated and will be dropped "
-             "in version 2022.x. Use `DOFDesc.with_discr_tag` instead.",
-             DeprecationWarning, stacklevel=2)
-        return self.with_discr_tag(discr_tag)
-
-    def with_discr_tag(self, discr_tag):
-        return type(self)(domain_tag=self.domain_tag,
-                          discretization_tag=discr_tag)
-
-    def with_dtag(self, dtag):
-        return type(self)(domain_tag=dtag,
-                          discretization_tag=self.discretization_tag)
-
-    def __eq__(self, other):
-        return (type(self) == type(other)
-                and self.domain_tag == other.domain_tag
-                and self.discretization_tag == other.discretization_tag)
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
-    def __hash__(self):
-        return hash((type(self), self.domain_tag, self.discretization_tag))
-
-    def __repr__(self):
-        def fmt(s):
-            if isinstance(s, type):
-                return s.__name__
-            else:
-                return repr(s)
+        elif isinstance(self.discretization_tag, type):
+            if issubclass(self.discretization_tag, DISCR_TAG_QUAD):
+                return True
+            elif issubclass(self.discretization_tag,
+                            (DISCR_TAG_BASE, DISCR_TAG_MODAL)):
+                return False
+
+        raise ValueError(
+            f"Invalid discretization tag: {self.discretization_tag}")
+
+    def with_dtag(self, dtag) -> "DOFDesc":
+        from warnings import warn
+        warn("'with_dtag' is deprecated. Use 'with_domain_tag' instead. "
+                "This will stop working in 2023",
+                DeprecationWarning, stacklevel=2)
+        return replace(self, domain_tag=dtag)
+
+    def with_domain_tag(self, dtag) -> "DOFDesc":
+        return replace(self, domain_tag=dtag)
+
+    def trace(self, btag: BoundaryTag) -> "DOFDesc":
+        """Return a :class:`DOFDesc` for the restriction of the volume
+        descriptor *self* to the boundary named by *btag*.
+
+        An error is raised if this method is called on a non-volume instance of
+        :class:`DOFDesc`.
+        """
+        if not isinstance(self.domain_tag, VolumeDomainTag):
+            raise ValueError(f"must originate on volume, got '{self.domain_tag}'")
+        return replace(self,
+                domain_tag=BoundaryDomainTag(btag, volume_tag=self.domain_tag.tag))
 
-        return "DOFDesc({}, {})".format(
-                fmt(self.domain_tag),
-                fmt(self.discretization_tag))
+    def with_discr_tag(self, discr_tag) -> "DOFDesc":
+        return replace(self, discretization_tag=discr_tag)
 
     def as_identifier(self) -> str:
         """Returns a descriptive string for this :class:`DOFDesc` that is usable
@@ -341,7 +330,16 @@ def as_identifier(self) -> str:
             dom_id = "f_all"
         elif self.domain_tag is FACE_RESTR_INTERIOR:
             dom_id = "f_int"
-        elif isinstance(self.domain_tag, DTAG_BOUNDARY):
+        elif isinstance(self.domain_tag, VolumeDomainTag):
+            vtag = self.domain_tag.tag
+            if isinstance(vtag, type):
+                vtag = vtag.__name__.replace("VTAG_", "").lower()
+            elif isinstance(vtag, str):
+                vtag = _to_identifier(vtag)
+            else:
+                vtag = _to_identifier(str(vtag))
+            dom_id = f"v_{vtag}"
+        elif isinstance(self.domain_tag, BoundaryDomainTag):
             btag = self.domain_tag.tag
             if isinstance(btag, type):
                 btag = btag.__name__.replace("BTAG_", "").lower()
@@ -369,31 +367,101 @@ def as_identifier(self) -> str:
         return f"{dom_id}{discr_id}"
 
 
-DD_SCALAR = DOFDesc(DTAG_SCALAR, None)
+DD_SCALAR = DOFDesc(DTAG_SCALAR, DISCR_TAG_BASE)
+DD_VOLUME_ALL = DOFDesc(DTAG_VOLUME_ALL, DISCR_TAG_BASE)
+DD_VOLUME_ALL_MODAL = DOFDesc(DTAG_VOLUME_ALL, DISCR_TAG_MODAL)
+
+
+def _normalize_domain_and_discr_tag(
+        domain: Any,
+        discretization_tag: Optional[DiscretizationTag] = None,
+        *, _contextual_volume_tag: Optional[VolumeTag] = None
+        ) -> Tuple[DomainTag, DiscretizationTag]:
+
+    if _contextual_volume_tag is None:
+        _contextual_volume_tag = VTAG_ALL
+
+    if domain == "scalar":
+        domain = DTAG_SCALAR
+    elif isinstance(domain, (ScalarDomainTag, BoundaryDomainTag, VolumeDomainTag)):
+        pass
+    elif domain in [VTAG_ALL, "vol"]:
+        domain = DTAG_VOLUME_ALL
+    elif domain in [FACE_RESTR_ALL, "all_faces"]:
+        domain = BoundaryDomainTag(FACE_RESTR_ALL, _contextual_volume_tag)
+    elif domain in [FACE_RESTR_INTERIOR, "int_faces"]:
+        domain = BoundaryDomainTag(FACE_RESTR_INTERIOR, _contextual_volume_tag)
+    elif isinstance(domain, BTAG_PARTITION):
+        domain = BoundaryDomainTag(domain, _contextual_volume_tag)
+    elif domain in [BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE]:
+        domain = BoundaryDomainTag(domain, _contextual_volume_tag)
+    else:
+        raise ValueError("domain tag not understood: %s" % domain)
+
+    if domain is DTAG_SCALAR and discretization_tag is not None:
+        raise ValueError("cannot have nontrivial discretization tag on scalar")
+
+    if discretization_tag is None:
+        discretization_tag = DISCR_TAG_BASE
 
-DD_VOLUME = DOFDesc(DTAG_VOLUME_ALL, None)
+    return domain, discretization_tag
+
+
+ConvertibleToDOFDesc = Any
+
+
+def as_dofdesc(
+        domain: "ConvertibleToDOFDesc",
+        discretization_tag: Optional[DiscretizationTag] = None,
+        *, _contextual_volume_tag: Optional[VolumeTag] = None) -> DOFDesc:
+    """
+    :arg domain: One of the following:
+        :class:`DTAG_SCALAR` (or the string ``"scalar"``),
+        :class:`DTAG_VOLUME_ALL` (or the string ``"vol"``)
+        for the default volume discretization,
+        :data:`~meshmode.discretization.connection.FACE_RESTR_ALL`
+        (or the string ``"all_faces"``), or
+        :data:`~meshmode.discretization.connection.FACE_RESTR_INTERIOR`
+        (or the string ``"int_faces"``), or one of
+        :class:`~meshmode.mesh.BTAG_ALL`,
+        :class:`~meshmode.mesh.BTAG_NONE`,
+        :class:`~meshmode.mesh.BTAG_REALLY_ALL`,
+        :class:`~meshmode.mesh.BTAG_PARTITION`,
+        or an already-constructed domain tag instance, which is used as-is.
+
+    :arg discretization_tag:
+        *None* or :class:`DISCR_TAG_BASE` to indicate the use of the basic
+        discretization grid, :class:`DISCR_TAG_MODAL` to indicate a
+        modal discretization, or :class:`DISCR_TAG_QUAD` to indicate
+        the use of a quadrature grid.
+    """
 
-DD_VOLUME_MODAL = DOFDesc(DTAG_VOLUME_ALL, DISCR_TAG_MODAL)
+    if isinstance(domain, DOFDesc):
+        return domain
 
+    domain, discretization_tag = _normalize_domain_and_discr_tag(
+            domain, discretization_tag,
+            _contextual_volume_tag=_contextual_volume_tag)
 
-def as_dofdesc(dd):
-    if isinstance(dd, DOFDesc):
-        return dd
-    return DOFDesc(dd, discretization_tag=None)
+    return DOFDesc(domain, discretization_tag)
 
 # }}}
 
 
-# {{{ Deprecated tags
+# {{{ deprecations
 
-_deprecated_name_to_new_name = {"QTAG_NONE": "DISCR_TAG_BASE",
-                                "QTAG_MODAL": "DISCR_TAG_MODAL"}
+_deprecated_name_to_new_name = {
+        "DTAG_VOLUME": "VolumeDomainTag",
+        "DTAG_BOUNDARY": "BoundaryDomainTag",
+        "DD_VOLUME": "DD_VOLUME_ALL",
+        "DD_VOLUME_MODAL": "DD_VOLUME_ALL_MODAL"
+        }
 
 
 def __getattr__(name):
     if name in _deprecated_name_to_new_name:
         warn(f"'{name}' is deprecated and will be dropped "
-             f"in version 2022.x. Use '{_deprecated_name_to_new_name[name]}' "
+             f"in version 2023.x. Use '{_deprecated_name_to_new_name[name]}' "
              "instead.",
              DeprecationWarning, stacklevel=2)
         return globals()[_deprecated_name_to_new_name[name]]
diff --git a/grudge/dt_utils.py b/grudge/dt_utils.py
index 73f307b38..c45701e01 100644
--- a/grudge/dt_utils.py
+++ b/grudge/dt_utils.py
@@ -43,7 +43,9 @@
 """
 
 
+from typing import Optional, Sequence
 import numpy as np
+import loopy as lp
 
 from arraycontext import ArrayContext, Scalar, tag_axes
 from arraycontext.metadata import NameHint
@@ -52,8 +54,10 @@
                                          DiscretizationFaceAxisTag,
                                          DiscretizationElementAxisTag)
 
-from grudge.dof_desc import DD_VOLUME, DOFDesc, as_dofdesc
+from grudge.dof_desc import (
+        DD_VOLUME_ALL, DOFDesc, as_dofdesc, BoundaryDomainTag, FACE_RESTR_ALL)
 from grudge.discretization import DiscretizationCollection
+from grudge.grudge_tags import KernelDataTag, ParameterValue, IsFaceDOFArray, IsDOFArray
 
 import grudge.op as op
 
@@ -63,7 +67,8 @@
 
 
 def characteristic_lengthscales(
-        actx: ArrayContext, dcoll: DiscretizationCollection) -> DOFArray:
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None) -> DOFArray:
     r"""Computes the characteristic length scale :math:`h_{\text{loc}}` at
     each node. The characteristic length scale is mainly useful for estimating
     the stable time step size. E.g. for a hyperbolic system, an estimate of the
@@ -79,7 +84,7 @@ def characteristic_lengthscales(
     node distance on the reference cell (see :func:`dt_non_geometric_factors`),
     and :math:`r_D` is the inradius of the cell (see :func:`dt_geometric_factors`).
 
-    :returns: a frozen :class:`~meshmode.dof_array.DOFArray` containing a
+    :returns: a :class:`~meshmode.dof_array.DOFArray` containing a
         characteristic lengthscale for each element, at each nodal location.
 
     .. note::
@@ -91,7 +96,7 @@ def characteristic_lengthscales(
         methods has been used as a guide. Any concrete time integrator will
         likely require scaling of the values returned by this routine.
     """
-    @memoize_in(dcoll, (characteristic_lengthscales,
+    @memoize_in(dcoll, (characteristic_lengthscales, dd,
                         "compute_characteristic_lengthscales"))
     def _compute_characteristic_lengthscales():
         return actx.freeze(
@@ -103,15 +108,16 @@ def _compute_characteristic_lengthscales():
                             # corresponding group non-geometric factor
                             cng * geo_facts
                             for cng, geo_facts in zip(
-                                dt_non_geometric_factors(dcoll),
-                                actx.thaw(dt_geometric_factors(dcoll)))))))
+                                dt_non_geometric_factors(dcoll, dd),
+                                actx.thaw(dt_geometric_factors(dcoll, dd)))))))
 
     return actx.thaw(_compute_characteristic_lengthscales())
 
 
 @memoize_on_first_arg
 def dt_non_geometric_factors(
-        dcoll: DiscretizationCollection, dd=None) -> list:
+        dcoll: DiscretizationCollection, dd: Optional[DOFDesc] = None
+        ) -> Sequence[float]:
     r"""Computes the non-geometric scale factors following [Hesthaven_2008]_,
     section 6.4, for each element group in the *dd* discretization:
 
@@ -128,7 +134,7 @@ def dt_non_geometric_factors(
         node distance on the reference element for each group.
     """
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     discr = dcoll.discr_from_dd(dd)
     min_delta_rs = []
@@ -160,7 +166,8 @@ def dt_non_geometric_factors(
 
 @memoize_on_first_arg
 def h_max_from_volume(
-        dcoll: DiscretizationCollection, dim=None, dd=None) -> Scalar:
+        dcoll: DiscretizationCollection, dim=None,
+        dd: Optional[DOFDesc] = None) -> Scalar:
     """Returns a (maximum) characteristic length based on the volume of the
     elements. This length may not be representative if the elements have very
     high aspect ratios.
@@ -175,7 +182,7 @@ def h_max_from_volume(
     from grudge.reductions import nodal_max, elementwise_sum
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
     dd = as_dofdesc(dd)
 
     if dim is None:
@@ -191,7 +198,8 @@ def h_max_from_volume(
 
 @memoize_on_first_arg
 def h_min_from_volume(
-        dcoll: DiscretizationCollection, dim=None, dd=None) -> Scalar:
+        dcoll: DiscretizationCollection, dim=None,
+        dd: Optional[DOFDesc] = None) -> Scalar:
     """Returns a (minimum) characteristic length based on the volume of the
     elements. This length may not be representative if the elements have very
     high aspect ratios.
@@ -206,7 +214,7 @@ def h_min_from_volume(
     from grudge.reductions import nodal_min, elementwise_sum
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
     dd = as_dofdesc(dd)
 
     if dim is None:
@@ -221,7 +229,7 @@ def h_min_from_volume(
 
 
 def dt_geometric_factors(
-        dcoll: DiscretizationCollection, dd=None) -> DOFArray:
+        dcoll: DiscretizationCollection, dd: Optional[DOFDesc] = None) -> DOFArray:
     r"""Computes a geometric scaling factor for each cell following [Hesthaven_2008]_,
     section 6.4, defined as the inradius (radius of an inscribed circle/sphere).
 
@@ -244,7 +252,7 @@ def dt_geometric_factors(
     from meshmode.discretization.poly_element import SimplexElementGroupBase
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     actx = dcoll._setup_actx
     volm_discr = dcoll.discr_from_dd(dd)
@@ -271,7 +279,8 @@ def dt_geometric_factors(
         # Inscribed "circle" radius is half the cell size
         return actx.freeze(cell_vols/2)
 
-    dd_face = DOFDesc("all_faces", dd.discretization_tag)
+    dd_face = dd.with_domain_tag(
+            BoundaryDomainTag(FACE_RESTR_ALL, dd.domain_tag.tag))
     face_discr = dcoll.discr_from_dd(dd_face)
 
     # Compute areas of each face
@@ -281,6 +290,62 @@ def dt_geometric_factors(
         )
     )
 
+    data = []
+
+    if actx.supports_nonscalar_broadcasting:
+        for vgrp, face_ae_i in zip(volm_discr.groups, face_areas):
+
+            fp_format = face_ae_i.dtype
+            Ne = vgrp.nelements
+            Nf = vgrp.mesh_el_group.nfaces
+            Nj = face_ae_i.shape[-1]#afgrp.nunit_dofs
+
+            kernel_data = [
+                lp.GlobalArg("arg0", fp_format, strides=lp.auto, shape=(Nf, Ne, Nj), tags=[IsFaceDOFArray()]), 
+                #lp.GlobalArg("out", fp_format, is_output=True), # Specifying causes wrong soln
+                lp.ValueArg("Nf", tags=[ParameterValue(Nf)]),
+                lp.ValueArg("Nj", tags=[ParameterValue(Nj)]),
+                lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+                ...
+            ]
+            kd_tag = KernelDataTag(kernel_data)
+
+            data.append(actx.einsum("fej->e",
+                        tag_axes(actx, {
+                            0: DiscretizationFaceAxisTag(),
+                            1: DiscretizationElementAxisTag(),
+                            2: DiscretizationDOFAxisTag()
+                            },
+                        #face_ae_i.reshape(Nf, Ne, face_ae_i.shape[-1])),
+                        face_ae_i.reshape(Nf, Ne, Nj)),
+                        tagged=(FirstAxisIsElementsTag(),kd_tag)))
+    else:
+
+        for vgrp, afgrp, face_ae_i in zip(volm_discr.groups, face_discr.groups, face_areas):
+            fp_format = face_ae_i.dtype
+            Ne = vgrp.nelements
+            Nf = vgrp.mesh_el_group.nfaces
+            Nj = face_ae_i.shape[-1]#afgrp.nunit_dofs
+            
+            kernel_data = [
+                lp.GlobalArg("arg0", fp_format, strides=lp.auto, shape=(Nf, Ne, Nj), tags=[IsFaceDOFArray()]), 
+                #lp.GlobalArg("out", fp_format, is_output=True), # Specifying causes wrong soln
+                lp.ValueArg("Nf", tags=[ParameterValue(Nf)]),
+                lp.ValueArg("Nj", tags=[ParameterValue(Nj)]),
+                lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+                ...
+            ]
+            kd_tag = KernelDataTag(kernel_data)
+
+
+            data.append(actx.einsum("fej->e",
+                        #face_ae_i.reshape(Nf, Ne, face_ae_i.shape[-1]),
+                        face_ae_i.reshape(Nf, Ne, Nj),
+                        tagged=(FirstAxisIsElementsTag(),kd_tag)) / afgrp.nunit_dofs)
+
+    surface_areas = DOFArray(actx, data=tuple(data))
+
+    """
     if actx.supports_nonscalar_broadcasting:
         # Compute total surface area of an element by summing over the
         # individual face areas
@@ -325,14 +390,30 @@ def dt_geometric_factors(
                                                   face_areas)
             )
         )
+    """
 
-    return actx.freeze(
-            actx.tag(NameHint(f"dt_geometric_{dd.as_identifier()}"),
-                DOFArray(actx,
-                    data=tuple(
-                        actx.einsum("e,ei->ei", 1/sae_i, cv_i,
-                            tagged=(FirstAxisIsElementsTag(),)) * dcoll.dim
-                        for cv_i, sae_i in zip(cell_vols, surface_areas)))))
+    data = []
+    for cv_i, sae_i, in zip(cell_vols, surface_areas):
+
+        fp_format = cv_i.dtype
+        Ne, Ni = cv_i.shape
+ 
+        kernel_data = [
+            lp.GlobalArg("arg0", sae_i.dtype, shape=(Ne,), strides=lp.auto), 
+            lp.GlobalArg("arg1", fp_format, shape=(Ne, Ni), tags=[IsDOFArray()]), 
+            lp.GlobalArg("out", fp_format, shape=(Ne, Ni), tags=[IsDOFArray()], is_output=True),
+            lp.ValueArg("Ni", tags=[ParameterValue(Ni)]),
+            lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+            ...
+        ]
+        kd_tag = KernelDataTag(kernel_data)
+
+        data.append(actx.einsum("e,ei->ei",
+                        1/sae_i,
+                        cv_i,
+                        tagged=(FirstAxisIsElementsTag(),kd_tag)) * dcoll.dim)
+
+    return actx.freeze(actx.tag(NameHint(f"dt_geometric_{dd.as_identifier()}"),DOFArray(actx, data=tuple(data))))
 
 # }}}
 
diff --git a/grudge/eager.py b/grudge/eager.py
index 1886cfd04..2175592d4 100644
--- a/grudge/eager.py
+++ b/grudge/eager.py
@@ -47,14 +47,14 @@ def __init__(self, *args, **kwargs):
     def project(self, src, tgt, vec):
         return op.project(self, src, tgt, vec)
 
-    def grad(self, vec):
-        return op.local_grad(self, vec)
+    def grad(self, *args):
+        return op.local_grad(self, *args)
 
-    def d_dx(self, xyz_axis, vec):
-        return op.local_d_dx(self, xyz_axis, vec)
+    def d_dx(self, xyz_axis, *args):
+        return op.local_d_dx(self, xyz_axis, *args)
 
-    def div(self, vecs):
-        return op.local_div(self, vecs)
+    def div(self, *args):
+        return op.local_div(self, *args)
 
     def weak_grad(self, *args):
         return op.weak_local_grad(self, *args)
@@ -68,8 +68,8 @@ def weak_div(self, *args):
     def mass(self, *args):
         return op.mass(self, *args)
 
-    def inverse_mass(self, vec):
-        return op.inverse_mass(self, vec)
+    def inverse_mass(self, *args):
+        return op.inverse_mass(self, *args)
 
     def face_mass(self, *args):
         return op.face_mass(self, *args)
@@ -87,8 +87,8 @@ def nodal_max(self, dd, vec):
         return op.nodal_max(self, dd, vec)
 
 
-connected_ranks = op.connected_ranks
 interior_trace_pair = op.interior_trace_pair
 cross_rank_trace_pairs = op.cross_rank_trace_pairs
+inter_volume_trace_pairs = op.inter_volume_trace_pairs
 
 # vim: foldmethod=marker
diff --git a/grudge/geometry/metrics.py b/grudge/geometry/metrics.py
index 89e1f1f2c..f5c622cd5 100644
--- a/grudge/geometry/metrics.py
+++ b/grudge/geometry/metrics.py
@@ -58,6 +58,7 @@
 """
 
 
+from typing import Optional, Tuple, Union
 import numpy as np
 
 from arraycontext import ArrayContext, tag_axes
@@ -68,7 +69,7 @@
 import grudge.dof_desc as dof_desc
 
 from grudge.dof_desc import (
-    DD_VOLUME, DOFDesc, DISCR_TAG_BASE
+    DD_VOLUME_ALL, DOFDesc, DISCR_TAG_BASE
 )
 
 from meshmode.transform_metadata import (DiscretizationAmbientDimAxisTag,
@@ -115,7 +116,8 @@ def to_quad(vec):
 
 def forward_metric_nth_derivative(
         actx: ArrayContext, dcoll: DiscretizationCollection,
-        xyz_axis, ref_axes, dd=None,
+        xyz_axis: int, ref_axes: Union[int, Tuple[Tuple[int, int], ...]],
+        dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False) -> DOFArray:
     r"""Pointwise metric derivatives representing repeated derivatives of the
     physical coordinate enumerated by *xyz_axis*: :math:`x_{\mathrm{xyz\_axis}}`
@@ -150,7 +152,7 @@ def forward_metric_nth_derivative(
         metric derivative at each nodal coordinate.
     """
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     inner_dd = dd.with_discr_tag(DISCR_TAG_BASE)
 
@@ -182,8 +184,10 @@ def forward_metric_nth_derivative(
 
 
 def forward_metric_derivative_vector(
-        actx: ArrayContext, dcoll: DiscretizationCollection, rst_axis, dd=None,
-        *, _use_geoderiv_connection=False) -> np.ndarray:
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        rst_axis: Union[int, Tuple[Tuple[int, int], ...]],
+        dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False
+        ) -> np.ndarray:
     r"""Computes an object array containing the forward metric derivatives
     of each physical coordinate.
 
@@ -207,7 +211,9 @@ def forward_metric_derivative_vector(
 
 
 def forward_metric_derivative_mv(
-        actx: ArrayContext, dcoll: DiscretizationCollection, rst_axis, dd=None,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        rst_axis: Union[int, Tuple[Tuple[int, int], ...]],
+        dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False) -> MultiVector:
     r"""Computes a :class:`pymbolic.geometric_algebra.MultiVector` containing
     the forward metric derivatives of each physical coordinate.
@@ -230,7 +236,8 @@ def forward_metric_derivative_mv(
 
 
 def forward_metric_derivative_mat(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd=None,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False) -> np.ndarray:
     r"""Computes the forward metric derivative matrix, also commonly
     called the Jacobian matrix, with entries defined as the
@@ -257,7 +264,7 @@ def forward_metric_derivative_mat(
     ambient_dim = dcoll.ambient_dim
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     dim = dcoll.discr_from_dd(dd).dim
 
@@ -271,7 +278,8 @@ def forward_metric_derivative_mat(
 
 
 def first_fundamental_form(actx: ArrayContext, dcoll: DiscretizationCollection,
-        dd=None, *, _use_geoderiv_connection=False) -> np.ndarray:
+        dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False
+        ) -> np.ndarray:
     r"""Computes the first fundamental form using the Jacobian matrix:
 
     .. math::
@@ -297,7 +305,7 @@ def first_fundamental_form(actx: ArrayContext, dcoll: DiscretizationCollection,
         form.
     """
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     mder = forward_metric_derivative_mat(
         actx, dcoll, dd=dd, _use_geoderiv_connection=_use_geoderiv_connection)
@@ -306,7 +314,8 @@ def first_fundamental_form(actx: ArrayContext, dcoll: DiscretizationCollection,
 
 
 def inverse_metric_derivative_mat(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd=None,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False) -> np.ndarray:
     r"""Computes the inverse metric derivative matrix, which is
     the inverse of the Jacobian (forward metric derivative) matrix.
@@ -320,7 +329,7 @@ def inverse_metric_derivative_mat(
     ambient_dim = dcoll.ambient_dim
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     dim = dcoll.discr_from_dd(dd).dim
 
@@ -336,7 +345,8 @@ def inverse_metric_derivative_mat(
 
 
 def inverse_first_fundamental_form(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd=None,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False) -> np.ndarray:
     r"""Computes the inverse of the first fundamental form:
 
@@ -360,7 +370,7 @@ def inverse_first_fundamental_form(
         first fundamental form.
     """
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     dim = dcoll.discr_from_dd(dd).dim
 
@@ -387,7 +397,8 @@ def inverse_first_fundamental_form(
 
 
 def inverse_metric_derivative(
-        actx: ArrayContext, dcoll: DiscretizationCollection, rst_axis, xyz_axis, dd,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        rst_axis: int, xyz_axis: int, dd: DOFDesc,
         *, _use_geoderiv_connection=False
         ) -> DOFArray:
     r"""Computes the inverse metric derivative of the physical
@@ -446,7 +457,7 @@ def outprod_with_unit(i, at):
 
 def inverse_surface_metric_derivative(
         actx: ArrayContext, dcoll: DiscretizationCollection,
-        rst_axis, xyz_axis, dd=None,
+        rst_axis, xyz_axis, dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False):
     r"""Computes the inverse surface metric derivative of the physical
     coordinate enumerated by *xyz_axis* with respect to the
@@ -468,7 +479,7 @@ def inverse_surface_metric_derivative(
     ambient_dim = dcoll.ambient_dim
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
     dd = dof_desc.as_dofdesc(dd)
 
     if ambient_dim == dim:
@@ -488,7 +499,8 @@ def inverse_surface_metric_derivative(
 
 
 def inverse_surface_metric_derivative_mat(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd=None,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None,
         *, times_area_element=False, _use_geoderiv_connection=False):
     r"""Computes the matrix of inverse surface metric derivatives, indexed by
     ``(xyz_axis, rst_axis)``. It returns all values of
@@ -509,7 +521,7 @@ def inverse_surface_metric_derivative_mat(
     """
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
     dd = dof_desc.as_dofdesc(dd)
 
     @memoize_in(dcoll, (inverse_surface_metric_derivative_mat, dd,
@@ -542,7 +554,7 @@ def _inv_surf_metric_deriv():
 
 
 def _signed_face_ones(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd
+        actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc
         ) -> DOFArray:
 
     assert dd.is_trace()
@@ -550,7 +562,7 @@ def _signed_face_ones(
     # NOTE: ignore quadrature_tags on dd, since we only care about
     # the face_id here
     all_faces_conn = dcoll.connection_from_dds(
-        DD_VOLUME, DOFDesc(dd.domain_tag)
+        DD_VOLUME_ALL, DOFDesc(dd.domain_tag, DISCR_TAG_BASE)
     )
     signed_ones = dcoll.discr_from_dd(dd.with_discr_tag(DISCR_TAG_BASE)).zeros(
         actx, dtype=dcoll.real_dtype
@@ -571,7 +583,7 @@ def _signed_face_ones(
 
 
 def parametrization_derivative(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd,
+        actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc,
         *, _use_geoderiv_connection=False) -> MultiVector:
     r"""Computes the product of forward metric derivatives spanning the
     tangent space with topological dimension *dim*.
@@ -584,7 +596,7 @@ def parametrization_derivative(
         the product of metric derivatives.
     """
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     dim = dcoll.discr_from_dd(dd).dim
     if dim == 0:
@@ -605,8 +617,10 @@ def parametrization_derivative(
     )
 
 
-def pseudoscalar(actx: ArrayContext, dcoll: DiscretizationCollection,
-        dd=None, *, _use_geoderiv_connection=False) -> MultiVector:
+def pseudoscalar(
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False
+        ) -> MultiVector:
     r"""Computes the field of pseudoscalars for the domain/discretization
     identified by *dd*.
 
@@ -618,7 +632,7 @@ def pseudoscalar(actx: ArrayContext, dcoll: DiscretizationCollection,
         :class:`~meshmode.dof_array.DOFArray`\ s.
     """
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     return parametrization_derivative(
         actx, dcoll, dd,
@@ -626,7 +640,8 @@ def pseudoscalar(actx: ArrayContext, dcoll: DiscretizationCollection,
 
 
 def area_element(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd=None,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False
         ) -> DOFArray:
     r"""Computes the scale factor used to transform integrals from reference
@@ -642,7 +657,7 @@ def area_element(
         volumes for each element.
     """
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     @memoize_in(dcoll, (area_element, dd, _use_geoderiv_connection))
     def _area_elements():
@@ -662,7 +677,8 @@ def _area_elements():
 # {{{ surface normal vectors
 
 def rel_mv_normal(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd=None,
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None,
         *, _use_geoderiv_connection=False) -> MultiVector:
     r"""Computes surface normals at each nodal location as a
     :class:`~pymbolic.geometric_algebra.MultiVector` relative to the
@@ -688,7 +704,7 @@ def rel_mv_normal(
 
 
 def mv_normal(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd,
+        actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc,
         *, _use_geoderiv_connection=False
         ) -> MultiVector:
     r"""Exterior unit normal as a :class:`~pymbolic.geometric_algebra.MultiVector`.
@@ -744,10 +760,10 @@ def _normal():
             from grudge.op import project
 
             volm_normal = MultiVector(
-                project(dcoll, dof_desc.DD_VOLUME, dd,
+                project(dcoll, DD_VOLUME_ALL, dd,
                         rel_mv_normal(
                             actx, dcoll,
-                            dd=dof_desc.DD_VOLUME,
+                            dd=DD_VOLUME_ALL,
                             _use_geoderiv_connection=_use_geoderiv_connection
                         ).as_vector(dtype=object))
             )
@@ -768,7 +784,7 @@ def _normal():
     return actx.thaw(_normal())
 
 
-def normal(actx: ArrayContext, dcoll: DiscretizationCollection, dd,
+def normal(actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc,
         *, _use_geoderiv_connection=None):
     """Get the unit normal to the specified surface discretization, *dd*.
     This supports both volume discretizations
@@ -798,8 +814,8 @@ def normal(actx: ArrayContext, dcoll: DiscretizationCollection, dd,
 # {{{ Curvature computations
 
 def second_fundamental_form(
-        actx: ArrayContext, dcoll: DiscretizationCollection, dd=None
-        ) -> np.ndarray:
+        actx: ArrayContext, dcoll: DiscretizationCollection,
+        dd: Optional[DOFDesc] = None) -> np.ndarray:
     r"""Computes the second fundamental form:
 
     .. math::
@@ -817,7 +833,7 @@ def second_fundamental_form(
     """
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     dim = dcoll.discr_from_dd(dd).dim
     normal = rel_mv_normal(actx, dcoll, dd=dd).as_vector(dtype=object)
@@ -846,7 +862,7 @@ def second_fundamental_form(
 
 
 def shape_operator(actx: ArrayContext, dcoll: DiscretizationCollection,
-        dd=None) -> np.ndarray:
+        dd: Optional[DOFDesc] = None) -> np.ndarray:
     r"""Computes the shape operator (also called the curvature tensor) containing
     second order derivatives:
 
@@ -871,7 +887,7 @@ def shape_operator(actx: ArrayContext, dcoll: DiscretizationCollection,
 
 
 def summed_curvature(actx: ArrayContext, dcoll: DiscretizationCollection,
-        dd=None) -> DOFArray:
+        dd: Optional[DOFDesc] = None) -> DOFArray:
     r"""Computes the sum of the principal curvatures:
 
     .. math::
@@ -888,7 +904,7 @@ def summed_curvature(actx: ArrayContext, dcoll: DiscretizationCollection,
     """
 
     if dd is None:
-        dd = DD_VOLUME
+        dd = DD_VOLUME_ALL
 
     dim = dcoll.ambient_dim - 1
 
diff --git a/grudge/grudge_array_context.py b/grudge/grudge_array_context.py
new file mode 100644
index 000000000..cbec3d72b
--- /dev/null
+++ b/grudge/grudge_array_context.py
@@ -0,0 +1,1688 @@
+from meshmode.array_context import PyOpenCLArrayContext
+from grudge.array_context import MPIPyOpenCLArrayContext
+from pytools import memoize_method, memoize_in, memoize
+import loopy as lp
+import pyopencl as cl
+import pyopencl.array as cla
+import numpy as np
+
+import grudge.loopy_dg_kernels as dgk
+from grudge.grudge_tags import (IsDOFArray, IsSepVecDOFArray, IsFaceDOFArray, 
+    IsOpArray, IsSepVecOpArray, ParameterValue, IsFaceMassOpArray, KernelDataTag,
+    IsVecDOFArray, IsVecOpArray, IsFourAxisDOFArray, EinsumArgsTags)
+
+from arraycontext.impl.pyopencl.fake_numpy import (PyOpenCLFakeNumpyNamespace)
+from arraycontext.container.traversal import (rec_map_array_container,
+    multimapped_over_array_containers)
+
+from hashlib import md5
+import hjson
+import os
+import pickle
+from os.path import exists
+
+from grudge.loopy_dg_kernels.run_tests import (generic_test, random_search,
+        exhaustive_search, exhaustive_search_v2)
+from arraycontext.container.traversal import rec_multimap_array_container
+from typing import Optional
+
+#from grudge.loopy_dg_kernels.run_tests import analyzeResult
+
+try:
+    import importlib.resources as pkg_resources
+except ImportError:
+    # Use backported version for python < 3.7
+    import importlib_resources as pkg_resources
+
+ctof_knl_base = lp.make_copy_kernel("f,f", old_dim_tags="c,c")
+ctof_knl = lp.make_kernel(ctof_knl_base.default_entrypoint.domains,
+                     ctof_knl_base.default_entrypoint.instructions,
+                     default_offset=lp.auto)
+ctof_knl = lp.tag_array_axes(ctof_knl, "input", "c,c")
+ctof_knl = lp.tag_array_axes(ctof_knl, "output", "f,f")
+
+#ftoc_knl = lp.make_copy_kernel("c,c", old_dim_tags="f,f")
+
+def get_transformation_id(device_id):
+    """Return the ``device_mappings.hjson`` entry stored under *device_id*."""
+    # The context manager closes the resource even if reading/parsing raises.
+    with pkg_resources.open_text(dgk, "device_mappings.hjson") as hjson_file:
+        od = hjson.loads(hjson_file.read())
+    return od[device_id]
+
+def get_fp_string(dtype):
+    return "FP64" if dtype == np.float64 else "FP32"  # any non-float64 -> "FP32"
+
+#def get_order_from_dofs(dofs):
+#    dofs_to_order = {10: 2, 20: 3, 35: 4, 56: 5, 84: 6, 120: 7}
+#    return dofs_to_order[dofs]
+
+
+def fix_program_parameters(program):  # returns the program with tagged args fixed
+    for arg in program.default_entrypoint.args:
+        for tag in arg.tags:
+            if isinstance(tag, ParameterValue):  # tag carries the value to fix
+                program = lp.fix_parameters(program, **{arg.name: tag.value})
+    return program
+
+def set_memory_layout(program, order="F"):
+    # Tags array axes per argument tag; assumes arguments have only one tag.
+    if order == "F":
+        for arg in program.default_entrypoint.args:
+            if IsDOFArray() in arg.tags:
+                program = lp.tag_array_axes(program, arg.name, "f,f")
+            elif IsSepVecDOFArray() in arg.tags:
+                program = lp.tag_array_axes(program, arg.name, "sep,f,f")
+            elif IsSepVecOpArray() in arg.tags:
+                program = lp.tag_array_axes(program, arg.name, "sep,c,c")
+            elif IsFaceDOFArray() in arg.tags:
+                # Why is this the data layout with fortran ordering?
+                program = lp.tag_array_axes(program, arg.name, "N1,N0,N2")
+            elif IsVecDOFArray() in arg.tags:
+                program = lp.tag_array_axes(program, arg.name, "N2,N0,N1")
+            elif IsVecOpArray() in arg.tags or IsFaceMassOpArray() in arg.tags:
+                program = lp.tag_array_axes(program, arg.name, "c,c,c")
+            elif IsFourAxisDOFArray() in arg.tags:
+                program = lp.tag_array_axes(program, arg.name, "N3,N2,N0,N1")
+
+    # The steps below run for every value of *order*: arguments tagged with
+    # ParameterValue are fixed to the tagged value (see
+    # fix_program_parameters), and the no_numpy/return_dict options are set
+    # on the program before it is returned.
+    program = fix_program_parameters(program)
+    program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True))
+    return program
+
+
+# {{{ _get_scalar_func_loopy_program
+
+def _get_scalar_func_loopy_program(actx, c_name, nargs, axis_lengths):
+    """Return a cached loopy program computing ``out = c_name(inp0, ...)``
+    elementwise over *nargs* inputs of rank ``len(axis_lengths)``."""
+    # The program only depends on the rank, so the cache is keyed on the rank
+    # rather than on the concrete shape.
+    @memoize_in(actx, _get_scalar_func_loopy_program)
+    def get(c_name, nargs, naxes):
+        from pymbolic import var
+        from islpy import make_zero_and_vars
+        from arraycontext.loopy import make_loopy_program
+        from arraycontext.transform_metadata import ElementwiseMapKernelTag
+
+        var_names = ["i%d" % i for i in range(naxes)]
+        size_names = ["n%d" % i for i in range(naxes)]
+        subscript = tuple(var(vname) for vname in var_names)
+
+        # Box domain: 0 <= i_k < n_k for every axis k.
+        v = make_zero_and_vars(var_names, params=size_names)
+        domain = v[0].domain()
+        for vname, sname in zip(var_names, size_names):
+            domain = domain & v[0].le_set(v[vname]) & v[vname].lt_set(v[sname])
+        domain_bset, = domain.get_basic_sets()
+
+        # Only multi-axis arguments are tagged as DOF arrays.
+        tags = [IsDOFArray()] if naxes > 1 else []
+        kernel_data = [
+                lp.GlobalArg("inp%d" % i, None, shape=tuple(size_names), tags=tags)
+                for i in range(nargs)]
+        kernel_data.append(
+            lp.GlobalArg("out", None, shape=tuple(size_names), tags=tags))
+        kernel_data.append(...)
+
+        return make_loopy_program(
+                [domain_bset],
+                [
+                    lp.Assignment(
+                        var("out")[subscript],
+                        var(c_name)(*[
+                            var("inp%d" % i)[subscript] for i in range(nargs)]))
+                    ],
+                kernel_data=kernel_data,
+                name="actx_special_%s" % c_name,
+                tags=(ElementwiseMapKernelTag(),))
+
+    return get(c_name, nargs, len(axis_lengths))
+
+# }}}
+
+
+class GrudgeFakeNumpyNamespace(PyOpenCLFakeNumpyNamespace):
+
+    # Should this be smarter?
+    # This function has no idea if `a` is in flattened C or F order. Should it be assumed to be in "C" layout?
+    def reshape(self, a, newshape, order="C"): # Order here is the input layout or output layout?
+        #print("================CALLING RESHAPE================")
+        #print(type(a))
+        #assert np.allclose(a.reshape(newshape, order="F").get(), a.reshape(newshape, order="C").get())
+
+        return rec_map_array_container(
+                lambda ary: ctof_knl(self._array_context.queue, input=ary.reshape(newshape, order="C"))[1][0], a) 
+        # Need to override the default for now.
+    
+    # Could be problematic. Unflatten has no idea if the data has been changed from "F" layout to
+    # (flattened) "C" layout so when order="F" is specified data is moved around.
+    # Maybe some tags should be attached to the flattened arrays?
+    def ravel(self, a, order="C"): # Order here is the output layout
+        """Flatten each array in *a* to 1D; *order* is "C", "F" or "A"
+        ("K" raises NotImplementedError, anything else ValueError)."""
+        def _rec_ravel(a):
+            # An F-contiguous 2D array is copied into C order by a kernel.
+            if order == "C" and len(a.shape) == 2 and a.flags.f_contiguous:
+                # Constant key: _rec_ravel is recreated on every ravel() call,
+                # so keying on it would rebuild the program each time.
+                @memoize_in(self._array_context,
+                        ("GrudgeFakeNumpyNamespace.ravel", "flatten_grp_ary_prg"))
+                def prg():
+                    from arraycontext import make_loopy_program
+                    return make_loopy_program(
+                        [
+                            "{[iel]: 0 <= iel < nelements}",
+                            "{[idof]: 0 <= idof < ndofs_per_element}"
+                        ],
+                        """
+                            result[iel * ndofs_per_element + idof] = grp_ary[iel, idof]
+                        """,
+                        [
+                            lp.GlobalArg("result", None,
+                                         shape="nelements * ndofs_per_element"),
+                            lp.GlobalArg("grp_ary", None,
+                                         shape=("nelements", "ndofs_per_element"), tags=[IsDOFArray()]),
+                            lp.ValueArg("nelements", np.int32),
+                            lp.ValueArg("ndofs_per_element", np.int32),
+                            "..."
+                        ],
+                        name="flatten_grp_ary"
+                    )
+
+                return self._array_context.call_loopy(prg(), grp_ary=a)["result"]
+            elif order in ("F", "C"):
+                return a.reshape(-1, order=order)
+            elif order == "A":
+                # TODO: upstream this to pyopencl.array
+                if a.flags.f_contiguous:
+                    return a.reshape(-1, order="F")
+                elif a.flags.c_contiguous:
+                    return a.reshape(-1, order="C")
+                else:
+                    raise ValueError("For `order='A'`, array should be either"
+                                     " F-contiguous or C-contiguous.")
+            elif order == "K":
+                raise NotImplementedError("PyOpenCLArrayContext.np.ravel not "
+                                          "implemented for 'order=K'")
+            else:
+                raise ValueError("`order` can be one of 'F', 'C', 'A' or 'K'. "
+                                 f"(got {order})")
+
+        return rec_map_array_container(_rec_ravel, a)
+
+
+    def stack(self,arrays, axis=0):
+        from pytools.obj_array import make_obj_array
+
+        if not axis == 0:
+            raise NotImplementedError("Axes other than 0 are not currently supported")
+
+        def _stack(arrays, queue):
+
+            #print(len(arrays))
+            #print(arrays[0].shape)
+            #print(arrays[0].strides)
+
+            # This sorts the strides from lowest to highest and then
+            # uses their original indices to create a list of "N{i}"
+            # strings.
+
+            ndims = len(arrays[0].shape)
+            lp_strides_ordered = np.array([f"N{i}" for i in range(ndims)])
+            lp_strides = np.empty_like(lp_strides_ordered)
+            sorted_estrides = np.array(sorted(list(enumerate(arrays[0].strides)), key=lambda tup : tup[1]))
+            for i, j in enumerate(sorted_estrides[:,0]):
+                lp_strides[j] = lp_strides_ordered[i]
+
+            lp_strides_out = [f"N{ndims}"] + list(lp_strides)
+            lp_strides_in = ["sep"] + list(lp_strides)
+
+            # Loopy errors with this, constructing string instead
+            #prg = lp.make_copy_kernel(lp_strides_out, old_dim_tags=lp_strides_in)
+
+            # Loopy errors when try to use the lp_strides lists directly
+            str_strides_in = ""
+            str_strides_out = ""
+
+            for s0, s1 in zip(lp_strides_out, lp_strides_in):
+                str_strides_out += s0 + ","
+                str_strides_in += s1 + ","
+            str_strides_out = str_strides_out[:len(str_strides_out) - 1]
+            str_strides_in = str_strides_in[:len(str_strides_in) - 1]
+           
+            #print(arrays[0].strides) 
+            #print(str_strides_in)
+            #print(str_strides_out)
+
+            prg = lp.make_copy_kernel(str_strides_out, old_dim_tags=str_strides_in)
+
+            # Fix the kernel parameters
+            d = {"n{}".format(i+1): n for i,n in enumerate(arrays[0].shape)}
+            d["n0"] = len(arrays)
+            prg = lp.fix_parameters(prg,  **d)
+
+            # Should call_loopy be used instead? Probably. No reason not to.
+            result = prg(queue, input=make_obj_array(arrays))[1][0]
+            #print(result.shape)
+            return result
+
+        return rec_multimap_array_container(
+                 lambda *args: _stack(args, self._array_context.queue),
+                 *arrays)
+
+        #return rec_multimap_array_container(
+        #         lambda *args: cla.stack(arrays=args, axis=axis,
+        #             queue=self._array_context.queue),
+        #         *arrays)
+
+    def __getattr__(self, name):
+        def loopy_implemented_elwise_func(*args):
+            if all(np.isscalar(ary) for ary in args):
+                return getattr(
+                         np, self._c_to_numpy_arc_functions.get(name, name)
+                         )(*args)
+            actx = self._array_context
+            prg = _get_scalar_func_loopy_program(actx,
+                    c_name, nargs=len(args), axis_lengths=args[0].shape)
+            #for arg in args:
+                #print("Input dtype:", arg.dtype)
+                #print("Input shape:", arg.shape)
+                #print("Input strides:", arg.strides)
+                #print("Input Sum:", cla.sum(arg))
+                ##print("Input Max:", cla.max(arg))
+                ##print("Input Min:", cla.min(arg))
+                #print("Input numpy:", np.sum(np.abs(arg.get())))
+                #if arg.shape == (0,2):
+                #    print("Input array:", arg.get())
+            #cargs = []
+            #for arg in args:
+            #    print(
+            #evt, (out,) = ftoc_knl(self._array_context.queue, input=arg)
+            #    cargs.append(out)
+            # Workaround
+            #if len(args) == 1 and args[0].shape[0] == 0:
+            #    return args[0]
+            #print(prg)
+
+            outputs = actx.call_loopy(prg,
+                    #**{"inp%d" % i: cargs[i] for i, arg in enumerate(args)})
+                    **{"inp%d" % i: arg for i, arg in enumerate(args)})
+            
+            #print("PyOpenCL Output sum:", cla.sum(outputs["out"]))
+            #print("Output numpy:", np.sum(np.abs(outputs["out"].get())))
+            #1/0
+            #exit()
+            return outputs["out"]
+
+        if name in self._c_to_numpy_arc_functions:
+            from warnings import warn
+            warn(f"'{name}' in ArrayContext.np is deprecated. "
+                    f"Use '{self._c_to_numpy_arc_functions[name]}' as in numpy. "
+                    "The old name will stop working in 2022.",
+                    DeprecationWarning, stacklevel=3)
+
+        # normalize to C names anyway
+        c_name = self._numpy_to_c_arc_functions.get(name, name)
+
+        # limit which functions we try to hand off to loopy
+        if (name in self._numpy_math_functions
+                or name in self._c_to_numpy_arc_functions):
+            return multimapped_over_array_containers(loopy_implemented_elwise_func)
+        else:
+            raise AttributeError(
+                    f"'{type(self._array_context).__name__}.np' object "
+                    f"has no attribute '{name}'")
+
+    """ Old version
+    def __getattr__(self, name):
+        def loopy_implemented_elwise_func(*args):
+            actx = self._array_context
+            prg = _get_scalar_func_loopy_program(actx,
+                    c_name, nargs=len(args), naxes=len(args[0].shape))
+            outputs = actx.call_loopy(prg,
+                    **{"inp%d" % i: arg for i, arg in enumerate(args)})
+            return outputs["out"]
+
+        if name in self._c_to_numpy_arc_functions:
+            from warnings import warn
+            warn(f"'{name}' in ArrayContext.np is deprecated. "
+                    "Use '{c_to_numpy_arc_functions[name]}' as in numpy. "
+                    "The old name will stop working in 2021.",
+                    DeprecationWarning, stacklevel=3)
+
+        # normalize to C names anyway
+        c_name = self._numpy_to_c_arc_functions.get(name, name)
+
+        # limit which functions we try to hand off to loopy
+        if name in self._numpy_math_functions:
+            return multimapped_over_array_containers(loopy_implemented_elwise_func)
+        else:
+            raise AttributeError(name)
+    """
+
+# The PyOpenCLArrayContext needs this since the array dimensions are (presumably)
+# given as ParameterValue tags. Maybe the fixing belongs in PyOpenCLArrayContext.
+class ParameterFixingPyOpenCLArrayContext(MPIPyOpenCLArrayContext):
+
+    @memoize_method
+    def transform_loopy_program(self, program):
+
+        program = set_memory_layout(program, order="C")
+        #program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True))
+        # Set no_numpy and return_dict options here?
+        #for arg in program.default_entrypoint.args:
+        #    for tag in arg.tags:
+        #        if isinstance(tag, ParameterValue):
+        #            program = lp.fix_parameters(program, **{arg.name: tag.value})
+
+        #program = super().transform_loopy_program(program)
+        return program
+
+
+    def call_loopy(self, program, **kwargs):
+        """Run *program* and, if the queue profiles, print timing/bandwidth.
+
+        :arg program: the :mod:`loopy` program to execute.
+        :arg kwargs: kernel arguments, forwarded to ``super().call_loopy``.
+        :return: the result dictionary of ``super().call_loopy``, unchanged.
+
+        Profiling output is skipped when no result array carries an event; a
+        zero duration is reported with a NaN bandwidth instead of raising.
+        The byte count is an estimate: dedicated formulas are used for the
+        ``resample_by_mat`` and ``resample_by_picking_group`` kernels, and
+        8-byte entries are assumed otherwise.
+        """
+        result = super().call_loopy(program, **kwargs)
+
+        queue_properties = self.queue.get_info(cl.command_queue_info.PROPERTIES)
+        profiling_enabled = cl.command_queue_properties.PROFILING_ENABLE
+        if queue_properties & profiling_enabled != profiling_enabled:
+            return result
+
+        # Use the first event attached to any returned device array.
+        evt = None
+        for val in result.values():
+            if isinstance(val, cla.Array) and val.events:
+                evt = val.events[0]
+                break
+        if evt is None:
+            # Nothing to time; previously this fell through to ``None.wait()``.
+            return result
+
+        evt.wait()
+        dt = evt.profile.end - evt.profile.start
+        print("Clock ticks:", dt)
+        dt = dt / 1e9
+
+        nbytes = 0
+        name = program.default_entrypoint.name
+        # Could probably just use program.default_entrypoint.args but maybe all
+        # parameters are not set
+        if "resample_by_mat" in name:
+            n_to_nodes, n_from_nodes = kwargs["resample_mat"].shape
+            nbytes = (kwargs["to_element_indices"].shape[0]*n_to_nodes
+                      + n_to_nodes*n_from_nodes
+                      + kwargs["from_element_indices"].shape[0]*n_from_nodes) * 8
+        elif name == "resample_by_picking_group":
+            nelements = kwargs["from_element_indices"].shape[0]
+            dpl1, nunit_dofs_tgt = kwargs["dof_pick_lists"].shape
+            ary_bytes = kwargs["ary"].dtype.itemsize
+            dpl_bytes = kwargs["dof_pick_lists"].dtype.itemsize
+            dpli_bytes = kwargs["dof_pick_list_indices"].dtype.itemsize
+            fei_bytes = kwargs["from_element_indices"].dtype.itemsize
+            # Data from source and target + the indirection arrays.
+            # Assume indirection arrays and data arrays are fetched only once.
+            nbytes = 2*nelements*nunit_dofs_tgt*ary_bytes
+            nbytes += (nelements*fei_bytes + nelements*dpli_bytes
+                       + nunit_dofs_tgt*dpl1*dpl_bytes)
+        elif "resample_by_picking" in name:
+            # Byte count not implemented for this kernel; reported as 0.
+            print("Inaccurate byte count for resample_by_picking")
+        else:
+            # Kernel args cannot be used here: not all kernels have their
+            # dimensions specified. Assumes 8-byte entries throughout.
+            for key, val in kwargs.items():
+                # Outputs passed in via kwargs are counted by the loop over
+                # *result* below; skipping them here avoids double counting.
+                if key not in result:
+                    try:
+                        nbytes += np.prod(val.shape)*8
+                    except AttributeError:
+                        pass  # no shape (e.g. a scalar): counted as 0 bytes
+            for val in result.values():
+                try:
+                    nbytes += np.prod(val.shape)*8
+                except AttributeError:
+                    pass  # no shape (e.g. a scalar): counted as 0 bytes
+
+        bw = nbytes / dt / 1e9 if dt else float("nan")
+        print("Kernel {}, Time {}, Bytes {}, Bandwidth {}".format(name, dt, nbytes, bw))
+
+        return result
+
+    # NOTE: not memoized; @memoize_method here somehow causes a shape mismatch
+    def _wrap_get_einsum_prg(self, spec, arg_names, tagged):
+        """Build the einsum program, re-made with any KernelDataTag's data."""
+        knl = self._get_einsum_prg(spec, arg_names, tagged)
+        for kd_tag in (t for t in tagged if isinstance(t, KernelDataTag)):
+            entry = knl.default_entrypoint
+            knl = lp.make_kernel(entry.domains, entry.instructions,
+                                 kernel_data=kd_tag.kernel_data, name=entry.name)
+        return knl
+
+
+    def einsum(self, spec, *args, arg_names=None, tagged=()):
+        """Evaluate an Einstein summation, :func:`numpy.einsum` style.
+
+        When *tagged* contains an :class:`EinsumArgsTags`, the generated
+        kernel is rebuilt with explicitly shaped and tagged arguments plus
+        one ``N<index>`` value argument per subscript, each carrying a
+        :class:`ParameterValue` tag with that dimension's extent.
+
+        :arg spec: comma-separated subscript labels followed by ``->`` and
+            the output subscripts; the explicit ``->`` is required.
+        :arg args: array-like operands, ordered as the labels in *spec*.
+        :arg arg_names: optional names for *args*; ``arg0``, ``arg1``, ...
+            are generated if *None*.
+        :arg tagged: optional sequence of :class:`pytools.tag.Tag` objects
+            to apply to the operation.
+
+        :return: the ``"out"`` array of the einsum :mod:`loopy` program.
+        """
+        if arg_names is None:
+            arg_names = tuple("arg%d" % i for i in range(len(args)))
+
+        # The last EinsumArgsTags in *tagged* wins.
+        tags_map = None
+        for tag in tagged:
+            if isinstance(tag, EinsumArgsTags):
+                tags_map = tag.tags_map
+
+        if tags_map is None:
+            prg = self._wrap_get_einsum_prg(spec, arg_names, tagged)
+        else:
+            prg = self._get_einsum_prg(spec, arg_names, tagged)
+            in_specs, out_spec = spec.split("->")
+
+            # Extent of every subscript letter, in order of first appearance.
+            extents = {}
+            kernel_data = []
+            for letters, operand, name in zip(in_specs.split(","), args, arg_names):
+                extents.update(zip(letters, operand.shape))
+                kernel_data.append(lp.GlobalArg(name, operand.dtype,
+                    shape=operand.shape, offset=lp.auto, tags=tags_map.get(name)))
+
+            out_shape = tuple(extents[letter] for letter in out_spec)
+            # TODO: More robust way to find output dtype
+            kernel_data.append(lp.GlobalArg("out", args[-1].dtype, shape=out_shape,
+                offset=lp.auto, tags=tags_map.get("out"), is_output=True))
+            kernel_data.extend(
+                lp.ValueArg(f"N{letter}", tags=[ParameterValue(extent)])
+                for letter, extent in extents.items())
+            kernel_data.append(...)
+
+            ep = prg.default_entrypoint
+            prg = lp.make_kernel(ep.domains, ep.instructions,
+                                 kernel_data=kernel_data, name=ep.name)
+
+        named_args = {arg_names[i]: arg for i, arg in enumerate(args)}
+        return self.call_loopy(prg, **named_args)["out"]
+
+
+class FortranOrderedArrayContext(ParameterFixingPyOpenCLArrayContext):
+    """Array context that allocates and lays out arrays in Fortran order.
+
+    ``empty``/``zeros`` allocate with ``order="F"``, ``thaw``/``from_numpy``
+    may convert data via ``ctof_knl``, and programs get an "F" layout.
+    """
+
+    def _get_fake_numpy_namespace(self):
+        """Return a :class:`GrudgeFakeNumpyNamespace` bound to this context."""
+        return GrudgeFakeNumpyNamespace(self)
+
+    def empty(self, shape, dtype):
+        """Allocate an uninitialized device array with ``order="F"``."""
+        return cla.empty(self.queue, shape=shape, dtype=dtype,
+                         order="F", allocator=self.allocator)
+
+    def zeros(self, shape, dtype):
+        """Allocate a zero-filled device array with ``order="F"``."""
+        return cla.zeros(self.queue, shape=shape, dtype=dtype,
+                         order="F", allocator=self.allocator)
+
+    def thaw(self, array):
+        """Thaw *array* and, for 2D C-contiguous input, run ``ctof_knl`` on it.
+
+        The kernel is run through :meth:`call_loopy` only when the thawed
+        result has a two-entry ``shape`` and *array* is C- but not
+        F-contiguous; its ``"output"`` is returned in that case.
+        """
+        thawed = super().thaw(array)
+        if not hasattr(thawed, "shape") or len(thawed.shape) != 2:
+            return thawed
+        if not array.flags.c_contiguous or array.flags.f_contiguous:
+            return thawed
+        return self.call_loopy(ctof_knl, **{"input": thawed})["output"]
+
+    def from_numpy(self, np_array: np.ndarray):
+        """Transfer *np_array* to the device.
+
+        If *np_array* has a ``tags`` attribute containing ``IsDOFArray()``,
+        the transferred array is additionally passed through ``ctof_knl``.
+        """
+        device_ary = super().from_numpy(np_array)
+        tags = getattr(np_array, "tags", None)
+        if tags is None or IsDOFArray() not in tags:
+            return device_ary
+        # Should this call go through the array context?
+        print("CHANGING LAYOUT OF INPUT NUMPY ARRAY In from_numpy")
+        _evt, (converted,) = ctof_knl(self.queue, input=device_ary)
+        return converted
+
+    def transform_loopy_program(self, program):
+        """Return *program* with its memory layout set to Fortran order.
+
+        ``super().transform_loopy_program`` is deliberately not called: the
+        PyOpenCLArrayContext default transformations can't handle Fortran
+        ordering.
+        """
+        return set_memory_layout(program, order="F")
+
+
+class KernelSavingArrayContext(FortranOrderedArrayContext):
+    """Fortran-ordered array context that pickles autotunable programs.
+
+    Every program whose entrypoint name is in ``autotuned_kernels`` is
+    written to *save_dir*, once per unique program id.
+    """
+
+    def __init__(self,
+            mpi_communicator,
+            queue: "pyopencl.CommandQueue",
+            *, allocator: Optional["pyopencl.tools.AllocatorInterface"] = None,
+            wait_event_queue_length: Optional[int] = None,
+            force_device_scalars: bool = False,
+            save_dir: str = "./pickled_programs") -> None:
+        """Create *save_dir* (default: ``./pickled_programs`` in the cwd),
+        then initialize the base context with the remaining arguments."""
+        self.save_dir = save_dir
+        os.makedirs(self.save_dir, exist_ok=True)
+
+        super().__init__(mpi_communicator, queue, allocator=allocator,
+            wait_event_queue_length=wait_event_queue_length,
+            force_device_scalars=force_device_scalars)
+
+    def transform_loopy_program(self, program):
+        """Pickle *program* if it is autotunable, then transform it.
+
+        :return: the result of ``super().transform_loopy_program``.
+        """
+        if program.default_entrypoint.name in autotuned_kernels:
+
+            # Needs to be set here so autotuner knows dimensions for test data
+            program = set_memory_layout(program, order="F")
+            pid = unique_program_id(program)
+
+            # Is there a possible race condition in the multirank case?
+            # Is there a way to obtain the current rank?
+            file_path = f"{self.save_dir}/{program.default_entrypoint.name}_{pid}.pickle"
+
+            if exists(file_path):
+                print("PICKLED FILE ALREADY EXISTS", file_path)
+            else:
+                print(program.default_entrypoint)
+                print("====WRITING PROGRAM TO FILE===", file_path)
+                # Context managers close the file even if pickling fails.
+                with open(file_path, "wb") as out_file:
+                    pickle.dump(program, out_file)
+                # Check that the identifier survives a pickle round trip.
+                print("====READING PROGRAM FROM FILE===", file_path)
+                with open(file_path, "rb") as in_file:
+                    loaded = pickle.load(in_file)
+                assert pid == unique_program_id(loaded), "program id changed"
+
+        return super().transform_loopy_program(program)
+
+class COrderedKernelSavingArrayContext(ParameterFixingPyOpenCLArrayContext):
+    """C-ordered array context that pickles autotunable programs.
+
+    Programs whose entrypoint name is in ``autotuned_kernels`` are saved.
+    """
+
+    def __init__(self,
+            mpi_communicator,
+            queue: "pyopencl.CommandQueue",
+            *, allocator: Optional["pyopencl.tools.AllocatorInterface"] = None,
+            wait_event_queue_length: Optional[int] = None,
+            force_device_scalars: bool = False,
+            save_dir: str = "./pickled_programs") -> None:
+        """Create *save_dir* (default: ``./pickled_programs`` in the cwd),
+        then initialize the base context with the remaining arguments."""
+        self.save_dir = save_dir
+        os.makedirs(self.save_dir, exist_ok=True)
+
+        super().__init__(mpi_communicator, queue, allocator=allocator,
+            wait_event_queue_length=wait_event_queue_length,
+            force_device_scalars=force_device_scalars)
+
+    def transform_loopy_program(self, program):
+        """Pickle *program* if it is autotunable, then transform it.
+
+        :return: the result of ``super().transform_loopy_program``.
+        """
+        if program.default_entrypoint.name in autotuned_kernels:
+
+            # Needs to be set here so autotuner knows dimensions for test data
+            program = set_memory_layout(program, order="C")
+            pid = unique_program_id(program)
+
+            # Is there a possible race condition in the multirank case?
+            # Is there a way to obtain the current rank?
+            file_path = f"{self.save_dir}/{program.default_entrypoint.name}_{pid}.pickle"
+
+            if exists(file_path):
+                print("PICKLED FILE ALREADY EXISTS", file_path)
+            else:
+                print(program.default_entrypoint)
+                print("====WRITING PROGRAM TO FILE===", file_path)
+                # Context managers close the file even if pickling fails.
+                with open(file_path, "wb") as out_file:
+                    pickle.dump(program, out_file)
+                # Check that the identifier survives a pickle round trip.
+                print("====READING PROGRAM FROM FILE===", file_path)
+                with open(file_path, "rb") as in_file:
+                    loaded = pickle.load(in_file)
+                assert pid == unique_program_id(loaded), "program id changed"
+
+        return super().transform_loopy_program(program)
+
+
+
+
+# This class could be used for some set of default transformations
+class GrudgeArrayContext(FortranOrderedArrayContext):
+
+    @memoize_method
+    def transform_loopy_program(self, program):
+        """Apply static (non-autotuned) GPU transformations to *program*.
+
+        The branch is picked by substring match on the entrypoint name; the
+        result is then passed to ``super().transform_loopy_program``. For
+        ``resample_by_picking`` kernels the sizes come from the first tag of
+        the size arguments; when such an argument is absent its size stays 0
+        and the splits are skipped, as in the ``einsum3to1_kernel`` branch.
+        """
+        # Static (non-autotuned) transformations for the GPU
+        # This needs to be fixed for new resample by picking kernel
+        n_to_nodes = nelements = 0  # stay 0 when the kernel lacks these args
+        if "resample_by_picking" in program.default_entrypoint.name:
+            for arg in program.default_entrypoint.args:
+                print(arg.name, arg.tags)
+                if arg.name == "nunit_dofs_tgt" or arg.name == "n_to_nodes":
+                    # Assumes this has a single ParameterValue tag
+                    n_to_nodes = arg.tags[0].value
+                elif arg.name == "nelements":
+                    nelements = arg.tags[0].value
+                # (the item size of "ary" is not read: the local-memory size
+                # check that used it is commented out below)
+
+            l1 = min(n_to_nodes, 32)
+            outer = min(nelements, 128)
+            l0 = min(nelements, 32)  # was: closest multiple of 32 to 1024 // n_to_nodes
+            #if l0 == 0:
+            #    l0 = 16
+            #if n_to_nodes*16 > 1024:
+            #    l0 = 8
+
+            #outer = 128#max(l0, 32)
+            # Prefetch ary if it can fit in shared memory
+
+            # Broken, plus if elements are fetched only once this helps not.
+            #if nelements*n_to_nodes <= self.queue.device.local_mem_size // ary_itemsize:
+            #    program = lp.add_prefetch(program, "ary", "iel,idof", temporary_address_space=lp.AddressSpace.LOCAL, default_tag="l.auto")
+
+            #program = set_memory_layout(program)
+            if nelements*n_to_nodes > 0:
+                if nelements*n_to_nodes <= self.queue.device.max_work_group_size:
+                    program = lp.split_iname(program, "iel", nelements, outer_tag="g.0",
+                                                inner_tag="l.0", slabs=(0,0))
+                    program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1",
+                                                inner_tag="l.1", slabs=(0,0))
+                else:
+                    slabs = (0,0) if outer == nelements else (0,1)
+                    program = lp.split_iname(program, "iel", outer, outer_tag="g.0",
+                                                slabs=slabs)
+                    program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp",
+                                                inner_tag="l.0", slabs=(0,0))
+                    slabs = (0,0) if l1 == n_to_nodes else (0,1)
+                    program = lp.split_iname(program, "idof", l1, outer_tag="g.1",
+                                                inner_tag="l.1", slabs=slabs)
+
+
+            #program = lp.add_inames_for_unused_hw_axes(program)
+            #program = lp.set_options(program, "write_cl")
+        elif "actx_special" in program.default_entrypoint.name: # Fixed
+            #program = set_memory_layout(program)
+            # Sometimes sqrt is called on single values.
+            if "i0" in program.default_entrypoint.inames:
+                program = lp.split_iname(program, "i0", 512, outer_tag="g.0",
+                                        inner_tag="l.0", slabs=(0, 1))
+            #program = lp.split_iname(program, "i0", 128, outer_tag="g.0",
+            #                           slabs=(0,1))
+            #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp",
+            #                           inner_tag="l.0")
+            #program = lp.split_iname(program, "i1", 20, outer_tag="g.1",
+            #                           inner_tag="l.1", slabs=(0,0))
+            #program2 = lp.join_inames(program, ("i1", "i0"), "i")
+            #from islpy import BasicMap
+            #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}")
+            #program2 = lp.map_domain(program, m)
+            #print(program2)
+            #exit()
+
+            #program = super().transform_loopy_program(program)
+            #print(program)
+            #print(lp.generate_code_v2(program).device_code())
+
+        # Not really certain how to do grudge_assign, done for flatten
+        elif "flatten" in program.default_entrypoint.name:
+
+            #program = set_memory_layout(program)
+            # This is hardcoded. Need to move this to separate transformation file
+            #program = lp.set_options(program, "write_cl")
+            program = lp.split_iname(program, "iel", 128, outer_tag="g.0",
+                                        slabs=(0, 1))
+            program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp",
+                                        inner_tag="l.0")
+            program = lp.split_iname(program, "idof", 20, outer_tag="g.1",
+                                        inner_tag="l.1", slabs=(0, 0))
+        # ctof kernel
+        elif "loopy_kernel" in program.default_entrypoint.name:
+
+            #program = set_memory_layout(program)
+            # This is hardcoded. Need to move this to separate transformation file
+            #program = lp.set_options(program, "write_cl")
+            print("TRANSFORMING CTOF KERNEL")
+            program = lp.split_iname(program, "i0", 128, outer_tag="g.0",
+                                        slabs=(0, 1))
+            program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp",
+                                        inner_tag="l.0")
+            program = lp.split_iname(program, "i1", 32, outer_tag="g.1",
+                                        inner_tag="l.1", slabs=(0, 0))
+
+        elif "einsum3to1_kernel" in program.default_entrypoint.name:
+
+            Ne = 0
+            for arg in program.default_entrypoint.args:
+                if arg.name == "Ne":
+                    Ne = arg.tags[0].value
+
+            if Ne != 0:
+                program = lp.split_iname(program, "e", 128, outer_tag="g.0", slabs=(0,1))
+                program = lp.split_iname(program, "e_inner", 32, outer_tag="ilp", inner_tag="l.0", slabs=(0,1))
+            program = lp.prioritize_loops(program, "f,j")
+
+        #else:
+            #print(program)
+            #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name)
+            #    The PyOpenCLArrayContext transformations can fail when inames are fixed.
+        program = super().transform_loopy_program(program)
+
+        return program
+
+
+class COrderedGrudgeArrayContext(ParameterFixingPyOpenCLArrayContext):
+
+    @memoize_method
+    def transform_loopy_program(self, program):
+        """Apply static (non-autotuned) GPU transformations to *program*.
+
+        The branch is picked by the entrypoint name; the result is then
+        passed to ``super().transform_loopy_program``. For
+        ``resample_by_picking`` kernels the sizes come from the first tag of
+        the size arguments; when such an argument is absent its size stays 0
+        and the splits are skipped, as in the ``einsum3to1_kernel`` branch.
+        """
+        # Static (non-autotuned) transformations for the GPU
+        # This needs to be fixed for new resample by picking kernel
+        n_to_nodes = nelements = 0  # stay 0 when the kernel lacks these args
+        if "resample_by_picking" in program.default_entrypoint.name:
+            for arg in program.default_entrypoint.args:
+                print(arg.name, arg.tags)
+                if arg.name == "nunit_dofs_tgt" or arg.name == "n_to_nodes":
+                    # Assumes this has a single ParameterValue tag
+                    n_to_nodes = arg.tags[0].value
+                elif arg.name == "nelements":
+                    nelements = arg.tags[0].value
+                # (the item size of "ary" is not read: the local-memory size
+                # check that used it is commented out below)
+
+            l1 = min(n_to_nodes, 32)
+            outer = min(nelements, 128)
+            l0 = min(nelements, 32)  # was: closest multiple of 32 to 1024 // n_to_nodes
+            #if l0 == 0:
+            #    l0 = 16
+            #if n_to_nodes*16 > 1024:
+            #    l0 = 8
+
+            #outer = 128#max(l0, 32)
+            # Prefetch ary if it can fit in shared memory
+
+            # Broken, plus if elements are fetched only once this helps not.
+            #if nelements*n_to_nodes <= self.queue.device.local_mem_size // ary_itemsize:
+
+            #program = set_memory_layout(program)
+            #program = lp.add_prefetch(program, "dof_pick_lists", temporary_address_space=lp.AddressSpace.LOCAL)
+            if nelements*n_to_nodes > 0:
+                if nelements*n_to_nodes <= self.queue.device.max_work_group_size:
+                    program = lp.split_iname(program, "iel", nelements, outer_tag="g.0",
+                                                inner_tag="l.0", slabs=(0,0))
+                    program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1",
+                                                inner_tag="l.1", slabs=(0,0))
+                    #program = lp.add_prefetch(program, "dof_pick_list_index", "iel_inner", default_tag="l.auto")
+                    #program = lp.add_prefetch(program, "from_element_indices", "iel_inner", default_tag="l.auto")
+                    #program = lp.add_prefetch(program, "dof_pick_lists", "", temporary_address_space=lp.AddressSpace.LOCAL)
+                else:
+                    slabs = (0,0) if outer == nelements else (0,1)
+                    program = lp.split_iname(program, "iel", outer, outer_tag="g.0",
+                                                slabs=slabs)
+                    program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp",
+                                                inner_tag="l.0", slabs=(0,0))
+                    slabs = (0,0) if l1 == n_to_nodes else (0,1)
+                    program = lp.split_iname(program, "idof", l1, outer_tag="g.1",
+                                                inner_tag="l.1", slabs=slabs)
+                    # Prefetching these two just slows the kernel, not sure about dof_pick_lists
+                    #program = lp.add_prefetch(program, "dof_pick_list_index", "iel_inner_outer,iel_inner_inner", default_tag="l.auto")
+                    #program = lp.add_prefetch(program, "from_element_indices", "iel_inner_outer,iel_inner_inner", default_tag="l.auto")
+                    #program = lp.add_prefetch(program, "dof_pick_lists", "idof_outer,idof_inner", \
+                    #        temporary_address_space=lp.AddressSpace.LOCAL, default_tag="l.auto")
+
+            program = lp.add_inames_for_unused_hw_axes(program)
+            #program = lp.set_options(program, "write_cl")
+        elif "actx_special" in program.default_entrypoint.name: # Fixed
+            # Sometimes sqrt is called on single values.
+
+            if "i0" in program.default_entrypoint.inames:
+                program = lp.split_iname(program, "i0", 128, outer_tag="g.0",
+                                           slabs=(0,1))
+                program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp",
+                                           inner_tag="l.0")
+                program = lp.split_iname(program, "i1", 32, outer_tag="g.1",
+                                           inner_tag="l.1", slabs=(0,1))
+
+                #program = lp.split_iname(program, "i0", 512, outer_tag="g.0",
+                #                        inner_tag="l.0", slabs=(0, 1))
+                #print(program)
+                #exit()
+            #program = lp.split_iname(program, "i0", 128, outer_tag="g.0",
+            #                           slabs=(0,1))
+            #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp",
+            #                           inner_tag="l.0")
+            #program = lp.split_iname(program, "i1", 20, outer_tag="g.1",
+            #                           inner_tag="l.1", slabs=(0,0))
+            #program2 = lp.join_inames(program, ("i1", "i0"), "i")
+            #from islpy import BasicMap
+            #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}")
+            #program2 = lp.map_domain(program, m)
+            #print(program2)
+            #exit()
+
+            #program = super().transform_loopy_program(program)
+            #print(program)
+            #print(lp.generate_code_v2(program).device_code())
+
+        # Not really certain how to do grudge_assign, done for flatten
+        elif "flatten" in program.default_entrypoint.name:
+
+            #program = set_memory_layout(program)
+            # This is hardcoded. Need to move this to separate transformation file
+            #program = lp.set_options(program, "write_cl")
+            program = lp.split_iname(program, "iel", 128, outer_tag="g.0",
+                                        slabs=(0, 1))
+            program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp",
+                                        inner_tag="l.0")
+            program = lp.split_iname(program, "idof", 20, outer_tag="g.1",
+                                        inner_tag="l.1", slabs=(0, 0))
+        # ctof kernel
+        elif "loopy_kernel" in program.default_entrypoint.name:
+
+            #program = set_memory_layout(program)
+            # This is hardcoded. Need to move this to separate transformation file
+            #program = lp.set_options(program, "write_cl")
+            print("TRANSFORMING CTOF KERNEL")
+            program = lp.split_iname(program, "i0", 128, outer_tag="g.0",
+                                        slabs=(0, 1))
+            program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp",
+                                        inner_tag="l.0")
+            program = lp.split_iname(program, "i1", 32, outer_tag="g.1",
+                                        inner_tag="l.1", slabs=(0, 0))
+        elif "einsum3to1_kernel" == program.default_entrypoint.name:
+
+            print("================EINSUM3TO1_KERNEL=====================")
+            #program = set_memory_layout(program, order="C")
+            Ne = 0
+            for arg in program.default_entrypoint.args:
+                if arg.name == "Ne":
+                    Ne = arg.tags[0].value
+
+            if Ne != 0:
+                program = lp.split_iname(program, "e", 128, outer_tag="g.0", slabs=(0,1))
+                program = lp.split_iname(program, "e_inner", 32, outer_tag="ilp", inner_tag="l.0", slabs=(0,1))
+            program = lp.prioritize_loops(program, "f,j")
+
+        #else:
+            #print(program)
+            #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name)
+            #    The PyOpenCLArrayContext transformations can fail when inames are fixed.
+
+        program = super().transform_loopy_program(program)
+        return program
+
+
+
+def unique_program_id(program):
+    """Return a 16-hex-digit identifier for *program*.
+
+    The identifier concatenates the first four hex digits of the MD5
+    digests of, in this order:
+
+    * the entrypoint name,
+    * ``str()`` of the entrypoint's domains,
+    * ``str()`` of the list of stringified instructions,
+    * ``str()`` of the entrypoint's args.
+
+    Only four hex digits are kept per component, so distinct programs can
+    collide.
+
+    Alternatives that were tried and rejected:
+
+    * hashing the generated device code
+      (``lp.generate_code_v2(program).device_code()``): not unique;
+    * hashing ``str(program.default_entrypoint)``: also not unique;
+    * hashing ``str([name] + domains + instr + args)`` in one go: somehow
+      this can change even if the string is the same.
+
+    :arg program: an object whose ``default_entrypoint`` provides
+        ``name``, ``domains``, ``instructions`` and ``args``.
+    :return: the identifier as a :class:`str`.
+    """
+
+    def short_digest(text):
+        # First four hex digits of the MD5 digest of *text*.
+        return md5(text.encode()).hexdigest()[:4]
+
+    kernel = program.default_entrypoint
+    instruction_strs = [str(insn) for insn in kernel.instructions]
+
+    # For debugging identifier mismatches, print kernel.name, kernel.domains,
+    # instruction_strs and kernel.args together with their digests.
+    # Is the name really relevant?
+    pieces = (
+        kernel.name,
+        str(kernel.domains),
+        str(instruction_strs),
+        str(kernel.args),
+    )
+    return "".join(short_digest(piece) for piece in pieces)
+
+
+def convert(o):
+    """``default`` hook for ``hjson.dump``: unwrap numpy scalars via ``item()``."""
+    if not isinstance(o, np.generic): raise TypeError
+    return o.item()
+
+# Entrypoint names of the meshmode and grudge kernels to autotune.
+autotuned_kernels = {"einsum3to2_kernel",
+                     "einsum4to2_kernel", 
+                     "einsum5to3_kernel", 
+                     "einsum2to2_kernel",
+                     "diff", 
+                     "lp_nodes",
+                     "grudge_elementwise_sum_knl",
+                     "resample_by_picking_single_indirection",
+                     #"resample_by_picking_group", # Will require implementing a special testing function
+                     "smooth_comp" } # mirgecom kernel. TODO: make this set configurable, e.g. via a class variable.
+
+
+class AutotuningArrayContext(GrudgeArrayContext):
+
+    #@memoize_method #Should this be memoized?
+    def get_generators(self, program):
+        """Return ``(tlist_generator, pspace_generator)`` chosen by entrypoint name."""
+        # Maybe the generators should be classes so we can use inheritance.
+        knl_name = program.default_entrypoint.name
+        if knl_name == "einsum3to2_kernel":
+            from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_pspace_generator as pspace_generator
+        elif knl_name == "einsum4to2_kernel":
+            from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_pspace_generator as pspace_generator
+        elif knl_name == "einsum5to3_kernel":
+            from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_pspace_generator as pspace_generator
+        elif knl_name in ("einsum2to2_kernel", "resample_by_picking_single_indirection"):
+            from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_pspace_generator as pspace_generator
+        elif knl_name == "grudge_elementwise_sum_knl":
+            from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_pspace_generator as pspace_generator
+        else:  # generic fallback generators
+            from grudge.loopy_dg_kernels.generators import mxm_trans_list_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import gen_autotune_list as pspace_generator
+        return tlist_generator, pspace_generator
+
+
+    def autotune_and_save(self, queue, program, search_fn, tlist_generator,
+            pspace_generator,  hjson_file_str, time_limit=np.inf):
+        """Run *search_fn* on *program* and save the result to *hjson_file_str*.
+
+        Returns the transformation list found by *search_fn*. A
+        ``cl._cl.RuntimeError`` raised by the search is reported and re-raised.
+        """
+        from hjson import dump
+        try:
+            _, transformations, _ = search_fn(queue, program, generic_test,
+                    pspace_generator, tlist_generator, time_limit=time_limit)
+        except cl._cl.RuntimeError as e:
+            print(e)
+            print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
+            raise  # 'transformations' is unbound here; do not mask the real error
+        with open(hjson_file_str, "wt+") as out_file:
+            dump({"transformations": transformations}, out_file, default=convert)
+        print("WRITING TRANSFORMATION FILE:", hjson_file_str)
+        return transformations
+
+    @memoize_method
+    def transform_loopy_program(self, program):
+        """Transform *program*: kernels named in ``autotuned_kernels`` get transformations loaded from an hjson file (autotuning first if the file is missing); all others go to the parent class."""
+        # Really just need to add metadata to the hjson file
+        # Could convert the kernel itself to base 64 and store it
+        # in the hjson file
+        # TODO: Dynamically determine device id,
+        device_id = "NVIDIA Titan V"
+
+        print(program.default_entrypoint.name)
+        print(unique_program_id(program))
+        print(program)
+
+        # These are the most compute intensive kernels
+        to_optimize = {}
+        if program.default_entrypoint.name in to_optimize:
+            print(program)
+            for arg in program.default_entrypoint.args:
+                print(arg.tags)
+            exit()  # debugging stop; not reached while to_optimize is empty
+
+        if program.default_entrypoint.name in autotuned_kernels:
+            # Set no_numpy and return_dict options here?
+            #program = fix_program_parameters(program)
+            program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True))
+            program = set_memory_layout(program, order="F")
+            pid = unique_program_id(program)
+            os.makedirs(os.getcwd() + "/hjson", exist_ok=True)
+            hjson_file_str = f"hjson/{program.default_entrypoint.name}_{pid}.hjson"
+
+            try:
+                # Attempt to read from a transformation file in the current directory first,
+                # then try to read from the package files - this is not currently implemented
+                # Maybe should have ability to search in arbitrary specified directories.
+
+                print("Opening file:", hjson_file_str)
+                hjson_file = open(hjson_file_str, "rt")
+
+                try: # New hjson structure
+                    transformations = dgk.load_transformations_from_file(hjson_file,
+                        ["transformations"])
+                    print("LOCATED TRANSFORMATION:", hjson_file_str)
+                    #exit()
+                except KeyError as e:
+                    # This can eventually be removed since we're now using the hash of the program code to specify the file.
+                    # Kernels with different dimensions will have different files.
+                    hjson_file.seek(0,0) # Move read location back to beginning
+
+                    fp_format = None
+                    ndofs = None # The value doesn't matter now
+                    transform_id = get_transformation_id(device_id)
+
+                    for arg in program.default_entrypoint.args:
+                        if IsOpArray() in arg.tags:
+                            dim = 1  # NOTE(review): dim is not read anywhere in this method
+                            ndofs = arg.shape[0]
+                            fp_format = arg.dtype.numpy_dtype
+                            break
+                        elif IsSepVecOpArray() in arg.tags or IsVecOpArray() in arg.tags:
+                            ndofs = arg.shape[1]
+                            fp_format = arg.dtype.numpy_dtype
+                            break
+                        elif IsFaceMassOpArray() in arg.tags:
+                            ndofs = arg.shape[0]
+                            fp_format = arg.dtype.numpy_dtype
+                            break
+                        elif IsDOFArray() in arg.tags:
+                            ndofs = arg.shape[1]
+                            fp_format = arg.dtype.numpy_dtype
+                            break 
+
+                    if fp_format is None:
+                        print("Unknown fp_format")
+                        exit()                
+                    if ndofs is None:
+                        print("Unknown ndofs")
+                        exit()
+
+                    fp_string = get_fp_string(fp_format)
+                    indices = [transform_id, fp_string, str(ndofs)]
+                    transformations = dgk.load_transformations_from_file(hjson_file,
+                        indices)
+
+                hjson_file.close()  # NOTE(review): not reached if an exception propagates out of the block above
+
+            #except (KeyError, FileNotFoundError) as e:
+            # There shouldn't be any more key errors now that PIDs are used
+            except FileNotFoundError as e:
+                
+                """
+                # Maybe the generators should be classes so we can use inheritance.
+                if program.default_entrypoint.name == "einsum3to2_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "einsum4to2_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "einsum5to3_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "einsum2to2_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "grudge_elementwise_sum_knl":
+                    from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_pspace_generator as pspace_generator
+                else:
+                    from grudge.loopy_dg_kernels.generators import gen_autotune_list as pspace_generator
+                    from grudge.loopy_dg_kernels.generators import mxm_trans_list_generator as tlist_generator
+
+                try:
+                    avg_time, transformations, data = search_fn(self.queue, program, generic_test, 
+                                                pspace_generator, tlist_generator, time_limit=np.inf)
+                except cl._cl.RuntimeError as e:
+                    print(e)
+                    print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
+
+                od = {"transformations": transformations}
+                out_file = open(hjson_file_str, "wt+")
+
+                hjson.dump(od, out_file,default=convert)
+                out_file.close()
+                #from pprint import pprint
+                #pprint(od)
+                """
+                print("TRANSFORMATION FILE NOT FOUND", hjson_file_str)
+                #exit()
+                tlist_generator, pspace_generator = self.get_generators(program)
+                search_fn = exhaustive_search_v2#random_search
+                transformations = self.autotune_and_save(self.queue, program, search_fn, 
+                        tlist_generator, pspace_generator, hjson_file_str)
+
+            program = dgk.apply_transformation_list(program, transformations)  # the bare string literal that follows is disabled code with no effect
+
+            """
+            # Kernels to not autotune. Should probably still load the transformation from a
+            # generator function. Should these be put in GrudgeArrayContext
+
+            # Maybe this should have an autotuner
+            # There isn't much room for optimization due to the indirection
+            elif "resample_by_picking" in program.default_entrypoint.name:
+                for arg in program.default_entrypoint.args:
+                    if arg.name == "n_to_nodes":
+                        # Assumes this has has a single ParameterValue tag
+                        n_to_nodes = arg.tags[0].value
+
+                l0 = ((1024 // n_to_nodes) // 32) * 32
+                if l0 == 0:
+                    l0 = 16
+                if n_to_nodes*16 > 1024:
+                    l0 = 8
+                    c
+
+                outer = max(l0, 32)
+
+                program = set_memory_layout(program)
+                program = lp.split_iname(program, "iel", outer, outer_tag="g.0",
+                                            slabs=(0, 1))
+                program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp",
+                                            inner_tag="l.0")
+                program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1",
+                                            inner_tag="l.1", slabs=(0, 0))
+
+            elif "actx_special" in program.default_entrypoint.name: # Fixed
+                program = set_memory_layout(program)
+                # Sometimes sqrt is called on single values.
+                if "i0" in program.default_entrypoint.inames:
+                    program = lp.split_iname(program, "i0", 512, outer_tag="g.0",
+                                            inner_tag="l.0", slabs=(0, 1))
+                #program = lp.split_iname(program, "i0", 128, outer_tag="g.0",
+                #                           slabs=(0,1))
+                #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp",
+                #                           inner_tag="l.0")
+                #program = lp.split_iname(program, "i1", 20, outer_tag="g.1",
+                #                           inner_tag="l.1", slabs=(0,0))
+                #program2 = lp.join_inames(program, ("i1", "i0"), "i")
+                #from islpy import BasicMap
+                #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}")
+                #program2 = lp.map_domain(program, m)
+                #print(program2)
+                #exit()
+
+                #program = super().transform_loopy_program(program)
+                #print(program)
+                #print(lp.generate_code_v2(program).device_code())
+
+            # Not really certain how to do grudge_assign, done for flatten
+            elif "flatten" in program.default_entrypoint.name: 
+
+                program = set_memory_layout(program)
+                # This is hardcoded. Need to move this to separate transformation file
+                #program = lp.set_options(program, "write_cl")
+                program = lp.split_iname(program, "iel", 128, outer_tag="g.0",
+                                            slabs=(0, 1))
+                program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp",
+                                            inner_tag="l.0")
+                program = lp.split_iname(program, "idof", 20, outer_tag="g.1",
+                                            inner_tag="l.1", slabs=(0, 0))
+            
+            else:
+                #print(program)
+                #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name)
+                #    The PyOpenCLArrayContext transformations can fail when inames are fixed.
+               program = super().transform_loopy_program(program)
+
+            '''       
+            # These still depend on the polynomial order = 3
+            # Never called?
+            # This is going away anyway probably
+            elif "resample_by_mat" in program.default_entrypoint.name:
+                hjson_file = pkg_resources.open_text(dgk, f"{program.default_entrypoint.name}.hjson")
+        
+                # Order 3: 10 x 10
+                # Order 4: 15 x 35
+                
+                #print(program)
+                #exit()
+                pn = 3 # This needs to  be not fixed
+                fp_string = "FP64"
+                
+                indices = [transform_id, fp_string, str(pn)]
+                transformations = dgk.load_transformations_from_file(hjson_file,
+                    indices)
+                hjson_file.close()
+                print(transformations)
+                program = dgk.apply_transformation_list(program, transformations)
+
+            # Not really certain how to do grudge_assign, done for flatten
+            elif "grudge_assign" in program.default_entrypoint.name or "flatten" in program.default_entrypoint.name: 
+                # This is hardcoded. Need to move this to separate transformation file
+                #program = lp.set_options(program, "write_cl")
+                program = lp.split_iname(program, "iel", 128, outer_tag="g.0",
+                                            slabs=(0, 1))
+                program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp",
+                                            inner_tag="l.0")
+                program = lp.split_iname(program, "idof", 20, outer_tag="g.1",
+                                            inner_tag="l.1", slabs=(0, 0))
+            
+
+            '''
+            """
+        else:
+            # print("USING FALLBACK TRANSFORMATIONS FOR " + program.default_entrypoint.name)
+            program = super().transform_loopy_program(program)
+
+        return program
+
+
+class COrderedAutotuningArrayContext(COrderedGrudgeArrayContext):
+
+    #@memoize_method #Should this be memoized?
+    def get_generators(self, program):
+        """Return the ``(tlist_generator, pspace_generator)`` pair selected by the entrypoint name of *program*."""
+        # Maybe the generators should be classes so we can use inheritance.
+        if program.default_entrypoint.name == "einsum3to2_kernel":
+            from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_pspace_generator as pspace_generator
+        elif program.default_entrypoint.name == "einsum4to2_kernel":
+            from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_pspace_generator as pspace_generator
+        elif program.default_entrypoint.name == "einsum5to3_kernel":
+            from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_pspace_generator as pspace_generator
+        elif program.default_entrypoint.name == "einsum2to2_kernel" or program.default_entrypoint.name == "resample_by_picking_group":
+            from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_pspace_generator as pspace_generator
+        elif program.default_entrypoint.name == "grudge_elementwise_sum_knl":
+            from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_tlist_generator as tlist_generator
+            from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_pspace_generator as pspace_generator
+        else:  # NOTE(review): "resample_by_picking_single_indirection" lands here, unlike in AutotuningArrayContext -- confirm this is intended
+            from grudge.loopy_dg_kernels.generators import gen_autotune_list as pspace_generator
+            from grudge.loopy_dg_kernels.generators import mxm_trans_list_generator as tlist_generator
+
+        return tlist_generator, pspace_generator
+
+
+    def autotune_and_save(self, queue, program, search_fn, tlist_generator,
+            pspace_generator,  hjson_file_str, time_limit=np.inf):
+        """Run *search_fn* on *program* and save the result to *hjson_file_str*.
+
+        Returns the transformation list found by *search_fn*. A
+        ``cl._cl.RuntimeError`` raised by the search is reported and re-raised.
+        """
+        from hjson import dump
+        try:
+            _, transformations, _ = search_fn(queue, program, generic_test,
+                    pspace_generator, tlist_generator, time_limit=time_limit)
+        except cl._cl.RuntimeError as e:
+            print(e)
+            print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
+            raise  # 'transformations' is unbound here; do not mask the real error
+        with open(hjson_file_str, "wt+") as out_file:
+            dump({"transformations": transformations}, out_file, default=convert)
+        print("WRITING TRANSFORMATION FILE:", hjson_file_str)
+        return transformations
+
+    @memoize_method
+    def transform_loopy_program(self, program):
+        """Transform *program*: kernels named in ``autotuned_kernels`` get transformations loaded from an hjson file (autotuning first if the file is missing); all others go to the parent class."""
+        # Really just need to add metadata to the hjson file
+        # Could convert the kernel itself to base 64 and store it
+        # in the hjson file
+        # TODO: Dynamically determine device id,
+        device_id = "NVIDIA Titan V"
+
+        print(program.default_entrypoint.name)
+        print(unique_program_id(program))
+        print(program)
+
+        # These are the most compute intensive kernels
+        to_optimize = {}#{"einsum5to3_kernel"}#{"einsum4to2_kernel", "resample_by_picking_group"}
+        if program.default_entrypoint.name in to_optimize:
+            print(program)
+            for arg in program.default_entrypoint.args:
+                print(arg.tags)
+            exit()  # debugging stop; not reached while to_optimize is empty
+
+        if program.default_entrypoint.name in autotuned_kernels:
+            # Set no_numpy and return_dict options here?
+            #program = fix_program_parameters(program)
+            program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True))
+            program = set_memory_layout(program, order="C")
+            pid = unique_program_id(program)
+            os.makedirs(os.getcwd() + "/hjson", exist_ok=True)
+            hjson_file_str = f"hjson/{program.default_entrypoint.name}_{pid}.hjson"
+
+            try:
+                # Attempt to read from a transformation file in the current directory first,
+                # then try to read from the package files - this is not currently implemented
+                # Maybe should have ability to search in arbitrary specified directories.
+
+                print("Opening file:", hjson_file_str)
+                hjson_file = open(hjson_file_str, "rt")
+
+                try: # New hjson structure
+                    transformations = dgk.load_transformations_from_file(hjson_file,
+                        ["transformations"])
+                    print("LOCATED TRANSFORMATION:", hjson_file_str)
+                    #exit()
+                except KeyError as e:
+                    # This can eventually be removed since we're now using the hash of the program code to specify the file.
+                    # Kernels with different dimensions will have different files.
+                    hjson_file.seek(0,0) # Move read location back to beginning
+
+                    fp_format = None
+                    ndofs = None # The value doesn't matter now
+                    transform_id = get_transformation_id(device_id)
+
+                    for arg in program.default_entrypoint.args:
+                        if IsOpArray() in arg.tags:
+                            dim = 1  # NOTE(review): dim is not read anywhere in this method
+                            ndofs = arg.shape[0]
+                            fp_format = arg.dtype.numpy_dtype
+                            break
+                        elif IsSepVecOpArray() in arg.tags or IsVecOpArray() in arg.tags:
+                            ndofs = arg.shape[1]
+                            fp_format = arg.dtype.numpy_dtype
+                            break
+                        elif IsFaceMassOpArray() in arg.tags:
+                            ndofs = arg.shape[0]
+                            fp_format = arg.dtype.numpy_dtype
+                            break
+                        elif IsDOFArray() in arg.tags:
+                            ndofs = arg.shape[1]
+                            fp_format = arg.dtype.numpy_dtype
+                            break 
+
+                    if fp_format is None:
+                        print("Unknown fp_format")
+                        exit()                
+                    if ndofs is None:
+                        print("Unknown ndofs")
+                        exit()
+
+                    fp_string = get_fp_string(fp_format)
+                    indices = [transform_id, fp_string, str(ndofs)]
+                    transformations = dgk.load_transformations_from_file(hjson_file,
+                        indices)
+
+                hjson_file.close()  # NOTE(review): not reached if an exception propagates out of the block above
+
+            #except (KeyError, FileNotFoundError) as e:
+            # There shouldn't be any more key errors now that PIDs are used
+            except FileNotFoundError as e:
+                
+                """
+                # Maybe the generators should be classes so we can use inheritance.
+                if program.default_entrypoint.name == "einsum3to2_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "einsum4to2_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "einsum5to3_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "einsum2to2_kernel":
+                    from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_pspace_generator as pspace_generator
+                elif program.default_entrypoint.name == "grudge_elementwise_sum_knl":
+                    from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_tlist_generator as tlist_generator
+                    from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_pspace_generator as pspace_generator
+                else:
+                    from grudge.loopy_dg_kernels.generators import gen_autotune_list as pspace_generator
+                    from grudge.loopy_dg_kernels.generators import mxm_trans_list_generator as tlist_generator
+
+                try:
+                    avg_time, transformations, data = search_fn(self.queue, program, generic_test, 
+                                                pspace_generator, tlist_generator, time_limit=np.inf)
+                except cl._cl.RuntimeError as e:
+                    print(e)
+                    print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
+
+                od = {"transformations": transformations}
+                out_file = open(hjson_file_str, "wt+")
+
+                hjson.dump(od, out_file,default=convert)
+                out_file.close()
+                #from pprint import pprint
+                #pprint(od)
+                """
+                print("TRANSFORMATION FILE NOT FOUND", hjson_file_str)
+                #exit()
+                tlist_generator, pspace_generator = self.get_generators(program)
+                search_fn = exhaustive_search_v2#random_search
+                transformations = self.autotune_and_save(self.queue, program, search_fn, 
+                        tlist_generator, pspace_generator, hjson_file_str)
+
+            program = dgk.apply_transformation_list(program, transformations)  # the bare string literal that follows is disabled code with no effect
+
+            """
+            # Kernels to not autotune. Should probably still load the transformation from a
+            # generator function. Should these be put in GrudgeArrayContext
+
+            # Maybe this should have an autotuner
+            # There isn't much room for optimization due to the indirection
+            elif "resample_by_picking" in program.default_entrypoint.name:
+                for arg in program.default_entrypoint.args:
+                    if arg.name == "n_to_nodes":
+                        # Assumes this has has a single ParameterValue tag
+                        n_to_nodes = arg.tags[0].value
+
+                l0 = ((1024 // n_to_nodes) // 32) * 32
+                if l0 == 0:
+                    l0 = 16
+                if n_to_nodes*16 > 1024:
+                    l0 = 8
+                    c
+
+                outer = max(l0, 32)
+
+                program = set_memory_layout(program)
+                program = lp.split_iname(program, "iel", outer, outer_tag="g.0",
+                                            slabs=(0, 1))
+                program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp",
+                                            inner_tag="l.0")
+                program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1",
+                                            inner_tag="l.1", slabs=(0, 0))
+
+            elif "actx_special" in program.default_entrypoint.name: # Fixed
+                program = set_memory_layout(program)
+                # Sometimes sqrt is called on single values.
+                if "i0" in program.default_entrypoint.inames:
+                    program = lp.split_iname(program, "i0", 512, outer_tag="g.0",
+                                            inner_tag="l.0", slabs=(0, 1))
+                #program = lp.split_iname(program, "i0", 128, outer_tag="g.0",
+                #                           slabs=(0,1))
+                #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp",
+                #                           inner_tag="l.0")
+                #program = lp.split_iname(program, "i1", 20, outer_tag="g.1",
+                #                           inner_tag="l.1", slabs=(0,0))
+                #program2 = lp.join_inames(program, ("i1", "i0"), "i")
+                #from islpy import BasicMap
+                #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}")
+                #program2 = lp.map_domain(program, m)
+                #print(program2)
+                #exit()
+
+                #program = super().transform_loopy_program(program)
+                #print(program)
+                #print(lp.generate_code_v2(program).device_code())
+
+            # Not really certain how to do grudge_assign, done for flatten
+            elif "flatten" in program.default_entrypoint.name: 
+
+                program = set_memory_layout(program)
+                # This is hardcoded. Need to move this to separate transformation file
+                #program = lp.set_options(program, "write_cl")
+                program = lp.split_iname(program, "iel", 128, outer_tag="g.0",
+                                            slabs=(0, 1))
+                program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp",
+                                            inner_tag="l.0")
+                program = lp.split_iname(program, "idof", 20, outer_tag="g.1",
+                                            inner_tag="l.1", slabs=(0, 0))
+            
+            else:
+                #print(program)
+                #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name)
+                #    The PyOpenCLArrayContext transformations can fail when inames are fixed.
+               program = super().transform_loopy_program(program)
+
+            '''       
+            # These still depend on the polynomial order = 3
+            # Never called?
+            # This is going away anyway probably
+            elif "resample_by_mat" in program.default_entrypoint.name:
+                hjson_file = pkg_resources.open_text(dgk, f"{program.default_entrypoint.name}.hjson")
+        
+                # Order 3: 10 x 10
+                # Order 4: 15 x 35
+                
+                #print(program)
+                #exit()
+                pn = 3 # This needs to  be not fixed
+                fp_string = "FP64"
+                
+                indices = [transform_id, fp_string, str(pn)]
+                transformations = dgk.load_transformations_from_file(hjson_file,
+                    indices)
+                hjson_file.close()
+                print(transformations)
+                program = dgk.apply_transformation_list(program, transformations)
+
+            # Not really certain how to do grudge_assign, done for flatten
+            elif "grudge_assign" in program.default_entrypoint.name or "flatten" in program.default_entrypoint.name: 
+                # This is hardcoded. Need to move this to separate transformation file
+                #program = lp.set_options(program, "write_cl")
+                program = lp.split_iname(program, "iel", 128, outer_tag="g.0",
+                                            slabs=(0, 1))
+                program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp",
+                                            inner_tag="l.0")
+                program = lp.split_iname(program, "idof", 20, outer_tag="g.1",
+                                            inner_tag="l.1", slabs=(0, 0))
+            
+
+            '''
+            """
+        else:
+            # print("USING FALLBACK TRANSFORMATIONS FOR " + program.default_entrypoint.name)
+            program = super().transform_loopy_program(program)
+
+        return program
+
+
+
+class KernelSavingAutotuningArrayContext(AutotuningArrayContext):
+    """Pickle autotunable kernels to ./pickled_programs for an offline autotuner;
+    stored transformations are applied once a matching hjson/ file exists."""
+
+    def transform_loopy_program(self, program):
+        """Dump, transform, or exit for kernels in ``autotuned_kernels``; defer otherwise."""
+        if program.default_entrypoint.name in autotuned_kernels:
+            import pickle
+            from os.path import exists
+            # Set no_numpy and return_dict options here?
+            program = set_memory_layout(program, order="F")
+
+            print("====CALCULATING PROGRAM ID====")
+            filename = "./pickled_programs"
+            pid = unique_program_id(program)
+            # Is there a way to obtain the current rank?
+            file_path = f"{filename}/{program.default_entrypoint.name}_{pid}.pickle"
+            hjson_path = f"hjson/{program.default_entrypoint.name}_{pid}.hjson"
+
+            if not exists(file_path):
+                # dirname("./pickled_programs") is ".", so create the directory itself.
+                os.makedirs(filename, exist_ok=True)
+                print(program.default_entrypoint)
+                print("====WRITING PROGRAM TO FILE===", file_path)
+                with open(file_path, "wb") as out_file:
+                    pickle.dump(program, out_file)
+                print("====READING PROGRAM FROM FILE===", file_path)
+                with open(file_path, "rb") as f:
+                    pid2 = unique_program_id(pickle.load(f))
+                print(pid, pid2)
+                assert pid == pid2  # the pickle round trip must preserve the id
+                # NOTE(review): the message says EXITING, but control returns normally.
+                print("DUMPED PICKLED FILE. EXITING - RUN THE AUTOTUNER")
+            elif exists(hjson_path): # Use the transformations
+                program = super().transform_loopy_program(program)
+            else:
+                print("PICKLED FILE ALREADY EXISTS. RUN THE AUTOTUNER.", file_path)
+                exit()
+        else:
+            program = super().transform_loopy_program(program)
+
+        return program
+
+
+# vim: foldmethod=marker
diff --git a/grudge/grudge_tags.py b/grudge/grudge_tags.py
new file mode 100644
index 000000000..92badc130
--- /dev/null
+++ b/grudge/grudge_tags.py
@@ -0,0 +1,32 @@
+from pytools.tag import Tag, UniqueTag
+from meshmode.transform_metadata import IsDOFArray, IsOpArray, ParameterValue, EinsumArgsTags
+
+class KernelDataTag(Tag): # Delete this when no longer needed
+    """A tag that applies to :class:`loopy.LoopKernel`. Kernel data provided
+    with this tag can be later applied to the kernel. This is used, for
+    instance, to specify kernel data in einsum kernels."""
+
+    def __init__(self, kernel_data):
+        self.kernel_data = kernel_data  # stored as given; no copy or validation
+
+
+class IsVecDOFArray(Tag):
+    pass  # marker tag: adds no fields or methods to Tag
+
+class IsFaceDOFArray(Tag):
+    pass  # marker tag: adds no fields or methods to Tag
+
+class IsVecOpArray(Tag):
+    pass  # marker tag: adds no fields or methods to Tag
+
+class IsSepVecDOFArray(Tag):
+    pass  # marker tag: adds no fields or methods to Tag
+
+class IsSepVecOpArray(Tag):
+    pass  # marker tag: adds no fields or methods to Tag
+
+class IsFaceMassOpArray(Tag):
+    pass  # marker tag: adds no fields or methods to Tag
+
+class IsFourAxisDOFArray(Tag):
+    pass  # marker tag: adds no fields or methods to Tag
diff --git a/grudge/loopy_dg_kernels/__init__.py b/grudge/loopy_dg_kernels/__init__.py
new file mode 100644
index 000000000..5c48b7b8e
--- /dev/null
+++ b/grudge/loopy_dg_kernels/__init__.py
@@ -0,0 +1,425 @@
+import numpy as np
+from pytools import memoize_in
+
+#import pyopencl as cl
+#import pyopencl.array
+#import pyopencl.clrandom
+
+import loopy as lp
+from grudge.grudge_tags import IsDOFArray, ParameterValue
+#from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
+#from loopy.kernel.data import AddressSpace
+
+#import pycuda.gpuarray as cuarray
+#import pycuda.driver as drv
+#import pycuda.tools
+#import pycuda.autoinit
+#from pycuda.compiler import SourceModule
+#from pycuda.curandom import rand as curand
+
+#from modepy import equidistant_nodes
+
+#from bs4 import UnicodeDammit
+import hjson
+#import time
+#from math import ceil
+#import sys
+
+# setup
+# -----
+lp.set_caching_enabled(False)
+import loopy.options
+loopy.options.ALLOW_TERMINAL_COLORS = False
+
+# A lot of this could probably be deleted
+
+def gen_face_mass_knl_merged(nelements, nfaces, nvol_nodes, nface_nodes, fp_format):
+    """Build the ``face_mass`` kernel with faces and face nodes merged into ``fj``.
+
+    Computes ``result[iel,idof] = sum(fj, mat[idof,fj] * vec[iel,fj])`` with all
+    sizes fixed from the arguments; *fp_format* is the dtype of the three arrays."""
+    knl =  lp.make_kernel(
+         """{[iel,idof,fj]:
+             0<=iel<nelements and
+             0<=idof<nvol_nodes and
+             0<=fj<nf_times_j}""",
+         """
+         result[iel,idof] = sum(fj, mat[idof, fj] * vec[iel, fj])
+         """,
+         kernel_data=[
+             lp.GlobalArg("result", fp_format, shape=lp.auto, order="F"),
+             lp.GlobalArg("vec", fp_format, shape=lp.auto, order="F"),
+             lp.GlobalArg("mat", fp_format, shape=lp.auto, order="C"),
+             "..."
+         ],
+         name="face_mass")
+
+    # Gets around 470 GB/s
+    knl = lp.fix_parameters(knl, nelements=nelements, nf_times_j=nfaces*nface_nodes, nvol_nodes=nvol_nodes)
+
+    knl = lp.split_iname(knl, "iel", 96, outer_tag="g.0", slabs=(0,1))
+    knl = lp.split_iname(knl, "iel_inner", 32, outer_tag="ilp", inner_tag="l.0", slabs=(0,1))
+    knl = lp.add_prefetch(knl, "vec", "iel_inner_outer,iel_inner_inner,fj",
+                            temporary_name="vecf", default_tag="l.auto")
+    knl = lp.tag_array_axes(knl, "vecf", "f,f")
+    knl = lp.split_iname(knl, "idof", 20, outer_tag="g.1", slabs=(0,0))
+    knl = lp.split_iname(knl, "idof_inner", 2, outer_tag="ilp", inner_tag="l.1", slabs=(0,0))
+    knl = lp.split_iname(knl, "fj", 10, slabs=(0,0), inner_tag="unr")
+    return knl
+
+
+def gen_face_mass_knl(nelements, nfaces, nvol_nodes, nface_nodes, fp_format):
+    """Build the ``face_mass`` kernel with separate face ``f`` and node ``j`` axes.
+
+    Computes ``result[iel,idof] = sum(f, sum(j, mat[idof,f,j] * vec[f,iel,j]))``
+    with all sizes fixed from the arguments; *fp_format* is the array dtype."""
+    knl =  lp.make_kernel(
+         """{[iel,idof,f,j]:
+             0<=iel<nelements and
+             0<=f<nfaces and
+             0<=idof<nvol_nodes and
+             0<=j<nface_nodes}""",
+         """
+         #result[iel,idof] = sum(fj, mat[idof, fj] * vec[iel, fj])
+         result[iel,idof] = sum(f, sum(j, mat[idof, f, j] * vec[f, iel, j]))
+         """,
+         kernel_data=[
+             lp.GlobalArg("result", fp_format, shape=lp.auto),
+             lp.GlobalArg("vec", fp_format, shape=lp.auto),
+             lp.GlobalArg("mat", fp_format, shape=lp.auto),
+             "..."
+         ],
+         name="face_mass")
+    knl = lp.fix_parameters(knl, nelements=nelements, nfaces=nfaces, nvol_nodes=nvol_nodes, nface_nodes=nface_nodes)
+    knl = lp.tag_array_axes(knl, "result", "f,f")
+    knl = lp.tag_array_axes(knl, "vec", "N1,N0,N2")
+
+    # Gets around 450 GB/s
+    knl = lp.split_iname(knl, "iel", 96, outer_tag="g.0", slabs=(0,1))
+    knl = lp.split_iname(knl, "iel_inner", 32, outer_tag="ilp", inner_tag="l.0", slabs=(0,1))
+    knl = lp.add_prefetch(knl, "vec", "j,iel_inner_outer,iel_inner_inner,f",
+                            temporary_name="vecf", default_tag="l.auto")
+    knl = lp.tag_array_axes(knl, "vecf", "N1,N0,N2")
+    knl = lp.split_iname(knl, "idof", 20, outer_tag="g.1", slabs=(0,0))
+    knl = lp.split_iname(knl, "idof_inner", 4, outer_tag="ilp", inner_tag="l.1", slabs=(0,0))
+    knl = lp.split_iname(knl, "j", 10, slabs=(0,0))
+    return knl
+
+
+def gen_elwise_linear_knl(n_elem, n_in, n_out, fp_format):
+    """Build the ``elwise_linear`` kernel: ``result[iel] = mat @ vec[iel]``.
+
+    ``vec`` is (n_elem, n_in) and ``result`` is (n_elem, n_out), both order "F";
+    ``mat`` is (n_out, n_in), order "C". All loop bounds are fixed on return."""
+    args = [
+        lp.GlobalArg("result", fp_format, shape=(n_elem, n_out), order="F"),
+        lp.GlobalArg("vec", fp_format, shape=(n_elem, n_in), order="F"),
+        lp.GlobalArg("mat", fp_format, shape=(n_out, n_in), order="C"),
+    ]
+    unfixed = lp.make_kernel(
+        """{[iel, idof, j]:
+            0<=iel<nelements and
+            0<=idof<ndiscr_nodes_out and
+            0<=j<ndiscr_nodes_in}""",
+        "result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])",
+        kernel_data=args,
+        name="elwise_linear")
+    return lp.fix_parameters(unfixed, nelements=n_elem,
+        ndiscr_nodes_in=n_in, ndiscr_nodes_out=n_out)
+
+# The one from Grudge could be used instead.
+#@memoize_method
+def gen_diff_knl_fortran2(n_mat, n_elem, n_in, n_out, fp_format=np.float32,
+        options=None):
+    """Build the ``diff_<n_mat>_axis`` kernel: per element, apply each matrix in
+    ``diff_mat`` to ``vec``; array layouts are tagged and all sizes fixed on return."""
+    @memoize_in(gen_diff_knl_fortran2, "_gen_diff_knl")  # NOTE(review): *options* not in key?
+    def _gen_diff_knl(n_mat, n_elem, n_in, n_out, fp_format):
+        knl = lp.make_kernel(
+        """{[imatrix,iel,idof,j]:
+            0<=imatrix<nmatrices and
+            0<=iel<nelements and
+            0<=idof<ndiscr_nodes_out and
+            0<=j<ndiscr_nodes_in}""",
+        """
+        result[imatrix,iel,idof] = simul_reduce(sum, j, diff_mat[imatrix, idof, j] * vec[iel, j])
+        """,
+        kernel_data=[
+            lp.GlobalArg("result", fp_format, shape=(n_mat, n_elem, n_out),
+                offset=lp.auto),
+            lp.GlobalArg("diff_mat", fp_format, shape=(n_mat, n_out, n_in),
+                order="C", offset=lp.auto),
+            lp.GlobalArg("vec", fp_format, shape=(n_elem, n_in), order="F",
+                offset=lp.auto),
+            lp.ValueArg("nelements", tags=ParameterValue(n_elem)),
+            lp.ValueArg("nmatrices", tags=ParameterValue(n_mat)),
+            lp.ValueArg("ndiscr_nodes_out", tags=ParameterValue(n_out)),
+            lp.ValueArg("ndiscr_nodes_in", tags=ParameterValue(n_in))
+        ],
+        assumptions="nelements > 0 \
+                     and ndiscr_nodes_out > 0 \
+                     and ndiscr_nodes_in > 0 and nmatrices > 0",
+        options=options,
+        name="diff_{}_axis".format(n_mat)
+        )
+        return knl
+
+    knl = _gen_diff_knl(n_mat, n_elem, n_in, n_out, fp_format)
+    # This should be in array context probably but need to avoid circular dependency
+    # Probably should split kernels out of grudge_array_context
+    knl = lp.tag_inames(knl, "imatrix: ilp")
+    knl = lp.tag_array_axes(knl, "diff_mat", "sep,c,c")
+    knl = lp.tag_array_axes(knl, "result", "sep,f,f")
+    knl = lp.tag_array_axes(knl, "vec", "f,f")
+    knl = lp.fix_parameters(knl, nmatrices=n_mat, nelements=n_elem,
+        ndiscr_nodes_in=n_in, ndiscr_nodes_out=n_out)
+    return knl
+
+
+# Is k x i in F layout equivalent to i x k in C layout?
+# If so, can we just call the gen_diff_knl?
+# Pretty sure it is...
+def gen_diff_knl_fortran(n_elem, n_in, n_out, fp_format=np.float32, options=None):
+    """Build the ``diff`` kernel with three separate matrices: for n in 1..3,
+    ``result<n>[k,i] = sum(j, mat<n>[i,j] * vec[k,j])``; sizes are fixed on return."""
+    knl = lp.make_kernel(
+        """{[k,i,j]:
+            0<=k<nelements and
+            0<=i<ndiscr_nodes_out and
+            0<=j<ndiscr_nodes_in}""",
+        """
+        result1[k,i] = simul_reduce(sum, j, mat1[i, j] * vec[k, j])
+        result2[k,i] = simul_reduce(sum, j, mat2[i, j] * vec[k, j])
+        result3[k,i] = simul_reduce(sum, j, mat3[i, j] * vec[k, j])
+        """,
+        kernel_data=[
+            lp.GlobalArg("result1", fp_format, shape=(n_elem, n_out), order="F",
+                offset=lp.auto),
+            lp.GlobalArg("result2", fp_format, shape=(n_elem, n_out), order="F",
+                offset=lp.auto),
+            lp.GlobalArg("result3", fp_format, shape=(n_elem, n_out), order="F",
+                offset=lp.auto),
+            lp.GlobalArg("mat1", fp_format, shape=(n_out, n_in), order="C",
+                offset=lp.auto),
+            lp.GlobalArg("mat2", fp_format, shape=(n_out, n_in), order="C",
+                offset=lp.auto),
+            lp.GlobalArg("mat3", fp_format, shape=(n_out, n_in), order="C",
+                offset=lp.auto),
+            lp.GlobalArg("vec", fp_format, shape=(n_elem, n_in), order="F",
+                offset=lp.auto)
+        ],
+        assumptions="nelements > 0 \
+                     and ndiscr_nodes_out > 0 \
+                     and ndiscr_nodes_in > 0",
+        options=options,
+        name="diff"
+    )
+
+    knl = lp.fix_parameters(knl, nelements=n_elem, ndiscr_nodes_in=n_in,
+        ndiscr_nodes_out=n_out)
+    return knl
+
+#@memoize_method
+def gen_diff_knl(n_mat, n_elem, n_in, n_out, fp_format=np.float32, options=None):
+    """Build the ``diff`` kernel with the element index last ("C" layout).
+
+    Computes ``result[m,i,k] = sum(j, diff_mat[m,i,j] * vec[j,k])`` with all
+    sizes fixed from the arguments. Prints *fp_format* as a side effect.
+    """
+    print(fp_format)
+    knl = lp.make_kernel(
+        """{[m,k,i,j]:
+            0<=k<nelements and
+            0<=i<ndiscr_nodes_out and
+            0<=j<ndiscr_nodes_in and
+            0<=m<nmatrices}""",
+        """
+        result[m, i ,k] = simul_reduce(sum, j, diff_mat[m, i, j] * vec[j, k])
+        """,
+        kernel_data=[
+            lp.GlobalArg("result", fp_format, shape=(n_mat, n_out, n_elem),
+                offset=lp.auto),
+            lp.GlobalArg("diff_mat", fp_format, shape=(n_mat, n_out, n_in),
+                order="C", offset=lp.auto),
+            lp.GlobalArg("vec", fp_format, shape=(n_in, n_elem), order="C",
+                offset=lp.auto)
+        ],
+        #kernel_data = [
+        #    lp.GlobalArg("result1", fp_format, shape=None, strides=(n_elem,1),
+        #       dim_tags=None, offset=lp.auto, order="C"),
+        #    lp.GlobalArg("result2", fp_format, shape=None, strides=(n_elem,1),
+        #       dim_tags=None, offset=lp.auto, order="C"),
+        #    lp.GlobalArg("result3", fp_format, shape=None, strides=(n_elem,1),
+        #       dim_tags=None, offset=lp.auto, order="C"),
+        #    lp.GlobalArg("mat1", fp_format, shape=lp.auto, offset=lp.auto,
+        #       order="C"),
+        #    lp.GlobalArg("mat2", fp_format, shape=lp.auto, offset=lp.auto,
+        #       order="C"),
+        #    lp.GlobalArg("mat3", fp_format, shape=lp.auto, offset=lp.auto,
+        #       order="C"),
+        #    lp.GlobalArg("vec", fp_format, shape=None, strides=(1, n_elem),
+        #       offset=lp.auto, order="C")
+        #],
+        assumptions="nelements > 0 \
+                     and ndiscr_nodes_out > 0 \
+                     and ndiscr_nodes_in > 0 \
+                     and nmatrices > 0",
+        options=options,
+        name="diff"
+    )
+    knl = lp.tag_array_axes(knl, "diff_mat", "sep,c,c")
+    knl = lp.tag_array_axes(knl, "result", "sep,c,c")
+    knl = lp.tag_array_axes(knl, "vec", "c,c")
+
+    knl = lp.fix_parameters(knl, nmatrices=n_mat, nelements=n_elem,
+        ndiscr_nodes_in=n_in, ndiscr_nodes_out=n_out)
+
+    return knl
+
+
+# This is redundant with the above but clearer to read,
+# so keeping it around may be worthwhile.
+'''
+def gen_diff_knl(n_elem, n_in, n_out, k_inner_outer,k_inner_inner,i_inner_outer,
+                    i_inner_inner,j_inner, fp_format=np.float32):
+    knl = lp.make_kernel(
+        """{[k,i,j]:
+            0<=k<nelements and
+            0<=i<ndiscr_nodes_out and
+            0<=j<ndiscr_nodes_in}""",
+        """
+        result1[i,k] = simul_reduce(sum, j, mat1[i, j] * vec[j, k])
+        result2[i,k] = simul_reduce(sum, j, mat2[i, j] * vec[j, k])
+        result3[i,k] = simul_reduce(sum, j, mat3[i, j] * vec[j, k])
+        """,
+        kernel_data = [
+            lp.GlobalArg("result1", fp_format, shape=(n_out, n_elem), order="C"),
+            lp.GlobalArg("result2", fp_format, shape=(n_out, n_elem), order="C"),
+            lp.GlobalArg("result3", fp_format, shape=(n_out, n_elem), order="C"),
+            lp.GlobalArg("mat1", fp_format, shape=(n_out, n_in), order="C"),
+            lp.GlobalArg("mat2", fp_format, shape=(n_out, n_in), order="C"),
+            lp.GlobalArg("mat3", fp_format, shape=(n_out, n_in), order="C"),
+            lp.GlobalArg("vec", fp_format, shape=(n_in, n_elem), order="C")
+        ],
+        assumptions="nelements > 0 \
+                     and ndiscr_nodes_out > 0 \
+                     and ndiscr_nodes_in > 0",
+        default_offset=None,
+        name="diff"
+    )
+
+    knl = lp.fix_parameters(knl, nelements=n_elem, ndiscr_nodes_in=n_in,
+                                ndiscr_nodes_out=n_out)
+
+    slabs0 = (0,0) if n_elem % k_inner_outer == 0 else (0,1)
+    knl = lp.split_iname(knl, "k", k_inner_outer, outer_tag="g.0", slabs=slabs0)
+    knl = lp.split_iname(knl, "k_inner", k_inner_inner, outer_tag="ilp",
+                            inner_tag="l.0")
+    knl = lp.split_iname(knl, "j", j_inner)
+    knl = lp.split_iname(knl, "i", i_inner_outer, outer_tag="g.1")#slabs=(0,1))
+    knl = lp.split_iname(knl, "i_inner", i_inner_inner, outer_tag="ilp",
+                            inner_tag="l.1")
+
+    #knl = lp.prioritize_loops(knl, "j_outer,j_inner,k_inner_outer")
+
+    knl = lp.add_prefetch(knl, "vec", "j_outer,j_inner,k_inner_outer,k_inner_inner",
+                            temporary_name="vecf", default_tag="l.auto")
+    knl = lp.add_prefetch(knl, "mat1", "j_inner", temporary_name="mat1fp",
+                            default_tag="unr")
+    knl = lp.add_prefetch(knl, "mat2", "j_inner", temporary_name="mat2fp",
+                            default_tag="unr")
+    knl = lp.add_prefetch(knl, "mat3", "j_inner", temporary_name="mat3fp",
+                            default_tag="unr")
+
+    return knl
+'''
+
+
+def load_transformations_from_file(hjson_file, indices):
+    """Parse *hjson_file* and return the entry reached by following *indices*."""
+    from functools import reduce
+    from operator import getitem
+    return reduce(getitem, indices, hjson.loads(hjson_file.read()))
+
+def generate_transformation_list_old(k_inner_outer, k_inner_inner, i_inner_outer,
+                                    i_inner_inner, j_inner):
+    """Return the legacy transformation tuple (inames k/i/j, arrays vec, mat1-3).
+
+    Each entry is ``(name, args[, kwargs])``; the arguments are the split sizes.
+    """
+    mat_prefetches = [
+        ("add_prefetch", ["mat{}".format(n), "j_inner"],
+            {"temporary_name": "mat{}fp".format(n), "default_tag": "unr"})
+        for n in (1, 2, 3)]
+    return (
+        ("split_iname", ["k", k_inner_outer],
+            {"outer_tag": "g.0", "slabs": (0, 1)}),
+        ("split_iname", ["k_inner", k_inner_inner],
+            {"outer_tag": "ilp", "inner_tag": "l.0"}),
+        ("split_iname", ["j", j_inner]),
+        ("split_iname", ["i", i_inner_outer], {"outer_tag": "g.1"}),
+        ("split_iname", ["i_inner", i_inner_inner],
+            {"outer_tag": "ilp", "inner_tag": "l.1"}),
+        ("add_prefetch", ["vec", "j_outer,j_inner,k_inner_outer,k_inner_inner"],
+            {"temporary_name": "vecf", "default_tag": "l.auto"}),
+        *mat_prefetches,
+    )
+
+# This is rather nvidia specific at present
+# And also specific to the diff kernel
+# May need different ones of these for different kernels
+def generate_transformation_list(k_inner_outer, k_inner_inner, i_inner_outer,
+                                i_inner_inner, j_inner):
+    """Return the transformation tuple for kernels with inames iel/idof/j.
+
+    Each entry is ``(name, args[, kwargs])``. *k_inner_outer*/*k_inner_inner*
+    are the split sizes for ``iel``, *i_inner_outer*/*i_inner_inner* those for
+    ``idof``, and *j_inner* the one for ``j``.
+    """
+    # Layout tags for diff_mat/result and the "imatrix: ilp" iname tag are not
+    # emitted here (should the array context handle them?).
+    return (
+        # Split and tag inames
+        ("split_iname", ["iel", k_inner_outer],
+            {"outer_tag": "g.0", "slabs": (0, 1)}),
+        ("split_iname", ["iel_inner", k_inner_inner],
+            {"outer_tag": "ilp", "inner_tag": "l.0"}),
+        ("split_iname", ["idof", i_inner_outer],
+            {"outer_tag": "g.1"}),
+        ("split_iname", ["idof_inner", i_inner_inner],
+            {"outer_tag": "ilp", "inner_tag": "l.1"}),
+        ("split_iname", ["j", j_inner]),
+
+        # Prefetching
+        ("add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"],
+            {"temporary_name": "vecf", "default_tag": "l.auto"}),
+        ("tag_array_axes", ["vecf", "f,f"]),
+        # Name-only entry; it is a list (not a tuple), exactly as before.
+        ["add_inames_for_unused_hw_axes"],
+    )
+
+#@memoize_method
+def apply_transformation_list(knl, transformations):
+    """Apply *transformations* to *knl* in order and return the resulting kernel.
+
+    Each entry is ``(name[, args[, kwargs]])``: *name* picks a loopy function, called
+    as ``func(knl, *args, **kwargs)`` (*args* must be a list). Entries are printed."""
+    # Could instead build the function handle from the name string.
+    dispatch = {
+        "split_iname": lp.split_iname,
+        "add_prefetch": lp.add_prefetch,
+        "prioritize_loops": lp.prioritize_loops,
+        "rename_iname": lp.rename_iname,
+        "tag_array_axes": lp.tag_array_axes,
+        "tag_inames": lp.tag_inames,
+        "add_inames_for_unused_hw_axes": lp.add_inames_for_unused_hw_axes,
+    }
+    # Maybe add slabs=(0,0) when n_elem % k_inner_outer == 0, based on the
+    # transformation name, the loop variable, and the loop variable bounds.
+    for entry in transformations:
+        print(entry)
+        func = dispatch[entry[0]]
+        extra_args = entry[1] if len(entry) > 1 else []
+        options = entry[2] if len(entry) > 2 else {}
+        knl = func(*([knl] + extra_args), **options)
+    return knl
diff --git a/grudge/loopy_dg_kernels/device_mappings.hjson b/grudge/loopy_dg_kernels/device_mappings.hjson
new file mode 100644
index 000000000..aa90615cc
--- /dev/null
+++ b/grudge/loopy_dg_kernels/device_mappings.hjson
@@ -0,0 +1,8 @@
+{
+  # The idea with mapping devices to uuids is that multiple devices can map to 
+  # a single set of transformations.
+	"NVIDIA Titan V": 72a3ce98-5d21-48bf-b402-6ee96bafd1b6 
+	"NVIDIA GTX Titan X": 1d7cab16-19bd-4474-95f2-44ed1c0e60df
+}
+
+
diff --git a/grudge/loopy_dg_kernels/diff_1d_transform.hjson b/grudge/loopy_dg_kernels/diff_1d_transform.hjson
new file mode 100644
index 000000000..289aa61c8
--- /dev/null
+++ b/grudge/loopy_dg_kernels/diff_1d_transform.hjson
@@ -0,0 +1,149 @@
+# transform ID -> fp format -> pn
+{
+	  72a3ce98-5d21-48bf-b402-6ee96bafd1b6: {
+      description: "Transformations for the NVIDIA Titan V"
+        # 64-bit or 32-bit kernel
+        FP32:{
+          # Polynomial order
+          2:[
+              # Format: [Transformation, args, kwargs]
+              #["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          3:[
+              #["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          4:[
+ 
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          5:[
+              ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}],
+          ],
+          6:[
+              #["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              #["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+        }
+        FP64: {
+          10:[
+              # Format: [Transformation, args, kwargs]
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          20:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              # For tests uncomment this
+              #["split_iname", ["iel", 16], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}],
+              #["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}],
+
+              # For tests comment this
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["diff_mat", "idof_inner_outer,idof_inner_inner,j"], {temporary_name: "matfp", default_tag: "l.auto"}], 
+              #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+              ["split_iname", ["j", 20], {outer_tag: "for", inner_tag: "for"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          35:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], 
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          56:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], 
+            ]
+          84:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          120:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+      }
+    }
+	  1d7cab16-19bd-4474-95f2-44ed1c0e60df: {}
+}
diff --git a/grudge/loopy_dg_kernels/diff_2d_transform.hjson b/grudge/loopy_dg_kernels/diff_2d_transform.hjson
new file mode 100644
index 000000000..0b48f5519
--- /dev/null
+++ b/grudge/loopy_dg_kernels/diff_2d_transform.hjson
@@ -0,0 +1,162 @@
+# Nesting: transform ID -> fp format (FP32/FP64) -> integer key (2-7 under FP32, 10-120 under FP64) -> list of transformations
+{
+	  72a3ce98-5d21-48bf-b402-6ee96bafd1b6: {
+      description: "Transformations for the NVIDIA Titan V"
+        # 64-bit or 32-bit kernel
+        FP32:{
+          # Polynomial order
+          2:[
+              # Format: [Transformation, args, kwargs]
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          3:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          4:[
+              # Move this to array context?
+              #["tag_array_axes", ["diff_mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+   
+          ],
+          5:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}]],
+          6:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+        }
+        # Not optimized, just copied from 32 bit version
+        FP64: {
+          10:[
+              # Format: [Transformation, args, kwargs]
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 101], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          20:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 80], {outer_tag: "g.0", slabs:[0,1]}],
+              # For tests uncomment this
+              #["split_iname", ["iel", 16], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+
+              #["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}],
+              # For tests comment this
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+              #["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}],
+                ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          35:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], 
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          56:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], 
+            ]
+          84:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          120:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+      }
+    }
+	  1d7cab16-19bd-4474-95f2-44ed1c0e60df: {}
+}
diff --git a/grudge/loopy_dg_kernels/diff_3d_transform.hjson b/grudge/loopy_dg_kernels/diff_3d_transform.hjson
new file mode 100644
index 000000000..785b8ecfd
--- /dev/null
+++ b/grudge/loopy_dg_kernels/diff_3d_transform.hjson
@@ -0,0 +1,165 @@
+# transform ID -> fp format -> pn
+{
+	  72a3ce98-5d21-48bf-b402-6ee96bafd1b6: {
+      description: "Transformations for the NVIDIA Titan V"
+        # 64-bit or 32-bit kernel
+        FP32:{
+          # Polynomial order
+          2:[
+              # Format: [Transformation, args, kwargs]
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          3:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          4:[
+              # Move this to array context?
+              #["tag_array_axes", ["diff_mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+   
+          ],
+          5:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}]],
+          6:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+        }
+        FP64: {
+          10:[
+              # Format: [Transformation, args, kwargs]
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 352], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          20:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 288], {outer_tag: "g.0", slabs:[0,1]}],
+              # For tests uncomment this
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              #["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1", slabs:[0,0]}], 
+              ["split_iname", ["idof_inner", 20], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}],
+              # For tests comment this
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["split_iname", ["j", 5], {outer_tag: "for", inner_tag: "for"}],
+              #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          35:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], 
+              ["split_iname", ["idof", 35], {outer_tag: "g.1", slabs:[0,0]}], 
+              ["split_iname", ["idof_inner", 35], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["split_iname", ["j", 35], {outer_tag: "for", inner_tag: "for"}],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          56:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag:"l.1", slabs:[0,0]}],             
+              ["split_iname", ["idof", 56], {outer_tag: "g.1"}],             
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}],
+              ["split_iname", ["j", 56], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              #["add_prefetch", ["diff_mat", "imatrix,idof_inner,j_outer,j_inner"], {temporary_name: "matf", default_tag: "l.auto"}], 
+              ["add_inames_for_unused_hw_axes"]
+              #["prioritize_loops", ["iel_outer,iel_inner_outer,iel_inner_inner,imatrix,j_outer,j_inner"]]
+            ]
+          84:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+              ["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              ["add_inames_for_unused_hw_axes"]
+          ], 
+          120:[
+              ["tag_inames", [[["imatrix", "ilp"]]]],
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+      }
+    }
+	  1d7cab16-19bd-4474-95f2-44ed1c0e60df: {}
+}
diff --git a/grudge/loopy_dg_kernels/elwise_linear_transform.hjson b/grudge/loopy_dg_kernels/elwise_linear_transform.hjson
new file mode 100644
index 000000000..b67c13a81
--- /dev/null
+++ b/grudge/loopy_dg_kernels/elwise_linear_transform.hjson
@@ -0,0 +1,185 @@
+{
+	  72a3ce98-5d21-48bf-b402-6ee96bafd1b6: {
+      description: "Transformations for the NVIDIA Titan V"
+        # 64-bit or 32-bit kernel
+        FP32:{
+          # Polynomial order
+          2:[
+              # Format: [Transformation, args, kwargs]
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          3:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], 
+              #["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          4:[
+              # Move this to array context?
+              #["tag_array_axes", ["mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+   
+          ],
+          5:[
+              ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}]],
+          6:[
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+        }
+        # Not optimized, just copied from 32 bit version
+        FP64: {
+          10:[
+              # Format: [Transformation, args, kwargs]
+              #["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              #["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", slabs:[0,0]}],
+              ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}], 
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}],
+              ["add_inames_for_unused_hw_axes"],
+          ],
+          20:[
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              # For tests uncomment this
+              #["split_iname", ["iel", 32], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner",  32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,0]}],
+              #["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], 
+              # For tests comment this
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag:"l.auto"}],
+              #["split_iname", ["vec_dim_1_outer", "20"], {outer_tag:"g.1"}],
+              #["split_iname", ["vec_dim_1_outer", "20"], {outer_tag:"g.1", inner_tag:"ilp", slabs:[0,0]}],
+              #["tag_inames", [[ ["vec_dim_0_inner", "l.0"],
+              #                  ["vec_dim_1_inner", "l.1"], 
+              #                  ["vec_dim_1_outer","ilp"],
+              #                  ["vec_dim_0_outer","ilp"]]]],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}],  
+              ["split_iname", ["j", 20], {outer_tag: "for", inner_tag: "for", slabs:[0,0]}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"],
+          ],
+          35:[
+              # Move this to array context?
+              #["tag_array_axes", ["mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["split_iname", ["iel", 56], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              # See if these pass the tests
+              #["split_iname", ["iel", 12], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 4], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], 
+
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["split_iname", ["j", 35], {outer_tag: "for", inner_tag: "for"}], 
+ 
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+              ["add_inames_for_unused_hw_axes"],
+          ],
+          56:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 8], {outer_tag: "ilp", inner_tag: "l.1"}], 
+ 
+              #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], 
+            ]
+          84:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 84], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 12], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          120:[
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}],
+ 
+              #["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+      }
+    }
+	  1d7cab16-19bd-4474-95f2-44ed1c0e60df: {}
+}
diff --git a/grudge/loopy_dg_kernels/face_mass_transform.hjson b/grudge/loopy_dg_kernels/face_mass_transform.hjson
new file mode 100644
index 000000000..84632357b
--- /dev/null
+++ b/grudge/loopy_dg_kernels/face_mass_transform.hjson
@@ -0,0 +1,170 @@
+{
+	  72a3ce98-5d21-48bf-b402-6ee96bafd1b6: {
+      description: "Transformations for the NVIDIA Titan V"
+        # 64-bit or 32-bit kernel
+        FP32:{
+          # Polynomial order
+          2:[
+              # Format: [Transformation, args, kwargs]
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          3:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], 
+              #["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          4:[
+              # Move this to array context?
+              #["tag_array_axes", ["mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+   
+          ],
+          5:[
+              ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], 
+          ],
+          6:[
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+        }
+        # Not optimized, just copied from 32 bit version
+        FP64: {
+          2:[
+              # Format: [Transformation, args, kwargs]
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          3:[
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}],
+              # For tests uncomment this
+              #["split_iname", ["iel", 32], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner",  32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,0]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 4], {outer_tag: "ilp", inner_tag: "l.1"}], 
+              # For tests comment this
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for", slabs:[0,0]}],
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          4:[
+              # Move this to array context?
+              #["tag_array_axes", ["mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              # See if these pass the tests
+              #["split_iname", ["iel", 12], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 4], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], 
+ 
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              ["tag_array_axes", ["vecf", "f,f"]],
+              ["add_inames_for_unused_hw_axes"]
+          ],
+          5:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 8], {outer_tag: "ilp", inner_tag: "l.1"}], 
+ 
+              #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], 
+            ]
+          6:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 84], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 12], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}],
+ 
+              #["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+      }
+    }
+	  1d7cab16-19bd-4474-95f2-44ed1c0e60df: {}
+}
diff --git a/grudge/loopy_dg_kernels/generators.py b/grudge/loopy_dg_kernels/generators.py
new file mode 100644
index 000000000..07a94e7f3
--- /dev/null
+++ b/grudge/loopy_dg_kernels/generators.py
@@ -0,0 +1,625 @@
+import numpy as np
+from grudge.grudge_tags import (IsDOFArray, IsSepVecDOFArray,
+    IsOpArray, IsSepVecOpArray, IsFaceDOFArray, IsFaceMassOpArray,
+    IsVecDOFArray, IsVecOpArray, IsFourAxisDOFArray)
+
+def k_inner_inner_options(start_val=None):
+    """Return the candidate inner split sizes for the element loop.
+
+    The candidates are ``[32, 16, 8]``, in that order.
+
+    :arg start_val: if given, the returned list starts at this candidate;
+        :exc:`ValueError` is raised if it is not one of the candidates.
+    :returns: a list of integers.
+    """
+    candidates = [32, 16, 8]
+    if start_val is None:
+        return candidates
+    return candidates[candidates.index(start_val):]
+
+
+def k_inner_outer_options(n_in, k_inner_inner, sm_size,
+                            fp_bytes=8, start_val=None, nelem=None):
+    """Return candidate outer split sizes for the element loop.
+
+    Every candidate is a multiple ``m*k_inner_inner`` where ``m`` is bounded
+    by how many blocks of ``fp_bytes*k_inner_inner*n_in`` bytes fit into
+    ``sm_size - 1`` bytes, and by an ILP cap of 6 (or ``nelem //
+    k_inner_inner`` if that is smaller and *nelem* is given).
+
+    :arg start_val: if given, the returned list starts at this candidate;
+        :exc:`ValueError` is raised if it is not one of the candidates.
+    :returns: a list of candidates in increasing order; may be empty.
+    """
+    # Arbitrarily allow at most 6 inline blocks to limit the search space
+    max_ilp = 6
+    if nelem is not None:
+        max_ilp = min(nelem // k_inner_inner, max_ilp)
+
+    # Possibilities are limited by the size of local memory. Use
+    # sm_size - 1 because CUDA errors when all of local memory is used.
+    bytes_per_block = fp_bytes*k_inner_inner*n_in
+    n_fit = (sm_size - 1) // bytes_per_block
+    multiples = np.arange(1, n_fit + 1)
+
+    candidates = list(k_inner_inner*multiples[multiples <= max_ilp])
+    if start_val is not None:
+        candidates = candidates[candidates.index(start_val):]
+    return candidates
+
+def i_inner_inner_options(n_out, k_inner_inner, max_work_group_size=1024, start_val=None):
+    """Return candidate inner split sizes for the output loop.
+
+    The candidates are the divisors ``d`` of *n_out* for which
+    ``d*k_inner_inner`` does not exceed *max_work_group_size*, largest first.
+
+    :arg start_val: if given, the returned list starts at this candidate;
+        :exc:`ValueError` is raised if it is not one of the candidates.
+    :returns: a list of candidates in decreasing order; may be empty.
+    """
+    candidates = np.arange(1, n_out + 1)
+    divisors = candidates[n_out % candidates == 0]
+    # Ensure the total number of work items stays within the maximum
+    fits = divisors*k_inner_inner <= max_work_group_size
+    ordered = sorted(divisors[fits], reverse=True)
+    if start_val is not None:
+        ordered = ordered[ordered.index(start_val):]
+    return ordered
+
+def i_inner_outer_options(n_out, i_inner_inner, start_val=None):
+    """Return candidate outer split sizes for the output loop.
+
+    Each candidate is ``i_inner_inner*m`` for an integer multiplier ``m``
+    such that the candidate evenly divides *n_out*.
+
+    :arg n_out: extent of the loop being split.
+    :arg i_inner_inner: the inner split size already chosen.
+    :arg start_val: if given, the returned list starts at this candidate;
+        :exc:`ValueError` is raised if it is not one of the candidates.
+    :returns: a list of candidates in increasing order; may be empty.
+    """
+    # Select a number of inline blocks such that n_out % outer*inner == 0.
+    # Bumping up the start of the range could reduce autotune time, but an
+    # empty autotune set might be returned if i < start value.
+    #
+    # Multipliers start at 2 unless n_out == 1: Loopy is confused about the
+    # number of dimensions when i_outer, i_inner_outer, and i_inner_inner
+    # are all 1.
+    if n_out == 1:
+        # Must be an ndarray rather than a list: the arithmetic and the
+        # boolean-mask indexing below do not work on a plain list.
+        inline = np.array([1])
+    else:
+        inline = np.arange(2, (n_out // i_inner_inner) + 1)
+    options = list(i_inner_inner*inline[n_out % (inline*i_inner_inner) == 0])
+    start_ind = 0 if start_val is None else options.index(start_val)
+    return options[start_ind:]
+
+
+def j_inner_options(n_in, start_val=None):
+    """Return the divisors of *n_in* as candidate split sizes for the j loop.
+
+    :arg start_val: if given, the returned list starts at this divisor;
+        :exc:`ValueError` is raised if it is not a divisor of *n_in*.
+    :returns: a list of divisors in increasing order.
+    """
+    # Should this be limited by the number of registers?
+    candidates = np.arange(1, n_in + 1)
+    divisors = list(candidates[n_in % candidates == 0])
+    if start_val is None:
+        return divisors
+    return divisors[divisors.index(start_val):]
+
+# Creates a list containing tuples of search space parameters.
+# Will need to create separate ones of this for each einsum kernel
+def gen_autotune_list(queue, knl, start_param=None):
+    """Build the list of candidate tuning parameters for a kernel.
+
+    Array extents are read from the shapes of the entrypoint arguments,
+    selected by their tags, and combined with the device's local memory
+    size and maximum work-group size to enumerate tuples
+    ``(kio, kii, iio, iii, ji)``.
+
+    Prints the entrypoint name and every argument name as a side effect.
+
+    :arg queue: object providing ``device.local_mem_size`` and
+        ``device.max_work_group_size``.
+    :arg knl: object providing ``default_entrypoint.name`` and
+        ``default_entrypoint.args``; the arguments are read through their
+        ``name``, ``tags``, ``shape`` and ``dtype`` attributes.
+    :arg start_param: optional tuple ``(kio, kii, iio, iii, ji)``; each
+        entry is forwarded as ``start_val`` to the matching ``*_options``
+        helper the first time that helper is called.
+    :returns: a list of ``(kio, kii, iio, iii, ji)`` tuples.
+    """
+
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+    nfaces = 1
+
+    n_in = None
+    print(knl.default_entrypoint.name)
+    ndof_arrays = 0
+    # NOTE(review): n_out and fp_bytes are only bound when an argument with
+    # a matching tag is found; without one, the code further down fails on
+    # an unbound local variable.
+    for arg in knl.default_entrypoint.args:
+        print(arg.name)
+        if "resample_by_mat" not in knl.default_entrypoint.name:
+            if IsDOFArray() in arg.tags:
+                n_elem, n_out = arg.shape
+                fp_bytes = arg.dtype.dtype.itemsize
+                ndof_arrays += 1
+            elif IsSepVecOpArray() in arg.tags:
+                n_mat, n_out, n_in = arg.shape
+            elif IsOpArray() in arg.tags:
+                n_out, n_in = arg.shape
+            elif IsFaceDOFArray() in arg.tags:
+                nfaces, n_elem, n_in = arg.shape
+        else:
+            # For resample_by_mat kernels only the operator array is inspected
+            if IsOpArray() in arg.tags:
+                n_out, n_in = arg.shape
+                fp_bytes = arg.dtype.dtype.itemsize
+    # Avoid dividing the local memory size by zero below
+    ndof_arrays = max(ndof_arrays, 1)
+    if n_in is None:
+        n_in = n_out
+
+    n_in = n_in * nfaces # Prevents shared memory from overflowing in face mass kernel
+
+    if start_param is not None:
+        kio_s, kii_s, iio_s, iii_s, ji_s = start_param
+    else:
+        kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None)
+
+    # Iterate over five search dimensions
+    # Maybe there is a way to use islpy to do this?
+    parameter_list = []
+    for kii in k_inner_inner_options(start_val=kii_s):
+        # Should come up with a way to set the effective local memory size. It depends on the number of
+        # arrays actually prefetched.
+        # NOTE(review): n_in was already multiplied by nfaces above, so nfaces
+        # enters this local-memory bound twice -- confirm this is intended.
+        for kio in k_inner_outer_options(n_in*nfaces, kii, local_mem_size // ndof_arrays, fp_bytes=fp_bytes,start_val=kio_s):
+            kio_s = None # Set to None so will form the full set the next time around
+            for iii in i_inner_inner_options(n_out, kii,
+                    max_work_group_size=max_work_group_size, start_val=iii_s):
+                iii_s = None
+                for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                    # Kernel does not reach here.
+                    iio_s = None
+                    for ji in j_inner_options(n_in, start_val=ji_s):
+                        ji_s = None
+                        choices = (kio, kii, iio, iii, ji)
+                        parameter_list.append(choices)
+
+    return parameter_list
+
+
+# Should separate this so don't need to supply knl
+def mxm_trans_list_generator(params, **kwargs):
+    """Build the transformation list for a matrix-times-DOF-array kernel.
+
+    :arg params: tuple ``(kio, kii, iio, iii, ji)`` of split sizes for the
+        ``iel``, ``iel_inner``, ``idof``, ``idof_inner`` and ``j`` inames.
+    :arg kwargs: must contain ``knl``; its ``default_entrypoint.name``
+        selects which array, if any, is prefetched.
+    :returns: a list of ``[transformation, args]`` or
+        ``[transformation, args, kwargs]`` entries.
+    """
+    kio, kii, iio, iii, ji = params
+    knl_name = kwargs["knl"].default_entrypoint.name
+
+    trans_list = [
+        ["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs": (0, 1)}],
+        ["split_iname", ["iel_inner", kii],
+            {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}],
+        ["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs": (0, 0)}],
+        ["split_iname", ["idof_inner", iii],
+            {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}],
+    ]
+
+    # Pick the array to prefetch, if any
+    if knl_name == "lp_nodes":
+        prefetched = "nodes"
+    elif knl_name == "face_mass" or "resample_by_mat" in knl_name:
+        # face_mass: prefetching is disabled;
+        # resample_by_mat: indirection may prevent prefetching
+        prefetched = None
+    else:
+        prefetched = "vec"
+
+    if prefetched is not None:
+        trans_list.append(
+            ["add_prefetch", [prefetched, "j,iel_inner_outer,iel_inner_inner"],
+                {"temporary_name": "vecf", "default_tag": "l.auto"}])
+        trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+
+    trans_list.append(
+        ["split_iname", ["j", ji], {"outer_tag": "for", "inner_tag": "for"}])
+    trans_list.append(["add_inames_for_unused_hw_axes"])
+    return trans_list
+
+
+def grudge_elementwise_sum_knl_tlist_generator(params, **kwargs):
+    """Build the transformation list for the elementwise-sum kernel.
+
+    :arg params: tuple ``(kio, kii, iio, iii, ji)`` of split sizes for the
+        ``iel``, ``iel_inner``, ``idof``, ``idof_inner`` and ``jdof`` inames.
+    :arg kwargs: must contain ``knl`` (looked up but otherwise unused).
+    :returns: a list of ``[transformation, args]`` or
+        ``[transformation, args, kwargs]`` entries.
+    """
+    kio, kii, iio, iii, ji = params
+    kwargs["knl"]  # required keyword; raises KeyError when missing
+
+    # Should the i loop have (0,1) slabs for both?
+    # Realistically, splitting the j loop probably is not necessary for this.
+    return [
+        ["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs": (0, 1)}],
+        ["split_iname", ["iel_inner", kii],
+            {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}],
+        ["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs": (0, 0)}],
+        ["split_iname", ["idof_inner", iii],
+            {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}],
+        ["split_iname", ["jdof", ji], {"outer_tag": "for", "inner_tag": "for"}],
+        ["add_inames_for_unused_hw_axes"],
+    ]
+
+def grudge_elementwise_sum_knl_pspace_generator(queue, knl, start_param=None):
+    """Build the list of candidate tuning parameters for the elementwise-sum
+    kernel.
+
+    :arg queue: object providing ``device.local_mem_size`` and
+        ``device.max_work_group_size``.
+    :arg knl: object providing ``default_entrypoint.args``; the extents are
+        taken from the shape of the arguments tagged ``IsDOFArray`` (the
+        last such argument wins).
+    :arg start_param: optional tuple ``(kio, kii, iio, iii, ji)``; each
+        entry is forwarded as ``start_val`` to the matching ``*_options``
+        helper the first time that helper is called.
+    :returns: a list of ``(kio, kii, iio, iii, ji)`` tuples; empty if the
+        element count is zero.
+    """
+
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+
+    # NOTE(review): n_elem, n_out, n_in and fp_bytes are only bound when an
+    # IsDOFArray-tagged argument exists; otherwise the code below fails on
+    # an unbound local variable.
+    for arg in knl.default_entrypoint.args:
+        if IsDOFArray() in arg.tags:
+            n_elem, n_out = arg.shape
+            n_in = n_out
+            fp_bytes = arg.dtype.dtype.itemsize
+
+    if start_param is not None:
+        kio_s, kii_s, iio_s, iii_s, ji_s = start_param
+    else:
+        kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None)
+
+    # Iterate over five search dimensions. Could reduce this to 4 if ignore j-loop.
+    parameter_list = []
+    if n_elem > 0:
+        for kii in k_inner_inner_options(start_val=kii_s):
+            # NOTE(review): the full local_mem_size is passed here; it is not
+            # divided among several prefetched arrays.
+            for kio in k_inner_outer_options(n_in, kii, local_mem_size, fp_bytes=fp_bytes,start_val=kio_s):
+                kio_s = None # Set to None so will form the full set the next time around
+                for iii in i_inner_inner_options(n_out, kii,
+                        max_work_group_size=max_work_group_size, start_val=iii_s):
+                    iii_s = None
+                    for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                        iio_s = None
+                        for ji in j_inner_options(n_in, start_val=ji_s):
+                            ji_s = None
+                            choices = (kio, kii, iio, iii, ji)
+                            parameter_list.append(choices)
+
+    return parameter_list
+
+
+def einsum3to2_kernel_tlist_generator(params, **kwargs):
+    """Build the transformation list for a three-to-two-index einsum kernel.
+
+    :arg params: tuple ``(kio, kii, iio, iii, ji, lm_layout)``: split sizes
+        for the ``e``, ``e_inner``, ``i``, ``i_inner`` and ``j`` inames and
+        the axis tag string applied to every prefetch temporary. If any
+        entry equals zero, only ``add_inames_for_unused_hw_axes`` is
+        returned.
+    :arg kwargs: must contain ``knl`` unless *params* contains a zero; its
+        ``default_entrypoint.args`` decide which arrays are prefetched.
+    :returns: a list of ``[transformation, args]`` or
+        ``[transformation, args, kwargs]`` entries.
+    """
+    kio, kii, iio, iii, ji, lm_layout = params
+
+    # If there is a zero length dimension then don't transform
+    if 0 in params:
+        return [["add_inames_for_unused_hw_axes"]]
+
+    knl = kwargs["knl"]
+    trans_list = []
+
+    # Element loop: a two-level split, or a single split when both sizes agree
+    if kio != kii:
+        trans_list.append(
+            ["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs": (0, 1)}])
+        trans_list.append(["split_iname", ["e_inner", kii],
+            {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}])
+        e_inames = "e_inner_outer,e_inner_inner"
+    else:
+        trans_list.append(["split_iname", ["e", kio],
+            {"outer_tag": "g.0", "inner_tag": "l.0", "slabs": (0, 0)}])
+        e_inames = "e_inner"
+
+    # Output loop: same scheme
+    # Should the i loop have (0,1) slabs for both?
+    if iio != iii:
+        trans_list.append(
+            ["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs": (0, 0)}])
+        trans_list.append(["split_iname", ["i_inner", iii],
+            {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}])
+        i_inames = "i_inner_outer,i_inner_inner"
+    else:
+        trans_list.append(["split_iname", ["i", iio],
+            {"outer_tag": "g.1", "inner_tag": "l.1", "slabs": (0, 0)}])
+        i_inames = "i_inner"
+
+    j_sweep = "j," + e_inames
+    i_sweep = i_inames + "," + e_inames
+
+    def prefetch(name, sweep_inames):
+        # Prefetch *name* into the temporary "<name>f" and tag its axes
+        temporary = name + "f"
+        trans_list.append(["add_prefetch", [name, sweep_inames],
+            {"temporary_name": temporary, "default_tag": "l.auto"}])
+        trans_list.append(["tag_array_axes", [temporary, lm_layout]])
+
+    for arg in knl.default_entrypoint.args:
+        name = arg.name
+        if name in ("vec", "jac"):
+            prefetch(name, j_sweep)
+        elif name in ("arg2", "arg1") and IsDOFArray() in arg.tags:
+            prefetch(name, j_sweep)
+        elif name == "arg0" and IsDOFArray() in arg.tags:
+            prefetch(name, i_sweep)
+
+    trans_list.append(
+        ["split_iname", ["j", ji], {"outer_tag": "for", "inner_tag": "for"}])
+    trans_list.append(["add_inames_for_unused_hw_axes"])
+    return trans_list
+
+def einsum3to2_kernel_pspace_generator(queue, knl, start_param=None):
+    """Build the list of candidate tuning parameters for a three-to-two-index
+    einsum kernel.
+
+    When ``n_elem*n_out <= 1024`` only two unsplit candidates are returned
+    (one per local-memory layout) and *start_param* is not consulted.
+    Otherwise the five split sizes and the two layouts ``"f,f"`` and
+    ``"c,c"`` are enumerated.
+
+    :arg queue: object providing ``device.local_mem_size`` and
+        ``device.max_work_group_size``.
+    :arg knl: object providing ``default_entrypoint.args``; extents are read
+        from the shapes of arguments tagged ``IsDOFArray`` or ``IsOpArray``.
+    :arg start_param: optional tuple
+        ``(kio, kii, iio, iii, ji, lm_layout)``; the first five entries are
+        forwarded as ``start_val`` to the matching ``*_options`` helper the
+        first time that helper is called.
+    :returns: a list of ``(kio, kii, iio, iii, ji, lm_layout)`` tuples.
+    """
+
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+
+    # NOTE(review): n_elem and fp_bytes are only bound when an
+    # IsDOFArray-tagged argument exists, and n_in only when an
+    # IsOpArray-tagged argument exists; otherwise the code below fails on an
+    # unbound local variable. n_out is overwritten by whichever tagged
+    # argument comes last.
+    n_dof_arrays = 0
+    for arg in knl.default_entrypoint.args:
+        if IsDOFArray() in arg.tags:
+            n_elem, n_out = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+            n_dof_arrays += 1
+        elif IsOpArray() in arg.tags:
+            n_out, n_in = arg.shape
+
+    # NOTE(review): the lm_layout entry of start_param is unpacked but never
+    # used as a starting point; the loop variable below overwrites it.
+    if start_param is not None:
+        kio_s, kii_s, iio_s, iii_s, ji_s, lm_layout = start_param
+    else:
+        kio_s, kii_s, iio_s, iii_s, ji_s, lm_layout = (None, None, None, None, None, None)
+
+    # Iterate over six search dimensions
+    parameter_list = []
+
+    if n_elem*n_out <= 1024:
+        # Small problem: offer only the unsplit sizes, once per layout
+        choices = (n_elem, n_elem, n_out, n_out, n_in, "c,c")
+        parameter_list.append(choices)
+        choices = (n_elem, n_elem, n_out, n_out, n_in, "f,f")
+        parameter_list.append(choices)
+    else:
+        for kii in k_inner_inner_options(start_val=kii_s):
+            # The local memory is divided evenly among the IsDOFArray-tagged
+            # arguments.
+            # Should check if jac is present
+            for kio in k_inner_outer_options(n_in, kii, local_mem_size // n_dof_arrays,
+                        fp_bytes=fp_bytes,start_val=kio_s,nelem=n_elem):
+                kio_s = None # Set to None so will form the full set the next time around
+                for iii in i_inner_inner_options(n_out, kii,
+                        max_work_group_size=max_work_group_size, start_val=iii_s):
+                    iii_s = None
+                    for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                        iio_s = None
+                        for ji in j_inner_options(n_in, start_val=ji_s):
+                            ji_s = None
+                            for lm_layout in ["f,f", "c,c"]:
+                                choices = (kio, kii, iio, iii, ji, lm_layout)
+                                parameter_list.append(choices)
+
+    return parameter_list
+
+
+def einsum2to2_kernel_tlist_generator(params, **kwargs):
+    """Build the transformation list for a two-to-two-index einsum kernel.
+
+    :arg params: tuple ``(kio, kii, iio, iii)`` of split sizes for the
+        element and DOF inames and their ``*_inner`` halves.
+    :arg kwargs: must contain ``knl``; its ``default_entrypoint.name``
+        decides whether the inames are called ``iel``/``idof`` or ``e``/``i``.
+    :returns: a list of ``[transformation, args]`` or
+        ``[transformation, args, kwargs]`` entries.
+    """
+    kio, kii, iio, iii = params
+    knl = kwargs["knl"]
+
+    # Only the iname names differ between the two kinds of kernel
+    if knl.default_entrypoint.name == "resample_by_picking_single_indirection":
+        el_iname, dof_iname = "iel", "idof"
+    else:
+        el_iname, dof_iname = "e", "i"
+
+    # Should the i loop have (0,1) slabs for both?
+    # Prefetching probably does not matter for this kernel.
+    return [
+        ["split_iname", [el_iname, kio], {"outer_tag": "g.0", "slabs": (0, 1)}],
+        ["split_iname", [el_iname + "_inner", kii],
+            {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}],
+        ["split_iname", [dof_iname, iio], {"outer_tag": "g.1", "slabs": (0, 0)}],
+        ["split_iname", [dof_iname + "_inner", iii],
+            {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}],
+        ["add_inames_for_unused_hw_axes"],
+    ]
+
+
+def einsum2to2_kernel_pspace_generator(queue, knl, start_param=None):
+    """Build the list of candidate tuning parameters for a two-to-two-index
+    einsum kernel.
+
+    :arg queue: object providing ``device.local_mem_size`` and
+        ``device.max_work_group_size``.
+    :arg knl: object providing ``default_entrypoint.args``; the extents are
+        the elementwise minimum over the shapes of all arguments tagged
+        ``IsDOFArray``.
+    :arg start_param: optional tuple ``(kio, kii, iio, iii)``; each entry is
+        forwarded as ``start_val`` to the matching ``*_options`` helper the
+        first time that helper is called.
+    :returns: a list of ``(kio, kii, iio, iii)`` tuples; empty if the
+        element count is zero.
+    """
+
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+
+    # NOTE(review): if no argument is tagged IsDOFArray, n_elem stays None
+    # and the ``n_elem > 0`` comparison below raises TypeError.
+    n_elem = None
+    n_out = None
+    for arg in knl.default_entrypoint.args:
+        if IsDOFArray() in arg.tags:
+            if n_elem is None:
+                n_elem, n_out = arg.shape
+            else: # Needed to handle resample_by_picking_group
+                n_elem = min(arg.shape[0], n_elem)
+                n_out = min(arg.shape[1], n_out)
+            n_in = n_out
+            fp_bytes = arg.dtype.dtype.itemsize
+
+    if start_param is not None:
+        kio_s, kii_s, iio_s, iii_s = start_param
+    else:
+        kio_s, kii_s, iio_s, iii_s = (None, None, None, None)
+
+    # Iterate over four search dimensions (there is no j loop to split)
+    parameter_list = []
+    if n_elem > 0:
+        for kii in k_inner_inner_options(start_val=kii_s):
+            for kio in k_inner_outer_options(n_in, kii, local_mem_size, fp_bytes=fp_bytes,start_val=kio_s):
+                kio_s = None # Set to None so will form the full set the next time around
+                for iii in i_inner_inner_options(n_out, kii,
+                        max_work_group_size=max_work_group_size, start_val=iii_s):
+                    iii_s = None
+                    for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                        iio_s = None
+                        #for ji in j_inner_options(n_in, start_val=ji_s):
+                        #    ji_s = None
+                        choices = (kio, kii, iio, iii)
+                        parameter_list.append(choices)
+
+    return parameter_list
+
+
+def einsum4to2_face_mass_kernel_tlist_generator(params, **kwargs):
+    """Build the transformation list for the face-mass einsum kernel.
+
+    :arg params: tuple ``(kio, kii, iio, iii, ji)`` of split sizes for the
+        ``e``, ``e_inner``, ``i``, ``i_inner`` and ``j`` inames.
+    :arg kwargs: accepted for interface compatibility; not used.
+    :returns: a list of ``[transformation, args]`` or
+        ``[transformation, args, kwargs]`` entries.
+    """
+    kio, kii, iio, iii, ji = params
+
+    # Both prefetches sweep the same inames and use the same axis tags
+    sweep_inames = "f,j,e_inner_outer,e_inner_inner"
+    axis_tags = "N2,N0,N1"
+
+    # Should the i loop have (0,1) slabs for both?
+    trans_list = [
+        ["tag_inames", ["f: unr"]],
+        ["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs": (0, 1)}],
+        ["split_iname", ["e_inner", kii],
+            {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}],
+        ["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs": (0, 0)}],
+        ["split_iname", ["i_inner", iii],
+            {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}],
+    ]
+
+    for array, temporary in (("vec", "vecf"), ("jac_surf", "jac_surff")):
+        trans_list.append(["add_prefetch", [array, sweep_inames],
+            {"temporary_name": temporary, "default_tag": "l.auto"}])
+        trans_list.append(["tag_array_axes", [temporary, axis_tags]])
+
+    trans_list.append(
+        ["split_iname", ["j", ji], {"outer_tag": "for", "inner_tag": "for"}])
+    trans_list.append(["add_inames_for_unused_hw_axes"])
+
+    return trans_list
+
+"""
+def einsum4to2_face_mass_kernel_pspace_generator(queue, knl, start_param=None):
+
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+
+    for arg in knl.default_entrypoint.args:
+        if IsDOFArray() in arg.tags:
+            n_elem, n_out = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+        elif IsVecDOFArray() in arg.tags:
+            n_r, n_elem, n_out = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+        elif IsVecOpArray() in arg.tags:
+            n_r, n_out, n_in = arg.shape
+        elif IsFaceMassOpArray() in arg.tags:
+            n_out, n_r, n_in = arg.shape
+
+    if start_param is not None:
+        kio_s, kii_s, iio_s, iii_s, ji_s = start_param
+    else:
+        kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None)
+
+    # Iterate over five search dimensions
+    parameter_list = []
+    for kii in k_inner_inner_options(start_val=kii_s):
+        # Both inv_jac_t and vec are prefetched so the amount of available local memory per array is reduced
+        for kio in k_inner_outer_options(n_in, kii, local_mem_size // (n_r + 1), fp_bytes=fp_bytes,start_val=kio_s):
+            kio_s = None # Set to None so will form the full set the next time around
+            for iii in i_inner_inner_options(n_out, kii,
+                    max_work_group_size=max_work_group_size, start_val=iii_s):
+                iii_s = None
+                for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                    iio_s = None
+                    for ji in j_inner_options(n_in, start_val=ji_s):
+                        ji_s = None
+                        choices = (kio, kii, iio, iii, ji)
+                        parameter_list.append(choices)
+
+    return parameter_list
+"""
+
+
+def einsum4to2_kernel_tlist_generator(params, **kwargs):
+    """Build the transformation list for an einsum4to2 kernel.
+
+    *params* is ``(kio, kii, iio, iii, ji, o)``; ``kwargs["knl"]`` is required.
+    """
+    trans_list = []
+    kio, kii, iio, iii, ji, o = params
+    knl = kwargs["knl"]
+    arg_names = {arg.name for arg in knl.default_entrypoint.args}
+    inames = knl.default_entrypoint.inames.keys()
+    if "r" in inames:
+        trans_list.append(["tag_inames", ["r: unr"]])
+    if "f" in inames:
+        trans_list.append(["tag_inames", ["f: unr"]])
+
+    trans_list.append(["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs":(0,1)}])
+    trans_list.append(["split_iname", ["e_inner", kii], 
+        {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}])
+    trans_list.append(["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs":(0,0)}])
+    trans_list.append(["split_iname", ["i_inner", iii], 
+        {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}])
+    # Should the i loop have (0,1) slabs for both?
+
+    #trans_list.append(["add_prefetch", ["vec", "j,e_inner_outer,e_inner_inner"],
+    #    {"temporary_name":"vecf", "default_tag":"l.auto"}])
+    #trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+    if "inv_jac_t" in arg_names:
+        trans_list.append(["add_prefetch", ["vec", "j,e_inner_outer,e_inner_inner"],
+            {"temporary_name":"vecf", "default_tag":"l.auto"}])
+        trans_list.append(["tag_array_axes", ["vecf", "N0,N1" if o == "F" else "N1,N0"]])
+        trans_list.append(["add_prefetch", ["inv_jac_t", "r,j,e_inner_outer,e_inner_inner"],
+            {"temporary_name":"inv_jac_tf", "default_tag":"l.auto"}])
+        trans_list.append(["tag_array_axes", ["inv_jac_tf", "N2,N0,N1" if o == "F" else "N2,N1,N0"]])
+    elif "jac_surf" in arg_names:
+        trans_list.append(["add_prefetch", ["vec", "f,j,e_inner_outer,e_inner_inner"],
+            {"temporary_name":"vecf", "default_tag":"l.auto"}])
+        # See if N2,N0,N1 works for "F" order, may need to change it in the array context
+        trans_list.append(["tag_array_axes", ["vecf", "N1,N0,N2" if o =="F" else "N2,N1,N0"]])
+        # NOTE(review): the jac_surf prefetch reuses the temporary name "inv_jac_tf" - confirm intended.
+        trans_list.append(["add_prefetch", ["jac_surf", "f,j,e_inner_outer,e_inner_inner"],
+            {"temporary_name":"inv_jac_tf", "default_tag":"l.auto"}])
+        trans_list.append(["tag_array_axes", ["inv_jac_tf", "N1,N0,N2" if o == "F" else "N2,N1,N0"]])
+    trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}])
+    trans_list.append(["add_inames_for_unused_hw_axes"]) 
+    return trans_list 
+
+def einsum4to2_kernel_pspace_generator(queue, knl, start_param=None):
+    """Enumerate ``(kio, kii, iio, iii, ji, order)`` tuning tuples for *knl*.
+
+    *start_param* gives start values for the first five entries; a sixth
+    (order) entry, as found in this function's own output, is ignored.
+    """
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size
+    lmem_divisor = 0  # replaced below from the shape of the operator array
+
+    for arg in knl.default_entrypoint.args:
+        if IsDOFArray() in arg.tags:
+            n_elem, n_out = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+        elif IsVecDOFArray() in arg.tags:
+            n_r, n_elem, n_out = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+        elif IsFaceDOFArray() in arg.tags:
+            n_r, n_elem, n_in = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+        elif IsVecOpArray() in arg.tags:
+            n_r, n_out, n_in = arg.shape
+            lmem_divisor = n_r + 1
+        elif IsFaceMassOpArray() in arg.tags:
+            n_out, n_r, n_in = arg.shape
+            lmem_divisor = 2*n_r
+
+    # Accept five-entry starts as before and also the six-entry tuples this
+    # function emits; unpacking those into five names used to raise ValueError.
+    start = (None,)*5 if start_param is None else tuple(start_param)[:5]
+    kio_s, kii_s, iio_s, iii_s, ji_s = start
+    # Iterate over six search dimensions
+    parameter_list = []
+    if n_elem > 0:
+        for kii in k_inner_inner_options(start_val=kii_s):
+            # Both inv_jac_t and vec are prefetched so the amount of available local memory per array is reduced
+            for kio in k_inner_outer_options(n_in, kii, local_mem_size // lmem_divisor, fp_bytes=fp_bytes,start_val=kio_s):
+                kio_s = None # Set to None so will form the full set the next time around
+                for iii in i_inner_inner_options(n_out, kii,
+                        max_work_group_size=max_work_group_size, start_val=iii_s):
+                    iii_s = None
+                    for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                        iio_s = None
+                        for ji in j_inner_options(n_in, start_val=ji_s):
+                            ji_s = None
+                            parameter_list.extend((kio, kii, iio, iii, ji, order) for order in ["F", "C"])
+    return parameter_list
+
+
+def einsum5to3_kernel_tlist_generator(params, **kwargs):
+    """Build the transformation list for an einsum5to3 kernel.
+
+    *params* is ``(kio, kii, iio, iii, ji, lm_ord)``; *lm_ord* selects the
+    axis tags of the prefetched temporaries.  Keyword arguments are ignored.
+    """
+    kio, kii, iio, iii, ji, lm_ord = params
+    use_f_tags = lm_ord in "fF"
+    vecf_ord = "f,f" if use_f_tags else "c,c"
+    inv_jac_tf_ord = "N3,N2,N0,N1" if use_f_tags else "N3,N2,N1,N0"
+    # Should the i loop have (0,1) slabs for both?
+    return [
+        ["tag_inames", ["r: unr"]],
+        ["tag_inames", ["x: ilp"]],
+        ["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs": (0, 1)}],
+        ["split_iname", ["e_inner", kii],
+            {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}],
+        ["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs": (0, 0)}],
+        ["split_iname", ["i_inner", iii],
+            {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}],
+        ["add_prefetch", ["vec", "j,e_inner_outer,e_inner_inner"],
+            {"temporary_name": "vecf", "default_tag": "l.auto"}],
+        ["tag_array_axes", ["vecf", vecf_ord]],
+        ["add_prefetch", ["inv_jac_t", "x,r,j,e_inner_outer,e_inner_inner"],
+            {"temporary_name": "inv_jac_tf", "default_tag": "l.auto"}],
+        ["tag_array_axes", ["inv_jac_tf", inv_jac_tf_ord]],
+        ["split_iname", ["j", ji], {"outer_tag": "for", "inner_tag": "for"}],
+        ["add_inames_for_unused_hw_axes"],
+    ]
+
+def einsum5to3_kernel_pspace_generator(queue, knl, start_param=None):
+    """Return the list of (kio, kii, iio, iii, ji, order) tuning tuples for *knl*."""
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+
+    for arg in knl.default_entrypoint.args:
+        if IsDOFArray() in arg.tags:
+            n_elem, n_out = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+        elif IsFourAxisDOFArray() in arg.tags:
+            n_r, n_x, n_elem, n_out = arg.shape
+            fp_bytes = arg.dtype.dtype.itemsize
+        elif IsVecOpArray() in arg.tags:
+            n_r, n_out, n_in = arg.shape
+
+    if start_param is not None:
+        kio_s, kii_s, iio_s, iii_s, ji_s, order = start_param
+    else:
+        kio_s, kii_s, iio_s, iii_s, ji_s, order = (None, None, None, None, None, None)
+    # NOTE: the order entry unpacked above is overwritten by the innermost loop below.
+    # Iterate over six search dimensions (the five split sizes plus the order)
+    parameter_list = []
+    if n_elem > 0:
+        for kii in k_inner_inner_options(start_val=kii_s):
+            # Both inv_jac_t and vec are prefetched so the amount of available local memory per array is reduced
+            for kio in k_inner_outer_options(n_in, kii, local_mem_size // (n_r*n_x + 1), fp_bytes=fp_bytes,start_val=kio_s):
+                kio_s = None # Set to None so will form the full set the next time around
+                for iii in i_inner_inner_options(n_out, kii,
+                        max_work_group_size=max_work_group_size, start_val=iii_s):
+                    iii_s = None
+                    for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                        iio_s = None
+                        for ji in j_inner_options(n_in, start_val=ji_s):
+                            ji_s = None
+                            for order in ["F", "C"]:  
+                                choices = (kio, kii, iio, iii, ji, order)
+                                parameter_list.append(choices)
+
+    return parameter_list
diff --git a/grudge/loopy_dg_kernels/parallel_autotuning.py b/grudge/loopy_dg_kernels/parallel_autotuning.py
new file mode 100644
index 000000000..5f99a9760
--- /dev/null
+++ b/grudge/loopy_dg_kernels/parallel_autotuning.py
@@ -0,0 +1,113 @@
+from charm4py import charm, Chare, Array, Reducer, Future
+import pyopencl as cl
+import numpy as np
+import grudge.loopy_dg_kernels as dgk
+#from grudge.execution import diff_prg, elwise_linear
+
+class AutotuneTask(Chare):
+    """Chare holding one autotuning parameter set and a platform index."""
+    def __init__(self, platform_id, params):
+        self.platform_id = platform_id
+        self.params = params
+
+    def get_queue(self):
+        """Return a profiling queue on the GPU selected by ``myPe() % n_gpus``."""
+        platform = cl.get_platforms()[self.platform_id]
+        gpu_devices = platform.get_devices(device_type=cl.device_type.GPU)
+        ctx = cl.Context(devices=[gpu_devices[charm.myPe() % len(gpu_devices)]])
+        profiling = cl.command_queue_properties.PROFILING_ENABLE
+        return cl.CommandQueue(ctx, properties=profiling)
+
+    def run(self):
+        # Placeholder task body: call rand() so a number, not the function, prints.
+        print([self.params, np.random.rand()])
+
+
+class Test(Chare):
+    def start(self):
+        print('I am element', self.thisIndex, 'on PE', charm.myPe(),
+              'sending a msg to element 1')
+        self.thisProxy[1].sayHi()
+
+    # NOTE(review): start() calls sayHi() without the future it requires; @coro is disabled.
+    def sayHi(self, future):
+        rn = np.random.rand()
+        print('Hello from element', self.thisIndex, 'on PE', charm.myPe(), 'random', rn)
+        self.reduce(future, rn, Reducer.max)
+
+def get_queue(pe_num, platform_num=0):
+    """Return a profiling-enabled queue on GPU ``pe_num % n_gpus`` of a platform."""
+    gpus = cl.get_platforms()[platform_num].get_devices(
+        device_type=cl.device_type.GPU)
+    # A fresh single-device context is created on every call.
+    context = cl.Context(devices=[gpus[pe_num % len(gpus)]])
+    return cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+def do_work(args):
+    """Time the ``(params, knl)`` work item in *args* on a queue for this PE."""
+    params, knl = args[0], args[1]
+    queue = get_queue(charm.myPe())
+    print("PE: ", charm.myPe())
+    avg_time, _ = dgk.run_tests.apply_transformations_and_run_test(queue, knl, dgk.run_tests.generic_test, params)
+    return avg_time, params
+
+def square(x):
+    return x**2  # x raised to the power two
+
+
+def main(args):
+    """Print the host and PE counts, then shut the charm runtime down."""
+    # Create queue, assume all GPUs on the machine are the same
+    """
+    platforms = cl.get_platforms()
+    platform_id = 0
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+
+    assert charm.numPes() > 1
+    assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+
+        
+    from grudge.execution import diff_prg, elwise_linear_prg
+    knl = diff_prg(3, 1000000, 10, np.float64)
+    params = dgk.run_tests.gen_autotune_list(queue, knl)
+
+    args = [[param, knl] for param in params]
+
+    # May help to balance workload
+    from random import shuffle
+    shuffle(args)
+    
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    result = charm.pool.map(do_work, args)
+    sort_key = lambda entry: entry[0]
+    result.sort(key=sort_key)
+    
+
+    for r in result:
+        print(r)
+    
+    #knl = diff_prg(3, 100000, 56, np.float64)
+    #autotune_list = gen_autotune_list(queue, knl) 
+    #print(autotune_list)
+
+    """
+
+    print(charm.numHosts(), charm.numPes())
+    f = Future()  # only referenced by the disabled lines below
+    #a = Array(Test, a.numPes())
+    #a.sayHi(f)
+    #result = f.get()
+    #print(result)
+    print("All finished")
+    charm.exit()    
+
+charm.start(main)
diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_charm4py.py b/grudge/loopy_dg_kernels/parallel_autotuning_charm4py.py
new file mode 100644
index 000000000..005d3bec2
--- /dev/null
+++ b/grudge/loopy_dg_kernels/parallel_autotuning_charm4py.py
@@ -0,0 +1,249 @@
+from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm
+from charm4py.pool import PoolScheduler, Pool
+from charm4py.charm import Charm, CharmRemote
+#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap
+#from charm4py.sections import SectionManager
+#import inspect
+#import sys
+import hjson
+import pyopencl as cl
+import numpy as np
+import grudge.loopy_dg_kernels as dgk
+import os
+import grudge.grudge_array_context as gac
+import loopy as lp
+from os.path import exists
+from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test
+from grudge.grudge_array_context import convert
+#from grudge.execution import diff_prg, elwise_linear
+
+# Makes one PE inactive on each host so the number of workers is the same on all hosts as
+# opposed to the basic PoolScheduler which has one fewer worker on the host with PE 0.
+# This can be useful for running tasks on a GPU cluster for example.
+class BalancedPoolScheduler(PoolScheduler):
+    """Pool scheduler that excludes PEs whose index is a multiple of the per-host PE count."""
+
+    def __init__(self):
+        super().__init__()
+        total_pes, host_count = charm.numPes(), charm.numHosts()
+        per_host = total_pes // host_count
+        # Require the same PE count on every host, with one PE per host to spare.
+        assert total_pes % host_count == 0
+        assert per_host > 1
+        # PEs whose index is a multiple of per_host are left out of the worker set.
+        self.idle_workers = {pe for pe in range(total_pes) if pe % per_host != 0}
+        self.num_workers = len(self.idle_workers)
+
+# Use all PEs including PE 0 
+class AllPEsPoolScheduler(PoolScheduler):
+    """Pool scheduler whose worker set is every PE, including PE 0."""
+
+    def __init__(self):
+        super().__init__()
+        # Every PE index is a worker; the host count is not needed here.
+        all_pes = set(range(charm.numPes()))
+        self.idle_workers = all_pes
+        self.num_workers = len(all_pes)
+
+
+def get_queue(pe_num, platform_num):
+    """Return a profiling-enabled queue on GPU ``pe_num % n_gpus`` of a platform."""
+    gpus = cl.get_platforms()[platform_num].get_devices(
+        device_type=cl.device_type.GPU)
+    context = cl.Context(devices=[gpus[pe_num % len(gpus)]])
+    return cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+# Just assume each rank has one processor and create a queue
+# Breaks for some reason. Maybe because the tasks migrate and the underlying hardware
+# address changes so the queue is not for the correct device.
+# The memory will probably run out over time if many queues are created.
+queue = get_queue(0,0)
+
+def test(args):
+    """Run one autotuning work item on the module-level ``queue``."""
+    # platform_id is unpacked but unused: no per-PE queue is created here.
+    platform_id, knl, tlist_generator, params, test_fn = args
+    return run_single_param_set(queue, knl, tlist_generator, params, test_fn)
+
+
+def unpickle_kernel(fname):
+    """Return the object unpickled from the file *fname*."""
+    from pickle import load
+    # NOTE: unpickling can execute arbitrary code; only load trusted files.
+    with open(fname, "rb") as f:  # closed even if load() raises
+        return load(f)
+
+def autotune_pickled_kernels(path, platform_id, actx_class, comm):
+    """Autotune each ``*.pickle`` kernel in *path* that has no hjson result yet."""
+    from os import listdir
+    dir_list = listdir(path)
+    for f in dir_list:
+        if f.endswith(".pickle"):
+            fname = path + "/" + f
+            print("===============================================")
+            print("Autotuning", fname)
+            knl = unpickle_kernel(fname)
+            knl_id = f.split(".")[0]
+            knl_id = knl_id.split("_")[-1]
+            print("Kernel ID", knl_id)
+            print("New kernel ID", gac.unique_program_id(knl))
+            assert knl_id == gac.unique_program_id(knl)
+            knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+            knl = gac.set_memory_layout(knl)
+            assert knl_id == gac.unique_program_id(knl)
+            # The id from the file name is asserted both before and after the two changes above.
+            print(knl)
+            pid = gac.unique_program_id(knl)
+            hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+            if not exists(hjson_file_str):
+                parallel_autotune(knl, platform_id, actx_class, comm)
+            else:
+                print("hjson file exists, skipping")
+
+def parallel_autotune(knl, platform_id, actx_class, comm):
+    """Time every candidate parameter set for *knl* and save the chosen one.
+
+    Saves and returns the transformations of the first sorted result whose time
+    exceeds 1e-7 (else of the first result); ``{}`` if there are no candidates.
+    """
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)
+
+    import pyopencl.tools as cl_tools
+    actx = actx_class(
+        comm,
+        queue,
+        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))
+
+    #knl = gac.fix_program_parameters(knl)
+    #knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+    knl = gac.set_memory_layout(knl)
+    pid = gac.unique_program_id(knl)
+    os.makedirs(os.getcwd() + "/hjson", exist_ok=True)
+    hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+
+    assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+
+    # No local "from run_tests import ..." here: that bare module name only
+    # resolves when this directory is on sys.path, and the name was unused.
+    tlist_generator, pspace_generator = actx.get_generators(knl)
+    params_list = pspace_generator(actx.queue, knl)
+
+    # Could make a massive list with all kernels and parameters
+    args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list]
+
+    # May help to balance workload
+    # Should test if shuffling matters
+    from random import shuffle
+    shuffle(args)
+
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+
+    #result = charm.pool.map(do_work, args)
+
+    #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work
+
+    pool_proxy = Chare(PoolScheduler, onPE=0)
+    mypool = Pool(pool_proxy)
+    if len(args) > 0: # Guard against empty list
+        results = mypool.map(test, args)
+
+        sort_key = lambda entry: entry[0]
+        results.sort(key=sort_key)
+
+        #for r in results:
+        #    print(r)
+        # Workaround for pocl CUDA bug
+        # whereby times are imprecise
+        ret_index = 0
+        for i, result in enumerate(results):
+            if result[0] > 1e-7:
+                ret_index = i
+                break
+
+        avg_time, transformations, data = results[ret_index]
+    else:
+        transformations = {}
+
+    od = {"transformations": transformations}
+    # The context manager closes the file even if serialization raises.
+    with open(hjson_file_str, "wt+") as out_file:
+        hjson.dump(od, out_file,default=convert)
+
+    return transformations
+
+"""
+def main(args):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    platform_id = 0
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+   
+    assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+    
+    from grudge.execution import diff_prg, elwise_linear_prg
+    knl = diff_prg(3, 1000000, 3, np.float64)
+    params = dgk.run_tests.gen_autotune_list(queue, knl)
+
+    args = [[param, knl] for param in params]
+
+    # May help to balance workload
+    from random import shuffle
+    shuffle(args)
+    
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    pool_proxy = Chare(BalancedPoolScheduler, onPE=0)
+    mypool = Pool(pool_proxy)
+    result = mypool.map(do_work, args)
+
+    sort_key = lambda entry: entry[0]
+    result.sort(key=sort_key)
+    
+
+    for r in result:
+        print(r)
+"""
+
+def main(args):
+    """Autotune the kernels pickled under ./pickled_programs, then exit; *args* is unused."""
+    import mpi4py.MPI as MPI
+    from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac
+    comm = MPI.COMM_WORLD
+    autotune_pickled_kernels("./pickled_programs", 0, Maac, comm)
+    print("DONE!")
+    exit()
+
+def charm_autotune():
+    """Start the charm4py runtime with :func:`main` as its entry point."""
+    charm.start(main)  # the undefined name ``result`` is no longer printed afterwards
+    charm.exit()
+ 
+if __name__ == "__main__":
+    # No ``result`` name exists at module level, so nothing is printed here.
+    charm.start(main)
+    charm.exit()
diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_mpi4py.py b/grudge/loopy_dg_kernels/parallel_autotuning_mpi4py.py
new file mode 100644
index 000000000..1fd34128e
--- /dev/null
+++ b/grudge/loopy_dg_kernels/parallel_autotuning_mpi4py.py
@@ -0,0 +1,308 @@
+#from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm
+#from charm4py.pool import PoolScheduler, Pool
+#from charm4py.charm import Charm, CharmRemote
+#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap
+#from charm4py.sections import SectionManager
+#import inspect
+#import sys
+import hjson
+import pyopencl as cl
+import numpy as np
+import grudge.loopy_dg_kernels as dgk
+import os
+import grudge.grudge_array_context as gac
+import loopy as lp
+from os.path import exists
+from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test
+from grudge.grudge_array_context import convert
+#from grudge.execution import diff_prg, elwise_linear
+import mpi4py.MPI as MPI
+from mpi4py.futures import MPIPoolExecutor, MPICommExecutor
+#from mpipool import MPIPool
+
+#from guppy import hpy
+#import gc
+#import linecache
+#import os
+#import tracemalloc
+#from mem_top import mem_top
+#import matplotlib.pyplot as plt
+
+data_dict = {}
+
+def display_top(snapshot, key_type='lineno', limit=10):
+    """Print the top *limit* allocation sites of *snapshot* and plot size histories."""
+    # NOTE(review): tracemalloc, linecache and plt are used below, but their
+    # module-level imports are commented out in this file.
+    snapshot = snapshot.filter_traces((
+        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
+        tracemalloc.Filter(False, "<frozen importlib._bootstrap_external>"),
+        tracemalloc.Filter(False, "<unknown>"),
+    ))
+    top_stats = snapshot.statistics(key_type)
+    print("Top %s lines" % limit)
+    for index, stat in enumerate(top_stats[:limit], 1):
+        frame = stat.traceback[0]
+        # replace "/path/to/module/file.py" with "module/file.py"
+        filename = os.sep.join(frame.filename.split(os.sep)[-2:])
+        print("#%s: %s:%s: %.1f KiB"
+              % (index, filename, frame.lineno, stat.size / 1024))
+        line = linecache.getline(frame.filename, frame.lineno).strip()
+        d_str = filename + ":" + str(frame.lineno) + ": " + line
+        if d_str not in data_dict:
+            data_dict[d_str] = [stat.size]
+        else:
+            data_dict[d_str].append(stat.size)
+
+        if line:
+            print('    %s' % line)
+
+    fig = plt.figure(0)
+    fig.clear()
+    plt.ion()
+    plt.show()
+    dlist = sorted(data_dict.items(), key=lambda a: a[1][-1], reverse=True)[:10]
+    for key, vals in dlist:
+        plt.plot(vals, label=key + " " + str(vals[-1]) + " bytes")
+    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), shadow=False, ncol=1)
+    plt.draw()
+    #plt.pause(1)
+    plt.savefig("memory_usage.png", bbox_inches="tight")
+
+    other = top_stats[limit:]
+    if other:
+        size = sum(stat.size for stat in other)
+        print("%s other: %.1f KiB" % (len(other), size / 1024))
+    total = sum(stat.size for stat in top_stats)
+    print("Total allocated size: %.1f KiB" % (total / 1024))
+
+
+
+def get_queue(pe_num, platform_num):
+    """Return a profiling-enabled queue on GPU ``pe_num % n_gpus`` of a platform."""
+    gpus = cl.get_platforms()[platform_num].get_devices(
+        device_type=cl.device_type.GPU)
+    context = cl.Context(devices=[gpus[pe_num % len(gpus)]])
+    return cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+# Assume using platform zero
+comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. May need to change this in the future
+# From MPI.PoolExecutor the communicator for the tasks is not COMM_WORLD
+queue = get_queue(comm.Get_rank(), 0)
+
+def test(args):
+    """Run one parameter set through run_single_param_set on the module queue.
+
+    *args* is ``(platform_id, knl, tlist_generator, params, test_fn)``;
+    *platform_id* is unpacked but not used.
+    """
+    platform_id, knl, tlist_generator, params, test_fn = args
+    #comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. May need to change this in the future
+    # From MPI.PoolExecutor the communicator for the tasks is not COMM_WORLD
+    #queue = get_queue(comm.Get_rank(), platform_id)
+    result = run_single_param_set(queue, knl, tlist_generator, params, test_fn)
+    # Memory-debugging hooks, disabled:
+    #print(mem_top())
+    #snapshot = tracemalloc.take_snapshot()
+    #display_top(snapshot)
+    return result
+
+def unpickle_kernel(fname):
+    """Return the object unpickled from the file *fname*."""
+    from pickle import load
+    # NOTE: unpickling can execute arbitrary code; only load trusted files.
+    with open(fname, "rb") as f:  # closed even if load() raises
+        return load(f)
+
+
+def autotune_pickled_kernels(path, platform_id, actx_class, comm):
+    """Autotune each ``*.pickle`` kernel in *path* that has no hjson result yet."""
+    from os import listdir
+    dir_list = listdir(path)
+    for f in dir_list:
+        if f.endswith(".pickle"):
+            fname = path + "/" + f
+            print("===============================================")
+            print("Autotuning", fname)
+            knl = unpickle_kernel(fname)
+            knl_id = f.split(".")[0]
+            knl_id = knl_id.split("_")[-1]
+            #assert knl_id == gac.unique_program_id(knl)
+
+            print("Kernel ID", knl_id)
+            print("Calculated Kernel ID", gac.unique_program_id(knl))
+            # These should be baked into the kernel object already?
+            #knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+            #knl = gac.set_memory_layout(knl)
+            #print("New kernel ID", gac.unique_program_id(knl))
+
+            assert knl_id == gac.unique_program_id(knl)
+
+            print(knl)
+            #pid = gac.unique_program_id(knl)
+            hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{knl_id}.hjson"
+            if not exists(hjson_file_str):
+                parallel_autotune(knl, platform_id, actx_class, comm)
+            else:
+                print("hjson file exists, skipping")
+
+
+def parallel_autotune(knl, platform_id, actx_class, comm):
+    """Time every candidate parameter set for *knl* through an MPI executor.
+
+    On the executor's root the chosen transformations are written to hjson and
+    returned; other ranks, and an empty parameter list, yield ``{}``.
+    """
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    # Should just use get_queue
+    ctx = cl.Context(devices=[gpu_devices[comm.Get_rank() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)
+
+    import pyopencl.tools as cl_tools
+    actx = actx_class(
+        comm,
+        queue,
+        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))
+
+    #knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+    #knl = gac.set_memory_layout(knl)
+    pid = gac.unique_program_id(knl)
+    os.makedirs(os.getcwd() + "/hjson", exist_ok=True)
+    hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+
+    #assert comm.Get_size() > 1
+    #assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+
+    # No local "from run_tests import ...": bare module name, and the name was unused here.
+    tlist_generator, pspace_generator = actx.get_generators(knl)
+    params_list = pspace_generator(actx.queue, knl)
+
+    # Could make a massive list with all kernels and parameters
+    args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list]
+
+    # May help to balance workload
+    # Should test if shuffling matters
+    #from random import shuffle
+    #shuffle(args)
+
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+
+    #result = charm.pool.map(do_work, args)
+
+    #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work
+
+    #pool_proxy = Chare(PoolScheduler, onPE=0)
+
+    sort_key = lambda entry: entry[0]
+    transformations = {}
+    # NOTE(review): the comm argument is replaced by COMM_WORLD for the executor - confirm intended.
+    comm = MPI.COMM_WORLD
+    if len(params_list) > 0: # Guard against empty list
+        #executor = MPIPoolExecutor(max_workers=1)
+        #results = list(executor.map(test, args))
+        #results.sort(key=sort_key)
+        #avg_time, transformations, data = results[0]
+        #for entry in results:
+        #    print(entry)
+        #exit()
+        #"""
+        with MPICommExecutor(comm, root=0) as mypool:
+            if mypool is not None:
+                results = list(mypool.map(test, args, chunksize=1))
+                results.sort(key=sort_key)
+
+                #for r in results:
+                #    print(r)
+                # Workaround for pocl CUDA bug
+                # whereby times are imprecise
+                ret_index = 0
+                for i, result in enumerate(results):
+                    if result[0] > 1e-7:
+                        ret_index = i
+                        break
+
+                avg_time, transformations, data = results[ret_index]
+                od = {"transformations": transformations}
+                with open(hjson_file_str, "wt+") as out_file:  # closed even if dump raises
+                    hjson.dump(od, out_file,default=convert)
+        #"""
+
+    return transformations
+
+"""
+def main(args):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    platform_id = 0
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+   
+    assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+    
+    from grudge.execution import diff_prg, elwise_linear_prg
+    knl = diff_prg(3, 1000000, 3, np.float64)
+    params = dgk.run_tests.gen_autotune_list(queue, knl)
+
+    args = [[param, knl] for param in params]
+
+    # May help to balance workload
+    from random import shuffle
+    shuffle(args)
+    
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    pool_proxy = Chare(BalancedPoolScheduler, onPE=0)
+    mypool = Pool(pool_proxy)
+    result = mypool.map(do_work, args)
+
+    sort_key = lambda entry: entry[0]
+    result.sort(key=sort_key)
+    
+
+    for r in result:
+        print(r)
+"""
+
+def main():
+    """Autotune the kernels pickled under ./pickled_programs on COMM_WORLD, then exit."""
+    from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac
+    comm = MPI.COMM_WORLD
+    #tracemalloc.start()
+    #gc.set_debug(gc.DEBUG_UNCOLLECTABLE)
+    autotune_pickled_kernels("./pickled_programs", 0, Maac, comm)
+
+    print("DONE!")
+    exit()
+
+if __name__ == "__main__":
+    import sys  # only referenced by the disabled mpipool code below
+    main()
+    # Disabled mpipool-based variant:
+    #pool = MPIPool()
+
+    #if not pool.is_master():
+    #    pool.wait()
+    #    sys.exit(0)
+
diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_mpipool.py b/grudge/loopy_dg_kernels/parallel_autotuning_mpipool.py
new file mode 100644
index 000000000..cfdef85fe
--- /dev/null
+++ b/grudge/loopy_dg_kernels/parallel_autotuning_mpipool.py
@@ -0,0 +1,225 @@
+#from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm
+#from charm4py.pool import PoolScheduler, Pool
+#from charm4py.charm import Charm, CharmRemote
+#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap
+#from charm4py.sections import SectionManager
+#import inspect
+#import sys
+import hjson
+import pyopencl as cl
+import numpy as np
+import grudge.loopy_dg_kernels as dgk
+import os
+import grudge.grudge_array_context as gac
+import loopy as lp
+from os.path import exists
+from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test
+from grudge.grudge_array_context import convert
+#from grudge.execution import diff_prg, elwise_linear
+import mpi4py.MPI as MPI
+from mpi4py.futures import MPIPoolExecutor, MPICommExecutor
+from mpipool import MPIPool
+
+def get_queue(pe_num, platform_num):
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU)
+    ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]])
+    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    return queue
+
+# Import-time setup: each process that imports this module builds its own
+# queue on platform 0, choosing the GPU from its rank. test() below runs on
+# this module-level queue.
+comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. May need to change this in the future
+queue = get_queue(comm.Get_rank(), 0)
+
+
+def test(args):
+    #print(args)
+    platform_id, knl, tlist_generator, params, test_fn = args
+    result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) 
+    return result
+
+
+def unpickle_kernel(fname):
+    from pickle import load
+    f = open(fname, "rb")
+    program = load(f)
+    f.close()
+    return program
+
+
+def autotune_pickled_kernels(path, platform_id, actx_class, comm):
+    from os import listdir
+    dir_list = listdir(path)
+    for f in dir_list:
+        if f.endswith(".pickle"):
+            fname = path + "/" + f
+            print("===============================================")
+            print("Autotuning", fname)
+            knl = unpickle_kernel(fname)
+            knl_id = f.split(".")[0]
+            knl_id = knl_id.split("_")[-1]
+            print("Kernel ID", knl_id)
+            print("New kernel ID", gac.unique_program_id(knl))
+            
+            assert knl_id == gac.unique_program_id(knl)
+            knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+            knl = gac.set_memory_layout(knl)
+            assert knl_id == gac.unique_program_id(knl)
+
+            print(knl)
+            pid = gac.unique_program_id(knl)
+            hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+            if not exists(hjson_file_str):
+
+                parallel_autotune(knl, platform_id, actx_class, comm)
+            else:
+                print("hjson file exists, skipping")
+
+
+def parallel_autotune(knl, platform_id, actx_class, comm):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[comm.Get_rank() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+
+
+    import pyopencl.tools as cl_tools
+    actx = actx_class(
+        comm,
+        queue,
+        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))
+
+    knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+    knl = gac.set_memory_layout(knl)
+    pid = gac.unique_program_id(knl)
+    os.makedirs(os.path.dirname("./hjson"), exist_ok=True)
+    hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+
+    #assert comm.Get_size() > 1
+    #assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+
+    from run_tests import run_single_param_set
+    
+    tlist_generator, pspace_generator = actx.get_generators(knl)
+    params_list = pspace_generator(actx.queue, knl)
+
+    # Could make a massive list with all kernels and parameters
+    args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list]
+
+
+    # May help to balance workload
+    # Should test if shuffling matters
+    from random import shuffle
+    shuffle(args)
+
+
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work
+
+    #pool_proxy = Chare(PoolScheduler, onPE=0)
+
+    sort_key = lambda entry: entry[0]
+    transformations = {}
+    if len(args) > 0: # Guard against empty list
+        with MPIPool() as mypool:
+            mypool.workers_exit()
+            if mypool is not None:
+                results = list(mypool.map(test, args))
+                results.sort(key=sort_key)
+        
+                #for r in results:
+                #    print(r)
+                # Workaround for pocl CUDA bug
+                # whereby times are imprecise
+                ret_index = 0
+                for i, result in enumerate(results):
+                    if result[0] > 1e-7:
+                        ret_index = i
+                        break
+
+                avg_time, transformations, data = results[ret_index]
+
+    od = {"transformations": transformations}
+    out_file = open(hjson_file_str, "wt+")
+    hjson.dump(od, out_file,default=convert)
+    out_file.close()
+
+    return transformations
+
+"""
+def main(args):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    platform_id = 0
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+   
+    assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+    
+    from grudge.execution import diff_prg, elwise_linear_prg
+    knl = diff_prg(3, 1000000, 3, np.float64)
+    params = dgk.run_tests.gen_autotune_list(queue, knl)
+
+    args = [[param, knl] for param in params]
+
+    # May help to balance workload
+    from random import shuffle
+    shuffle(args)
+    
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    pool_proxy = Chare(BalancedPoolScheduler, onPE=0)
+    mypool = Pool(pool_proxy)
+    result = mypool.map(do_work, args)
+
+    sort_key = lambda entry: entry[0]
+    result.sort(key=sort_key)
+    
+
+    for r in result:
+        print(r)
+"""
+
+def main():
+    from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac
+    comm = MPI.COMM_WORLD
+    
+    autotune_pickled_kernels("./pickled_programs", 0, Maac, comm)
+
+    print("DONE!")
+    exit()
+
+# Run the autotuner only when this file is executed as a script.
+# NOTE(review): `sys` is only referenced by the commented-out code below.
+if __name__ == "__main__":
+    import sys
+    main()
+
+    #pool = MPIPool()
+
+    #if not pool.is_master():
+    #    pool.wait()
+    #    sys.exit(0)
+
diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_schwimmbad.py b/grudge/loopy_dg_kernels/parallel_autotuning_schwimmbad.py
new file mode 100644
index 000000000..0d9b2d572
--- /dev/null
+++ b/grudge/loopy_dg_kernels/parallel_autotuning_schwimmbad.py
@@ -0,0 +1,247 @@
+#from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm
+#from charm4py.pool import PoolScheduler, Pool
+#from charm4py.charm import Charm, CharmRemote
+#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap
+#from charm4py.sections import SectionManager
+#import inspect
+#import sys
+import hjson
+import pyopencl as cl
+import numpy as np
+import grudge.loopy_dg_kernels as dgk
+import os
+import grudge.grudge_array_context as gac
+import loopy as lp
+from os.path import exists
+from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test
+from grudge.grudge_array_context import convert
+#from grudge.execution import diff_prg, elwise_linear
+import mpi4py.MPI as MPI
+from schwimmbad import SerialPool, MPIPool
+#from schwimmbad.mpi import MPIAsyncPool
+
+def get_queue(pe_num, platform_num):
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU)
+    ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]])
+    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    return queue
+
+# Import-time setup: each process that imports this module builds its own
+# queue on platform 0, choosing the GPU from its rank. test() below runs on
+# this module-level queue.
+comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. May need to change this in the future
+queue = get_queue(comm.Get_rank(), 0)
+
+
+def test(args):
+    platform_id, knl, tlist_generator, params, test_fn = args
+    result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) 
+    return result
+
+
+def unpickle_kernel(fname):
+    from pickle import load
+    f = open(fname, "rb")
+    program = load(f)
+    f.close()
+    return program
+
+
+def autotune_pickled_kernels(path, platform_id, actx_class, comm):
+    from os import listdir
+    dir_list = listdir(path)
+    for f in dir_list:
+        if f.endswith(".pickle"):
+            fname = path + "/" + f
+            print("===============================================")
+            print("Autotuning", fname)
+            knl = unpickle_kernel(fname)
+            knl_id = f.split(".")[0]
+            knl_id = knl_id.split("_")[-1]
+            print("Kernel ID", knl_id)
+            print("New kernel ID", gac.unique_program_id(knl))
+            
+            assert knl_id == gac.unique_program_id(knl)
+            knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+            knl = gac.set_memory_layout(knl)
+            assert knl_id == gac.unique_program_id(knl)
+
+            print(knl)
+            pid = gac.unique_program_id(knl)
+            hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+            if not exists(hjson_file_str):
+                parallel_autotune(knl, platform_id, actx_class, comm)
+            else:
+                print("hjson file exists, skipping")
+
+
+def parallel_autotune(knl, platform_id, actx_class, comm):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[comm.Get_rank() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+
+
+    import pyopencl.tools as cl_tools
+    actx = actx_class(
+        comm,
+        queue,
+        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))
+
+    knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+    knl = gac.set_memory_layout(knl)
+    pid = gac.unique_program_id(knl)
+    os.makedirs(os.path.dirname("./hjson"), exist_ok=True)
+    hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+
+    #assert comm.Get_size() > 1
+    #assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+
+    from run_tests import run_single_param_set
+    
+    tlist_generator, pspace_generator = actx.get_generators(knl)
+    params_list = pspace_generator(actx.queue, knl)
+
+    # Could make a massive list with all kernels and parameters
+    args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list]
+
+
+    # May help to balance workload
+    # Should test if shuffling matters
+    from random import shuffle
+    shuffle(args)
+
+
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work
+
+    #pool_proxy = Chare(PoolScheduler, onPE=0)
+    #mypool = MPIAsyncPool()
+    mypool = MPIPool()#Pool(pool_proxy)
+    #mypool = SerialPool()
+    if isinstance(mypool, MPIPool) and not mypool.is_master():
+        mypool.wait()
+        sys.exit(0)
+
+    sort_key = lambda entry: entry[0]
+    if len(args) > 0: # Guard against empty list
+        results = list(mypool.map(test, args))
+        mypool.close()
+        results.sort(key=sort_key)
+        
+        #for r in results:
+        #    print(r)
+        # Workaround for pocl CUDA bug
+        # whereby times are imprecise
+        ret_index = 0
+        for i, result in enumerate(results):
+            if result[0] > 1e-7:
+                ret_index = i
+                break
+
+        avg_time, transformations, data = results[ret_index]
+    else:
+        transformations = {}
+        mypool.close()
+    
+    od = {"transformations": transformations}
+    out_file = open(hjson_file_str, "wt+")
+    hjson.dump(od, out_file,default=convert)
+    out_file.close()
+
+    return transformations
+
+"""
+def main(args):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    platform_id = 0
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+   
+    assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+    
+    from grudge.execution import diff_prg, elwise_linear_prg
+    knl = diff_prg(3, 1000000, 3, np.float64)
+    params = dgk.run_tests.gen_autotune_list(queue, knl)
+
+    args = [[param, knl] for param in params]
+
+    # May help to balance workload
+    from random import shuffle
+    shuffle(args)
+    
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    pool_proxy = Chare(BalancedPoolScheduler, onPE=0)
+    mypool = Pool(pool_proxy)
+    result = mypool.map(do_work, args)
+
+    sort_key = lambda entry: entry[0]
+    result.sort(key=sort_key)
+    
+
+    for r in result:
+        print(r)
+"""
+
+def main():
+    from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac
+    comm = MPI.COMM_WORLD
+    
+    autotune_pickled_kernels("./pickled_programs", 0, Maac, comm)
+
+    print("DONE!")
+    exit()
+
+"""
+def worker(task):
+    a, b = task
+    return a**2 + b**2
+
+def main(args):
+    # Here we generate some fake data
+    import random
+    a = [random.random() for _ in range(10000)]
+    b = [random.random() for _ in range(10000)]
+
+    tasks = list(zip(a, b))
+    results = pool.map(worker, tasks)
+    pool.close()
+
+    print(results[:8])
+"""
+
+# Run the autotuner only when this file is executed as a script.
+# NOTE(review): this guard is the only place `sys` is bound at module level
+# in this file.
+if __name__ == "__main__":
+    import sys
+    main()
+
+    #pool = MPIPool()
+
+    #if not pool.is_master():
+    #    pool.wait()
+    #    sys.exit(0)
+
diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_v2.py b/grudge/loopy_dg_kernels/parallel_autotuning_v2.py
new file mode 100644
index 000000000..2dff8c7d4
--- /dev/null
+++ b/grudge/loopy_dg_kernels/parallel_autotuning_v2.py
@@ -0,0 +1,253 @@
+from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm
+from charm4py.pool import PoolScheduler, Pool
+from charm4py.charm import Charm, CharmRemote
+#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap
+#from charm4py.sections import SectionManager
+#import inspect
+#import sys
+import hjson
+import pyopencl as cl
+import numpy as np
+import grudge.loopy_dg_kernels as dgk
+import os
+import grudge.grudge_array_context as gac
+import loopy as lp
+from os.path import exists
+from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test
+from grudge.grudge_array_context import convert
+#from grudge.execution import diff_prg, elwise_linear
+
+# Makes one PE inactive on each host so the number of workers is the same on all hosts as
+# opposed to the basic PoolScheduler which has one fewer worker on the host with PE 0.
+# This can be useful for running tasks on a GPU cluster for example.
+class BalancedPoolScheduler(PoolScheduler):
+
+    def __init__(self):
+       super().__init__()
+       n_pes = charm.numPes()
+       n_hosts = charm.numHosts()
+       pes_per_host = n_pes // n_hosts
+
+       assert n_pes % n_hosts == 0 # Enforce constant number of pes per host
+       assert pes_per_host > 1 # We're letting one pe on each host be unused
+
+       self.idle_workers = set([i for i in range(n_pes) if not i % pes_per_host == 0 ])
+       self.num_workers = len(self.idle_workers)
+
+# Use all PEs including PE 0 
+class AllPEsPoolScheduler(PoolScheduler):
+
+    def __init__(self):
+       super().__init__()
+       n_pes = charm.numPes()
+       n_hosts = charm.numHosts()
+
+       self.idle_workers = set(range(n_pes))
+       self.num_workers = len(self.idle_workers)
+
+
+def get_queue(pe_num, platform_num):
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU)
+    ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]])
+    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    return queue
+
+
+def do_work(args):
+    params = args[0]
+    knl = args[1]
+    queue = get_queue(charm.myPe())
+    print("PE: ", charm.myPe())
+    avg_time, transform_list = dgk.run_tests.apply_transformations_and_run_test(queue, knl, dgk.run_tests.generic_test, params)
+    return avg_time, params
+
+def test(args):
+    platform_id, knl, tlist_generator, params, test_fn = args
+    queue = get_queue(charm.myPe(), platform_id)
+    result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) 
+    return result
+
+
+
+def unpickle_kernel(fname):
+    from pickle import load
+    f = open(fname, "rb")
+    program = load(f)
+    f.close()
+    return program
+
+def autotune_pickled_kernels(path, platform_id, actx_class, comm):
+    from os import listdir
+    dir_list = listdir(path)
+    for f in dir_list:
+        if f.endswith(".pickle"):
+            fname = path + "/" + f
+            print("===============================================")
+            print("Autotuning", fname)
+            knl = unpickle_kernel(fname)
+            knl_id = f.split(".")[0]
+            knl_id = knl_id.split("_")[-1]
+            print("Kernel ID", knl_id)
+            print("New kernel ID", gac.unique_program_id(knl))
+            
+            assert knl_id == gac.unique_program_id(knl)
+            knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+            knl = gac.set_memory_layout(knl)
+            assert knl_id == gac.unique_program_id(knl)
+
+            print(knl)
+            pid = gac.unique_program_id(knl)
+            hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+            if not exists(hjson_file_str):
+                parallel_autotune(knl, platform_id, actx_class, comm)
+            else:
+                print("hjson file exists, skipping")
+
+def parallel_autotune(knl, platform_id, actx_class, comm):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+
+
+    import pyopencl.tools as cl_tools
+    actx = actx_class(
+        comm,
+        queue,
+        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))
+
+    #knl = gac.fix_program_parameters(knl)
+    #knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True))
+    knl = gac.set_memory_layout(knl)
+    pid = gac.unique_program_id(knl)
+    os.makedirs(os.getcwd() + "/hjson", exist_ok=True)
+    hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"
+
+
+    assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+
+    from run_tests import run_single_param_set
+    
+    tlist_generator, pspace_generator = actx.get_generators(knl)
+    params_list = pspace_generator(actx.queue, knl)
+
+    # Could make a massive list with all kernels and parameters
+    args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list]
+
+
+    # May help to balance workload
+    # Should test if shuffling matters
+    from random import shuffle
+    shuffle(args)
+
+
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work
+
+    pool_proxy = Chare(PoolScheduler, onPE=0)
+    mypool = Pool(pool_proxy)
+    if len(args) > 0: # Guard against empty list
+        results = mypool.map(test, args)
+
+        sort_key = lambda entry: entry[0]
+        results.sort(key=sort_key)
+        
+        #for r in results:
+        #    print(r)
+        # Workaround for pocl CUDA bug
+        # whereby times are imprecise
+        ret_index = 0
+        for i, result in enumerate(results):
+            if result[0] > 1e-7:
+                ret_index = i
+                break
+
+        avg_time, transformations, data = results[ret_index]
+    else:
+        transformations = {}
+    
+    od = {"transformations": transformations}
+    out_file = open(hjson_file_str, "wt+")
+    hjson.dump(od, out_file,default=convert)
+    out_file.close()
+
+    return transformations
+
+"""
+def main(args):
+
+    # Create queue, assume all GPUs on the machine are the same
+    platforms = cl.get_platforms()
+    platform_id = 0
+    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
+    n_gpus = len(gpu_devices)
+    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
+    profiling = cl.command_queue_properties.PROFILING_ENABLE
+    queue = cl.CommandQueue(ctx, properties=profiling)    
+   
+    assert charm.numPes() > 1
+    #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices)
+    assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1)
+    # Check that it can assign one PE to each GPU
+    # The first PE is used for scheduling
+    # Not certain how this will work with multiple nodes
+    
+    from grudge.execution import diff_prg, elwise_linear_prg
+    knl = diff_prg(3, 1000000, 3, np.float64)
+    params = dgk.run_tests.gen_autotune_list(queue, knl)
+
+    args = [[param, knl] for param in params]
+
+    # May help to balance workload
+    from random import shuffle
+    shuffle(args)
+    
+    #a = Array(AutotuneTask, dims=(len(args)), args=args[0])
+    #a.get_queue()
+   
+    #result = charm.pool.map(do_work, args)
+
+    pool_proxy = Chare(BalancedPoolScheduler, onPE=0)
+    mypool = Pool(pool_proxy)
+    result = mypool.map(do_work, args)
+
+    sort_key = lambda entry: entry[0]
+    result.sort(key=sort_key)
+    
+
+    for r in result:
+        print(r)
+"""
+
+def main(args):
+    import mpi4py.MPI as MPI
+    from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac
+    comm = MPI.COMM_WORLD
+    
+    autotune_pickled_kernels("./pickled_programs", 0, Maac, comm)
+    print("DONE!")
+    exit()
+
+def charm_autotune():
+    charm.start(main)
+    print(result)
+    charm.exit()
+ 
+if __name__ == "__main__":
+    charm.start(main)
+    print(result)
+    charm.exit()
diff --git a/grudge/loopy_dg_kernels/resample_by_mat.hjson b/grudge/loopy_dg_kernels/resample_by_mat.hjson
new file mode 100644
index 000000000..56f133597
--- /dev/null
+++ b/grudge/loopy_dg_kernels/resample_by_mat.hjson
@@ -0,0 +1,166 @@
+{
+	  72a3ce98-5d21-48bf-b402-6ee96bafd1b6: {
+      description: "Transformations for the NVIDIA Titan V"
+        # 64-bit or 32-bit kernel
+        FP32:{
+          # Polynomial order
+          2:[
+              # Format: [Transformation, args, kwargs]
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          3:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], 
+              #["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["resample_mat", "idof,j"], {temporary_name: "matf", default_tag: "l.auto"}], 
+          ],
+          4:[
+              # Move this to array context?
+              #["tag_array_axes", ["mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+   
+          ],
+          5:[
+              ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], 
+          ],
+          6:[
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 42], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+        }
+        # Not optimized, just copied from 32 bit version
+        FP64: {
+          2:[
+              # Format: [Transformation, args, kwargs]
+              ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          3:[
+
+              ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}],
+              # For tests uncomment this
+              #["split_iname", ["iel", 32], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 96], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,0]}],
+              ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["split_iname", ["idof", 20], {outer_tag: "g.1"}], 
+              #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], 
+              # For tests comment this
+              # Would need to specify shared memory and the location for this prefetch. It probably can't help
+              # anyway
+              #["add_prefetch", ["ary", "j"], {temporary_name: "aryf", default_tag: "l.auto"}],
+              # Maybe can stop random accesses from evicting matrix from cache by putting it in shared memory
+          ],
+          4:[
+              # Move this to array context?
+              #["tag_array_axes", ["mat", "sep,c,c"]],
+              #["tag_array_axes", ["result", "sep,f,f"]],
+              #["tag_array_axes", ["vec", "f,f"]],
+
+              ["split_iname", ["iel", 32], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              # See if these pass the tests
+              #["split_iname", ["iel", 12], {outer_tag: "g.0", slabs:[0,1]}],
+              #["split_iname", ["iel_inner", 4], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+
+              #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["idof", 35], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], 
+ 
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], 
+          ],
+          5:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 56], {outer_tag: "g.1"}], 
+              ["split_iname", ["idof_inner", 8], {outer_tag: "ilp", inner_tag: "l.1"}], 
+ 
+              #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}],
+              #["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+
+              ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], 
+            ]
+          6:[
+              ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}],
+              ["split_iname", ["idof", 84], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}],
+              ["split_iname", ["j", 12], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ], 
+          7:[
+              ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}],
+              ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}],
+              ["split_iname", ["idof", 120], {outer_tag: "g.1"}],
+              ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}],
+ 
+              #["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}],
+              ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}],
+              ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}],
+              #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], 
+
+              #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}],
+              #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}],
+              #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}],
+              #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}],
+          ]
+      }
+    }
+	  1d7cab16-19bd-4474-95f2-44ed1c0e60df: {}
+}
diff --git a/grudge/loopy_dg_kernels/roofline_plotting.py b/grudge/loopy_dg_kernels/roofline_plotting.py
new file mode 100644
index 000000000..f17843cd1
--- /dev/null
+++ b/grudge/loopy_dg_kernels/roofline_plotting.py
@@ -0,0 +1,69 @@
+"""Plot a roofline model for the Grudge elementwise differentiation kernel.
+
+Draws a device-memory roofline and an L1/local-memory roofline from the
+hard-coded peak figures below, then overlays modeled and measured FP32
+results for the six cases labeled 2 through 7.  Running the module prints
+the modeled points and opens a matplotlib window.
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+
+max_flops_unboosted = 12288  # GFLOP/s
+max_flops_boosted = 13444.5  # Empirical roofline toolkit
+
+max_g_bandwidth_warburton = 540  # GB/s
+max_g_bandwidth_ert = 561.4
+max_l1_bandwidth = 2610.5
+
+flops_per_byte_accessed = np.arange(0, 101)
+max_flops_unboosted_array = max_flops_unboosted * \
+    np.ones_like(flops_per_byte_accessed)
+
+# Attainable rate is the smaller of (intensity * bandwidth) and peak compute.
+max_flops_g_unboosted_data = np.minimum(flops_per_byte_accessed
+    * max_g_bandwidth_warburton, max_flops_unboosted_array)
+max_flops_l1_unboosted_data = np.minimum(flops_per_byte_accessed
+    * max_l1_bandwidth, max_flops_unboosted_array)
+
+fig = plt.figure()
+ax = fig.add_subplot(111)
+ax.loglog(flops_per_byte_accessed, max_flops_g_unboosted_data,
+    label="Device memory roofline")
+ax.loglog(flops_per_byte_accessed, max_flops_l1_unboosted_data,
+    label="L1 cache/Local memory roofline")
+
+# Size factor shared by all three access models.  The first model used to
+# list 85 for the fifth entry while the other two listed 84; they now agree.
+size_factors = np.array([10, 20, 35, 56, 84, 120])
+
+theoretical_x_1 = 3*2*size_factors \
+    / (4 + 12)  # Assumes one read and three stores
+theoretical_x_4 = 3*2*size_factors \
+    / (4 + 12 + 12)  # Assumes four reads and three stores
+# 52 bytes, i.e. 13 four-byte accesses.  NOTE(review): the earlier comment
+# said "seven reads and three stores", which would be 40 bytes -- confirm.
+theoretical_x_7 = 3*2*size_factors \
+    / (4 + 2*(12+12))
+theoretical_y_1 = np.minimum(theoretical_x_1
+    * max_g_bandwidth_warburton, max_flops_unboosted)
+theoretical_y_4 = np.minimum(theoretical_x_4
+    * max_g_bandwidth_warburton, max_flops_unboosted)
+theoretical_y_7 = np.minimum(theoretical_x_7
+    * max_g_bandwidth_warburton, max_flops_unboosted)
+empirical_x = theoretical_x_4.copy()
+empirical_y = [2026.9636053441898, 4049.8734098551745, 7085.0042493541905,
+    8143.440577930807, 9010.054141132498, 10126.59788574097]
+print(theoretical_x_1)
+print(theoretical_y_1)
+print(theoretical_x_4)
+print(theoretical_y_4)
+
+pn_labels = ["2", "3", "4", "5", "6", "7"]
+
+plt.title("Grudge elementwise differentiation kernel: FP32")
+ax.loglog(theoretical_x_1, theoretical_y_1, "sy",
+    label="4 device memory accesses model (3 writes, 1 read)", markersize=8)
+ax.loglog(theoretical_x_4, theoretical_y_4, "ob",
+    label="7 device memory accesses model, (3 writes, 4 reads)")
+
+# The same measured rates are plotted against both intensity models.
+ax.loglog(theoretical_x_1, empirical_y, ".r",
+    label="Experimental results assuming 4 accesses")
+for label, x, y in zip(pn_labels, theoretical_x_1, empirical_y):
+    # annotate() takes the anchor point as ``xy``; ``x=`` raised TypeError.
+    ax.annotate(label, xy=(x, y))
+ax.loglog(theoretical_x_4, empirical_y, ".g",
+    label="Experimental results assuming 7 accesses")
+for label, x, y in zip(pn_labels, theoretical_x_4, empirical_y):
+    ax.annotate(label, xy=(x, y))
+
+plt.ylabel("GFLOP/s")
+# The horizontal axis is arithmetic intensity (flops per byte accessed).
+plt.xlabel("Flops per byte")
+plt.legend()
+plt.show()
diff --git a/grudge/loopy_dg_kernels/run_tests.py b/grudge/loopy_dg_kernels/run_tests.py
new file mode 100644
index 000000000..e2f46fb58
--- /dev/null
+++ b/grudge/loopy_dg_kernels/run_tests.py
@@ -0,0 +1,1296 @@
+import numpy as np
+
+import pyopencl as cl
+import pyopencl.array
+import pyopencl.clrandom
+
+import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
+from grudge.loopy_dg_kernels import apply_transformation_list
+from pyopencl.tools import ImmediateAllocator, MemoryPool
+#from loopy.kernel.data import AddressSpace
+
+"""
+import pycuda.gpuarray as cuarray
+import pycuda.driver as drv
+import pycuda.tools
+import pycuda.autoinit
+from pycuda.compiler import SourceModule
+from pycuda.curandom import rand as curand
+"""
+
+from modepy import equidistant_nodes
+from pytools.obj_array import make_obj_array
+
+import hjson
+import time
+#from math import ceil
+import sys
+
+# setup
+# -----
+lp.set_caching_enabled(False)
+import loopy.options
+loopy.options.ALLOW_TERMINAL_COLORS = False
+
+from grudge.loopy_dg_kernels import (gen_diff_knl, gen_diff_knl_fortran2,
+    apply_transformation_list, gen_elwise_linear_knl, gen_face_mass_knl, gen_face_mass_knl_merged)
+from grudge.grudge_tags import (IsDOFArray, IsSepVecDOFArray, IsOpArray,
+    IsSepVecOpArray, IsFaceDOFArray, IsFaceMassOpArray, IsVecDOFArray, IsVecOpArray, IsFourAxisDOFArray)
+import  grudge.grudge_array_context as gac#import set_memory_layout
+
+def testBandwidth(fp_format=np.float32, nruns=100):
+    """Time a loopy copy kernel over many array lengths and print bandwidth.
+
+    For each length ``n`` a ``(2, n)`` array of *fp_format* values is copied
+    *nruns* times after two untimed warm-up launches.  One line of the form
+    ``"<bytes moved> <GB/s>"`` is printed per length, followed by
+    ``INCORRECT COPY`` if the output differs from the input.
+
+    :arg fp_format: element dtype; 8 bytes per entry are assumed for
+        ``np.float64`` and 4 bytes for anything else.
+    :arg nruns: number of timed kernel launches per length.
+    """
+    from pyopencl.array import sum as clsum
+
+    platform = cl.get_platforms()
+    my_gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU)
+    ctx = cl.create_some_context(interactive=True)
+    # Profiling must be enabled because timings are read from evt.profile.
+    queue = cl.CommandQueue(ctx,
+        properties=cl.command_queue_properties.PROFILING_ENABLE)
+    mem_pool = MemoryPool(ImmediateAllocator(queue))
+
+    knl = lp.make_copy_kernel("c,c", old_dim_tags="c,c")
+    knl = lp.add_dtypes(knl, {"input": fp_format, "output": fp_format})
+    knl = knl.copy(target=lp.PyOpenCLTarget(my_gpu_devices[0]))
+    n0 = 2
+    knl = lp.split_iname(knl, "i1", 256, inner_tag="l.0", outer_tag="g.0",
+        slabs=(0, 1))
+
+    fp_bytes = 8 if fp_format == np.float64 else 4
+
+    # Lengths to test: every power of two up to 2**28 plus a sequence that
+    # grows by a factor of 1.5 (rounded up) starting from 1.
+    max_floats = 2**28
+    lengths = {2**i for i in range(29)}
+    float_count = 1
+    while float_count <= max_floats:
+        lengths.add(float_count)
+        float_count = int(np.ceil(float_count*1.5))
+    len_list = sorted(lengths)
+
+    print(len_list)
+
+    for n in len_list:
+        kern = lp.fix_parameters(knl, n0=n0, n1=n)
+        inpt = cl.clrandom.rand(queue, (n0, n), dtype=fp_format)
+        outpt = cl.array.Array(queue, (n0, n), dtype=fp_format,
+            allocator=mem_pool)
+
+        # Untimed warm-up launches.
+        for _warm in range(2):
+            kern(queue, input=inpt, output=outpt)
+
+        events = []
+        for _run in range(nruns):
+            evt, _out = kern(queue, input=inpt, output=outpt)
+            events.append(evt)
+
+        cl.wait_for_events(events)
+        # Profiling timestamps are divided by 1e9 to obtain seconds.
+        elapsed = sum(evt.profile.end - evt.profile.start for evt in events)
+        dt = elapsed / nruns / 1e9
+
+        # Each launch reads and writes every entry once.
+        nbytes_transferred = 2*fp_bytes*n*n0
+        bandwidth = nbytes_transferred / dt / 1e9
+        print("{} {}".format(nbytes_transferred, bandwidth))
+
+        # Sum absolute differences so that errors of opposite sign cannot
+        # cancel each other out.
+        if clsum(abs(inpt - outpt)).get() != 0:
+            print("INCORRECT COPY")
+
+
+def test_face_mass_merged(kern, backend="OPENCL", nruns=10, warmup=True):
+    """Build, dump and time *kern* on random data using OpenCL.
+
+    Array sizes and the dtype are read from the kernel arguments named
+    ``vec`` and ``mat``.  The built program binary is written to
+    ``ptx.ptx`` in the current directory.
+
+    :arg kern: loopy kernel called with arguments ``result``, ``mat`` and
+        ``vec``.
+    :arg backend: ``"CUDA"`` prints ``Not supported`` and exits; any other
+        value takes the OpenCL path.
+    :arg nruns: number of timed launches.
+    :arg warmup: if true, two untimed launches are run first.
+    :returns: ``((B_dev, A_dev, X_dev), avg_time)`` where *avg_time* is the
+        summed profiling time divided by 1e9 and by *nruns*.
+    """
+    kern = lp.set_options(kern, "no_numpy")
+    kern = lp.set_options(kern, "return_dict")
+    for arg in kern.args:
+        if arg.name == "vec":
+            fp_format = arg.dtype
+            n_elem, n_in = arg.shape
+        elif arg.name == "mat":
+            n_out, _ = arg.shape
+
+    if backend == "CUDA":
+        print("Not supported")
+        sys.exit()
+
+    platform = cl.get_platforms()
+    my_gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU)
+    ctx = cl.create_some_context(interactive=True)
+    # Profiling must be enabled because timings are read from evt.profile.
+    queue = cl.CommandQueue(ctx,
+        properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    kern = lp.set_options(kern, "write_code")  # Output code before editing it
+    kern = kern.copy(target=lp.PyOpenCLTarget(my_gpu_devices[0]))
+
+    # Build the generated device code separately so its binary can be dumped.
+    code = lp.generate_code_v2(kern).device_code()
+    prog = cl.Program(ctx, code).build()
+    ptx = prog.get_info(cl.program_info.BINARIES)[0]
+    # A plain .decode(errors="ignore") reportedly breaks with pocl, hence
+    # the encoding detection through bs4.
+    from bs4 import UnicodeDammit
+    dammit = UnicodeDammit(ptx)
+    # The context manager closes the file even if the write fails.
+    with open("ptx.ptx", "w") as f:
+        f.write(dammit.unicode_markup)
+
+    mem_pool = MemoryPool(ImmediateAllocator(queue))
+
+    X_dev = cl.array.Array(queue, (n_elem, n_in), dtype=fp_format, order="F",
+        allocator=mem_pool)
+    cl.clrandom.fill_rand(X_dev, queue=queue)
+    B_dev = cl.array.Array(queue, (n_elem, n_out), dtype=fp_format,
+        allocator=mem_pool, order="F")
+    A_dev = cl.clrandom.rand(queue, (n_out, n_in), dtype=fp_format)
+
+    if warmup:
+        for _warm in range(2):
+            kern(queue, result=B_dev, mat=A_dev, vec=X_dev)
+        queue.finish()
+
+    events = []
+    for _run in range(nruns):
+        evt, _out = kern(queue, result=B_dev, mat=A_dev, vec=X_dev)
+        events.append(evt)
+
+    cl.wait_for_events(events)
+    sum_time = 0.0
+    for evt in events:
+        sum_time += evt.profile.end - evt.profile.start
+    sum_time = sum_time / 1e9
+
+    avg_time = sum_time / nruns
+
+    return (B_dev, A_dev, X_dev), avg_time
+
+# Maybe the queue could also be a cuda stream? Could use the type of that to
+# distinguish between CUDA and OpenCL possibly
+# This hardcodes the memory layout, should probably instead retrieve it from somewhere on a per
+# tag basis
+
+#cache_arg_dict = {}
+def generic_test(queue, kern, backend="OPENCL", nruns=10, warmup=True):
+    """Run *kern* on freshly allocated device arrays and time it.
+
+    One ``cl.array.Array`` is allocated per kernel argument, using byte
+    strides derived from the argument's ``dim_tags``.  Arrays for arguments
+    that are not outputs are filled with random data.  An argument named
+    ``indices`` is filled with values between 0 and the leading extent of
+    the argument named ``ary``.
+
+    :arg queue: command queue; timings are read from ``evt.profile``, so it
+        presumably has to be created with profiling enabled.
+    :arg kern: loopy program whose ``default_entrypoint`` is executed.
+    :arg backend: ``"CUDA"`` prints a message and exits; any other value
+        takes the OpenCL path.
+    :arg nruns: number of timed launches.
+    :arg warmup: if true, two untimed launches are run first.
+    :returns: ``(arg_dict, avg_time)`` where *arg_dict* maps argument names
+        to device arrays and *avg_time* is the summed profiling time divided
+        by 1e9 and by *nruns*.
+    :raises TypeError: for an argument that is not an ``lp.ArrayArg``.
+    """
+    kern = lp.set_options(kern, "no_numpy")
+    kern = lp.set_options(kern, "return_dict")
+
+    if backend == "CUDA":
+        print("CUDA not supported")
+        sys.exit()
+
+    mem_pool = MemoryPool(ImmediateAllocator(queue))
+    entrypoint_args = kern.default_entrypoint.args
+    arg_dict = {}
+
+    # Fill arrays with random data
+    # Could probably move this to a separate function and memoize it
+    for arg in entrypoint_args:
+        print(arg)
+        print(arg.dim_tags)
+        fp_bytes = arg.dtype.numpy_dtype.itemsize
+        strides = [fp_bytes*entry.stride for entry in arg.dim_tags]
+
+        # The separated-vector layouts have never been verified; stop rather
+        # than run with strides that may be wrong.
+        if IsSepVecDOFArray() in arg.tags:
+            print(arg)
+            print(arg.dim_tags)
+            print("My strides:", strides)
+            print("VERIFY IF STRIDES IS CORRECT FOR SEPVECDOFARRAY")
+            sys.exit()
+        if IsSepVecOpArray() in arg.tags:
+            print(arg)
+            print(arg.dim_tags)
+            print("My strides:", strides)
+            print("VERIFY IF STRIDES IS CORRECT FOR SEPVECOPARRAY")
+            sys.exit()
+
+        # Previously an unsupported argument silently reused the array left
+        # over from the preceding iteration (or raised UnboundLocalError).
+        if not isinstance(arg, lp.ArrayArg):
+            raise TypeError(
+                f"Cannot allocate test data for argument '{arg.name}' "
+                f"of type {type(arg).__name__}")
+
+        print(f"Giving '{arg.name}' strides {strides}")
+        array = cl.array.Array(queue, arg.shape, arg.dtype, strides=strides,
+            allocator=mem_pool)
+        print(arg.name)
+
+        if not arg.is_output:
+            # Handle generating random indices for resampling kernels
+            # This functionality should probably be moved to a separate
+            # test function.
+            if arg.name == "indices":
+                data_arg_shape = None
+                for data_arg in entrypoint_args:
+                    if data_arg.name == "ary":
+                        data_arg_shape = data_arg.shape[0]
+
+                cl.clrandom.fill_rand(array, queue=queue, a=0,
+                    b=data_arg_shape)
+            else:
+                cl.clrandom.fill_rand(array, queue=queue)
+
+        arg_dict[arg.name] = array
+
+    if warmup:
+        for _warm in range(2):
+            kern(queue, **arg_dict)
+        queue.finish()
+
+    events = []
+    for _run in range(nruns):
+        evt, _out = kern(queue, **arg_dict)
+        events.append(evt)
+
+    cl.wait_for_events(events)
+    sum_time = 0.0
+    for evt in events:
+        sum_time += evt.profile.end - evt.profile.start
+    sum_time = sum_time / 1e9
+
+    avg_time = sum_time / nruns
+
+    return arg_dict, avg_time
+
+
+def analyze_knl_bandwidth(knl, avg_time):
+    """Estimate the bandwidth of *knl* from the sizes of its arguments.
+
+    The byte count is the sum over all entrypoint arguments of
+    ``itemsize * prod(shape)``, i.e. every argument entry is counted once.
+    Each argument's name, shape and dtype class are printed along the way,
+    followed by a summary line.
+
+    :arg knl: loopy program; its ``default_entrypoint.args`` are inspected.
+    :arg avg_time: run time used as the divisor.
+    :returns: byte count divided by *avg_time* and by 1e9.
+    """
+    # Would probably be better to use the memory footprint
+    # (lp.gather_access_footprint_bytes) if it can be made to work.
+    total_bytes = 0
+    for kernel_arg in knl.default_entrypoint.args:
+        for field in (kernel_arg.name, kernel_arg.shape,
+                type(kernel_arg.dtype)):
+            print(field)
+        n_entries = np.prod((kernel_arg.shape))
+        total_bytes += kernel_arg.dtype.dtype.itemsize * n_entries
+
+    bw = total_bytes / avg_time / 1e9
+    print(f"Time: {avg_time}, Bytes: {total_bytes}, Bandwidth: {bw} GB/s")
+    return bw
+
+
+def analyze_FLOPS(knl, avg_time, max_gflops=None):
+    """Print and return the floating point rate achieved by *knl*.
+
+    The operation count is the sum of all entries of ``lp.get_op_map``
+    (operations inside subscripts excluded), each evaluated with an empty
+    parameter dictionary.
+
+    :arg knl: loopy program to analyze.
+    :arg avg_time: run time used as the divisor.
+    :arg max_gflops: optional peak rate; when given, the fraction of peak
+        is printed and returned as well.
+    :returns: ``(gflop_rate, frac_peak_gflops)``; the second entry is
+        ``None`` when *max_gflops* is ``None``.
+    """
+    op_map = lp.get_op_map(knl, count_within_subscripts=False, subgroup_size=1)
+    map_flops = sum(val.eval_with_dict({}) for val in op_map.values())
+
+    gflop_rate = (map_flops / avg_time) * 1e-9
+    print("GFLOP/s: " + str(gflop_rate))
+
+    frac_peak_gflops = None
+    if max_gflops is not None:
+        print("Peak GFLOP/s: " + str(max_gflops))
+        frac_peak_gflops = gflop_rate / max_gflops
+        print("Percent peak: " + str(100*(frac_peak_gflops)))
+
+    print()
+
+    return gflop_rate, frac_peak_gflops
+
+
+def verifyResult(B_dev1, B_dev2, B_dev3, A_dev1, A_dev2, A_dev3, X_dev):
+    """Print relative errors of three results against ``A @ X``.
+
+    Every argument must provide ``get()`` returning a host array.  For each
+    pair ``(A_devN, B_devN)`` the norm of ``A @ X - B`` divided by the norm
+    of ``A @ X`` is printed.  For the first pair the fraction of nonzero
+    entries of the scaled error matrix is printed too.
+
+    Sets numpy's print threshold to ``sys.maxsize`` as a side effect.
+    """
+    X_host = X_dev.get()
+    host_pairs = [
+        (A_dev1.get(), B_dev1.get()),
+        (A_dev2.get(), B_dev2.get()),
+        (A_dev3.get(), B_dev3.get()),
+    ]
+    np.set_printoptions(threshold=sys.maxsize)
+
+    A_host1, B_host1 = host_pairs[0]
+    errMat = ((A_host1 @ X_host) - B_host1) / np.linalg.norm(A_host1 @ X_host)
+    # errMat.size replaces n_out*n_elem, which were never defined here and
+    # raised NameError.
+    print("Fraction Nonzero: " + str(np.count_nonzero(errMat)/errMat.size))
+
+    for number, (A_host, B_host) in enumerate(host_pairs, start=1):
+        expected = A_host @ X_host
+        rel_err = np.linalg.norm(expected - B_host) / np.linalg.norm(expected)
+        print(f"Norm{number}: " + str(rel_err))
+
+
+def verifyResultFortran(B_dev1, B_dev2, B_dev3, A_dev1, A_dev2, A_dev3, X_dev):
+    """Print relative errors of three results against ``(A @ X.T).T``.
+
+    Every argument must provide ``get()`` returning a host array.  The input
+    is transposed before the product and the product is transposed back
+    before it is compared with each result.  For each pair
+    ``(A_devN, B_devN)`` the norm of the difference divided by the norm of
+    the product is printed.  For the first pair the fraction of nonzero
+    entries of the scaled error matrix is printed too.
+
+    Sets numpy's print threshold to ``sys.maxsize`` as a side effect.
+    """
+    X_host = X_dev.get().T
+    host_pairs = [
+        (A_dev1.get(), B_dev1.get()),
+        (A_dev2.get(), B_dev2.get()),
+        (A_dev3.get(), B_dev3.get()),
+    ]
+    np.set_printoptions(threshold=sys.maxsize)
+
+    A_host1, B_host1 = host_pairs[0]
+    errMat = ((A_host1 @ X_host).T - B_host1) / np.linalg.norm(A_host1 @ X_host)
+    # errMat.size replaces n_out*n_elem, which were never defined here and
+    # raised NameError.
+    print("Fraction Nonzero: " + str(np.count_nonzero(errMat)/errMat.size))
+
+    for number, (A_host, B_host) in enumerate(host_pairs, start=1):
+        expected = A_host @ X_host
+        rel_err = (np.linalg.norm(expected.T - B_host)
+            / np.linalg.norm(expected))
+        print(f"Norm{number}: " + str(rel_err))
+
+
+# This can be removed eventually
+def apply_transformations_and_run_test(queue, knl, test_fn, params, tgenerator, max_gflops=None,
+        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95, start_param=None):
+    """Transform *knl* for one parameter set, run it and return the timing.
+
+    This can be removed eventually.
+
+    :arg queue: command queue handed to *test_fn*.
+    :arg knl: loopy program; ``gac.set_memory_layout`` is applied first.
+    :arg test_fn: callable ``test_fn(queue, knl)`` returning
+        ``(device_arrays, avg_time)``.
+    :arg params: parameter set.  With a *tgenerator* it is passed through
+        unchanged; otherwise it must unpack into the five split sizes
+        ``(kio, kii, iio, iii, ji)``.
+    :arg tgenerator: callable mapping *params* to a transformation list, or
+        ``None`` to use the built-in list.
+    :arg max_gflops: unused; analysis is left to the caller.
+    :arg device_memory_bandwidth: unused.
+    :arg gflops_cutoff: unused.
+    :arg bandwidth_cutoff: unused.
+    :arg start_param: unused.
+    :returns: ``(avg_time, trans_list)``.
+    """
+    # Transform and run
+    knl = gac.set_memory_layout(knl)
+
+    # This test used to read an undefined name ``applicator`` and raised
+    # NameError on every call.
+    if tgenerator is not None:
+        trans_list = tgenerator(params)
+    else:
+        kio, kii, iio, iii, ji = params
+        entrypoint_name = knl.default_entrypoint.name
+
+        # Should probably read in eligible transformations from a file instead of using if-statements
+        trans_list = []
+        if "diff" in entrypoint_name:
+            trans_list.append(["tag_inames", ["imatrix: ilp"]])
+
+        trans_list.append(["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs":(0,1)}])
+        trans_list.append(["split_iname", ["iel_inner", kii],
+            {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}])
+        trans_list.append(["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs":(0,0)}])
+        trans_list.append(["split_iname", ["idof_inner", iii],
+            {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}])
+
+        if entrypoint_name == "face_mass":
+            # No prefetch is added for the face mass kernel.
+            pass
+        elif entrypoint_name == "nodes":
+            trans_list.append(["add_prefetch", ["nodes", "j,iel_inner_outer,iel_inner_inner"],
+                {"temporary_name":"vecf", "default_tag":"l.auto"}])
+            trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+        elif "resample_by_mat" in entrypoint_name:
+            # Indirection may prevent prefetching
+            pass
+        else:
+            trans_list.append(["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"],
+                {"temporary_name":"vecf", "default_tag":"l.auto"}])
+            trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+
+        trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}])
+        trans_list.append(["add_inames_for_unused_hw_axes"])
+
+    knl = apply_transformation_list(knl, trans_list)
+
+    # Execute; bandwidth and flop-rate analysis should be done elsewhere
+    dev_arrays, avg_time = test_fn(queue, knl)
+
+    return avg_time, trans_list
+
+def run_single_param_set(queue, knl_base, tlist_generator, params, test_fn, max_gflops=None,
+        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95):
+    """Transform, run and analyze *knl_base* for one parameter set.
+
+    :arg queue: command queue handed to *test_fn*.
+    :arg knl_base: untransformed loopy program.
+    :arg tlist_generator: callable ``tlist_generator(params, knl=...)``
+        returning a transformation list.
+    :arg params: parameter set passed to *tlist_generator*.
+    :arg test_fn: callable ``test_fn(queue, knl)`` returning
+        ``(device_arrays, avg_time)``.
+    :arg max_gflops: optional peak rate used to compute a fraction of peak.
+    :arg device_memory_bandwidth: optional peak bandwidth used to compute a
+        fraction of peak.
+    :arg gflops_cutoff: fraction of *max_gflops* at or above which a note
+        is printed.
+    :arg bandwidth_cutoff: fraction of *device_memory_bandwidth* at or
+        above which a note is printed.
+    :returns: ``(avg_time, trans_list, data)``.  *data* is
+        ``(bandwidth, gflop_rate, frac_peak_bandwidth, frac_peak_gflops)``
+        when both peak figures are given and ``None`` otherwise.
+    """
+    trans_list = tlist_generator(params, knl=knl_base)
+    knl = apply_transformation_list(knl_base, trans_list)
+    dev_arrays, avg_time = test_fn(queue, knl)
+
+    # Analyze once.  The previous code called analyze_FLOPS a second time
+    # with swapped arguments and compared its tuple result to a float.
+    gflops, frac_peak_gflops = analyze_FLOPS(knl, avg_time, max_gflops=max_gflops)
+    bw = analyze_knl_bandwidth(knl, avg_time)
+
+    # The cutoffs only produce a note.  The old early returns referenced
+    # undefined names and broke the 3-tuple contract the caller unpacks.
+    frac_peak_GBps = None
+    if device_memory_bandwidth is not None:
+        frac_peak_GBps = bw / device_memory_bandwidth
+        if frac_peak_GBps >= bandwidth_cutoff:
+            # Should validate result here
+            print("Performance is within tolerance of peak bandwidth.")
+
+    # This is incorrect for general einsum kernels
+    if frac_peak_gflops is not None and frac_peak_gflops >= gflops_cutoff:
+        # Should validate result here
+        print("Performance is within tolerance of peak flop rate.")
+
+    data = None
+    if device_memory_bandwidth is not None and max_gflops is not None:
+        data = (bw, gflops, frac_peak_GBps, frac_peak_gflops)
+
+    return (avg_time, trans_list, data)
+
+
+def exhaustive_search_v2(queue, knl, test_fn, pspace_generator, tlist_generator, time_limit=float("inf"), max_gflops=None, 
+        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95, start_param=None):
+
+
+    #param_list = gen_autotune_list(queue, knl, start_param=start_param)
+
+    #Probably don't need all of these parameters
+    #apply_transformations_and_run_test(queue, knl, test_fn, params, max_gflops=None,
+	    #device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95, start_param=None):
+	
+    # Should probably obtain device_memory_bandwidth from empirical tests
+
+    # Also fixes the parameters. Maybe that should be a separate function   
+    knl = gac.set_memory_layout(knl)
+
+    knl_base = knl.copy()
+
+    params_list = pspace_generator(queue, knl, start_param=start_param)
+    #print(knl)
+    #print(len(params_list))
+    
+    result_list = []
+    start = time.time()
+
+    # Iterate over parameter space coordinates
+    # If serial run this otherwise, run the parallel autotuner
+    # Should probably make separate function for each.
+    for params in params_list:
+        print(f"Currently testing: {params}")
+        """
+        trans_list = tlist_generator(params, knl=knl)
+        knl = apply_transformation_list(knl_base, trans_list)
+        dev_arrays, avg_time = test_fn(queue, knl)
+
+        # Should this return the fraction of peak of should that be calculated in this function?
+        gflops, frac_peak_gflops = analyze_FLOPS(knl, avg_time, max_gflops=max_gflops)
+        bw = analyze_knl_bandwidth(knl, avg_time)
+
+        if device_memory_bandwidth is not None:  # noqa
+            bw = analyze_knl_bandwidth(knl, avg_time)
+            frac_peak_GBps = bw / device_memory_bandwidth
+            if frac_peak_GBps  >= bandwidth_cutoff:  # noqa
+                # Should validate result here
+                print("Performance is within tolerance of peak bandwith. Terminating search")  # noqa
+                return choices
+
+        # This is incorrect for general einsum kernels
+        if max_gflops is not None:
+            frac_peak_gflops = analyze_FLOPS(knl, max_gflops, avg_time)
+            if frac_peak_gflops >= gflops_cutoff:
+                # Should validate result here
+                print("Performance is within tolerance of peak bandwith or flop rate. Terminating search")  # noqa
+                return choices
+
+        data = None
+        if device_memory_bandwidth is not None and max_gflops is not None:
+            data = (frac_peak_GBps*device_memory_bandwidth, 
+                    frac_peak_gflops*max_gflops, 
+                    frac_peak_GBps, 
+                    frac_peak_gflops)
+        """
+
+        avg_time, trans_list, data = run_single_param_set(queue, knl_base, tlist_generator, params, test_fn, max_gflops=max_gflops, device_memory_bandwidth=device_memory_bandwidth)
+        result_list.append((avg_time, trans_list, data))
+        print(avg_time)
+        #result_list.append(data)
+        #f.write(str(data) + "\n")
+
+        #if avg_time < avg_time_saved:
+        #    avg_time_saved = avg_time
+        #    result_saved = choices
+        #    result_saved_list = trans_list
+        
+        if time.time() - start > time_limit: 
+            break
+            #result_list.sort()
+            #print("Avg_time, Peak_BW, Peak_GFLOPS, Frac_peak_bandwidth, Frac_peak_GFlops")
+            #for entry in result_list:
+            #    print(entry)
+            #print()
+
+        #return result_saved_list
+        #return result_saved
+
+    #print("Avg_time, Peak_BW, Peak_GFLOPS, Frac_peak_bandwidth, Frac_peak_GFlops")
+    #for entry in result_list:
+    #    print(entry)
+    #print()
+
+
+    
+    #print("Suggested loop splittings")
+    #print(result_saved)
+    #print(f"iel: {kio}")
+    #print(f"iel_inner: {kii}")
+    #print(f"idof: {iio}")
+    #print(f"idof_inner: {iii}")
+    #print(f"j: {ji}")
+ 
+    #return result_saved_list
+    #return result_saved
+
+    # Could save the highest performing function, but often one wants to see the results
+    # over the entire parameter space
+    key_func = lambda result: result[0]
+    sorted_results = sorted(result_list, key=key_func)
+    return sorted_results[0]
+  
+
+def exhaustive_search(queue, knl, test_fn, time_limit=float("inf"), max_gflops=None, 
+        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95, start_param=None):
+
+    # Should probably obtain device_memory_bandwidth from empirical tests
+
+    # Imports
+    from grudge.grudge_tags import ParameterValue
+
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+
+    avg_time_saved = float("inf")
+    result_saved = None
+
+    transform_list = []
+
+    for arg in knl.default_entrypoint.args:
+        if "resample_by_mat" not in knl.default_entrypoint.name:
+            if IsDOFArray() in arg.tags:
+                n_elem, n_out = arg.shape
+                fp_bytes = arg.dtype.dtype.itemsize
+                #n_in = n_out # Not true for non-square
+            elif IsSepVecOpArray() in arg.tags:
+                n_mat, n_out, n_in = arg.shape
+            elif IsOpArray() in arg.tags:
+                n_out, n_in = arg.shape
+            elif IsFaceDOFArray() in arg.tags:
+                nfaces, n_elem, n_in = arg.shape
+        else:
+            if IsOpArray() in arg.tags:
+                n_out, n_in = arg.shape
+                fp_bytes = arg.dtype.dtype.itemsize
+
+    # Also fixes the parameters    
+    knl = gac.set_memory_layout(knl)
+
+    tested = []
+
+    if start_param is not None:
+        kio_s, kii_s, iio_s, iii_s, ji_s = start_param
+    else:
+        kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None)
+
+    #k_inner_inner_opt = k_inner_inner_options(start_val=kii_s)
+    #kii_s = None
+    #j_inner_opt = j_inner_options(n_in)
+    knl_base = knl.copy()
+
+    avg_time_saved = float("inf")
+    result_saved = None
+    result_saved_list = []
+    
+    # Iterate over five search dimensions
+    result_list = []
+    start = time.time()
+    with open("output.txt", "a") as f:
+        for kii in k_inner_inner_options(start_val=kii_s):
+            # This prevents shared memory from overflowing when running with the face mass kernel
+            if knl.default_entrypoint.name == "face_mass":
+                n_in_2 = n_in * nfaces
+            else:
+                n_in_2 = n_in
+            for kio in k_inner_outer_options(n_in_2, kii, local_mem_size, fp_bytes=fp_bytes,start_val=kio_s):
+                kio_s = None # Set to None so will form the full set the next time around
+                for iii in i_inner_inner_options(n_out, kii,
+                        max_work_group_size=max_work_group_size, start_val=iii_s):
+                    iii_s = None
+                    for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
+                        iio_s = None
+                        for ji in j_inner_options(n_in, start_val=ji_s):
+                            ji_s = None
+                            print((kio, kii, iio, iii, ji))
+                            # Transform and run
+                            knl = knl_base.copy()
+                            knl = lp.split_iname(knl, "iel", kio, outer_tag="g.0", slabs=(0,1))
+                            knl = lp.split_iname(knl, "iel_inner", kii, outer_tag="ilp", inner_tag="l.0", slabs=(0,1))
+                            knl = lp.split_iname(knl, "idof", iio, outer_tag="g.1", slabs=(0,0))
+                            knl = lp.split_iname(knl, "idof_inner", iii, outer_tag="ilp", inner_tag="l.1", slabs=(0,0))        
+
+                            if knl.default_entrypoint.name == "face_mass":
+                                knl = lp.add_prefetch(knl, "vec", "f,j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                                #knl = lp.tag_array_axes(knl, "vecf", "N1,N0,N2") # Should be this but breaks
+                            elif knl.default_entrypoint.name == "nodes":
+                                knl = lp.add_prefetch(knl, "nodes", "j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                                knl = lp.tag_array_axes(knl, "vecf", "f,f")
+                            elif "resample_by_mat" in knl.default_entrypoint.name: # Reads are scattered so prefetching is difficult
+                                pass
+                                #knl = lp.add_prefetch(knl, "ary", "j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                                #knl = lp.tag_array_axes(knl, "vecf", "f,f")                           
+                            else:   
+                                knl = lp.add_prefetch(knl, "vec", "j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                                knl = lp.tag_array_axes(knl, "vecf", "f,f")
+
+                            knl = lp.split_iname(knl, "j", ji, outer_tag="for", inner_tag="for")
+                            knl = lp.add_inames_for_unused_hw_axes(knl)
+
+
+                            # Change this to just use the transformation list instead of applying the transformations
+                            # directly
+                            trans_list = []
+                            if "diff" in knl.default_entrypoint.name:
+                                trans_list.append(["tag_inames", ["imatrix: ilp"]])
+                            trans_list.append(["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs":(0,1)}])
+                            trans_list.append(["split_iname", ["iel_inner", kii], 
+                                {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}])
+                            trans_list.append(["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs":(0,0)}])
+                            trans_list.append(["split_iname", ["idof_inner", iii], 
+                                {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}])
+
+                            if knl.default_entrypoint.name == "face_mass":
+                                pass
+                                #trans_list.append(["add_prefetch", ["vec", "f,j,iel_inner_outer,iel_inner_inner"],
+                                #    {"temporary_name":"vecf", "default_tag":"l.auto"}])
+                                #trans_list.append(["tag_array_axes", ["vecf", "N1,N0,N2"]])
+                            elif knl.default_entrypoint.name == "nodes":
+                                trans_list.append(["add_prefetch", ["nodes", "j,iel_inner_outer,iel_inner_inner"],
+                                    {"temporary_name":"vecf", "default_tag":"l.auto"}])
+                                trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+                            elif "resample_by_mat" in knl.default_entrypoint.name:
+                                # Indirection may prevent prefetching
+                                pass
+                            else:
+                                trans_list.append(["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"],
+                                    {"temporary_name":"vecf", "default_tag":"l.auto"}])
+                                trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+
+                            trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}])
+                            trans_list.append(["add_inames_for_unused_hw_axes"]) 
+
+                            print(knl.default_entrypoint.name)
+                            print(trans_list)
+
+                            # Execute and analyze the results
+                            dev_arrays, avg_time = test_fn(queue, knl)
+
+                            choices = (kio, kii, iio, iii, ji)
+                            """
+                            if device_memory_bandwidth is not None:  # noqa
+                                #frac_peak_gflops, frac_peak_GBps = analyzeResult(n_out,
+                                #    n_in, n_elem, max_gflops, device_memory_bandwidth,
+                                #    avg_time)
+                                bw  = analyze_knl_bandwidth(knl, avg_time)
+                                frac_peak_GBps = bw / device_memory_bandwidth
+                                result_list.append((frac_peak_GBps, (kio, kii, iio, iii, ji)))
+                                if frac_peak_GBps  >= bandwidth_cutoff:  # noqa
+                                    # Should validate result here
+                                    pass
+                                    #print("Performance is within tolerance of peak bandwith. Terminating search")  # noqa
+                                    #return (kio, kii, iio, iii, ji)
+                            """
+                            """
+                            # TODO: Fix flop calculation
+                            if max_gflops is not None and device_memory_bandwidth is not None:  # noqa
+                                frac_peak_gflops, frac_peak_GBps = analyzeResult(n_out,
+                                    n_in, n_elem, max_gflops, device_memory_bandwidth,
+                                    avg_time)
+                                if frac_peak_gflops >= gflops_cutoff or frac_peak_GBps  >= bandwidth_cutoff:  # noqa
+                                    # Should validate result here
+                                    print("Performance is within tolerance of peak bandwith or flop rate. Terminating search")  # noqa
+                                    return (kio, kii, iio, iii, ji)
+                            """
+                            print(choices)
+                            if device_memory_bandwidth is not None:  # noqa
+                                bw = analyze_knl_bandwidth(knl, avg_time)
+                                frac_peak_GBps = bw / device_memory_bandwidth
+                                if frac_peak_GBps  >= bandwidth_cutoff:  # noqa
+                                    # Should validate result here
+                                    print("Performance is within tolerance of peak bandwith. Terminating search")  # noqa
+                                    return choices
+                    
+                            if max_gflops is not None:
+                                frac_peak_gflops = analyze_FLOPS(knl, max_gflops, avg_time)
+                                if frac_peak_gflops >= gflops_cutoff:
+                                    # Should validate result here
+                                    print("Performance is within tolerance of peak bandwith or flop rate. Terminating search")  # noqa
+                                    return choices
+
+                            if device_memory_bandwidth is not None and max_gflops is not None:
+                                data = (avg_time, 
+                                                    frac_peak_GBps*device_memory_bandwidth, 
+                                                    frac_peak_gflops*max_gflops, 
+                                                    frac_peak_GBps, 
+                                                    frac_peak_gflops, 
+                                                    (kio, kii, iio, iii, ji))
+                                result_list.append(data)
+                                f.write(str(data) + "\n")
+                                
+                            if avg_time < avg_time_saved:
+                                avg_time_saved = avg_time
+                                result_saved = choices
+                                result_saved_list = trans_list
+                            if time.time() - start > time_limit: 
+                                result_list.sort()
+                                print("Avg_time, Peak_BW, Peak_GFLOPS, Frac_peak_bandwidth, Frac_peak_GFlops")
+                                for entry in result_list:
+                                    print(entry)
+                                print()
+
+       
+                                #return result_saved_list
+                                return result_saved
+
+
+    result_list.sort()
+
+    print("Avg_time, Peak_BW, Peak_GFLOPS, Frac_peak_bandwidth, Frac_peak_GFlops")
+    for entry in result_list:
+        print(entry)
+    print()
+
+
+    
+    print("Suggested loop splittings")
+    print(result_saved)
+    #print(f"iel: {kio}")
+    #print(f"iel_inner: {kii}")
+    #print(f"idof: {iio}")
+    #print(f"idof_inner: {iii}")
+    #print(f"j: {ji}")
+ 
+    return result_saved_list
+    #return result_saved
+
+def random_search(queue, knl, test_fn, time_limit=float("inf"), max_gflops=None, 
+        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95):
+
+    # Imports
+    from random import choice
+    from grudge.grudge_tags import ParameterValue
+
+    local_mem_size = queue.device.local_mem_size
+    max_work_group_size = queue.device.max_work_group_size    
+
+    avg_time_saved = float("inf")
+    result_saved = None
+    result_saved_list = []
+
+    # Get sizes
+    for arg in knl.default_entrypoint.args:
+        if "resample_by_mat" not in knl.default_entrypoint.name:
+            if IsDOFArray() in arg.tags:
+                n_elem, n_out = arg.shape
+                fp_bytes = arg.dtype.dtype.itemsize
+                #n_in = n_out
+            elif IsSepVecOpArray() in arg.tags:
+                n_mat, n_out, n_in = arg.shape
+            elif IsOpArray() in arg.tags:
+                n_out, n_in = arg.shape
+            elif IsFaceDOFArray() in arg.tags:
+                nfaces, n_elem, n_in = arg.shape
+        else:
+            if IsOpArray() in arg.tags:
+                n_out, n_in = arg.shape
+                fp_bytes = arg.dtype.dtype.itemsize
+
+    # Also fixes the parameters
+    knl = gac.set_memory_layout(knl)
+
+    tested = []
+
+    k_inner_inner_opt = k_inner_inner_options()
+    j_inner_opt = j_inner_options(n_in)
+    knl_base = knl.copy()
+    result_list = []
+
+    start = time.time()
+    while(time.time() - start < time_limit):
+        # Can be more intelligent by ensuring choices are not run multiple times
+        # Maybe could use expressions
+        kii = choice(k_inner_inner_opt)
+        if knl.default_entrypoint.name == "face_mass":
+            kio = choice(k_inner_outer_options(n_in*nfaces, kii, local_mem_size, fp_bytes=fp_bytes))
+        else:
+            kio = choice(k_inner_outer_options(n_in, kii, local_mem_size, fp_bytes=fp_bytes))
+        iii = choice(i_inner_inner_options(n_out, kii, max_work_group_size=max_work_group_size))
+        iio = choice(i_inner_outer_options(n_out, iii))
+        ji = choice(j_inner_opt)
+        choices = (kio, kii, iio, iii, ji)
+
+        if choices not in tested:
+            print(choices)
+            knl = knl_base.copy()
+            if "diff" in knl.default_entrypoint.name:
+                knl = lp.tag_inames(knl, "imatrix: ilp")
+            knl = lp.split_iname(knl, "iel", kio, outer_tag="g.0", slabs=(0,1))
+            knl = lp.split_iname(knl, "iel_inner", kii, outer_tag="ilp", inner_tag="l.0", slabs=(0,1))
+            knl = lp.split_iname(knl, "idof", iio, outer_tag="g.1", slabs=(0,0))
+            knl = lp.split_iname(knl, "idof_inner", iii, outer_tag="ilp", inner_tag="l.1", slabs=(0,0))        
+
+            if knl.default_entrypoint.name == "face_mass":
+                knl = lp.add_prefetch(knl, "vec", "f,j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                # N1,N0,N2 and N0,N1,N2 both seem to give memory errors.
+                #knl = lp.tag_array_axes(knl, "vecf", "N1,N0,N2")
+            elif knl.default_entrypoint.name == "nodes":
+                knl = lp.add_prefetch(knl, "nodes", "j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                knl = lp.tag_array_axes(knl, "vecf", "f,f")
+            elif "resample_by_mat" in knl.default_entrypoint.name:
+                pass
+                # Indirection may prevent prefetching
+                #knl = lp.add_prefetch(knl, "ary", "j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                #knl = lp.tag_array_axes(knl, "vecf", "f,f")                           
+            else:   
+                knl = lp.add_prefetch(knl, "vec", "j,iel_inner_outer,iel_inner_inner", temporary_name="vecf", default_tag="l.auto")
+                knl = lp.tag_array_axes(knl, "vecf", "f,f")
+
+            knl = lp.split_iname(knl, "j", ji, outer_tag="for", inner_tag="for")
+            knl = lp.add_inames_for_unused_hw_axes(knl)
+
+            # Change this to just use the transformation list instead of applying the transformations
+            # directly
+            trans_list = []
+            if "diff" in knl.default_entrypoint.name:
+                trans_list.append(["tag_inames", ["imatrix: ilp"]])
+            trans_list.append(["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs":(0,1)}])
+            trans_list.append(["split_iname", ["iel_inner", kii], 
+                {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}])
+            trans_list.append(["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs":(0,0)}])
+            trans_list.append(["split_iname", ["idof_inner", iii], 
+                {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}])
+
+            if knl.default_entrypoint.name == "face_mass":
+                trans_list.append(["add_prefetch", ["vec", "f,j,iel_inner_outer,iel_inner_inner"],
+                    {"temporary_name":"vecf", "default_tag":"l.auto"}])
+                #trans_list.append(["tag_array_axes", ["vecf", "N1,N0,N2"]])
+            elif knl.default_entrypoint.name == "nodes":
+                trans_list.append(["add_prefetch", ["nodes", "j,iel_inner_outer,iel_inner_inner"],
+                    {"temporary_name":"vecf", "default_tag":"l.auto"}])
+                trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+            elif "resample_by_mat" in knl.default_entrypoint.name:
+                # Indirection may prevent prefetching
+                pass
+            else:
+                trans_list.append(["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"],
+                    {"temporary_name":"vecf", "default_tag":"l.auto"}])
+                trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
+
+            trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}])
+            trans_list.append(["add_inames_for_unused_hw_axes"]) 
+            
+            dev_arrays, avg_time = test_fn(queue, knl)
+            tested.append(choices)
+
+            print(choices)
+            if device_memory_bandwidth is not None:  # noqa
+                bw  = analyze_knl_bandwidth(knl, avg_time)
+                frac_peak_GBps = bw / device_memory_bandwidth
+                #result_list.append((frac_peak_GBps, (kio, kii, iio, iii, ji)))
+                if frac_peak_GBps  >= bandwidth_cutoff:  # noqa
+                    # Should validate result here
+                    print("Performance is within tolerance of peak bandwith. Terminating search")  # noqa
+                    return choices
+    
+            if max_gflops is not None:
+                frac_peak_gflops = analyze_FLOPS(knl, max_gflops, avg_time)
+                if frac_peak_gflops >= gflops_cutoff:
+                    # Should validate result here
+                    print("Performance is within tolerance of peak bandwith or flop rate. Terminating search")  # noqa
+                    return choices
+
+            if device_memory_bandwidth is not None and max_gflops is not None:
+                result_list.append((avg_time, frac_peak_GBps*device_memory_bandwidth, frac_peak_gflops*max_gflops,
+                                     frac_peak_GBps, frac_peak_gflops, (kio, kii, iio, iii, ji)))
+
+            if avg_time < avg_time_saved:
+                avg_time_saved = avg_time
+                result_saved = choices
+                result_saved_list = trans_list
+
+    print("Time limit exceeded: returning current best result")
+
+    """
+    print("Suggested loop splittings")
+    print(f"iel: {kio}")
+    print(f"iel_inner: {kii}")
+    print(f"idof: {iio}")
+    print(f"idof_inner: {iii}")
+    print(f"j: {ji}")
+    """    
+
+    result_list.sort()
+
+    print("Avg_time, Peak_BW, Peak_GFLOPS, Frac_peak_bandwidth, Frac_peak_GFlops")
+    #print("Avg time, Frac peak bandwidth, Frac peak GFlops")
+    for entry in result_list:
+        print(entry)
+    print()
+    #print(result_list)
+
+
+    #return result_saved
+    return result_saved_list
+
+def convert(o):
+    if isinstance(o, np.generic): return o.item()
+    raise TypeError
+
+
+def autotune_and_save(queue, search_fn, tlist_generator, pspace_generator,  hjson_file_str, time_limit=np.inf):
+    from hjson import dump
+    try:
+        avg_time, transformations, data = search_fn(queue, program, generic_test, 
+                                    pspace_generator, tlist_generator, time_limit=time_limit)
+    except cl._cl.RuntimeError as e:
+        print(e)
+        print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
+
+    od = {"transformations": transformations}
+    out_file = open(hjson_file_str, "wt+")
+
+    hjson.dump(od, out_file,default=convert)
+    out_file.close()
+    return transformations
+
+
+def get_transformation_id(device_id):
+    hjson_file = open("device_mappings.hjson") 
+    hjson_text = hjson_file.read()
+    hjson_file.close()
+    od = hjson.loads(hjson_text)
+    return od[device_id]
+
+if __name__ == "__main__": 
+    from __init__ import gen_diff_knl, load_transformations_from_file, apply_transformation_list
+    from grudge.execution import diff_prg, elwise_linear_prg, face_mass_prg
+
+    # Test existing optimizations
+    platform = cl.get_platforms()
+    my_gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU)
+    #ctx = cl.Context(devices=my_gpu_devices)
+    ctx = cl.create_some_context(interactive=True)
+    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    
+    # Testing code
+    device_id = "NVIDIA Titan V"
+    tid = get_transformation_id("NVIDIA Titan V")
+    fp_format = np.float64
+    fp_format_dict = {np.float32: (4, "FP32"), np.float64: (8, "FP64"),
+                        np.complex128: (16, "C128")}
+    fp_bytes, fp_string = (8, "FP64") if fp_format == np.float64 else (4, "FP32")
+
+    """
+    to_test = True
+    if to_test:
+        n_elem = 2**22#2**15  # 2**21
+        pn = 5
+        print(len(equidistant_nodes(pn, 3)[1]))
+        n_out = len(equidistant_nodes(pn, 3)[1])
+        n_in = len(equidistant_nodes(pn, 3)[1])
+
+        #settings = exhaustiveSearch(n_in, n_out, n_elem, 4*12*1024, fp_bytes=fp_bytes,
+        #               max_gflops=12288, device_memory_bandwidth=540)
+        settings = randomSearch(n_in, n_out, n_elem, 4*12*1024, time_limit=120,
+                        fp_format=fp_format, max_gflops=12288//2,
+                        device_memory_bandwidth=540)
+        #settings = noSearch(n_in, n_out, n_elem, 4*12*1024, time_limit=180,1
+        #                       fp_bytes=fp_bytes, max_gflops=12288,
+        #                       device_memory_bandwidth=540)
+        print("FINAL RESULTS")
+        print(settings)
+    # Add functionality to write transformations to file
+    """ 
+    """
+    dim_to_file = {1: "diff_1d_transform.hjson", 
+                   2: "diff_2d_transform.hjson",
+                   3: "diff_3d_transform.hjson"}
+
+    bandwidths = []
+    from os import environ
+    for nreg in range(57,58):#range(1, 61):
+        environ['CU_JIT_MAX_REGISTERS'] = str(nreg)
+        for dim in range(3,4):
+            hjson_file = open(dim_to_file[dim])
+            #for i in range(2,8):
+            pn = 5
+            n_out = len(equidistant_nodes(pn, 3)[1])
+            n_in = len(equidistant_nodes(pn, 3)[1]) 
+            n_elem = 178746 # 2**20
+            knl = diff_prg(dim, n_elem, n_out, fp_format) 
+            #knl = gen_diff_knl_fortran2(dim, n_elem, n_out, n_in, fp_format=fp_format)
+            knl = set_memory_layout(knl)
+            knl = lp.set_options(knl, "write_code")
+            trans = load_transformations_from_file(hjson_file, [tid, fp_string, str(n_out)])
+            knl = apply_transformation_list(knl, trans)
+            #print(lp.generate_code_v2(knl).device_code())
+
+            dev_arrays, avg_time = generic_test(queue, knl, nruns=10, warmup=True)
+            #dev_arrays, avg_time = runTest(n_elem, n_in, n_out, kio, kii, iio, iii, ji)
+            bw = analyze_knl_bandwidth(knl, avg_time)
+            bandwidths.append(bw)
+            #analyzeResult(n_out, n_in, n_elem, 12288//2, 540, avg_time, fp_bytes=fp_bytes)
+            print(avg_time)
+            #verifyResult(*dev_arrays)
+    
+    print(knl)
+    for i, entry in enumerate(bandwidths):
+        print(f"{i}, {entry}")
+    #print(bandwidths)
+    """
+    #testBandwidth()
+    #exit()
+    """
+    # Test elwise linear
+    pn = 4
+    n_out = len(equidistant_nodes(pn,3)[1])
+    n_in = n_out
+    n_elem = 178746
+    fp_format = np.float64
+    fp_string = "FP64" if fp_format == np.float64 else "FP32" 
+    knl = elwise_linear_prg(n_elem, n_out, fp_format)
+    #knl = gen_elwise_linear_knl(n_elem, n_in, n_out, fp_format)
+
+    hjson_file = open("elwise_linear_transform.hjson")
+    trans = load_transformations_from_file(hjson_file, [tid, fp_string, str(n_out)])
+
+    knl = set_memory_layout(knl)
+    knl = apply_transformation_list(knl, trans)
+    #print(knl)
+    _, avg_time = generic_test(queue, knl, backend="OPENCL", nruns=10, warmup=True)
+    print(avg_time)
+    analyze_knl_bandwidth(knl, avg_time)
+    """
+    """
+    # Test face_mass            
+    pn = 3
+    nvol_nodes = len(equidistant_nodes(pn,3)[1])
+    nface_nodes = 10
+    #nelements = 2**22
+    nelements = 178746
+    nfaces = 4
+    fp_format = np.float64
+    fp_string = "FP64" if fp_format == np.float64 else "FP32" 
+
+    knl = face_mass_prg(178746, 4, 20, 20, np.float64)
+    knl = set_memory_layout(knl)
+    #knl = gen_face_mass_knl(nelements, nfaces, nvol_nodes, nface_nodes, fp_format)
+    #knl = gen_face_mass_knl_merged(nelements, nfaces, nvol_nodes, nface_nodes, fp_format)
+    # Need to load these from file
+    #hjson_file = open("elwise_linear_transform.hjson")
+    #trans = load_transformations_from_file(hjson_file, [tid, fp_string, str(pn)])
+    #knl = apply_transformation_list(knl, trans)
+    print(knl)
+    _, avg_time = test_face_mass(queue, knl, backend="OPENCL", nruns=10, warmup=True)
+    #_, avg_time = test_face_mass_merged(queue, knl, backend="OPENCL", nruns=10, warmup=True)
+    print(avg_time)
+    analyze_knl_bandwidth(knl, avg_time)
+    """
+
+    # Test order=4 copy
+    """
+    knl = lp.make_copy_kernel("f,f", old_dim_tags="f,f")
+    knl = lp.add_dtypes(knl, {"input": np.float64, "output": np.float64})
+    knl = lp.fix_parameters(knl, {"n0": 178746, "n1": 35})  
+    knl = lp.split_iname(knl, "i0", 48, outer_tag="g.0")
+    knl = lp.split_iname(knl, "i0_inner", 16, outer_tag="ilp", inner_tag="l.0")
+    knl = lp.split_iname(knl, "i1", 35, outer_tag="g.1", inner_tag="l.1")
+    for arg in knl.default_entrypoint.args:
+        if arg.name == "input":
+            arg.tags = IsDOFArray()
+            arg.shape = (178746, 35)
+        if arg.name == "output":
+            arg.tags = IsDOFArray()
+            arg.is_output = True 
+            arg.shape = (178746, 35)
+
+    print(knl)
+    _, avg_time = generic_test(queue, knl)
+    analyze_knl_bandwidth(knl, avg_time)
+    #knl = lp.split_iname(knl, "i1", 1024//2, inner_tag="l.0", outer_tag="g.0", slabs=(0,1))
+    #knl = lp.split_iname(knl, "i1", 1024, inner_tag="l.0", outer_tag="g.0", slabs=(0,1))
+    #knl = lp.split_iname(knl, "i1", 6*16, outer_tag="g.0") 
+    #knl = lp.split_iname(knl, "i1_inner", 16, outer_tag="ilp", inner_tag="l.0", slabs=(0,1)) 
+    #knl = lp.split_iname(knl, "i0", n0, inner_tag="l.1", outer_tag="g.1", slabs=(0,0))
+    """
+   
+
+    #"""
+    # Test autotuner
+    knl = diff_prg(3, 1000000, 3, np.float64)
+    #print(knl)
+    #print(knl.default_entrypoint.domains)
+    #print(knl.default_entrypoint.instructions)
+    #exit()
+    #knl = diff_prg(3, 196608, 10, np.float64)
+    #knl = elwise_linear_prg(24576, 120, np.float64)
+    #dofs = 84
+    #knl = elwise_linear_prg(1000000, 3*dofs, np.float64, nnodes_in=dofs)
+    #start_param = (24, 4, 126, 9, 28)#(96, 32, 60, 2, 5)
+    start_param = None
+    ## Figure out the actual dimensions
+    #knl = face_mass_prg(178746, 4, 20, 20, np.float64)
+
+    # Spock
+    #result = exhaustive_search(queue, knl, generic_test, time_limit=np.inf, max_gflops=11540, device_memory_bandwidth=1047, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param)
+    #pspace_generator = gen_autotune_list(queue, knl)
+    #print(len(result))
+
+    # Titan V
+    #result = exhaustive_search(queue, knl, generic_test, time_limit=np.inf, max_gflops=6144, device_memory_bandwidth=580, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param)
+    #print(result)
+    pspace_generator = gen_autotune_list
+    tlist_generator = mxm_trans_list_generator 
+    result = exhaustive_search_v2(queue, knl, generic_test, pspace_generator, tlist_generator, time_limit=np.inf, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param)
+ 
+    #result = exhaustive_search_v2(queue, knl, generic_test, pspace_generator, tlist_generator, time_limit=np.inf, max_gflops=6144, device_memory_bandwidth=580, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param)
diff --git a/grudge/loopy_dg_kernels/test_import_mpi4py.py b/grudge/loopy_dg_kernels/test_import_mpi4py.py
new file mode 100644
index 000000000..9bbbd91c9
--- /dev/null
+++ b/grudge/loopy_dg_kernels/test_import_mpi4py.py
@@ -0,0 +1,3 @@
+import mpi4py.MPI as MPI
+
+comm = MPI.COMM_WORLD  # module-level handle to mpi4py's world communicator
diff --git a/grudge/models/advection.py b/grudge/models/advection.py
index cfe1a4920..c5d35e2d1 100644
--- a/grudge/models/advection.py
+++ b/grudge/models/advection.py
@@ -214,13 +214,13 @@ def __init__(self, dcoll, v, inflow_u, flux_type="central", quad_tag=None):
         self.quad_tag = quad_tag
 
     def flux(self, u_tpair):
-        from grudge.dof_desc import DD_VOLUME
+        from grudge.dof_desc import DD_VOLUME_ALL
 
-        surf_v = op.project(self.dcoll, DD_VOLUME, u_tpair.dd, self.v)
+        surf_v = op.project(self.dcoll, DD_VOLUME_ALL, u_tpair.dd, self.v)
         return advection_weak_flux(self.dcoll, self.flux_type, u_tpair, surf_v)
 
     def operator(self, t, u):
-        from grudge.dof_desc import DOFDesc, DD_VOLUME, DTAG_VOLUME_ALL
+        from grudge.dof_desc import DOFDesc, DD_VOLUME_ALL, DTAG_VOLUME_ALL
         from meshmode.mesh import BTAG_ALL
         from meshmode.discretization.connection import FACE_RESTR_ALL
 
@@ -234,7 +234,7 @@ def flux(tpair):
             return op.project(dcoll, tpair.dd, face_dd, self.flux(tpair))
 
         def to_quad(arg):
-            return op.project(dcoll, DD_VOLUME, quad_dd, arg)
+            return op.project(dcoll, DD_VOLUME_ALL, quad_dd, arg)
 
         if self.inflow_u is not None:
             inflow_flux = flux(op.bv_trace_pair(dcoll,
@@ -279,7 +279,7 @@ def to_quad(arg):
 # {{{ closed surface advection
 
 def v_dot_n_tpair(actx, dcoll, velocity, trace_dd):
-    from grudge.dof_desc import DTAG_BOUNDARY
+    from grudge.dof_desc import BoundaryDomainTag
     from grudge.trace_pair import TracePair
     from meshmode.discretization.connection import FACE_RESTR_INTERIOR
 
@@ -287,10 +287,9 @@ def v_dot_n_tpair(actx, dcoll, velocity, trace_dd):
     v_dot_n = velocity.dot(normal)
     i = op.project(dcoll, trace_dd.with_discr_tag(None), trace_dd, v_dot_n)
 
-    if trace_dd.domain_tag is FACE_RESTR_INTERIOR:
-        e = dcoll.opposite_face_connection()(i)
-    elif isinstance(trace_dd.domain_tag, DTAG_BOUNDARY):
-        e = dcoll.distributed_boundary_swap_connection(trace_dd)(i)
+    assert isinstance(trace_dd.domain_tag, BoundaryDomainTag)
+    if trace_dd.domain_tag.tag is FACE_RESTR_INTERIOR:
+        e = dcoll.opposite_face_connection(trace_dd.domain_tag)(i)
     else:
         raise ValueError("Unrecognized domain tag: %s" % trace_dd.domain_tag)
 
@@ -325,9 +324,9 @@ def __init__(self, dcoll, v, flux_type="central", quad_tag=None):
         self.quad_tag = quad_tag
 
     def flux(self, u_tpair):
-        from grudge.dof_desc import DD_VOLUME
+        from grudge.dof_desc import DD_VOLUME_ALL
 
-        surf_v = op.project(self.dcoll, DD_VOLUME,
+        surf_v = op.project(self.dcoll, DD_VOLUME_ALL,
                             u_tpair.dd.with_discr_tag(None), self.v)
         return surface_advection_weak_flux(self.dcoll,
                                            self.flux_type,
@@ -335,7 +334,7 @@ def flux(self, u_tpair):
                                            surf_v)
 
     def operator(self, t, u):
-        from grudge.dof_desc import DOFDesc, DD_VOLUME, DTAG_VOLUME_ALL
+        from grudge.dof_desc import DOFDesc, DD_VOLUME_ALL, DTAG_VOLUME_ALL
         from meshmode.discretization.connection import FACE_RESTR_ALL
 
         face_dd = DOFDesc(FACE_RESTR_ALL, self.quad_tag)
@@ -347,7 +346,7 @@ def flux(tpair):
             return op.project(dcoll, tpair.dd, face_dd, self.flux(tpair))
 
         def to_quad(arg):
-            return op.project(dcoll, DD_VOLUME, quad_dd, arg)
+            return op.project(dcoll, DD_VOLUME_ALL, quad_dd, arg)
 
         quad_v = to_quad(self.v)
         quad_u = to_quad(u)
diff --git a/grudge/op.py b/grudge/op.py
index 015e0718b..9e234a1d8 100644
--- a/grudge/op.py
+++ b/grudge/op.py
@@ -68,7 +68,6 @@
 THE SOFTWARE.
 """
 
-
 from arraycontext import (ArrayContext, map_array_container, tag_axes,
         ArrayOrContainer)
 
@@ -81,13 +80,23 @@
                                          DiscretizationFaceAxisTag)
 
 from grudge.discretization import DiscretizationCollection
+from grudge.dof_desc import as_dofdesc
 
 from pytools import keyed_memoize_in
 from pytools.obj_array import make_obj_array
 
 import numpy as np
+import loopy as lp
 
 import grudge.dof_desc as dof_desc
+from grudge.dof_desc import (
+    DD_VOLUME_ALL, FACE_RESTR_ALL, DISCR_TAG_BASE,
+    DOFDesc, VolumeDomainTag
+)
+
+from grudge.grudge_tags import (KernelDataTag, IsDOFArray, IsOpArray, 
+    ParameterValue, IsVecOpArray, IsVecDOFArray, IsFourAxisDOFArray,
+    IsFaceMassOpArray, IsFaceDOFArray)
 
 from grudge.interpolation import interp
 from grudge.projection import project
@@ -113,8 +122,10 @@
     interior_trace_pair,
     interior_trace_pairs,
     local_interior_trace_pair,
-    connected_ranks,
+    inter_volume_trace_pairs,
+    local_inter_volume_trace_pairs,
     cross_rank_trace_pairs,
+    cross_rank_inter_volume_trace_pairs,
     bdry_trace_pair,
     bv_trace_pair
 )
@@ -142,8 +153,10 @@
     "interior_trace_pair",
     "interior_trace_pairs",
     "local_interior_trace_pair",
-    "connected_ranks",
+    "inter_volume_trace_pairs",
+    "local_inter_volume_trace_pairs",
     "cross_rank_trace_pairs",
+    "cross_rank_inter_volume_trace_pairs",
     "bdry_trace_pair",
     "bv_trace_pair",
 
@@ -174,6 +187,43 @@ def _single_axis_derivative_kernel(
     # - whether the chain rule terms ("inv_jac_mat") sit outside (strong)
     #   or inside (weak) the matrix-vector product that carries out the
     #   derivative, cf. "metric_in_matvec".
+
+    data = []
+    for out_grp, in_grp, vec_i, ijm_i in zip(out_discr.groups, in_discr.groups, vec, inv_jac_mat):
+        ref_stiffT_mat = get_diff_mat(
+                        actx,
+                        out_element_group=out_grp,
+                        in_element_group=in_grp
+                    )
+        # Sizes are read off this group's concrete arrays.
+        fp_format = vec_i.dtype
+        Nr, Ni, _ = ref_stiffT_mat.shape
+        Ne, Nj = vec_i.shape
+        # Explicit argument declarations (dtype, shape, tags) for the einsum.
+        kernel_data = [
+            lp.GlobalArg("vec", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]),
+            lp.GlobalArg("ref_stiffT_mat", fp_format, shape=(Nr, Ni, Nj), offset=lp.auto, tags=[IsVecOpArray()]),
+            lp.GlobalArg("inv_jac_t", fp_format, shape=(Nr, Ne, Nj), offset=lp.auto, tags=[IsVecDOFArray()]),  
+            lp.GlobalArg("out", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True),  
+            lp.ValueArg("Ni", tags=[ParameterValue(Ni)]),
+            lp.ValueArg("Nj", tags=[ParameterValue(Nj)]),
+            lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+            lp.ValueArg("Nr", tags=[ParameterValue(Nr)]),
+            ...
+        ]
+        # Wrapped in a tag so the declarations are passed along with the einsum.
+        kd_tag = KernelDataTag(kernel_data)
+        # NOTE(review): "rei" indexes inv_jac_t by i but it is declared with Nj.
+        data.append(actx.einsum("rej,rij,ej->ei" if metric_in_matvec else "rei,rij,ej->ei",
+                    ijm_i[xyz_axis],
+                    ref_stiffT_mat,
+                    vec_i,
+                    arg_names=("inv_jac_t", "ref_stiffT_mat", "vec", ),
+                    tagged=(FirstAxisIsElementsTag(),kd_tag)))
+
+    return DOFArray(actx, data = tuple(data))
+
+    """
     return DOFArray(
         actx,
         data=tuple(
@@ -191,12 +241,58 @@ def _single_axis_derivative_kernel(
             for out_grp, in_grp, vec_i, ijm_i in zip(
                 out_discr.groups, in_discr.groups, vec,
                 inv_jac_mat)))
+    """
 
 
 def _gradient_kernel(actx, out_discr, in_discr, get_diff_mat, inv_jac_mat, vec,
         *, metric_in_matvec):
     # See _single_axis_derivative_kernel for comments on the usage scenarios
     # (both strong and weak derivative) and their differences.
+
+    per_group_grads = []
+    for out_grp, in_grp, vec_i, ijm_i in zip(
+            out_discr.groups, in_discr.groups, vec, inv_jac_mat):
+        # Per-group operator returned by get_diff_mat; fetched once and
+        # reused for both the shape metadata and the einsum operand.
+        ref_stiffT_mat = get_diff_mat(
+                        actx,
+                        out_element_group=out_grp,
+                        in_element_group=in_grp
+                    )
+
+        fp_format = vec_i.dtype
+        # Read Nx from this group's own metric array, not from group 0.
+        Nx, _, _, _ = ijm_i.shape
+        Nr, Ni, _ = ref_stiffT_mat.shape
+        Ne, Nj = vec_i.shape
+
+        kernel_data = [
+            lp.GlobalArg("vec", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]),
+            lp.GlobalArg("ref_stiffT_mat", fp_format, shape=(Nr, Ni, Nj), offset=lp.auto, tags=[IsVecOpArray()]),
+            lp.GlobalArg("inv_jac_t", fp_format, shape=(Nx, Nr, Ne, Nj), offset=lp.auto, tags=[IsFourAxisDOFArray()]),
+            lp.GlobalArg("out", fp_format, shape=(Nx, Ne, Ni), offset=lp.auto, tags=[IsVecDOFArray()], is_output=True),
+            lp.ValueArg("Ni", tags=[ParameterValue(Ni)]),
+            lp.ValueArg("Nj", tags=[ParameterValue(Nj)]),
+            lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+            lp.ValueArg("Nr", tags=[ParameterValue(Nr)]),
+            lp.ValueArg("Nx", tags=[ParameterValue(Nx)]),
+            ...
+        ]
+
+        kd_tag = KernelDataTag(kernel_data)
+
+        # r for rst axis
+        # x for xyz axis
+        per_group_grads.append(actx.einsum(
+                    "xrej,rij,ej->xei" if metric_in_matvec else "xrei,rij,ej->xei",
+                    ijm_i,
+                    ref_stiffT_mat,
+                    vec_i,
+                    arg_names=("inv_jac_t", "ref_stiffT_mat", "vec"),
+                    tagged=(FirstAxisIsElementsTag(), kd_tag)))
+
+
+    """
     per_group_grads = [
         # r for rst axis
         # x for xyz axis
@@ -213,6 +309,7 @@ def _gradient_kernel(actx, out_discr, in_discr, get_diff_mat, inv_jac_mat, vec,
         for out_grp, in_grp, vec_i, ijm_i in zip(
             out_discr.groups, in_discr.groups, vec,
             inv_jac_mat)]
+    """
 
     return make_obj_array([
             DOFArray(
@@ -243,15 +340,13 @@ def get_ref_derivative_mats(grp):
     return get_ref_derivative_mats(out_element_group)
 
 
-def _strong_scalar_grad(dcoll, dd_in, vec):
-    assert dd_in == dof_desc.as_dofdesc(dof_desc.DD_VOLUME)
-
+def _strong_scalar_grad(dcoll, dd, vec):
     from grudge.geometry import inverse_surface_metric_derivative_mat
 
-    discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME)
+    discr = dcoll.discr_from_dd(dd)
     actx = vec.array_context
 
-    inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll,
+    inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd,
             _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
     return _gradient_kernel(actx, discr, discr,
             _reference_derivative_matrices, inverse_jac_mat, vec,
@@ -259,7 +354,7 @@ def _strong_scalar_grad(dcoll, dd_in, vec):
 
 
 def local_grad(
-        dcoll: DiscretizationCollection, vec, *, nested=False) -> ArrayOrContainer:
+        dcoll: DiscretizationCollection, *args, nested=False) -> ArrayOrContainer:
     r"""Return the element-local gradient of a function :math:`f` represented
     by *vec*:
 
@@ -268,24 +363,35 @@ def local_grad(
         \nabla|_E f = \left(
             \partial_x|_E f, \partial_y|_E f, \partial_z|_E f \right)
 
+    May be called with ``(vec)`` or ``(dd, vec)``.
+
     :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` of them.
+    :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
+        Defaults to the base volume discretization if not provided.
     :arg nested: return nested object arrays instead of a single multidimensional
         array if *vec* is non-scalar.
     :returns: an object array (possibly nested) of
         :class:`~meshmode.dof_array.DOFArray`\ s or
         :class:`~arraycontext.ArrayContainer` of object arrays.
     """
-    dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE)
+    if len(args) == 1:
+        vec, = args
+        dd = DD_VOLUME_ALL
+    elif len(args) == 2:
+        dd, vec = args
+    else:
+        raise TypeError("invalid number of arguments")
+
     from grudge.tools import rec_map_subarrays
     return rec_map_subarrays(
-        partial(_strong_scalar_grad, dcoll, dd_in),
+        partial(_strong_scalar_grad, dcoll, dd),
         (), (dcoll.ambient_dim,),
         vec, scalar_cls=DOFArray, return_nested=nested,)
 
 
 def local_d_dx(
-        dcoll: DiscretizationCollection, xyz_axis, vec) -> ArrayOrContainer:
+        dcoll: DiscretizationCollection, xyz_axis, *args) -> ArrayOrContainer:
     r"""Return the element-local derivative along axis *xyz_axis* of a
     function :math:`f` represented by *vec*:
 
@@ -293,22 +399,34 @@ def local_d_dx(
 
         \frac{\partial f}{\partial \lbrace x,y,z\rbrace}\Big|_E
 
+    May be called with ``(vec)`` or ``(dd, vec)``.
+
     :arg xyz_axis: an integer indicating the axis along which the derivative
         is taken.
+    :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
+        Defaults to the base volume discretization if not provided.
     :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` of them.
     :returns: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` of them.
     """
+    if len(args) == 1:
+        vec, = args
+        dd = DD_VOLUME_ALL
+    elif len(args) == 2:
+        dd, vec = args
+    else:
+        raise TypeError("invalid number of arguments")
+
     if not isinstance(vec, DOFArray):
-        return map_array_container(partial(local_d_dx, dcoll, xyz_axis), vec)
+        return map_array_container(partial(local_d_dx, dcoll, xyz_axis, dd), vec)
 
-    discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME)
+    discr = dcoll.discr_from_dd(dd)
     actx = vec.array_context
 
     from grudge.geometry import inverse_surface_metric_derivative_mat
-    inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll,
-            _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+    inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd,
+        _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
 
     return _single_axis_derivative_kernel(
         actx, discr, discr,
@@ -316,7 +434,7 @@ def local_d_dx(
         metric_in_matvec=False)
 
 
-def local_div(dcoll: DiscretizationCollection, vecs) -> ArrayOrContainer:
+def local_div(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     r"""Return the element-local divergence of the vector function
     :math:`\mathbf{f}` represented by *vecs*:
 
@@ -324,6 +442,10 @@ def local_div(dcoll: DiscretizationCollection, vecs) -> ArrayOrContainer:
 
         \nabla|_E \cdot \mathbf{f} = \sum_{i=1}^d \partial_{x_i}|_E \mathbf{f}_i
 
+    May be called with ``(vec)`` or ``(dd, vec)``.
+
+    :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
+        Defaults to the base volume discretization if not provided.
     :arg vecs: an object array of
         :class:`~meshmode.dof_array.DOFArray`\s or an
         :class:`~arraycontext.ArrayContainer` object
@@ -332,13 +454,21 @@ def local_div(dcoll: DiscretizationCollection, vecs) -> ArrayOrContainer:
     :returns: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` of them.
     """
+    if len(args) == 1:
+        vec, = args
+        dd = DD_VOLUME_ALL
+    elif len(args) == 2:
+        dd, vec = args
+    else:
+        raise TypeError("invalid number of arguments")
+
     from grudge.tools import rec_map_subarrays
     return rec_map_subarrays(
         lambda vec: sum(
-            local_d_dx(dcoll, i, vec_i)
+            local_d_dx(dcoll, i, dd, vec_i)
             for i, vec_i in enumerate(vec)),
         (dcoll.ambient_dim,), (),
-        vecs, scalar_cls=DOFArray)
+        vec, scalar_cls=DOFArray)
 
 # }}}
 
@@ -391,10 +521,12 @@ def get_ref_stiffness_transpose_mat(out_grp, in_grp):
 def _weak_scalar_grad(dcoll, dd_in, vec):
     from grudge.geometry import inverse_surface_metric_derivative_mat
 
+    dd_in = as_dofdesc(dd_in)
     in_discr = dcoll.discr_from_dd(dd_in)
-    out_discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME)
+    out_discr = dcoll.discr_from_dd(dd_in.with_discr_tag(DISCR_TAG_BASE))
 
     actx = vec.array_context
+    # TODO: Figure out if this should be dd=dd_in or dd=dd_out
     inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd_in,
             times_area_element=True,
             _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
@@ -429,7 +561,7 @@ def weak_local_grad(
     """
     if len(args) == 1:
         vecs, = args
-        dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE)
+        dd_in = DD_VOLUME_ALL
     elif len(args) == 2:
         dd_in, vecs = args
     else:
@@ -474,7 +606,7 @@ def weak_local_d_dx(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     """
     if len(args) == 2:
         xyz_axis, vec = args
-        dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE)
+        dd_in = dof_desc.DD_VOLUME_ALL
     elif len(args) == 3:
         dd_in, xyz_axis, vec = args
     else:
@@ -488,8 +620,9 @@ def weak_local_d_dx(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
 
     from grudge.geometry import inverse_surface_metric_derivative_mat
 
+    dd_in = as_dofdesc(dd_in)
     in_discr = dcoll.discr_from_dd(dd_in)
-    out_discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME)
+    out_discr = dcoll.discr_from_dd(dd_in.with_discr_tag(DISCR_TAG_BASE))
 
     actx = vec.array_context
     inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd_in,
@@ -533,7 +666,7 @@ def weak_local_div(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     """
     if len(args) == 1:
         vecs, = args
-        dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE)
+        dd_in = DD_VOLUME_ALL
     elif len(args) == 2:
         dd_in, vecs = args
     else:
@@ -602,6 +735,44 @@ def _apply_mass_operator(
     actx = vec.array_context
     area_elements = area_element(actx, dcoll, dd=dd_in,
             _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
+
+    # out[e, i] = reduce(sum, [j], mass_mat[i, j]*jac[e, j]*vec[e, j])
+    # One einsum per element group; results are collected in group order.
+    esums = []
+    for in_grp, out_grp, ae_i, vec_i in zip(in_discr.groups, out_discr.groups, area_elements, vec):
+        mass_mat = reference_mass_matrix(
+            actx,
+            out_element_group=out_grp,
+            in_element_group=in_grp
+        )
+        # Nj is bound twice below; the value from vec_i.shape is the one used.
+        fp_format = vec_i.dtype
+        Ni, Nj = mass_mat.shape
+        Ne, Nj = vec_i.shape
+        kernel_data = [
+            lp.GlobalArg("mass_mat", fp_format, shape=(Ni, Nj), offset=lp.auto, tags=[IsOpArray()]),
+            lp.GlobalArg("jac", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]),  
+            lp.GlobalArg("vec", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]),
+            lp.GlobalArg("out", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True),  
+            lp.ValueArg("Ni", tags=[ParameterValue(Ni)]),
+            lp.ValueArg("Nj", tags=[ParameterValue(Nj)]),
+            lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+            ...
+        ]
+        # Tag carrying the declarations above into the einsum call.
+        kd_tag = KernelDataTag(kernel_data)
+
+        esum = actx.einsum("ij,ej,ej->ei",
+                    mass_mat,
+                    ae_i,
+                    vec_i,
+                    arg_names=("mass_mat", "jac", "vec"),
+                    tagged=(FirstAxisIsElementsTag(),kd_tag))
+        esums.append(esum)
+
+    return DOFArray(actx, data=tuple(esums))
+
+    """    
     return DOFArray(
         actx,
         data=tuple(
@@ -620,7 +791,8 @@ def _apply_mass_operator(
                     in_discr.groups, out_discr.groups, area_elements, vec)
         )
     )
-
+    """
+    
 
 def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     r"""Return the action of the DG mass matrix on a vector (or vectors)
@@ -628,7 +800,7 @@ def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     *vec* being an :class:`~arraycontext.ArrayContainer`,
     the mass operator is applied component-wise.
 
-    May be called with ``(vec)`` or ``(dd, vec)``.
+    May be called with ``(vec)`` or ``(dd_in, vec)``.
 
     Specifically, this function applies the mass matrix elementwise on a
     vector of coefficients :math:`\mathbf{f}` via:
@@ -640,7 +812,7 @@ def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
 
     where :math:`\phi_i` are local polynomial basis functions on :math:`E`.
 
-    :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
+    :arg dd_in: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
         Defaults to the base volume discretization if not provided.
     :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` of them.
@@ -650,13 +822,15 @@ def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
 
     if len(args) == 1:
         vec, = args
-        dd = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE)
+        dd_in = dof_desc.DD_VOLUME_ALL
     elif len(args) == 2:
-        dd, vec = args
+        dd_in, vec = args
     else:
         raise TypeError("invalid number of arguments")
 
-    return _apply_mass_operator(dcoll, dof_desc.DD_VOLUME, dd, vec)
+    dd_in = as_dofdesc(dd_in)  # accept any value convertible to a DOFDesc
+    dd_out = dd_in.with_discr_tag(DISCR_TAG_BASE)
+    return _apply_mass_operator(dcoll, dd_out, dd_in, vec)
 
 # }}}
 
@@ -701,20 +875,44 @@ def _apply_inverse_mass_operator(
     discr = dcoll.discr_from_dd(dd_in)
     inv_area_elements = 1./area_element(actx, dcoll, dd=dd_in,
             _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
-    group_data = [
+
+    group_data = []
+    for grp, jac_inv, vec_i in zip(discr.groups, inv_area_elements, vec):
+        # Fetched once; used as the operator argument of the einsum below.
+        ref_mass_inverse = reference_inverse_mass_matrix(actx,
+                                                         element_group=grp)
+
+        fp_format = vec_i.dtype
+        Ne, Nj = vec_i.shape
+        _, Ni = jac_inv.shape
+        # argN shapes follow the einsum operands in order (no arg_names passed).
+        kernel_data = [
+            lp.GlobalArg("arg2", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]),
+            lp.GlobalArg("arg1", fp_format, shape=(Ni, Nj), offset=lp.auto, tags=[IsOpArray()]),
+            lp.GlobalArg("arg0", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()]),  
+            lp.GlobalArg("out",  fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True),  
+            lp.ValueArg("Ni", tags=[ParameterValue(Ni)]),
+            lp.ValueArg("Nj", tags=[ParameterValue(Nj)]),
+            lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+            ...
+        ]
+    
+        kd_tag = KernelDataTag(kernel_data)
+
+        group_data.append(
+
             # Based on https://arxiv.org/pdf/1608.03836.pdf
             # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv
             actx.einsum("ei,ij,ej->ei",
                         jac_inv,
-                        reference_inverse_mass_matrix(actx, element_group=grp),
+                        ref_mass_inverse,
                         vec_i,
-                        tagged=(FirstAxisIsElementsTag(),))
-            for grp, jac_inv, vec_i in zip(discr.groups, inv_area_elements, vec)]
+                        tagged=(FirstAxisIsElementsTag(),kd_tag,)))
 
     return DOFArray(actx, data=tuple(group_data))
 
 
-def inverse_mass(dcoll: DiscretizationCollection, vec) -> ArrayOrContainer:
+def inverse_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     r"""Return the action of the DG mass matrix inverse on a vector
     (or vectors) of :class:`~meshmode.dof_array.DOFArray`\ s, *vec*.
     In the case of *vec* being an :class:`~arraycontext.ArrayContainer`,
@@ -744,15 +942,24 @@ def inverse_mass(dcoll: DiscretizationCollection, vec) -> ArrayOrContainer:
     where :math:`\widehat{\mathbf{M}}` is the reference mass matrix on
     :math:`\widehat{E}`.
 
+    May be called with ``(vec)`` or ``(dd, vec)``.
+
     :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` of them.
+    :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one.
+        Defaults to the base volume discretization if not provided.
     :returns: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` like *vec*.
     """
+    if len(args) == 1:
+        vec, = args
+        dd = DD_VOLUME_ALL
+    elif len(args) == 2:
+        dd, vec = args
+    else:
+        raise TypeError("invalid number of arguments")
 
-    return _apply_inverse_mass_operator(
-        dcoll, dof_desc.DD_VOLUME, dof_desc.DD_VOLUME, vec
-    )
+    return _apply_inverse_mass_operator(dcoll, dd, dd, vec)
 
 # }}}
 
@@ -850,23 +1057,76 @@ def get_ref_face_mass_mat(face_grp, vol_grp):
     return get_ref_face_mass_mat(face_element_group, vol_element_group)
 
 
-def _apply_face_mass_operator(dcoll: DiscretizationCollection, dd, vec):
+def _apply_face_mass_operator(dcoll: DiscretizationCollection, dd_in, vec):
     if not isinstance(vec, DOFArray):
         return map_array_container(
-            partial(_apply_face_mass_operator, dcoll, dd), vec
+            partial(_apply_face_mass_operator, dcoll, dd_in), vec
         )
 
     from grudge.geometry import area_element
 
-    volm_discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME)
-    face_discr = dcoll.discr_from_dd(dd)
+    dd_out = DOFDesc(
+        VolumeDomainTag(as_dofdesc(dd_in).domain_tag.volume_tag),
+        DISCR_TAG_BASE)
+
+    volm_discr = dcoll.discr_from_dd(dd_out)
+    face_discr = dcoll.discr_from_dd(dd_in)
     dtype = vec.entry_dtype
     actx = vec.array_context
 
     assert len(face_discr.groups) == len(volm_discr.groups)
-    surf_area_elements = area_element(actx, dcoll, dd=dd,
+    surf_area_elements = area_element(actx, dcoll, dd=dd_in,
             _use_geoderiv_connection=actx.supports_nonscalar_broadcasting)
 
+    data = []
+    for vgrp, afgrp, vec_i, surf_ae_i, in zip(volm_discr.groups, face_discr.groups, vec, surf_area_elements):
+        # Reference face mass matrix for this (face group, volume group) pair;
+        # its shape is unpacked below as (Ni, Nf, Nj).
+        ref_fm_mat = reference_face_mass_matrix(
+                                actx,
+                                face_element_group=afgrp,
+                                vol_element_group=vgrp,
+                                dtype=dtype)
+
+        fp_format = dtype
+        Ni, Nf, Nj = ref_fm_mat.shape 
+        Ne = vgrp.nelements
+
+        kernel_data = [
+            lp.GlobalArg("vec", fp_format, shape=(Nf, Ne, Nj), offset=lp.auto, tags=[IsFaceDOFArray()]),
+            lp.GlobalArg("jac_surf", fp_format, shape=(Nf, Ne, Nj), offset=lp.auto, tags=[IsFaceDOFArray()]),
+            lp.GlobalArg("ref_face_mass_mat", fp_format, shape=(Ni, Nf, Nj), 
+                offset=lp.auto, tags=[IsFaceMassOpArray()]),  
+            lp.GlobalArg("out", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True),  
+            lp.ValueArg("Ni", tags=[ParameterValue(Ni)]),
+            lp.ValueArg("Nj", tags=[ParameterValue(Nj)]),
+            lp.ValueArg("Ne", tags=[ParameterValue(Ne)]),
+            lp.ValueArg("Nf", tags=[ParameterValue(Nf)]),
+            ...
+        ]
+    
+        kd_tag = KernelDataTag(kernel_data)
+        # Both per-face inputs are reshaped to (nfaces, nelements, last axis) for "fej".
+        data.append(actx.einsum("ifj,fej,fej->ei",
+                        ref_fm_mat,
+                        actx.tag_axis(1, DiscretizationElementAxisTag(), surf_ae_i.reshape(
+                                vgrp.mesh_el_group.nfaces,
+                                vgrp.nelements,
+                                surf_ae_i.shape[-1])),
+                        actx.tag_axis(0, DiscretizationFaceAxisTag(), vec_i.reshape(
+                                vgrp.mesh_el_group.nfaces,
+                                vgrp.nelements,
+                                afgrp.nunit_dofs)),
+                        arg_names=("ref_face_mass_mat", "jac_surf", "vec"),
+                        tagged=(FirstAxisIsElementsTag(),kd_tag)))
+        
+        
+
+       
+
+    return DOFArray(actx, data=tuple(data))
+
+    """
     return DOFArray(
         actx,
         data=tuple(
@@ -892,7 +1152,10 @@ def _apply_face_mass_operator(dcoll: DiscretizationCollection, dd, vec):
             for vgrp, afgrp, vec_i, surf_ae_i in zip(volm_discr.groups,
                                                      face_discr.groups,
                                                      vec,
-                                                     surf_area_elements)))
+                                                     surf_area_elements)
+        )
+    )
+    """
 
 
 def face_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
@@ -901,7 +1164,7 @@ def face_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
     *vec* being an arbitrary :class:`~arraycontext.ArrayContainer`,
     the face mass operator is applied component-wise.
 
-    May be called with ``(vec)`` or ``(dd, vec)``.
+    May be called with ``(vec)`` or ``(dd_in, vec)``.
 
     Specifically, this function applies the face mass matrix elementwise on a
     vector of coefficients :math:`\mathbf{f}` as the sum of contributions for
@@ -932,13 +1195,13 @@ def face_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer:
 
     if len(args) == 1:
         vec, = args
-        dd = dof_desc.DOFDesc("all_faces", dof_desc.DISCR_TAG_BASE)
+        dd_in = DD_VOLUME_ALL.trace(FACE_RESTR_ALL)
     elif len(args) == 2:
-        dd, vec = args
+        dd_in, vec = args
     else:
         raise TypeError("invalid number of arguments")
 
-    return _apply_face_mass_operator(dcoll, dd, vec)
+    return _apply_face_mass_operator(dcoll, dd_in, vec)
 
 # }}}
 
diff --git a/grudge/projection.py b/grudge/projection.py
index 425239591..e21e02295 100644
--- a/grudge/projection.py
+++ b/grudge/projection.py
@@ -37,13 +37,19 @@
 from arraycontext import ArrayOrContainer
 
 from grudge.discretization import DiscretizationCollection
-from grudge.dof_desc import as_dofdesc
+from grudge.dof_desc import (
+    as_dofdesc,
+    VolumeDomainTag,
+    BoundaryDomainTag,
+    ConvertibleToDOFDesc)
 
 from numbers import Number
 
 
 def project(
-        dcoll: DiscretizationCollection, src, tgt, vec) -> ArrayOrContainer:
+        dcoll: DiscretizationCollection,
+        src: "ConvertibleToDOFDesc",
+        tgt: "ConvertibleToDOFDesc", vec) -> ArrayOrContainer:
     """Project from one discretization to another, e.g. from the
     volume to the boundary, or from the base to the an overintegrated
     quadrature discretization.
@@ -55,10 +61,24 @@ def project(
     :returns: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` like *vec*.
     """
-    src = as_dofdesc(src)
-    tgt = as_dofdesc(tgt)
+    # {{{ process dofdesc arguments
 
-    if isinstance(vec, Number) or src == tgt:
+    src_dofdesc = as_dofdesc(src)
+
+    contextual_volume_tag = None
+    if isinstance(src_dofdesc.domain_tag, VolumeDomainTag):
+        contextual_volume_tag = src_dofdesc.domain_tag.tag
+    elif isinstance(src_dofdesc.domain_tag, BoundaryDomainTag):
+        contextual_volume_tag = src_dofdesc.domain_tag.volume_tag
+
+    tgt_dofdesc = as_dofdesc(tgt, _contextual_volume_tag=contextual_volume_tag)
+
+    del src
+    del tgt
+
+    # }}}
+
+    if isinstance(vec, Number) or src_dofdesc == tgt_dofdesc:
         return vec
 
-    return dcoll.connection_from_dds(src, tgt)(vec)
+    return dcoll.connection_from_dds(src_dofdesc, tgt_dofdesc)(vec)
diff --git a/grudge/reductions.py b/grudge/reductions.py
index 95ed44726..bdcf5a7f9 100644
--- a/grudge/reductions.py
+++ b/grudge/reductions.py
@@ -94,7 +94,7 @@ def norm(dcoll: DiscretizationCollection, vec, p, dd=None) -> Scalar:
     :returns: a nonegative scalar denoting the norm.
     """
     if dd is None:
-        dd = dof_desc.DD_VOLUME
+        dd = dof_desc.DD_VOLUME_ALL
 
     from arraycontext import get_container_context_recursively
     actx = get_container_context_recursively(vec)
@@ -128,7 +128,7 @@ def nodal_sum(dcoll: DiscretizationCollection, dd, vec) -> Scalar:
     if comm is None:
         return nodal_sum_loc(dcoll, dd, vec)
 
-    # NOTE: Don't move this
+    # NOTE: Do not move, we do not want to import mpi4py in single-rank computations
     from mpi4py import MPI
 
     from arraycontext import get_container_context_recursively
@@ -174,7 +174,7 @@ def nodal_min(dcoll: DiscretizationCollection, dd, vec, *, initial=None) -> Scal
     if comm is None:
         return nodal_min_loc(dcoll, dd, vec, initial=initial)
 
-    # NOTE: Don't move this
+    # NOTE: Do not move, we do not want to import mpi4py in single-rank computations
     from mpi4py import MPI
     actx = vec.array_context
 
@@ -231,7 +231,7 @@ def nodal_max(dcoll: DiscretizationCollection, dd, vec, *, initial=None) -> Scal
     if comm is None:
         return nodal_max_loc(dcoll, dd, vec, initial=initial)
 
-    # NOTE: Don't move this
+    # NOTE: Do not move, we do not want to import mpi4py in single-rank computations
     from mpi4py import MPI
     actx = vec.array_context
 
@@ -320,7 +320,7 @@ def _apply_elementwise_reduction(
     """
     if len(args) == 1:
         vec, = args
-        dd = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE)
+        dd = dof_desc.DD_VOLUME_ALL
     elif len(args) == 2:
         dd, vec = args
     else:
@@ -335,6 +335,7 @@ def _apply_elementwise_reduction(
 
     actx = vec.array_context
 
+    import loopy as lp
     if actx.supports_nonscalar_broadcasting:
         return DOFArray(
             actx,
@@ -344,11 +345,12 @@ def _apply_elementwise_reduction(
             )
         )
     else:
-        @memoize_in(actx, (_apply_elementwise_reduction,
-                        "elementwise_%s_prg" % op_name))
-        def elementwise_prg():
+        @memoize_in(actx, (_apply_elementwise_reduction, dd,
+                           "elementwise_%s_prg" % op_name))
+        def elementwise_prg(nelements, ndofs, fp_format):
             # FIXME: This computes the reduction value redundantly for each
             # output DOF.
+            from grudge.grudge_tags import IsDOFArray, ParameterValue
             t_unit = make_loopy_program(
                 [
                     "{[iel]: 0 <= iel < nelements}",
@@ -357,21 +359,31 @@ def elementwise_prg():
                 """
                     result[iel, idof] = %s(jdof, operand[iel, jdof])
                 """ % op_name,
+                kernel_data=[
+                    lp.GlobalArg("result", fp_format, shape=("nelements", "ndofs"),
+                        tags=[IsDOFArray()]),
+                    lp.GlobalArg("operand", fp_format, shape=("nelements", "ndofs"),
+                        tags=[IsDOFArray()]),
+                    lp.ValueArg("ndofs", tags=[ParameterValue(ndofs)]),
+                    lp.ValueArg("nelements", tags=[ParameterValue(nelements)]),
+                    ...
+                ],
                 name="grudge_elementwise_%s_knl" % op_name
             )
-            import loopy as lp
             from meshmode.transform_metadata import (
                     ConcurrentElementInameTag, ConcurrentDOFInameTag)
             return lp.tag_inames(t_unit, {
                 "iel": ConcurrentElementInameTag(),
                 "idof": ConcurrentDOFInameTag()})
 
-        return actx.tag_axis(1, DiscretizationDOFAxisTag(),
-                DOFArray(
-                    actx,
-                    data=tuple(
-                        actx.call_loopy(elementwise_prg(), operand=vec_i)["result"]
-                        for vec_i in vec)))
+        data = []
+        for vec_i in vec:
+            iel, jdof = vec_i.shape
+            fp_format = vec_i.dtype
+            data.append(actx.call_loopy(elementwise_prg(iel, jdof, fp_format),
+                            operand=vec_i)["result"])
+
+        return actx.tag_axis(1, DiscretizationDOFAxisTag(), DOFArray(actx, data=tuple(data)))
 
 
 def elementwise_sum(
@@ -485,7 +497,7 @@ def elementwise_integral(
     """
     if len(args) == 1:
         vec, = args
-        dd = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE)
+        dd = dof_desc.DD_VOLUME_ALL
     elif len(args) == 2:
         dd, vec = args
     else:
diff --git a/grudge/shortcuts.py b/grudge/shortcuts.py
index 0aca64a58..e6e62cc55 100644
--- a/grudge/shortcuts.py
+++ b/grudge/shortcuts.py
@@ -20,6 +20,8 @@
 THE SOFTWARE.
 """
 
+from grudge.dof_desc import DD_VOLUME_ALL
+
 
 from pytools import memoize_in
 
@@ -76,11 +78,14 @@ def set_up_rk4(field_var_name, dt, fields, rhs, t_start=0.0):
     return dt_stepper
 
 
-def make_visualizer(dcoll, vis_order=None, **kwargs):
+def make_visualizer(dcoll, vis_order=None, volume_dd=None, **kwargs):
     from meshmode.discretization.visualization import make_visualizer
+    if volume_dd is None:
+        volume_dd = DD_VOLUME_ALL
+
     return make_visualizer(
             dcoll._setup_actx,
-            dcoll.discr_from_dd("vol"), vis_order, **kwargs)
+            dcoll.discr_from_dd(volume_dd), vis_order, **kwargs)
 
 
 def make_boundary_visualizer(dcoll, vis_order=None, **kwargs):
diff --git a/grudge/trace_pair.py b/grudge/trace_pair.py
index cb2de38f6..333832340 100644
--- a/grudge/trace_pair.py
+++ b/grudge/trace_pair.py
@@ -18,12 +18,15 @@
 .. autofunction:: bdry_trace_pair
 .. autofunction:: bv_trace_pair
 
-Interior and cross-rank trace functions
----------------------------------------
+Interior, cross-rank, and inter-volume traces
+---------------------------------------------
 
 .. autofunction:: interior_trace_pairs
 .. autofunction:: local_interior_trace_pair
+.. autofunction:: inter_volume_trace_pairs
+.. autofunction:: local_inter_volume_trace_pairs
 .. autofunction:: cross_rank_trace_pairs
+.. autofunction:: cross_rank_inter_volume_trace_pairs
 """
 
 __copyright__ = """
@@ -51,17 +54,19 @@
 """
 
 
-from typing import List, Hashable, Optional, Type, Any
+from warnings import warn
+from typing import List, Hashable, Optional, Tuple, Type, Any, Sequence, Mapping
 
 from pytools.persistent_dict import KeyBuilder
 
 from arraycontext import (
     ArrayContainer,
+    ArrayContext,
     with_container_arithmetic,
     dataclass_array_container,
-    get_container_context_recursively,
-    flatten, to_numpy,
-    unflatten, from_numpy,
+    get_container_context_recursively_opt,
+    to_numpy,
+    from_numpy,
     ArrayOrContainer
 )
 
@@ -70,15 +75,20 @@
 from numbers import Number
 
 from pytools import memoize_on_first_arg
-from pytools.obj_array import obj_array_vectorize
 
-from grudge.discretization import DiscretizationCollection
+from grudge.discretization import DiscretizationCollection, PartID
 from grudge.projection import project
 
 from meshmode.mesh import BTAG_PARTITION
 
 import numpy as np
+
 import grudge.dof_desc as dof_desc
+from grudge.dof_desc import (
+        DOFDesc, DD_VOLUME_ALL, FACE_RESTR_INTERIOR, DISCR_TAG_BASE,
+        VolumeTag, VolumeDomainTag, BoundaryDomainTag,
+        ConvertibleToDOFDesc,
+        )
 
 
 # {{{ trace pair container class
@@ -107,12 +117,22 @@ class TracePair:
     .. automethod:: __len__
     """
 
-    dd: dof_desc.DOFDesc
+    dd: DOFDesc
     interior: ArrayContainer
     exterior: ArrayContainer
 
-    def __init__(self, dd, *, interior, exterior):
-        object.__setattr__(self, "dd", dof_desc.as_dofdesc(dd))
+    def __init__(self, dd: DOFDesc, *,
+            interior: ArrayOrContainer,
+            exterior: ArrayOrContainer):
+        if not isinstance(dd, DOFDesc):
+            warn("Constructing a TracePair with a first argument that is not "
+                    "exactly a DOFDesc (but convertible to one) is deprecated. "
+                    "This will stop working in July 2022. "
+                    "Pass an actual DOFDesc instead.",
+                    DeprecationWarning, stacklevel=2)
+            dd = dof_desc.as_dofdesc(dd)
+
+        object.__setattr__(self, "dd", dd)
         object.__setattr__(self, "interior", interior)
         object.__setattr__(self, "exterior", exterior)
 
@@ -178,7 +198,8 @@ def diff(self):
 # {{{ boundary trace pairs
 
 def bdry_trace_pair(
-        dcoll: DiscretizationCollection, dd, interior, exterior) -> TracePair:
+        dcoll: DiscretizationCollection, dd: "ConvertibleToDOFDesc",
+        interior, exterior) -> TracePair:
     """Returns a trace pair defined on the exterior boundary. Input arguments
     are assumed to already be defined on the boundary denoted by *dd*.
     If the input arguments *interior* and *exterior* are
@@ -197,11 +218,19 @@ def bdry_trace_pair(
         be used for the flux.
     :returns: a :class:`TracePair` on the boundary.
     """
+    if not isinstance(dd, DOFDesc):
+        warn("Calling  bdry_trace_pair with a first argument that is not "
+                "exactly a DOFDesc (but convertible to one) is deprecated. "
+                "This will stop working in July 2022. "
+                "Pass an actual DOFDesc instead.",
+                DeprecationWarning, stacklevel=2)
+        dd = dof_desc.as_dofdesc(dd)
     return TracePair(dd, interior=interior, exterior=exterior)
 
 
 def bv_trace_pair(
-        dcoll: DiscretizationCollection, dd, interior, exterior) -> TracePair:
+        dcoll: DiscretizationCollection, dd: "ConvertibleToDOFDesc",
+        interior, exterior) -> TracePair:
     """Returns a trace pair defined on the exterior boundary. The interior
     argument is assumed to be defined on the volume discretization, and will
     therefore be restricted to the boundary *dd* prior to creating a
@@ -223,21 +252,29 @@ def bv_trace_pair(
         be used for the flux.
     :returns: a :class:`TracePair` on the boundary.
     """
+    if not isinstance(dd, DOFDesc):
+        warn("Calling  bv_trace_pair with a first argument that is not "
+                "exactly a DOFDesc (but convertible to one) is deprecated. "
+                "This will stop working in July 2022. "
+                "Pass an actual DOFDesc instead.",
+                DeprecationWarning, stacklevel=2)
+        dd = dof_desc.as_dofdesc(dd)
     return bdry_trace_pair(
-        dcoll, dd, project(dcoll, "vol", dd, interior), exterior
-    )
+        dcoll, dd, project(dcoll, dd.domain_tag.volume_tag, dd, interior), exterior)
 
 # }}}
 
 
 # {{{ interior trace pairs
 
-def local_interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair:
+def local_interior_trace_pair(
+        dcoll: DiscretizationCollection, vec, *,
+        volume_dd: Optional[DOFDesc] = None,
+        ) -> TracePair:
     r"""Return a :class:`TracePair` for the interior faces of
     *dcoll* with a discretization tag specified by *discr_tag*.
     This does not include interior faces on different MPI ranks.
 
-
     :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an
         :class:`~arraycontext.ArrayContainer` of them.
 
@@ -250,21 +287,33 @@ def local_interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair
     computation.
     :returns: a :class:`TracePair` object.
     """
-    i = project(dcoll, "vol", "int_faces", vec)
+    if volume_dd is None:
+        volume_dd = DD_VOLUME_ALL
+
+    assert isinstance(volume_dd.domain_tag, VolumeDomainTag)
+    trace_dd = volume_dd.trace(FACE_RESTR_INTERIOR)
+
+    interior = project(dcoll, volume_dd, trace_dd, vec)
+
+    opposite_face_conn = dcoll.opposite_face_connection(trace_dd.domain_tag)
 
-    def get_opposite_face(el):
-        if isinstance(el, Number):
-            return el
+    def get_opposite_trace(ary):
+        if isinstance(ary, Number):
+            return ary
         else:
-            return dcoll.opposite_face_connection()(el)
+            return opposite_face_conn(ary)
 
-    e = obj_array_vectorize(get_opposite_face, i)
+    from arraycontext import rec_map_array_container
+    from meshmode.dof_array import DOFArray
+    exterior = rec_map_array_container(
+        get_opposite_trace,
+        interior,
+        leaf_class=DOFArray)
 
-    return TracePair("int_faces", interior=i, exterior=e)
+    return TracePair(trace_dd, interior=interior, exterior=exterior)
 
 
 def interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair:
-    from warnings import warn
     warn("`grudge.op.interior_trace_pair` is deprecated and will be dropped "
          "in version 2022.x. Use `local_interior_trace_pair` "
          "instead, or `interior_trace_pairs` which also includes contributions "
@@ -274,7 +323,8 @@ def interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair:
 
 
 def interior_trace_pairs(dcoll: DiscretizationCollection, vec, *,
-        comm_tag: Hashable = None, tag: Hashable = None) -> List[TracePair]:
+        comm_tag: Hashable = None, tag: Hashable = None,
+        volume_dd: Optional[DOFDesc] = None) -> List[TracePair]:
     r"""Return a :class:`list` of :class:`TracePair` objects
     defined on the interior faces of *dcoll* and any faces connected to a
     parallel boundary.
@@ -293,7 +343,6 @@ def interior_trace_pairs(dcoll: DiscretizationCollection, vec, *,
     """
 
     if tag is not None:
-        from warnings import warn
         warn("Specifying 'tag' is deprecated and will stop working in July of 2022. "
                 "Specify 'comm_tag' instead.", DeprecationWarning, stacklevel=2)
         if comm_tag is not None:
@@ -302,154 +351,428 @@ def interior_trace_pairs(dcoll: DiscretizationCollection, vec, *,
             comm_tag = tag
     del tag
 
+    if volume_dd is None:
+        volume_dd = DD_VOLUME_ALL
+
     return (
-        [local_interior_trace_pair(dcoll, vec)]
-        + cross_rank_trace_pairs(dcoll, vec, comm_tag=comm_tag)
+        [local_interior_trace_pair(
+            dcoll, vec, volume_dd=volume_dd)]
+        + cross_rank_trace_pairs(
+            dcoll, vec, comm_tag=comm_tag, volume_dd=volume_dd)
     )
 
 # }}}
 
 
-# {{{ distributed-memory functionality
+# {{{ inter-volume trace pairs
+
+def local_inter_volume_trace_pairs(
+        dcoll: DiscretizationCollection,
+        pairwise_volume_data: Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            Tuple[ArrayOrContainer, ArrayOrContainer]]
+        ) -> Mapping[Tuple[DOFDesc, DOFDesc], TracePair]:
+    """Return the rank-local trace pairs on interfaces between volumes.
+
+    :arg pairwise_volume_data: maps pairs of base-discretized volume
+        :class:`~grudge.dof_desc.DOFDesc` objects to the matching pairs of data.
+    :returns: a mapping from ``(other_vol_dd, self_vol_dd)`` to the
+        :class:`TracePair` on the part of *self_vol_dd*'s boundary facing
+        *other_vol_dd*; input pairs without such a boundary are skipped.
+    :raises ValueError: if a key is not a base-discretized volume.
+    """
+    from arraycontext import rec_map_array_container
+    from meshmode.dof_array import DOFArray
+    from meshmode.mesh import mesh_has_boundary
+
+    for vol_dd_pair in pairwise_volume_data:
+        for vol_dd in vol_dd_pair:
+            if not isinstance(vol_dd.domain_tag, VolumeDomainTag):
+                raise ValueError(
+                    "pairwise_volume_data keys must describe volumes, "
+                    f"got '{vol_dd}'")
+            if vol_dd.discretization_tag != DISCR_TAG_BASE:
+                raise ValueError(
+                    "expected base-discretized DOFDesc in pairwise_volume_data, "
+                    f"got '{vol_dd}'")
+
+    comm = dcoll.mpi_communicator
+    rank = comm.Get_rank() if comm is not None else None
+    result: Mapping[Tuple[DOFDesc, DOFDesc], TracePair] = {}
+
+    for vol_dd_pair, vol_data_pair in pairwise_volume_data.items():
+        if not mesh_has_boundary(
+                dcoll.discr_from_dd(vol_dd_pair[0]).mesh,
+                BTAG_PARTITION(PartID(vol_dd_pair[1].domain_tag.tag, rank))):
+            continue
+
+        directional_vol_dd_pairs = [
+            (vol_dd_pair[1], vol_dd_pair[0]),
+            (vol_dd_pair[0], vol_dd_pair[1])]
+
+        trace_dd_pair = tuple(
+            self_vol_dd.trace(
+                BTAG_PARTITION(
+                    PartID(other_vol_dd.domain_tag.tag, rank)))
+            for other_vol_dd, self_vol_dd in directional_vol_dd_pairs)
+
+        # Project each volume's data onto its trace once; each projection is
+        # then used twice below (as interior data and as exterior data).
+        trace_data = {
+            trace_dd: project(dcoll, vol_dd, trace_dd, vol_data)
+            for vol_dd, trace_dd, vol_data in zip(
+                vol_dd_pair, trace_dd_pair, vol_data_pair)}
+
+        for other_vol_dd, self_vol_dd in directional_vol_dd_pairs:
+            self_part_id = PartID(self_vol_dd.domain_tag.tag, rank)
+            other_part_id = PartID(other_vol_dd.domain_tag.tag, rank)
+
+            self_trace_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id))
+            other_trace_dd = other_vol_dd.trace(BTAG_PARTITION(self_part_id))
+
+            other_to_self = dcoll._inter_part_connections[
+                other_part_id, self_part_id]
+
+            def get_opposite_trace(ary):
+                if isinstance(ary, Number):
+                    return ary
+                return other_to_self(ary)  # noqa: B023
+
+            result[other_vol_dd, self_vol_dd] = TracePair(
+                self_trace_dd,
+                interior=trace_data[self_trace_dd],
+                exterior=rec_map_array_container(
+                    get_opposite_trace, trace_data[other_trace_dd],
+                    leaf_class=DOFArray))
+
+    return result
+
+
+def inter_volume_trace_pairs(dcoll: DiscretizationCollection,
+        pairwise_volume_data: Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            Tuple[ArrayOrContainer, ArrayOrContainer]],
+        comm_tag: Hashable = None) -> Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            List[TracePair]]:
+    """Return all inter-volume trace pairs, keyed by directional volume pair.
+
+    Each value lists the rank-local trace pair (if there is one) followed by
+    the cross-rank ones. Use :func:`local_inter_volume_trace_pairs` or
+    :func:`cross_rank_inter_volume_trace_pairs` to obtain either part alone.
+    """
+    # TODO: document the arguments
+
+    rank_local = local_inter_volume_trace_pairs(dcoll, pairwise_volume_data)
+    cross_rank = cross_rank_inter_volume_trace_pairs(
+        dcoll, pairwise_volume_data, comm_tag=comm_tag)
+
+    # Start from the (at most one) rank-local pair per key, then append the
+    # cross-rank pairs belonging to that key.
+    merged: Mapping[
+        Tuple[DOFDesc, DOFDesc],
+        List[TracePair]] = {
+            dd_pair: [tpair] for dd_pair, tpair in rank_local.items()}
+
+    for dd_pair, tpairs in cross_rank.items():
+        merged.setdefault(dd_pair, []).extend(tpairs)
+
+    return merged
+
+# }}}
+
+
+# {{{ distributed: helper functions
+
+class _TagKeyBuilder(KeyBuilder):  # hashes type objects by module and name
+    def update_for_type(self, key_hash, key: Type[Any]):
+        self.rec(key_hash, (key.__module__, key.__qualname__, key.__name__,))
+
 
 @memoize_on_first_arg
-def connected_ranks(dcoll: DiscretizationCollection):
-    from meshmode.distributed import get_connected_partitions
-    return get_connected_partitions(dcoll._volume_discr.mesh)
+def _connected_parts(
+        dcoll: DiscretizationCollection,
+        self_volume_tag: VolumeTag,
+        other_volume_tag: VolumeTag
+        ) -> Sequence[PartID]:
+    """Return the :class:`PartID` of each part in volume *other_volume_tag* from
+    which *dcoll* has an inter-part connection into volume *self_volume_tag*.
+    """
+    return [
+        neighbor_id
+        for neighbor_id, own_id in dcoll._inter_part_connections
+        if (own_id.volume_tag == self_volume_tag
+            and neighbor_id.volume_tag == other_volume_tag)]
 
 
-class _RankBoundaryCommunication:
+def _sym_tag_to_num_tag(comm_tag: Optional[Hashable]) -> Optional[int]:
+    """Return an integer MPI tag for *comm_tag*.
+
+    *None* and integers pass through; anything else is hashed (with a warning).
+    """
+    if comm_tag is None or isinstance(comm_tag, int):
+        return comm_tag
+
+    # FIXME: This isn't guaranteed to be correct.
+    # See here for discussion:
+    # - https://github.com/illinois-ceesd/mirgecom/issues/617#issuecomment-1057082716  # noqa
+    # - https://github.com/inducer/grudge/pull/222
+
+    from mpi4py import MPI
+    tag_ub = MPI.COMM_WORLD.Get_attr(MPI.TAG_UB)
+    digest = _TagKeyBuilder()(comm_tag)
+
+    num_tag = sum(ord(ch) << i for i, ch in enumerate(digest)) % tag_ub
+
+    warn("Encountered unknown symbolic tag "
+            f"'{comm_tag}', assigning a value of '{num_tag}'. "
+            "This is a temporary workaround, please ensure that "
+            "tags are sufficiently distinct for your use case.")
+
+    return num_tag
+
+# }}}
+
+
+# {{{ eager rank-boundary communication
+
+class _RankBoundaryCommunicationEager:
     base_comm_tag = 1273
 
     def __init__(self,
-                 dcoll: DiscretizationCollection,
-                 array_container: ArrayOrContainer,
-                 remote_rank, comm_tag: Optional[int] = None):
-        actx = get_container_context_recursively(array_container)
-        btag = BTAG_PARTITION(remote_rank)
+            actx: ArrayContext,
+            dcoll: DiscretizationCollection,
+            *,
+            local_part_id: PartID,
+            remote_part_id: PartID,
+            local_bdry_data: ArrayOrContainer,
+            remote_bdry_data_template: ArrayOrContainer,
+            comm_tag: Optional[Hashable] = None):
 
-        local_bdry_data = project(dcoll, "vol", btag, array_container)
         comm = dcoll.mpi_communicator
+        assert comm is not None
+
+        remote_rank = remote_part_id.rank
+        assert remote_rank is not None
 
         self.dcoll = dcoll
         self.array_context = actx
-        self.remote_btag = btag
-        self.bdry_discr = dcoll.discr_from_dd(btag)
+        self.local_part_id = local_part_id
+        self.remote_part_id = remote_part_id
+        self.local_bdry_dd = DOFDesc(
+            BoundaryDomainTag(
+                BTAG_PARTITION(remote_part_id),
+                volume_tag=local_part_id.volume_tag),
+            DISCR_TAG_BASE)
+        self.bdry_discr = dcoll.discr_from_dd(self.local_bdry_dd)
         self.local_bdry_data = local_bdry_data
-        self.local_bdry_data_np = \
-            to_numpy(flatten(self.local_bdry_data, actx), actx)
+        self.remote_bdry_data_template = remote_bdry_data_template
 
         self.comm_tag = self.base_comm_tag
+        comm_tag = _sym_tag_to_num_tag(comm_tag)
         if comm_tag is not None:
             self.comm_tag += comm_tag
+        del comm_tag
 
-        # Here, we initialize both send and recieve operations through
-        # mpi4py `Request` (MPI_Request) instances for comm.Isend (MPI_Isend)
-        # and comm.Irecv (MPI_Irecv) respectively. These initiate non-blocking
-        # point-to-point communication requests and require explicit management
-        # via the use of wait (MPI_Wait, MPI_Waitall, MPI_Waitany, MPI_Waitsome),
-        # test (MPI_Test, MPI_Testall, MPI_Testany, MPI_Testsome), and cancel
-        # (MPI_Cancel). The rank-local data `self.local_bdry_data_np` will have its
-        # associated memory buffer sent across connected ranks and must not be
-        # modified at the Python level during this process. Completion of the
-        # requests is handled in :meth:`finish`.
-        #
-        # For more details on the mpi4py semantics, see:
-        # https://mpi4py.readthedocs.io/en/stable/overview.html#nonblocking-communications
-        #
         # NOTE: mpi4py currently (2021-11-03) holds a reference to the send
         # memory buffer for (i.e. `self.local_bdry_data_np`) until the send
         # requests is complete, however it is not clear that this is documented
         # behavior. We hold on to the buffer (via the instance attribute)
         # as well, just in case.
-        self.send_req = comm.Isend(self.local_bdry_data_np,
-                                   remote_rank,
-                                   tag=self.comm_tag)
-        self.remote_data_host_numpy = np.empty_like(self.local_bdry_data_np)
-        self.recv_req = comm.Irecv(self.remote_data_host_numpy,
-                                   remote_rank,
-                                   tag=self.comm_tag)
+        self.send_reqs = []
+        self.send_data = []
+
+        def send_single_array(key, local_subary):
+            if not isinstance(local_subary, Number):
+                local_subary_np = to_numpy(local_subary, actx)
+                self.send_reqs.append(
+                    comm.Isend(local_subary_np, remote_rank, tag=self.comm_tag))
+                self.send_data.append(local_subary_np)
+            return local_subary
+
+        self.recv_reqs = []
+        self.recv_data = {}
+
+        def recv_single_array(key, remote_subary_template):
+            if not isinstance(remote_subary_template, Number):
+                remote_subary_np = np.empty(
+                    remote_subary_template.shape,
+                    remote_subary_template.dtype)
+                self.recv_reqs.append(
+                    comm.Irecv(remote_subary_np, remote_rank, tag=self.comm_tag))
+                self.recv_data[key] = remote_subary_np
+            return remote_subary_template
+
+        from arraycontext.container.traversal import rec_keyed_map_array_container
+        rec_keyed_map_array_container(send_single_array, local_bdry_data)
+        rec_keyed_map_array_container(recv_single_array, remote_bdry_data_template)
 
     def finish(self):
-        # Wait for the nonblocking receive request to complete before
+        from mpi4py import MPI
+
+        # Wait for the nonblocking receive requests to complete before
         # accessing the data
-        self.recv_req.Wait()
+        MPI.Request.waitall(self.recv_reqs)
 
-        # Nonblocking receive is complete, we can now access the data and apply
-        # the boundary-swap connection
-        actx = self.array_context
-        remote_bdry_data_flat = from_numpy(self.remote_data_host_numpy, actx)
-        remote_bdry_data = unflatten(self.local_bdry_data,
-                                     remote_bdry_data_flat, actx)
-        bdry_conn = self.dcoll.distributed_boundary_swap_connection(
-            dof_desc.as_dofdesc(dof_desc.DTAG_BOUNDARY(self.remote_btag)))
-        swapped_remote_bdry_data = bdry_conn(remote_bdry_data)
+        def finish_single_array(key, remote_subary_template):
+            if isinstance(remote_subary_template, Number):
+                # NOTE: Assumes that the same number is passed on every rank
+                return remote_subary_template
+            else:
+                return from_numpy(self.recv_data[key], self.array_context)
 
-        # Complete the nonblocking send request associated with communicating
-        # `self.local_bdry_data_np`
-        self.send_req.Wait()
+        from arraycontext.container.traversal import rec_keyed_map_array_container
+        unswapped_remote_bdry_data = rec_keyed_map_array_container(
+            finish_single_array, self.remote_bdry_data_template)
+
+        remote_to_local = self.dcoll._inter_part_connections[
+            self.remote_part_id, self.local_part_id]
 
-        return TracePair(self.remote_btag,
-                         interior=self.local_bdry_data,
-                         exterior=swapped_remote_bdry_data)
+        def get_opposite_trace(ary):
+            if isinstance(ary, Number):
+                return ary
+            else:
+                return remote_to_local(ary)
 
+        from arraycontext import rec_map_array_container
+        from meshmode.dof_array import DOFArray
+        remote_bdry_data = rec_map_array_container(
+            get_opposite_trace,
+            unswapped_remote_bdry_data,
+            leaf_class=DOFArray)
 
-from pytato import make_distributed_recv, staple_distributed_send
+        # Complete the nonblocking send requests
+        MPI.Request.waitall(self.send_reqs)
 
+        return TracePair(
+                self.local_bdry_dd,
+                interior=self.local_bdry_data,
+                exterior=remote_bdry_data)
+
+# }}}
+
+
+# {{{ lazy rank-boundary communication
 
 class _RankBoundaryCommunicationLazy:
     def __init__(self,
-                 dcoll: DiscretizationCollection,
-                 array_container: ArrayOrContainer,
-                 remote_rank: int, comm_tag: Hashable):
+            actx: ArrayContext,
+            dcoll: DiscretizationCollection,
+            *,
+            local_part_id: PartID,
+            remote_part_id: PartID,
+            local_bdry_data: ArrayOrContainer,
+            remote_bdry_data_template: ArrayOrContainer,
+            comm_tag: Optional[Hashable] = None) -> None:
+
         if comm_tag is None:
-            raise ValueError("lazy communication requires 'tag' to be supplied")
+            raise ValueError("lazy communication requires 'comm_tag' to be supplied")
+
+        remote_rank = remote_part_id.rank
+        assert remote_rank is not None
 
         self.dcoll = dcoll
-        self.array_context = get_container_context_recursively(array_container)
-        self.remote_btag = BTAG_PARTITION(remote_rank)
-        self.bdry_discr = dcoll.discr_from_dd(self.remote_btag)
-
-        self.local_bdry_data = project(
-            dcoll, "vol", self.remote_btag, array_container)
-
-        def communicate_single_array(key, local_bdry_ary):
-            ary_tag = (comm_tag, key)
-            return staple_distributed_send(
-                    local_bdry_ary, dest_rank=remote_rank, comm_tag=ary_tag,
-                    stapled_to=make_distributed_recv(
+        self.array_context = actx
+        self.local_bdry_dd = DOFDesc(
+            BoundaryDomainTag(
+                BTAG_PARTITION(remote_part_id),
+                volume_tag=local_part_id.volume_tag),
+            DISCR_TAG_BASE)
+        self.bdry_discr = dcoll.discr_from_dd(self.local_bdry_dd)
+        self.local_part_id = local_part_id
+        self.remote_part_id = remote_part_id
+
+        from pytato import (
+            make_distributed_recv,
+            make_distributed_send,
+            DistributedSendRefHolder)
+
+        # TODO: This currently assumes that local_bdry_data and
+        # remote_bdry_data_template have the same structure. This is not true
+        # in general. Find a way to staple the sends appropriately when the number
+        # of recvs is not equal to the number of sends
+        # FIXME: Overly restrictive (just needs to be the same structure)
+        assert type(local_bdry_data) == type(remote_bdry_data_template)
+
+        sends = {}
+
+        def send_single_array(key, local_subary):
+            if isinstance(local_subary, Number):
+                return
+            else:
+                ary_tag = (comm_tag, key)
+                sends[key] = make_distributed_send(
+                    local_subary, dest_rank=remote_rank, comm_tag=ary_tag)
+
+        def recv_single_array(key, remote_subary_template):
+            if isinstance(remote_subary_template, Number):
+                # NOTE: Assumes that the same number is passed on every rank
+                return remote_subary_template
+            else:
+                ary_tag = (comm_tag, key)
+                return DistributedSendRefHolder(
+                    sends[key],
+                    make_distributed_recv(
                         src_rank=remote_rank, comm_tag=ary_tag,
-                        shape=local_bdry_ary.shape, dtype=local_bdry_ary.dtype,
-                        axes=local_bdry_ary.axes))
+                        shape=remote_subary_template.shape,
+                        dtype=remote_subary_template.dtype,
+                        axes=remote_subary_template.axes))
 
         from arraycontext.container.traversal import rec_keyed_map_array_container
-        self.remote_data = rec_keyed_map_array_container(
-                communicate_single_array, self.local_bdry_data)
+
+        rec_keyed_map_array_container(send_single_array, local_bdry_data)
+        self.local_bdry_data = local_bdry_data
+
+        self.unswapped_remote_bdry_data = rec_keyed_map_array_container(
+            recv_single_array, remote_bdry_data_template)
 
     def finish(self):
-        bdry_conn = self.dcoll.distributed_boundary_swap_connection(
-            dof_desc.as_dofdesc(dof_desc.DTAG_BOUNDARY(self.remote_btag)))
+        remote_to_local = self.dcoll._inter_part_connections[
+            self.remote_part_id, self.local_part_id]
+
+        def get_opposite_trace(ary):
+            if isinstance(ary, Number):
+                return ary
+            else:
+                return remote_to_local(ary)
+
+        from arraycontext import rec_map_array_container
+        from meshmode.dof_array import DOFArray
+        remote_bdry_data = rec_map_array_container(
+            get_opposite_trace,
+            self.unswapped_remote_bdry_data,
+            leaf_class=DOFArray)
+
+        return TracePair(
+                self.local_bdry_dd,
+                interior=self.local_bdry_data,
+                exterior=remote_bdry_data)
 
-        return TracePair(self.remote_btag,
-                         interior=self.local_bdry_data,
-                         exterior=bdry_conn(self.remote_data))
+# }}}
 
 
-class _TagKeyBuilder(KeyBuilder):
-    def update_for_type(self, key_hash, key: Type[Any]):
-        self.rec(key_hash, (key.__module__, key.__name__, key.__name__,))
+# {{{ cross_rank_trace_pairs
+
+def _replace_dof_arrays(array_container, dof_array):
+    """Return *array_container* with each DOFArray leaf replaced by *dof_array*."""
+    from arraycontext import rec_map_array_container
+    from meshmode.dof_array import DOFArray
+    return rec_map_array_container(
+        lambda leaf: dof_array if isinstance(leaf, DOFArray) else leaf,
+        array_container, leaf_class=DOFArray)
 
 
 def cross_rank_trace_pairs(
-        dcoll: DiscretizationCollection, ary,
-        comm_tag: Hashable = None,
-        tag: Hashable = None) -> List[TracePair]:
+        dcoll: DiscretizationCollection, ary: ArrayOrContainer,
+        tag: Hashable = None,
+        *, comm_tag: Hashable = None,
+        volume_dd: Optional[DOFDesc] = None) -> List[TracePair]:
     r"""Get a :class:`list` of *ary* trace pairs for each partition boundary.
 
     For each partition boundary, the field data values in *ary* are
-    communicated to/from the neighboring partition. Presumably, this
-    communication is MPI (but strictly speaking, may not be, and this
-    routine is agnostic to the underlying communication).
+    communicated to/from the neighboring part. Presumably, this communication
+    is MPI (but strictly speaking, may not be, and this routine is agnostic to
+    the underlying communication).
 
     For each face on each partition boundary, a
     :class:`TracePair` is created with the locally, and
@@ -472,61 +795,227 @@ def cross_rank_trace_pairs(
 
     :returns: a :class:`list` of :class:`TracePair` objects.
     """
+    # {{{ process arguments
+
+    if volume_dd is None:
+        volume_dd = DD_VOLUME_ALL
+
+    if not isinstance(volume_dd.domain_tag, VolumeDomainTag):
+        raise TypeError(f"expected a volume DOFDesc, got '{volume_dd}'")
+    if volume_dd.discretization_tag != DISCR_TAG_BASE:
+        raise TypeError(f"expected a base-discretized DOFDesc, got '{volume_dd}'")
+
     if tag is not None:
-        from warnings import warn
         warn("Specifying 'tag' is deprecated and will stop working in July of 2022. "
-                "Specify 'comm_tag' instead.", DeprecationWarning, stacklevel=2)
+                "Specify 'comm_tag' (keyword-only) instead.",
+                DeprecationWarning, stacklevel=2)
         if comm_tag is not None:
             raise TypeError("may only specify one of 'tag' and 'comm_tag'")
         else:
             comm_tag = tag
     del tag
 
-    if isinstance(ary, Number):
-        # NOTE: Assumed that the same number is passed on every rank
-        return [TracePair(BTAG_PARTITION(remote_rank), interior=ary, exterior=ary)
-                for remote_rank in connected_ranks(dcoll)]
+    # }}}
+
+    if dcoll.mpi_communicator is None:
+        return []
+
+    rank = dcoll.mpi_communicator.Get_rank()
+
+    local_part_id = PartID(volume_dd.domain_tag.tag, rank)
+
+    connected_part_ids = _connected_parts(
+            dcoll, self_volume_tag=volume_dd.domain_tag.tag,
+            other_volume_tag=volume_dd.domain_tag.tag)
+
+    remote_part_ids = [
+        part_id
+        for part_id in connected_part_ids
+        if part_id.rank != rank]
 
-    actx = get_container_context_recursively(ary)
+    # This asserts that there is only one data exchange per rank, so that
+    # there is no risk of mismatched data reaching the wrong recipient.
+    # (Since we have only a single tag.)
+    assert len(remote_part_ids) == len({part_id.rank for part_id in remote_part_ids})
+
+    actx = get_container_context_recursively_opt(ary)
+
+    if actx is None:
+        # NOTE: Assumes that the same number is passed on every rank
+        return [
+            TracePair(
+                volume_dd.trace(BTAG_PARTITION(remote_part_id)),
+                interior=ary, exterior=ary)
+            for remote_part_id in remote_part_ids]
 
     from grudge.array_context import MPIPytatoArrayContextBase
 
     if isinstance(actx, MPIPytatoArrayContextBase):
-        rbc = _RankBoundaryCommunicationLazy
+        rbc_class = _RankBoundaryCommunicationLazy
     else:
-        rbc = _RankBoundaryCommunication
-        if comm_tag is not None:
-            num_tag: Optional[int] = None
-            if isinstance(comm_tag, int):
-                num_tag = comm_tag
-
-            if num_tag is None:
-                # FIXME: This isn't guaranteed to be correct.
-                # See here for discussion:
-                # - https://github.com/illinois-ceesd/mirgecom/issues/617#issuecomment-1057082716  # noqa
-                # - https://github.com/inducer/grudge/pull/222
-                from mpi4py import MPI
-                tag_ub = MPI.COMM_WORLD.Get_attr(MPI.TAG_UB)
-                key_builder = _TagKeyBuilder()
-                digest = key_builder(comm_tag)
-                num_tag = sum(ord(ch) << i for i, ch in enumerate(digest)) % tag_ub
-
-                from warnings import warn
-                warn("Encountered unknown symbolic tag "
-                        f"'{comm_tag}', assigning a value of '{num_tag}'. "
-                        "This is a temporary workaround, please ensure that "
-                        "tags are sufficiently distinct for your use case.")
-
-            comm_tag = num_tag
-
-    # Initialize and post all sends/receives
-    rank_bdry_communcators = [
-        rbc(dcoll, ary, remote_rank, comm_tag=comm_tag)
-        for remote_rank in connected_ranks(dcoll)
-    ]
-
-    # Complete send/receives and return communicated data
-    return [rc.finish() for rc in rank_bdry_communcators]
+        rbc_class = _RankBoundaryCommunicationEager
+
+    rank_bdry_communicators = []
+
+    for remote_part_id in remote_part_ids:
+        bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_part_id))
+
+        local_bdry_data = project(dcoll, volume_dd, bdry_dd, ary)
+
+        from arraycontext import tag_axes
+        from meshmode.transform_metadata import (
+            DiscretizationElementAxisTag,
+            DiscretizationDOFAxisTag)
+        remote_bdry_zeros = tag_axes(
+            actx, {
+                0: DiscretizationElementAxisTag(),
+                1: DiscretizationDOFAxisTag()},
+            dcoll._inter_part_connections[
+                remote_part_id, local_part_id].from_discr.zeros(actx))
+
+        remote_bdry_data_template = _replace_dof_arrays(
+            local_bdry_data, remote_bdry_zeros)
+
+        rank_bdry_communicators.append(
+            rbc_class(actx, dcoll,
+                local_part_id=local_part_id,
+                remote_part_id=remote_part_id,
+                local_bdry_data=local_bdry_data,
+                remote_bdry_data_template=remote_bdry_data_template,
+                comm_tag=comm_tag))
+
+    return [rbc.finish() for rbc in rank_bdry_communicators]
+
+# }}}
+
+
+# {{{ cross_rank_inter_volume_trace_pairs
+
+def cross_rank_inter_volume_trace_pairs(
+        dcoll: DiscretizationCollection,
+        pairwise_volume_data: Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            Tuple[ArrayOrContainer, ArrayOrContainer]],
+        *, comm_tag: Hashable = None,
+        ) -> Mapping[
+            Tuple[DOFDesc, DOFDesc],
+            List[TracePair]]:
+    # FIXME: Should this interface take in boundary data instead?
+    r"""Exchange inter-volume boundary data with connected remote parts.
+
+    :arg pairwise_volume_data: maps pairs of volume :class:`DOFDesc`\ s to
+        the pair of data living on those volumes, in the same order.
+    :arg comm_tag: a hashable object used to match sent and received data
+        across ranks. Communication will only match if both endpoints specify
+        objects that compare equal. A generalization of MPI communication
+        tags to arbitrary, potentially composite objects.
+
+    :returns: a mapping from volume :class:`DOFDesc` pairs to a :class:`list`
+        of :class:`TracePair`\ s; empty if *dcoll* has no MPI communicator.
+    :raises ValueError: if a key is not a base-discretized volume descriptor.
+    """
+    # {{{ process arguments
+
+    for vol_dd_pair in pairwise_volume_data:
+        for vol_dd in vol_dd_pair:
+            if not isinstance(vol_dd.domain_tag, VolumeDomainTag):
+                raise ValueError(
+                    "pairwise_volume_data keys must describe volumes, "
+                    f"got '{vol_dd}'")
+            if vol_dd.discretization_tag != DISCR_TAG_BASE:
+                raise ValueError(
+                    "expected base-discretized DOFDesc in pairwise_volume_data, "
+                    f"got '{vol_dd}'")
+
+    # }}}
+
+    if dcoll.mpi_communicator is None:
+        return {}
+
+    rank = dcoll.mpi_communicator.Get_rank()
+
+    actx = None  # remains None if no data carries an array context
+    for vol_data_pair in pairwise_volume_data.values():
+        for vol_data in vol_data_pair:
+            if actx is None:
+                actx = get_container_context_recursively_opt(vol_data)
+
+    def get_remote_connected_parts(local_vol_dd, remote_vol_dd):
+        connected_part_ids = _connected_parts(
+            dcoll, self_volume_tag=local_vol_dd.domain_tag.tag,
+            other_volume_tag=remote_vol_dd.domain_tag.tag)
+        return [
+            part_id
+            for part_id in connected_part_ids
+            if part_id.rank != rank]
+
+    if actx is None:
+        # NOTE: Assumes that the same number is passed on every rank for a
+        # given volume
+        return {
+            (remote_vol_dd, local_vol_dd): [
+                TracePair(
+                    local_vol_dd.trace(BTAG_PARTITION(remote_part_id)),
+                    interior=local_vol_ary, exterior=remote_vol_ary)
+                for remote_part_id in get_remote_connected_parts(
+                    local_vol_dd, remote_vol_dd)]
+            for (remote_vol_dd, local_vol_dd), (remote_vol_ary, local_vol_ary)
+            in pairwise_volume_data.items()}
+
+    from arraycontext import tag_axes
+    from meshmode.transform_metadata import (
+        DiscretizationElementAxisTag, DiscretizationDOFAxisTag)
+    from grudge.array_context import MPIPytatoArrayContextBase
+
+    if isinstance(actx, MPIPytatoArrayContextBase):
+        rbc_class = _RankBoundaryCommunicationLazy
+    else:
+        rbc_class = _RankBoundaryCommunicationEager
+
+    rank_bdry_communicators = {}
+
+    for vol_dd_pair, vol_data_pair in pairwise_volume_data.items():
+        directional_volume_data = {
+            (vol_dd_pair[0], vol_dd_pair[1]): (vol_data_pair[0], vol_data_pair[1]),
+            (vol_dd_pair[1], vol_dd_pair[0]): (vol_data_pair[1], vol_data_pair[0])}
+
+        for dd_pair, data_pair in directional_volume_data.items():
+            other_vol_dd, self_vol_dd = dd_pair
+            other_vol_data, self_vol_data = data_pair
+
+            self_part_id = PartID(self_vol_dd.domain_tag.tag, rank)
+            other_part_ids = get_remote_connected_parts(self_vol_dd, other_vol_dd)
+
+            rbcs = []
+
+            for other_part_id in other_part_ids:
+                self_bdry_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id))
+                self_bdry_data = project(
+                    dcoll, self_vol_dd, self_bdry_dd, self_vol_data)
+
+                other_bdry_zeros = tag_axes(
+                    actx, {
+                        0: DiscretizationElementAxisTag(),
+                        1: DiscretizationDOFAxisTag()},
+                    dcoll._inter_part_connections[
+                        other_part_id, self_part_id].from_discr.zeros(actx))
+
+                other_bdry_data_template = _replace_dof_arrays(
+                    other_vol_data, other_bdry_zeros)
+
+                rbcs.append(
+                    rbc_class(actx, dcoll,
+                        local_part_id=self_part_id,
+                        remote_part_id=other_part_id,
+                        local_bdry_data=self_bdry_data,
+                        remote_bdry_data_template=other_bdry_data_template,
+                        comm_tag=comm_tag))
+
+            rank_bdry_communicators[other_vol_dd, self_vol_dd] = rbcs
+
+    return {
+        directional_vol_dd_pair: [rbc.finish() for rbc in rbcs]
+        for directional_vol_dd_pair, rbcs in rank_bdry_communicators.items()}
 
 # }}}
 
diff --git a/prepare-and-run-flake8.sh b/prepare-and-run-flake8.sh
new file mode 100755
index 000000000..d22a4d874
--- /dev/null
+++ b/prepare-and-run-flake8.sh
@@ -0,0 +1,9 @@
+#! /bin/bash
+
+curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh
+source ci-support.sh
+
+print_status_message
+clean_up_repo_and_working_env
+create_and_set_up_virtualenv
+install_and_run_flake8 "$@"
diff --git a/requirements.txt b/requirements.txt
index 2107e5aeb..5e21bd1e9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,18 @@
 numpy
 mpi4py
+gmsh
+hjson
 git+https://github.com/inducer/pytools.git#egg=pytools
 git+https://github.com/inducer/pymbolic.git#egg=pymbolic
 git+https://github.com/inducer/islpy.git#egg=islpy
 git+https://github.com/inducer/pyopencl.git#egg=pyopencl
-git+https://github.com/inducer/loopy.git#egg=loopy
+git+https://github.com/inducer/loopy.git@more-0-strides-fixing#egg=loopy
 git+https://github.com/inducer/dagrt.git#egg=dagrt
 git+https://github.com/inducer/leap.git#egg=leap
 git+https://github.com/inducer/meshpy.git#egg=meshpy
 git+https://github.com/inducer/modepy.git#egg=modepy
+git+https://github.com/nchristensen/meshmode.git@dof_tagging#egg=meshmode
 git+https://github.com/inducer/arraycontext.git#egg=arraycontext
-git+https://github.com/inducer/meshmode.git#egg=meshmode
 git+https://github.com/inducer/pyvisfile.git#egg=pyvisfile
 git+https://github.com/inducer/pymetis.git#egg=pymetis
 git+https://github.com/illinois-ceesd/logpyle.git#egg=logpyle
diff --git a/setup.cfg b/setup.cfg
index da67c8630..9b8c87fc4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,6 +12,8 @@ per-file-ignores =
     test/test_op.py:B023
     test/test_euler_model.py:B023
 
+#per-file-ignores =
+#  grudge/loopy_dg_kernels/run_tests.py:N806, N803, N802
 # enable-flake8-bugbear
 
 [mypy]
diff --git a/setup.py b/setup.py
index adcbd6c02..aeb776597 100644
--- a/setup.py
+++ b/setup.py
@@ -6,6 +6,7 @@ def main():
 
     version_dict = {}
     init_filename = "grudge/version.py"
+
     exec(compile(open(init_filename, "r").read(), init_filename, "exec"),
             version_dict)
 
@@ -47,9 +48,12 @@ def main():
             "meshmode>=2020.2",
             "pyopencl>=2013.1",
             "pymbolic>=2013.2",
-            "loopy>=2020.2",
+            "loopy>=2020.2.2",
             "cgen>=2013.1.2",
-            "dataclasses>=0.7;python_version<='3.6'"
+            "hjson",
+            #"gmsh",
+            "import_resources; python_version<'3.7'",
+            "dataclasses>=0.7; python_version<='3.6'"
         ],
     )
 
diff --git a/test/test_grudge.py b/test/test_grudge.py
index ce0b199b8..9b119d9d7 100644
--- a/test/test_grudge.py
+++ b/test/test_grudge.py
@@ -38,19 +38,64 @@
 
 from pytools.obj_array import flat_obj_array
 
-from grudge import DiscretizationCollection
+from grudge import DiscretizationCollection, make_discretization_collection
 
 import grudge.dof_desc as dof_desc
 import grudge.op as op
 
 
 import pytest
+from meshmode.array_context import generate_pytest_generate_tests
+from grudge.grudge_array_context import GrudgeArrayContext
+pytest_generate_tests = generate_pytest_generate_tests(GrudgeArrayContext)
 
 import logging
 
 logger = logging.getLogger(__name__)
 
 
+# {{{ inverse metric
+
+@pytest.mark.parametrize("dim", [2, 3])
+def test_inverse_metric(actx_factory, dim):
+    actx = actx_factory()
+
+    mesh = mgen.generate_regular_rect_mesh(a=(-0.5,)*dim, b=(0.5,)*dim,
+            nelements_per_axis=(6,)*dim, order=4)
+
+    def m(x):
+        result = np.empty_like(x)
+        result[0] = (
+                1.5*x[0] + np.cos(x[0])
+                + 0.1*np.sin(10*x[1]))
+        result[1] = (
+                0.05*np.cos(10*x[0])
+                + 1.3*x[1] + np.sin(x[1]))
+        if len(x) == 3:
+            result[2] = x[2]
+        return result
+
+    from meshmode.mesh.processing import map_mesh
+    mesh = map_mesh(mesh, m)
+
+    dcoll = DiscretizationCollection(actx, mesh, order=4)
+
+    from grudge.geometry import \
+        forward_metric_derivative_mat, inverse_metric_derivative_mat
+
+    mat = forward_metric_derivative_mat(actx, dcoll).dot(
+        inverse_metric_derivative_mat(actx, dcoll))
+
+    for i in range(mesh.dim):
+        for j in range(mesh.dim):
+            tgt = 1 if i == j else 0
+
+            err = flat_norm(mat[i, j] - tgt, ord=np.inf)
+            logger.info("error[%d, %d]: %.5e", i, j, err)
+            assert err < 1.0e-12, (i, j, err)
+
+# }}}
+
 # {{{ mass operator trig integration
 
 @pytest.mark.parametrize("ambient_dim", [1, 2, 3])
@@ -341,7 +386,10 @@ def test_face_normal_surface(actx_factory, mesh_name):
     surf_normal = surf_normal / actx.np.sqrt(sum(surf_normal**2))
 
     face_normal_i = actx.thaw(dcoll.normal(df))
-    face_normal_e = dcoll.opposite_face_connection()(face_normal_i)
+    face_normal_e = dcoll.opposite_face_connection(
+            dof_desc.BoundaryDomainTag(
+                dof_desc.FACE_RESTR_INTERIOR, dof_desc.VTAG_ALL)
+            )(face_normal_i)
 
     if mesh.ambient_dim == 3:
         from grudge.geometry import pseudoscalar, area_element
@@ -618,10 +666,9 @@ def f(x):
             or eoc_local.order_estimate() > order - 0.5
 
 # }}}
-
-
 # {{{ models: advection
 
+
 @pytest.mark.parametrize(("mesh_name", "mesh_pars"), [
     ("segment", [8, 16, 32]),
     ("disk", [0.07, 0.02, 0.01]),
@@ -780,9 +827,9 @@ def rhs(t, u):
 
 # }}}
 
-
 # {{{ models: maxwell
 
+
 @pytest.mark.parametrize("order", [3, 4, 5])
 def test_convergence_maxwell(actx_factory,  order):
     """Test whether 3D Maxwell's actually converges"""
@@ -857,9 +904,9 @@ def rhs(t, w):
 
 # }}}
 
-
 # {{{ models: variable coefficient advection oversampling
 
+
 @pytest.mark.parametrize("order", [2, 3, 4])
 def test_improvement_quadrature(actx_factory, order):
     """Test whether quadrature improves things and converges"""
@@ -941,6 +988,34 @@ def zero_inflow(dtag, t=0):
 
 # }}}
 
+# {{{ operator collector determinism
+
+
+def test_op_collector_order_determinism():
+    class TestOperator(sym.Operator):
+
+        def __init__(self):
+            sym.Operator.__init__(self, dof_desc.DD_VOLUME, dof_desc.DD_VOLUME)
+
+        mapper_method = "map_test_operator"
+
+    from grudge.symbolic.mappers import BoundOperatorCollector
+
+    class TestBoundOperatorCollector(BoundOperatorCollector):
+
+        def map_test_operator(self, expr):
+            return self.map_operator(expr)
+
+    v0 = sym.var("v0")
+    ob0 = sym.OperatorBinding(TestOperator(), v0)
+
+    v1 = sym.var("v1")
+    ob1 = sym.OperatorBinding(TestOperator(), v1)
+
+    # The output order isn't significant, but it should always be the same.
+    assert list(TestBoundOperatorCollector(TestOperator)(ob0 + ob1)) == [ob0, ob1]
+
+# }}}
 
 # {{{ bessel
 
@@ -978,6 +1053,47 @@ def bessel_j(actx, n, r):
 
 # }}}
 
+# {{{ function symbol
+
+
+def test_external_call(actx_factory):
+    actx = actx_factory()
+
+    def double(queue, x):
+        return 2 * x
+
+    dims = 2
+
+    mesh = mgen.generate_regular_rect_mesh(
+            a=(0,) * dims, b=(1,) * dims, nelements_per_axis=(4,) * dims)
+    discr = DiscretizationCollection(actx, mesh, order=1)
+
+    ones = sym.Ones(dof_desc.DD_VOLUME)
+    op = (
+            ones * 3
+            + sym.FunctionSymbol("double")(ones))
+
+    from grudge.function_registry import (
+            base_function_registry, register_external_function)
+
+    freg = register_external_function(
+            base_function_registry,
+            "double",
+            implementation=double,
+            dd=dof_desc.DD_VOLUME)
+
+    bound_op = bind(discr, op, function_registry=freg)
+
+    result = bound_op(actx, double=double)
+    assert actx.to_numpy(flatten(result) == 5).all()
+
+
+@pytest.mark.parametrize("array_type", ["scalar", "vector"])
+def test_function_symbol_array(actx_factory, array_type):
+    """Test if `FunctionSymbol` distributed properly over object arrays."""
+
+
+# {{{ test norms
 
 @pytest.mark.parametrize("p", [2, np.inf])
 def test_norm_real(actx_factory, p):
@@ -1042,6 +1158,10 @@ def test_norm_obj_array(actx_factory, p):
     logger.info("norm: %.5e %.5e", norm, ref_norm)
     assert abs(norm-ref_norm) / abs(ref_norm) < 1e-14
 
+# }}}
+
+
+# {{{ empty boundaries
 
 def test_empty_boundary(actx_factory):
     # https://github.com/inducer/grudge/issues/54
@@ -1061,10 +1181,39 @@ def test_empty_boundary(actx_factory):
         assert isinstance(component, DOFArray)
         assert len(component) == len(dcoll.discr_from_dd(BTAG_NONE).groups)
 
+# }}}
+
+
+# {{{ multi-volume
+
+def test_multi_volume(actx_factory):
+    dim = 2
+    actx = actx_factory()
+
+    mesh = mgen.generate_regular_rect_mesh(
+            a=(-0.5,)*dim, b=(0.5,)*dim,
+            nelements_per_axis=(8,)*dim, order=4)
+
+    meg, = mesh.groups
+    x = mesh.vertices[0, meg.vertex_indices]
+    x_elem_avg = np.sum(x, axis=1)/x.shape[1]
+    volume_per_element = (x_elem_avg > 0).astype(np.int32)
+
+    from meshmode.distributed import membership_list_to_map
+    volume_to_elements = membership_list_to_map(volume_per_element)
+
+    from meshmode.mesh.processing import partition_mesh
+    volume_to_mesh = partition_mesh(mesh, volume_to_elements)
+
+    make_discretization_collection(actx, volume_to_mesh, order=4)
+
+# }}}
+
 
 # You can test individual routines by typing
 # $ python test_grudge.py 'test_routine()'
 
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:
diff --git a/test/test_mpi_communication.py b/test/test_mpi_communication.py
index 47100b415..6d1d26f40 100644
--- a/test/test_mpi_communication.py
+++ b/test/test_mpi_communication.py
@@ -31,6 +31,7 @@
 import logging
 import sys
 
+from grudge.grudge_array_context import GrudgeArrayContext
 from grudge.array_context import MPIPyOpenCLArrayContext, MPIPytatoArrayContext
 
 logger = logging.getLogger(__name__)
@@ -45,6 +46,7 @@
 from pytools.obj_array import flat_obj_array
 
 import grudge.op as op
+import grudge.dof_desc as dof_desc
 
 
 class SimpleTag:
@@ -153,7 +155,10 @@ def hopefully_zero():
         return (
             op.project(
                 dcoll, "int_faces", "all_faces",
-                dcoll.opposite_face_connection()(int_faces_func)
+                dcoll.opposite_face_connection(
+                    dof_desc.BoundaryDomainTag(
+                        dof_desc.FACE_RESTR_INTERIOR, dof_desc.VTAG_ALL)
+                    )(int_faces_func)
             )
             + sum(op.project(dcoll, tpair.dd, "all_faces", tpair.ext)
                   for tpair in op.cross_rank_trace_pairs(dcoll, myfunc,
@@ -170,7 +175,6 @@ def hopefully_zero():
 
     assert error < 1e-14
 
-
 # }}}
 
 
diff --git a/test/test_reductions.py b/test/test_reductions.py
index 9e7387bad..e0e424fe4 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -149,7 +149,7 @@ def f(x):
         min_res = np.empty(grp_f.shape)
         max_res = np.empty(grp_f.shape)
         sum_res = np.empty(grp_f.shape)
-        for eidx in range(dcoll.mesh.nelements):
+        for eidx in range(mesh.nelements):
             element_data = actx.to_numpy(grp_f[eidx])
             min_res[eidx, :] = np.min(element_data)
             max_res[eidx, :] = np.max(element_data)
@@ -272,7 +272,7 @@ def _get_ref_data(field):
             min_res = np.empty(grp_f.shape)
             max_res = np.empty(grp_f.shape)
             sum_res = np.empty(grp_f.shape)
-            for eidx in range(dcoll.mesh.nelements):
+            for eidx in range(mesh.nelements):
                 element_data = actx.to_numpy(grp_f[eidx])
                 min_res[eidx, :] = np.min(element_data)
                 max_res[eidx, :] = np.max(element_data)