diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 32707fe86..234988280 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,6 +7,10 @@ on:
     schedule:
         - cron:  '17 3 * * 0'
 
+concurrency:
+    group: ${{ github.head_ref || github.ref_name }}
+    cancel-in-progress: true
+
 jobs:
     ruff:
         name: Ruff
@@ -20,6 +24,13 @@ jobs:
                 pipx install ruff
                 ruff check
 
+    typos:
+        name: Typos
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v4
+        -   uses: crate-ci/typos@master
+
     pylint:
         name: Pylint
         runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
index 7cf3c4751..4378c7122 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,8 +21,6 @@ lextab.py
 yacctab.py
 .pytest_cache/*
 
-loopy/_git_rev.py
-
 .cache
 .env
 virtualenv-[0-9]*[0-9]
diff --git a/MEMO b/MEMO
index f4e5c34e4..5a9438811 100644
--- a/MEMO
+++ b/MEMO
@@ -7,7 +7,7 @@ Documentation Notes
 Things to consider
 ^^^^^^^^^^^^^^^^^^
 
-- Depedencies are pointwise for shared loop dimensions
+- Dependencies are pointwise for shared loop dimensions
   and global over non-shared ones (between dependent and ancestor)
 
 - multiple insns could fight over which iname gets local axis 0
diff --git a/README.rst b/README.rst
index 1ef7773db..87c4887a3 100644
--- a/README.rst
+++ b/README.rst
@@ -4,9 +4,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code
 .. image:: https://gitlab.tiker.net/inducer/loopy/badges/main/pipeline.svg
     :alt: Gitlab Build Status
     :target: https://gitlab.tiker.net/inducer/loopy/commits/main
-.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main&event=push
+.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main
     :alt: Github Build Status
-    :target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI+event%3Apush
+    :target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI
 .. image:: https://badge.fury.io/py/loopy.png
     :alt: Python Package Index Release Page
     :target: https://pypi.org/project/loopy/
diff --git a/contrib/mem-pattern-explorer/pattern_vis.py b/contrib/mem-pattern-explorer/pattern_vis.py
index 82a2b9602..bbde23174 100644
--- a/contrib/mem-pattern-explorer/pattern_vis.py
+++ b/contrib/mem-pattern-explorer/pattern_vis.py
@@ -76,7 +76,7 @@ def tick(self):
 class Array:
     def __init__(self, ctx, name, shape, strides, elements_per_row=None):
         # Each array element stores a tuple:
-        # (timestamp, subgroup, g0, g1, g2, ) of last acccess
+        # (timestamp, subgroup, g0, g1, g2, ) of last access
 
         assert len(shape) == len(strides)
 
diff --git a/doc/misc.rst b/doc/misc.rst
index 3fea6fdd4..be1c964cd 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -158,7 +158,7 @@ In the meantime, you can generate code simply by saying::
     print(cg_result.host_code())
     print(cg_result.device_code())
 
-Additionally, for C-based languages, header defintions are available via::
+Additionally, for C-based languages, header definitions are available via::
 
     loopy.generate_header(knl)
 
@@ -338,8 +338,8 @@ This list is always growing, but here are a few pointers:
 
   Use :func:`loopy.join_inames`.
 
-In what sense does Loopy suport vectorization?
-----------------------------------------------
+In what sense does Loopy support vectorization?
+-----------------------------------------------
 
 There are really two ways in which the OpenCL/CUDA model of computation exposes
 vectorization:
@@ -352,7 +352,7 @@ vectorization:
   e.g. ``float4``, which support arithmetic with implicit vector semantics
   as well as a number of 'intrinsic' functions.
 
-Loopy suports both. The first one, SIMT, is accessible by tagging inames with,
+Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
-e.g., ``l.0```. Accessing the second one requires using both execution- and
+e.g., ``l.0``. Accessing the second one requires using both execution- and
 data-reshaping capabilities in loopy. To start with, you need an array that
 has an axis with the length of the desired vector. If that's not yet available,
diff --git a/doc/ref_internals.rst b/doc/ref_internals.rst
index 3dc0a2bd7..86e2edad2 100644
--- a/doc/ref_internals.rst
+++ b/doc/ref_internals.rst
@@ -53,3 +53,7 @@ Schedule
 --------
 
 .. automodule:: loopy.schedule
+.. automodule:: loopy.schedule.tools
+.. automodule:: loopy.schedule.tree
+
+
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 1fa237b25..2962c23b8 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -262,6 +262,7 @@ Instructions
 
 .. {{{
 
+.. autoclass:: HappensAfter
 .. autoclass:: InstructionBase
 
 .. _assignments:
diff --git a/doc/ref_other.rst b/doc/ref_other.rst
index e5059380d..538f0cdb9 100644
--- a/doc/ref_other.rst
+++ b/doc/ref_other.rst
@@ -1,6 +1,11 @@
 Reference: Other Functionality
 ==============================
 
+Auxiliary Data Types
+--------------------
+
+.. automodule:: loopy.typing
+
 Obtaining Kernel Performance Statistics
 ---------------------------------------
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index a697bed30..4aeb42428 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -438,7 +438,8 @@ with identical bounds, for the use of the transpose:
     ...     out[ii,jj] = 2*out[ii,jj]  {dep=transpose}
     ...     """,
     ...     [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
-    >>> knl = lp.prioritize_loops(knl, "i,j,ii,jj")
+    >>> knl = lp.prioritize_loops(knl, "i,j")
+    >>> knl = lp.prioritize_loops(knl, "ii,jj")
 
 :func:`loopy.duplicate_inames` can be used to achieve the same goal.
 Now the intended code is generated and our test passes.
@@ -613,7 +614,7 @@ commonly called 'loop tiling':
     ...     assumptions="n mod 16 = 0 and n >= 1")
     >>> knl = lp.split_iname(knl, "i", 16)
     >>> knl = lp.split_iname(knl, "j", 16)
-    >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner")
+    >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
     >>> knl = lp.set_options(knl, write_code=True)
     >>> evt, (out,) = knl(queue, a=a_mat_dev)
     #define lid(N) ((int) get_local_id(N))
@@ -822,7 +823,7 @@ enabling some cost savings:
       {
         int const i_outer = -1 + n + -1 * ((3 * n) / 4);
     <BLANKLINE>
-        if (-1 + n >= 0)
+        if (i_outer >= 0)
         {
           a[4 * i_outer] = (float) (0.0f);
           if (-2 + -4 * i_outer + n >= 0)
@@ -957,7 +958,7 @@ Consider the following example:
     ...     "{ [i_outer,i_inner, k]:  "
     ...          "0<= 16*i_outer + i_inner <n and 0<= i_inner,k <16}",
     ...     """
-    ...     <> a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10}
+    ...     <> a_temp[i_inner] = a[16*i_outer + i_inner]
     ...     out[16*i_outer + i_inner] = sum(k, a_temp[k])
     ...     """)
     >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
@@ -1032,8 +1033,8 @@ transformation exists in :func:`loopy.add_prefetch`:
     >>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-        acc_k = 0.0f;
         a_fetch = a[16 * gid(0) + lid(0)];
+        acc_k = 0.0f;
         for (int k = 0; k <= 15; ++k)
           acc_k = acc_k + a_fetch;
         out[16 * gid(0) + lid(0)] = acc_k;
@@ -1056,12 +1057,11 @@ earlier:
     >>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
-        acc_k = 0.0f;
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
         a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
+        acc_k = 0.0f;
         for (int k = 0; k <= 15; ++k)
           acc_k = acc_k + a_fetch[lid(0)];
         out[16 * gid(0) + lid(0)] = acc_k;
@@ -1209,6 +1209,12 @@ Let us start with an example. Consider the kernel from above with a
    ...     assumptions="n mod 16 = 0")
    >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0")
 
+.. testsetup::
+
+    >>> prog = prog.with_kernel(
+    ...    prog.default_entrypoint.copy(
+    ...        silenced_warnings=["v1_scheduler_fallback"]))
+
 Here is what happens when we try to generate code for the kernel:
 
    >>> cgr = lp.generate_code_v2(prog)
@@ -1312,7 +1318,7 @@ The kernel translates into two OpenCL kernels.
      int tmp;
    <BLANKLINE>
      tmp = tmp_save_slot[16 * gid(0) + lid(0)];
-     arr[(lid(0) + gid(0) * 16 + 1) % n] = tmp;
+     arr[(1 + lid(0) + gid(0) * 16) % n] = tmp;
    }
 
 Now we can execute the kernel.
@@ -1903,18 +1909,16 @@ Now to make things more interesting, we'll create a kernel with barriers:
     {
       __local int c[50 * 10 * 99];
     <BLANKLINE>
-      {
-        int const k_outer = 0;
-    <BLANKLINE>
+      for (int i = 0; i <= 49; ++i)
         for (int j = 0; j <= 9; ++j)
-          for (int i = 0; i <= 49; ++i)
-          {
-            barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
-            c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
-            barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
-            e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
-          }
-      }
+        {
+          int const k_outer = 0;
+    <BLANKLINE>
+          barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
+          c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
+          barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
+          e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
+        }
     }
 
 In this kernel, when a work-item performs the second instruction it uses data
diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb
index d9ac1f1b2..64fcb0af4 100644
--- a/examples/fortran/ipython-integration-demo.ipynb
+++ b/examples/fortran/ipython-integration-demo.ipynb
@@ -49,7 +49,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(prog)"
+    "print(prog)  # noqa: F821"
    ]
   },
   {
@@ -105,17 +105,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(prog)"
+    "print(prog)  # noqa: F821"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
index 070e0d071..bf6e29e47 100644
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
@@ -24,10 +24,7 @@ def transform(knl, vars, stream_dtype):
         knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
     knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
 
-    knl = lp.add_and_infer_dtypes(knl, {
-        var: stream_dtype
-        for var in vars
-        })
+    knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))
 
     knl = lp.set_argument_order(knl, vars + ["n"])
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 275d4f26e..149a1af26 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -56,6 +56,7 @@
     BarrierInstruction,
     CallInstruction,
     CInstruction,
+    HappensAfter,
     InstructionBase,
     LegacyStringInstructionTag,
     MemoryOrdering,
@@ -203,15 +204,9 @@
     find_rules_matching,
 )
 from loopy.translation_unit import TranslationUnit, for_each_kernel, make_program
-
-# }}}
 from loopy.type_inference import infer_unknown_types
 from loopy.types import to_loopy_type
-
-# {{{ imported user interface
 from loopy.typing import auto
-
-# {{{ import transforms
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION, VERSION
 
 
@@ -242,6 +237,7 @@
     "ExecutorBase",
     "GeneratedProgram",
     "GlobalArg",
+    "HappensAfter",
     "ISPCTarget",
     "ImageArg",
     "InKernelCallable",
@@ -563,18 +559,18 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None):
 
     indices = ["i%d" % i for i in range(rank)]
     shape = ["n%d" % i for i in range(rank)]
-    commad_indices = ", ".join(indices)
+    command_indices = ", ".join(indices)
     bounds = " and ".join(
             f"0<={ind}<{shape_i}"
             for ind, shape_i in zip(indices, shape))
 
     set_str = "{{[{}]: {} }}".format(
-                commad_indices,
+                command_indices,
                 bounds
                 )
     result = make_kernel(set_str,
             "output[%s] = input[%s]"
-            % (commad_indices, commad_indices),
+            % (command_indices, command_indices),
             lang_version=MOST_RECENT_LANGUAGE_VERSION,
             default_offset=auto)
 
diff --git a/loopy/check.py b/loopy/check.py
index 17887cff5..31bbd7b0f 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -22,6 +22,7 @@
 
 import logging
 from collections import defaultdict
+from collections.abc import Mapping, Sequence
 from functools import reduce
 from typing import List, Optional, Tuple, Union
 
@@ -43,7 +44,15 @@
     FixedStrideArrayDimTag,
     SeparateArrayArrayDimTag,
 )
-from loopy.kernel.data import ArrayArg, ArrayDimImplementationTag, auto
+from loopy.kernel.data import (
+    AddressSpace,
+    ArrayArg,
+    ArrayDimImplementationTag,
+    InameImplementationTag,
+    TemporaryVariable,
+    auto,
+)
+from loopy.kernel.function_interface import CallableKernel
 from loopy.kernel.instruction import (
     CallInstruction,
     CInstruction,
@@ -51,10 +60,14 @@
     NoOpInstruction,
     _DataObliviousInstruction,
 )
-from loopy.symbolic import CombineMapper, ResolvedFunction, WalkMapper
-from loopy.translation_unit import for_each_kernel
+from loopy.symbolic import CombineMapper, ResolvedFunction, SubArrayRef, WalkMapper
+from loopy.translation_unit import (
+    CallablesTable,
+    TranslationUnit,
+    check_each_kernel,
+)
 from loopy.type_inference import TypeReader
-from loopy.typing import ExpressionT
+from loopy.typing import ExpressionT, not_none
 
 
 logger = logging.getLogger(__name__)
@@ -144,8 +157,8 @@ def map_constant(self, expr):
     map_nan = map_constant
 
 
-@for_each_kernel
-def check_functions_are_resolved(kernel):
+@check_each_kernel
+def check_functions_are_resolved(kernel: LoopKernel) -> None:
     """ Checks if all call nodes in the *kernel* expression have been
     resolved.
     """
@@ -166,7 +179,7 @@ def check_functions_are_resolved(kernel):
             raise NotImplementedError(type(insn))
 
 
-@for_each_kernel
+@check_each_kernel
 def check_separated_array_consistency(kernel: LoopKernel) -> None:
     # Boo. This is (part of) the price of redundant representation.
     for arg in kernel.args:
@@ -197,7 +210,7 @@ def check_separated_array_consistency(kernel: LoopKernel) -> None:
                                 f"'{sub_arg.name}' is not consistent.")
 
 
-@for_each_kernel
+@check_each_kernel
 def check_offsets_and_dim_tags(kernel: LoopKernel) -> None:
     from pymbolic.primitives import Expression, Variable
 
@@ -356,8 +369,8 @@ def check_for_integer_subscript_indices(t_unit):
             raise NotImplementedError(type(clbl).__name__)
 
 
-@for_each_kernel
-def check_sub_array_ref_inames_not_within_or_redn_inames(kernel):
+@check_each_kernel
+def check_sub_array_ref_inames_not_within_or_redn_inames(kernel: LoopKernel) -> None:
     all_within_inames = frozenset().union(*(insn.within_inames
                                             for insn in kernel.instructions))
     all_redn_inames = frozenset().union(*(insn.reduction_inames()
@@ -378,8 +391,8 @@ def check_sub_array_ref_inames_not_within_or_redn_inames(kernel):
                          " illegal.")
 
 
-@for_each_kernel
-def check_insn_attributes(kernel):
+@check_each_kernel
+def check_insn_attributes(kernel: LoopKernel) -> None:
     """
     Check for legality of attributes of every instruction in *kernel*.
     """
@@ -412,8 +425,8 @@ def check_insn_attributes(kernel):
                        ", ".join(no_sync_with_scopes - VALID_NOSYNC_SCOPES)))
 
 
-@for_each_kernel
-def check_for_duplicate_insn_ids(knl):
+@check_each_kernel
+def check_for_duplicate_insn_ids(knl: LoopKernel) -> None:
     """
     Check if multiple instructions of *knl* have the same
     :attr:`loopy.InstructionBase.id`.
@@ -428,8 +441,8 @@ def check_for_duplicate_insn_ids(knl):
         insn_ids.add(insn.id)
 
 
-@for_each_kernel
-def check_loop_priority_inames_known(kernel):
+@check_each_kernel
+def check_loop_priority_inames_known(kernel: LoopKernel) -> None:
     """
     Checks if the inames in :attr:`loopy.LoopKernel.loop_priority` are part of
     the *kernel*'s domain.
@@ -440,8 +453,8 @@ def check_loop_priority_inames_known(kernel):
                 raise LoopyError("unknown iname '%s' in loop priorities" % iname)
 
 
-@for_each_kernel
-def check_multiple_tags_allowed(kernel):
+@check_each_kernel
+def check_multiple_tags_allowed(kernel: LoopKernel) -> None:
     """
-    Checks if a multiple tags of an iname are compatible.
+    Checks if multiple tags of an iname are compatible.
     """
@@ -465,7 +478,10 @@ def check_multiple_tags_allowed(kernel):
                                  "tags: {}".format(iname.name, iname.tags))
 
 
-def _check_for_double_use_of_hw_axes_inner(kernel, callables_table):
+def _check_for_double_use_of_hw_axes_inner(
+            kernel: LoopKernel,
+            callables_table: CallablesTable
+        ) -> None:
     from loopy.kernel.data import GroupInameTag, LocalInameTag, UniqueInameTag
     from loopy.kernel.instruction import CallInstruction
     from loopy.symbolic import ResolvedFunction
@@ -489,7 +505,7 @@ def _check_for_double_use_of_hw_axes_inner(kernel, callables_table):
                 insn_tag_keys.add(key)
 
 
-def check_for_double_use_of_hw_axes(t_unit):
+def check_for_double_use_of_hw_axes(t_unit: TranslationUnit) -> None:
     """
     Check if any instruction of *kernel* is within multiple inames tagged with
     the same hw axis tag.
@@ -505,8 +521,8 @@ def check_for_double_use_of_hw_axes(t_unit):
             raise NotImplementedError(type(clbl).__name__)
 
 
-@for_each_kernel
-def check_for_inactive_iname_access(kernel):
+@check_each_kernel
+def check_for_inactive_iname_access(kernel: LoopKernel) -> None:
     """
     Check if any instruction accesses an iname but is not within it.
     """
@@ -523,8 +539,8 @@ def check_for_inactive_iname_access(kernel):
                                   - insn.within_inames), kernel.name))
 
 
-@for_each_kernel
-def check_for_unused_inames(kernel):
+@check_each_kernel
+def check_for_unused_inames(kernel: LoopKernel) -> None:
     """
     Check if there are any unused inames in the kernel.
     """
@@ -540,7 +556,7 @@ def check_for_unused_inames(kernel):
             % unused_inames)
 
 
-def _is_racing_iname_tag(tv, tag):
+def _is_racing_iname_tag(tv: TemporaryVariable, tag: InameImplementationTag) -> bool:
     from loopy.kernel.data import (
         AddressSpace,
         ConcurrentTag,
@@ -571,8 +587,8 @@ def _is_racing_iname_tag(tv, tag):
                 "temporary variable '%s'" % tv.name)
 
 
-@for_each_kernel
-def check_for_write_races(kernel):
+@check_each_kernel
+def check_for_write_races(kernel: LoopKernel) -> None:
     """
     Check if any memory accesses lead to write races.
     """
@@ -620,8 +636,8 @@ def check_for_write_races(kernel):
                         WriteRaceConditionWarning)
 
 
-@for_each_kernel
-def check_for_data_dependent_parallel_bounds(kernel):
+@check_each_kernel
+def check_for_data_dependent_parallel_bounds(kernel: LoopKernel) -> None:
     """
     Check that inames tagged as hw axes have bounds that are known at kernel
     launch.
@@ -848,7 +864,7 @@ def map_call(self, expr, domain, insn_id):
             _check_bounds_inner_rec(subkernel, self.callables_table)
 
 
-def _check_bounds_inner(kernel, callables_table):
+def _check_bounds_inner(kernel: LoopKernel, callables_table: CallablesTable) -> None:
     from loopy.kernel.instruction import get_insn_domain
 
     temp_var_names = set(kernel.temporary_variables)
@@ -874,7 +890,10 @@ def run_acm(expr):
         insn.with_transformed_expressions(run_acm)
 
 
-def _check_bounds_inner_rec(kernel, callables_table):
+def _check_bounds_inner_rec(
+            kernel: LoopKernel,
+            callables_table: CallablesTable
+        ) -> None:
     if kernel.options.enforce_array_accesses_within_bounds not in [
             "no_check",
             True,
@@ -899,7 +918,7 @@ def _check_bounds_inner_rec(kernel, callables_table):
                 warn_with_kernel(kernel, "array_access_out_of_bounds", str(e))
 
 
-def check_bounds(t_unit):
+def check_bounds(t_unit: TranslationUnit) -> None:
     """
     Performs out-of-bound check for every array access.
     """
@@ -912,8 +931,8 @@ def check_bounds(t_unit):
 
 # {{{ check write destinations
 
-@for_each_kernel
-def check_write_destinations(kernel):
+@check_each_kernel
+def check_write_destinations(kernel: LoopKernel) -> None:
     for insn in kernel.instructions:
         for wvar in insn.assignee_var_names():
             if wvar in kernel.all_inames():
@@ -940,8 +959,8 @@ def check_write_destinations(kernel):
 
 # {{{ check_has_schedulable_iname_nesting
 
-@for_each_kernel
-def check_has_schedulable_iname_nesting(kernel):
+@check_each_kernel
+def check_has_schedulable_iname_nesting(kernel: LoopKernel) -> None:
     from loopy.transform.iname import (
         get_iname_duplication_options,
         has_schedulable_iname_nesting,
@@ -988,8 +1007,8 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b):
     return ab_nosync and ba_nosync
 
 
-def _get_address_space(kernel, var):
-    from loopy.kernel.data import AddressSpace, ArrayArg, ValueArg
+def _get_address_space(kernel: LoopKernel, var: str) -> AddressSpace | type[auto]:
+    from loopy.kernel.data import ArrayArg, ValueArg
     if var in kernel.temporary_variables:
         address_space = kernel.temporary_variables[var].address_space
     else:
@@ -1005,7 +1024,7 @@ def _get_address_space(kernel, var):
     return address_space
 
 
-def _get_topological_order(kernel):
+def _get_topological_order(kernel: LoopKernel) -> Sequence[str]:
     """
     Returns a :class:`list` of insn ids of *kernel* in a topological sort
     order.
@@ -1033,7 +1052,7 @@ def _get_topological_order(kernel):
     return order
 
 
-def _check_variable_access_ordered_inner(kernel):
+def _check_variable_access_ordered_inner(kernel: LoopKernel) -> None:
     from loopy.kernel.tools import find_aliasing_equivalence_classes
     from loopy.symbolic import AccessRangeOverlapChecker
     overlap_checker = AccessRangeOverlapChecker(kernel)
@@ -1050,7 +1069,7 @@ def _check_variable_access_ordered_inner(kernel):
     # the mapping in both directions.
     #
     # Note: This can be worst-case O(n^2) in the number of instructions.
-    dep_reqs_to_vars = {}
+    dep_reqs_to_vars: dict[tuple[str, str], set[str]] = {}
 
     wmap = kernel.writer_map()
     rmap = kernel.reader_map()
@@ -1081,14 +1100,16 @@ def _check_variable_access_ordered_inner(kernel):
     # {{{ compute rev_depends, depends_on
 
     # depends_on: mapping from insn_ids to their dependencies
-    depends_on = {insn.id: set() for insn in kernel.instructions}
+    depends_on: dict[str, set[str]] = {
+        not_none(insn.id): set() for insn in kernel.instructions}
     # rev_depends: mapping from insn_ids to their reverse deps.
-    rev_depends = {insn.id: set() for insn in kernel.instructions}
+    rev_depends: dict[str, set[str]] = {
+        not_none(insn.id): set() for insn in kernel.instructions}
 
     for insn in kernel.instructions:
-        depends_on[insn.id].update(insn.depends_on)
+        depends_on[not_none(insn.id)].update(insn.depends_on)
         for dep in insn.depends_on:
-            rev_depends[dep].add(insn.id)
+            rev_depends[dep].add(not_none(insn.id))
 
     # }}}
 
@@ -1096,7 +1117,8 @@ def _check_variable_access_ordered_inner(kernel):
 
     topological_order = _get_topological_order(kernel)
 
-    def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order):
+    # TODO: Type this
+    def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order) -> None:
         """
         Considering a graph defined by *edges* (as ``key -> value``),
         remove pairs of nodes from *dep_reqs_to_vars* for which edges
@@ -1123,7 +1145,7 @@ def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order):
             # for each *pred*, we will calculate all the direct/indirect
             # instructions that can be reached.
             seen_successors = set()
-            # first let us start with direct sucessors
+            # first let us start with direct successors
             to_check = edges[pred].copy()
             while to_check:
                 successor = to_check.pop()
@@ -1214,12 +1236,12 @@ def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order):
     # }}}
 
 
-@for_each_kernel
-def check_variable_access_ordered(kernel):
+@check_each_kernel
+def check_variable_access_ordered(kernel: LoopKernel) -> None:
     """Checks that between each write to a variable and all other accesses to
     the variable there is either:
 
-    * a direct/indirect depdendency edge, or
+    * a direct/indirect dependency edge, or
     * an explicit statement that no ordering is necessary (expressed
       through a bi-directional :attr:`loopy.InstructionBase.no_sync_with`)
     """
@@ -1252,7 +1274,7 @@ def check_variable_access_ordered(kernel):
 # }}}
 
 
-def pre_schedule_checks(t_unit):
+def pre_schedule_checks(t_unit: TranslationUnit) -> None:
     try:
         logger.debug("pre-schedule checks start for entrypoints: "
                      f"{t_unit.entrypoints}.")
@@ -1312,9 +1334,12 @@ def check_for_nested_base_storage(kernel: LoopKernel) -> None:
             storage_array = name_to_array.get(ary.base_storage, None)
 
             if storage_array is None:
-                raise ValueError("nothing known about storage array "
+                raise LoopyError("Nothing known about storage array "
                         f"'{ary.base_storage}' serving as base_storage of "
-                        f"'{ary.name}'")
+                        f"'{ary.name}'. "
+                        "(Note: base storage is no longer automatically allocated. "
+                        "Call allocate_temporaries_for_base_storage to automatically "
+                        "allocate.)")
 
             if storage_array.base_storage:
                 raise ValueError("storage array "
@@ -1327,8 +1352,11 @@ def check_for_nested_base_storage(kernel: LoopKernel) -> None:
 
 # {{{ check for unused hw axes
 
-def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
-        sched_index=None):
+def _check_for_unused_hw_axes_in_kernel_chunk(
+            kernel: LoopKernel,
+            callables_table: CallablesTable,
+            sched_index: int | None = None
+        ) -> int:
     from loopy.schedule import (
         Barrier,
         CallKernel,
@@ -1339,6 +1367,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
         gather_schedule_block,
         get_insn_ids_for_block_at,
     )
+    assert kernel.linearization is not None
 
     if sched_index is None:
         group_axes = set()
@@ -1439,7 +1468,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
     return past_end_i
 
 
-def check_for_unused_hw_axes_in_insns(kernel, callables_table):
+def check_for_unused_hw_axes_in_insns(
+            kernel: LoopKernel,
+            callables_table: CallablesTable
+        ) -> None:
     if kernel.linearization:
         _check_for_unused_hw_axes_in_kernel_chunk(kernel,
                 callables_table)
@@ -1449,7 +1481,9 @@ def check_for_unused_hw_axes_in_insns(kernel, callables_table):
 
 # {{{ check that atomic ops are used exactly on atomic arrays
 
-def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel):
+def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(
+            kernel: LoopKernel
+        ) -> None:
     from loopy.kernel.data import ArrayBase, Assignment
     from loopy.types import AtomicType
     atomicity_candidates = (
@@ -1484,7 +1518,9 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel):
 
 # {{{ check that temporaries are defined in subkernels where used
 
-def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
+def check_that_temporaries_are_defined_in_subkernels_where_used(
+            kernel: LoopKernel
+        ) -> None:
     from loopy.kernel.data import AddressSpace
     from loopy.kernel.tools import get_subkernels
 
@@ -1537,9 +1573,10 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
 
 # {{{ check that all instructions are scheduled
 
-def check_that_all_insns_are_scheduled(kernel):
+def check_that_all_insns_are_scheduled(kernel: LoopKernel) -> None:
+    assert kernel.linearization is not None
 
-    all_schedulable_insns = {insn.id for insn in kernel.instructions}
+    all_schedulable_insns = {not_none(insn.id) for insn in kernel.instructions}
     from loopy.schedule import sched_item_to_insn_id
     scheduled_insns = {
         insn_id
@@ -1559,7 +1596,7 @@ def check_that_all_insns_are_scheduled(kernel):
 
 # {{{ check that shapes and strides are arguments
 
-def check_that_shapes_and_strides_are_arguments(kernel):
+def check_that_shapes_and_strides_are_arguments(kernel: LoopKernel) -> None:
     import loopy as lp
     from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
     from loopy.kernel.data import ValueArg
@@ -1569,12 +1606,12 @@ def check_that_shapes_and_strides_are_arguments(kernel):
             arg.name
             for arg in kernel.args
             if isinstance(arg, ValueArg)
-            and arg.dtype.is_integral()}
+            and not_none(arg.dtype).is_integral()}
 
     for arg in kernel.args:
         if isinstance(arg, ArrayBase):
             if isinstance(arg.shape, tuple):
-                shape_deps = set()
+                shape_deps: set[str] = set()
                 for shape_axis in arg.shape:
                     if shape_axis is not None:
                         shape_deps.update(get_dependencies(shape_axis))
@@ -1603,14 +1640,21 @@ def check_that_shapes_and_strides_are_arguments(kernel):
 
 # {{{ validate_kernel_call_sites
 
-def _get_sub_array_ref_swept_range(kernel, sar):
+def _get_sub_array_ref_swept_range(
+            kernel: LoopKernel,
+            sar: SubArrayRef
+        ) -> isl.Set:
     from loopy.symbolic import get_access_map
     domain = kernel.get_inames_domain(frozenset({iname_var.name
                                                  for iname_var in sar.swept_inames}))
     return get_access_map(domain, sar.swept_inames, kernel.assumptions).range()
 
 
-def _are_sub_array_refs_equivalent(sar1, sar2, caller):
+def _are_sub_array_refs_equivalent(
+            sar1: SubArrayRef,
+            sar2: SubArrayRef,
+            caller: LoopKernel
+        ) -> bool:
     """
     Returns *True* iff *sar1* and *sar2* are equivalent
     :class:`loopy.SubArrayRef`s.
@@ -1653,7 +1697,11 @@ def _are_sub_array_refs_equivalent(sar1, sar2, caller):
     return True
 
 
-def _validate_kernel_call_insn(caller, call_insn, callee):
+def _validate_kernel_call_insn(
+            caller: LoopKernel,
+            call_insn: CallInstruction,
+            callee: LoopKernel
+        ) -> None:
     assert call_insn.expression.function.name == callee.name
     from loopy.kernel.array import ArrayBase
     from loopy.symbolic import SubArrayRef
@@ -1703,7 +1751,10 @@ def _validate_kernel_call_insn(caller, call_insn, callee):
                                  f" (got {in_val}, {out_val}).")
 
 
-def _validate_kernel_call_sites_inner(kernel, callables):
+def _validate_kernel_call_sites_inner(
+            kernel: LoopKernel,
+            callables: CallablesTable,
+        ) -> None:
     from pymbolic.primitives import Call
 
     from loopy.kernel.function_interface import CallableKernel
@@ -1722,11 +1773,12 @@ def _validate_kernel_call_sites_inner(kernel, callables):
             raise NotImplementedError(type(insn))
 
 
-def validate_kernel_call_sites(translation_unit):
+def validate_kernel_call_sites(translation_unit: TranslationUnit) -> None:
     for name in translation_unit.callables_table:
-        clbl = translation_unit[name]
-        if isinstance(clbl, LoopKernel):
-            _validate_kernel_call_sites_inner(clbl, translation_unit.callables_table)
+        clbl = translation_unit.callables_table[name]
+        if isinstance(clbl, CallableKernel):
+            _validate_kernel_call_sites_inner(
+                  clbl.subkernel, translation_unit.callables_table)
 
 
 # }}}
@@ -1734,8 +1786,10 @@ def validate_kernel_call_sites(translation_unit):
 
 # {{{ check_all_callees_have_same_index_dtype
 
-def check_all_callees_have_same_index_dtype(epoint: LoopKernel,
-                                            callables_table):
+def check_all_callees_have_same_index_dtype(
+            epoint: LoopKernel,
+            callables_table: CallablesTable
+        ) -> None:
     from loopy.kernel.function_interface import CallableKernel
 
     epoint_clbl = callables_table[epoint.name]
@@ -1752,7 +1806,10 @@ def check_all_callees_have_same_index_dtype(epoint: LoopKernel,
 # }}}
 
 
-def pre_codegen_entrypoint_checks(kernel, callables_table):
+def pre_codegen_entrypoint_checks(
+            kernel: LoopKernel,
+            callables_table: CallablesTable
+        ) -> None:
     logger.debug("pre-codegen entrypoint check %s: start" % kernel.name)
 
     kernel.target.pre_codegen_entrypoint_check(kernel, callables_table)
@@ -1775,7 +1832,7 @@ def pre_codegen_callable_checks(kernel, callables_table):
     logger.debug("pre-codegen callable check %s: done" % kernel.name)
 
 
-def pre_codegen_checks(t_unit):
+def pre_codegen_checks(t_unit: TranslationUnit) -> None:
     from loopy.kernel.function_interface import CallableKernel
 
     try:
@@ -1798,7 +1855,11 @@ def pre_codegen_checks(t_unit):
 
 # {{{ sanity-check for implemented domains of each instruction
 
-def check_implemented_domains(kernel, implemented_domains, code=None):
+def check_implemented_domains(
+            kernel: LoopKernel,
+            implemented_domains: Mapping[str, isl.Set],
+            code: str | None = None,
+        ) -> bool:
     from islpy import align_two, dim_type
 
     last_idomains = None
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 645a57e31..c64c2ea67 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -28,6 +28,7 @@
 from loopy.codegen.control import build_loop_nest
 from loopy.codegen.result import merge_codegen_results
 from loopy.diagnostic import LoopyError, warn
+from loopy.symbolic import flatten
 
 
 # {{{ conditional-reducing slab decomposition
@@ -309,7 +310,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     codegen_state = codegen_state.intersect(slab)
 
     from loopy.symbolic import pw_aff_to_expr
-    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)
+    hw_axis_expr = flatten(hw_axis_expr + pw_aff_to_expr(lower_bound))
 
     # }}}
 
diff --git a/loopy/expression.py b/loopy/expression.py
index 224521b03..2581ec022 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -23,7 +23,7 @@
 
 import numpy as np
 
-from pymbolic.mapper import RecursiveMapper
+from pymbolic.mapper import Mapper
 
 from loopy.codegen import UnvectorizableError
 from loopy.diagnostic import LoopyError
@@ -55,7 +55,7 @@ def dtype_to_type_context(target, dtype):
 
 # {{{ vectorizability checker
 
-class VectorizabilityChecker(RecursiveMapper):
+class VectorizabilityChecker(Mapper):
     """The return value from this mapper is a :class:`bool` indicating whether
     the result of the expression is vectorized along :attr:`vec_iname`.
     If the expression is not vectorizable, the mapper raises
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index 530e92678..fc9eace87 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -200,7 +200,7 @@ def get_type(self, name, none_ok=False):
                     return None
 
                 raise TranslationError(
-                        "no type for '%s' found in 'implict none' routine"
+                        "no type for '%s' found in 'implicit none' routine"
                         % name) from None
 
             return self.implicit_types.get(name[0], np.dtype(np.int32))
@@ -426,7 +426,7 @@ def map_Implicit(self, node):
             scope.implicit_types = None
 
         for stmt, specs in node.items:
-            if scope.implict_types is None:
+            if scope.implict_types is None:  # spellchecker: disable-line
                 raise TranslationError("implicit decl not allowed after "
                         "'implicit none'")
             tp = self.dtype_from_stmt(stmt)
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 160b6415b..28aa3be30 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -186,7 +186,7 @@ def simplify_pw_aff(pw_aff, context=None):
                     continue
 
                 if aff_i.gist(dom_j).is_equal(aff_j):
-                    # aff_i is sufficient to conver aff_j, eliminate aff_j
+                    # aff_i is sufficient to cover aff_j, eliminate aff_j
                     new_pieces = pieces[:]
                     if i < j:
                         new_pieces.pop(j)
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index a9b3bb07e..d85b1568a 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -74,7 +74,7 @@
 from loopy.target import TargetBase
 from loopy.tools import update_persistent_hash
 from loopy.types import LoopyType, NumpyType
-from loopy.typing import ExpressionT
+from loopy.typing import ExpressionT, InameStr
 
 
 if TYPE_CHECKING:
@@ -117,82 +117,25 @@ class LoopKernel(Taggable):
         even if it contains mutable data types. See :meth:`copy` for an easy
         way of producing a modified copy.
 
-    .. attribute:: domains
-
-        a list of :class:`islpy.BasicSet` instances representing the
-        :ref:`domain-tree`.
-
-    .. attribute:: instructions
-
-        A list of :class:`InstructionBase` instances, e.g.
-        :class:`Assignment`. See :ref:`instructions`.
-
-    .. attribute:: args
-
-        A list of :class:`loopy.KernelArgument`
-
-    .. attribute:: schedule
-
-        *None* or a list of :class:`loopy.schedule.ScheduleItem`
-
-    .. attribute:: name
-    .. attribute:: preambles
-    .. attribute:: preamble_generators
-    .. attribute:: assumptions
-
-        A :class:`islpy.BasicSet` parameter domain.
-
-    .. attribute:: temporary_variables
-
-        A :class:`dict` of mapping variable names to
-        :class:`loopy.TemporaryVariable`
-        instances.
-
-    .. attribute:: symbol_manglers
-
-    .. attribute:: substitutions
-
-        a mapping from substitution names to
-        :class:`SubstitutionRule` objects
-
-    .. attribute:: iname_slab_increments
-
-        a dictionary mapping inames to (lower_incr,
-        upper_incr) tuples that will be separated out in the execution to generate
-        'bulk' slabs with fewer conditionals.
-
-    .. attribute:: loop_priority
-
-        A frozenset of priority constraints to the kernel. Each such constraint
-        is a tuple of inames. Inames occuring in such a tuple will be scheduled
-        earlier than any iname following in the tuple. This applies only to inames
-        with non-parallel implementation tags.
-
-    .. attribute:: silenced_warnings
-
-    .. attribute:: applied_iname_rewrites
-
-        A list of past substitution dictionaries that
-        were applied to the kernel. These are stored so that they may be repeated
-        on expressions the user specifies later.
-
-    .. attribute:: options
-
-        An instance of :class:`loopy.Options`
-
-    .. attribute:: state
-
-        A value from :class:`KernelState`.
-
-    .. attribute:: target
-
-        A subclass of :class:`loopy.TargetBase`.
-
-    .. attribute:: inames
-
-        An instance of :class:`dict`, a mapping from the names of kernel's
-        inames to their corresponding instances of :class:`loopy.kernel.data.Iname`.
-        An entry is guaranteed to be present for each iname.
+    .. autoattribute:: domains
+    .. autoattribute:: instructions
+    .. autoattribute:: args
+    .. autoattribute:: schedule
+    .. autoattribute:: name
+    .. autoattribute:: preambles
+    .. autoattribute:: preamble_generators
+    .. autoattribute:: assumptions
+    .. autoattribute:: temporary_variables
+    .. autoattribute:: symbol_manglers
+    .. autoattribute:: substitutions
+    .. autoattribute:: iname_slab_increments
+    .. autoattribute:: loop_priority
+    .. autoattribute:: silenced_warnings
+    .. autoattribute:: applied_iname_rewrites
+    .. autoattribute:: options
+    .. autoattribute:: state
+    .. autoattribute:: target
+    .. autoattribute:: inames
 
     .. automethod:: __call__
     .. automethod:: copy
@@ -201,11 +144,25 @@ class LoopKernel(Taggable):
     .. automethod:: without_tags
     """
     domains: Sequence[isl.BasicSet]
+    """Represents the :ref:`domain-tree`."""
+
     instructions: Sequence[InstructionBase]
+    """
+    See :ref:`instructions`.
+    """
+
     args: Sequence[KernelArgument]
     assumptions: isl.BasicSet
+    """
+    Must be a :class:`islpy.BasicSet` parameter domain.
+    """
+
     temporary_variables: Mapping[str, TemporaryVariable]
-    inames: Mapping[str, Iname]
+    inames: Mapping[InameStr, Iname]
+    """
+    An entry is guaranteed to be present for each iname.
+    """
+
     substitutions: Mapping[str, SubstitutionRule]
     options: Options
     target: TargetBase
@@ -218,11 +175,29 @@ class LoopKernel(Taggable):
     symbol_manglers: Sequence[
             Callable[["LoopKernel", str], Optional[Tuple[LoopyType, str]]]] = ()
     linearization: Optional[Sequence[ScheduleItem]] = None
-    iname_slab_increments: Mapping[str, Tuple[int, int]] = field(
+    iname_slab_increments: Mapping[InameStr, Tuple[int, int]] = field(
             default_factory=Map)
-    loop_priority: FrozenSet[Tuple[str]] = field(
+    """
+    A mapping from inames to (lower_incr,
+    upper_incr) tuples that will be separated out in the execution to generate
+    'bulk' slabs with fewer conditionals.
+    """
+
+    loop_priority: FrozenSet[Tuple[InameStr, ...]] = field(
             default_factory=frozenset)
-    applied_iname_rewrites: Tuple[Dict[str, ExpressionT], ...] = ()
+    """
+    A frozenset of priority constraints to the kernel. Each such constraint
+    is a tuple of inames. Inames occurring in such a tuple will be scheduled
+    earlier than any iname following in the tuple. This applies only to inames
+    with non-parallel implementation tags.
+    """
+
+    applied_iname_rewrites: Tuple[Dict[InameStr, ExpressionT], ...] = ()
+    """
+    A tuple of past substitution dictionaries that
+    were applied to the kernel. These are stored so that they may be repeated
+    on expressions the user specifies later.
+    """
     index_dtype: NumpyType = NumpyType(np.dtype(np.int32))
     silenced_warnings: FrozenSet[str] = frozenset()
 
@@ -274,11 +249,11 @@ def all_variable_names(self):
                 | {arg.name for arg in self.args}
                 | set(self.all_inames()))
 
-    def get_var_name_generator(self):
+    def get_var_name_generator(self) -> UniqueNameGenerator:
         return UniqueNameGenerator(self.all_variable_names())
 
-    def get_instruction_id_generator(self, based_on="insn"):
-        used_ids = {insn.id for insn in self.instructions}
+    def get_instruction_id_generator(self, based_on="insn") -> UniqueNameGenerator:
+        used_ids = {insn.id for insn in self.instructions if insn.id is not None}
 
         return UniqueNameGenerator(used_ids)
 
@@ -515,7 +490,7 @@ def get_leaf_domain_indices(self, inames):
         for iname in inames:
             home_domain_index = hdm[iname]
             if home_domain_index in domain_indices:
-                # nothin' new
+                # nothing new
                 continue
 
             domain_path_to_root = [home_domain_index] + ppd[home_domain_index]
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 43e1f86b5..84477749f 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1,7 +1,7 @@
-"""Implementation tagging of array axes."""
-
 from __future__ import annotations
 
+from loopy.symbolic import flatten
+
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -45,6 +45,7 @@
 from warnings import warn
 
 import numpy as np  # noqa
+from typing_extensions import TypeAlias
 
 from pytools import ImmutableRecord
 from pytools.tag import Tag, Taggable
@@ -52,7 +53,7 @@
 from loopy.diagnostic import LoopyError
 from loopy.tools import is_integer
 from loopy.types import LoopyType
-from loopy.typing import ExpressionT, ShapeType
+from loopy.typing import ExpressionT, ShapeType, auto
 
 
 if TYPE_CHECKING:
@@ -69,8 +70,6 @@
 
 
 __doc__ = """
-.. currentmodule:: loopy.kernel.array
-
 .. autoclass:: ArrayDimImplementationTag
 
 .. autoclass:: _StrideArrayDimTagBase
@@ -84,6 +83,23 @@
 .. autoclass:: VectorArrayDimTag
 
 .. autofunction:: parse_array_dim_tags
+
+Cross-references
+----------------
+
+(This section shouldn't exist: Sphinx should be able to resolve these on its own.)
+
+.. class:: ShapeType
+
+    See :class:`loopy.typing.ShapeType`
+
+.. class:: ExpressionT
+
+    See :class:`loopy.typing.ExpressionT`
+
+.. class:: Tag
+
+    See :class:`pytools.tag.Tag`
 """
 
 
@@ -593,132 +609,137 @@ def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
 
 # {{{ array base class (for arguments and temporary arrays)
 
-def _pymbolic_parse_if_necessary(x):
-    if isinstance(x, str):
-        from pymbolic import parse
-        return parse(x)
-    else:
-        return x
+ToShapeLikeConvertible: TypeAlias = (Tuple[ExpressionT | str, ...]
+                | ExpressionT | type[auto] | str | tuple[str, ...])
 
 
-def _parse_shape_or_strides(x):
-    import loopy as lp
-    if x == "auto":
-        warn("use of 'auto' as a shape or stride won't work "
-                "any more--use loopy.auto instead",
-                stacklevel=3)
-    x = _pymbolic_parse_if_necessary(x)
-    if isinstance(x, lp.auto):
-        return x
-    assert not isinstance(x, list)
-    if not isinstance(x, tuple):
-        assert x is not lp.auto
-        x = (x,)
+def _parse_shape_or_strides(
+            x: ToShapeLikeConvertible,
+        ) -> ShapeType | type[auto]:
+    from pymbolic import parse
 
-    return tuple(_pymbolic_parse_if_necessary(xi) for xi in x)
-
-
-class ArrayBase(ImmutableRecord, Taggable):
-    """
-    .. attribute :: name
-
-    .. attribute :: dtype
-
-        The :class:`loopy.types.LoopyType` of the array. If this is *None*,
-        :mod:`loopy` will try to continue without knowing the type of this
-        array, where the idea is that precise knowledge of the type will become
-        available at invocation time.  Calling the kernel
-        (via :meth:`loopy.LoopKernel.__call__`)
-        automatically adds this type information based on invocation arguments.
+    if x == "auto":
+        raise ValueError("use of 'auto' as a shape or stride won't work "
+                "any more--use loopy.auto instead")
 
-        Note that some transformations, such as :func:`loopy.add_padding`
-        cannot be performed without knowledge of the exact *dtype*.
+    if x is auto:
+        return auto
 
-    .. attribute :: shape
+    if isinstance(x, str):
+        x = parse(x)
 
-        May be one of the following:
+    if isinstance(x, list):
+        raise ValueError("shape can't be a list")
 
-        * *None*. In this case, no shape is intended to be specified,
-          only the strides will be used to access the array. Bounds checking
-          will not be performed.
+    if not isinstance(x, tuple):
+        assert x is not auto
+        x = (x,)
 
-        * :class:`loopy.auto`. The shape will be determined by finding the
-          access footprint.
+    return tuple(parse(xi) if isinstance(xi, str) else xi for xi in x)
 
-        * a tuple like like :attr:`numpy.ndarray.shape`.
 
-          Each entry of the tuple is also allowed to be a :mod:`pymbolic`
-          expression involving kernel parameters, or a (potentially-comma
-          separated) or a string that can be parsed to such an expression.
+class ArrayBase(ImmutableRecord, Taggable):
+    """
+    .. autoattribute:: name
+    .. autoattribute:: dtype
+    .. autoattribute:: shape
+    .. autoattribute:: dim_tags
+    .. autoattribute:: offset
+    .. autoattribute:: dim_names
+    .. autoattribute:: alignment
+    .. autoattribute:: tags
 
-          Any element of the shape tuple not used to compute strides
-          may be *None*.
+    .. automethod:: __init__
+    .. automethod:: __eq__
+    .. automethod:: num_user_axes
+    .. automethod:: num_target_axes
+    .. automethod:: vector_size
 
-    .. attribute:: dim_tags
+    (supports persistent hashing)
+    """
+    name: str
 
-        See :ref:`data-dim-tags`.
+    dtype: Optional[LoopyType]
+    """The :class:`loopy.types.LoopyType` of the array. If this is *None*,
+    :mod:`loopy` will try to continue without knowing the type of this
+    array, where the idea is that precise knowledge of the type will become
+    available at invocation time.  Calling the kernel
+    (via :meth:`loopy.LoopKernel.__call__`)
+    automatically adds this type information based on invocation arguments.
+
+    Note that some transformations, such as :func:`loopy.add_padding`
+    cannot be performed without knowledge of the exact *dtype*.
+    """
 
-    .. attribute:: offset
+    shape: Union[ShapeType, Type["auto"], None]
+    """
+    May be one of the following:
 
-        Offset from the beginning of the buffer to the point from
-        which the strides are counted, in units of the :attr:`dtype`.
-        May be one of
+    * *None*. In this case, no shape is intended to be specified,
+      only the strides will be used to access the array. Bounds checking
+      will not be performed.
 
-            * 0 or None
-            * a string (that is interpreted as an argument name).
-            * a pymbolic expression
-            * :class:`loopy.auto`, in which case an offset argument
-              is added automatically, immediately following this argument.
+    * :class:`loopy.auto`. The shape will be determined by finding the
+      access footprint.
 
-    .. attribute:: dim_names
+    * a tuple like :attr:`numpy.ndarray.shape`.
 
-        A tuple of strings providing names for the array axes, or *None*.
-        If given, must have the same number of entries as :attr:`dim_tags`
-        and :attr:`dim_tags`. These do not live in any particular namespace
-        (i.e. collide with no other names) and serve a purely
-        informational/documentational purpose. On occasion, they are used
-        to generate more informative names than could be achieved by
-        axis numbers.
+      Each entry of the tuple is also allowed to be a :mod:`pymbolic`
+      expression involving kernel parameters, or a (potentially
+      comma-separated) string that can be parsed to such an expression.
 
-    .. attribute:: alignment
+      Any element of the shape tuple not used to compute strides
+      may be *None*.
+      """
 
-        Memory alignment of the array in bytes. For temporary arrays,
-        this ensures they are allocated with this alignment. For arguments,
-        this entails a promise that the incoming array obeys this alignment
-        restriction.
+    dim_tags: Optional[Sequence[ArrayDimImplementationTag]]
+    """See :ref:`data-dim-tags`.
+    """
 
-        Defaults to *None*.
+    offset: Union[ExpressionT, str, None]
+    """Offset from the beginning of the buffer to the point from
+    which the strides are counted, in units of the :attr:`dtype`.
+    May be one of
+
+    * 0 or None
+    * a string (that is interpreted as an argument name).
+    * a pymbolic expression
+    * :class:`loopy.auto`, in which case an offset argument
+      is added automatically, immediately following this argument.
+    """
 
-        If an integer N is given, the array would be declared
-        with ``__attribute__((aligned(N)))`` in code generation for
-        :class:`loopy.CFamilyTarget`.
+    dim_names: Optional[Tuple[str, ...]]
+    """A tuple of strings providing names for the array axes, or *None*.
+    If given, must have the same number of entries as
+    :attr:`dim_tags`. These do not live in any particular namespace
+    (i.e. collide with no other names) and serve a purely
+    informational/documentational purpose. On occasion, they are used
+    to generate more informative names than could be achieved by
+    axis numbers.
+    """
 
-        .. versionadded:: 2018.1
+    alignment: Optional[int]
+    """Memory alignment of the array in bytes. For temporary arrays,
+    this ensures they are allocated with this alignment. For arguments,
+    this entails a promise that the incoming array obeys this alignment
+    restriction.
 
-    .. attribute:: tags
+    Defaults to *None*.
 
-        A (possibly empty) frozenset of instances of
-        :class:`pytools.tag.Tag` intended for
-        consumption by an application.
+    If an integer N is given, the array would be declared
+    with ``__attribute__((aligned(N)))`` in code generation for
+    :class:`loopy.CFamilyTarget`.
 
-        .. versionadded:: 2020.2.2
+    .. versionadded:: 2018.1
+    """
 
-    .. automethod:: __init__
-    .. automethod:: __eq__
-    .. automethod:: num_user_axes
-    .. automethod:: num_target_axes
-    .. automethod:: vector_size
+    tags: FrozenSet[Tag]
+    """A (possibly empty) frozenset of instances of
+    :class:`pytools.tag.Tag` intended for
+    consumption by an application.
 
-    (supports persistent hashing)
+    .. versionadded:: 2020.2.2
     """
-    name: str
-    dtype: Optional[LoopyType]
-    shape: Union[ShapeType, Type["auto"], None]
-    dim_tags: Optional[Sequence[ArrayDimImplementationTag]]
-    offset: Union[ExpressionT, str, None]
-    dim_names: Optional[Tuple[str, ...]]
-    alignment: Optional[int]
-    tags: FrozenSet[Tag]
 
     # Note that order may also wind up in attributes, if the
     # number of dimensions has not yet been determined.
@@ -1299,7 +1320,7 @@ def eval_expr_assert_integer_constant(i, expr):
                         "make_temporaries_for_offsets_and_strides "
                         "during preprocessing.")
 
-            subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
+            subscripts[dim_tag.target_axis] += flatten((stride // vector_size)*idx)
 
         elif isinstance(dim_tag, SeparateArrayArrayDimTag):
             raise AssertionError()
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index c4cc880a0..4f1803f24 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -26,6 +26,7 @@
 import logging
 import re
 from sys import intern
+from typing import Any
 
 import numpy as np
 
@@ -48,7 +49,7 @@
 )
 from loopy.symbolic import IdentityMapper, SubArrayRef, WalkMapper
 from loopy.tools import Optional, intern_frozenset_of_ids
-from loopy.translation_unit import for_each_kernel
+from loopy.translation_unit import TranslationUnit, for_each_kernel
 
 
 logger = logging.getLogger(__name__)
@@ -1069,7 +1070,8 @@ def parse_domains(domains, defines):
         if isinstance(dom, str):
             dom, = expand_defines(dom, defines)
 
-            if not dom.lstrip().startswith("["):
+            # pylint warning is spurious
+            if not dom.lstrip().startswith("["):  # pylint: disable=no-member
                 # i.e. if no parameters are already given
                 parameters = (_gather_isl_identifiers(dom)
                         - _find_inames_in_set(dom)
@@ -1884,7 +1886,7 @@ def add_inferred_inames(knl):
 # {{{ apply single-writer heuristic
 
 @for_each_kernel
-def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True,
+def apply_single_writer_dependency_heuristic(kernel, warn_if_used=True,
         error_if_used=False):
     logger.debug("%s: default deps" % kernel.name)
 
@@ -2023,7 +2025,7 @@ class SliceToInameReplacer(IdentityMapper):
     .. attribute:: subarray_ref_bounds
 
         A :class:`list` (one entry for each :class:`SubArrayRef` to be created)
-        of :class:`dict` instances to store the slices enountered in the
+        of :class:`dict` instances to store the slices encountered in the
         expressions as a mapping from ``iname`` to a tuple of ``(start, stop,
         step)``, which describes the boxy (i.e. affine) constraints imposed on
         the ``iname`` by the corresponding slice notation its intended to
@@ -2536,13 +2538,6 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
 
     assert len(knl.instructions) == len(inames_to_dup)
 
-    from loopy import duplicate_inames
-    from loopy.match import Id
-    for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup):
-        for old_iname, new_iname in insn_inames_to_dup:
-            knl = duplicate_inames(knl, old_iname,
-                    within=Id(insn.id), new_inames=new_iname)
-
     check_for_nonexistent_iname_deps(knl)
 
     knl = create_temporaries(knl, default_order)
@@ -2563,6 +2558,27 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
     knl = add_inferred_inames(knl)
     from loopy.transform.parameter import fix_parameters
     knl = fix_parameters(knl, **fixed_parameters)
+
+    # -------------------------------------------------------------------------
+    # Ordering dependency:
+    # -------------------------------------------------------------------------
+    # Must duplicate inames after adding all the inames to the instructions.
+    # To duplicate an iname "i" in statement "S", lp.duplicate_inames
+    # requires that the statement "S" be nested within the iname "i".
+    # -------------------------------------------------------------------------
+    from loopy import duplicate_inames
+    from loopy.match import Id
+    for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup):
+        for old_iname, new_iname in insn_inames_to_dup:
+            knl = duplicate_inames(knl, old_iname,
+                    within=Id(insn.id), new_inames=new_iname)
+            new_insn = knl.id_to_insn[insn.id]
+            assert old_iname not in (
+                new_insn.within_inames
+                | new_insn.reduction_inames()
+                | new_insn.sub_array_ref_inames()
+            )
+
     # -------------------------------------------------------------------------
     # Ordering dependency:
     # -------------------------------------------------------------------------
@@ -2574,7 +2590,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
     knl = guess_arg_shape_if_requested(knl, default_order)
     knl = apply_default_order_to_args(knl, default_order)
     knl = resolve_dependencies(knl)
-    knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=False)
+    knl = apply_single_writer_dependency_heuristic(knl, warn_if_used=False)
 
     # -------------------------------------------------------------------------
     # Ordering dependency:
@@ -2600,7 +2616,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
 
 # {{{ make_kernel
 
-def make_kernel(*args, **kwargs):
+def make_kernel(*args: Any, **kwargs: Any) -> TranslationUnit:
     tunit = make_function(*args, **kwargs)
     name, = tunit.callables_table
     return tunit.with_entrypoints(name)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index bdac071da..d2d80bedf 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -1,5 +1,7 @@
 """Data used by the kernel object."""
 
+from __future__ import annotations
+
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -38,9 +40,9 @@
     Union,
     cast,
 )
-from warnings import warn
 
-import numpy as np  # noqa
+import numpy  # FIXME: imported as numpy to allow sphinx to resolve things
+import numpy as np
 from immutables import Map
 
 from pytools import ImmutableRecord
@@ -61,8 +63,8 @@
     VarAtomicity,
     make_assignment,
 )
-from loopy.types import LoopyType, auto
-from loopy.typing import ExpressionT, ShapeType
+from loopy.types import LoopyType, ToLoopyTypeConvertible
+from loopy.typing import ExpressionT, ShapeType, auto
 
 
 __doc__ = """
@@ -390,12 +392,6 @@ class KernelArgument(ImmutableRecord):
     def __init__(self, **kwargs):
         kwargs["name"] = intern(kwargs.pop("name"))
 
-        target = kwargs.pop("target", None)
-        if target is not None:
-            warn("Passing 'target' is deprecated and will stop working in 2023. "
-                    "It is already being ignored.",
-                    DeprecationWarning, stacklevel=2)
-
         dtype = kwargs.pop("dtype", None)
 
         for_atomic = kwargs.pop("for_atomic", False)
@@ -521,7 +517,7 @@ def supporting_names(self) -> FrozenSet[str]:
 # Making this a function prevents incorrect use in isinstance.
 # Note: This is *not* deprecated, as it is super-common and
 # incrementally more convenient to use than ArrayArg directly.
-def GlobalArg(*args, **kwargs):  # noqa: N802
+def GlobalArg(*args, **kwargs) -> ArrayArg:  # noqa: N802
     address_space = kwargs.pop("address_space", None)
     if address_space is not None:
         raise TypeError("may not pass 'address_space' to GlobalArg")
@@ -579,18 +575,15 @@ def supporting_names(self) -> FrozenSet[str]:
                 )
 
 
-"""
-    :attribute tags: A (possibly empty) frozenset of instances of
-        :class:`pytools.tag.Tag` intended for consumption by an
-        application.
-
-        ..versionadded: 2020.2.2
-"""
-
-
 class ValueArg(KernelArgument, Taggable):
-    def __init__(self, name, dtype=None, approximately=1000, target=None,
-            is_output=False, is_input=True, tags=None):
+    def __init__(self,
+                name: str,
+                dtype: ToLoopyTypeConvertible | None = None,
+                approximately: int = 1000,
+                is_output: bool = False,
+                is_input: bool = True,
+                tags: frozenset[Tag] | None = None,
+             ) -> None:
         """
         :arg tags: A an instance of or Iterable of instances of
             :class:`pytools.tag.Tag` intended for consumption by an
@@ -603,7 +596,6 @@ def __init__(self, name, dtype=None, approximately=1000, target=None,
         KernelArgument.__init__(self, name=name,
                 dtype=dtype,
                 approximately=approximately,
-                target=target,
                 is_output=is_output,
                 is_input=is_input,
                 tags=tags)
@@ -641,48 +633,42 @@ def get_arg_decl(self, ast_builder):
 
 class TemporaryVariable(ArrayBase):
     __doc__ = cast(str, ArrayBase.__doc__) + """
-    .. attribute:: storage_shape
-    .. attribute:: base_indices
-    .. attribute:: address_space
-
-        What memory this temporary variable lives in.
-        One of the values in :class:`AddressSpace`,
-        or :class:`loopy.auto` if this is
-        to be automatically determined.
-
-    .. attribute:: base_storage
-
-        The name of a storage array that is to be used to actually
-        hold the data in this temporary, or *None*. If not *None* or the name
-        of an existing variable, a variable of this name and appropriate size
-        will be created.
-
-    .. attribute:: initializer
-
-        *None* or a :class:`numpy.ndarray` of data to be used to initialize the
-        array.
-
-    .. attribute:: read_only
-
-        A :class:`bool` indicating whether the variable may be written during
-        its lifetime. If *True*, *initializer* must be given.
-
-    .. attribute:: _base_storage_access_may_be_aliasing
-
-        Whether the temporary is used to alias the underlying base storage.
-        Defaults to *False*. If *False*, C-based code generators will declare
-        the temporary as a ``restrict`` const pointer to the base storage
-        memory location. If *True*, the restrict part is omitted on this
-        declaration.
+    .. autoattribute:: storage_shape
+    .. autoattribute:: base_indices
+    .. autoattribute:: address_space
+    .. autoattribute:: base_storage
+    .. autoattribute:: initializer
+    .. autoattribute:: read_only
+    .. autoattribute:: _base_storage_access_may_be_aliasing
     """
 
     storage_shape: Optional[ShapeType]
     base_indices: Optional[Tuple[ExpressionT, ...]]
     address_space: Union[AddressSpace, Type[auto]]
     base_storage: Optional[str]
-    initializer: Optional[np.ndarray]
+    """The name of a storage array that is to be used to actually
+    hold the data in this temporary, or *None*. If not *None* or the name
+    of an existing variable, a variable of this name and appropriate size
+    will be created.
+    """
+
+    initializer: Optional[numpy.ndarray]
+    """*None* or a :class:`numpy.ndarray` of data to be used to initialize the
+    array.
+    """
+
     read_only: bool
+    """A :class:`bool` indicating whether the variable may be written during
+    its lifetime. If *True*, *initializer* must be given.
+    """
+
     _base_storage_access_may_be_aliasing: bool
+    """Whether the temporary is used to alias the underlying base storage.
+    Defaults to *False*. If *False*, C-based code generators will declare
+    the temporary as a ``restrict`` const pointer to the base storage
+    memory location. If *True*, the restrict part is omitted on this
+    declaration.
+    """
 
     min_target_axes: ClassVar[int] = 0
     max_target_axes: ClassVar[int] = 1
@@ -697,11 +683,28 @@ class TemporaryVariable(ArrayBase):
             "_base_storage_access_may_be_aliasing",
             )
 
-    def __init__(self, name, dtype=None, shape=auto, address_space=None,
-            dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
-            base_indices=None, storage_shape=None,
-            base_storage=None, initializer=None, read_only=False,
-            _base_storage_access_may_be_aliasing=False, **kwargs):
+    def __init__(
+                self,
+                name: str,
+                dtype: ToLoopyTypeConvertible = None,
+                shape: Union[ShapeType, Type["auto"], None] = auto,
+                address_space: Union[AddressSpace, Type[auto], None] = None,
+                dim_tags: Optional[Sequence[ArrayDimImplementationTag]] = None,
+                offset: Union[ExpressionT, str, None] = 0,
+                dim_names: Optional[Tuple[str, ...]] = None,
+                strides: Optional[Tuple[ExpressionT, ...]] = None,
+                order: str | None = None,
+
+                base_indices: Optional[Tuple[ExpressionT, ...]] = None,
+                storage_shape: ShapeType | None = None,
+
+                base_storage: Optional[str] = None,
+                initializer: Optional[np.ndarray] = None,
+                read_only: bool = False,
+
+                _base_storage_access_may_be_aliasing: bool = False,
+                **kwargs: Any
+            ) -> None:
         """
         :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
         :arg shape: :class:`loopy.auto` or a shape tuple
@@ -711,12 +714,6 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None,
         if address_space is None:
             address_space = auto
 
-        if address_space is None:
-            raise LoopyError(
-                    "temporary variable '%s': "
-                    "address_space must not be None"
-                    % name)
-
         if initializer is None:
             pass
         elif isinstance(initializer, np.ndarray):
@@ -751,7 +748,12 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None,
         if order is None:
             order = "C"
 
-        if base_indices is None and shape is not auto:
+        if shape is not None:
+            from loopy.kernel.array import _parse_shape_or_strides
+            shape = _parse_shape_or_strides(shape)
+
+        if base_indices is None and shape is not auto and shape is not None:
+            assert isinstance(shape, tuple)
             base_indices = (0,) * len(shape)
 
         if not read_only and initializer is not None:
@@ -790,7 +792,7 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None,
                     _base_storage_access_may_be_aliasing),
                 **kwargs)
 
-    def copy(self, **kwargs):
+    def copy(self, **kwargs: Any) -> TemporaryVariable:
         address_space = kwargs.pop("address_space", None)
 
         if address_space is not None:
@@ -799,15 +801,23 @@ def copy(self, **kwargs):
         return super().copy(**kwargs)
 
     @property
-    def nbytes(self):
-        shape = self.shape
+    def nbytes(self) -> ExpressionT:
+        """Size in bytes: product of the (storage) shape extents times itemsize."""
         if self.storage_shape is not None:
             shape = self.storage_shape
+        elif self.shape is None:
+            raise ValueError("shape is None")
+        elif self.shape is auto:
+            raise ValueError("shape is auto")
+        else:
+            shape = cast(Tuple[ExpressionT, ...], self.shape)
+        if self.dtype is None:
+            raise ValueError("data type is indeterminate")
 
         from pytools import product
         return product(si for si in shape)*self.dtype.itemsize
 
-    def __str__(self):
+    def __str__(self) -> str:
         if self.address_space is auto:
             aspace_str = "auto"
         else:
@@ -871,35 +881,27 @@ def supporting_names(self) -> FrozenSet[str]:
 
 # {{{ substitution rule
 
-class SubstitutionRule(ImmutableRecord):
+@dataclass(frozen=True)
+class SubstitutionRule:
     """
-    .. attribute:: name
-    .. attribute:: arguments
-
-        A tuple of strings
-
-    .. attribute:: expression
+    .. autoattribute:: name
+    .. autoattribute:: arguments
+    .. autoattribute:: expression
     """
 
-    def __init__(self, name, arguments, expression):
-        assert isinstance(arguments, tuple)
-
-        ImmutableRecord.__init__(self,
-                name=name, arguments=arguments, expression=expression)
+    name: str
+    arguments: Sequence[str]
+    expression: ExpressionT
 
-    def __str__(self):
-        return "{}({}) := {}".format(
-                self.name, ", ".join(self.arguments), self.expression)
+    def copy(self, **kwargs: Any) -> SubstitutionRule:
+        return replace(self, **kwargs)
 
     def update_persistent_hash(self, key_hash, key_builder):
-        """Custom hash computation function for use with
-        :class:`pytools.persistent_dict.PersistentDict`.
-        """
-
         key_builder.rec(key_hash, self.name)
         key_builder.rec(key_hash, self.arguments)
         key_builder.update_for_pymbolic_expression(key_hash, self.expression)
 
+
 # }}}
 
 
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index e3fcf108a..c96cd0fbb 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -23,7 +23,8 @@
 THE SOFTWARE.
 """
 
-from typing import TYPE_CHECKING, ClassVar, FrozenSet, Tuple
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Callable, ClassVar, FrozenSet, Tuple, TypeVar
 
 from pytools import ImmutableRecord
 
@@ -36,6 +37,8 @@
 
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from loopy.translation_unit import CallablesTable, FunctionIdT
 
 __doc__ = """
@@ -57,7 +60,23 @@
 
 # {{{ argument descriptors
 
-class ValueArgDescriptor(ImmutableRecord):
+ArgDescriptorT = TypeVar("ArgDescriptorT", bound="ArgDescriptor")
+
+
+class ArgDescriptor(ABC, ImmutableRecord):
+    """Abstract interface shared by the argument descriptors in this module."""
+
+    @abstractmethod
+    def map_expr(self,
+            subst_mapper: Callable[[ArgDescriptorT], ArgDescriptorT]) -> Self:
+        """Apply *subst_mapper*; subclasses return an instance of their own type."""
+
+    @abstractmethod
+    def depends_on(self) -> frozenset[str]:
+        """Return the set of names (strings) this descriptor depends on."""
+
+
+class ValueArgDescriptor(ArgDescriptor):
     hash_fields = ()
 
     def map_expr(self, subst_mapper):
@@ -69,7 +88,7 @@ def depends_on(self):
     update_persistent_hash = update_persistent_hash
 
 
-class ArrayArgDescriptor(ImmutableRecord):
+class ArrayArgDescriptor(ArgDescriptor):
     """
     Records information about an array argument to an in-kernel callable. To be
     passed to and returned from
@@ -672,7 +691,7 @@ def is_type_specialized(self):
 
 class CallableKernel(InKernelCallable):
     """
-    Records informations about a callee kernel. Also provides interface through
+    Records information about a callee kernel. Also provides interface through
     member methods to make the callee kernel compatible to be called from a
     caller kernel.
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 198b7c03f..a6420b8fc 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -20,10 +20,14 @@
 THE SOFTWARE.
 """
 
-from collections.abc import Set as abc_Set
+from collections.abc import (
+    Mapping as MappingABC,
+    Set as abc_Set,
+)
+from dataclasses import dataclass
 from functools import cached_property
 from sys import intern
-from typing import FrozenSet
+from typing import Any, FrozenSet, Mapping, Optional, Sequence, Tuple, Type, Union
 from warnings import warn
 
 import islpy as isl
@@ -31,7 +35,9 @@
 from pytools.tag import Tag, Taggable, tag_dataclass
 
 from loopy.diagnostic import LoopyError
-from loopy.tools import Optional
+from loopy.tools import Optional as LoopyOptional
+from loopy.types import LoopyType
+from loopy.typing import ExpressionT, InameStr
 
 
 # {{{ instruction tags
@@ -77,6 +83,44 @@ class UseStreamingStoreTag(Tag):
 # }}}
 
 
+# {{{ HappensAfter
+
+@dataclass(frozen=True)
+class HappensAfter:
+    """A class representing a "happens-after" relationship between two
+    statements found in a :class:`loopy.LoopKernel`. Used to validate that a
+    given kernel transformation respects the data dependencies in a given
+    program.
+
+    .. attribute:: variable_name
+
+        The name of the variable responsible for the dependency. For
+        backward compatibility purposes, this may be *None*. In this case, the
+        dependency semantics revert to the deprecated, statement-level
+        dependencies of prior versions of :mod:`loopy`.
+
+    .. attribute:: instances_rel
+
+        An :class:`islpy.Map` representing the precise happens-after
+        relationship. The domain and range are sets of statement instances. The
+        instances in the domain are required to execute before the instances in
+        the range.
+
+        Map dimensions are named according to the order of appearance of the
+        inames in a :mod:`loopy` program. The dimension names in the range are
+        appended with a prime to signify that the mapped instances are distinct.
+
+        As a (deprecated) matter of backward compatibility, this may be *None*,
+        in which case the semantics revert to the (underspecified)
+        statement-level dependencies of prior versions of :mod:`loopy`.
+    """
+
+    variable_name: Optional[str]
+    instances_rel: Optional[isl.Map]
+
+# }}}
+
+
 # {{{ instructions: base class
 
 class InstructionBase(ImmutableRecord, Taggable):
@@ -200,10 +244,20 @@ class InstructionBase(ImmutableRecord, Taggable):
 
     Inherits from :class:`pytools.tag.Taggable`.
     """
+    id: Optional[str]
+    happens_after: Mapping[str, HappensAfter]
+    depends_on_is_final: bool
+    groups: FrozenSet[str]
+    conflicts_with_groups: FrozenSet[str]
+    no_sync_with: FrozenSet[Tuple[str, str]]
+    predicates: FrozenSet[ExpressionT]
+    within_inames: FrozenSet[InameStr]
+    within_inames_is_final: bool
+    priority: int
 
     # within_inames_is_final is deprecated and will be removed in version 2017.x.
 
-    fields = set("id depends_on depends_on_is_final "
+    fields = set("id depends_on_is_final "
             "groups conflicts_with_groups "
             "no_sync_with "
             "predicates "
@@ -216,12 +270,23 @@ class InstructionBase(ImmutableRecord, Taggable):
     # Names of fields that are sets of pymbolic expressions. Needed for key building
     pymbolic_set_fields = {"predicates"}
 
-    def __init__(self, id, depends_on, depends_on_is_final,
-            groups, conflicts_with_groups,
-            no_sync_with,
-            within_inames_is_final, within_inames,
-            priority,
-            predicates, tags):
+    def __init__(self,
+                 id: Optional[str],
+                 happens_after: Union[
+                     Mapping[str, HappensAfter], FrozenSet[str], str, None],
+                 depends_on_is_final: Optional[bool],
+                 groups: Optional[FrozenSet[str]],
+                 conflicts_with_groups: Optional[FrozenSet[str]],
+                 no_sync_with: Optional[FrozenSet[Tuple[str, str]]],
+                 within_inames_is_final: Optional[bool],
+                 within_inames: Optional[FrozenSet[str]],
+                 priority: Optional[int],
+                 predicates: Optional[FrozenSet[str]],
+                 tags: Optional[FrozenSet[Tag]],
+                 *,
+                 depends_on: Union[FrozenSet[str], str, None] = None,
+                 ) -> None:
+        from immutabledict import immutabledict
 
         if predicates is None:
             predicates = frozenset()
@@ -237,8 +302,50 @@ def __init__(self, id, depends_on, depends_on_is_final,
         predicates = frozenset(new_predicates)
         del new_predicates
 
-        if depends_on is None:
-            depends_on = frozenset()
+        # {{{ process happens_after/depends_on
+
+        if happens_after is not None and depends_on is not None:
+            raise TypeError("may not pass both happens_after and depends_on")
+        elif depends_on is not None:
+            # FIXME Enable once we realistically check detailed dependencies.
+            # warn("depends_on is deprecated and will stop working in 2026. "
+            #      "Pass happens_after instead.", DeprecationWarning, stacklevel=2)
+            happens_after = depends_on
+
+        del depends_on
+
+        if depends_on_is_final and happens_after is None:
+            raise LoopyError("Setting depends_on_is_final to True requires "
+                    "actually specifying happens_after/depends_on")
+
+        if isinstance(happens_after, immutabledict):
+            pass
+        elif happens_after is None:
+            happens_after = immutabledict()
+        elif isinstance(happens_after, str):
+            warn("Passing a string for happens_after/depends_on is deprecated and "
+                 "will stop working in 2025. Instead, pass a full-fledged "
+                 "happens_after data structure.", DeprecationWarning, stacklevel=2)
+
+            happens_after = immutabledict({
+                    after_id.strip(): HappensAfter(
+                        variable_name=None,
+                        instances_rel=None)
+                    for after_id in happens_after.split(",")
+                    if after_id.strip()})
+        elif isinstance(happens_after, frozenset):
+            happens_after = immutabledict({
+                    after_id: HappensAfter(
+                        variable_name=None,
+                        instances_rel=None)
+                    for after_id in happens_after})
+        elif isinstance(happens_after, dict):
+            happens_after = immutabledict(happens_after)
+        else:
+            raise TypeError("'happens_after' has unexpected type: "
+                            f"{type(happens_after)}")
+
+        # }}}
 
         if groups is None:
             groups = frozenset()
@@ -255,16 +362,12 @@ def __init__(self, id, depends_on, depends_on_is_final,
         if within_inames_is_final is None:
             within_inames_is_final = False
 
-        if isinstance(depends_on, str):
-            depends_on = frozenset(
-                    s.strip() for s in depends_on.split(",") if s.strip())
-
         if depends_on_is_final is None:
             depends_on_is_final = False
 
-        if depends_on_is_final and not isinstance(depends_on, abc_Set):
+        if depends_on_is_final and not isinstance(happens_after, MappingABC):
             raise LoopyError("Setting depends_on_is_final to True requires "
-                    "actually specifying depends_on")
+                    "actually specifying happens_after/depends_on")
 
         if tags is None:
             tags = frozenset()
@@ -288,13 +391,16 @@ def __init__(self, id, depends_on, depends_on_is_final,
         # assert all(is_interned(pred) for pred in predicates)
 
         assert isinstance(within_inames, abc_Set)
-        assert isinstance(depends_on, abc_Set) or depends_on is None
+        assert isinstance(happens_after, MappingABC) or happens_after is None
         assert isinstance(groups, abc_Set)
         assert isinstance(conflicts_with_groups, abc_Set)
 
+        from loopy.tools import is_hashable
+        assert is_hashable(happens_after)
+
         ImmutableRecord.__init__(self,
                 id=id,
-                depends_on=depends_on,
+                happens_after=happens_after,
                 depends_on_is_final=depends_on_is_final,
                 no_sync_with=no_sync_with,
                 groups=groups, conflicts_with_groups=conflicts_with_groups,
@@ -307,6 +413,22 @@ def __init__(self, id, depends_on, depends_on_is_final,
                 # The Taggable constructor call does extra validation.
                 tags=tags)
 
+    def get_copy_kwargs(self, **kwargs):
+        """Build :meth:`copy` kwargs, honoring a legacy *depends_on* override."""
+        passed_depends_on = "depends_on" in kwargs
+        if passed_depends_on and "happens_after" in kwargs:
+            raise TypeError("may not pass both happens_after and depends_on")
+
+        kwargs = super().get_copy_kwargs(**kwargs)
+        if passed_depends_on:
+            # Drop happens_after (presumably copied from self) so depends_on wins.
+            # FIXME Enable once we realistically check detailed dependencies.
+            # warn("depends_on is deprecated and will stop working in 2026. "
+            #      "Instead, use happens_after.", DeprecationWarning, stacklevel=2)
+            del kwargs["happens_after"]
+
+        return kwargs
+
     # {{{ abstract interface
 
     def read_dependency_names(self):
@@ -348,6 +470,13 @@ def with_transformed_expressions(self, f, assignee_f=None):
 
     # }}}
 
+    @property
+    def depends_on(self):
+        # FIXME Enable once we realistically check detailed dependencies.
+        # warn("depends_on is deprecated and will stop working in 2026. "
+        #      "Use happens_after instead.", DeprecationWarning, stacklevel=2)
+        return frozenset(self.happens_after)
+
     @property
     def assignee_name(self):
         """A convenience wrapper around :meth:`assignee_var_names`
@@ -452,17 +581,24 @@ def update_persistent_hash(self, key_hash, key_builder):
     def __setstate__(self, val):
         super().__setstate__(val)
 
+        from immutabledict import immutabledict
+
         from loopy.tools import intern_frozenset_of_ids
 
         if self.id is not None:  # pylint:disable=access-member-before-definition
             self.id = intern(self.id)
-        self.depends_on = intern_frozenset_of_ids(self.depends_on)
+        self.happens_after = immutabledict({
+                intern(after_id): ha
+                for after_id, ha in self.happens_after.items()})
         self.groups = intern_frozenset_of_ids(self.groups)
         self.conflicts_with_groups = (
                 intern_frozenset_of_ids(self.conflicts_with_groups))
         self.within_inames = (
                 intern_frozenset_of_ids(self.within_inames))
 
+    def _with_new_tags(self, tags: FrozenSet[Tag]):
+        return self.copy(tags=tags)
+
 # }}}
 
 
@@ -790,30 +926,44 @@ class Assignment(MultiAssignmentBase):
     .. automethod:: __init__
     """
 
+    assignee: ExpressionT
+    expression: ExpressionT
+    temp_var_type: LoopyOptional
+    atomicity: Tuple[VarAtomicity, ...]
+
     fields = MultiAssignmentBase.fields | \
             set("assignee temp_var_type atomicity".split())
     pymbolic_fields = MultiAssignmentBase.pymbolic_fields | {"assignee"}
 
     def __init__(self,
-            assignee, expression,
-            id=None,
-            depends_on=None,
-            depends_on_is_final=None,
-            groups=None,
-            conflicts_with_groups=None,
-            no_sync_with=None,
-            within_inames_is_final=None,
-            within_inames=None,
-            tags=None,
-            temp_var_type=_not_provided, atomicity=(),
-            priority=0, predicates=frozenset()):
+                 assignee: Union[str, ExpressionT],
+                 expression: Union[str, ExpressionT],
+                 id: Optional[str] = None,
+                 happens_after: Union[
+                     Mapping[str, HappensAfter], FrozenSet[str], str, None] = None,
+                 depends_on_is_final: Optional[bool] = None,
+                 groups: Optional[FrozenSet[str]] = None,
+                 conflicts_with_groups: Optional[FrozenSet[str]] = None,
+                 no_sync_with: Optional[FrozenSet[Tuple[str, str]]] = None,
+                 within_inames_is_final: Optional[bool] = None,
+                 within_inames: Optional[FrozenSet[str]] = None,
+                 priority: Optional[int] = None,
+                 predicates: Optional[FrozenSet[str]] = None,
+                 tags: Optional[FrozenSet[Tag]] = None,
+                 temp_var_type: Union[
+                     Type[_not_provided], None, LoopyOptional,
+                     LoopyType] = _not_provided,
+                 atomicity: Tuple[VarAtomicity, ...] = (),
+                 *,
+                 depends_on: Union[FrozenSet[str], str, None] = None,
+                 ) -> None:
 
         if temp_var_type is _not_provided:
-            temp_var_type = Optional()
+            temp_var_type = LoopyOptional()
 
         super().__init__(
                 id=id,
-                depends_on=depends_on,
+                happens_after=happens_after,
                 depends_on_is_final=depends_on_is_final,
                 groups=groups,
                 conflicts_with_groups=conflicts_with_groups,
@@ -822,7 +972,8 @@ def __init__(self,
                 within_inames=within_inames,
                 priority=priority,
                 predicates=predicates,
-                tags=tags)
+                tags=tags,
+                depends_on=depends_on)
 
         from loopy.symbolic import parse
         if isinstance(assignee, str):
@@ -934,7 +1085,7 @@ class CallInstruction(MultiAssignmentBase):
 
         A tuple of `:class:loopy.Optional`. If an entry is not empty, it
         contains the type that will be assigned to the new temporary variable
-        created from the assigment.
+        created from the assignment.
 
     .. automethod:: __init__
     """
@@ -946,7 +1097,7 @@ class CallInstruction(MultiAssignmentBase):
     def __init__(self,
             assignees, expression,
             id=None,
-            depends_on=None,
+            happens_after=None,
             depends_on_is_final=None,
             groups=None,
             conflicts_with_groups=None,
@@ -955,11 +1106,12 @@ def __init__(self,
             within_inames=None,
             tags=None,
             temp_var_types=None,
-            priority=0, predicates=frozenset()):
+            priority=0, predicates=frozenset(),
+            depends_on=None):
 
         super().__init__(
                 id=id,
-                depends_on=depends_on,
+                happens_after=happens_after,
                 depends_on_is_final=depends_on_is_final,
                 groups=groups,
                 conflicts_with_groups=conflicts_with_groups,
@@ -968,7 +1120,8 @@ def __init__(self,
                 within_inames=within_inames,
                 priority=priority,
                 predicates=predicates,
-                tags=tags)
+                tags=tags,
+                depends_on=depends_on)
 
         from pymbolic.primitives import Call
 
@@ -1001,7 +1154,7 @@ def __init__(self,
         self.expression = expression
 
         if temp_var_types is None:
-            self.temp_var_types = (Optional(),) * len(self.assignees)
+            self.temp_var_types = (LoopyOptional(),) * len(self.assignees)
         else:
             self.temp_var_types = tuple(
                     _check_and_fix_temp_var_type(tvt, stacklevel=3)
@@ -1099,7 +1252,7 @@ def is_array_call(assignees, expression):
     Returns *True* is the instruction is an array call.
 
     An array call is a function call applied to array type objects. If any of
-    the arguemnts or assignees to the function is an array,
+    the arguments or assignees to the function is an array,
     :meth:`is_array_call` will return *True*.
     """
     from pymbolic.primitives import Call, Subscript
@@ -1143,10 +1296,16 @@ def modify_assignee_for_array_call(assignee):
                 "SubArrayRef as its inputs")
 
 
-def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
+def make_assignment(assignees: tuple[ExpressionT, ...],
+                    expression: ExpressionT,
+                    temp_var_types: (
+                        Sequence[LoopyType | None] | None) = None,
+                    **kwargs: Any) -> Assignment | CallInstruction:
 
-    if temp_var_types is None:
-        temp_var_types = (Optional(),) * len(assignees)
+    if temp_var_types is not None:
+        tv_types: Sequence[LoopyType | LoopyOptional | None] = temp_var_types
+    else:
+        tv_types = (LoopyOptional(),) * len(assignees)
 
     if len(assignees) != 1 or is_array_call(assignees, expression):
         atomicity = kwargs.pop("atomicity", ())
@@ -1176,7 +1335,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
                     assignees=tuple(modify_assignee_for_array_call(
                         assignee) for assignee in assignees),
                     expression=expression,
-                    temp_var_types=temp_var_types,
+                    temp_var_types=tuple(tv_types),
                     **kwargs)
     else:
         def _is_array(expr):
@@ -1196,10 +1355,13 @@ def _is_array(expr):
             raise LoopyError("Array calls only supported as instructions"
                     " with function call as RHS for now.")
 
+        assignee, = assignees
+        tv_type, = tv_types
+
         return Assignment(
-                assignee=assignees[0],
+                assignee=assignee,
                 expression=expression,
-                temp_var_type=temp_var_types[0],
+                temp_var_type=tv_type,
                 **kwargs)
 
 
@@ -1248,12 +1410,13 @@ class CInstruction(InstructionBase):
     def __init__(self,
             iname_exprs, code,
             read_variables=frozenset(), assignees=(),
-            id=None, depends_on=None, depends_on_is_final=None,
+            id=None, happens_after=None, depends_on_is_final=None,
             groups=None, conflicts_with_groups=None,
             no_sync_with=None,
             within_inames_is_final=None, within_inames=None,
             priority=0,
-            predicates=frozenset(), tags=None):
+            predicates=frozenset(), tags=None,
+            depends_on=None):
         """
         :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples,
             simple strings pepresenting inames are also allowed. A single
@@ -1266,13 +1429,14 @@ def __init__(self,
 
         InstructionBase.__init__(self,
                 id=id,
-                depends_on=depends_on,
+                happens_after=happens_after,
                 depends_on_is_final=depends_on_is_final,
                 groups=groups, conflicts_with_groups=conflicts_with_groups,
                 no_sync_with=no_sync_with,
                 within_inames_is_final=within_inames_is_final,
                 within_inames=within_inames,
-                priority=priority, predicates=predicates, tags=tags)
+                priority=priority, predicates=predicates, tags=tags,
+                depends_on=depends_on)
 
         # {{{ normalize iname_exprs
 
@@ -1416,15 +1580,15 @@ class NoOpInstruction(_DataObliviousInstruction):
         ... nop
     """
 
-    def __init__(self, id=None, depends_on=None, depends_on_is_final=None,
+    def __init__(self, id=None, happens_after=None, depends_on_is_final=None,
             groups=None, conflicts_with_groups=None,
             no_sync_with=None,
             within_inames_is_final=None, within_inames=None,
             priority=None,
-            predicates=None, tags=None):
+            predicates=None, tags=None, depends_on=None):
         super().__init__(
                 id=id,
-                depends_on=depends_on,
+                happens_after=happens_after,
                 depends_on_is_final=depends_on_is_final,
                 groups=groups,
                 conflicts_with_groups=conflicts_with_groups,
@@ -1433,7 +1597,8 @@ def __init__(self, id=None, depends_on=None, depends_on_is_final=None,
                 within_inames=within_inames,
                 priority=priority,
                 predicates=predicates,
-                tags=tags)
+                tags=tags,
+                depends_on=depends_on)
 
     def __str__(self):
         first_line = "%s: ... nop" % self.id
@@ -1460,7 +1625,7 @@ class BarrierInstruction(_DataObliviousInstruction):
     .. attribute:: mem_kind
 
         A string, ``"global"`` or ``"local"``. Chooses which memory type to
-        sychronize, for targets that require this (e.g. OpenCL)
+        synchronize, for targets that require this (e.g. OpenCL)
 
     The textual syntax in a :mod:`loopy` kernel is::
 
@@ -1475,20 +1640,21 @@ class BarrierInstruction(_DataObliviousInstruction):
     fields = _DataObliviousInstruction.fields | {"synchronization_kind",
                                                      "mem_kind"}
 
-    def __init__(self, id, depends_on=None, depends_on_is_final=None,
+    def __init__(self, id, happens_after=None, depends_on_is_final=None,
             groups=None, conflicts_with_groups=None,
             no_sync_with=None,
             within_inames_is_final=None, within_inames=None,
             priority=None,
             predicates=None, tags=None, synchronization_kind="global",
-            mem_kind="local"):
+            mem_kind="local",
+            depends_on=None):
 
         if predicates:
             raise LoopyError("conditional barriers are not supported")
 
         super().__init__(
                 id=id,
-                depends_on=depends_on,
+                happens_after=happens_after,
                 depends_on_is_final=depends_on_is_final,
                 groups=groups,
                 conflicts_with_groups=conflicts_with_groups,
@@ -1497,8 +1663,8 @@ def __init__(self, id, depends_on=None, depends_on_is_final=None,
                 within_inames=within_inames,
                 priority=priority,
                 predicates=predicates,
-                tags=tags
-                )
+                tags=tags,
+                depends_on=depends_on)
 
         self.synchronization_kind = synchronization_kind
         self.mem_kind = mem_kind
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 5ed9b2ad3..9a14aedd5 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -34,6 +34,7 @@
 import islpy as isl
 from islpy import dim_type
 from pytools import memoize_on_first_arg, natsorted
+from pytools.tag import Tag
 
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from loopy.kernel import LoopKernel
@@ -44,7 +45,8 @@
     _DataObliviousInstruction,
 )
 from loopy.symbolic import CombineMapper
-from loopy.translation_unit import TranslationUnit, for_each_kernel
+from loopy.translation_unit import TranslationUnit, TUnitOrKernelT, for_each_kernel
+from loopy.types import ToLoopyTypeConvertible
 
 
 logger = logging.getLogger(__name__)
@@ -52,15 +54,20 @@
 
 # {{{ add and infer argument dtypes
 
-def add_dtypes(prog_or_kernel, dtype_dict):
+def add_dtypes(
+            kernel: TUnitOrKernelT,
+            dtype_dict: Mapping[str, ToLoopyTypeConvertible],
+        ) -> TUnitOrKernelT:
     """Specify remaining unspecified argument/temporary variable types.
 
     :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype`
         instances
     """
-    if isinstance(prog_or_kernel, TranslationUnit):
+    if isinstance(kernel, TranslationUnit):
+        t_unit = kernel
+        del kernel
         kernel_names = [clbl.subkernel.name for clbl in
-                prog_or_kernel.callables_table.values() if isinstance(clbl,
+                t_unit.callables_table.values() if isinstance(clbl,
                     CallableKernel)]
         if len(kernel_names) != 1:
             raise LoopyError("add_dtypes may not take a TranslationUnit with more"
@@ -69,10 +76,10 @@ def add_dtypes(prog_or_kernel, dtype_dict):
 
         kernel_name, = kernel_names
 
-        return prog_or_kernel.with_kernel(
-                add_dtypes(prog_or_kernel[kernel_name], dtype_dict))
+        return t_unit.with_kernel(
+                add_dtypes(t_unit[kernel_name], dtype_dict))
 
-    assert isinstance(prog_or_kernel, LoopKernel)
+    assert isinstance(kernel, LoopKernel)
 
     processed_dtype_dict = {}
 
@@ -83,13 +90,13 @@ def add_dtypes(prog_or_kernel, dtype_dict):
                 processed_dtype_dict[subkey] = v
 
     dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(
-            prog_or_kernel, processed_dtype_dict)
+            kernel, processed_dtype_dict)
 
     if dtype_dict_remainder:
         raise RuntimeError("unused argument dtypes: %s"
                 % ", ".join(dtype_dict_remainder))
 
-    return prog_or_kernel.copy(args=new_args, temporary_variables=new_temp_vars)
+    return kernel.copy(args=new_args, temporary_variables=new_temp_vars)
 
 
 def _add_dtypes_overdetermined(kernel, dtype_dict):
@@ -263,7 +270,7 @@ def find_all_insn_inames(kernel):
             if insn.within_inames_is_final:
                 continue
 
-            # {{{ depdency-based propagation
+            # {{{ dependency-based propagation
 
             inames_old = insn_id_to_inames[insn.id]
             inames_new = inames_old | guess_iname_deps_based_on_var_use(
@@ -513,8 +520,8 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True,
     """
 
     # make sure all automatically added stuff shows up
-    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
-    kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=False)
+    from loopy.kernel.creation import apply_single_writer_dependency_heuristic
+    kernel = apply_single_writer_dependency_heuristic(kernel, warn_if_used=False)
 
     if iname_cluster and not kernel.linearization:
         try:
@@ -1252,9 +1259,9 @@ def find_recursive_dependencies(kernel, insn_ids):
 
         for insn_id in queue:
             insn = kernel.id_to_insn[insn_id]
-            additionals = insn.depends_on - result
-            result.update(additionals)
-            new_queue.extend(additionals)
+            additional = insn.depends_on - result
+            result.update(additional)
+            new_queue.extend(additional)
 
         queue = new_queue
 
@@ -1477,7 +1484,7 @@ def conform_to_uniform_length(s):
 
 # {{{ stringify_instruction_list
 
-def stringify_instruction_tag(tag):
+def stringify_instruction_tag(tag: Tag) -> str:
     from loopy.kernel.instruction import LegacyStringInstructionTag
     if isinstance(tag, LegacyStringInstructionTag):
         return f"S({tag.value})"
@@ -1485,7 +1492,7 @@ def stringify_instruction_tag(tag):
         return str(tag)
 
 
-def stringify_instruction_list(kernel):
+def stringify_instruction_list(kernel: LoopKernel) -> list[str]:
     # {{{ topological sort
 
     printed_insn_ids = set()
@@ -1519,7 +1526,7 @@ def insert_insn_into_order(insn):
 
     leader = " " * uniform_arrow_length
     lines = []
-    current_inames = [set()]
+    current_inames: list[set[str]] = [set()]
 
     if uniform_arrow_length:
         indent_level = [1]
@@ -1530,13 +1537,13 @@ def insert_insn_into_order(insn):
 
     iname_order = kernel._get_iname_order_for_printing()
 
-    def add_pre_line(s):
+    def add_pre_line(s: str) -> None:
         lines.append(leader + " " * indent_level[0] + s)
 
-    def add_main_line(s):
+    def add_main_line(s: str) -> None:
         lines.append(arrows + " " * indent_level[0] + s)
 
-    def add_post_line(s):
+    def add_post_line(s: str) -> None:
         lines.append(extender + " " * indent_level[0] + s)
 
     def adapt_to_new_inames_list(new_inames):
@@ -1735,7 +1742,7 @@ def get_global_barrier_order(kernel):
 
 @memoize_on_first_arg
 def find_most_recent_global_barrier(kernel, insn_id):
-    """Return the id of the latest occuring global barrier which the
+    """Return the id of the latest occurring global barrier which the
     given instruction (indirectly or directly) depends on, or *None* if this
     instruction does not depend on a global barrier.
 
@@ -1995,7 +2002,7 @@ def infer_args_are_input_output(kernel):
         elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)):
             pass
         else:
-            raise NotImplementedError("Unkonwn argument type %s." % type(arg))
+            raise NotImplementedError("Unknown argument type %s." % type(arg))
 
         if not (arg.is_input or arg.is_output):
             raise LoopyError("Kernel argument must be either input or output."
diff --git a/loopy/match.py b/loopy/match.py
index 889f4e74f..5e409791b 100644
--- a/loopy/match.py
+++ b/loopy/match.py
@@ -1,4 +1,4 @@
-"""Matching functionality for instruction ids and subsitution
+"""Matching functionality for instruction ids and substitution
 rule invocations stacks."""
 
 
diff --git a/loopy/options.py b/loopy/options.py
index 9c4fa0fb4..293670774 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -23,6 +23,7 @@
 
 import os
 import re
+from typing import Any
 from warnings import warn
 
 from pytools import ImmutableRecord
@@ -118,7 +119,7 @@ class Options(ImmutableRecord):
 
     .. attribute:: cl_exec_manage_array_events
 
-        Within the PyOpenCL executor, respect and udpate
+        Within the PyOpenCL executor, respect and update
         :attr:`pyopencl.array.Array.events`.
 
         Defaults to *True*.
@@ -156,7 +157,7 @@ class Options(ImmutableRecord):
 
         Allow re-ordering of floating point arithmetic. Re-ordering may
         give different results as floating point arithmetic is not
-        associative in addition and mulitplication. Default is *True*.
+        associative in addition and multiplication. Default is *True*.
         Note that the implementation of this option is currently incomplete.
 
     .. attribute:: build_options
@@ -214,7 +215,7 @@ def __init__(
             # All defaults are further required to be False when cast to bool
             # for the update() functionality to work.
 
-            self, **kwargs):
+            self, **kwargs: Any) -> None:
 
         kwargs = _apply_legacy_map(self._legacy_options_map, kwargs)
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index d24e14cc2..3293e9a1e 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -49,7 +49,11 @@
     auto,
     filter_iname_tags_by_type,
 )
-from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+from loopy.kernel.function_interface import (
+    ArgDescriptor,
+    CallableKernel,
+    ScalarCallable,
+)
 
 # from loopy.transform.iname import remove_any_newly_unused_inames
 from loopy.kernel.instruction import (
@@ -655,7 +659,7 @@ def traverse_to_infer_arg_descr(kernel, callables_table):
     return descr_inferred_kernel, arg_descr_inf_mapper.clbl_inf_ctx
 
 
-def infer_arg_descr(program):
+def infer_arg_descr(t_unit: TranslationUnit) -> TranslationUnit:
     """
     Returns a copy of *program* with the
     :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the
@@ -666,12 +670,12 @@ def infer_arg_descr(program):
     from loopy.kernel.function_interface import ArrayArgDescriptor, ValueArgDescriptor
     from loopy.translation_unit import make_clbl_inf_ctx, resolve_callables
 
-    program = resolve_callables(program)
+    t_unit = resolve_callables(t_unit)
 
-    clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table,
-                                     program.entrypoints)
+    clbl_inf_ctx = make_clbl_inf_ctx(t_unit.callables_table,
+                                     t_unit.entrypoints)
 
-    for e in program.entrypoints:
+    for e in t_unit.entrypoints:
         def _tuple_or_none(s):
             if isinstance(s, tuple):
                 return s
@@ -680,8 +684,8 @@ def _tuple_or_none(s):
             else:
                 return s,
 
-        arg_id_to_descr = {}
-        for arg in program[e].args:
+        arg_id_to_descr: dict[str, ArgDescriptor] = {}
+        for arg in t_unit[e].args:
             if isinstance(arg, ArrayBase):
                 if arg.shape not in (None, auto):
                     arg_id_to_descr[arg.name] = ArrayArgDescriptor(
@@ -691,12 +695,12 @@ def _tuple_or_none(s):
                 arg_id_to_descr[arg.name] = ValueArgDescriptor()
             else:
                 raise NotImplementedError()
-        new_callable, clbl_inf_ctx = program.callables_table[e].with_descrs(
+        new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_descrs(
                 arg_id_to_descr, clbl_inf_ctx)
         clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable,
                                                             is_entrypoint=True)
 
-    return clbl_inf_ctx.finish_program(program)
+    return clbl_inf_ctx.finish_program(t_unit)
 
 # }}}
 
@@ -825,8 +829,8 @@ def preprocess_program(t_unit: TranslationUnit) -> TranslationUnit:
     from loopy.transform.subst import expand_subst
     t_unit = expand_subst(t_unit)
 
-    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
-    t_unit = apply_single_writer_depencency_heuristic(t_unit)
+    from loopy.kernel.creation import apply_single_writer_dependency_heuristic
+    t_unit = apply_single_writer_dependency_heuristic(t_unit)
 
     # Ordering restrictions:
     #
diff --git a/loopy/py.typed b/loopy/py.typed
new file mode 100644
index 000000000..e69de29bb
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index ca45521e3..1364be850 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from loopy.typing import not_none
+
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -51,11 +53,16 @@
 from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel
 from loopy.kernel.instruction import InstructionBase
 from loopy.tools import LoopyKeyBuilder, caches
+from loopy.typing import InameStr
 from loopy.version import DATA_MODEL_VERSION
 
 
 if TYPE_CHECKING:
     from loopy.kernel import LoopKernel
+    from loopy.schedule.tools import (
+        InameStrSet,
+        LoopTree,
+    )
     from loopy.translation_unit import CallablesTable, TranslationUnit
 
 
@@ -63,12 +70,11 @@
 
 
 __doc__ = """
-.. currentmodule:: loopy.schedule
-
 .. autoclass:: ScheduleItem
 .. autoclass:: BeginBlockItem
 .. autoclass:: EndBlockItem
 .. autoclass:: CallKernel
+.. autoclass:: ReturnFromKernel
 .. autoclass:: Barrier
 .. autoclass:: RunInstruction
 
@@ -713,12 +719,13 @@ def get_insns_in_topologically_sorted_order(
     from pytools.graph import compute_topological_order
 
     rev_dep_map: Dict[str, Set[str]] = {
-            insn.id: set() for insn in kernel.instructions}
+            not_none(insn.id): set() for insn in kernel.instructions}
     for insn in kernel.instructions:
         for dep in insn.depends_on:
+            assert insn.id is not None
             rev_dep_map[dep].add(insn.id)
 
-    # For breaking ties, we compare the features of an intruction
+    # For breaking ties, we compare the features of an instruction
     # so that instructions with the same set of features are lumped
     # together. This helps in :method:`schedule_as_many_run_insns_as_possible`
     # which bails after 5 insns that don't have the same feature.
@@ -877,7 +884,170 @@ def is_similar_to_template(insn):
 # }}}
 
 
-# {{{ scheduling algorithm
+# {{{ scheduling algorithm v2
+
+def _get_outermost_diverging_inames(
+            tree: LoopTree,
+            within1: InameStrSet,
+            within2: InameStrSet
+        ) -> Tuple[InameStr, InameStr]:
+    """
+    For loop nestings *within1* and *within2*, returns the first inames at which
+    the loop nests diverge in the loop nesting tree *tree*.
+    """
+    common_ancestors = (within1 & within2) | {""}
+
+    innermost_parent = max(common_ancestors,
+                           key=lambda k: tree.depth(k))
+    iname1, = frozenset(tree.children(innermost_parent)) & within1
+    iname2, = frozenset(tree.children(innermost_parent)) & within2
+
+    return iname1, iname2
+
+
+def _generate_loop_schedules_v2(kernel: LoopKernel) -> Sequence[ScheduleItem]:
+    from functools import reduce
+
+    from pytools.graph import compute_topological_order
+
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+    from loopy.schedule.tools import get_loop_tree
+
+    concurrent_inames = {iname for iname in kernel.all_inames()
+                         if kernel.iname_tags_of_type(iname, ConcurrentTag)}
+    ilp_inames = {iname for iname in kernel.all_inames()
+                  if kernel.iname_tags_of_type(iname, IlpBaseTag)}
+    vec_inames = {iname for iname in kernel.all_inames()
+                  if kernel.iname_tags_of_type(iname, VectorizeTag)}
+    parallel_inames = (concurrent_inames - ilp_inames - vec_inames)
+
+    # {{{ can v2 scheduler handle the kernel?
+
+    from loopy.schedule.tools import V2SchedulerNotImplementedError
+    if any(insn.conflicts_with_groups for insn in kernel.instructions):
+        raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule"
+                " kernels with instruction having conflicts with groups.")
+
+    if any(insn.priority != 0 for insn in kernel.instructions):
+        raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule"
+                " kernels with instruction priorities set.")
+
+    if kernel.schedule is not None:
+        # cannot handle preschedule yet
+        raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule"
+                " prescheduled kernels.")
+
+    if ilp_inames or vec_inames:
+        raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule"
+                " loops tagged with 'ilp'/'vec' as they are not guaranteed to"
+                " be single entry loops.")
+
+    # }}}
+
+    loop_tree = get_loop_tree(kernel)
+
+    # loop_inames: inames that are realized as loops. Concurrent inames aren't
+    # realized as a loop in the generated code for a loopy.TargetBase.
+
+    # FIXME: These three could be one statement if it weren't for
+    # - https://github.com/python/mypy/issues/17693
+    # - https://github.com/python/mypy/issues/17694
+    emptyset: frozenset[InameStr] = frozenset()
+    all_inames = reduce(
+                        frozenset.union,
+                        (insn.within_inames for insn in kernel.instructions),
+                        emptyset)
+    loop_inames = all_inames - parallel_inames
+
+    # The idea here is to build a DAG, where nodes are schedule items and if
+    # there exists an edge from schedule item A to schedule item B in the DAG =>
+    # B *must* come after A in the linearized result.
+
+    dag: dict[ScheduleItem, frozenset[ScheduleItem]] = {}
+
+    # LeaveLoop(i) *must* follow EnterLoop(i)
+    dag.update({EnterLoop(iname=iname): frozenset({LeaveLoop(iname=iname)})
+                for iname in loop_inames})
+    dag.update({LeaveLoop(iname=iname): frozenset()
+                for iname in loop_inames})
+    dag.update({RunInstruction(insn_id=not_none(insn.id)): frozenset()
+                for insn in kernel.instructions})
+
+    # {{{ add constraints imposed by the loop nesting
+
+    for outer_loop in loop_tree.nodes():
+        if outer_loop == "":
+            continue
+
+        for child in loop_tree.children(outer_loop):
+            inner_loop = child
+            dag[EnterLoop(iname=outer_loop)] |= {EnterLoop(iname=inner_loop)}
+            dag[LeaveLoop(iname=inner_loop)] |= {LeaveLoop(iname=outer_loop)}
+
+    # }}}
+
+    # {{{ add deps. between schedule items coming from insn. dependencies
+
+    for insn in kernel.instructions:
+        assert insn.id is not None
+
+        insn_loop_inames = insn.within_inames & loop_inames
+        for dep_id in insn.depends_on:
+            dep = kernel.id_to_insn[dep_id]
+            dep_loop_inames = dep.within_inames & loop_inames
+            # Enforce instruction dep:
+            dag[RunInstruction(insn_id=dep_id)] |= {RunInstruction(insn_id=insn.id)}
+
+            # {{{ register deps on loop entry/leave because of insn. deps
+
+            if dep_loop_inames < insn_loop_inames:
+                for iname in insn_loop_inames - dep_loop_inames:
+                    dag[RunInstruction(insn_id=dep.id)] |= {EnterLoop(iname=iname)}
+            elif insn_loop_inames < dep_loop_inames:
+                for iname in dep_loop_inames - insn_loop_inames:
+                    dag[LeaveLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)}
+            elif dep_loop_inames != insn_loop_inames:
+                insn_iname, dep_iname = _get_outermost_diverging_inames(
+                        loop_tree, insn_loop_inames, dep_loop_inames)
+                dag[LeaveLoop(iname=dep_iname)] |= {EnterLoop(iname=insn_iname)}
+            else:
+                pass
+
+            # }}}
+
+        for iname in insn_loop_inames:
+            # For an insn within a loop nest 'i'
+            # for i
+            #   insn
+            # end i
+            # 'insn' *must* come b/w 'for i' and 'end i'
+            dag[EnterLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)}
+            dag[RunInstruction(insn_id=insn.id)] |= {LeaveLoop(iname=iname)}
+
+    # }}}
+
+    def iname_key(iname: str) -> str:
+        all_ancestors = sorted(loop_tree.ancestors(iname),
+                               key=lambda x: loop_tree.depth(x))
+        return ",".join(all_ancestors+[iname])
+
+    def key(x: ScheduleItem) -> tuple[str, ...]:
+        if isinstance(x, RunInstruction):
+            iname = max((kernel.id_to_insn[x.insn_id].within_inames & loop_inames),
+                        key=lambda k: loop_tree.depth(k),
+                        default="")
+            return (iname_key(iname), x.insn_id)
+        elif isinstance(x, (EnterLoop, LeaveLoop)):
+            return (iname_key(x.iname),)
+        else:
+            raise NotImplementedError
+
+    return compute_topological_order(dag, key=key)
+
+# }}}
+
+
+# {{{ legacy scheduling algorithm
 
 def _generate_loop_schedules_internal(
         sched_state, debug=None):
@@ -1196,7 +1366,7 @@ def insn_sort_key(insn_id):
                                 print(
                                     "%(warn)swarning:%(reset_all)s '%(iname)s', "
                                     "which the schedule is "
-                                    "currently stuck inside of, seems mis-nested. "
+                                    "currently stuck inside of, seems misnested. "
                                     "'%(subdep)s' must occur " "before '%(dep)s', "
                                     "but '%(subdep)s must be outside "
                                     "'%(iname)s', whereas '%(dep)s' must be back "
@@ -1404,7 +1574,7 @@ def insn_sort_key(insn_id):
                     get_priority_tiers(wanted, sched_state.kernel.loop_priority))
 
             # Update the loop priority set, because some constraints may have
-            # have been contradictary.
+            # have been contradictory.
             loop_priority_set = set().union(*[set(t) for t in priority_tiers])
 
             priority_tiers.append(
@@ -2029,6 +2199,40 @@ def generate_loop_schedules(
                 callables_table, debug_args=debug_args)
 
 
+def _postprocess_schedule(kernel, callables_table, gen_sched):
+    from loopy.kernel import KernelState
+
+    gen_sched = convert_barrier_instructions_to_barriers(
+            kernel, gen_sched)
+
+    gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table,
+                                                     return_dict=True)
+
+    if (gsize or lsize):
+        if not kernel.options.disable_global_barriers:
+            logger.debug("%s: barrier insertion: global" % kernel.name)
+            gen_sched = insert_barriers(kernel, callables_table, gen_sched,
+                    synchronization_kind="global",
+                    verify_only=(not
+                        kernel.options.insert_gbarriers))
+
+        logger.debug("%s: barrier insertion: local" % kernel.name)
+        gen_sched = insert_barriers(kernel, callables_table, gen_sched,
+            synchronization_kind="local", verify_only=False)
+        logger.debug("%s: barrier insertion: done" % kernel.name)
+
+    new_kernel = kernel.copy(
+            linearization=gen_sched,
+            state=KernelState.LINEARIZED)
+
+    from loopy.schedule.device_mapping import map_schedule_onto_host_or_device
+    if kernel.state != KernelState.LINEARIZED:
+        # Device mapper only gets run once.
+        new_kernel = map_schedule_onto_host_or_device(new_kernel)
+
+    return new_kernel
+
+
 def _generate_loop_schedules_inner(
         kernel: LoopKernel,
         callables_table: CallablesTable,
@@ -2041,6 +2245,19 @@ def _generate_loop_schedules_inner(
         raise LoopyError("cannot schedule a kernel that has not been "
                 "preprocessed")
 
+    from loopy.schedule.tools import V2SchedulerNotImplementedError
+    try:
+        gen_sched = _generate_loop_schedules_v2(kernel)
+        yield _postprocess_schedule(kernel, callables_table, gen_sched)
+        return
+
+    except V2SchedulerNotImplementedError as e:
+        warn_with_kernel(
+            kernel,
+            "v1_scheduler_fallback",
+            f"Falling back to a slow scheduler implementation due to: {e}",
+             stacklevel=1)
+
     schedule_count = 0
 
     debug = ScheduleDebugger(**debug_args)
@@ -2102,7 +2319,7 @@ def _generate_loop_schedules_inner(
 
             schedule=(),
 
-            unscheduled_insn_ids={insn.id for insn in kernel.instructions},
+            unscheduled_insn_ids={not_none(insn.id) for insn in kernel.instructions},
             scheduled_insn_ids=frozenset(),
             within_subkernel=kernel.state != KernelState.LINEARIZED,
             may_schedule_global_barriers=True,
@@ -2155,33 +2372,7 @@ def print_longest_dead_end():
                 sched_state, debug=debug, **schedule_gen_kwargs):
             debug.stop()
 
-            gen_sched = convert_barrier_instructions_to_barriers(
-                    kernel, gen_sched)
-
-            gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table,
-                                                             return_dict=True)
-
-            if (gsize or lsize):
-                if not kernel.options.disable_global_barriers:
-                    logger.debug("%s: barrier insertion: global" % kernel.name)
-                    gen_sched = insert_barriers(kernel, callables_table, gen_sched,
-                            synchronization_kind="global",
-                            verify_only=(not
-                                kernel.options.insert_gbarriers))
-
-                logger.debug("%s: barrier insertion: local" % kernel.name)
-                gen_sched = insert_barriers(kernel, callables_table, gen_sched,
-                    synchronization_kind="local", verify_only=False)
-                logger.debug("%s: barrier insertion: done" % kernel.name)
-
-            new_kernel = kernel.copy(
-                    linearization=gen_sched,
-                    state=KernelState.LINEARIZED)
-
-            from loopy.schedule.device_mapping import map_schedule_onto_host_or_device
-            if kernel.state != KernelState.LINEARIZED:
-                # Device mapper only gets run once.
-                new_kernel = map_schedule_onto_host_or_device(new_kernel)
+            new_kernel = _postprocess_schedule(kernel, callables_table, gen_sched)
 
             yield new_kernel
 
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
index b2a44c499..3858462b1 100644
--- a/loopy/schedule/tools.py
+++ b/loopy/schedule/tools.py
@@ -1,4 +1,35 @@
-__copyright__ = "Copyright (C) 2016 Matt Wala"
+__copyright__ = """
+Copyright (C) 2016 Matt Wala
+Copyright (C) 2020 University of Illinois Board of Trustees
+Copyright (C) 2022 Kaushik Kulkarni
+"""
+
+__doc__ = """
+.. autofunction:: get_block_boundaries
+.. autofunction:: temporaries_read_in_subkernel
+.. autofunction:: args_read_in_subkernel
+.. autofunction:: args_written_in_subkernel
+.. autofunction:: supporting_temporary_names
+
+.. autoclass:: KernelArgInfo
+.. autoclass:: SubKernelArgInfo
+
+.. autofunction:: get_kernel_arg_info
+.. autofunction:: get_subkernel_arg_info
+
+.. autofunction:: get_return_from_kernel_mapping
+
+.. autoclass:: AccessMapDescriptor
+.. autoclass:: WriteRaceChecker
+
+.. autoclass:: InameStrSet
+.. autoclass:: LoopNestTree
+.. autoclass:: LoopTree
+
+.. autofunction:: separate_loop_nest
+.. autofunction:: get_partial_loop_nest_tree
+.. autofunction:: get_loop_tree
+"""
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -21,24 +52,32 @@
 """
 
 import enum
+from collections.abc import Callable, Collection, Mapping
 from dataclasses import dataclass
-from functools import cached_property
-from typing import Dict, FrozenSet, List, Sequence, Set, Tuple
+from functools import cached_property, reduce
+from typing import AbstractSet, Dict, FrozenSet, List, Sequence, Set, Tuple
+
+from immutables import Map
+from typing_extensions import TypeAlias
 
 import islpy as isl
-from pytools import memoize_method
+from pytools import memoize_method, memoize_on_first_arg
 
+from loopy.diagnostic import LoopyError
 from loopy.kernel import LoopKernel
 from loopy.kernel.data import AddressSpace, ArrayArg, TemporaryVariable
+from loopy.schedule import ScheduleItem
+from loopy.schedule.tree import Tree
+from loopy.typing import InameStr, not_none
 
 
 # {{{ block boundary finder
 
-def get_block_boundaries(schedule):
-    """
+def get_block_boundaries(schedule: Sequence[ScheduleItem]) -> Mapping[int, int]:
+    r"""
     Return a dictionary mapping indices of
-    :class:`loopy.schedule.BlockBeginItem`s to
-    :class:`loopy.schedule.BlockEndItem`s and vice versa.
+    :class:`loopy.schedule.BeginBlockItem`\ s to
+    :class:`loopy.schedule.EndBlockItem`\ s and vice versa.
     """
     from loopy.schedule import BeginBlockItem, EndBlockItem
     block_bounds = {}
@@ -131,8 +170,12 @@ def supporting_temporary_names(
 
 @dataclass(frozen=True)
 class KernelArgInfo:
-    passed_arg_names: Sequence[str]
+    """
+    .. autoattribute:: passed_arg_names
+    .. autoattribute:: written_names
+    """
 
+    passed_arg_names: Sequence[str]
     written_names: FrozenSet[str]
 
     @property
@@ -142,6 +185,12 @@ def passed_names(self) -> Sequence[str]:
 
 @dataclass(frozen=True)
 class SubKernelArgInfo(KernelArgInfo):
+    """Inherits from :class:`KernelArgInfo`.
+
+    .. autoattribute:: passed_inames
+    .. autoattribute:: passed_temporaries
+    """
+
     passed_inames: Sequence[str]
     passed_temporaries: Sequence[str]
 
@@ -301,7 +350,7 @@ def get_subkernel_arg_info(
 
 # {{{ get_return_from_kernel_mapping
 
-def get_return_from_kernel_mapping(kernel):
+def get_return_from_kernel_mapping(kernel: LoopKernel) -> Mapping[int, int | None]:
     """
     Returns a mapping from schedule index of every schedule item (S) in
     *kernel* to the schedule index of :class:`loopy.schedule.ReturnFromKernel`
@@ -318,8 +367,8 @@ def get_return_from_kernel_mapping(kernel):
     )
     assert isinstance(kernel, LoopKernel)
     assert isinstance(kernel.linearization, list)
-    return_from_kernel_idxs = {}
-    current_return_from_kernel = None
+    return_from_kernel_idxs: dict[int, int | None] = {}
+    current_return_from_kernel: int | None = None
     for sched_idx, sched_item in list(enumerate(kernel.linearization))[::-1]:
         if isinstance(sched_item, CallKernel):
             return_from_kernel_idxs[sched_idx] = current_return_from_kernel
@@ -621,4 +670,445 @@ def do_accesses_result_in_races(self, insn1, insn1_dir, insn2, insn2_dir,
 
 # }}}
 
-# vim: foldmethod=marker
+
+InameStrSet: TypeAlias = FrozenSet[InameStr]
+LoopNestTree: TypeAlias = Tree[InameStrSet]
+LoopTree: TypeAlias = Tree[InameStr]
+
+
+class V2SchedulerNotImplementedError(LoopyError):
+    pass
+
+
+def separate_loop_nest(
+            tree: LoopNestTree,
+            loop_nests: Collection[InameStrSet],
+            inames_to_separate: InameStrSet
+        ) -> tuple[LoopNestTree, InameStrSet, InameStrSet | None]:
+    """
+    Returns a copy of *tree* that has *inames_to_separate* occur in
+    nodes that are not shared with other inames.
+    Returns a version of the loop nest tree *tree* so that every node in the tree is
+    either a subset of *inames_to_separate* or has an empty intersection with
+    *inames_to_separate*.
+
+    This routine modifies at most one node of the tree.
+    All its ancestors must satisfy ``ancestor <= inames_to_separate``.
+    For the first node not satisfying this relationship,
+    if ``node & inames_to_separate`` is empty, no modification is made.
+    Otherwise, if ``node & inames_to_separate < node``, that node is split
+    so as to separate *inames_to_separate* in their own node.
+
+    :arg loop_nests: A collection of nodes in *tree* that cover
+        *inames_to_separate*.
+
+    :returns: a :class:`tuple` ``(new_tree, outer_loop_nest, inner_loop_nest)``,
+        where *outer_loop_nest* and *inner_loop_nest* identify the new outer and
+        inner loop nests so that *inames_to_separate* is a valid nesting.
+
+    .. note::
+
+        We could compute *loop_nests* within this routine's implementation, but
+        computing would be expensive and hence we ask the caller for this info.
+
+    Example::
+       *tree*: frozenset()
+               └── frozenset({'j', 'i'})
+                   └── frozenset({'k', 'l'})
+
+       *inames_to_separate*: frozenset({'k', 'i', 'j'})
+       *loop_nests*: {frozenset({'j', 'i'}), frozenset({'k', 'l'})}
+
+       Returns:
+
+       *new_tree*: frozenset()
+                   └── frozenset({'j', 'i'})
+                       └── frozenset({'k'})
+                           └── frozenset({'l'})
+
+       *outer_loop_nest*: frozenset({'k'})
+       *inner_loop_nest*: frozenset({'l'})
+    """
+    assert all(isinstance(loop_nest, frozenset) for loop_nest in loop_nests)
+
+    # annotation to avoid https://github.com/python/mypy/issues/17693
+    emptyset: InameStrSet = frozenset()
+
+    assert inames_to_separate <= reduce(frozenset.union, loop_nests, emptyset)
+
+    # {{{ sanity check to ensure the loop nest *inames_to_separate* is possible
+
+    loop_nests = sorted(loop_nests, key=lambda nest: tree.depth(nest))
+
+    for outer, inner in zip(loop_nests[:-1], loop_nests[1:]):
+        if outer != tree.parent(inner):
+            raise LoopyError(f"Cannot schedule loop nest {inames_to_separate} "
+                             f" in the nesting tree:\n{tree}")
+
+    assert tree.depth(loop_nests[0]) == 0
+
+    # }}}
+
+    innermost_node = loop_nests[-1]
+    # separate variable to avoid https://github.com/python/mypy/issues/17694
+    outerer_loops = reduce(frozenset.union, loop_nests[:-1], emptyset)
+    new_outer_node = inames_to_separate - outerer_loops
+    new_inner_node = innermost_node - inames_to_separate
+
+    if new_outer_node == innermost_node:
+        # such a loop nesting already exists => do nothing
+        return tree, new_outer_node, None
+
+    # add the outer loop to our loop nest tree
+    tree = tree.add_node(new_outer_node,
+                         parent=not_none(tree.parent(innermost_node)))
+
+    # rename the old loop to the inner loop
+    tree = tree.replace_node(innermost_node,
+                            new_node=new_inner_node)
+
+    # set the parent of inner loop to be the outer loop
+    tree = tree.move_node(new_inner_node, new_parent=new_outer_node)
+
+    return tree, new_outer_node, new_inner_node
+
+
+def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest):
+    """
+    Returns a copy of *tree* that nests *inner_loop_nest* inside *outer_loop_nest*.
+    """
+    # add the inner loop nest as a child of the outer loop nest
+    return tree.add_node(inner_loop_nest, parent=outer_loop_nest)
+
+
+def _order_loop_nests(
+            loop_nest_tree: LoopNestTree,
+            strict_priorities: FrozenSet[Tuple[InameStr, ...]],
+            relaxed_priorities: FrozenSet[Tuple[InameStr, ...]],
+            iname_to_tree_node_id: Mapping[InameStr, InameStrSet],
+          ) -> LoopTree:
+    """
+    Returns a loop nest where all nodes in the tree are instances of
+    :class:`str` denoting inames. Unlike *loop_nest_tree* which corresponds to
+    multiple loop nestings, this routine returns a unique loop nest that is
+    obtained after constraining *loop_nest_tree* with the constraints enforced
+    by *strict_priorities* and *relaxed_priorities*.
+
+    :arg strict_priorities: Expresses strict nesting constraints using the same
+        data structure as :attr:`loopy.LoopKernel.loop_priority`.
+        These priorities are imposed strictly i.e. if these conditions cannot be met a
+        :class:`loopy.diagnostic.LoopyError` is raised.
+
+    :arg relaxed_priorities: Expresses nesting constraints using the same
+        data structure as :attr:`loopy.LoopKernel.loop_priority`.
+        These nesting constraints are treated as optional.
+
+    :arg iname_to_tree_node_id: A mapping from iname to the loop nesting it's a
+        part of.
+    """
+    from warnings import warn
+
+    from pytools.graph import compute_topological_order as toposort
+
+    loop_nests = set(iname_to_tree_node_id.values())
+
+    # nesting_constraints: A mapping from the loop nest level to the nesting
+    # constraints applicable to it.
+    # Each nesting constraint is represented as a DAG. In the DAG, if there
+    # exists an edge from iname 'i' -> iname 'j' => 'j' should be nested
+    # inside 'i'.
+    iname_to_nesting_constraints: dict[InameStrSet, dict[InameStr, InameStrSet]] = {
+        loop_nest: {iname: frozenset() for iname in loop_nest}
+        for loop_nest in loop_nests}
+
+    # The plan here is to populate DAGs in *nesting_constraints* and then perform a
+    # toposort for each loop nest.
+
+    def _update_nesting_constraints(
+                priorities: FrozenSet[Tuple[InameStr, ...]],
+                cannot_satisfy_callback: Callable[[str], None]
+            ) -> None:
+        """
+        Records *priorities* in *nesting_constraints* and calls
+        *cannot_satisfy_callback* with an appropriate error message if the
+        priorities cannot be met.
+        """
+        for priority in priorities:
+            for outer_iname, inner_iname in zip(priority[:-1], priority[1:]):
+                if inner_iname not in iname_to_tree_node_id:
+                    cannot_satisfy_callback(f"Cannot enforce the constraint:"
+                                            f" {inner_iname} to be nested within"
+                                            f" {outer_iname}, as {inner_iname}"
+                                            f" is either a parallel loop or"
+                                            f" not an iname.")
+                    continue
+
+                if outer_iname not in iname_to_tree_node_id:
+                    cannot_satisfy_callback(f"Cannot enforce the constraint:"
+                                            f" {inner_iname} to be nested within"
+                                            f" {outer_iname}, as {outer_iname}"
+                                            f" is either a parallel loop or"
+                                            f" not an iname.")
+                    continue
+
+                inner_iname_nest = iname_to_tree_node_id[inner_iname]
+                outer_iname_nest = iname_to_tree_node_id[outer_iname]
+
+                if inner_iname_nest == outer_iname_nest:
+                    iname_to_nesting_constraints[
+                        inner_iname_nest][outer_iname] |= {inner_iname}
+                else:
+                    ancestors_of_inner_iname = (loop_nest_tree
+                                                .ancestors(inner_iname_nest))
+                    ancestors_of_outer_iname = (loop_nest_tree
+                                                .ancestors(outer_iname_nest))
+                    if outer_iname in ancestors_of_inner_iname:
+                        # nesting constraint already satisfied => do nothing
+                        pass
+                    elif inner_iname in ancestors_of_outer_iname:
+                        cannot_satisfy_callback("Cannot satisfy constraint that"
+                                                f" iname '{inner_iname}' must be"
+                                                f" nested within '{outer_iname}''.")
+                    else:
+                        # inner iname and outer iname are indirect family members
+                        # => must be realized via dependencies in the linearization
+                        # phase, not implemented in v2-scheduler yet.
+                        raise V2SchedulerNotImplementedError("cannot"
+                                " schedule kernels with priority dependencies"
+                                " between sibling loop nests")
+
+    def _raise_loopy_err(x):
+        raise LoopyError(x)
+
+    # record strict priorities
+    _update_nesting_constraints(strict_priorities, _raise_loopy_err)
+    # record relaxed priorities
+    _update_nesting_constraints(relaxed_priorities, warn)
+
+    # ordered_loop_nests: A mapping from the unordered loop nests to their
+    # ordered counterparts. For example, if we had only one loop nest
+    # `frozenset({"i", "j", "k"})`, and the priorities added the
+    # constraint that "i" must be nested within "k", then `ordered_loop_nests`
+    # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop
+    # nests would now have an order.
+    ordered_loop_nests = {
+        unordered_nest: toposort(flow, key=lambda x: x)
+        for unordered_nest, flow in iname_to_nesting_constraints.items()}
+
+    # {{{ combine 'loop_nest_tree' along with 'ordered_loop_nest_tree'
+
+    assert loop_nest_tree.root == frozenset()
+
+    new_tree = Tree.from_root("")
+
+    old_to_new_parent = {}
+
+    old_to_new_parent[loop_nest_tree.root] = ""
+
+    # traversing 'loop_nest_tree' in a BFS fashion to create 'new_tree'
+    queue = list(loop_nest_tree.children(loop_nest_tree.root))
+
+    while queue:
+        current_nest = queue.pop(0)
+
+        ordered_nest = ordered_loop_nests[current_nest]
+        new_tree = new_tree.add_node(ordered_nest[0],
+                                     parent=old_to_new_parent[not_none(loop_nest_tree
+                                                              .parent(current_nest))])
+        for new_parent, new_child in zip(ordered_nest[:-1], ordered_nest[1:]):
+            new_tree = new_tree.add_node(node=new_child, parent=new_parent)
+
+        old_to_new_parent[current_nest] = ordered_nest[-1]
+
+        queue.extend(loop_nest_tree.children(current_nest))
+
+    # }}}
+
+    return new_tree
+
+
+@memoize_on_first_arg
+def _get_parallel_inames(kernel: LoopKernel) -> AbstractSet[str]:
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+
+    concurrent_inames = {iname for iname in kernel.all_inames()
+                         if kernel.iname_tags_of_type(iname, ConcurrentTag)}
+    ilp_inames = {iname for iname in kernel.all_inames()
+                  if kernel.iname_tags_of_type(iname, IlpBaseTag)}
+    vec_inames = {iname for iname in kernel.all_inames()
+                  if kernel.iname_tags_of_type(iname, VectorizeTag)}
+    return (concurrent_inames - ilp_inames - vec_inames)
+
+
+def get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree:
+    """
+    Returns a tree representing the *kernel*'s loop nests.
+
+    Each node of the returned tree has a :class:`frozenset` of inames.
+    All the inames in the identifier of a parent node of a loop nest in the
+    tree must be nested outside all the inames in the identifier of the loop nest.
+
+    .. note::
+
+        This routine only takes into account the nesting dependency
+        constraints of :attr:`loopy.InstructionBase.within_inames` of all the
+        *kernel*'s instructions and the iname tags. This routine does *NOT*
+        include the nesting constraints imposed by the dependencies between the
+        instructions and the dependencies imposed by the kernel's domain tree.
+    """
+    from loopy.kernel.data import IlpBaseTag
+
+    # figuring the possible loop nestings minus the concurrent_inames as they
+    # are never realized as actual loops
+    insn_iname_sets = {
+        insn.within_inames - _get_parallel_inames(kernel)
+        for insn in kernel.instructions}
+
+    root: InameStrSet = frozenset()
+    tree = Tree.from_root(root)
+
+    # mapping from iname to the innermost loop nest they are part of in *tree*.
+    iname_to_tree_node_id: Dict[InameStr, InameStrSet] = {}
+
+    # if any instruction had an empty iname set, it has already been accounted
+    # for as the root.
+    insn_iname_sets = insn_iname_sets - {root}
+
+    for iname_set in insn_iname_sets:
+        not_seen_inames = frozenset(iname for iname in iname_set
+                                    if iname not in iname_to_tree_node_id)
+        seen_inames = iname_set - not_seen_inames
+
+        all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames}
+
+        tree, outer_loop, inner_loop = separate_loop_nest(tree,
+                                                           (all_nests
+                                                            | {frozenset()}),
+                                                           seen_inames)
+        if not_seen_inames:
+            # make 'not_seen_inames' nest inside the seen ones.
+            # example: if there is already a loop nesting "i,j,k"
+            # and the current iname chain is "i,j,l", the only way this is
+            # possible is if "l" is nested within "i,j"-loops.
+            tree = _add_inner_loops(tree, outer_loop, not_seen_inames)
+
+        # {{{ update iname to node id
+
+        for iname in outer_loop:
+            iname_to_tree_node_id[iname] = outer_loop
+
+        if inner_loop is not None:
+            for iname in inner_loop:
+                iname_to_tree_node_id[iname] = inner_loop
+
+        for iname in not_seen_inames:
+            iname_to_tree_node_id[iname] = not_seen_inames
+
+        # }}}
+
+    # {{{ make ILP tagged inames innermost
+
+    ilp_inames = {iname for iname in kernel.all_inames()
+                  if kernel.iname_tags_of_type(iname, IlpBaseTag)}
+
+    for iname_set in insn_iname_sets:
+        for ilp_iname in (ilp_inames & insn_iname_sets):
+            # pull out other loops so that ilp_iname is the innermost
+            all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames}
+            tree, outer_loop, inner_loop = separate_loop_nest(tree,
+                                                               (all_nests
+                                                                | {frozenset()}),
+                                                               (iname_set
+                                                                - {ilp_iname}))
+
+            for iname in outer_loop:
+                iname_to_tree_node_id[iname] = outer_loop
+
+            if inner_loop is not None:
+                for iname in inner_loop:
+                    iname_to_tree_node_id[iname] = inner_loop
+
+    # }}}
+
+    return tree
+
+
+def _get_iname_to_tree_node_id_from_partial_loop_nest_tree(
+            tree: LoopNestTree,
+        ) -> Mapping[str, frozenset[str]]:
+    """
+    Returns the mapping from the iname to the *tree*'s node that it was a part
+    of.
+
+    :arg tree: A partial loop nest tree.
+    """
+    iname_to_tree_node_id = {}
+    for node in tree.nodes():
+        assert isinstance(node, frozenset)
+        for iname in node:
+            iname_to_tree_node_id[iname] = node
+
+    return Map(iname_to_tree_node_id)
+
+
+def get_loop_tree(kernel: LoopKernel) -> LoopTree:
+    """
+    Returns a tree representing the loop nesting for *kernel*. A parent node in
+    the tree is always nested outside all its children.
+
+    .. note::
+
+        Multiple loop nestings might exist for *kernel*, but this routine returns
+        one valid loop nesting.
+    """
+    from islpy import dim_type
+
+    tree = get_partial_loop_nest_tree(kernel)
+    iname_to_tree_node_id = (
+        _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree))
+
+    strict_loop_priorities: FrozenSet[Tuple[InameStr, ...]] = frozenset()
+
+    # {{{ impose constraints by the domain tree
+
+    # FIXME: These three could be one statement if it weren't for
+    # - https://github.com/python/mypy/issues/17693
+    # - https://github.com/python/mypy/issues/17694
+    emptyset: InameStrSet = frozenset()
+    loop_inames = reduce(frozenset.union,
+                          (insn.within_inames
+                           for insn in kernel.instructions),
+                          emptyset)
+    loop_inames = loop_inames - _get_parallel_inames(kernel)
+
+    for dom in kernel.domains:
+        for outer_iname in set(dom.get_var_names(dim_type.param)):
+            if outer_iname not in loop_inames:
+                continue
+
+            for inner_iname in dom.get_var_names(dim_type.set):
+                if inner_iname not in loop_inames:
+                    continue
+
+                # either outer_iname and inner_iname should belong to the same
+                # loop nest level or outer should be strictly outside inner
+                # iname
+                inner_iname_nest = iname_to_tree_node_id[inner_iname]
+                outer_iname_nest = iname_to_tree_node_id[outer_iname]
+
+                if inner_iname_nest == outer_iname_nest:
+                    strict_loop_priorities |= {(outer_iname, inner_iname)}
+                else:
+                    ancestors_of_inner_iname = tree.ancestors(inner_iname_nest)
+                    if outer_iname_nest not in ancestors_of_inner_iname:
+                        raise LoopyError(f"Loop '{outer_iname}' cannot be nested"
+                                         f" outside '{inner_iname}'.")
+
+    # }}}
+
+    return _order_loop_nests(tree,
+                             strict_loop_priorities,
+                             kernel.loop_priority,
+                             iname_to_tree_node_id)
+
+# vim: fdm=marker
diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py
new file mode 100644
index 000000000..253ff5f84
--- /dev/null
+++ b/loopy/schedule/tree.py
@@ -0,0 +1,285 @@
+# mypy: disallow-untyped-defs
+
+from __future__ import annotations
+
+
+__copyright__ = """
+Copyright (C) 2022 Kaushik Kulkarni
+Copyright (C) 2022-24 University of Illinois Board of Trustees
+"""
+
+
+__doc__ = """
+.. autoclass:: NodeT
+.. autoclass:: Tree
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Generic, Hashable, Iterator, List, Optional, Sequence, Tuple, TypeVar
+
+from immutables import Map
+
+from pytools import memoize_method
+
+
+# {{{ tree data structure
+
+NodeT = TypeVar("NodeT", bound=Hashable)
+
+
+@dataclass(frozen=True)
+class Tree(Generic[NodeT]):
+    """
+    An immutable tree containing nodes of type :class:`NodeT`.
+
+    .. automethod:: ancestors
+    .. automethod:: parent
+    .. automethod:: children
+    .. automethod:: add_node
+    .. automethod:: depth
+    .. automethod:: replace_node
+    .. automethod:: move_node
+
+    .. automethod:: __contains__
+
+    .. note::
+
+       Almost all the operations are implemented recursively. NOT suitable for
+       deep trees. At the very least if the Python implementation is CPython
+       this allocates a new stack frame for each iteration of the operation.
+    """
+
+    _parent_to_children: Map[NodeT, Tuple[NodeT, ...]]
+    _child_to_parent: Map[NodeT, Optional[NodeT]]
+
+    @staticmethod
+    def from_root(root: NodeT) -> "Tree[NodeT]":
+        return Tree(Map({root: ()}),
+                    Map({root: None}))
+
+    @cached_property
+    def root(self) -> NodeT:
+        guess = set(self._child_to_parent).pop()
+        parent_of_guess = self.parent(guess)
+        while parent_of_guess is not None:
+            guess = parent_of_guess
+            parent_of_guess = self.parent(guess)
+
+        return guess
+
+    @memoize_method
+    def ancestors(self, node: NodeT) -> Tuple[NodeT, ...]:
+        """
+        Returns a :class:`tuple` of nodes that are ancestors of *node*.
+        """
+        assert node in self
+
+        if self.is_root(node):
+            # => root
+            return ()
+
+        parent = self._child_to_parent[node]
+        assert parent is not None
+
+        return (parent,) + self.ancestors(parent)
+
+    def parent(self, node: NodeT) -> Optional[NodeT]:
+        """
+        Returns the parent of *node*.
+        """
+        assert node in self
+
+        return self._child_to_parent[node]
+
+    def children(self, node: NodeT) -> Tuple[NodeT, ...]:
+        """
+        Returns the children of *node*.
+        """
+        assert node in self
+
+        return self._parent_to_children[node]
+
+    @memoize_method
+    def depth(self, node: NodeT) -> int:
+        """
+        Returns the depth of *node*, with the root having depth 0.
+        """
+        assert node in self
+
+        if self.is_root(node):
+            # => None
+            return 0
+
+        parent_of_node = self.parent(node)
+        assert parent_of_node is not None
+
+        return 1 + self.depth(parent_of_node)
+
+    def is_root(self, node: NodeT) -> bool:
+        assert node in self
+
+        return self.parent(node) is None
+
+    def is_leaf(self, node: NodeT) -> bool:
+        assert node in self
+
+        return len(self.children(node)) == 0
+
+    def __contains__(self, node: NodeT) -> bool:
+        """Return *True* if *node* is a node in the tree."""
+        return node in self._child_to_parent
+
+    def add_node(self, node: NodeT, parent: NodeT) -> "Tree[NodeT]":
+        """
+        Returns a :class:`Tree` with added node *node* having a parent
+        *parent*.
+        """
+        if node in self:
+            raise ValueError(f"'{node}' already present in tree.")
+
+        siblings = self._parent_to_children[parent]
+
+        return Tree((self._parent_to_children
+                     .set(parent, siblings + (node,))
+                     .set(node, ())),
+                    self._child_to_parent.set(node, parent))
+
+    def replace_node(self, node: NodeT, new_node: NodeT) -> "Tree[NodeT]":
+        """
+        Returns a copy of *self* with *node* replaced with *new_node*.
+        """
+        if node not in self:
+            raise ValueError(f"'{node}' not present in tree.")
+
+        if new_node in self:
+            raise ValueError(f"cannot replace with '{new_node}', as its already a part"
+                             " of the tree.")
+
+        parent = self.parent(node)
+        children = self.children(node)
+
+        # {{{ update child to parent
+
+        child_to_parent_mut = self._child_to_parent.mutate()
+        del child_to_parent_mut[node]
+        child_to_parent_mut[new_node] = parent
+
+        for child in children:
+            child_to_parent_mut[child] = new_node
+
+        # }}}
+
+        # {{{ update parent_to_children
+
+        parent_to_children_mut = self._parent_to_children.mutate()
+        del parent_to_children_mut[node]
+        parent_to_children_mut[new_node] = children
+
+        if parent is not None:
+            # update the child's name in the parent's children
+            parent_to_children_mut[parent] = (
+                            *(frozenset(self.children(parent)) - frozenset([node])),
+                            new_node,)
+
+        # }}}
+
+        return Tree(parent_to_children_mut.finish(),
+                    child_to_parent_mut.finish())
+
+    def move_node(self, node: NodeT, new_parent: Optional[NodeT]) -> "Tree[NodeT]":
+        """
+        Returns a copy of *self* with node *node* as a child of *new_parent*.
+        """
+        if node not in self:
+            raise ValueError(f"'{node}' not a part of the tree => cannot move.")
+
+        if self.is_root(node):
+            if new_parent is None:
+                return self
+            else:
+                raise ValueError("Moving root not allowed.")
+
+        if new_parent is None:
+            raise ValueError("Making multiple roots not allowed")
+
+        if new_parent not in self:
+            raise ValueError(f"Cannot move to '{new_parent}' as it's not in tree.")
+
+        parent = self.parent(node)
+        assert parent is not None  # parent=root handled as a special case
+        siblings = self.children(parent)
+        parents_new_children = tuple(frozenset(siblings) - frozenset([node]))
+        new_parents_children = self.children(new_parent) + (node,)
+
+        new_child_to_parent = self._child_to_parent.set(node, new_parent)
+        new_parent_to_children = (self._parent_to_children
+                                  .set(parent, parents_new_children)
+                                  .set(new_parent, new_parents_children))
+
+        return Tree(new_parent_to_children,
+                    new_child_to_parent)
+
+    def __str__(self) -> str:
+        """
+        Stringifies the tree by using the box-drawing unicode characters.
+
+        .. doctest::
+
+            >>> from loopy.schedule.tree import Tree
+            >>> tree = (Tree.from_root("Root")
+            ...         .add_node("A", "Root")
+            ...         .add_node("B", "Root")
+            ...         .add_node("D", "B")
+            ...         .add_node("E", "B")
+            ...         .add_node("C", "A"))
+
+            >>> print(tree)
+            Root
+            ├── A
+            │   └── C
+            └── B
+                ├── D
+                └── E
+        """
+        def rec(node: NodeT) -> List[str]:
+            children_result = [rec(c) for c in self.children(node)]
+
+            def post_process_non_last_child(children: Sequence[str]) -> list[str]:
+                return ["├── " + children[0]] + [f"│   {c}" for c in children[1:]]
+
+            def post_process_last_child(children: Sequence[str]) -> list[str]:
+                return ["└── " + children[0]] + [f"    {c}" for c in children[1:]]
+
+            children_result = ([post_process_non_last_child(c)
+                                for c in children_result[:-1]]
+                            + [post_process_last_child(c)
+                                for c in children_result[-1:]])
+            return [str(node)] + sum(children_result, start=[])
+
+        return "\n".join(rec(self.root))
+
+    def nodes(self) -> Iterator[NodeT]:
+        return iter(self._child_to_parent.keys())
+
+# }}}
diff --git a/loopy/statistics.py b/loopy/statistics.py
index c9cf9d938..99b163f80 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -36,7 +36,7 @@
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from loopy.kernel.data import AddressSpace, MultiAssignmentBase, TemporaryVariable
 from loopy.kernel.function_interface import CallableKernel
-from loopy.symbolic import CoefficientCollector
+from loopy.symbolic import CoefficientCollector, flatten
 from loopy.translation_unit import TranslationUnit
 
 
@@ -422,16 +422,16 @@ def to_bytes(self):
             bytes_map = get_mem_access_map(knl).to_bytes()
             params = {"n": 512, "m": 256, "l": 128}
 
-            s1_g_ld_byt = bytes_map.filter_by(
+            s1_g_ld_bytes = bytes_map.filter_by(
                                 mtype=["global"], lid_strides={0: 1},
                                 direction=["load"]).eval_and_sum(params)
-            s2_g_ld_byt = bytes_map.filter_by(
+            s2_g_ld_bytes = bytes_map.filter_by(
                                 mtype=["global"], lid_strides={0: 2},
                                 direction=["load"]).eval_and_sum(params)
-            s1_g_st_byt = bytes_map.filter_by(
+            s1_g_st_bytes = bytes_map.filter_by(
                                 mtype=["global"], lid_strides={0: 1},
                                 direction=["store"]).eval_and_sum(params)
-            s2_g_st_byt = bytes_map.filter_by(
+            s2_g_st_bytes = bytes_map.filter_by(
                                 mtype=["global"], lid_strides={0: 2},
                                 direction=["store"]).eval_and_sum(params)
 
@@ -629,7 +629,7 @@ class Op(ImmutableRecord):
        work-group executes on a single compute unit with all work-items within
        the work-group sharing local memory. A sub-group is an
        implementation-dependent grouping of work-items within a work-group,
-       analagous to an NVIDIA CUDA warp.
+       analogous to an NVIDIA CUDA warp.
 
     .. attribute:: kernel_name
 
@@ -709,7 +709,7 @@ class MemAccess(ImmutableRecord):
     .. attribute:: variable_tags
 
        A :class:`frozenset` of subclasses of :class:`~pytools.tag.Tag`
-       that reflects :attr:`~loopy.symbolic.TaggedVariable.tags` of
+       that reflects :attr:`~loopy.TaggedVariable.tags` of
        an accessed variable.
 
     .. attribute:: count_granularity
@@ -723,7 +723,7 @@ class MemAccess(ImmutableRecord):
        work-group executes on a single compute unit with all work-items within
        the work-group sharing local memory. A sub-group is an
        implementation-dependent grouping of work-items within a work-group,
-       analagous to an NVIDIA CUDA warp.
+       analogous to an NVIDIA CUDA warp.
 
     .. attribute:: kernel_name
 
@@ -1109,7 +1109,7 @@ def _get_lid_and_gid_strides(knl, array, index):
 
     # create lid_strides and gid_strides dicts
 
-    # strides are coefficents in flattened index, i.e., we want
+    # strides are coefficients in flattened index, i.e., we want
     # lid_strides = {0:l0, 1:l1, 2:l2, ...} and
     # gid_strides = {0:g0, 1:g1, 2:g2, ...},
     # where l0, l1, l2, g0, g1, and g2 come from flattened index
@@ -1167,7 +1167,7 @@ def get_iname_strides(tag_to_iname_dict):
 
                 total_iname_stride += axis_tag_stride*coeff
 
-            tag_to_stride_dict[tag] = total_iname_stride
+            tag_to_stride_dict[tag] = flatten(total_iname_stride)
 
         return tag_to_stride_dict
 
@@ -1723,7 +1723,7 @@ def get_op_map(program, count_redundant_work=False,
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
         ``"guess"``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
-        a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used,
+        a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used,
         e.g., when counting a :class:`MemAccess` whose count_granularity
         specifies that it should only be counted once per sub-group. If set to
         *None* an attempt to find the sub-group size using the device will be
@@ -1921,7 +1921,7 @@ def get_mem_access_map(program, count_redundant_work=False,
     :arg subgroup_size: An :class:`int`, :class:`str` ``"guess"``, or
         *None* that specifies the sub-group size. An OpenCL sub-group is an
         implementation-dependent grouping of work-items within a work-group,
-        analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when
+        analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when
         counting a :class:`MemAccess` whose count_granularity specifies that it
         should only be counted once per sub-group. If set to *None* an attempt
         to find the sub-group size using the device will be made, if this fails
@@ -2085,7 +2085,7 @@ def get_synchronization_map(program, subgroup_size=None, entrypoint=None):
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
         ``"guess"``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
-        a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used,
+        a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used,
         e.g., when counting a :class:`MemAccess` whose count_granularity
         specifies that it should only be counted once per sub-group. If set to
         *None* an attempt to find the sub-group size using the device will be
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index d56b54e79..f0e0333ec 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1,5 +1,7 @@
 """Pymbolic mappers for loopy."""
 
+from __future__ import annotations
+
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -27,12 +29,21 @@
 import re
 from functools import cached_property, reduce
 from sys import intern
-from typing import AbstractSet, ClassVar, Mapping, Sequence, Tuple
+from typing import (
+    TYPE_CHECKING,
+    AbstractSet,
+    Any,
+    ClassVar,
+    Mapping,
+    Sequence,
+    Tuple,
+)
 
 import immutables
 import numpy as np
 
 import islpy as isl
+import pymbolic.primitives  # FIXME: also import by full name to allow sphinx to resolve
 import pymbolic.primitives as p
 import pytools.lex
 from islpy import dim_type
@@ -51,6 +62,7 @@
 )
 from pymbolic.mapper.dependency import CachedDependencyMapper as DependencyMapperBase
 from pymbolic.mapper.evaluator import CachedEvaluationMapper as EvaluationMapperBase
+from pymbolic.mapper.flattener import FlattenMapper as FlattenMapperBase
 from pymbolic.mapper.stringifier import StringifyMapper as StringifyMapperBase
 from pymbolic.mapper.substitutor import (
     CachedSubstitutionMapper as SubstitutionMapperBase,
@@ -58,19 +70,22 @@
 from pymbolic.mapper.unifier import UnidirectionalUnifier as UnidirectionalUnifierBase
 from pymbolic.parser import Parser as ParserBase
 from pytools import ImmutableRecord, memoize, memoize_method, memoize_on_first_arg
-from pytools.tag import Taggable
+from pytools.tag import Tag, Taggable
 
 from loopy.diagnostic import (
     ExpressionToAffineConversionError,
     LoopyError,
     UnableToDetermineAccessRangeError,
 )
+from loopy.types import ToLoopyTypeConvertible
 from loopy.typing import ExpressionT
 
 
-__doc__ = """
-.. currentmodule:: loopy.symbolic
+if TYPE_CHECKING:
+    from loopy.library.reduction import ReductionOperation
 
+
+__doc__ = """
 Loopy-specific expression types
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -82,6 +97,8 @@
 
 .. autoclass:: TypedCSE
 
+.. currentmodule:: loopy
+
 .. autoclass:: TypeCast
 
 .. autoclass:: TaggedVariable
@@ -90,6 +107,8 @@
 
 .. autoclass:: LinearSubscript
 
+.. currentmodule:: loopy.symbolic
+
 .. autoclass:: RuleArgument
 
 .. autoclass:: ExpansionState
@@ -185,6 +204,14 @@ def map_resolved_function(self, expr, *args, **kwargs):
     map_fortran_division = IdentityMapperBase.map_quotient
 
 
+class FlattenMapper(FlattenMapperBase, IdentityMapperMixin):
+    pass  # no overrides: behavior comes entirely from the two base classes
+
+
+def flatten(expr):  # convenience wrapper: apply FlattenMapper to *expr*
+    return FlattenMapper()(expr)  # builds a fresh FlattenMapper on each call
+
+
 class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
     pass
 
@@ -640,8 +667,9 @@ class TypeCast(LoopyExpressionBase):
 
         The expression to be cast.
     """
+    child: ExpressionT
 
-    def __init__(self, type, child):
+    def __init__(self, type: ToLoopyTypeConvertible, child: ExpressionT):
         super().__init__()
 
         from loopy.types import NumpyType, to_loopy_type
@@ -678,13 +706,7 @@ class TaggedVariable(LoopyExpressionBase, p.Variable, Taggable):
     may then be used to address these uses--such as by prefetching only
     accesses tagged a certain way.
 
-    .. attribute:: tags
-
-        A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to
-        provide metadata on this object. Legacy string tags are converted to
-        :class:`~loopy.LegacyStringInstructionTag` or, if they used to carry
-        a functional meaning, the tag carrying that same fucntional meaning
-        (e.g. :class:`~loopy.UseStreamingStoreTag`).
+    .. autoattribute:: tags
 
     Inherits from :class:`pymbolic.primitives.Variable`
     and :class:`pytools.tag.Taggable`.
@@ -692,6 +714,14 @@ class TaggedVariable(LoopyExpressionBase, p.Variable, Taggable):
 
     init_arg_names = ("name", "tags")
 
+    tags: frozenset[Tag]
+    """A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to
+    provide metadata on this object. Legacy string tags are converted to
+    :class:`~loopy.LegacyStringInstructionTag` or, if they used to carry
+    a functional meaning, the tag carrying that same functional meaning
+    (e.g. :class:`~loopy.UseStreamingStoreTag`).
+    """
+
     def __init__(self, name, tags):
         p.Variable.__init__(self, name)
         if isinstance(tags, str):
@@ -718,31 +748,42 @@ class Reduction(LoopyExpressionBase):
     """
     Represents a reduction operation on :attr:`expr` across :attr:`inames`.
 
-    .. attribute:: operation
-        an instance of :class:`loopy.library.reduction.ReductionOperation`
-
-    .. attribute:: inames
+    .. autoattribute:: operation
+    .. autoattribute:: inames
+    .. autoattribute:: expr
+    .. autoattribute:: allow_simultaneous
+    """
 
-        a list of inames across which reduction on :attr:`expr` is being
-        carried out.
+    init_arg_names = ("operation", "inames", "expr", "allow_simultaneous")
 
-    .. attribute:: expr
+    operation: ReductionOperation
 
-        An expression which may have tuple type. If the expression has tuple
-        type, it must be one of the following:
-        * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or
-        * a :class:`loopy.symbolic.Reduction`, or
-        * a function call or substitution rule invocation.
+    inames: Sequence[str]
+    """The inames across which reduction on :attr:`expr` is being
+    carried out.
+    """
 
-    .. attribute:: allow_simultaneous
+    expr: ExpressionT
+    """An expression which may have tuple type. If the expression has tuple
+    type, it must be one of the following:
 
-        A :class:`bool`. If not *True*, an iname is allowed to be used
-        in precisely one reduction, to avoid mis-nesting errors.
+    * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or
+    * a :class:`loopy.symbolic.Reduction`, or
+    * a function call or substitution rule invocation.
     """
 
-    init_arg_names = ("operation", "inames", "expr", "allow_simultaneous")
+    allow_simultaneous: bool
+    """If not *True*, an iname is allowed to be used
+    in precisely one reduction, to avoid misnesting errors.
+    """
 
-    def __init__(self, operation, inames, expr, allow_simultaneous=False):
+    def __init__(self,
+                 operation: ReductionOperation | str,
+                 inames: (tuple[str | pymbolic.primitives.Variable, ...]
+                     | pymbolic.primitives.Variable | str),
+                 expr: ExpressionT,
+                 allow_simultaneous: bool = False
+             ) -> None:
         if isinstance(inames, str):
             inames = tuple(iname.strip() for iname in inames.split(","))
 
@@ -751,7 +792,7 @@ def __init__(self, operation, inames, expr, allow_simultaneous=False):
 
         assert isinstance(inames, tuple)
 
-        def strip_var(iname):
+        def strip_var(iname: Any) -> str:
             if isinstance(iname, p.Variable):
                 iname = iname.name
 
@@ -968,10 +1009,14 @@ class SubArrayRef(LoopyExpressionBase):
 
     .. automethod:: is_equal
     """
+    swept_inames: tuple[p.Variable, ...]
+    subscript: p.Subscript
 
     init_arg_names = ("swept_inames", "subscript")
 
-    def __init__(self, swept_inames, subscript):
+    def __init__(self,
+                  swept_inames: tuple[p.Variable, ...] | p.Variable,
+                  subscript: p.Subscript) -> None:
 
         # {{{ sanity checks
 
@@ -1787,7 +1832,7 @@ def map_subscript(self, expr):
 
 # {{{ (pw)aff to expr conversion
 
-def aff_to_expr(aff):
+def aff_to_expr(aff: isl.Aff) -> ExpressionT:
     from pymbolic import var
 
     denom = aff.get_denominator_val().to_python()
@@ -1805,10 +1850,10 @@ def aff_to_expr(aff):
         if coeff:
             result += coeff*aff_to_expr(aff.get_div(i))
 
-    return result // denom
+    return flatten(result // denom)
 
 
-def pw_aff_to_expr(pw_aff, int_ok=False):
+def pw_aff_to_expr(pw_aff: isl.PwAff, int_ok: bool = False) -> ExpressionT:
     if isinstance(pw_aff, int):
         if not int_ok:
             from warnings import warn
@@ -1830,7 +1875,7 @@ def pw_aff_to_expr(pw_aff, int_ok=False):
     return expr
 
 
-def pw_aff_to_pw_aff_implemented_by_expr(pw_aff):
+def pw_aff_to_pw_aff_implemented_by_expr(pw_aff: isl.PwAff) -> isl.PwAff:
     pieces = pw_aff.get_pieces()
 
     rest = isl.Set.universe(pw_aff.space.params())
@@ -1923,7 +1968,7 @@ def map_call(self, expr):
                 "for as-pwaff evaluation")
 
 
-def aff_from_expr(space, expr, vars_to_zero=None):
+def aff_from_expr(space: isl.Space, expr: ExpressionT, vars_to_zero=None) -> isl.Aff:
     if vars_to_zero is None:
         vars_to_zero = frozenset()
 
@@ -2090,7 +2135,8 @@ def simplify_using_aff(kernel, expr):
     try:
         aff = guarded_aff_from_expr(domain.space, expr)
     except ExpressionToAffineConversionError:
-        return expr
+        # Accomplish at least *some* simplification
+        return flatten(expr)
 
     # FIXME: Deal with assumptions, too.
     aff = aff.gist(domain)
@@ -2150,14 +2196,17 @@ def qpolynomial_to_expr(qpoly):
     assert all(isinstance(num, int) for num in numerators)
     assert isinstance(common_denominator, int)
 
+    # FIXME: Remove the "if" branch (keep the general case) once pymbolic >= 2024.1.
     if common_denominator == 1:
-        return sum(num * monomial
+        res = sum(num * monomial
                    for num, monomial in zip(numerators, monomials))
     else:
-        return FloorDiv(sum(num * monomial
+        res = FloorDiv(sum(num * monomial
                             for num, monomial in zip(numerators, monomials)),
                         common_denominator)
 
+    return flatten(res)
+
 # }}}
 
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index a99b7d065..56e5dd872 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -35,6 +35,7 @@
     Const,
     Declarator,
     Generable,
+    Initializer,
     NestedDeclarator,
     Pointer,
 )
@@ -800,7 +801,7 @@ def get_function_definition(
         kernel = codegen_state.kernel
         assert kernel.linearization is not None
 
-        from cgen import FunctionBody, Initializer, Line
+        from cgen import FunctionBody, Line
 
         result = []
 
@@ -883,6 +884,42 @@ def get_kernel_call(self, codegen_state: CodeGenerationState,
             lsize: Tuple[ExpressionT, ...]) -> Optional[Generable]:
         return None
 
+    def emit_temp_var_decl_for_tv_with_base_storage(self,
+                                                    codegen_state: CodeGenerationState,
+                                                    tv: TemporaryVariable) -> Generable:
+        """
+        Returns the statement for initializing a :class:`loopy.TemporaryVariable`
+        with a user-provided :attr:`loopy.TemporaryVariable.base_storage`.
+        """
+        assert tv.base_storage is not None
+        assert isinstance(tv.address_space, AddressSpace)
+        ecm = codegen_state.expression_to_code_mapper
+        # Two declarators of tv's dtype: unnamed (for the cast) and named (for tv).
+        cast_decl = POD(self, tv.dtype, "")
+        temp_var_decl = POD(self, tv.dtype, tv.name)
+
+        if tv._base_storage_access_may_be_aliasing:
+            ptrtype = _ConstPointer
+        else:
+            # The 'restrict' part of this is a complete lie--of course
+            # all these temporaries are aliased. But we're promising to
+            # not use them to shovel data from one representation to the
+            # other. That counts, right?
+            ptrtype = _ConstRestrictPointer
+        # Apply the pointer type, then the address-space wrapping, to both.
+        cast_decl = self.wrap_decl_for_address_space(
+                ptrtype(cast_decl), tv.address_space)
+        temp_var_decl = self.wrap_decl_for_address_space(
+                ptrtype(temp_var_decl), tv.address_space)
+        # Initializer text: "(<cast type>) (<base_storage> + <offset code>)".
+        cast_tp, cast_d = cast_decl.get_decl_pair()
+        return Initializer(
+            temp_var_decl,
+            "({} {}) ({} + {})".format(
+                " ".join(cast_tp), cast_d, tv.base_storage, ecm(tv.offset)
+            ),
+        )
+
     def get_temporary_decls(self, codegen_state, schedule_index):
         from loopy.kernel.data import AddressSpace
 
@@ -930,33 +967,9 @@ def get_temporary_decls(self, codegen_state, schedule_index):
 
             else:
                 assert tv.initializer is None
-
-                cast_decl = POD(self, tv.dtype, "")
-                temp_var_decl = POD(self, tv.dtype, tv.name)
-
-                if tv._base_storage_access_may_be_aliasing:
-                    ptrtype = _ConstPointer
-                else:
-                    # The 'restrict' part of this is a complete lie--of course
-                    # all these temporaries are aliased. But we're promising to
-                    # not use them to shovel data from one representation to the
-                    # other. That counts, right?
-                    ptrtype = _ConstRestrictPointer
-
-                cast_decl = self.wrap_decl_for_address_space(
-                        ptrtype(cast_decl), tv.address_space)
-                temp_var_decl = self.wrap_decl_for_address_space(
-                        ptrtype(temp_var_decl), tv.address_space)
-
-                cast_tp, cast_d = cast_decl.get_decl_pair()
-                temp_var_decl = Initializer(
-                        temp_var_decl,
-                        "({} {}) ({} + {})".format(
-                            " ".join(cast_tp), cast_d,
-                            tv.base_storage,
-                            ecm(tv.offset)
-                            ))
-
+                temp_var_decl = self.emit_temp_var_decl_for_tv_with_base_storage(
+                    codegen_state, tv
+                )
                 temp_decls_using_base_storage.append(temp_var_decl)
 
         # }}}
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index fc3238e92..9cde501a7 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -94,21 +94,21 @@ def python_dtype_str_inner(self, dtype):
             return f"_lpy_np.dtype(_lpy_np.{name})"
         raise Exception(f"dtype: {dtype} not recognized")
 
-    # {{{ handle non numpy arguements
+    # {{{ handle non numpy arguments
 
     def handle_non_numpy_arg(self, gen, arg):
         pass
 
     # }}}
 
-    # {{{ handle allocation of unspecified arguements
+    # {{{ handle allocation of unspecified arguments
 
     def handle_alloc(
             self, gen: CodeGenerator, arg: ArrayArg,
             strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str],
             skip_arg_checks: bool) -> None:
         """
-        Handle allocation of non-specified arguements for C-execution
+        Handle allocation of non-specified arguments for C-execution
         """
         from pymbolic import var
 
@@ -181,7 +181,7 @@ def target_specific_preamble(self, gen):
 
     def initialize_system_args(self, gen):
         """
-        Initializes possibly empty system arguements
+        Initializes possibly empty system arguments
         """
         pass
 
@@ -238,7 +238,7 @@ class CCompiler:
     The general strategy here is as follows:
 
     1.  A :class:`codepy.Toolchain` is guessed from distutils.
-        The user may override any flags obtained therein by passing in arguements
+        The user may override any flags obtained therein by passing in arguments
         to cc, cflags, etc.
 
-    2.  The kernel source is built into and object first, then made into a shared
+    2.  The kernel source is built into an object first, then made into a shared
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 276407fc1..810ac4379 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -28,7 +28,7 @@
 import islpy as isl
 import pymbolic.primitives as p
 from pymbolic import var
-from pymbolic.mapper import IdentityMapper, RecursiveMapper
+from pymbolic.mapper import IdentityMapper, Mapper
 from pymbolic.mapper.stringifier import (
     PREC_BITWISE_AND,
     PREC_BITWISE_OR,
@@ -124,9 +124,8 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
 
         return s
 
-    def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None):
-        result = RecursiveMapper.rec(self, expr, type_context)
-
+    def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None):  # type: ignore[override]
+        result = Mapper.rec(self, expr, type_context)
         if needed_type is None:
             return result
         else:
@@ -604,7 +603,7 @@ def map_nan(self, expr, type_context):
 
 # {{{ C expression to code mapper
 
-class CExpressionToCodeMapper(RecursiveMapper):
+class CExpressionToCodeMapper(Mapper):
 
     # {{{ helpers
 
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 3155b3bff..afeb5cee2 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -35,7 +35,13 @@
 from loopy.codegen.result import CodeGenerationResult
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
-from loopy.kernel.data import AddressSpace, ArrayArg, ConstantArg, ImageArg
+from loopy.kernel.data import (
+    AddressSpace,
+    ArrayArg,
+    ConstantArg,
+    ImageArg,
+    TemporaryVariable,
+)
 from loopy.kernel.function_interface import ScalarCallable
 from loopy.target.c import CFamilyASTBuilder, CFamilyTarget
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
@@ -462,6 +468,39 @@ def get_image_arg_declarator(
             self, arg: ImageArg, is_written: bool) -> Declarator:
         raise NotImplementedError("not yet: texture arguments in CUDA")
 
+    def emit_temp_var_decl_for_tv_with_base_storage(self,
+                                                    codegen_state: CodeGenerationState,
+                                                    tv: TemporaryVariable) -> Generable:
+        from cgen import Initializer
+
+        from loopy.target.c import POD, _ConstPointer, _ConstRestrictPointer
+        # Declare *tv*, initialized to a cast of (base_storage + offset).
+        assert tv.base_storage is not None
+        ecm = codegen_state.expression_to_code_mapper
+
+        cast_decl = POD(self, tv.dtype, "")
+        temp_var_decl = POD(self, tv.dtype, tv.name)
+
+        if tv._base_storage_access_may_be_aliasing:
+            ptrtype = _ConstPointer
+        else:
+            # The 'restrict' part of this is a complete lie--of course
+            # all these temporaries are aliased. But we're promising to
+            # not use them to shovel data from one representation to the
+            # other. That counts, right?
+            ptrtype = _ConstRestrictPointer
+        # NOTE: no wrap_decl_for_address_space() here, unlike loopy.target.c.
+        cast_decl = ptrtype(cast_decl)
+        temp_var_decl = ptrtype(temp_var_decl)
+        # Initializer text: "(<cast type>) (<base_storage> + <offset code>)".
+        cast_tp, cast_d = cast_decl.get_decl_pair()
+        return Initializer(
+            temp_var_decl,
+            "({} {}) ({} + {})".format(
+                " ".join(cast_tp), cast_d, tv.base_storage, ecm(tv.offset)
+            ),
+        )
+
     # }}}
 
     # {{{ atomics
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index cb081a3e5..d1455530d 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -187,7 +187,7 @@ def generate_integer_arg_finding_from_array_data(
                         if shape_i is not None:
                             equations.append(
                                 _ArgFindingEquation(
-                                    lhs=var(arg.name).attr("shape").index(axis_nr),
+                                    lhs=var(arg.name).attr("shape")[axis_nr],
                                     rhs=shape_i,
                                     order=0,
                                     based_on_names=frozenset({arg.name})))
@@ -198,7 +198,7 @@ def generate_integer_arg_finding_from_array_data(
                         equations.append(
                                 _ArgFindingEquation(
                                     lhs=var("_lpy_even_div")(
-                                        var(arg.name).attr("strides").index(axis_nr),
+                                        var(arg.name).attr("strides")[axis_nr],
                                         arg.dtype.itemsize),
                                     rhs=_str_to_expr(stride_i),
                                     order=0,
@@ -377,21 +377,21 @@ def generate_value_arg_check(
 
     # }}}
 
-    # {{{ handle non numpy arguements
+    # {{{ handle non numpy arguments
 
     def handle_non_numpy_arg(self, gen: CodeGenerator, arg):
         raise NotImplementedError()
 
     # }}}
 
-    # {{{ handle allocation of unspecified arguements
+    # {{{ handle allocation of unspecified arguments
 
     def handle_alloc(
             self, gen: CodeGenerator, arg: ArrayArg,
             strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str],
             skip_arg_checks: bool) -> None:
         """
-        Handle allocation of non-specified arguements for C-execution
+        Handle allocation of non-specified arguments for C-execution
         """
         raise NotImplementedError()
 
@@ -647,7 +647,7 @@ def target_specific_preamble(self, gen):
 
     def initialize_system_args(self, gen):
         """
-        Override to intialize any default system args
+        Override to initialize any default system args
         """
         raise NotImplementedError()
 
@@ -674,7 +674,7 @@ def __call__(self, program, entrypoint, codegen_result):
         """
         Generates the wrapping python invoker for this execution target
 
-        :arg kernel: the loopy :class:`LoopKernel`(s) to be executued
+        :arg kernel: the loopy :class:`LoopKernel`(s) to be executed
-        :codegen_result: the loopy :class:`CodeGenerationResult` created
-        by code generation
+        :arg codegen_result: the loopy :class:`CodeGenerationResult` created
+            by code generation
 
@@ -944,7 +944,7 @@ def __call__(self, queue, **kwargs):
 
 # }}}
 
-# {{{ code highlighers
+# {{{ code highlighters
 
 
 def get_highlighted_code(text, python=False):
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index ce2a150b0..31d1cfd2d 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -112,7 +112,7 @@ def map_subscript(self, expr, type_context):
 
         if (isinstance(ary, TemporaryVariable)
                 and ary.address_space == AddressSpace.PRIVATE):
-            # generate access code for acccess to private-index temporaries
+            # generate access code for access to private-index temporaries
 
             gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
             if lsize:
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index e2f3ecda2..14383e54f 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -642,7 +642,7 @@ def get_function_declaration(
         from loopy.target.c import FunctionDeclarationWrapper
         assert isinstance(fdecl, FunctionDeclarationWrapper)
         if not codegen_state.is_entrypoint:
-            # auxiliary kernels need not mention opencl speicific qualifiers
+            # auxiliary kernels need not mention opencl specific qualifiers
-            # for a functions signature
+            # for a function's signature
             return preambles, fdecl
 
@@ -908,7 +908,7 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
 # }}}
 
 
-# {{{ volatile mem acccess target
+# {{{ volatile mem access target
 
 class VolatileMemExpressionToOpenCLCExpressionMapper(
         ExpressionToOpenCLCExpressionMapper):
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index ec702e39b..e4da6cd8b 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -324,7 +324,7 @@ def binary_tree_add(start, end):
                 # -ffp-contract=fast which is the default for PTX codegen, but
                 # for some unknown reason, clang fails to see the FMAs.
                 #
-                # We need to do this only for complex as we haev temporaries
+                # We need to do this only for complex as we have temporaries
                 # only in complex. For reals, the code generated looks like
                 #
                 #    res = c + a * b
@@ -879,7 +879,7 @@ def get_kernel_call(
 
         value_arg_code = generate_value_arg_setup(
                 codegen_state.kernel, regular_arg_names)
-        arry_arg_code = generate_array_arg_setup(
+        array_arg_code = generate_array_arg_setup(
                 codegen_state.kernel, regular_arg_names)
 
         if struct_overflow_arg_names:
@@ -888,7 +888,7 @@ def get_kernel_call(
                     struct_overflow_arg_names)
 
             py_passed_args = []
-            struct_pack_types = []
+            struct_pack_types: list[str] = []
             struct_pack_args = []
 
             for arg_name in skai.passed_names:
@@ -952,7 +952,7 @@ def get_kernel_call(
                    "argument count of the kernel ({_lpy_knl.num_args}).'"),
             Line(),
             value_arg_code,
-            arry_arg_code,
+            array_arg_code,
             overflow_args_code,
             Assign("_lpy_evt",
                    f"{self.target.pyopencl_module_name}.enqueue_nd_range_kernel("
@@ -1207,7 +1207,7 @@ def get_expression_to_c_expression_mapper(self, codegen_state):
 # }}}
 
 
-# {{{ volatile mem acccess target
+# {{{ volatile mem access target
 
 class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder):
     def get_expression_to_c_expression_mapper(self, codegen_state):
diff --git a/loopy/tools.py b/loopy/tools.py
index 50a523ee8..2e3b5db4f 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -972,4 +972,13 @@ def _get_persistent_hashable_arg(arg):
 
 # }}}
 
+
+def is_hashable(o: object) -> bool:  # False iff hash(o) raises TypeError
+    try:
+        hash(o)
+    except TypeError:
+        return False
+    return True
+
+
 # vim: fdm=marker
diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py
index 73bd5cdde..7ab5e376e 100644
--- a/loopy/transform/add_barrier.py
+++ b/loopy/transform/add_barrier.py
@@ -91,7 +91,7 @@ def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None,
 
     new_kernel = kernel.copy(instructions=kernel.instructions + [barrier_to_add])
     if insn_after is not None:
-        new_kernel = add_dependency(kernel=new_kernel,
+        new_kernel = add_dependency(new_kernel,
                                  insn_match=insn_after,
                                  depends_on="id:"+id)
 
diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py
index ec3737233..7e7b6459c 100644
--- a/loopy/transform/array_buffer_map.py
+++ b/loopy/transform/array_buffer_map.py
@@ -413,17 +413,17 @@ def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs):
                 except_inames=frozenset(self.primed_sweep_inames))
 
         s2s_domain = stor2sweep.domain()
-        s2s_domain, aligned_g_s2s_parm_dom = isl.align_two(
+        s2s_domain, aligned_g_s2s_param_dom = isl.align_two(
                 s2s_domain, global_s2s_par_dom)
 
         arg_restrictions = (
-                aligned_g_s2s_parm_dom
+                aligned_g_s2s_param_dom
                 .eliminate(dim_type.set, 0,
-                    aligned_g_s2s_parm_dom.dim(dim_type.set))
+                    aligned_g_s2s_param_dom.dim(dim_type.set))
                 .remove_divs())
 
         return (arg_restrictions & s2s_domain).is_subset(
-                aligned_g_s2s_parm_dom)
+                aligned_g_s2s_param_dom)
 
 
 class NoOpArrayToBufferMap(ArrayToBufferMapBase):
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index 1fe40a370..6866f23f3 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -20,6 +20,8 @@
 THE SOFTWARE.
 """
 
+from collections.abc import Sequence
+
 from immutables import Map
 
 import islpy as isl
@@ -27,7 +29,11 @@
 
 from loopy.diagnostic import LoopyError
 from loopy.kernel import LoopKernel
-from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+from loopy.kernel.function_interface import (
+    CallableKernel,
+    InKernelCallable,
+    ScalarCallable,
+)
 from loopy.kernel.instruction import (
     Assignment,
     CallInstruction,
@@ -40,7 +46,7 @@
     RuleAwareSubstitutionMapper,
     SubstitutionRuleMappingContext,
 )
-from loopy.translation_unit import TranslationUnit, for_each_kernel
+from loopy.translation_unit import FunctionIdT, TranslationUnit, for_each_kernel
 
 
 __doc__ = """
@@ -80,10 +86,8 @@ def register_callable(translation_unit, function_identifier, callable_,
             callables_table=new_callables)
 
 
-def merge(translation_units):
+def merge(translation_units: Sequence[TranslationUnit]) -> TranslationUnit:
     """
-    :param translation_units: A sequence of :class:`loopy.TranslationUnit`.
-
     :returns: An instance of :class:`loopy.TranslationUnit` which contains all the
-        callables from each of the *translation_units.
+        callables from each of the *translation_units*.
     """
@@ -102,7 +106,7 @@ def merge(translation_units):
                 if (prg_i.callables_table[clbl_name]
                         != prg_j.callables_table[clbl_name]):
                     # TODO: generate unique names + rename for the colliding
-                    # callables (if entrypoints are colliding that shuold still
+                    # callables (if entrypoints are colliding that should still
                     # be an error)
                     raise NotImplementedError("Translation units to be merged"
                                               " must have different callable names"
@@ -110,7 +114,7 @@ def merge(translation_units):
 
     # }}}
 
-    callables_table = {}
+    callables_table: dict[FunctionIdT, InKernelCallable] = {}
     for trans_unit in translation_units:
         callables_table.update(trans_unit.callables_table)
 
@@ -534,7 +538,12 @@ def inline_callable_kernel(translation_unit, function_name):
 
 # {{{ rename_callable
 
-def rename_callable(program, old_name, new_name=None, existing_ok=False):
+def rename_callable(
+            t_unit: TranslationUnit,
+            old_name: str,
+            new_name: str | None = None,
+            existing_ok=False
+        ) -> TranslationUnit:
     """
-    :arg program: An instance of :class:`loopy.TranslationUnit`
+    :arg t_unit: An instance of :class:`loopy.TranslationUnit`
     :arg old_name: The callable to be renamed
@@ -548,21 +557,21 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False):
         SubstitutionRuleMappingContext,
     )
 
-    assert isinstance(program, TranslationUnit)
+    assert isinstance(t_unit, TranslationUnit)
     assert isinstance(old_name, str)
 
-    if (new_name in program.callables_table) and not existing_ok:
+    if (new_name in t_unit.callables_table) and not existing_ok:
         raise LoopyError(f"callables named '{new_name}' already exists")
 
     if new_name is None:
-        namegen = UniqueNameGenerator(program.callables_table.keys())
+        namegen = UniqueNameGenerator(t_unit.callables_table.keys())
         new_name = namegen(old_name)
 
     assert isinstance(new_name, str)
 
     new_callables_table = {}
 
-    for name, clbl in program.callables_table.items():
+    for name, clbl in t_unit.callables_table.items():
         if name == old_name:
             name = new_name
 
@@ -582,12 +591,12 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False):
 
         new_callables_table[name] = clbl
 
-    new_entrypoints = program.entrypoints.copy()
+    new_entrypoints = t_unit.entrypoints.copy()
     if old_name in new_entrypoints:
         new_entrypoints = ((new_entrypoints | frozenset([new_name]))
                            - frozenset([old_name]))
 
-    return program.copy(callables_table=Map(new_callables_table),
+    return t_unit.copy(callables_table=Map(new_callables_table),
                         entrypoints=new_entrypoints)
 
 # }}}
diff --git a/loopy/transform/concatenate.py b/loopy/transform/concatenate.py
index 8a4bb28fb..fcf2e07ca 100644
--- a/loopy/transform/concatenate.py
+++ b/loopy/transform/concatenate.py
@@ -85,6 +85,9 @@ def concatenate_arrays(
         axis_length += ary.shape[axis_nr]
 
     new_ary = arrays[0]
+    if not isinstance(new_ary.shape, tuple):
+        raise ValueError("one of the arrays has indeterminate shape")
+
     new_shape = list(new_ary.shape)
     new_shape[axis_nr] = axis_length
     new_ary = new_ary.copy(shape=tuple(new_shape))
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 088d89643..ea6cc0fc1 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -252,7 +252,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name,
                     footprint_subscripts, var_descr)
 
     # Our _not_provided is actually a different object from the one in the
-    # precompute module, but precompute acutally uses that to adjust its
+    # precompute module, but precompute actually uses that to adjust its
     # warning message.
 
     from loopy.transform.precompute import precompute_for_single_kernel
@@ -292,7 +292,13 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name,
         return new_kernel
 
 
-def add_prefetch(program, *args, **kwargs):
+def add_prefetch(t_unit,
+                 var_name, sweep_inames=None, dim_arg_names=None,
+                 default_tag=None,
+                 rule_name=None, temporary_name=None,
+                 temporary_address_space=None, temporary_scope=None,
+                 footprint_subscripts=None, fetch_bounding_box=False,
+                 fetch_outer_inames=None, prefetch_insn_id=None, within=None):
     """Prefetch all accesses to the variable *var_name*, with all accesses
     being swept through *sweep_inames*.
 
@@ -379,7 +385,7 @@ def add_prefetch(program, *args, **kwargs):
     :arg fetch_outer_inames: The inames within which the fetch
         instruction is nested. If *None*, make an educated guess.
 
-    :arg fetch_insn_id: The ID of the instruction generated to perform the
+    :arg prefetch_insn_id: The ID of the instruction generated to perform the
         prefetch.
 
     :arg within: a stack match as understood by
@@ -388,14 +394,26 @@ def add_prefetch(program, *args, **kwargs):
 
     This function internally uses :func:`extract_subst` and :func:`precompute`.
     """
-    assert isinstance(program, TranslationUnit)
+    assert isinstance(t_unit, TranslationUnit)
 
     new_callables = {}
-    for func_id, in_knl_callable in program.callables_table.items():
+    for func_id, in_knl_callable in t_unit.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = add_prefetch_for_single_kernel(
-                    in_knl_callable.subkernel, program.callables_table,
-                    *args, **kwargs)
+                    in_knl_callable.subkernel, t_unit.callables_table,
+                    var_name=var_name,
+                    sweep_inames=sweep_inames,
+                    dim_arg_names=dim_arg_names,
+                    default_tag=default_tag,
+                    rule_name=rule_name,
+                    temporary_name=temporary_name,
+                    temporary_address_space=temporary_address_space,
+                    temporary_scope=temporary_scope,
+                    footprint_subscripts=footprint_subscripts,
+                    fetch_bounding_box=fetch_bounding_box,
+                    fetch_outer_inames=fetch_outer_inames,
+                    prefetch_insn_id=prefetch_insn_id,
+                    within=within)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
 
@@ -407,7 +425,7 @@ def add_prefetch(program, *args, **kwargs):
 
         new_callables[func_id] = in_knl_callable
 
-    return program.copy(callables_table=Map(new_callables))
+    return t_unit.copy(callables_table=Map(new_callables))
 
 # }}}
 
@@ -653,7 +671,7 @@ def set_argument_order(kernel, arg_names):
     :arg arg_names: A list (or comma-separated string) or argument
         names. All arguments must be in this list.
     """
-    # FIXME: @inducer -- shoulld this only affect the root kernel, or should it
+    # FIXME: @inducer -- should this only affect the root kernel, or should it
     # take a within?
 
     if isinstance(arg_names, str):
@@ -1007,6 +1025,11 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel,
             raise LoopyError(
                     f"Temporary '{tv.name}' has an offset and no base_storage. "
                     "That's not allowed.")
+        if not isinstance(tv.dtype, LoopyType):
+            raise LoopyError(
+                f"Dtype of temporary '{tv.name}' "
+                " is not inferred. Call lp.infer_unknown_types"
+                " first.")
 
         if (tv.base_storage
                 and tv.base_storage not in kernel.temporary_variables
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index bb828221f..6c2688d90 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -154,7 +154,7 @@ def map_call(self, expr, *args):
         dc = self.diff_context
 
         if expr.function.name in dc.kernel.substitutions:
-            # FIXME: Deal with subsitution rules
+            # FIXME: Deal with substitution rules
             # Need to use chain rule here, too.
             raise NotImplementedError("substitution rules in differentiation")
         else:
@@ -382,8 +382,8 @@ def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i",
 
     assert isinstance(kernel, LoopKernel)
 
-    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
-    kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True)
+    from loopy.kernel.creation import apply_single_writer_dependency_heuristic
+    kernel = apply_single_writer_dependency_heuristic(kernel, warn_if_used=True)
 
     if isinstance(diff_outputs, str):
         diff_outputs = [
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 18df3dae4..97257745c 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -21,7 +21,7 @@
 """
 
 
-from typing import FrozenSet, Optional
+from typing import Any, FrozenSet, Optional
 
 import islpy as isl
 from islpy import dim_type
@@ -29,6 +29,7 @@
 from loopy.diagnostic import LoopyError
 from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel
+from loopy.kernel.instruction import InstructionBase
 from loopy.symbolic import (
     RuleAwareIdentityMapper,
     RuleAwareSubstitutionMapper,
@@ -919,9 +920,13 @@ def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None,
             old_to_new=dict(list(zip(inames, new_inames))),
             within=within)
 
-    def _does_access_old_inames(kernel, insn, *args):
-        return bool(frozenset(inames) & (insn.dependency_names()
-                                         | insn.reduction_inames()))
+    def _does_access_old_inames(kernel: LoopKernel,
+                                insn: InstructionBase,
+                                *args: Any) -> bool:
+        all_inames = (insn.within_inames
+                      | insn.reduction_inames()
+                      | insn.sub_array_ref_inames())
+        return bool(frozenset(inames) & all_inames)
 
     kernel = rule_mapping_context.finish_kernel(
             indup.map_kernel(kernel, within=_does_access_old_inames,
@@ -1540,7 +1545,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None):
                 break
 
         if not found:
-            raise LoopyError("invlaid tag kind: %s" % kind)
+            raise LoopyError("invalid tag kind: %s" % kind)
 
     from loopy.match import parse_match
     match = parse_match(insn_match)
@@ -2265,7 +2270,7 @@ def add_inames_for_unused_hw_axes(kernel, within=None):
     Current limitations:
 
     * Only one iname in the kernel may be tagged with each of the unused hw axes.
-    * Occurence of an ``l.auto`` tag when an instruction is missing one of the
+    * Occurrence of an ``l.auto`` tag when an instruction is missing one of the
       local hw axes.
 
     :arg within: An instruction match as understood by
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 629916628..494bbf0bc 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -267,6 +267,7 @@ def replace_instruction_ids_in_insn(
     new_no_sync_with: List[Tuple[str, str]] = []
 
     if insn.id in replacements:
+        assert isinstance(insn.id, str)
         insn = insn.copy(id=replacements[insn.id][0])
 
     new_depends_on = list(insn.depends_on)
@@ -425,7 +426,7 @@ def insns_in_conflicting_groups(insn1_id, insn2_id):
     if not nosync_to_add and not empty_ok:
         raise LoopyError("No nosync annotations were added as a result "
                 "of this call. add_nosync will (by default) only add them to "
-                "accompany existing depencies or group exclusions. Maybe you want "
+                "accompany existing dependencies or group exclusions. Maybe you want "
                 "to pass force=True?")
 
     new_instructions = list(kernel.instructions)
diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py
index 1c1b48c59..2a82952c2 100644
--- a/loopy/transform/pack_and_unpack_args.py
+++ b/loopy/transform/pack_and_unpack_args.py
@@ -48,7 +48,7 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel,
 
     :arg call_name: An instance of :class:`str` denoting the function call in
         the *kernel*.
-    :arg args_to_unpack: A list of the arguments as instances of :class:`str` which
+    :arg args_to_pack: A list of the arguments as instances of :class:`str` which
         must be packed. If set *None*, it is interpreted that all the array
         arguments would be packed.
     :arg args_to_unpack: A list of the arguments as instances of :class:`str`
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index d04fa5b2d..2c91643ac 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -47,6 +47,7 @@
     RuleAwareIdentityMapper,
     RuleAwareSubstitutionMapper,
     SubstitutionRuleMappingContext,
+    flatten,
     get_dependencies,
 )
 from loopy.transform.array_buffer_map import (
@@ -296,7 +297,7 @@ def map_substitution(self, name, tag, arguments, expn_state):
 
         new_outer_expr = var(self.temporary_name)
         if stor_subscript:
-            new_outer_expr = new_outer_expr.index(tuple(stor_subscript))
+            new_outer_expr = new_outer_expr[tuple(stor_subscript)]
 
         # Can't possibly be nested, and no need to traverse
         # further as compute expression has already been seen
@@ -928,7 +929,7 @@ def add_assumptions(d):
 
         storage_axis_subst_dict[
                 prior_storage_axis_name_dict.get(arg_name, arg_name)] = \
-                        arg+base_index
+                        flatten(arg+base_index)
 
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index e9b2b8c53..ca31368d2 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -43,7 +43,7 @@
 
 # {{{ privatize temporaries with iname
 
-from loopy.symbolic import IdentityMapper
+from loopy.symbolic import IdentityMapper, flatten
 
 
 class ExtraInameIndexInserter(IdentityMapper):
@@ -66,7 +66,7 @@ def map_subscript(self, expr):
 
             self.seen_priv_axis_inames.update(v.name for v in extra_idx)
 
-            new_idx = index + tuple(v - self.iname_to_lbound[v.name]
+            new_idx = index + tuple(flatten(v - self.iname_to_lbound[v.name])
                             for v in extra_idx)
 
             if len(new_idx) == 1:
@@ -81,7 +81,7 @@ def map_variable(self, expr):
         else:
             self.seen_priv_axis_inames.update(v.name for v in new_idx)
 
-            new_idx = tuple(v - self.iname_to_lbound[v.name]
+            new_idx = tuple(flatten(v - self.iname_to_lbound[v.name])
                             for v in new_idx)
 
             if len(new_idx) == 1:
@@ -247,7 +247,7 @@ def privatize_temporaries_with_inames(
                     "Kernel '%s': Instruction '%s': touched variable that "
                     "(for privatization, e.g. as performed for ILP) "
                     "required iname(s) '%s', but that the instruction was not "
-                    "previously within the iname(s). To remedy this, first promote"
+                    "previously within the iname(s). To remedy this, first promote "
                     "the instruction into the iname."
                     % (kernel.name, insn.id, ", ".join(
                         eiii.seen_priv_axis_inames - insn.within_inames)))
diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py
index 5161efba6..7d1f3c870 100644
--- a/loopy/transform/realize_reduction.py
+++ b/loopy/transform/realize_reduction.py
@@ -711,7 +711,7 @@ def _add_to_depends_on(insn_id, new_depends_on_params):
 
             needs_replacement = True
 
-            # {{{ generate a new assignent instruction
+            # {{{ generate a new assignment instruction
 
             new_assignee_name = var_name_gen(
                     "{insn_id}_retval_{assignee_nr}"
@@ -2019,7 +2019,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
                         | red_realize_ctx.surrounding_insn_add_within_inames))
 
             kwargs.pop("id")
-            kwargs.pop("depends_on")
+            kwargs.pop("happens_after")
             kwargs.pop("expression")
             kwargs.pop("assignee", None)
             kwargs.pop("assignees", None)
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index b5c7aa7a1..422d22568 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -327,8 +327,8 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
     # {{{ establish the relevant definition of lhs_name for each usage site
 
     dep_kernel = expand_subst(kernel)
-    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
-    dep_kernel = apply_single_writer_depencency_heuristic(dep_kernel)
+    from loopy.kernel.creation import apply_single_writer_dependency_heuristic
+    dep_kernel = apply_single_writer_dependency_heuristic(dep_kernel)
     assigning_insn_ids = {insn.id
                           for insn in dep_kernel.instructions
                           if lhs_name in insn.assignee_var_names()}
@@ -354,7 +354,7 @@ def get_relevant_definition_insn_id(usage_insn_id):
 
         if len(rel_def_ids) > 1:
             raise LoopyError("more than one write to '%s' found in "
-                    "depdendencies of '%s'--definition cannot be resolved "
+                    "dependencies of '%s'--definition cannot be resolved "
                     "(writer instructions ids: %s)"
                     % (lhs_name, usage_insn_id, ", ".join(rel_def_ids)))
 
@@ -433,7 +433,7 @@ def _accesses_lhs(kernel, insn, *args):
         for i in indices:
             if not isinstance(i, Variable):
                 raise LoopyError("In defining instruction '%s': "
-                        "asignee index '%s' is not a plain variable. "
+                        "assignee index '%s' is not a plain variable. "
                         "Perhaps use loopy.affine_map_inames() "
                         "to perform substitution." % (def_id, i))
 
diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py
index c0d1b0b05..4afdfcef7 100644
--- a/loopy/translation_unit.py
+++ b/loopy/translation_unit.py
@@ -27,10 +27,20 @@
 from collections.abc import Set as abc_Set
 from dataclasses import dataclass, field, replace
 from functools import wraps
-from typing import TYPE_CHECKING, Any, Callable, FrozenSet, Mapping, Optional, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    FrozenSet,
+    Mapping,
+    Optional,
+    TypeVar,
+    Union,
+)
 from warnings import warn
 
 from immutables import Map
+from typing_extensions import Concatenate, ParamSpec, Self
 
 from pymbolic.primitives import Call, Variable
 
@@ -76,8 +86,16 @@
 
 .. autofunction:: make_program
 
+.. autofunction:: check_each_kernel
+
 .. autofunction:: for_each_kernel
 
+.. autoclass:: TUnitOrKernelT
+
+.. class:: P
+
+    A :class:`typing.ParamSpec` for use in annotating :func:`for_each_kernel` and
+    :func:`check_each_kernel`.
 """
 
 
@@ -182,6 +200,8 @@ class TranslationUnit:
         The :class:`~loopy.LoopKernel` representing the main entrypoint
         of the program, if defined. Currently, this attribute may only be
         accessed if there is exactly one entrypoint in the translation unit.
+        Will raise an error if the default entrypoint is not a
+        :class:`~loopy.LoopKernel`.
 
     .. attribute:: callables_table
 
@@ -196,7 +216,7 @@ class TranslationUnit:
     .. attribute:: func_id_to_in_knl_callables_mappers
 
         A :class:`frozenset` of functions of the signature ``(target:
-        TargetBase, function_indentifier: str)`` that returns an instance
+        TargetBase, function_identifier: str)`` that returns an instance
         of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*.
 
     .. automethod:: executor
@@ -226,9 +246,9 @@ def __post_init__(self):
 
         object.__setattr__(self, "_program_executor_cache", {})
 
-    def copy(self, **kwargs):
+    def copy(self, **kwargs: Any) -> Self:
         target = kwargs.pop("target", None)
-        program = replace(self, **kwargs)
+        t_unit = replace(self, **kwargs)
         if target:
             from loopy.kernel import KernelState
             if max(callable_knl.subkernel.state
@@ -240,7 +260,7 @@ def copy(self, **kwargs):
                             "preprocessed, cannot modify target now.")
 
             new_callables = {}
-            for func_id, clbl in program.callables_table.items():
+            for func_id, clbl in t_unit.callables_table.items():
                 if isinstance(clbl, CallableKernel):
                     knl = clbl.subkernel
                     knl = knl.copy(target=target)
@@ -251,16 +271,12 @@ def copy(self, **kwargs):
                     raise NotImplementedError()
                 new_callables[func_id] = clbl
 
-            program = replace(
+            t_unit = replace(
                     self, callables_table=Map(new_callables), target=target)
 
-        return program
+        return t_unit
 
-    def with_entrypoints(self, entrypoints):
-        """
-        :param entrypoints: Either a comma-separated :class:`str` or
-        :class:`frozenset`.
-        """
+    def with_entrypoints(self, entrypoints: str | frozenset[str]) -> Self:
         if isinstance(entrypoints, str):
             entrypoints = frozenset([e.strip() for e in
                 entrypoints.split(",")])
@@ -278,7 +294,7 @@ def state(self):
                     if isinstance(callable_knl, CallableKernel)),
                    default=KernelState.INITIAL)
 
-    def with_kernel(self, kernel):
+    def with_kernel(self, kernel: LoopKernel) -> Self:
         """
         If *self* contains a callable kernel with *kernel*'s name, replaces its
         subkernel and returns a copy of *self*. Else records a new callable
@@ -300,9 +316,9 @@ def with_kernel(self, kernel):
             new_callables = self.callables_table.set(kernel.name, clbl)
             return self.copy(callables_table=new_callables)
 
-    def __getitem__(self, name):
+    def __getitem__(self, name) -> LoopKernel:
         """
         For the callable named *name*, return a :class:`loopy.LoopKernel` if
         it's a :class:`~loopy.kernel.function_interface.CallableKernel`
-        otherwise return the callable itself.
+        otherwise raise :exc:`ValueError`.
         """
@@ -310,13 +326,20 @@ def __getitem__(self, name):
         if isinstance(result, CallableKernel):
             return result.subkernel
         else:
-            return result
+            raise ValueError("TranslationUnit.__getitem__ "
+                             "can only be used for instances of LoopKernel. "
+                             "Access all other callables via callables_table.")
 
     @property
-    def default_entrypoint(self):
+    def default_entrypoint(self) -> LoopKernel:
         if len(self.entrypoints) == 1:
-            entrypoint, = self.entrypoints
-            return self[entrypoint]
+            ep_name, = self.entrypoints
+            entrypoint = self[ep_name]
+
+            if not isinstance(entrypoint, LoopKernel):
+                raise ValueError("default entrypoint is not a kernel")
+
+            return entrypoint
         else:
             raise ValueError("TranslationUnit has multiple possible entrypoints."
                              " The default entrypoint kernel is not uniquely"
@@ -726,6 +749,9 @@ def __getitem__(self, name):
 # }}}
 
 
+TUnitOrKernelT = TypeVar("TUnitOrKernelT", LoopKernel, TranslationUnit)
+
+
 # {{{ helper functions
 
 def make_program(kernel: LoopKernel) -> TranslationUnit:
@@ -741,21 +767,46 @@ def make_program(kernel: LoopKernel) -> TranslationUnit:
             entrypoints=frozenset())
 
 
-def for_each_kernel(transform):
+P = ParamSpec("P")
+
+
+def check_each_kernel(
+            check: Callable[Concatenate[LoopKernel, P], None]
+        ) -> Callable[Concatenate[TranslationUnit, P], None]:
+    def _collective_check(
+                t_unit_or_kernel: TranslationUnit | LoopKernel, /,
+                *args: P.args,
+                **kwargs: P.kwargs
+            ) -> None:
+        if isinstance(t_unit_or_kernel, TranslationUnit):
+            for clbl in t_unit_or_kernel.callables_table.values():
+                if isinstance(clbl, CallableKernel):
+                    check(clbl.subkernel, *args, **kwargs)
+                elif isinstance(clbl, ScalarCallable):
+                    pass
+                else:
+                    raise NotImplementedError(f"{type(clbl)}")
+        elif isinstance(t_unit_or_kernel, LoopKernel):
+            check(t_unit_or_kernel, *args, **kwargs)
+        else:
+            raise TypeError("expected LoopKernel or TranslationUnit")
+
+    return wraps(check)(_collective_check)
+
+
+def for_each_kernel(
+            transform: Callable[Concatenate[LoopKernel, P], LoopKernel]
+        ) -> Callable[Concatenate[TUnitOrKernelT, P], TUnitOrKernelT]:
     """
     Function wrapper for transformations of the type ``transform(kernel:
     LoopKernel, *args, **kwargs) -> LoopKernel``. Returns a function that would
     apply *transform* to all callable kernels in a :class:`loopy.TranslationUnit`.
     """
-    def _collective_transform(*args, **kwargs):
-        if "translation_unit" in kwargs:
-            t_unit_or_kernel = kwargs.pop("translation_unit")
-        elif "kernel" in kwargs:
-            t_unit_or_kernel = kwargs.pop("kernel")
-        else:
-            t_unit_or_kernel = args[0]
-            args = args[1:]
-
+    def _collective_transform(
+                t_unit_or_kernel: TUnitOrKernelT, /,
+                *args: P.args,
+                **kwargs: P.kwargs
+            ) -> TUnitOrKernelT:
         if isinstance(t_unit_or_kernel, TranslationUnit):
             t_unit = t_unit_or_kernel
             new_callables = {}
@@ -771,10 +822,11 @@ def _collective_transform(*args, **kwargs):
                 new_callables[func_id] = clbl
 
             return t_unit.copy(callables_table=Map(new_callables))
-        else:
-            assert isinstance(t_unit_or_kernel, LoopKernel)
+        elif isinstance(t_unit_or_kernel, LoopKernel):
             kernel = t_unit_or_kernel
             return transform(kernel, *args, **kwargs)
+        else:
+            raise TypeError("expected LoopKernel or TranslationUnit")
 
     return wraps(transform)(_collective_transform)
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index b997cee25..b8e3db720 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -43,7 +43,11 @@
     parse_tagged_name,
 )
 from loopy.tools import is_integer
-from loopy.translation_unit import CallablesInferenceContext, make_clbl_inf_ctx
+from loopy.translation_unit import (
+    CallablesInferenceContext,
+    TranslationUnit,
+    make_clbl_inf_ctx,
+)
 from loopy.types import NumpyType
 
 
@@ -396,7 +400,7 @@ def map_constant(self, expr):
 
     def map_type_cast(self, expr):
         subtype, = self.rec(expr.child)
-        if not issubclass(subtype.dtype.type, np.number):
+        if not issubclass(subtype.dtype.type, (np.number, np.bool_)):
             raise LoopyError(f"Can't cast a '{subtype}' to '{expr.type}'")
         return [expr.type]
 
@@ -1026,31 +1030,34 @@ def _instruction_missed_during_inference(insn):
     return type_specialized_kernel, clbl_inf_ctx
 
 
-def infer_unknown_types(program, expect_completion=False):
+def infer_unknown_types(
+            t_unit: TranslationUnit,
+            expect_completion: bool = False
+        ) -> TranslationUnit:
     """Infer types on temporaries and arguments."""
     from loopy.kernel.data import auto
     from loopy.translation_unit import resolve_callables
 
-    program = resolve_callables(program)
+    t_unit = resolve_callables(t_unit)
 
     # {{{ early-exit criterion
 
     if all(clbl.is_type_specialized()
-           for clbl in program.callables_table.values()):
+           for clbl in t_unit.callables_table.values()):
         # all the callables including the kernels have inferred their types
         # => no need for type inference
-        return program
+        return t_unit
 
     # }}}
 
-    clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table,
-            program.entrypoints)
+    clbl_inf_ctx = make_clbl_inf_ctx(t_unit.callables_table,
+            t_unit.entrypoints)
 
-    for e in program.entrypoints:
+    for e in t_unit.entrypoints:
         logger.debug(f"Entering entrypoint: {e}")
         arg_id_to_dtype = {arg.name: arg.dtype for arg in
-                program[e].args if arg.dtype not in (None, auto)}
-        new_callable, clbl_inf_ctx = program.callables_table[e].with_types(
+                t_unit[e].args if arg.dtype not in (None, auto)}
+        new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_types(
                 arg_id_to_dtype, clbl_inf_ctx)
         clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable,
                                                             is_entrypoint=True)
@@ -1073,7 +1080,7 @@ def infer_unknown_types(program, expect_completion=False):
                     raise LoopyError("could not determine type of"
                             f" '{vars_not_inferred.pop()}' of kernel '{e}'.")
 
-    return clbl_inf_ctx.finish_program(program)
+    return clbl_inf_ctx.finish_program(t_unit)
 
 # }}}
 
diff --git a/loopy/types.py b/loopy/types.py
index 143715a39..a837d1c46 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -165,7 +165,7 @@ def __repr__(self):
 class OpaqueType(LoopyType):
     """An opaque data type is truly opaque - it has no allocations, no
-    temporaries of that type, etc. The only thing allowed is to be pass in
-    through one ValueArg and go out to another. It is introduced to accomodate
+    temporaries of that type, etc. The only thing allowed is to be passed in
+    through one ValueArg and go out to another. It is introduced to accommodate
     functional calls to external libraries.
     """
     def __init__(self, name: str) -> None:
diff --git a/loopy/typing.py b/loopy/typing.py
index 948616578..e6166fb7a 100644
--- a/loopy/typing.py
+++ b/loopy/typing.py
@@ -1,3 +1,15 @@
+"""
+.. autoclass:: IntegralT
+.. autoclass:: FloatT
+.. autoclass:: ExpressionT
+.. autoclass:: ShapeType
+.. autoclass:: auto
+"""
+
+
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees"
 
 __license__ = """
@@ -24,25 +36,28 @@
 from typing import Optional, Tuple, TypeVar, Union
 
 import numpy as np
+from typing_extensions import TypeAlias
 
 from pymbolic.primitives import Expression
 
 
-IntegralT = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8,
+IntegralT: TypeAlias = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8,
                   np.uint16, np.uint32, np.uint64]
-FloatT = Union[float, complex, np.float32, np.float64, np.complex64,
+FloatT: TypeAlias = Union[float, complex, np.float32, np.float64, np.complex64,
         np.complex128]
 
 
-ExpressionT = Union[IntegralT, FloatT, Expression]
-ShapeType = Tuple[ExpressionT, ...]
-StridesType = ShapeType
+ExpressionT: TypeAlias = Union[IntegralT, FloatT, Expression]
+ShapeType: TypeAlias = Tuple[ExpressionT, ...]
+StridesType: TypeAlias = ShapeType
+
+InameStr: TypeAlias = str
 
 
 class auto:  # noqa
     """A generic placeholder object for something that should be automatically
     determined.  See, for example, the *shape* or *strides* argument of
-    :class:`ArrayArg`.
+    :class:`~loopy.ArrayArg`.
     """
 
 
diff --git a/loopy/version.py b/loopy/version.py
index 09d8442a2..609e6c179 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -21,32 +21,16 @@
 """
 
 
-# {{{ find install- or run-time git revision
+import re
+from importlib import metadata
 
-import os
 
+VERSION_TEXT = metadata.version("loopy")  # version of the installed "loopy" dist
+_match = re.match("^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT)  # (numeric)(suffix)
+assert _match is not None  # NOTE(review): stripped under -O; next line then fails
+VERSION_STATUS = _match.group(2)  # trailing lowercase-alnum suffix, may be ""
+VERSION = tuple(int(nr) for nr in _match.group(1).split("."))  # e.g. (2024, 1)
 
-if os.environ.get("AKPYTHON_EXEC_IMPORT_UNAVAILABLE") is not None:
-    # We're just being exec'd by setup.py. We can't import anything.
-    _git_rev = None
-
-else:
-    import loopy._git_rev as _git_rev_mod  # pylint: disable=no-name-in-module,import-error  # noqa: E501
-    _git_rev = _git_rev_mod.GIT_REVISION
-
-    # If we're running from a dev tree, the last install (and hence the most
-    # recent update of the above git rev) could have taken place very long ago.
-    from pytools import find_module_git_revision
-    _runtime_git_rev = find_module_git_revision(__file__, n_levels_up=1)
-    if _runtime_git_rev is not None:
-        _git_rev = _runtime_git_rev
-
-# }}}
-
-
-VERSION = (2024, 1)
-VERSION_STATUS = ""
-VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
 try:
     import islpy.version
@@ -62,8 +46,7 @@
 else:
     _cgen_version = cgen.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "{}-islpy{}-cgen{}-{}-v1".format(
-        VERSION_TEXT, _islpy_version, _cgen_version, _git_rev)
+DATA_MODEL_VERSION = f"{VERSION_TEXT}-islpy{_islpy_version}-cgen{_cgen_version}-v1"
 
 
 FALLBACK_LANGUAGE_VERSION = (2018, 2)
diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py
index 7ee30313c..eb8125cdb 100644
--- a/proto-tests/test_tim.py
+++ b/proto-tests/test_tim.py
@@ -190,7 +190,7 @@ def test_tim3d(ctx_factory):
     knl = lp.split_iname(knl, "k", n, inner_tag="l.2")  # , slabs=(0, 1))
     knl = lp.split_iname(knl, "i", n, inner_tag="l.0")  # , slabs=(0, 1))
 
-#    knl = lp.tag_inames(knl, dict(k_nner="unr"))
+#    knl = lp.tag_inames(knl, dict(k_inner="unr"))
 
     knl = lp.tag_inames(knl, dict(o="unr"))
     knl = lp.tag_inames(knl, dict(m="unr"))
diff --git a/pyproject.toml b/pyproject.toml
index 6f7b977b6..4b4334adb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,86 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = [
+    "setuptools>=63",
+]
+
+[project]
+name = "loopy"
+version = "2024.1"
+description = "A code generator for array-based code on CPUs and GPUs"
+readme = "README.rst"
+license = { text = "MIT" }
+authors = [
+    { name = "Andreas Kloeckner", email = "inform@tiker.net" },
+]
+requires-python = ">=3.8"
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Other Audience",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Natural Language :: English",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Topic :: Scientific/Engineering :: Visualization",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Utilities",
+]
+dependencies = [
+    "pytools>=2024.1.5",
+    "pymbolic>=2022.1",
+    "genpy>=2016.1.2",
+
+    # https://github.com/inducer/loopy/pull/419
+    "numpy>=1.19",
+
+    "cgen>=2016.1",
+    "islpy>=2019.1",
+    "codepy>=2017.1",
+    "colorama",
+    "Mako",
+    "pyrsistent",
+    "immutables",
+    "immutabledict",
+
+    "typing-extensions>=4",
+]
+[project.optional-dependencies]
+pyopencl = [
+    "pyopencl>=2022.3",
+]
+fortran = [
+    # Note that this is *not* regular 'f2py2e', this is
+    # the Fortran parser from the (unfinished) third-edition
+    # f2py, as linked below. This package is not on the package index, AFAIK.
+    # -AK, 2024-08-02
+    "f2py @ git+https://github.com/pearu/f2py.git",
+    "ply>=3.6",
+]
+
+[project.scripts]
+
+[project.urls]
+Documentation = "https://documen.tician.de/loopy"
+Homepage = "https://github.com/inducer/loopy"
+
+[tool.setuptools.packages.find]
+include = [
+    "loopy*",
+]
+
+[tool.setuptools.package-data]
+loopy = [
+    "py.typed",
+]
+
+[tool.setuptools.package-dir]
+# https://github.com/Infleqtion/client-superstaq/pull/715
+"" = "."
 
 [tool.ruff]
 preview = true
@@ -63,7 +146,7 @@ known-local-folder = [
 lines-after-imports = 2
 
 [tool.mypy]
-python_version = 3.8
+python_version = "3.10"
 warn_unused_ignores = true
 
 # TODO
@@ -90,3 +173,29 @@ module = [
     "IPython.*",
 ]
 ignore_missing_imports = true
+
+[tool.typos.default]
+extend-ignore-re = [
+  "(?Rm)^.*(#|//)\\s*spellchecker:\\s*disable-line$"
+]
+
+[tool.typos.default.extend-words]
+# like the numpy function, array range
+arange = "arange"
+# N-Dimensional
+ND = "ND"
+# used for 'diff_output'
+dout = "dout"
+# an element-wise slice of array u
+ue = "ue"
+# used in an ordering context, "ab" / "ba"
+ba = "ba"
+
+"dependees" = "dependees"
+
+[tool.typos.files]
+extend-exclude = [
+  "loopy/target/c/compyte",
+  "notes/*/*.eps",
+]
+
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 0cf58f83a..000000000
--- a/setup.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-
-import os
-
-from setuptools import find_packages, setup
-
-
-ver_dic = {}
-version_file = open("loopy/version.py")
-try:
-    version_file_contents = version_file.read()
-finally:
-    version_file.close()
-
-os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1"
-exec(compile(version_file_contents, "loopy/version.py", "exec"), ver_dic)
-
-
-# {{{ capture git revision at install time
-
-# authoritative version in pytools/__init__.py
-def find_git_revision(tree_root):
-    # Keep this routine self-contained so that it can be copy-pasted into
-    # setup.py.
-
-    from os.path import abspath, exists, join
-    tree_root = abspath(tree_root)
-
-    if not exists(join(tree_root, ".git")):
-        return None
-
-    from subprocess import PIPE, STDOUT, Popen
-    p = Popen(["git", "rev-parse", "HEAD"], shell=False,
-              stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True,
-              cwd=tree_root)
-    (git_rev, _) = p.communicate()
-
-    git_rev = git_rev.decode()
-
-    git_rev = git_rev.rstrip()
-
-    retcode = p.returncode
-    assert retcode is not None
-    if retcode != 0:
-        from warnings import warn
-        warn("unable to find git revision", stacklevel=1)
-        return None
-
-    return git_rev
-
-
-def write_git_revision(package_name):
-    from os.path import dirname, join
-    dn = dirname(__file__)
-    git_rev = find_git_revision(dn)
-
-    with open(join(dn, package_name, "_git_rev.py"), "w") as outf:
-        outf.write('GIT_REVISION = "%s"\n' % git_rev)
-
-
-write_git_revision("loopy")
-
-# }}}
-
-
-setup(name="loopy",
-      version=ver_dic["VERSION_TEXT"],
-      description="A code generator for array-based code on CPUs and GPUs",
-      long_description=open("README.rst").read(),
-      classifiers=[
-          "Development Status :: 4 - Beta",
-          "Intended Audience :: Developers",
-          "Intended Audience :: Other Audience",
-          "Intended Audience :: Science/Research",
-          "License :: OSI Approved :: MIT License",
-          "Natural Language :: English",
-          "Programming Language :: Python",
-          "Programming Language :: Python :: 3",
-          "Topic :: Scientific/Engineering",
-          "Topic :: Scientific/Engineering :: Information Analysis",
-          "Topic :: Scientific/Engineering :: Mathematics",
-          "Topic :: Scientific/Engineering :: Visualization",
-          "Topic :: Software Development :: Libraries",
-          "Topic :: Utilities",
-          ],
-
-      python_requires="~=3.8",
-      install_requires=[
-          "pytools>=2024.1.5",
-          "pymbolic>=2022.1",
-          "genpy>=2016.1.2",
-
-          # https://github.com/inducer/loopy/pull/419
-          "numpy>=1.19",
-
-          "cgen>=2016.1",
-          "islpy>=2019.1",
-          "codepy>=2017.1",
-          "colorama",
-          "Mako",
-          "pyrsistent",
-          "immutables",
-          "typing_extensions",
-          ],
-
-      extras_require={
-          "pyopencl":  [
-              "pyopencl>=2022.3",
-              ],
-          "fortran":  [
-              # Note that this is *not* regular 'f2py2e', this is
-              # the Fortran parser from the (unfinished) third-edition
-              # f2py, as linked below.
-              "f2py>=0.3.1",
-              "ply>=3.6",
-              ],
-          },
-
-      dependency_links=[
-          "git+https://github.com/pearu/f2py.git"
-          ],
-
-      scripts=["bin/loopy"],
-
-      author="Andreas Kloeckner",
-      url="https://mathema.tician.de/software/loopy",
-      author_email="inform@tiker.net",
-      license="MIT",
-      packages=find_packages(),
-      )
diff --git a/test/test_apps.py b/test/test_apps.py
index 207bc7ee2..c4cffaee1 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -324,7 +324,7 @@ def test_rob_stroud_bernstein_full():
 def test_stencil(ctx_factory):
     ctx = ctx_factory()
 
-    # n=32 causes corner case behavior in size calculations for temprorary (a
+    # n=32 causes corner case behavior in size calculations for temporary (a
     # non-unifiable, two-constant-segments PwAff as the base index)
 
     n = 256
diff --git a/test/test_c_execution.py b/test/test_c_execution.py
index e703d9415..6208b9aed 100644
--- a/test/test_c_execution.py
+++ b/test/test_c_execution.py
@@ -95,17 +95,17 @@ def test_c_target_strides_nonsquare():
     from loopy.target.c import ExecutableCTarget
 
     def __get_kernel(order="C"):
-        indicies = ["i", "j", "k"]
-        sizes = tuple(np.random.randint(1, 11, size=len(indicies)))
+        indices = ["i", "j", "k"]
+        sizes = tuple(np.random.randint(1, 11, size=len(indices)))
         # create domain strings
         domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}"
         domains = []
-        for idx, size in zip(indicies, sizes):
+        for idx, size in zip(indices, sizes):
             domains.append(domain_template.format(
                 iname=idx,
                 size=size))
         statement = "out[{indexed}] = 2 * a[{indexed}]".format(
-            indexed=", ".join(indicies))
+            indexed=", ".join(indices))
         return lp.make_kernel(
                 domains,
                 statement,
@@ -142,17 +142,17 @@ def test_c_optimizations():
     from loopy.target.c import ExecutableCTarget
 
     def __get_kernel(order="C"):
-        indicies = ["i", "j", "k"]
-        sizes = tuple(np.random.randint(1, 11, size=len(indicies)))
+        indices = ["i", "j", "k"]
+        sizes = tuple(np.random.randint(1, 11, size=len(indices)))
         # create domain strings
         domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}"
         domains = []
-        for idx, size in zip(indicies, sizes):
+        for idx, size in zip(indices, sizes):
             domains.append(domain_template.format(
                 iname=idx,
                 size=size))
         statement = "out[{indexed}] = 2 * a[{indexed}]".format(
-            indexed=", ".join(indicies))
+            indexed=", ".join(indices))
         return lp.make_kernel(
                 domains,
                 statement,
diff --git a/test/test_callables.py b/test/test_callables.py
index d58247a75..44a94e43a 100644
--- a/test/test_callables.py
+++ b/test/test_callables.py
@@ -1397,8 +1397,8 @@ def test_inline_deps(ctx_factory):
     prg = lp.merge([parent_knl, child_knl])
     inlined = lp.inline_callable_kernel(prg, "func")
 
-    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
-    apply_single_writer_depencency_heuristic(inlined, error_if_used=True)
+    from loopy.kernel.creation import apply_single_writer_dependency_heuristic
+    apply_single_writer_dependency_heuristic(inlined, error_if_used=True)
 
     _evt, (a_dev,) = inlined(cq)
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index e9aa47ef4..bfa607328 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2306,7 +2306,7 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel():
 
     from testlib import GridOverride
 
-    # artifically expand via overridden_get_grid_sizes_for_insn_ids
+    # artificially expand via overridden_get_grid_sizes_for_insn_ids
     knl = prog["loopy_kernel"]
     knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride(
         knl.copy(), vecsize))
@@ -3689,6 +3689,21 @@ def test_no_unnecessary_lbarrier(ctx_factory):
     assert not barrier_between(knl, "write_s_a", "write_ao")
 
 
+def test_long_kernel():  # smoke test (no asserts): must just run to completion
+    n = 500  # number of independent instruction/domain pairs
+    insns = [
+        f"a{i}[j{i}] = j{i}"  # each insn uses its own array a<i> and iname j<i>
+        for i in range(n)
+    ]
+    domains = [
+        f"{{ [j{i}]: 0<=j{i}<10 }}"  # one domain per iname: 0 <= j<i> < 10
+        for i in range(n)
+    ]
+    t_unit = lp.make_kernel(domains, insns)
+    t_unit = lp.preprocess_kernel(t_unit)
+    lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_target.py b/test/test_target.py
index 6c448debf..08bf286cd 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -314,10 +314,7 @@ def test_ispc_streaming_stores():
     knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
     knl = lp.tag_instructions(knl, "!streaming_store")
 
-    knl = lp.add_and_infer_dtypes(knl, {
-        var: stream_dtype
-        for var in vars
-        })
+    knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))
 
     knl = lp.set_argument_order(knl, vars + ["n"])
 
diff --git a/test/test_transform.py b/test/test_transform.py
index 8060d2038..98398fefd 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -1704,6 +1704,36 @@ def test_precompute_lets_inner_length1_inames_live():
             == parse("(e_inner + e_outer*16) / i_0"))
 
 
+def test_duplicate_iname_not_read_only_nested(ctx_factory):
+    # See <https://github.com/inducer/loopy/issues/859>
+    ctx = ctx_factory()
+    t_unit = lp.make_kernel(
+        "{[i, j]: 0<=i,j<10}",
+        """
+        for i
+            <> acc = 0 {id=init, tags=foo}
+            for j
+                acc = acc + A[i, j] * x[i, j] {id=update, tags=foo}
+            end
+            y[i] = acc {id=assign, tags=foo}
+        end
+        """,
+        [lp.GlobalArg("A,x,y", shape=lp.auto, dtype=np.float32),
+         ...],
+         seq_dependencies=True,
+    )
+    ref_t_unit = t_unit  # keep the untransformed kernel as the reference
+
+    t_unit = lp.duplicate_inames(  # duplicate "i" as "irow" where tag:foo matches
+        t_unit,
+        inames="i", within="tag:foo", new_inames="irow")
+    print(t_unit)
+    assert (t_unit.default_entrypoint.id_to_insn["init"].within_inames
+            == frozenset({"irow"}))  # "init" must sit in "irow" only, not "i"
+
+    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)  # transformed vs. reference
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_tree.py b/test/test_tree.py
new file mode 100644
index 000000000..3dea8470e
--- /dev/null
+++ b/test/test_tree.py
@@ -0,0 +1,50 @@
+__copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from pyopencl.tools import (  # noqa: F401
+    pytest_generate_tests_for_pyopencl as pytest_generate_tests,
+)
+
+from loopy.schedule.tree import Tree
+
+
+def test_tree_simple():
+    tree = Tree.from_root("")  # the empty string serves as the root node
+
+    tree = tree.add_node("bar", parent="")  # chain so far: "" -> "bar"
+    tree = tree.add_node("baz", parent="bar")  # chain: "" -> "bar" -> "baz"
+
+    assert tree.depth("") == 0  # the root is at depth 0
+    assert tree.depth("bar") == 1
+    assert tree.depth("baz") == 2
+
+    assert "" in tree
+    assert "bar" in tree
+    assert "baz" in tree
+    assert "foo" not in tree  # never added, so membership must be False
+
+    tree = tree.replace_node("bar", "foo")  # "foo" takes the place of "bar"
+    assert "bar" not in tree
+    assert "foo" in tree
+
+    tree = tree.move_node("baz", new_parent="")  # reparent "baz" under the root
+    assert tree.depth("baz") == 1