From d50b720935ece64c381a49e80020d8db977e4bda Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 18 Jul 2024 09:09:02 -0500 Subject: [PATCH 01/68] Add typos CI, fix typos --- .github/workflows/ci.yml | 7 ++++++ MEMO | 2 +- contrib/mem-pattern-explorer/pattern_vis.py | 2 +- doc/misc.rst | 8 +++---- loopy/__init__.py | 6 ++--- loopy/check.py | 4 ++-- loopy/frontend/fortran/translator.py | 4 ++-- loopy/isl_helpers.py | 2 +- loopy/kernel/__init__.py | 4 ++-- loopy/kernel/creation.py | 6 ++--- loopy/kernel/function_interface.py | 2 +- loopy/kernel/instruction.py | 6 ++--- loopy/kernel/tools.py | 16 ++++++------- loopy/match.py | 2 +- loopy/options.py | 4 ++-- loopy/preprocess.py | 4 ++-- loopy/schedule/__init__.py | 6 ++--- loopy/statistics.py | 12 +++++----- loopy/symbolic.py | 4 ++-- loopy/target/c/c_execution.py | 10 ++++---- loopy/target/execution.py | 12 +++++----- loopy/target/ispc.py | 2 +- loopy/target/opencl.py | 4 ++-- loopy/target/pyopencl.py | 8 +++---- loopy/transform/array_buffer_map.py | 8 +++---- loopy/transform/callable.py | 2 +- loopy/transform/data.py | 4 ++-- loopy/transform/diff.py | 6 ++--- loopy/transform/iname.py | 4 ++-- loopy/transform/instruction.py | 2 +- loopy/transform/subst.py | 8 +++---- loopy/translation_unit.py | 2 +- loopy/types.py | 2 +- proto-tests/test_tim.py | 2 +- pyproject.toml | 26 +++++++++++++++++++++ test/test_apps.py | 2 +- test/test_c_execution.py | 16 ++++++------- test/test_callables.py | 4 ++-- test/test_loopy.py | 2 +- 39 files changed, 130 insertions(+), 97 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 32707fe86..2c6c77e77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,6 +20,13 @@ jobs: pipx install ruff ruff check + typos: + name: Typos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: crate-ci/typos@master + pylint: name: Pylint runs-on: ubuntu-latest diff --git a/MEMO b/MEMO index f4e5c34e4..5a9438811 100644 --- 
a/MEMO +++ b/MEMO @@ -7,7 +7,7 @@ Documentation Notes Things to consider ^^^^^^^^^^^^^^^^^^ -- Depedencies are pointwise for shared loop dimensions +- Dependencies are pointwise for shared loop dimensions and global over non-shared ones (between dependent and ancestor) - multiple insns could fight over which iname gets local axis 0 diff --git a/contrib/mem-pattern-explorer/pattern_vis.py b/contrib/mem-pattern-explorer/pattern_vis.py index 82a2b9602..bbde23174 100644 --- a/contrib/mem-pattern-explorer/pattern_vis.py +++ b/contrib/mem-pattern-explorer/pattern_vis.py @@ -76,7 +76,7 @@ def tick(self): class Array: def __init__(self, ctx, name, shape, strides, elements_per_row=None): # Each array element stores a tuple: - # (timestamp, subgroup, g0, g1, g2, ) of last acccess + # (timestamp, subgroup, g0, g1, g2, ) of last access assert len(shape) == len(strides) diff --git a/doc/misc.rst b/doc/misc.rst index 3fea6fdd4..be1c964cd 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -158,7 +158,7 @@ In the meantime, you can generate code simply by saying:: print(cg_result.host_code()) print(cg_result.device_code()) -Additionally, for C-based languages, header defintions are available via:: +Additionally, for C-based languages, header definitions are available via:: loopy.generate_header(knl) @@ -338,8 +338,8 @@ This list is always growing, but here are a few pointers: Use :func:`loopy.join_inames`. -In what sense does Loopy suport vectorization? ----------------------------------------------- +In what sense does Loopy support vectorization? +----------------------------------------------- There are really two ways in which the OpenCL/CUDA model of computation exposes vectorization: @@ -352,7 +352,7 @@ vectorization: e.g. ``float4``, which support arithmetic with implicit vector semantics as well as a number of 'intrinsic' functions. -Loopy suports both. The first one, SIMT, is accessible by tagging inames with, +Loopy supports both. 
The first one, SIMT, is accessible by tagging inames with, e.g., ``l.0```. Accessing the second one requires using both execution- and data-reshaping capabilities in loopy. To start with, you need an array that has an axis with the length of the desired vector. If that's not yet available, diff --git a/loopy/__init__.py b/loopy/__init__.py index 275d4f26e..1eebb8223 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -563,18 +563,18 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None): indices = ["i%d" % i for i in range(rank)] shape = ["n%d" % i for i in range(rank)] - commad_indices = ", ".join(indices) + comma_indices = ", ".join(indices) bounds = " and ".join( f"0<={ind}<{shape_i}" for ind, shape_i in zip(indices, shape)) set_str = "{{[{}]: {} }}".format( - commad_indices, + comma_indices, bounds ) result = make_kernel(set_str, "output[%s] = input[%s]" - % (commad_indices, commad_indices), + % (comma_indices, comma_indices), lang_version=MOST_RECENT_LANGUAGE_VERSION, default_offset=auto) diff --git a/loopy/check.py b/loopy/check.py index 17887cff5..c2b3d8cd3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1123,7 +1123,7 @@ def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order): # for each *pred*, we will calculate all the direct/indirect # instructions that can be reached. 
seen_successors = set() - # first let us start with direct sucessors + # first let us start with direct successors to_check = edges[pred].copy() while to_check: successor = to_check.pop() @@ -1219,7 +1219,7 @@ def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * a direct/indirect depdendency edge, or + * a direct/indirect dependency edge, or * an explicit statement that no ordering is necessary (expressed through a bi-directional :attr:`loopy.InstructionBase.no_sync_with`) """ diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 530e92678..fc9eace87 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -200,7 +200,7 @@ def get_type(self, name, none_ok=False): return None raise TranslationError( - "no type for '%s' found in 'implict none' routine" + "no type for '%s' found in 'implicit none' routine" % name) from None return self.implicit_types.get(name[0], np.dtype(np.int32)) @@ -426,7 +426,7 @@ def map_Implicit(self, node): scope.implicit_types = None for stmt, specs in node.items: - if scope.implict_types is None: + if scope.implicit_types is None: raise TranslationError("implicit decl not allowed after " "'implicit none'") tp = self.dtype_from_stmt(stmt) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 160b6415b..28aa3be30 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -186,7 +186,7 @@ def simplify_pw_aff(pw_aff, context=None): continue if aff_i.gist(dom_j).is_equal(aff_j): - # aff_i is sufficient to conver aff_j, eliminate aff_j + # aff_i is sufficient to cover aff_j, eliminate aff_j new_pieces = pieces[:] if i < j: new_pieces.pop(j) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index a9b3bb07e..7bf4cb845 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -164,7 +164,7 @@ class 
LoopKernel(Taggable): .. attribute:: loop_priority A frozenset of priority constraints to the kernel. Each such constraint - is a tuple of inames. Inames occuring in such a tuple will be scheduled + is a tuple of inames. Inames occurring in such a tuple will be scheduled earlier than any iname following in the tuple. This applies only to inames with non-parallel implementation tags. @@ -515,7 +515,7 @@ def get_leaf_domain_indices(self, inames): for iname in inames: home_domain_index = hdm[iname] if home_domain_index in domain_indices: - # nothin' new + # nothing new continue domain_path_to_root = [home_domain_index] + ppd[home_domain_index] diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c4cc880a0..f359eec33 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1884,7 +1884,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic @for_each_kernel -def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True, +def apply_single_writer_dependency_heuristic(kernel, warn_if_used=True, error_if_used=False): logger.debug("%s: default deps" % kernel.name) @@ -2023,7 +2023,7 @@ class SliceToInameReplacer(IdentityMapper): .. attribute:: subarray_ref_bounds A :class:`list` (one entry for each :class:`SubArrayRef` to be created) - of :class:`dict` instances to store the slices enountered in the + of :class:`dict` instances to store the slices encountered in the expressions as a mapping from ``iname`` to a tuple of ``(start, stop, step)``, which describes the boxy (i.e. 
affine) constraints imposed on the ``iname`` by the corresponding slice notation its intended to @@ -2574,7 +2574,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): knl = guess_arg_shape_if_requested(knl, default_order) knl = apply_default_order_to_args(knl, default_order) knl = resolve_dependencies(knl) - knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=False) + knl = apply_single_writer_dependency_heuristic(knl, warn_if_used=False) # ------------------------------------------------------------------------- # Ordering dependency: diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e3fcf108a..e81e4dafc 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -672,7 +672,7 @@ def is_type_specialized(self): class CallableKernel(InKernelCallable): """ - Records informations about a callee kernel. Also provides interface through + Records information about a callee kernel. Also provides interface through member methods to make the callee kernel compatible to be called from a caller kernel. diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 198b7c03f..b9b86b53b 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -934,7 +934,7 @@ class CallInstruction(MultiAssignmentBase): A tuple of `:class:loopy.Optional`. If an entry is not empty, it contains the type that will be assigned to the new temporary variable - created from the assigment. + created from the assignment. .. automethod:: __init__ """ @@ -1099,7 +1099,7 @@ def is_array_call(assignees, expression): Returns *True* is the instruction is an array call. An array call is a function call applied to array type objects. If any of - the arguemnts or assignees to the function is an array, + the arguments or assignees to the function is an array, :meth:`is_array_call` will return *True*. 
""" from pymbolic.primitives import Call, Subscript @@ -1460,7 +1460,7 @@ class BarrierInstruction(_DataObliviousInstruction): .. attribute:: mem_kind A string, ``"global"`` or ``"local"``. Chooses which memory type to - sychronize, for targets that require this (e.g. OpenCL) + synchronize, for targets that require this (e.g. OpenCL) The textual syntax in a :mod:`loopy` kernel is:: diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 5ed9b2ad3..0826ed010 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -263,7 +263,7 @@ def find_all_insn_inames(kernel): if insn.within_inames_is_final: continue - # {{{ depdency-based propagation + # {{{ dependency-based propagation inames_old = insn_id_to_inames[insn.id] inames_new = inames_old | guess_iname_deps_based_on_var_use( @@ -513,8 +513,8 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, """ # make sure all automatically added stuff shows up - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=False) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + kernel = apply_single_writer_dependency_heuristic(kernel, warn_if_used=False) if iname_cluster and not kernel.linearization: try: @@ -1252,9 +1252,9 @@ def find_recursive_dependencies(kernel, insn_ids): for insn_id in queue: insn = kernel.id_to_insn[insn_id] - additionals = insn.depends_on - result - result.update(additionals) - new_queue.extend(additionals) + additional = insn.depends_on - result + result.update(additional) + new_queue.extend(additional) queue = new_queue @@ -1735,7 +1735,7 @@ def get_global_barrier_order(kernel): @memoize_on_first_arg def find_most_recent_global_barrier(kernel, insn_id): - """Return the id of the latest occuring global barrier which the + """Return the id of the latest occurring global barrier which the given instruction (indirectly or directly) depends on, or 
*None* if this instruction does not depend on a global barrier. @@ -1995,7 +1995,7 @@ def infer_args_are_input_output(kernel): elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)): pass else: - raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + raise NotImplementedError("Unknown argument type %s." % type(arg)) if not (arg.is_input or arg.is_output): raise LoopyError("Kernel argument must be either input or output." diff --git a/loopy/match.py b/loopy/match.py index 889f4e74f..5e409791b 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -1,4 +1,4 @@ -"""Matching functionality for instruction ids and subsitution +"""Matching functionality for instruction ids and substitution rule invocations stacks.""" diff --git a/loopy/options.py b/loopy/options.py index 9c4fa0fb4..d58421e3e 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -118,7 +118,7 @@ class Options(ImmutableRecord): .. attribute:: cl_exec_manage_array_events - Within the PyOpenCL executor, respect and udpate + Within the PyOpenCL executor, respect and update :attr:`pyopencl.array.Array.events`. Defaults to *True*. @@ -156,7 +156,7 @@ class Options(ImmutableRecord): Allow re-ordering of floating point arithmetic. Re-ordering may give different results as floating point arithmetic is not - associative in addition and mulitplication. Default is *True*. + associative in addition and multiplication. Default is *True*. Note that the implementation of this option is currently incomplete. .. 
attribute:: build_options diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d24e14cc2..7176d9d15 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -825,8 +825,8 @@ def preprocess_program(t_unit: TranslationUnit) -> TranslationUnit: from loopy.transform.subst import expand_subst t_unit = expand_subst(t_unit) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - t_unit = apply_single_writer_depencency_heuristic(t_unit) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + t_unit = apply_single_writer_dependency_heuristic(t_unit) # Ordering restrictions: # diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index ca45521e3..6249b36ba 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -718,7 +718,7 @@ def get_insns_in_topologically_sorted_order( for dep in insn.depends_on: rev_dep_map[dep].add(insn.id) - # For breaking ties, we compare the features of an intruction + # For breaking ties, we compare the features of an instruction # so that instructions with the same set of features are lumped # together. This helps in :method:`schedule_as_many_run_insns_as_possible` # which bails after 5 insns that don't have the same feature. @@ -1196,7 +1196,7 @@ def insn_sort_key(insn_id): print( "%(warn)swarning:%(reset_all)s '%(iname)s', " "which the schedule is " - "currently stuck inside of, seems mis-nested. " + "currently stuck inside of, seems misnested. " "'%(subdep)s' must occur " "before '%(dep)s', " "but '%(subdep)s must be outside " "'%(iname)s', whereas '%(dep)s' must be back " @@ -1404,7 +1404,7 @@ def insn_sort_key(insn_id): get_priority_tiers(wanted, sched_state.kernel.loop_priority)) # Update the loop priority set, because some constraints may have - # have been contradictary. + # been contradictory. 
loop_priority_set = set().union(*[set(t) for t in priority_tiers]) priority_tiers.append( diff --git a/loopy/statistics.py b/loopy/statistics.py index c9cf9d938..0bd1340c1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -629,7 +629,7 @@ class Op(ImmutableRecord): work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. + analogous to an NVIDIA CUDA warp. .. attribute:: kernel_name @@ -723,7 +723,7 @@ class MemAccess(ImmutableRecord): work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. + analogous to an NVIDIA CUDA warp. .. attribute:: kernel_name @@ -1109,7 +1109,7 @@ def _get_lid_and_gid_strides(knl, array, index): # create lid_strides and gid_strides dicts - # strides are coefficents in flattened index, i.e., we want + # strides are coefficients in flattened index, i.e., we want # lid_strides = {0:l0, 1:l1, 2:l2, ...} and # gid_strides = {0:g0, 1:g1, 2:g2, ...}, # where l0, l1, l2, g0, g1, and g2 come from flattened index @@ -1723,7 +1723,7 @@ def get_op_map(program, count_redundant_work=False, :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within - a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, + a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. 
If set to *None* an attempt to find the sub-group size using the device will be @@ -1921,7 +1921,7 @@ def get_mem_access_map(program, count_redundant_work=False, :arg subgroup_size: An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when + analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails @@ -2085,7 +2085,7 @@ def get_synchronization_map(program, subgroup_size=None, entrypoint=None): :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within - a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, + a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be diff --git a/loopy/symbolic.py b/loopy/symbolic.py index d56b54e79..2a1b140cc 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -683,7 +683,7 @@ class TaggedVariable(LoopyExpressionBase, p.Variable, Taggable): A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to provide metadata on this object. Legacy string tags are converted to :class:`~loopy.LegacyStringInstructionTag` or, if they used to carry - a functional meaning, the tag carrying that same fucntional meaning + a functional meaning, the tag carrying that same functional meaning (e.g. :class:`~loopy.UseStreamingStoreTag`). 
Inherits from :class:`pymbolic.primitives.Variable` @@ -737,7 +737,7 @@ class Reduction(LoopyExpressionBase): .. attribute:: allow_simultaneous A :class:`bool`. If not *True*, an iname is allowed to be used - in precisely one reduction, to avoid mis-nesting errors. + in precisely one reduction, to avoid misnesting errors. """ init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index fc3238e92..9cde501a7 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -94,21 +94,21 @@ def python_dtype_str_inner(self, dtype): return f"_lpy_np.dtype(_lpy_np.{name})" raise Exception(f"dtype: {dtype} not recognized") - # {{{ handle non numpy arguements + # {{{ handle non numpy arguments def handle_non_numpy_arg(self, gen, arg): pass # }}} - # {{{ handle allocation of unspecified arguements + # {{{ handle allocation of unspecified arguments def handle_alloc( self, gen: CodeGenerator, arg: ArrayArg, strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str], skip_arg_checks: bool) -> None: """ - Handle allocation of non-specified arguements for C-execution + Handle allocation of non-specified arguments for C-execution """ from pymbolic import var @@ -181,7 +181,7 @@ def target_specific_preamble(self, gen): def initialize_system_args(self, gen): """ - Initializes possibly empty system arguements + Initializes possibly empty system arguments """ pass @@ -238,7 +238,7 @@ class CCompiler: The general strategy here is as follows: 1. A :class:`codepy.Toolchain` is guessed from distutils. - The user may override any flags obtained therein by passing in arguements + The user may override any flags obtained therein by passing in arguments to cc, cflags, etc. 2. 
The kernel source is built into and object first, then made into a shared diff --git a/loopy/target/execution.py b/loopy/target/execution.py index cb081a3e5..21600c734 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -377,21 +377,21 @@ def generate_value_arg_check( # }}} - # {{{ handle non numpy arguements + # {{{ handle non numpy arguments def handle_non_numpy_arg(self, gen: CodeGenerator, arg): raise NotImplementedError() # }}} - # {{{ handle allocation of unspecified arguements + # {{{ handle allocation of unspecified arguments def handle_alloc( self, gen: CodeGenerator, arg: ArrayArg, strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str], skip_arg_checks: bool) -> None: """ - Handle allocation of non-specified arguements for C-execution + Handle allocation of non-specified arguments for C-execution """ raise NotImplementedError() @@ -647,7 +647,7 @@ def target_specific_preamble(self, gen): def initialize_system_args(self, gen): """ - Override to intialize any default system args + Override to initialize any default system args """ raise NotImplementedError() @@ -674,7 +674,7 @@ def __call__(self, program, entrypoint, codegen_result): """ Generates the wrapping python invoker for this execution target - :arg kernel: the loopy :class:`LoopKernel`(s) to be executued + :arg kernel: the loopy :class:`LoopKernel`(s) to be executed :codegen_result: the loopy :class:`CodeGenerationResult` created by code generation @@ -944,7 +944,7 @@ def __call__(self, queue, **kwargs): # }}} -# {{{ code highlighers +# {{{ code highlighters def get_highlighted_code(text, python=False): diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index ce2a150b0..31d1cfd2d 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -112,7 +112,7 @@ def map_subscript(self, expr, type_context): if (isinstance(ary, TemporaryVariable) and ary.address_space == AddressSpace.PRIVATE): - # generate access code for acccess to private-index temporaries + # 
generate access code for access to private-index temporaries gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index e2f3ecda2..14383e54f 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -642,7 +642,7 @@ def get_function_declaration( from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) if not codegen_state.is_entrypoint: - # auxiliary kernels need not mention opencl speicific qualifiers + # auxiliary kernels need not mention opencl specific qualifiers # for a functions signature return preambles, fdecl @@ -908,7 +908,7 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, # }}} -# {{{ volatile mem acccess target +# {{{ volatile mem access target class VolatileMemExpressionToOpenCLCExpressionMapper( ExpressionToOpenCLCExpressionMapper): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ec702e39b..ecaea9b57 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -324,7 +324,7 @@ def binary_tree_add(start, end): # -ffp-contract=fast which is the default for PTX codegen, but # for some unknown reason, clang fails to see the FMAs. # - # We need to do this only for complex as we haev temporaries + # We need to do this only for complex as we have temporaries # only in complex. 
For reals, the code generated looks like # # res = c + a * b @@ -879,7 +879,7 @@ def get_kernel_call( value_arg_code = generate_value_arg_setup( codegen_state.kernel, regular_arg_names) - arry_arg_code = generate_array_arg_setup( + array_arg_code = generate_array_arg_setup( codegen_state.kernel, regular_arg_names) if struct_overflow_arg_names: @@ -952,7 +952,7 @@ def get_kernel_call( "argument count of the kernel ({_lpy_knl.num_args}).'"), Line(), value_arg_code, - arry_arg_code, + array_arg_code, overflow_args_code, Assign("_lpy_evt", f"{self.target.pyopencl_module_name}.enqueue_nd_range_kernel(" @@ -1207,7 +1207,7 @@ def get_expression_to_c_expression_mapper(self, codegen_state): # }}} -# {{{ volatile mem acccess target +# {{{ volatile mem access target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): def get_expression_to_c_expression_mapper(self, codegen_state): diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index ec3737233..7e7b6459c 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -413,17 +413,17 @@ def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs): except_inames=frozenset(self.primed_sweep_inames)) s2s_domain = stor2sweep.domain() - s2s_domain, aligned_g_s2s_parm_dom = isl.align_two( + s2s_domain, aligned_g_s2s_param_dom = isl.align_two( s2s_domain, global_s2s_par_dom) arg_restrictions = ( - aligned_g_s2s_parm_dom + aligned_g_s2s_param_dom .eliminate(dim_type.set, 0, - aligned_g_s2s_parm_dom.dim(dim_type.set)) + aligned_g_s2s_param_dom.dim(dim_type.set)) .remove_divs()) return (arg_restrictions & s2s_domain).is_subset( - aligned_g_s2s_parm_dom) + aligned_g_s2s_param_dom) class NoOpArrayToBufferMap(ArrayToBufferMapBase): diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 1fe40a370..d683cbd29 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -102,7 +102,7 @@ def merge(translation_units): if 
(prg_i.callables_table[clbl_name] != prg_j.callables_table[clbl_name]): # TODO: generate unique names + rename for the colliding - # callables (if entrypoints are colliding that shuold still + # callables (if entrypoints are colliding that should still # be an error) raise NotImplementedError("Translation units to be merged" " must have different callable names" diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 088d89643..ddfc9b5e8 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -252,7 +252,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, footprint_subscripts, var_descr) # Our _not_provided is actually a different object from the one in the - # precompute module, but precompute acutally uses that to adjust its + # precompute module, but precompute actually uses that to adjust its # warning message. from loopy.transform.precompute import precompute_for_single_kernel @@ -653,7 +653,7 @@ def set_argument_order(kernel, arg_names): :arg arg_names: A list (or comma-separated string) or argument names. All arguments must be in this list. """ - # FIXME: @inducer -- shoulld this only affect the root kernel, or should it + # FIXME: @inducer -- should this only affect the root kernel, or should it # take a within? if isinstance(arg_names, str): diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index bb828221f..6c2688d90 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -154,7 +154,7 @@ def map_call(self, expr, *args): dc = self.diff_context if expr.function.name in dc.kernel.substitutions: - # FIXME: Deal with subsitution rules + # FIXME: Deal with substitution rules # Need to use chain rule here, too. 
raise NotImplementedError("substitution rules in differentiation") else: @@ -382,8 +382,8 @@ def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", assert isinstance(kernel, LoopKernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + kernel = apply_single_writer_dependency_heuristic(kernel, warn_if_used=True) if isinstance(diff_outputs, str): diff_outputs = [ diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 18df3dae4..b835373da 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1540,7 +1540,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): break if not found: - raise LoopyError("invlaid tag kind: %s" % kind) + raise LoopyError("invalid tag kind: %s" % kind) from loopy.match import parse_match match = parse_match(insn_match) @@ -2265,7 +2265,7 @@ def add_inames_for_unused_hw_axes(kernel, within=None): Current limitations: * Only one iname in the kernel may be tagged with each of the unused hw axes. - * Occurence of an ``l.auto`` tag when an instruction is missing one of the + * Occurrence of an ``l.auto`` tag when an instruction is missing one of the local hw axes. :arg within: An instruction match as understood by diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 629916628..374587da5 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -425,7 +425,7 @@ def insns_in_conflicting_groups(insn1_id, insn2_id): if not nosync_to_add and not empty_ok: raise LoopyError("No nosync annotations were added as a result " "of this call. add_nosync will (by default) only add them to " - "accompany existing depencies or group exclusions. Maybe you want " + "accompany existing dependencies or group exclusions. 
Maybe you want " "to pass force=True?") new_instructions = list(kernel.instructions) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index b5c7aa7a1..422d22568 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -327,8 +327,8 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ establish the relevant definition of lhs_name for each usage site dep_kernel = expand_subst(kernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - dep_kernel = apply_single_writer_depencency_heuristic(dep_kernel) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + dep_kernel = apply_single_writer_dependency_heuristic(dep_kernel) assigning_insn_ids = {insn.id for insn in dep_kernel.instructions if lhs_name in insn.assignee_var_names()} @@ -354,7 +354,7 @@ def get_relevant_definition_insn_id(usage_insn_id): if len(rel_def_ids) > 1: raise LoopyError("more than one write to '%s' found in " - "depdendencies of '%s'--definition cannot be resolved " + "dependencies of '%s'--definition cannot be resolved " "(writer instructions ids: %s)" % (lhs_name, usage_insn_id, ", ".join(rel_def_ids))) @@ -433,7 +433,7 @@ def _accesses_lhs(kernel, insn, *args): for i in indices: if not isinstance(i, Variable): raise LoopyError("In defining instruction '%s': " - "asignee index '%s' is not a plain variable. " + "assignee index '%s' is not a plain variable. " "Perhaps use loopy.affine_map_inames() " "to perform substitution." % (def_id, i)) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index c0d1b0b05..76e795b76 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -196,7 +196,7 @@ class TranslationUnit: .. 
attribute:: func_id_to_in_knl_callables_mappers A :class:`frozenset` of functions of the signature ``(target: - TargetBase, function_indentifier: str)`` that returns an instance + TargetBase, function_identifier: str)`` that returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. .. automethod:: executor diff --git a/loopy/types.py b/loopy/types.py index 143715a39..a837d1c46 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -165,7 +165,7 @@ def __repr__(self): class OpaqueType(LoopyType): """An opaque data type is truly opaque - it has no allocations, no temporaries of that type, etc. The only thing allowed is to be pass in - through one ValueArg and go out to another. It is introduced to accomodate + through one ValueArg and go out to another. It is introduced to accommodate functional calls to external libraries. """ def __init__(self, name: str) -> None: diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py index 7ee30313c..eb8125cdb 100644 --- a/proto-tests/test_tim.py +++ b/proto-tests/test_tim.py @@ -190,7 +190,7 @@ def test_tim3d(ctx_factory): knl = lp.split_iname(knl, "k", n, inner_tag="l.2") # , slabs=(0, 1)) knl = lp.split_iname(knl, "i", n, inner_tag="l.0") # , slabs=(0, 1)) -# knl = lp.tag_inames(knl, dict(k_nner="unr")) +# knl = lp.tag_inames(knl, dict(k_inner="unr")) knl = lp.tag_inames(knl, dict(o="unr")) knl = lp.tag_inames(knl, dict(m="unr")) diff --git a/pyproject.toml b/pyproject.toml index 6f7b977b6..9dadd57f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,3 +90,29 @@ module = [ "IPython.*", ] ignore_missing_imports = true + +[tool.typos.default] +extend-ignore-re = [ + "(?Rm)^.*(#|//)\\s*spellchecker:\\s*disable-line$" +] + +[tool.typos.default.extend-words] +# like the numpy function, array range +arange = "arange" +# N-Dimensional +ND = "ND" +# used for 'diff_output' +dout = "dout" +# an element-wise slice of array u +ue = "ue" +# used in an ordering context, "ab" / "ba" +ba = "ba" + 
+"dependees" = "dependees" + +[tool.typos.files] +extend-exclude = [ + "loopy/target/c/compyte", + "notes/*/*.eps", +] + diff --git a/test/test_apps.py b/test/test_apps.py index 207bc7ee2..c4cffaee1 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -324,7 +324,7 @@ def test_rob_stroud_bernstein_full(): def test_stencil(ctx_factory): ctx = ctx_factory() - # n=32 causes corner case behavior in size calculations for temprorary (a + # n=32 causes corner case behavior in size calculations for temporary (a # non-unifiable, two-constant-segments PwAff as the base index) n = 256 diff --git a/test/test_c_execution.py b/test/test_c_execution.py index e703d9415..6208b9aed 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -95,17 +95,17 @@ def test_c_target_strides_nonsquare(): from loopy.target.c import ExecutableCTarget def __get_kernel(order="C"): - indicies = ["i", "j", "k"] - sizes = tuple(np.random.randint(1, 11, size=len(indicies))) + indices = ["i", "j", "k"] + sizes = tuple(np.random.randint(1, 11, size=len(indices))) # create domain strings domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}" domains = [] - for idx, size in zip(indicies, sizes): + for idx, size in zip(indices, sizes): domains.append(domain_template.format( iname=idx, size=size)) statement = "out[{indexed}] = 2 * a[{indexed}]".format( - indexed=", ".join(indicies)) + indexed=", ".join(indices)) return lp.make_kernel( domains, statement, @@ -142,17 +142,17 @@ def test_c_optimizations(): from loopy.target.c import ExecutableCTarget def __get_kernel(order="C"): - indicies = ["i", "j", "k"] - sizes = tuple(np.random.randint(1, 11, size=len(indicies))) + indices = ["i", "j", "k"] + sizes = tuple(np.random.randint(1, 11, size=len(indices))) # create domain strings domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}" domains = [] - for idx, size in zip(indicies, sizes): + for idx, size in zip(indices, sizes): domains.append(domain_template.format( iname=idx, 
size=size)) statement = "out[{indexed}] = 2 * a[{indexed}]".format( - indexed=", ".join(indicies)) + indexed=", ".join(indices)) return lp.make_kernel( domains, statement, diff --git a/test/test_callables.py b/test/test_callables.py index d58247a75..44a94e43a 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -1397,8 +1397,8 @@ def test_inline_deps(ctx_factory): prg = lp.merge([parent_knl, child_knl]) inlined = lp.inline_callable_kernel(prg, "func") - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - apply_single_writer_depencency_heuristic(inlined, error_if_used=True) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + apply_single_writer_dependency_heuristic(inlined, error_if_used=True) _evt, (a_dev,) = inlined(cq) diff --git a/test/test_loopy.py b/test/test_loopy.py index e9aa47ef4..34310171f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2306,7 +2306,7 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): from testlib import GridOverride - # artifically expand via overridden_get_grid_sizes_for_insn_ids + # artificially expand via overridden_get_grid_sizes_for_insn_ids knl = prog["loopy_kernel"] knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) From da5c786f12cb9626118b7aa107b4d8e0a92f8743 Mon Sep 17 00:00:00 2001 From: Matthias Diener Date: Thu, 18 Jul 2024 15:00:41 -0500 Subject: [PATCH 02/68] InstructionBase: implement _with_new_tags --- loopy/kernel/instruction.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index b9b86b53b..fe593b00b 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -463,6 +463,9 @@ def __setstate__(self, val): self.within_inames = ( intern_frozenset_of_ids(self.within_inames)) + def _with_new_tags(self, tags: FrozenSet[Tag]): + return self.copy(tags=tags) + # }}} From 07cee43b503591bec5d32f3683f7ed5392dce3e1 Mon 
Sep 17 00:00:00 2001 From: Matthias Diener Date: Thu, 18 Jul 2024 18:35:15 -0500 Subject: [PATCH 03/68] TypeInferenceMapper: allow np.bool in map_type_cast --- loopy/type_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index b997cee25..26be6b352 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -396,7 +396,7 @@ def map_constant(self, expr): def map_type_cast(self, expr): subtype, = self.rec(expr.child) - if not issubclass(subtype.dtype.type, np.number): + if not issubclass(subtype.dtype.type, (np.number, np.bool_)): raise LoopyError(f"Can't cast a '{subtype}' to '{expr.type}'") return [expr.type] From 538b542fff77c1d4f79c97962cf7a82e261c2a0f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 19 Jul 2024 04:43:27 -0500 Subject: [PATCH 04/68] Emit better docs for add_prefetch * Be explicit about what args are accepted. * fetch_insn_id -> prefetch_insn_id * program -> t_unit --- loopy/transform/data.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index ddfc9b5e8..3a29740ce 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -292,7 +292,13 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, return new_kernel -def add_prefetch(program, *args, **kwargs): +def add_prefetch(t_unit, + var_name, sweep_inames=None, dim_arg_names=None, + default_tag=None, + rule_name=None, temporary_name=None, + temporary_address_space=None, temporary_scope=None, + footprint_subscripts=None, fetch_bounding_box=False, + fetch_outer_inames=None, prefetch_insn_id=None, within=None): """Prefetch all accesses to the variable *var_name*, with all accesses being swept through *sweep_inames*. @@ -379,7 +385,7 @@ def add_prefetch(program, *args, **kwargs): :arg fetch_outer_inames: The inames within which the fetch instruction is nested.
If *None*, make an educated guess. - :arg fetch_insn_id: The ID of the instruction generated to perform the + :arg prefetch_insn_id: The ID of the instruction generated to perform the prefetch. :arg within: a stack match as understood by @@ -388,14 +394,26 @@ def add_prefetch(program, *args, **kwargs): This function internally uses :func:`extract_subst` and :func:`precompute`. """ - assert isinstance(program, TranslationUnit) + assert isinstance(t_unit, TranslationUnit) new_callables = {} - for func_id, in_knl_callable in program.callables_table.items(): + for func_id, in_knl_callable in t_unit.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = add_prefetch_for_single_kernel( - in_knl_callable.subkernel, program.callables_table, - *args, **kwargs) + in_knl_callable.subkernel, t_unit.callables_table, + var_name=var_name, + sweep_inames=sweep_inames, + dim_arg_names=dim_arg_names, + default_tag=default_tag, + rule_name=rule_name, + temporary_name=temporary_name, + temporary_address_space=temporary_address_space, + temporary_scope=temporary_scope, + footprint_subscripts=footprint_subscripts, + fetch_bounding_box=fetch_bounding_box, + fetch_outer_inames=fetch_outer_inames, + prefetch_insn_id=prefetch_insn_id, + within=within) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -407,7 +425,7 @@ def add_prefetch(program, *args, **kwargs): new_callables[func_id] = in_knl_callable - return program.copy(callables_table=Map(new_callables)) + return t_unit.copy(callables_table=Map(new_callables)) # }}} From 6f257c24c8617e02eca7887fae8503c0d9dd0507 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 19 Jul 2024 06:08:51 -0500 Subject: [PATCH 05/68] spacing in error msg --- loopy/transform/privatize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py index e9b2b8c53..ef878c90f 100644 --- a/loopy/transform/privatize.py +++ 
b/loopy/transform/privatize.py @@ -247,7 +247,7 @@ def privatize_temporaries_with_inames( "Kernel '%s': Instruction '%s': touched variable that " "(for privatization, e.g. as performed for ILP) " "required iname(s) '%s', but that the instruction was not " - "previously within the iname(s). To remedy this, first promote" + "previously within the iname(s). To remedy this, first promote " "the instruction into the iname." % (kernel.name, insn.id, ", ".join( eiii.seen_priv_axis_inames - insn.within_inames))) From dd2da55a16ed7d2e0151728ffa9e12d34ec37348 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 23 Jul 2024 12:24:46 -0500 Subject: [PATCH 06/68] Improve error message for unrecognized base storage Closes gh-858 --- loopy/check.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index c2b3d8cd3..4ec3b46e1 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1312,9 +1312,12 @@ def check_for_nested_base_storage(kernel: LoopKernel) -> None: storage_array = name_to_array.get(ary.base_storage, None) if storage_array is None: - raise ValueError("nothing known about storage array " + raise LoopyError("Nothing known about storage array " f"'{ary.base_storage}' serving as base_storage of " - f"'{ary.name}'") + f"'{ary.name}'. " + "(Note: base storage is no longer automatically allocated. 
" + "Call allocate_temporaries_for_base_storage to automatically " + "allocate.)") if storage_array.base_storage: raise ValueError("storage array " From 88d876ccb9689e2ee040ac8c0efdd22e09bd7728 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 29 Jul 2024 11:33:33 -0500 Subject: [PATCH 07/68] [kernel.creation] duplicate inames only after adding all relevant inames --- loopy/kernel/creation.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f359eec33..36489de47 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2536,13 +2536,6 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): assert len(knl.instructions) == len(inames_to_dup) - from loopy import duplicate_inames - from loopy.match import Id - for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup): - for old_iname, new_iname in insn_inames_to_dup: - knl = duplicate_inames(knl, old_iname, - within=Id(insn.id), new_inames=new_iname) - check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) @@ -2563,6 +2556,27 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): knl = add_inferred_inames(knl) from loopy.transform.parameter import fix_parameters knl = fix_parameters(knl, **fixed_parameters) + + # ------------------------------------------------------------------------- + # Ordering dependency: + # ------------------------------------------------------------------------- + # Must duplicate inames after adding all the inames to the instructions. + # To duplicate an iname "i" in statement "S", lp.duplicate requires that + # the statement "S" be nested within the iname "i". 
+ # ------------------------------------------------------------------------- + from loopy import duplicate_inames + from loopy.match import Id + for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup): + for old_iname, new_iname in insn_inames_to_dup: + knl = duplicate_inames(knl, old_iname, + within=Id(insn.id), new_inames=new_iname) + new_insn = knl.id_to_insn[insn.id] + assert old_iname not in ( + new_insn.within_inames + | new_insn.reduction_inames() + | new_insn.sub_array_ref_inames() + ) + # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- From 98d5b62136c37c6f463715a6b3d2cc587eb8ba85 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 28 Jul 2024 19:14:03 -0500 Subject: [PATCH 08/68] Fix _InameDuplicator.within * Changed to update the inames of the instructions that do not access an iname, yet are nested within the iname to be duplicated.
--- loopy/transform/iname.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index b835373da..97257745c 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -21,7 +21,7 @@ """ -from typing import FrozenSet, Optional +from typing import Any, FrozenSet, Optional import islpy as isl from islpy import dim_type @@ -29,6 +29,7 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.instruction import InstructionBase from loopy.symbolic import ( RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, @@ -919,9 +920,13 @@ def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, old_to_new=dict(list(zip(inames, new_inames))), within=within) - def _does_access_old_inames(kernel, insn, *args): - return bool(frozenset(inames) & (insn.dependency_names() - | insn.reduction_inames())) + def _does_access_old_inames(kernel: LoopKernel, + insn: InstructionBase, + *args: Any) -> bool: + all_inames = (insn.within_inames + | insn.reduction_inames() + | insn.sub_array_ref_inames()) + return bool(frozenset(inames) & all_inames) kernel = rule_mapping_context.finish_kernel( indup.map_kernel(kernel, within=_does_access_old_inames, From f01a86b28cbb68613803382fd631ba6f425d47a6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 28 Jul 2024 19:22:02 -0500 Subject: [PATCH 09/68] Test iname duplication for cases with only loop-nest based iname-dependence --- test/test_transform.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index 8060d2038..4ad1971fb 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1704,6 +1704,35 @@ def test_precompute_lets_inner_length1_inames_live(): == parse("(e_inner + e_outer*16) / i_0")) +def 
test_duplicate_iname_not_read_only_nested(ctx_factory): + # See + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[i, j]: 0<=i,j<10}", + """ + for i + <> acc = 0 {id=init, tags=foo} + for j + acc = acc + A[i, j] * x[i, j] {id=update, tags=foo} + end + y[i] = acc {id=assign, tags=foo} + end + """, + [lp.GlobalArg("A,x,y", shape=lp.auto, dtype=np.float32), + ...], + ) + ref_t_unit = t_unit + + t_unit = lp.duplicate_inames( + t_unit, + inames="i", within="tag:foo", new_inames="irow") + print(t_unit) + assert (t_unit.default_entrypoint.id_to_insn["init"].within_inames + == frozenset({"irow"})) + + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From d77b54f9cde00dc215ef24442412ae7f3bb1ee52 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 2 Aug 2024 12:05:53 -0500 Subject: [PATCH 10/68] Ruff 0.5.6 fixes --- examples/fortran/ipython-integration-demo.ipynb | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index d9ac1f1b2..64fcb0af4 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(prog)" + "print(prog) # noqa: F821" ] }, { @@ -105,17 +105,8 @@ "metadata": {}, "outputs": [], "source": [ - "print(prog)" + "print(prog) # noqa: F821" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { From c73db678962c9a2794030720a8e4dbd7fd18a3ec Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 2 Aug 2024 05:11:38 -0500 Subject: [PATCH 11/68] Add helpful error msg for type uninferred temps --- loopy/transform/data.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 
3a29740ce..ea6cc0fc1 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -1025,6 +1025,11 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel, raise LoopyError( f"Temporary '{tv.name}' has an offset and no base_storage. " "That's not allowed.") + if not isinstance(tv.dtype, LoopyType): + raise LoopyError( + f"Dtype of temporary '{tv.name}' " + " is not inferred. Call lp.infer_unknown_types" + " first.") if (tv.base_storage and tv.base_storage not in kernel.temporary_variables From ad3618fcc3fb4c2bddbc8cc8348a04dfc2845f1a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 2 Aug 2024 05:12:34 -0500 Subject: [PATCH 12/68] Fix CUDA local temp var allocation with base storage --- loopy/target/c/__init__.py | 69 ++++++++++++++++++++++---------------- loopy/target/cuda.py | 41 +++++++++++++++++++++- 2 files changed, 81 insertions(+), 29 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index a99b7d065..56e5dd872 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -35,6 +35,7 @@ Const, Declarator, Generable, + Initializer, NestedDeclarator, Pointer, ) @@ -800,7 +801,7 @@ def get_function_definition( kernel = codegen_state.kernel assert kernel.linearization is not None - from cgen import FunctionBody, Initializer, Line + from cgen import FunctionBody, Line result = [] @@ -883,6 +884,42 @@ def get_kernel_call(self, codegen_state: CodeGenerationState, lsize: Tuple[ExpressionT, ...]) -> Optional[Generable]: return None + def emit_temp_var_decl_for_tv_with_base_storage(self, + codegen_state: CodeGenerationState, + tv: TemporaryVariable) -> Generable: + """ + Returns the statement for initializing a :class:`loopy.TemporaryVariable` + with a user-provided :attr:`loopy.TemporaryVariable.base_storage`. 
+ """ + assert tv.base_storage is not None + assert isinstance(tv.address_space, AddressSpace) + ecm = codegen_state.expression_to_code_mapper + + cast_decl = POD(self, tv.dtype, "") + temp_var_decl = POD(self, tv.dtype, tv.name) + + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: + # The 'restrict' part of this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? + ptrtype = _ConstRestrictPointer + + cast_decl = self.wrap_decl_for_address_space( + ptrtype(cast_decl), tv.address_space) + temp_var_decl = self.wrap_decl_for_address_space( + ptrtype(temp_var_decl), tv.address_space) + + cast_tp, cast_d = cast_decl.get_decl_pair() + return Initializer( + temp_var_decl, + "({} {}) ({} + {})".format( + " ".join(cast_tp), cast_d, tv.base_storage, ecm(tv.offset) + ), + ) + def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import AddressSpace @@ -930,33 +967,9 @@ def get_temporary_decls(self, codegen_state, schedule_index): else: assert tv.initializer is None - - cast_decl = POD(self, tv.dtype, "") - temp_var_decl = POD(self, tv.dtype, tv.name) - - if tv._base_storage_access_may_be_aliasing: - ptrtype = _ConstPointer - else: - # The 'restrict' part of this is a complete lie--of course - # all these temporaries are aliased. But we're promising to - # not use them to shovel data from one representation to the - # other. That counts, right? 
- ptrtype = _ConstRestrictPointer - - cast_decl = self.wrap_decl_for_address_space( - ptrtype(cast_decl), tv.address_space) - temp_var_decl = self.wrap_decl_for_address_space( - ptrtype(temp_var_decl), tv.address_space) - - cast_tp, cast_d = cast_decl.get_decl_pair() - temp_var_decl = Initializer( - temp_var_decl, - "({} {}) ({} + {})".format( - " ".join(cast_tp), cast_d, - tv.base_storage, - ecm(tv.offset) - )) - + temp_var_decl = self.emit_temp_var_decl_for_tv_with_base_storage( + codegen_state, tv + ) temp_decls_using_base_storage.append(temp_var_decl) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 3155b3bff..afeb5cee2 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -35,7 +35,13 @@ from loopy.codegen.result import CodeGenerationResult from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag -from loopy.kernel.data import AddressSpace, ArrayArg, ConstantArg, ImageArg +from loopy.kernel.data import ( + AddressSpace, + ArrayArg, + ConstantArg, + ImageArg, + TemporaryVariable, +) from loopy.kernel.function_interface import ScalarCallable from loopy.target.c import CFamilyASTBuilder, CFamilyTarget from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -462,6 +468,39 @@ def get_image_arg_declarator( self, arg: ImageArg, is_written: bool) -> Declarator: raise NotImplementedError("not yet: texture arguments in CUDA") + def emit_temp_var_decl_for_tv_with_base_storage(self, + codegen_state: CodeGenerationState, + tv: TemporaryVariable) -> Generable: + from cgen import Initializer + + from loopy.target.c import POD, _ConstPointer, _ConstRestrictPointer + + assert tv.base_storage is not None + ecm = codegen_state.expression_to_code_mapper + + cast_decl = POD(self, tv.dtype, "") + temp_var_decl = POD(self, tv.dtype, tv.name) + + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: + # The 'restrict' part of 
this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? + ptrtype = _ConstRestrictPointer + + cast_decl = ptrtype(cast_decl) + temp_var_decl = ptrtype(temp_var_decl) + + cast_tp, cast_d = cast_decl.get_decl_pair() + return Initializer( + temp_var_decl, + "({} {}) ({} + {})".format( + " ".join(cast_tp), cast_d, tv.base_storage, ecm(tv.offset) + ), + ) + # }}} # {{{ atomics From 421ee2a09b95124fb40c545ef24d6ad86a67262f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 2 Aug 2024 15:12:39 -0500 Subject: [PATCH 13/68] Migrate package info to pyproject.toml Also drop _git_rev mechanism --- .gitignore | 2 - loopy/version.py | 33 +++--------- pyproject.toml | 73 ++++++++++++++++++++++++++ setup.py | 130 ----------------------------------------------- 4 files changed, 81 insertions(+), 157 deletions(-) delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 7cf3c4751..4378c7122 100644 --- a/.gitignore +++ b/.gitignore @@ -21,8 +21,6 @@ lextab.py yacctab.py .pytest_cache/* -loopy/_git_rev.py - .cache .env virtualenv-[0-9]*[0-9] diff --git a/loopy/version.py b/loopy/version.py index 09d8442a2..609e6c179 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -21,32 +21,16 @@ """ -# {{{ find install- or run-time git revision +import re +from importlib import metadata -import os +VERSION_TEXT = metadata.version("loopy") +_match = re.match("^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT) +assert _match is not None +VERSION_STATUS = _match.group(2) +VERSION = tuple(int(nr) for nr in _match.group(1).split(".")) -if os.environ.get("AKPYTHON_EXEC_IMPORT_UNAVAILABLE") is not None: - # We're just being exec'd by setup.py. We can't import anything. 
- _git_rev = None - -else: - import loopy._git_rev as _git_rev_mod # pylint: disable=no-name-in-module,import-error # noqa: E501 - _git_rev = _git_rev_mod.GIT_REVISION - - # If we're running from a dev tree, the last install (and hence the most - # recent update of the above git rev) could have taken place very long ago. - from pytools import find_module_git_revision - _runtime_git_rev = find_module_git_revision(__file__, n_levels_up=1) - if _runtime_git_rev is not None: - _git_rev = _runtime_git_rev - -# }}} - - -VERSION = (2024, 1) -VERSION_STATUS = "" -VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS try: import islpy.version @@ -62,8 +46,7 @@ else: _cgen_version = cgen.version.VERSION_TEXT -DATA_MODEL_VERSION = "{}-islpy{}-cgen{}-{}-v1".format( - VERSION_TEXT, _islpy_version, _cgen_version, _git_rev) +DATA_MODEL_VERSION = f"{VERSION_TEXT}-islpy{_islpy_version}-cgen{_cgen_version}-v1" FALLBACK_LANGUAGE_VERSION = (2018, 2) diff --git a/pyproject.toml b/pyproject.toml index 9dadd57f5..7f0a38ba4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,76 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = [ + "setuptools>=63", +] + +[project] +name = "loopy" +version = "2024.1" +description = "A code generator for array-based code on CPUs and GPUs" +readme = "README.rst" +license = { text = "MIT" } +authors = [ + { name = "Andreas Kloeckner", email = "inform@tiker.net" }, +] +requires-python = ">=3.8" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Visualization", + 
"Topic :: Software Development :: Libraries", + "Topic :: Utilities", +] +dependencies = [ + "pytools>=2024.1.5", + "pymbolic>=2022.1", + "genpy>=2016.1.2", + + # https://github.com/inducer/loopy/pull/419 + "numpy>=1.19", + + "cgen>=2016.1", + "islpy>=2019.1", + "codepy>=2017.1", + "colorama", + "Mako", + "pyrsistent", + "immutables", + "typing_extensions", +] +[project.optional-dependencies] +pyopencl = [ + "pyopencl>=2022.3", +] +fortran = [ + # Note that this is *not* regular 'f2py2e', this is + # the Fortran parser from the (unfinished) third-edition + # f2py, as linked below. This package is not on the package index, AFAIK. + # -AK, 2024-08-02 + "f2py @ git+https://github.com/pearu/f2py.git", + "ply>=3.6", +] + +[project.scripts] + +[project.urls] +Documentation = "https://documen.tician.de/loopy" +Homepage = "https://github.com/inducer/loopy" + +[tool.setuptools.packages.find] +include = [ + "loopy*", +] + [tool.ruff] preview = true diff --git a/setup.py b/setup.py deleted file mode 100644 index 0cf58f83a..000000000 --- a/setup.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python - -import os - -from setuptools import find_packages, setup - - -ver_dic = {} -version_file = open("loopy/version.py") -try: - version_file_contents = version_file.read() -finally: - version_file.close() - -os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1" -exec(compile(version_file_contents, "loopy/version.py", "exec"), ver_dic) - - -# {{{ capture git revision at install time - -# authoritative version in pytools/__init__.py -def find_git_revision(tree_root): - # Keep this routine self-contained so that it can be copy-pasted into - # setup.py. 
- - from os.path import abspath, exists, join - tree_root = abspath(tree_root) - - if not exists(join(tree_root, ".git")): - return None - - from subprocess import PIPE, STDOUT, Popen - p = Popen(["git", "rev-parse", "HEAD"], shell=False, - stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True, - cwd=tree_root) - (git_rev, _) = p.communicate() - - git_rev = git_rev.decode() - - git_rev = git_rev.rstrip() - - retcode = p.returncode - assert retcode is not None - if retcode != 0: - from warnings import warn - warn("unable to find git revision", stacklevel=1) - return None - - return git_rev - - -def write_git_revision(package_name): - from os.path import dirname, join - dn = dirname(__file__) - git_rev = find_git_revision(dn) - - with open(join(dn, package_name, "_git_rev.py"), "w") as outf: - outf.write('GIT_REVISION = "%s"\n' % git_rev) - - -write_git_revision("loopy") - -# }}} - - -setup(name="loopy", - version=ver_dic["VERSION_TEXT"], - description="A code generator for array-based code on CPUs and GPUs", - long_description=open("README.rst").read(), - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Intended Audience :: Other Audience", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Natural Language :: English", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Information Analysis", - "Topic :: Scientific/Engineering :: Mathematics", - "Topic :: Scientific/Engineering :: Visualization", - "Topic :: Software Development :: Libraries", - "Topic :: Utilities", - ], - - python_requires="~=3.8", - install_requires=[ - "pytools>=2024.1.5", - "pymbolic>=2022.1", - "genpy>=2016.1.2", - - # https://github.com/inducer/loopy/pull/419 - "numpy>=1.19", - - "cgen>=2016.1", - "islpy>=2019.1", - "codepy>=2017.1", - "colorama", - "Mako", - "pyrsistent", - "immutables", - "typing_extensions", - 
], - - extras_require={ - "pyopencl": [ - "pyopencl>=2022.3", - ], - "fortran": [ - # Note that this is *not* regular 'f2py2e', this is - # the Fortran parser from the (unfinished) third-edition - # f2py, as linked below. - "f2py>=0.3.1", - "ply>=3.6", - ], - }, - - dependency_links=[ - "git+https://github.com/pearu/f2py.git" - ], - - scripts=["bin/loopy"], - - author="Andreas Kloeckner", - url="https://mathema.tician.de/software/loopy", - author_email="inform@tiker.net", - license="MIT", - packages=find_packages(), - ) From 4009eee43651ee3009e4a774600d48d06d6a01c2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 2 Aug 2024 15:13:09 -0500 Subject: [PATCH 14/68] Add py.typed marker --- loopy/py.typed | 0 pyproject.toml | 4 ++++ 2 files changed, 4 insertions(+) create mode 100644 loopy/py.typed diff --git a/loopy/py.typed b/loopy/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/pyproject.toml b/pyproject.toml index 7f0a38ba4..af9bcb37d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,10 @@ include = [ "loopy*", ] +[tool.setuptools.package-data] +loopy = [ + "py.typed", +] [tool.ruff] preview = true From fef57342bce14dc28104c383e905989f4b37ec5d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:08:09 -0500 Subject: [PATCH 15/68] Bump mypy python_version to 3.10 for X | Y instead of Union --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index af9bcb37d..979d08dd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -140,7 +140,7 @@ known-local-folder = [ lines-after-imports = 2 [tool.mypy] -python_version = 3.8 +python_version = "3.10" warn_unused_ignores = true # TODO From 25a6acfb0d0302929461477bd5bbb3ac1f6c13e6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Aug 2024 22:39:25 +0200 Subject: [PATCH 16/68] Add a type annotation in loopy.target.pyopencl for numpy 2.1 --- loopy/target/pyopencl.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ecaea9b57..e4da6cd8b 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -888,7 +888,7 @@ def get_kernel_call( struct_overflow_arg_names) py_passed_args = [] - struct_pack_types = [] + struct_pack_types: list[str] = [] struct_pack_args = [] for arg_name in skai.passed_names: From 1946cae9f4843af729f3b9080d5b15aeaf35f774 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 Oct 2022 15:33:40 -0500 Subject: [PATCH 17/68] Add Tree as helper for scheduling https://github.com/inducer/loopy/pull/694 Co-authored-by: Matthias Diener Co-authored-by: Andreas Kloeckner --- loopy/schedule/tree.py | 279 +++++++++++++++++++++++++++++++++++++++++ test/test_tree.py | 50 ++++++++ 2 files changed, 329 insertions(+) create mode 100644 loopy/schedule/tree.py create mode 100644 test/test_tree.py diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py new file mode 100644 index 000000000..f91b36e90 --- /dev/null +++ b/loopy/schedule/tree.py @@ -0,0 +1,279 @@ +# mypy: disallow-untyped-defs + +from __future__ import annotations + + +__copyright__ = """ +Copyright (C) 2022 Kaushik Kulkarni +Copyright (C) 2022-24 University of Illinois Board of Trustees +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from dataclasses import dataclass +from functools import cached_property +from typing import Generic, Hashable, Iterator, List, Optional, Sequence, Tuple, TypeVar + +from immutables import Map + +from pytools import memoize_method + + +# {{{ tree data structure + +NodeT = TypeVar("NodeT", bound=Hashable) + + +@dataclass(frozen=True) +class Tree(Generic[NodeT]): + """ + An immutable tree containing nodes of type :class:`NodeT`. + + .. automethod:: ancestors + .. automethod:: parent + .. automethod:: children + .. automethod:: add_node + .. automethod:: depth + .. automethod:: replace_node + .. automethod:: move_node + + .. automethod:: __contains__ + + .. note:: + + Almost all the operations are implemented recursively. NOT suitable for + deep trees. At the very least if the Python implementation is CPython + this allocates a new stack frame for each iteration of the operation. + """ + + _parent_to_children: Map[NodeT, Tuple[NodeT, ...]] + _child_to_parent: Map[NodeT, Optional[NodeT]] + + @staticmethod + def from_root(root: NodeT) -> "Tree[NodeT]": + return Tree(Map({root: ()}), + Map({root: None})) + + @cached_property + def root(self) -> NodeT: + guess = set(self._child_to_parent).pop() + parent_of_guess = self.parent(guess) + while parent_of_guess is not None: + guess = parent_of_guess + parent_of_guess = self.parent(guess) + + return guess + + @memoize_method + def ancestors(self, node: NodeT) -> Tuple[NodeT, ...]: + """ + Returns a :class:`tuple` of nodes that are ancestors of *node*. 
+ """ + assert node in self + + if self.is_root(node): + # => root + return () + + parent = self._child_to_parent[node] + assert parent is not None + + return (parent,) + self.ancestors(parent) + + def parent(self, node: NodeT) -> Optional[NodeT]: + """ + Returns the parent of *node*. + """ + assert node in self + + return self._child_to_parent[node] + + def children(self, node: NodeT) -> Tuple[NodeT, ...]: + """ + Returns the children of *node*. + """ + assert node in self + + return self._parent_to_children[node] + + @memoize_method + def depth(self, node: NodeT) -> int: + """ + Returns the depth of *node*, with the root having depth 0. + """ + assert node in self + + if self.is_root(node): + # => None + return 0 + + parent_of_node = self.parent(node) + assert parent_of_node is not None + + return 1 + self.depth(parent_of_node) + + def is_root(self, node: NodeT) -> bool: + assert node in self + + return self.parent(node) is None + + def is_leaf(self, node: NodeT) -> bool: + assert node in self + + return len(self.children(node)) == 0 + + def __contains__(self, node: NodeT) -> bool: + """Return *True* if *node* is a node in the tree.""" + return node in self._child_to_parent + + def add_node(self, node: NodeT, parent: NodeT) -> "Tree[NodeT]": + """ + Returns a :class:`Tree` with added node *node* having a parent + *parent*. + """ + if node in self: + raise ValueError(f"'{node}' already present in tree.") + + siblings = self._parent_to_children[parent] + + return Tree((self._parent_to_children + .set(parent, siblings + (node,)) + .set(node, ())), + self._child_to_parent.set(node, parent)) + + def replace_node(self, node: NodeT, new_node: NodeT) -> "Tree[NodeT]": + """ + Returns a copy of *self* with *node* replaced with *new_node*. 
+ """ + if node not in self: + raise ValueError(f"'{node}' not present in tree.") + + if new_node in self: + raise ValueError(f"cannot replace with '{new_node}', as its already a part" + " of the tree.") + + parent = self.parent(node) + children = self.children(node) + + # {{{ update child to parent + + child_to_parent_mut = self._child_to_parent.mutate() + del child_to_parent_mut[node] + child_to_parent_mut[new_node] = parent + + for child in children: + child_to_parent_mut[child] = new_node + + # }}} + + # {{{ update parent_to_children + + parent_to_children_mut = self._parent_to_children.mutate() + del parent_to_children_mut[node] + parent_to_children_mut[new_node] = children + + if parent is not None: + # update the child's name in the parent's children + parent_to_children_mut[parent] = ( + *(frozenset(self.children(parent)) - frozenset([node])), + new_node,) + + # }}} + + return Tree(parent_to_children_mut.finish(), + child_to_parent_mut.finish()) + + def move_node(self, node: NodeT, new_parent: Optional[NodeT]) -> "Tree[NodeT]": + """ + Returns a copy of *self* with node *node* as a child of *new_parent*. 
+ """ + if node not in self: + raise ValueError(f"'{node}' not a part of the tree => cannot move.") + + if self.is_root(node): + if new_parent is None: + return self + else: + raise ValueError("Moving root not allowed.") + + if new_parent is None: + raise ValueError("Making multiple roots not allowed") + + if new_parent not in self: + raise ValueError(f"Cannot move to '{new_parent}' as it's not in tree.") + + parent = self.parent(node) + assert parent is not None # parent=root handled as a special case + siblings = self.children(parent) + parents_new_children = tuple(frozenset(siblings) - frozenset([node])) + new_parents_children = self.children(new_parent) + (node,) + + new_child_to_parent = self._child_to_parent.set(node, new_parent) + new_parent_to_children = (self._parent_to_children + .set(parent, parents_new_children) + .set(new_parent, new_parents_children)) + + return Tree(new_parent_to_children, + new_child_to_parent) + + def __str__(self) -> str: + """ + Stringifies the tree by using the box-drawing unicode characters. + + .. doctest:: + + >>> from loopy.schedule.tree import Tree + >>> tree = (Tree.from_root("Root") + ... .add_node("A", "Root") + ... .add_node("B", "Root") + ... .add_node("D", "B") + ... .add_node("E", "B") + ... 
.add_node("C", "A")) + + >>> print(tree) + Root + ├── A + │ └── C + └── B + ├── D + └── E + """ + def rec(node: NodeT) -> List[str]: + children_result = [rec(c) for c in self.children(node)] + + def post_process_non_last_child(children: Sequence[str]) -> list[str]: + return ["├── " + children[0]] + [f"│ {c}" for c in children[1:]] + + def post_process_last_child(children: Sequence[str]) -> list[str]: + return ["└── " + children[0]] + [f" {c}" for c in children[1:]] + + children_result = ([post_process_non_last_child(c) + for c in children_result[:-1]] + + [post_process_last_child(c) + for c in children_result[-1:]]) + return [str(node)] + sum(children_result, start=[]) + + return "\n".join(rec(self.root)) + + def nodes(self) -> Iterator[NodeT]: + return iter(self._child_to_parent.keys()) + +# }}} diff --git a/test/test_tree.py b/test/test_tree.py new file mode 100644 index 000000000..3dea8470e --- /dev/null +++ b/test/test_tree.py @@ -0,0 +1,50 @@ +__copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl as pytest_generate_tests, +) + +from loopy.schedule.tree import Tree + + +def test_tree_simple(): + tree = Tree.from_root("") + + tree = tree.add_node("bar", parent="") + tree = tree.add_node("baz", parent="bar") + + assert tree.depth("") == 0 + assert tree.depth("bar") == 1 + assert tree.depth("baz") == 2 + + assert "" in tree + assert "bar" in tree + assert "baz" in tree + assert "foo" not in tree + + tree = tree.replace_node("bar", "foo") + assert "bar" not in tree + assert "foo" in tree + + tree = tree.move_node("baz", new_parent="") + assert tree.depth("baz") == 1 From 2ed95fde719636ca4d2f1fbd6353ea09ae118733 Mon Sep 17 00:00:00 2001 From: "Addison J. Alvey-Blanco" Date: Sat, 24 Aug 2024 13:19:48 -0500 Subject: [PATCH 18/68] Add HappensAfter, type more of loopy.kernel.isntruction Co-authored-by: Andreas Kloeckner --- doc/ref_kernel.rst | 1 + loopy/__init__.py | 8 +- loopy/kernel/instruction.py | 254 +++++++++++++++++++++------ loopy/transform/realize_reduction.py | 2 +- 4 files changed, 203 insertions(+), 62 deletions(-) diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 1fa237b25..2962c23b8 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -262,6 +262,7 @@ Instructions .. {{{ +.. autoclass:: HappensAfter .. autoclass:: InstructionBase .. 
_assignments: diff --git a/loopy/__init__.py b/loopy/__init__.py index 1eebb8223..149a1af26 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -56,6 +56,7 @@ BarrierInstruction, CallInstruction, CInstruction, + HappensAfter, InstructionBase, LegacyStringInstructionTag, MemoryOrdering, @@ -203,15 +204,9 @@ find_rules_matching, ) from loopy.translation_unit import TranslationUnit, for_each_kernel, make_program - -# }}} from loopy.type_inference import infer_unknown_types from loopy.types import to_loopy_type - -# {{{ imported user interface from loopy.typing import auto - -# {{{ import transforms from loopy.version import MOST_RECENT_LANGUAGE_VERSION, VERSION @@ -242,6 +237,7 @@ "ExecutorBase", "GeneratedProgram", "GlobalArg", + "HappensAfter", "ISPCTarget", "ImageArg", "InKernelCallable", diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index fe593b00b..f64062e64 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -20,10 +20,11 @@ THE SOFTWARE. """ -from collections.abc import Set as abc_Set +from collections.abc import Mapping as MappingABC, Set as abc_Set +from dataclasses import dataclass from functools import cached_property from sys import intern -from typing import FrozenSet +from typing import FrozenSet, Mapping, Optional, Tuple, Type, Union from warnings import warn import islpy as isl @@ -31,7 +32,8 @@ from pytools.tag import Tag, Taggable, tag_dataclass from loopy.diagnostic import LoopyError -from loopy.tools import Optional +from loopy.tools import Optional as LoopyOptional +from loopy.typing import ExpressionT # {{{ instruction tags @@ -77,6 +79,44 @@ class UseStreamingStoreTag(Tag): # }}} +# {{{ HappensAfter + +@dataclass(frozen=True) +class HappensAfter: + """A class representing a "happens-after" relationship between two + statements found in a :class:`loopy.LoopKernel`. Used to validate that a + given kernel transformation respects the data dependencies in a given + program. + + .. 
attribute:: variable_name + + The name of the variable responsible for the dependency. For + backward compatibility purposes, this may be *None*. In this case, the + dependency semantics revert to the deprecated, statement-level + dependencies of prior versions of :mod:`loopy`. + + .. attribute:: instances_rel + + An :class:`islpy.Map` representing the precise happens-after + relationship. The domain and range are sets of statement instances. The + instances in the domain are required to execute before the instances in + the range. + + Map dimensions are named according to the order of appearance of the + inames in a :mod:`loopy` program. The dimension names in the range are + appended with a prime to signify that the mapped instances are distinct. + + As a (deprecated) matter of backward compatibility, this may be *None*, + in which case the semantics revert to the (underspecified) + statement-level dependencies of prior versions of :mod:`loopy`. + """ + + variable_name: Optional[str] + instances_rel: Optional[isl.Map] + +# }}} + + # {{{ instructions: base class class InstructionBase(ImmutableRecord, Taggable): @@ -200,10 +240,20 @@ class InstructionBase(ImmutableRecord, Taggable): Inherits from :class:`pytools.tag.Taggable`. """ + id: Optional[str] + happens_after: Mapping[str, HappensAfter] + depends_on_is_final: bool + groups: FrozenSet[str] + conflicts_with_groups: FrozenSet[str] + no_sync_with: FrozenSet[Tuple[str, str]] + predicates: FrozenSet[ExpressionT] + within_inames: FrozenSet[str] + within_inames_is_final: bool + priority: int # within_inames_is_final is deprecated and will be removed in version 2017.x. - fields = set("id depends_on depends_on_is_final " + fields = set("id depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " "predicates " @@ -216,12 +266,22 @@ class InstructionBase(ImmutableRecord, Taggable): # Names of fields that are sets of pymbolic expressions. 
Needed for key building pymbolic_set_fields = {"predicates"} - def __init__(self, id, depends_on, depends_on_is_final, - groups, conflicts_with_groups, - no_sync_with, - within_inames_is_final, within_inames, - priority, - predicates, tags): + def __init__(self, + id: Optional[str], + happens_after: Union[ + Mapping[str, HappensAfter], FrozenSet[str], str, None], + depends_on_is_final: Optional[bool], + groups: Optional[FrozenSet[str]], + conflicts_with_groups: Optional[FrozenSet[str]], + no_sync_with: Optional[FrozenSet[Tuple[str, str]]], + within_inames_is_final: Optional[bool], + within_inames: Optional[FrozenSet[str]], + priority: Optional[int], + predicates: Optional[FrozenSet[str]], + tags: Optional[FrozenSet[Tag]], + *, + depends_on: Union[FrozenSet[str], str, None] = None, + ) -> None: if predicates is None: predicates = frozenset() @@ -237,8 +297,49 @@ def __init__(self, id, depends_on, depends_on_is_final, predicates = frozenset(new_predicates) del new_predicates - if depends_on is None: - depends_on = frozenset() + # {{{ process happens_after/depends_on + + if happens_after is not None and depends_on is not None: + raise TypeError("may not pass both happens_after and depends_on") + elif depends_on is not None: + # FIXME Enable once we realistically check detailed dependencies. + # warn("depends_on is deprecated and will stop working in 2026. " + # "Pass happens_after instead.", DeprecationWarning, stacklevel=2) + happens_after = depends_on + + del depends_on + + if depends_on_is_final and happens_after is None: + raise LoopyError("Setting depends_on_is_final to True requires " + "actually specifying happens_after/depends_on") + + if happens_after is None: + happens_after = {} + elif isinstance(happens_after, str): + warn("Passing a string for happens_after/depends_on is deprecated and " + "will stop working in 2025. 
Instead, pass a full-fledged " + "happens_after data structure.", DeprecationWarning, stacklevel=2) + + happens_after = { + after_id.strip(): HappensAfter( + variable_name=None, + instances_rel=None) + for after_id in happens_after.split(",") + if after_id.strip()} + elif isinstance(happens_after, frozenset): + happens_after = { + after_id: HappensAfter( + variable_name=None, + instances_rel=None) + for after_id in happens_after} + elif isinstance(happens_after, MappingABC): + if isinstance(happens_after, dict): + happens_after = happens_after + else: + raise TypeError("'happens_after' has unexpected type: " + f"{type(happens_after)}") + + # }}} if groups is None: groups = frozenset() @@ -255,16 +356,12 @@ def __init__(self, id, depends_on, depends_on_is_final, if within_inames_is_final is None: within_inames_is_final = False - if isinstance(depends_on, str): - depends_on = frozenset( - s.strip() for s in depends_on.split(",") if s.strip()) - if depends_on_is_final is None: depends_on_is_final = False - if depends_on_is_final and not isinstance(depends_on, abc_Set): + if depends_on_is_final and not isinstance(happens_after, MappingABC): raise LoopyError("Setting depends_on_is_final to True requires " - "actually specifying depends_on") + "actually specifying happens_after/depends_on") if tags is None: tags = frozenset() @@ -288,13 +385,13 @@ def __init__(self, id, depends_on, depends_on_is_final, # assert all(is_interned(pred) for pred in predicates) assert isinstance(within_inames, abc_Set) - assert isinstance(depends_on, abc_Set) or depends_on is None + assert isinstance(happens_after, MappingABC) or happens_after is None assert isinstance(groups, abc_Set) assert isinstance(conflicts_with_groups, abc_Set) ImmutableRecord.__init__(self, id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, no_sync_with=no_sync_with, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -307,6 +404,24 @@ def __init__(self, 
id, depends_on, depends_on_is_final, # The Taggable constructor call does extra validation. tags=tags) + Taggable.__init__(self, tags) + + def get_copy_kwargs(self, **kwargs): + passed_depends_on = "depends_on" in kwargs + + if passed_depends_on: + assert "happens_after" not in kwargs + + kwargs = super().get_copy_kwargs(**kwargs) + + if passed_depends_on: + # FIXME Enable once we realistically check detailed dependencies. + # warn("depends_on is deprecated and will stop working in 2026. " + # "Instead, use happens_after.", DeprecationWarning, stacklevel=2) + del kwargs["happens_after"] + + return kwargs + # {{{ abstract interface def read_dependency_names(self): @@ -348,6 +463,13 @@ def with_transformed_expressions(self, f, assignee_f=None): # }}} + @property + def depends_on(self): + # FIXME Enable once we realistically check detailed dependencies. + # warn("depends_on is deprecated and will stop working in 2026. " + # "Use happens_after instead.", DeprecationWarning, stacklevel=2) + return frozenset(self.happens_after) + @property def assignee_name(self): """A convenience wrapper around :meth:`assignee_var_names` @@ -456,7 +578,9 @@ def __setstate__(self, val): if self.id is not None: # pylint:disable=access-member-before-definition self.id = intern(self.id) - self.depends_on = intern_frozenset_of_ids(self.depends_on) + self.happens_after = { + intern(after_id): ha + for after_id, ha in self.happens_after.items()} self.groups = intern_frozenset_of_ids(self.groups) self.conflicts_with_groups = ( intern_frozenset_of_ids(self.conflicts_with_groups)) @@ -793,30 +917,43 @@ class Assignment(MultiAssignmentBase): .. automethod:: __init__ """ + assignee: ExpressionT + expression: ExpressionT + temp_var_type: LoopyOptional + atomicity: Tuple[VarAtomicity, ...] 
+ fields = MultiAssignmentBase.fields | \ set("assignee temp_var_type atomicity".split()) pymbolic_fields = MultiAssignmentBase.pymbolic_fields | {"assignee"} def __init__(self, - assignee, expression, - id=None, - depends_on=None, - depends_on_is_final=None, - groups=None, - conflicts_with_groups=None, - no_sync_with=None, - within_inames_is_final=None, - within_inames=None, - tags=None, - temp_var_type=_not_provided, atomicity=(), - priority=0, predicates=frozenset()): + assignee: Union[str, ExpressionT], + expression: Union[str, ExpressionT], + id: Optional[str] = None, + happens_after: Union[ + Mapping[str, HappensAfter], FrozenSet[str], str, None] = None, + depends_on_is_final: Optional[bool] = None, + groups: Optional[FrozenSet[str]] = None, + conflicts_with_groups: Optional[FrozenSet[str]] = None, + no_sync_with: Optional[FrozenSet[Tuple[str, str]]] = None, + within_inames_is_final: Optional[bool] = None, + within_inames: Optional[FrozenSet[str]] = None, + priority: Optional[int] = None, + predicates: Optional[FrozenSet[str]] = None, + tags: Optional[FrozenSet[Tag]] = None, + temp_var_type: Union[ + Type[_not_provided], None, LoopyOptional] = _not_provided, + atomicity: Tuple[VarAtomicity, ...] 
= (), + *, + depends_on: Union[FrozenSet[str], str, None] = None, + ) -> None: if temp_var_type is _not_provided: - temp_var_type = Optional() + temp_var_type = LoopyOptional() super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -825,7 +962,8 @@ def __init__(self, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + depends_on=depends_on) from loopy.symbolic import parse if isinstance(assignee, str): @@ -949,7 +1087,7 @@ class CallInstruction(MultiAssignmentBase): def __init__(self, assignees, expression, id=None, - depends_on=None, + happens_after=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, @@ -958,11 +1096,12 @@ def __init__(self, within_inames=None, tags=None, temp_var_types=None, - priority=0, predicates=frozenset()): + priority=0, predicates=frozenset(), + depends_on=None): super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -971,7 +1110,8 @@ def __init__(self, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + depends_on=depends_on) from pymbolic.primitives import Call @@ -1004,7 +1144,7 @@ def __init__(self, self.expression = expression if temp_var_types is None: - self.temp_var_types = (Optional(),) * len(self.assignees) + self.temp_var_types = (LoopyOptional(),) * len(self.assignees) else: self.temp_var_types = tuple( _check_and_fix_temp_var_type(tvt, stacklevel=3) @@ -1149,7 +1289,7 @@ def modify_assignee_for_array_call(assignee): def make_assignment(assignees, expression, temp_var_types=None, **kwargs): if temp_var_types is None: - temp_var_types = (Optional(),) * len(assignees) + temp_var_types = (LoopyOptional(),) * len(assignees) if len(assignees) != 1 or 
is_array_call(assignees, expression): atomicity = kwargs.pop("atomicity", ()) @@ -1251,12 +1391,13 @@ class CInstruction(InstructionBase): def __init__(self, iname_exprs, code, read_variables=frozenset(), assignees=(), - id=None, depends_on=None, depends_on_is_final=None, + id=None, happens_after=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=0, - predicates=frozenset(), tags=None): + predicates=frozenset(), tags=None, + depends_on=None): """ :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, simple strings pepresenting inames are also allowed. A single @@ -1269,13 +1410,14 @@ def __init__(self, InstructionBase.__init__(self, id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - priority=priority, predicates=predicates, tags=tags) + priority=priority, predicates=predicates, tags=tags, + depends_on=depends_on) # {{{ normalize iname_exprs @@ -1419,15 +1561,15 @@ class NoOpInstruction(_DataObliviousInstruction): ... 
nop """ - def __init__(self, id=None, depends_on=None, depends_on_is_final=None, + def __init__(self, id=None, happens_after=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - predicates=None, tags=None): + predicates=None, tags=None, depends_on=None): super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -1436,7 +1578,8 @@ def __init__(self, id=None, depends_on=None, depends_on_is_final=None, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + depends_on=depends_on) def __str__(self): first_line = "%s: ... nop" % self.id @@ -1478,20 +1621,21 @@ class BarrierInstruction(_DataObliviousInstruction): fields = _DataObliviousInstruction.fields | {"synchronization_kind", "mem_kind"} - def __init__(self, id, depends_on=None, depends_on_is_final=None, + def __init__(self, id, happens_after=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, predicates=None, tags=None, synchronization_kind="global", - mem_kind="local"): + mem_kind="local", + depends_on=None): if predicates: raise LoopyError("conditional barriers are not supported") super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -1500,8 +1644,8 @@ def __init__(self, id, depends_on=None, depends_on_is_final=None, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags - ) + tags=tags, + depends_on=depends_on) self.synchronization_kind = synchronization_kind self.mem_kind = mem_kind diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index 
5161efba6..8aea6541a 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -2019,7 +2019,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, | red_realize_ctx.surrounding_insn_add_within_inames)) kwargs.pop("id") - kwargs.pop("depends_on") + kwargs.pop("happens_after") kwargs.pop("expression") kwargs.pop("assignee", None) kwargs.pop("assignees", None) From 2257b5e908835809e32f35813bb176af01b1dda4 Mon Sep 17 00:00:00 2001 From: "Addison J. Alvey-Blanco" Date: Sat, 24 Aug 2024 13:25:13 -0500 Subject: [PATCH 19/68] Typing fixes regarding None-ness of insn.id --- loopy/schedule/__init__.py | 7 +++++-- loopy/transform/instruction.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 6249b36ba..31f9bc4a8 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1,5 +1,7 @@ from __future__ import annotations +from loopy.typing import not_none + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -713,9 +715,10 @@ def get_insns_in_topologically_sorted_order( from pytools.graph import compute_topological_order rev_dep_map: Dict[str, Set[str]] = { - insn.id: set() for insn in kernel.instructions} + not_none(insn.id): set() for insn in kernel.instructions} for insn in kernel.instructions: for dep in insn.depends_on: + assert insn.id is not None rev_dep_map[dep].add(insn.id) # For breaking ties, we compare the features of an instruction @@ -2102,7 +2105,7 @@ def _generate_loop_schedules_inner( schedule=(), - unscheduled_insn_ids={insn.id for insn in kernel.instructions}, + unscheduled_insn_ids={not_none(insn.id) for insn in kernel.instructions}, scheduled_insn_ids=frozenset(), within_subkernel=kernel.state != KernelState.LINEARIZED, may_schedule_global_barriers=True, diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 374587da5..494bbf0bc 100644 --- a/loopy/transform/instruction.py +++ 
b/loopy/transform/instruction.py @@ -267,6 +267,7 @@ def replace_instruction_ids_in_insn( new_no_sync_with: List[Tuple[str, str]] = [] if insn.id in replacements: + assert isinstance(insn.id, str) insn = insn.copy(id=replacements[insn.id][0]) new_depends_on = list(insn.depends_on) From 206961fb20459d4398351ef57fa6e7b36c0fe79e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:07:57 -0500 Subject: [PATCH 20/68] Add typing_extensions to dependencies --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 979d08dd3..a0ec51c81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,9 @@ dependencies = [ "Mako", "pyrsistent", "immutables", - "typing_extensions", + + # for Self + "typing-extensions>=4; python_version<'3.12'", ] [project.optional-dependencies] pyopencl = [ From 135b319bd971ba4f0dc9c6ecbf04ec13dbd25627 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:08:35 -0500 Subject: [PATCH 21/68] Make an ArrayArgDescriptor base class --- loopy/kernel/function_interface.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e81e4dafc..c96cd0fbb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -23,7 +23,8 @@ THE SOFTWARE. 
""" -from typing import TYPE_CHECKING, ClassVar, FrozenSet, Tuple +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Callable, ClassVar, FrozenSet, Tuple, TypeVar from pytools import ImmutableRecord @@ -36,6 +37,8 @@ if TYPE_CHECKING: + from typing_extensions import Self + from loopy.translation_unit import CallablesTable, FunctionIdT __doc__ = """ @@ -57,7 +60,23 @@ # {{{ argument descriptors -class ValueArgDescriptor(ImmutableRecord): +ArgDescriptorT = TypeVar("ArgDescriptorT", bound="ArgDescriptor") + + +class ArgDescriptor(ABC, ImmutableRecord): + @abstractmethod + def map_expr( + self, + subst_mapper: Callable[[ArgDescriptorT], ArgDescriptorT] + ) -> Self: + ... + + @abstractmethod + def depends_on(self) -> frozenset[str]: + ... + + +class ValueArgDescriptor(ArgDescriptor): hash_fields = () def map_expr(self, subst_mapper): @@ -69,7 +88,7 @@ def depends_on(self): update_persistent_hash = update_persistent_hash -class ArrayArgDescriptor(ImmutableRecord): +class ArrayArgDescriptor(ArgDescriptor): """ Records information about an array argument to an in-kernel callable. 
To be passed to and returned from From cb7d4a6bcde20d8b07a69cfdbe066ccf7dcfb243 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:13:54 -0500 Subject: [PATCH 22/68] Restrict TranslationUnit.{__getitem__, default_entrypoint} to returning LoopKernels --- loopy/check.py | 12 +++++++----- loopy/translation_unit.py | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 4ec3b46e1..c23aa4582 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -44,6 +44,7 @@ SeparateArrayArrayDimTag, ) from loopy.kernel.data import ArrayArg, ArrayDimImplementationTag, auto +from loopy.kernel.function_interface import CallableKernel from loopy.kernel.instruction import ( CallInstruction, CInstruction, @@ -52,7 +53,7 @@ _DataObliviousInstruction, ) from loopy.symbolic import CombineMapper, ResolvedFunction, WalkMapper -from loopy.translation_unit import for_each_kernel +from loopy.translation_unit import TranslationUnit, for_each_kernel from loopy.type_inference import TypeReader from loopy.typing import ExpressionT @@ -1725,11 +1726,12 @@ def _validate_kernel_call_sites_inner(kernel, callables): raise NotImplementedError(type(insn)) -def validate_kernel_call_sites(translation_unit): +def validate_kernel_call_sites(translation_unit: TranslationUnit) -> None: for name in translation_unit.callables_table: - clbl = translation_unit[name] - if isinstance(clbl, LoopKernel): - _validate_kernel_call_sites_inner(clbl, translation_unit.callables_table) + clbl = translation_unit.callables_table[name] + if isinstance(clbl, CallableKernel): + _validate_kernel_call_sites_inner( + clbl.subkernel, translation_unit.callables_table) # }}} diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 76e795b76..c991ec030 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -182,6 +182,8 @@ class TranslationUnit: The :class:`~loopy.LoopKernel` representing the main entrypoint of the 
program, if defined. Currently, this attribute may only be accessed if there is exactly one entrypoint in the translation unit. + Will raise an error if the default entrypoint is not a + :class:`~loopy.LoopKernel`. .. attribute:: callables_table @@ -300,9 +302,9 @@ def with_kernel(self, kernel): new_callables = self.callables_table.set(kernel.name, clbl) return self.copy(callables_table=new_callables) - def __getitem__(self, name): + def __getitem__(self, name) -> LoopKernel: """ - For the callable named *name*, return a :class:`loopy.LoopKernel` if + For the callable named *name*, return a :class:`loopy.LoopKernel`. if it's a :class:`~loopy.kernel.function_interface.CallableKernel` otherwise return the callable itself. """ @@ -310,13 +312,20 @@ def __getitem__(self, name): if isinstance(result, CallableKernel): return result.subkernel else: - return result + raise ValueError("TranslationUnit.__getitem__ " + "can only be used for instances of LoopKernel. " + "Access all other callables via callables_table.") @property - def default_entrypoint(self): + def default_entrypoint(self) -> LoopKernel: if len(self.entrypoints) == 1: - entrypoint, = self.entrypoints - return self[entrypoint] + ep_name, = self.entrypoints + entrypoint = self[ep_name] + + if not isinstance(entrypoint, LoopKernel): + raise ValueError("default entrypoint is not a kernel") + + return entrypoint else: raise ValueError("TranslationUnit has multiple possible entrypoints." 
" The default entrypoint kernel is not uniquely" From a3f4ef39dc0783dc9b1d3e480b10e393ed574a5a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:14:55 -0500 Subject: [PATCH 23/68] Add TUnitOrKernelT --- loopy/translation_unit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index c991ec030..5f26f48ae 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -735,6 +735,9 @@ def __getitem__(self, name): # }}} +TUnitOrKernelT = TypeVar("TUnitOrKernelT", LoopKernel, TranslationUnit) + + # {{{ helper functions def make_program(kernel: LoopKernel) -> TranslationUnit: From ade9c7397fffcfdb303a88775b46550a1967702a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:16:22 -0500 Subject: [PATCH 24/68] Type TranslationUnit.{copy,with_kernel} --- loopy/translation_unit.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 5f26f48ae..801ec2964 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -27,10 +27,20 @@ from collections.abc import Set as abc_Set from dataclasses import dataclass, field, replace from functools import wraps -from typing import TYPE_CHECKING, Any, Callable, FrozenSet, Mapping, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + FrozenSet, + Mapping, + Optional, + TypeVar, + Union, +) from warnings import warn from immutables import Map +from typing_extensions import Self from pymbolic.primitives import Call, Variable @@ -228,9 +238,9 @@ def __post_init__(self): object.__setattr__(self, "_program_executor_cache", {}) - def copy(self, **kwargs): + def copy(self, **kwargs: Any) -> Self: target = kwargs.pop("target", None) - program = replace(self, **kwargs) + t_unit = replace(self, **kwargs) if target: from loopy.kernel import KernelState if max(callable_knl.subkernel.state @@ -242,7 
+252,7 @@ def copy(self, **kwargs): "preprocessed, cannot modify target now.") new_callables = {} - for func_id, clbl in program.callables_table.items(): + for func_id, clbl in t_unit.callables_table.items(): if isinstance(clbl, CallableKernel): knl = clbl.subkernel knl = knl.copy(target=target) @@ -253,16 +263,12 @@ def copy(self, **kwargs): raise NotImplementedError() new_callables[func_id] = clbl - program = replace( + t_unit = replace( self, callables_table=Map(new_callables), target=target) - return program + return t_unit - def with_entrypoints(self, entrypoints): - """ - :param entrypoints: Either a comma-separated :class:`str` or - :class:`frozenset`. - """ + def with_entrypoints(self, entrypoints: str | frozenset[str]) -> Self: if isinstance(entrypoints, str): entrypoints = frozenset([e.strip() for e in entrypoints.split(",")]) @@ -280,7 +286,7 @@ def state(self): if isinstance(callable_knl, CallableKernel)), default=KernelState.INITIAL) - def with_kernel(self, kernel): + def with_kernel(self, kernel: LoopKernel) -> Self: """ If *self* contains a callable kernel with *kernel*'s name, replaces its subkernel and returns a copy of *self*. 
Else records a new callable From 0832f380c80135fdddf6abcd80123502f4ed95d1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:16:49 -0500 Subject: [PATCH 25/68] Type infer_unknown_types --- loopy/type_inference.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 26be6b352..b8e3db720 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -43,7 +43,11 @@ parse_tagged_name, ) from loopy.tools import is_integer -from loopy.translation_unit import CallablesInferenceContext, make_clbl_inf_ctx +from loopy.translation_unit import ( + CallablesInferenceContext, + TranslationUnit, + make_clbl_inf_ctx, +) from loopy.types import NumpyType @@ -1026,31 +1030,34 @@ def _instruction_missed_during_inference(insn): return type_specialized_kernel, clbl_inf_ctx -def infer_unknown_types(program, expect_completion=False): +def infer_unknown_types( + t_unit: TranslationUnit, + expect_completion: bool = False + ) -> TranslationUnit: """Infer types on temporaries and arguments.""" from loopy.kernel.data import auto from loopy.translation_unit import resolve_callables - program = resolve_callables(program) + t_unit = resolve_callables(t_unit) # {{{ early-exit criterion if all(clbl.is_type_specialized() - for clbl in program.callables_table.values()): + for clbl in t_unit.callables_table.values()): # all the callables including the kernels have inferred their types # => no need for type inference - return program + return t_unit # }}} - clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, - program.entrypoints) + clbl_inf_ctx = make_clbl_inf_ctx(t_unit.callables_table, + t_unit.entrypoints) - for e in program.entrypoints: + for e in t_unit.entrypoints: logger.debug(f"Entering entrypoint: {e}") arg_id_to_dtype = {arg.name: arg.dtype for arg in - program[e].args if arg.dtype not in (None, auto)} - new_callable, clbl_inf_ctx = 
program.callables_table[e].with_types( + t_unit[e].args if arg.dtype not in (None, auto)} + new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_types( arg_id_to_dtype, clbl_inf_ctx) clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, is_entrypoint=True) @@ -1073,7 +1080,7 @@ def infer_unknown_types(program, expect_completion=False): raise LoopyError("could not determine type of" f" '{vars_not_inferred.pop()}' of kernel '{e}'.") - return clbl_inf_ctx.finish_program(program) + return clbl_inf_ctx.finish_program(t_unit) # }}} From 490d365511210879acbc0165e069b127f92b518f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:17:24 -0500 Subject: [PATCH 26/68] Type Reduction and TypeCast --- loopy/symbolic.py | 73 ++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 2a1b140cc..f2f04f9ae 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1,5 +1,7 @@ """Pymbolic mappers for loopy.""" +from __future__ import annotations + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -27,7 +29,7 @@ import re from functools import cached_property, reduce from sys import intern -from typing import AbstractSet, ClassVar, Mapping, Sequence, Tuple +from typing import TYPE_CHECKING, AbstractSet, Any, ClassVar, Mapping, Sequence, Tuple import immutables import numpy as np @@ -65,9 +67,14 @@ LoopyError, UnableToDetermineAccessRangeError, ) +from loopy.types import ToLoopyTypeConvertible from loopy.typing import ExpressionT +if TYPE_CHECKING: + from loopy.library.reduction import ReductionOperation + + __doc__ = """ .. currentmodule:: loopy.symbolic @@ -640,8 +647,9 @@ class TypeCast(LoopyExpressionBase): The expression to be cast. 
""" + child: ExpressionT - def __init__(self, type, child): + def __init__(self, type: ToLoopyTypeConvertible, child: ExpressionT): super().__init__() from loopy.types import NumpyType, to_loopy_type @@ -718,31 +726,40 @@ class Reduction(LoopyExpressionBase): """ Represents a reduction operation on :attr:`expr` across :attr:`inames`. - .. attribute:: operation - an instance of :class:`loopy.library.reduction.ReductionOperation` - - .. attribute:: inames - - a list of inames across which reduction on :attr:`expr` is being - carried out. + .. autoattribute:: operation + .. autoattribute:: inames + .. autoattribute:: expr + .. autoattribute:: allow_simultaneous + """ - .. attribute:: expr + init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") - An expression which may have tuple type. If the expression has tuple - type, it must be one of the following: - * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or - * a :class:`loopy.symbolic.Reduction`, or - * a function call or substitution rule invocation. + operation: ReductionOperation - .. attribute:: allow_simultaneous + inames: Sequence[str] + """The inames across which reduction on :attr:`expr` is being + carried out. + """ - A :class:`bool`. If not *True*, an iname is allowed to be used - in precisely one reduction, to avoid misnesting errors. + expr: ExpressionT + """An expression which may have tuple type. If the expression has tuple + type, it must be one of the following: + * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or + * a :class:`loopy.symbolic.Reduction`, or + * a function call or substitution rule invocation. """ - init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") + allow_simultaneous: bool + """If not *True*, an iname is allowed to be used + in precisely one reduction, to avoid misnesting errors. 
+ """ - def __init__(self, operation, inames, expr, allow_simultaneous=False): + def __init__(self, + operation: ReductionOperation | str, + inames: tuple[str | p.Variable, ...] | p.Variable | str, + expr: ExpressionT, + allow_simultaneous: bool = False + ) -> None: if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -751,7 +768,7 @@ def __init__(self, operation, inames, expr, allow_simultaneous=False): assert isinstance(inames, tuple) - def strip_var(iname): + def strip_var(iname: Any) -> str: if isinstance(iname, p.Variable): iname = iname.name @@ -968,10 +985,14 @@ class SubArrayRef(LoopyExpressionBase): .. automethod:: is_equal """ + swept_inames: tuple[p.Variable, ...] + subscript: p.Subscript init_arg_names = ("swept_inames", "subscript") - def __init__(self, swept_inames, subscript): + def __init__(self, + swept_inames: tuple[p.Variable, ...] | p.Variable, + subscript: p.Subscript) -> None: # {{{ sanity checks @@ -1787,7 +1808,7 @@ def map_subscript(self, expr): # {{{ (pw)aff to expr conversion -def aff_to_expr(aff): +def aff_to_expr(aff: isl.Aff) -> ExpressionT: from pymbolic import var denom = aff.get_denominator_val().to_python() @@ -1808,7 +1829,7 @@ def aff_to_expr(aff): return result // denom -def pw_aff_to_expr(pw_aff, int_ok=False): +def pw_aff_to_expr(pw_aff: isl.PwAff, int_ok: bool = False) -> ExpressionT: if isinstance(pw_aff, int): if not int_ok: from warnings import warn @@ -1830,7 +1851,7 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return expr -def pw_aff_to_pw_aff_implemented_by_expr(pw_aff): +def pw_aff_to_pw_aff_implemented_by_expr(pw_aff: isl.PwAff) -> isl.PwAff: pieces = pw_aff.get_pieces() rest = isl.Set.universe(pw_aff.space.params()) @@ -1923,7 +1944,7 @@ def map_call(self, expr): "for as-pwaff evaluation") -def aff_from_expr(space, expr, vars_to_zero=None): +def aff_from_expr(space: isl.Space, expr: ExpressionT, vars_to_zero=None) -> isl.Aff: if vars_to_zero is None: vars_to_zero = frozenset() 
From 48e08742b25199a8451a5b72324165789ea8b413 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:17:52 -0500 Subject: [PATCH 27/68] Type callable transforms --- loopy/transform/callable.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index d683cbd29..6866f23f3 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -20,6 +20,8 @@ THE SOFTWARE. """ +from collections.abc import Sequence + from immutables import Map import islpy as isl @@ -27,7 +29,11 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.function_interface import ( + CallableKernel, + InKernelCallable, + ScalarCallable, +) from loopy.kernel.instruction import ( Assignment, CallInstruction, @@ -40,7 +46,7 @@ RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, ) -from loopy.translation_unit import TranslationUnit, for_each_kernel +from loopy.translation_unit import FunctionIdT, TranslationUnit, for_each_kernel __doc__ = """ @@ -80,10 +86,8 @@ def register_callable(translation_unit, function_identifier, callable_, callables_table=new_callables) -def merge(translation_units): +def merge(translation_units: Sequence[TranslationUnit]) -> TranslationUnit: """ - :param translation_units: A sequence of :class:`loopy.TranslationUnit`. - :returns: An instance of :class:`loopy.TranslationUnit` which contains all the callables from each of the *translation_units. 
""" @@ -110,7 +114,7 @@ def merge(translation_units): # }}} - callables_table = {} + callables_table: dict[FunctionIdT, InKernelCallable] = {} for trans_unit in translation_units: callables_table.update(trans_unit.callables_table) @@ -534,7 +538,12 @@ def inline_callable_kernel(translation_unit, function_name): # {{{ rename_callable -def rename_callable(program, old_name, new_name=None, existing_ok=False): +def rename_callable( + t_unit: TranslationUnit, + old_name: str, + new_name: str | None = None, + existing_ok=False + ) -> TranslationUnit: """ :arg program: An instance of :class:`loopy.TranslationUnit` :arg old_name: The callable to be renamed @@ -548,21 +557,21 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False): SubstitutionRuleMappingContext, ) - assert isinstance(program, TranslationUnit) + assert isinstance(t_unit, TranslationUnit) assert isinstance(old_name, str) - if (new_name in program.callables_table) and not existing_ok: + if (new_name in t_unit.callables_table) and not existing_ok: raise LoopyError(f"callables named '{new_name}' already exists") if new_name is None: - namegen = UniqueNameGenerator(program.callables_table.keys()) + namegen = UniqueNameGenerator(t_unit.callables_table.keys()) new_name = namegen(old_name) assert isinstance(new_name, str) new_callables_table = {} - for name, clbl in program.callables_table.items(): + for name, clbl in t_unit.callables_table.items(): if name == old_name: name = new_name @@ -582,12 +591,12 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False): new_callables_table[name] = clbl - new_entrypoints = program.entrypoints.copy() + new_entrypoints = t_unit.entrypoints.copy() if old_name in new_entrypoints: new_entrypoints = ((new_entrypoints | frozenset([new_name])) - frozenset([old_name])) - return program.copy(callables_table=Map(new_callables_table), + return t_unit.copy(callables_table=Map(new_callables_table), entrypoints=new_entrypoints) # }}} From 
864ca06e4029e3e3b521f504018f3773ba220ea0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:18:19 -0500 Subject: [PATCH 28/68] Type add_dtypes --- loopy/kernel/tools.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 0826ed010..2f2e9a8a9 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -44,7 +44,8 @@ _DataObliviousInstruction, ) from loopy.symbolic import CombineMapper -from loopy.translation_unit import TranslationUnit, for_each_kernel +from loopy.translation_unit import TranslationUnit, TUnitOrKernelT, for_each_kernel +from loopy.types import ToLoopyTypeConvertible logger = logging.getLogger(__name__) @@ -52,15 +53,20 @@ # {{{ add and infer argument dtypes -def add_dtypes(prog_or_kernel, dtype_dict): +def add_dtypes( + kernel: TUnitOrKernelT, + dtype_dict: Mapping[str, ToLoopyTypeConvertible], + ) -> TUnitOrKernelT: """Specify remaining unspecified argument/temporary variable types. 
:arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - if isinstance(prog_or_kernel, TranslationUnit): + if isinstance(kernel, TranslationUnit): + t_unit = kernel + del kernel kernel_names = [clbl.subkernel.name for clbl in - prog_or_kernel.callables_table.values() if isinstance(clbl, + t_unit.callables_table.values() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: raise LoopyError("add_dtypes may not take a TranslationUnit with more" @@ -69,10 +75,10 @@ def add_dtypes(prog_or_kernel, dtype_dict): kernel_name, = kernel_names - return prog_or_kernel.with_kernel( - add_dtypes(prog_or_kernel[kernel_name], dtype_dict)) + return t_unit.with_kernel( + add_dtypes(t_unit[kernel_name], dtype_dict)) - assert isinstance(prog_or_kernel, LoopKernel) + assert isinstance(kernel, LoopKernel) processed_dtype_dict = {} @@ -83,13 +89,13 @@ def add_dtypes(prog_or_kernel, dtype_dict): processed_dtype_dict[subkey] = v dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( - prog_or_kernel, processed_dtype_dict) + kernel, processed_dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) - return prog_or_kernel.copy(args=new_args, temporary_variables=new_temp_vars) + return kernel.copy(args=new_args, temporary_variables=new_temp_vars) def _add_dtypes_overdetermined(kernel, dtype_dict): From c77ef9da92bc400401f12db15c3080f79b241da3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:12:24 -0500 Subject: [PATCH 29/68] Misc smaller typing improvements --- loopy/kernel/__init__.py | 6 +++--- loopy/kernel/creation.py | 5 +++-- loopy/options.py | 3 ++- loopy/transform/concatenate.py | 3 +++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 7bf4cb845..faa9fcab6 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -274,11 +274,11 @@ def all_variable_names(self): | 
{arg.name for arg in self.args} | set(self.all_inames())) - def get_var_name_generator(self): + def get_var_name_generator(self) -> UniqueNameGenerator: return UniqueNameGenerator(self.all_variable_names()) - def get_instruction_id_generator(self, based_on="insn"): - used_ids = {insn.id for insn in self.instructions} + def get_instruction_id_generator(self, based_on="insn") -> UniqueNameGenerator: + used_ids = {insn.id for insn in self.instructions if insn.id is not None} return UniqueNameGenerator(used_ids) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 36489de47..d023015fa 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -26,6 +26,7 @@ import logging import re from sys import intern +from typing import Any import numpy as np @@ -48,7 +49,7 @@ ) from loopy.symbolic import IdentityMapper, SubArrayRef, WalkMapper from loopy.tools import Optional, intern_frozenset_of_ids -from loopy.translation_unit import for_each_kernel +from loopy.translation_unit import TranslationUnit, for_each_kernel logger = logging.getLogger(__name__) @@ -2614,7 +2615,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): # {{{ make_kernel -def make_kernel(*args, **kwargs): +def make_kernel(*args: Any, **kwargs: Any) -> TranslationUnit: tunit = make_function(*args, **kwargs) name, = tunit.callables_table return tunit.with_entrypoints(name) diff --git a/loopy/options.py b/loopy/options.py index d58421e3e..293670774 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -23,6 +23,7 @@ import os import re +from typing import Any from warnings import warn from pytools import ImmutableRecord @@ -214,7 +215,7 @@ def __init__( # All defaults are further required to be False when cast to bool # for the update() functionality to work. 
- self, **kwargs): + self, **kwargs: Any) -> None: kwargs = _apply_legacy_map(self._legacy_options_map, kwargs) diff --git a/loopy/transform/concatenate.py b/loopy/transform/concatenate.py index 8a4bb28fb..fcf2e07ca 100644 --- a/loopy/transform/concatenate.py +++ b/loopy/transform/concatenate.py @@ -85,6 +85,9 @@ def concatenate_arrays( axis_length += ary.shape[axis_nr] new_ary = arrays[0] + if not isinstance(new_ary.shape, tuple): + raise ValueError("one of the arrays has indeterminate shape") + new_shape = list(new_ary.shape) new_shape[axis_nr] = axis_length new_ary = new_ary.copy(shape=tuple(new_shape)) From ac7df5f6cc1176a3cb990fd63240068f6fda3a40 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:21:12 -0500 Subject: [PATCH 30/68] Reformat ArrayBase docs to use autoattribute --- loopy/kernel/array.py | 161 +++++++++++++++++++++--------------------- 1 file changed, 81 insertions(+), 80 deletions(-) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 43e1f86b5..c6b4565a4 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -620,105 +620,106 @@ def _parse_shape_or_strides(x): class ArrayBase(ImmutableRecord, Taggable): """ - .. attribute :: name + .. autoattribute:: name + .. autoattribute:: dtype + .. autoattribute:: shape + .. autoattribute:: dim_tags + .. autoattribute:: offset + .. autoattribute:: dim_names + .. autoattribute:: alignment + .. autoattribute:: tags - .. attribute :: dtype - - The :class:`loopy.types.LoopyType` of the array. If this is *None*, - :mod:`loopy` will try to continue without knowing the type of this - array, where the idea is that precise knowledge of the type will become - available at invocation time. Calling the kernel - (via :meth:`loopy.LoopKernel.__call__`) - automatically adds this type information based on invocation arguments. - - Note that some transformations, such as :func:`loopy.add_padding` - cannot be performed without knowledge of the exact *dtype*. - - .. 
attribute :: shape - - May be one of the following: - - * *None*. In this case, no shape is intended to be specified, - only the strides will be used to access the array. Bounds checking - will not be performed. - - * :class:`loopy.auto`. The shape will be determined by finding the - access footprint. - - * a tuple like like :attr:`numpy.ndarray.shape`. - - Each entry of the tuple is also allowed to be a :mod:`pymbolic` - expression involving kernel parameters, or a (potentially-comma - separated) or a string that can be parsed to such an expression. - - Any element of the shape tuple not used to compute strides - may be *None*. + .. automethod:: __init__ + .. automethod:: __eq__ + .. automethod:: num_user_axes + .. automethod:: num_target_axes + .. automethod:: vector_size - .. attribute:: dim_tags + (supports persistent hashing) + """ + name: str - See :ref:`data-dim-tags`. + dtype: Optional[LoopyType] + """The :class:`loopy.types.LoopyType` of the array. If this is *None*, + :mod:`loopy` will try to continue without knowing the type of this + array, where the idea is that precise knowledge of the type will become + available at invocation time. Calling the kernel + (via :meth:`loopy.LoopKernel.__call__`) + automatically adds this type information based on invocation arguments. + + Note that some transformations, such as :func:`loopy.add_padding` + cannot be performed without knowledge of the exact *dtype*. + """ - .. attribute:: offset + shape: Union[ShapeType, Type["auto"], None] + """ + May be one of the following: - Offset from the beginning of the buffer to the point from - which the strides are counted, in units of the :attr:`dtype`. - May be one of + * *None*. In this case, no shape is intended to be specified, + only the strides will be used to access the array. Bounds checking + will not be performed. - * 0 or None - * a string (that is interpreted as an argument name). 
- * a pymbolic expression - * :class:`loopy.auto`, in which case an offset argument - is added automatically, immediately following this argument. + * :class:`loopy.auto`. The shape will be determined by finding the + access footprint. - .. attribute:: dim_names + * a tuple like like :attr:`numpy.ndarray.shape`. - A tuple of strings providing names for the array axes, or *None*. - If given, must have the same number of entries as :attr:`dim_tags` - and :attr:`dim_tags`. These do not live in any particular namespace - (i.e. collide with no other names) and serve a purely - informational/documentational purpose. On occasion, they are used - to generate more informative names than could be achieved by - axis numbers. + Each entry of the tuple is also allowed to be a :mod:`pymbolic` + expression involving kernel parameters, or a (potentially-comma + separated) or a string that can be parsed to such an expression. - .. attribute:: alignment + Any element of the shape tuple not used to compute strides + may be *None*. + """ - Memory alignment of the array in bytes. For temporary arrays, - this ensures they are allocated with this alignment. For arguments, - this entails a promise that the incoming array obeys this alignment - restriction. + dim_tags: Optional[Sequence[ArrayDimImplementationTag]] + """See :ref:`data-dim-tags`. + """ - Defaults to *None*. + offset: Union[ExpressionT, str, None] + """Offset from the beginning of the buffer to the point from + which the strides are counted, in units of the :attr:`dtype`. + May be one of + + * 0 or None + * a string (that is interpreted as an argument name). + * a pymbolic expression + * :class:`loopy.auto`, in which case an offset argument + is added automatically, immediately following this argument. + """ - If an integer N is given, the array would be declared - with ``__attribute__((aligned(N)))`` in code generation for - :class:`loopy.CFamilyTarget`. 
+ dim_names: Optional[Tuple[str, ...]] + """A tuple of strings providing names for the array axes, or *None*. + If given, must have the same number of entries as :attr:`dim_tags` + and :attr:`dim_tags`. These do not live in any particular namespace + (i.e. collide with no other names) and serve a purely + informational/documentational purpose. On occasion, they are used + to generate more informative names than could be achieved by + axis numbers. + """ - .. versionadded:: 2018.1 + alignment: Optional[int] + """Memory alignment of the array in bytes. For temporary arrays, + this ensures they are allocated with this alignment. For arguments, + this entails a promise that the incoming array obeys this alignment + restriction. - .. attribute:: tags + Defaults to *None*. - A (possibly empty) frozenset of instances of - :class:`pytools.tag.Tag` intended for - consumption by an application. + If an integer N is given, the array would be declared + with ``__attribute__((aligned(N)))`` in code generation for + :class:`loopy.CFamilyTarget`. - .. versionadded:: 2020.2.2 + .. versionadded:: 2018.1 + """ - .. automethod:: __init__ - .. automethod:: __eq__ - .. automethod:: num_user_axes - .. automethod:: num_target_axes - .. automethod:: vector_size + tags: FrozenSet[Tag] + """A (possibly empty) frozenset of instances of + :class:`pytools.tag.Tag` intended for + consumption by an application. - (supports persistent hashing) + .. versionadded:: 2020.2.2 """ - name: str - dtype: Optional[LoopyType] - shape: Union[ShapeType, Type["auto"], None] - dim_tags: Optional[Sequence[ArrayDimImplementationTag]] - offset: Union[ExpressionT, str, None] - dim_names: Optional[Tuple[str, ...]] - alignment: Optional[int] - tags: FrozenSet[Tag] # Note that order may also wind up in attributes, if the # number of dimensions has not yet been determined. 
From d0633b6a03914009f8808796f35c6f7ea3c64d46 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:23:24 -0500 Subject: [PATCH 31/68] Drop a spurious mid-file docstring --- loopy/kernel/data.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index bdac071da..9946074cf 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -579,15 +579,6 @@ def supporting_names(self) -> FrozenSet[str]: ) -""" - :attribute tags: A (possibly empty) frozenset of instances of - :class:`pytools.tag.Tag` intended for consumption by an - application. - - ..versionadded: 2020.2.2 -""" - - class ValueArg(KernelArgument, Taggable): def __init__(self, name, dtype=None, approximately=1000, target=None, is_output=False, is_input=True, tags=None): From 4489a7b892ba6269597c0e98d12614f1273870bd Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:24:19 -0500 Subject: [PATCH 32/68] Type KernelArgument and subclass's constructors --- loopy/kernel/data.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 9946074cf..aeab22cc8 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -1,5 +1,7 @@ """Data used by the kernel object.""" +from __future__ import annotations + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -38,7 +40,6 @@ Union, cast, ) -from warnings import warn import numpy as np # noqa from immutables import Map @@ -61,8 +62,8 @@ VarAtomicity, make_assignment, ) -from loopy.types import LoopyType, auto -from loopy.typing import ExpressionT, ShapeType +from loopy.types import LoopyType, ToLoopyTypeConvertible +from loopy.typing import ExpressionT, ShapeType, auto __doc__ = """ @@ -390,12 +391,6 @@ class KernelArgument(ImmutableRecord): def __init__(self, **kwargs): kwargs["name"] = intern(kwargs.pop("name")) - target = kwargs.pop("target", None) - if target is not None: - 
warn("Passing 'target' is deprecated and will stop working in 2023. " - "It is already being ignored.", - DeprecationWarning, stacklevel=2) - dtype = kwargs.pop("dtype", None) for_atomic = kwargs.pop("for_atomic", False) @@ -521,7 +516,7 @@ def supporting_names(self) -> FrozenSet[str]: # Making this a function prevents incorrect use in isinstance. # Note: This is *not* deprecated, as it is super-common and # incrementally more convenient to use than ArrayArg directly. -def GlobalArg(*args, **kwargs): # noqa: N802 +def GlobalArg(*args, **kwargs) -> ArrayArg: # noqa: N802 address_space = kwargs.pop("address_space", None) if address_space is not None: raise TypeError("may not pass 'address_space' to GlobalArg") @@ -580,8 +575,14 @@ def supporting_names(self) -> FrozenSet[str]: class ValueArg(KernelArgument, Taggable): - def __init__(self, name, dtype=None, approximately=1000, target=None, - is_output=False, is_input=True, tags=None): + def __init__(self, + name: str, + dtype: ToLoopyTypeConvertible | None = None, + approximately: int = 1000, + is_output: bool = False, + is_input: bool = True, + tags: frozenset[Tag] | None = None, + ) -> None: """ :arg tags: A an instance of or Iterable of instances of :class:`pytools.tag.Tag` intended for consumption by an @@ -594,7 +595,6 @@ def __init__(self, name, dtype=None, approximately=1000, target=None, KernelArgument.__init__(self, name=name, dtype=dtype, approximately=approximately, - target=target, is_output=is_output, is_input=is_input, tags=tags) From 93974d0832ecbe885c391beea049b6cc582d8f51 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:24:44 -0500 Subject: [PATCH 33/68] Make SubstitutionRule a dataclass --- loopy/kernel/data.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index aeab22cc8..1796e01ff 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -862,35 +862,27 @@ def 
supporting_names(self) -> FrozenSet[str]: # {{{ substitution rule -class SubstitutionRule(ImmutableRecord): +@dataclass(frozen=True) +class SubstitutionRule: """ - .. attribute:: name - .. attribute:: arguments - - A tuple of strings - - .. attribute:: expression + .. autoattribute:: name + .. autoattribute:: arguments + .. autoattribute:: expression """ - def __init__(self, name, arguments, expression): - assert isinstance(arguments, tuple) - - ImmutableRecord.__init__(self, - name=name, arguments=arguments, expression=expression) + name: str + arguments: Sequence[str] + expression: ExpressionT - def __str__(self): - return "{}({}) := {}".format( - self.name, ", ".join(self.arguments), self.expression) + def copy(self, **kwargs: Any) -> SubstitutionRule: + return replace(self, **kwargs) def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - """ - key_builder.rec(key_hash, self.name) key_builder.rec(key_hash, self.arguments) key_builder.update_for_pymbolic_expression(key_hash, self.expression) + # }}} From 8eaec93be445afe814dbcdffcb6d01349afce784 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:25:40 -0500 Subject: [PATCH 34/68] Type make_assignment --- loopy/kernel/instruction.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index f64062e64..b355e3e70 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -24,7 +24,7 @@ from dataclasses import dataclass from functools import cached_property from sys import intern -from typing import FrozenSet, Mapping, Optional, Tuple, Type, Union +from typing import Any, FrozenSet, Mapping, Optional, Sequence, Tuple, Type, Union from warnings import warn import islpy as isl @@ -33,6 +33,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import Optional as 
LoopyOptional +from loopy.types import LoopyType from loopy.typing import ExpressionT @@ -1286,10 +1287,16 @@ def modify_assignee_for_array_call(assignee): "SubArrayRef as its inputs") -def make_assignment(assignees, expression, temp_var_types=None, **kwargs): +def make_assignment(assignees: tuple[ExpressionT, ...], + expression: ExpressionT, + temp_var_types: ( + Sequence[LoopyType | None] | None) = None, + **kwargs: Any) -> Assignment | CallInstruction: - if temp_var_types is None: - temp_var_types = (LoopyOptional(),) * len(assignees) + if temp_var_types is not None: + tv_types: Sequence[LoopyType | LoopyOptional | None] = temp_var_types + else: + tv_types = (LoopyOptional(),) * len(assignees) if len(assignees) != 1 or is_array_call(assignees, expression): atomicity = kwargs.pop("atomicity", ()) @@ -1319,7 +1326,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): assignees=tuple(modify_assignee_for_array_call( assignee) for assignee in assignees), expression=expression, - temp_var_types=temp_var_types, + temp_var_types=tuple(tv_types), **kwargs) else: def _is_array(expr): @@ -1339,10 +1346,13 @@ def _is_array(expr): raise LoopyError("Array calls only supported as instructions" " with function call as RHS for now.") + assignee, = assignees + tv_type, = tv_types + return Assignment( - assignee=assignees[0], + assignee=assignee, expression=expression, - temp_var_type=temp_var_types[0], + temp_var_type=tv_type, **kwargs) From 75bea1e6b2414becc110544101642f67e26475d3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:27:13 -0500 Subject: [PATCH 35/68] Reformat TemporaryVariable docs to use autoattribute --- loopy/kernel/data.py | 60 ++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 1796e01ff..22c9ce562 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -632,48 +632,42 @@ def get_arg_decl(self, 
ast_builder): class TemporaryVariable(ArrayBase): __doc__ = cast(str, ArrayBase.__doc__) + """ - .. attribute:: storage_shape - .. attribute:: base_indices - .. attribute:: address_space - - What memory this temporary variable lives in. - One of the values in :class:`AddressSpace`, - or :class:`loopy.auto` if this is - to be automatically determined. - - .. attribute:: base_storage - - The name of a storage array that is to be used to actually - hold the data in this temporary, or *None*. If not *None* or the name - of an existing variable, a variable of this name and appropriate size - will be created. - - .. attribute:: initializer - - *None* or a :class:`numpy.ndarray` of data to be used to initialize the - array. - - .. attribute:: read_only - - A :class:`bool` indicating whether the variable may be written during - its lifetime. If *True*, *initializer* must be given. - - .. attribute:: _base_storage_access_may_be_aliasing - - Whether the temporary is used to alias the underlying base storage. - Defaults to *False*. If *False*, C-based code generators will declare - the temporary as a ``restrict`` const pointer to the base storage - memory location. If *True*, the restrict part is omitted on this - declaration. + .. autoattribute:: storage_shape + .. autoattribute:: base_indices + .. autoattribute:: address_space + .. autoattribute:: base_storage + .. autoattribute:: initializer + .. autoattribute:: read_only + .. autoattribute:: _base_storage_access_may_be_aliasing """ storage_shape: Optional[ShapeType] base_indices: Optional[Tuple[ExpressionT, ...]] address_space: Union[AddressSpace, Type[auto]] base_storage: Optional[str] + """The name of a storage array that is to be used to actually + hold the data in this temporary, or *None*. If not *None* or the name + of an existing variable, a variable of this name and appropriate size + will be created. 
+ """ + initializer: Optional[np.ndarray] + """*None* or a :class:`numpy.ndarray` of data to be used to initialize the + array. + """ + read_only: bool + """A :class:`bool` indicating whether the variable may be written during + its lifetime. If *True*, *initializer* must be given. + """ + _base_storage_access_may_be_aliasing: bool + """Whether the temporary is used to alias the underlying base storage. + Defaults to *False*. If *False*, C-based code generators will declare + the temporary as a ``restrict`` const pointer to the base storage + memory location. If *True*, the restrict part is omitted on this + declaration. + """ min_target_axes: ClassVar[int] = 0 max_target_axes: ClassVar[int] = 1 From d858cad930fa8cc687643aad2e5025e43f4528a2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:27:49 -0500 Subject: [PATCH 36/68] Type TemporaryVariable methods --- loopy/kernel/array.py | 41 +++++++++++++++------------ loopy/kernel/data.py | 56 ++++++++++++++++++++++++++----------- loopy/kernel/instruction.py | 5 ++-- 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index c6b4565a4..d6a0126d0 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -45,6 +45,7 @@ from warnings import warn import numpy as np # noqa +from typing_extensions import TypeAlias from pytools import ImmutableRecord from pytools.tag import Tag, Taggable @@ -52,7 +53,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType -from loopy.typing import ExpressionT, ShapeType +from loopy.typing import ExpressionT, ShapeType, auto if TYPE_CHECKING: @@ -593,29 +594,33 @@ def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes, # {{{ array base class (for arguments and temporary arrays) -def _pymbolic_parse_if_necessary(x): - if isinstance(x, str): - from pymbolic import parse - return parse(x) - else: - return x +ToShapeLikeConvertible: 
TypeAlias = (Tuple[ExpressionT | str, ...] + | ExpressionT | type[auto] | str | tuple[str, ...]) -def _parse_shape_or_strides(x): - import loopy as lp +def _parse_shape_or_strides( + x: ToShapeLikeConvertible, + ) -> ShapeType | type[auto]: + from pymbolic import parse + if x == "auto": - warn("use of 'auto' as a shape or stride won't work " - "any more--use loopy.auto instead", - stacklevel=3) - x = _pymbolic_parse_if_necessary(x) - if isinstance(x, lp.auto): - return x - assert not isinstance(x, list) + raise ValueError("use of 'auto' as a shape or stride won't work " + "any more--use loopy.auto instead") + + if x is auto: + return auto + + if isinstance(x, str): + x = parse(x) + + if isinstance(x, list): + raise ValueError("shape can't be a list") + if not isinstance(x, tuple): - assert x is not lp.auto + assert x is not auto x = (x,) - return tuple(_pymbolic_parse_if_necessary(xi) for xi in x) + return tuple(parse(xi) if isinstance(xi, str) else xi for xi in x) class ArrayBase(ImmutableRecord, Taggable): diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 22c9ce562..aec7c6d97 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -682,11 +682,28 @@ class TemporaryVariable(ArrayBase): "_base_storage_access_may_be_aliasing", ) - def __init__(self, name, dtype=None, shape=auto, address_space=None, - dim_tags=None, offset=0, dim_names=None, strides=None, order=None, - base_indices=None, storage_shape=None, - base_storage=None, initializer=None, read_only=False, - _base_storage_access_may_be_aliasing=False, **kwargs): + def __init__( + self, + name: str, + dtype: ToLoopyTypeConvertible = None, + shape: Union[ShapeType, Type["auto"], None] = auto, + address_space: Union[AddressSpace, Type[auto], None] = None, + dim_tags: Optional[Sequence[ArrayDimImplementationTag]] = None, + offset: Union[ExpressionT, str, None] = 0, + dim_names: Optional[Tuple[str, ...]] = None, + strides: Optional[Tuple[ExpressionT, ...]] = None, + order: str | None = None, + + 
base_indices: Optional[Tuple[ExpressionT, ...]] = None, + storage_shape: ShapeType | None = None, + + base_storage: Optional[str] = None, + initializer: Optional[np.ndarray] = None, + read_only: bool = False, + + _base_storage_access_may_be_aliasing: bool = False, + **kwargs: Any + ) -> None: """ :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype` :arg shape: :class:`loopy.auto` or a shape tuple @@ -696,12 +713,6 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None, if address_space is None: address_space = auto - if address_space is None: - raise LoopyError( - "temporary variable '%s': " - "address_space must not be None" - % name) - if initializer is None: pass elif isinstance(initializer, np.ndarray): @@ -736,7 +747,12 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None, if order is None: order = "C" - if base_indices is None and shape is not auto: + if shape is not None: + from loopy.kernel.array import _parse_shape_or_strides + shape = _parse_shape_or_strides(shape) + + if base_indices is None and shape is not auto and shape is not None: + assert isinstance(shape, tuple) base_indices = (0,) * len(shape) if not read_only and initializer is not None: @@ -775,7 +791,7 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None, _base_storage_access_may_be_aliasing), **kwargs) - def copy(self, **kwargs): + def copy(self, **kwargs: Any) -> TemporaryVariable: address_space = kwargs.pop("address_space", None) if address_space is not None: @@ -784,15 +800,23 @@ def copy(self, **kwargs): return super().copy(**kwargs) @property - def nbytes(self): - shape = self.shape + def nbytes(self) -> ExpressionT: if self.storage_shape is not None: shape = self.storage_shape + else: + if self.shape is None: + raise ValueError("shape is None") + if self.shape is auto: + raise ValueError("shape is auto") + shape = cast(Tuple[ExpressionT], self.shape) + + if self.dtype is None: + raise ValueError("data type is indeterminate") from 
pytools import product return product(si for si in shape)*self.dtype.itemsize - def __str__(self): + def __str__(self) -> str: if self.address_space is auto: aspace_str = "auto" else: diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index b355e3e70..28c75c31f 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -405,8 +405,6 @@ def __init__(self, # The Taggable constructor call does extra validation. tags=tags) - Taggable.__init__(self, tags) - def get_copy_kwargs(self, **kwargs): passed_depends_on = "depends_on" in kwargs @@ -943,7 +941,8 @@ def __init__(self, predicates: Optional[FrozenSet[str]] = None, tags: Optional[FrozenSet[Tag]] = None, temp_var_type: Union[ - Type[_not_provided], None, LoopyOptional] = _not_provided, + Type[_not_provided], None, LoopyOptional, + LoopyType] = _not_provided, atomicity: Tuple[VarAtomicity, ...] = (), *, depends_on: Union[FrozenSet[str], str, None] = None, From ea29c69613f405bd97456a78b3f7c05c1b1b7657 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 6 Aug 2024 14:28:19 -0500 Subject: [PATCH 37/68] Type infer_arg_descr --- loopy/preprocess.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 7176d9d15..3293e9a1e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -49,7 +49,11 @@ auto, filter_iname_tags_by_type, ) -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.function_interface import ( + ArgDescriptor, + CallableKernel, + ScalarCallable, +) # from loopy.transform.iname import remove_any_newly_unused_inames from loopy.kernel.instruction import ( @@ -655,7 +659,7 @@ def traverse_to_infer_arg_descr(kernel, callables_table): return descr_inferred_kernel, arg_descr_inf_mapper.clbl_inf_ctx -def infer_arg_descr(program): +def infer_arg_descr(t_unit: TranslationUnit) -> TranslationUnit: """ Returns a copy of *program* with the 
:attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the @@ -666,12 +670,12 @@ def infer_arg_descr(program): from loopy.kernel.function_interface import ArrayArgDescriptor, ValueArgDescriptor from loopy.translation_unit import make_clbl_inf_ctx, resolve_callables - program = resolve_callables(program) + t_unit = resolve_callables(t_unit) - clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, - program.entrypoints) + clbl_inf_ctx = make_clbl_inf_ctx(t_unit.callables_table, + t_unit.entrypoints) - for e in program.entrypoints: + for e in t_unit.entrypoints: def _tuple_or_none(s): if isinstance(s, tuple): return s @@ -680,8 +684,8 @@ def _tuple_or_none(s): else: return s, - arg_id_to_descr = {} - for arg in program[e].args: + arg_id_to_descr: dict[str, ArgDescriptor] = {} + for arg in t_unit[e].args: if isinstance(arg, ArrayBase): if arg.shape not in (None, auto): arg_id_to_descr[arg.name] = ArrayArgDescriptor( @@ -691,12 +695,12 @@ def _tuple_or_none(s): arg_id_to_descr[arg.name] = ValueArgDescriptor() else: raise NotImplementedError() - new_callable, clbl_inf_ctx = program.callables_table[e].with_descrs( + new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_descrs( arg_id_to_descr, clbl_inf_ctx) clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, is_entrypoint=True) - return clbl_inf_ctx.finish_program(program) + return clbl_inf_ctx.finish_program(t_unit) # }}} From b22c45ed7a84c66cc7a5aeb952f4de783205dff6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 12 Aug 2024 20:50:09 +0200 Subject: [PATCH 38/68] Fix up references in documentation --- doc/ref_other.rst | 5 +++++ loopy/kernel/array.py | 21 +++++++++++++++++---- loopy/kernel/data.py | 5 +++-- loopy/statistics.py | 2 +- loopy/symbolic.py | 29 ++++++++++++++++++----------- loopy/translation_unit.py | 1 + loopy/typing.py | 25 +++++++++++++++++++------ pyproject.toml | 2 +- 8 files changed, 65 insertions(+), 25 deletions(-) diff --git a/doc/ref_other.rst 
b/doc/ref_other.rst index e5059380d..538f0cdb9 100644 --- a/doc/ref_other.rst +++ b/doc/ref_other.rst @@ -1,6 +1,11 @@ Reference: Other Functionality ============================== +Auxiliary Data Types +-------------------- + +.. automodule:: loopy.typing + Obtaining Kernel Performance Statistics --------------------------------------- diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index d6a0126d0..884c26d2f 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1,5 +1,3 @@ -"""Implementation tagging of array axes.""" - from __future__ import annotations @@ -70,8 +68,6 @@ __doc__ = """ -.. currentmodule:: loopy.kernel.array - .. autoclass:: ArrayDimImplementationTag .. autoclass:: _StrideArrayDimTagBase @@ -85,6 +81,23 @@ .. autoclass:: VectorArrayDimTag .. autofunction:: parse_array_dim_tags + +Cross-references +---------------- + +(This section shouldn't exist: Sphinx should be able to resolve these on its own.) + +.. class:: ShapeType + + See :class:`loopy.typing.ShapeType` + +.. class:: ExpressionT + + See :class:`loopy.typing.ExpressionT` + +.. class:: Tag + + See :class:`pytools.tag.Tag` """ diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index aec7c6d97..d2d80bedf 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -41,7 +41,8 @@ cast, ) -import numpy as np # noqa +import numpy # FIXME: imported as numpy to allow sphinx to resolve things +import numpy as np from immutables import Map from pytools import ImmutableRecord @@ -651,7 +652,7 @@ class TemporaryVariable(ArrayBase): will be created. """ - initializer: Optional[np.ndarray] + initializer: Optional[numpy.ndarray] """*None* or a :class:`numpy.ndarray` of data to be used to initialize the array. """ diff --git a/loopy/statistics.py b/loopy/statistics.py index 0bd1340c1..29ea91259 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -709,7 +709,7 @@ class MemAccess(ImmutableRecord): .. 
attribute:: variable_tags A :class:`frozenset` of subclasses of :class:`~pytools.tag.Tag` - that reflects :attr:`~loopy.symbolic.TaggedVariable.tags` of + that reflects :attr:`~loopy.TaggedVariable.tags` of an accessed variable. .. attribute:: count_granularity diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f2f04f9ae..22dbd3bf5 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -35,6 +35,7 @@ import numpy as np import islpy as isl +import pymbolic.primitives # FIXME: also import by full name to allow sphinx to resolve import pymbolic.primitives as p import pytools.lex from islpy import dim_type @@ -60,7 +61,7 @@ from pymbolic.mapper.unifier import UnidirectionalUnifier as UnidirectionalUnifierBase from pymbolic.parser import Parser as ParserBase from pytools import ImmutableRecord, memoize, memoize_method, memoize_on_first_arg -from pytools.tag import Taggable +from pytools.tag import Tag, Taggable from loopy.diagnostic import ( ExpressionToAffineConversionError, @@ -76,8 +77,6 @@ __doc__ = """ -.. currentmodule:: loopy.symbolic - Loopy-specific expression types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -89,6 +88,8 @@ .. autoclass:: TypedCSE +.. currentmodule:: loopy + .. autoclass:: TypeCast .. autoclass:: TaggedVariable @@ -97,6 +98,8 @@ .. autoclass:: LinearSubscript +.. currentmodule:: loopy.symbolic + .. autoclass:: RuleArgument .. autoclass:: ExpansionState @@ -686,13 +689,7 @@ class TaggedVariable(LoopyExpressionBase, p.Variable, Taggable): may then be used to address these uses--such as by prefetching only accesses tagged a certain way. - .. attribute:: tags - - A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to - provide metadata on this object. Legacy string tags are converted to - :class:`~loopy.LegacyStringInstructionTag` or, if they used to carry - a functional meaning, the tag carrying that same functional meaning - (e.g. :class:`~loopy.UseStreamingStoreTag`). + .. 
autoattribute:: tags Inherits from :class:`pymbolic.primitives.Variable` and :class:`pytools.tag.Taggable`. @@ -700,6 +697,14 @@ class TaggedVariable(LoopyExpressionBase, p.Variable, Taggable): init_arg_names = ("name", "tags") + tags: frozenset[Tag] + """A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to + provide metadata on this object. Legacy string tags are converted to + :class:`~loopy.LegacyStringInstructionTag` or, if they used to carry + a functional meaning, the tag carrying that same functional meaning + (e.g. :class:`~loopy.UseStreamingStoreTag`). + """ + def __init__(self, name, tags): p.Variable.__init__(self, name) if isinstance(tags, str): @@ -744,6 +749,7 @@ class Reduction(LoopyExpressionBase): expr: ExpressionT """An expression which may have tuple type. If the expression has tuple type, it must be one of the following: + * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or * a :class:`loopy.symbolic.Reduction`, or * a function call or substitution rule invocation. @@ -756,7 +762,8 @@ class Reduction(LoopyExpressionBase): def __init__(self, operation: ReductionOperation | str, - inames: tuple[str | p.Variable, ...] | p.Variable | str, + inames: (tuple[str | pymbolic.primitives.Variable, ...] + | pymbolic.primitives.Variable | str), expr: ExpressionT, allow_simultaneous: bool = False ) -> None: diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 801ec2964..5a4888936 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -88,6 +88,7 @@ .. autofunction:: for_each_kernel +.. autoclass:: TUnitOrKernelT """ diff --git a/loopy/typing.py b/loopy/typing.py index 948616578..cbf417d2f 100644 --- a/loopy/typing.py +++ b/loopy/typing.py @@ -1,3 +1,15 @@ +""" +.. autoclass:: IntegralT +.. autoclass:: FloatT +.. autoclass:: ExpressionT +.. autoclass:: ShapeType +.. 
autoclass:: auto +""" + + +from __future__ import annotations + + __copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees" __license__ = """ @@ -24,25 +36,26 @@ from typing import Optional, Tuple, TypeVar, Union import numpy as np +from typing_extensions import TypeAlias from pymbolic.primitives import Expression -IntegralT = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8, +IntegralT: TypeAlias = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64] -FloatT = Union[float, complex, np.float32, np.float64, np.complex64, +FloatT: TypeAlias = Union[float, complex, np.float32, np.float64, np.complex64, np.complex128] -ExpressionT = Union[IntegralT, FloatT, Expression] -ShapeType = Tuple[ExpressionT, ...] -StridesType = ShapeType +ExpressionT: TypeAlias = Union[IntegralT, FloatT, Expression] +ShapeType: TypeAlias = Tuple[ExpressionT, ...] +StridesType: TypeAlias = ShapeType class auto: # noqa """A generic placeholder object for something that should be automatically determined. See, for example, the *shape* or *strides* argument of - :class:`ArrayArg`. + :class:`~loopy.ArrayArg`. """ diff --git a/pyproject.toml b/pyproject.toml index a0ec51c81..8d4d51cd8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "pyrsistent", "immutables", - # for Self + # for Self, TypeAlias "typing-extensions>=4; python_version<'3.12'", ] [project.optional-dependencies] From 8b73fedeea9b380c3cf362387800dacb169282c6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 18 Aug 2024 14:09:22 +0200 Subject: [PATCH 39/68] Fix Github CI README badge --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 1ef7773db..87c4887a3 100644 --- a/README.rst +++ b/README.rst @@ -4,9 +4,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code .. 
image:: https://gitlab.tiker.net/inducer/loopy/badges/main/pipeline.svg :alt: Gitlab Build Status :target: https://gitlab.tiker.net/inducer/loopy/commits/main -.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main&event=push +.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main :alt: Github Build Status - :target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI+event%3Apush + :target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI .. image:: https://badge.fury.io/py/loopy.png :alt: Python Package Index Release Page :target: https://pypi.org/project/loopy/ From 5c025def05fa4e341ec9b62f33f19377d78abacf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 18 Aug 2024 14:10:08 +0200 Subject: [PATCH 40/68] Let ruff refactor some comprehensions --- examples/python/ispc-stream-harness.py | 5 +---- test/test_target.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 070e0d071..bf6e29e47 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -24,10 +24,7 @@ def transform(knl, vars, stream_dtype): knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1)) knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") - knl = lp.add_and_infer_dtypes(knl, { - var: stream_dtype - for var in vars - }) + knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype)) knl = lp.set_argument_order(knl, vars + ["n"]) diff --git a/test/test_target.py b/test/test_target.py index 6c448debf..08bf286cd 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -314,10 +314,7 @@ def test_ispc_streaming_stores(): knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") knl = lp.tag_instructions(knl, "!streaming_store") - knl = lp.add_and_infer_dtypes(knl, { - var: stream_dtype - for var in vars - }) + knl = lp.add_and_infer_dtypes(knl, 
dict.fromkeys(vars, stream_dtype)) knl = lp.set_argument_order(knl, vars + ["n"]) From 2410e6ccb8f77292426f4eba0ee72ebe4428a9fd Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 18 Aug 2024 14:13:07 +0200 Subject: [PATCH 41/68] Type for_each_kernel, add check_each kernel, mostly type loopy.check --- loopy/check.py | 192 +++++++++++++++++++++------------ loopy/transform/add_barrier.py | 2 +- loopy/translation_unit.py | 59 +++++++--- 3 files changed, 171 insertions(+), 82 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index c23aa4582..31bbd7b0f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -22,6 +22,7 @@ import logging from collections import defaultdict +from collections.abc import Mapping, Sequence from functools import reduce from typing import List, Optional, Tuple, Union @@ -43,7 +44,14 @@ FixedStrideArrayDimTag, SeparateArrayArrayDimTag, ) -from loopy.kernel.data import ArrayArg, ArrayDimImplementationTag, auto +from loopy.kernel.data import ( + AddressSpace, + ArrayArg, + ArrayDimImplementationTag, + InameImplementationTag, + TemporaryVariable, + auto, +) from loopy.kernel.function_interface import CallableKernel from loopy.kernel.instruction import ( CallInstruction, @@ -52,10 +60,14 @@ NoOpInstruction, _DataObliviousInstruction, ) -from loopy.symbolic import CombineMapper, ResolvedFunction, WalkMapper -from loopy.translation_unit import TranslationUnit, for_each_kernel +from loopy.symbolic import CombineMapper, ResolvedFunction, SubArrayRef, WalkMapper +from loopy.translation_unit import ( + CallablesTable, + TranslationUnit, + check_each_kernel, +) from loopy.type_inference import TypeReader -from loopy.typing import ExpressionT +from loopy.typing import ExpressionT, not_none logger = logging.getLogger(__name__) @@ -145,8 +157,8 @@ def map_constant(self, expr): map_nan = map_constant -@for_each_kernel -def check_functions_are_resolved(kernel): +@check_each_kernel +def check_functions_are_resolved(kernel: LoopKernel) -> None: """ 
Checks if all call nodes in the *kernel* expression have been resolved. """ @@ -167,7 +179,7 @@ def check_functions_are_resolved(kernel): raise NotImplementedError(type(insn)) -@for_each_kernel +@check_each_kernel def check_separated_array_consistency(kernel: LoopKernel) -> None: # Boo. This is (part of) the price of redundant representation. for arg in kernel.args: @@ -198,7 +210,7 @@ def check_separated_array_consistency(kernel: LoopKernel) -> None: f"'{sub_arg.name}' is not consistent.") -@for_each_kernel +@check_each_kernel def check_offsets_and_dim_tags(kernel: LoopKernel) -> None: from pymbolic.primitives import Expression, Variable @@ -357,8 +369,8 @@ def check_for_integer_subscript_indices(t_unit): raise NotImplementedError(type(clbl).__name__) -@for_each_kernel -def check_sub_array_ref_inames_not_within_or_redn_inames(kernel): +@check_each_kernel +def check_sub_array_ref_inames_not_within_or_redn_inames(kernel: LoopKernel) -> None: all_within_inames = frozenset().union(*(insn.within_inames for insn in kernel.instructions)) all_redn_inames = frozenset().union(*(insn.reduction_inames() @@ -379,8 +391,8 @@ def check_sub_array_ref_inames_not_within_or_redn_inames(kernel): " illegal.") -@for_each_kernel -def check_insn_attributes(kernel): +@check_each_kernel +def check_insn_attributes(kernel: LoopKernel) -> None: """ Check for legality of attributes of every instruction in *kernel*. """ @@ -413,8 +425,8 @@ def check_insn_attributes(kernel): ", ".join(no_sync_with_scopes - VALID_NOSYNC_SCOPES))) -@for_each_kernel -def check_for_duplicate_insn_ids(knl): +@check_each_kernel +def check_for_duplicate_insn_ids(knl: LoopKernel) -> None: """ Check if multiple instructions of *knl* have the same :attr:`loopy.InstructionBase.id`. 
@@ -429,8 +441,8 @@ def check_for_duplicate_insn_ids(knl): insn_ids.add(insn.id) -@for_each_kernel -def check_loop_priority_inames_known(kernel): +@check_each_kernel +def check_loop_priority_inames_known(kernel: LoopKernel) -> None: """ Checks if the inames in :attr:`loopy.LoopKernel.loop_priority` are part of the *kernel*'s domain. @@ -441,8 +453,8 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) -@for_each_kernel -def check_multiple_tags_allowed(kernel): +@check_each_kernel +def check_multiple_tags_allowed(kernel: LoopKernel) -> None: """ Checks if a multiple tags of an iname are compatible. """ @@ -466,7 +478,10 @@ def check_multiple_tags_allowed(kernel): "tags: {}".format(iname.name, iname.tags)) -def _check_for_double_use_of_hw_axes_inner(kernel, callables_table): +def _check_for_double_use_of_hw_axes_inner( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: from loopy.kernel.data import GroupInameTag, LocalInameTag, UniqueInameTag from loopy.kernel.instruction import CallInstruction from loopy.symbolic import ResolvedFunction @@ -490,7 +505,7 @@ def _check_for_double_use_of_hw_axes_inner(kernel, callables_table): insn_tag_keys.add(key) -def check_for_double_use_of_hw_axes(t_unit): +def check_for_double_use_of_hw_axes(t_unit: TranslationUnit) -> None: """ Check if any instruction of *kernel* is within multiple inames tagged with the same hw axis tag. @@ -506,8 +521,8 @@ def check_for_double_use_of_hw_axes(t_unit): raise NotImplementedError(type(clbl).__name__) -@for_each_kernel -def check_for_inactive_iname_access(kernel): +@check_each_kernel +def check_for_inactive_iname_access(kernel: LoopKernel) -> None: """ Check if any instruction accesses an iname but is not within it. 
""" @@ -524,8 +539,8 @@ def check_for_inactive_iname_access(kernel): - insn.within_inames), kernel.name)) -@for_each_kernel -def check_for_unused_inames(kernel): +@check_each_kernel +def check_for_unused_inames(kernel: LoopKernel) -> None: """ Check if there are any unused inames in the kernel. """ @@ -541,7 +556,7 @@ def check_for_unused_inames(kernel): % unused_inames) -def _is_racing_iname_tag(tv, tag): +def _is_racing_iname_tag(tv: TemporaryVariable, tag: InameImplementationTag) -> bool: from loopy.kernel.data import ( AddressSpace, ConcurrentTag, @@ -572,8 +587,8 @@ def _is_racing_iname_tag(tv, tag): "temporary variable '%s'" % tv.name) -@for_each_kernel -def check_for_write_races(kernel): +@check_each_kernel +def check_for_write_races(kernel: LoopKernel) -> None: """ Check if any memory accesses lead to write races. """ @@ -621,8 +636,8 @@ def check_for_write_races(kernel): WriteRaceConditionWarning) -@for_each_kernel -def check_for_data_dependent_parallel_bounds(kernel): +@check_each_kernel +def check_for_data_dependent_parallel_bounds(kernel: LoopKernel) -> None: """ Check that inames tagged as hw axes have bounds that are known at kernel launch. 
@@ -849,7 +864,7 @@ def map_call(self, expr, domain, insn_id): _check_bounds_inner_rec(subkernel, self.callables_table) -def _check_bounds_inner(kernel, callables_table): +def _check_bounds_inner(kernel: LoopKernel, callables_table: CallablesTable) -> None: from loopy.kernel.instruction import get_insn_domain temp_var_names = set(kernel.temporary_variables) @@ -875,7 +890,10 @@ def run_acm(expr): insn.with_transformed_expressions(run_acm) -def _check_bounds_inner_rec(kernel, callables_table): +def _check_bounds_inner_rec( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: if kernel.options.enforce_array_accesses_within_bounds not in [ "no_check", True, @@ -900,7 +918,7 @@ def _check_bounds_inner_rec(kernel, callables_table): warn_with_kernel(kernel, "array_access_out_of_bounds", str(e)) -def check_bounds(t_unit): +def check_bounds(t_unit: TranslationUnit) -> None: """ Performs out-of-bound check for every array access. """ @@ -913,8 +931,8 @@ def check_bounds(t_unit): # {{{ check write destinations -@for_each_kernel -def check_write_destinations(kernel): +@check_each_kernel +def check_write_destinations(kernel: LoopKernel) -> None: for insn in kernel.instructions: for wvar in insn.assignee_var_names(): if wvar in kernel.all_inames(): @@ -941,8 +959,8 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting -@for_each_kernel -def check_has_schedulable_iname_nesting(kernel): +@check_each_kernel +def check_has_schedulable_iname_nesting(kernel: LoopKernel) -> None: from loopy.transform.iname import ( get_iname_duplication_options, has_schedulable_iname_nesting, @@ -989,8 +1007,8 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync -def _get_address_space(kernel, var): - from loopy.kernel.data import AddressSpace, ArrayArg, ValueArg +def _get_address_space(kernel: LoopKernel, var: str) -> AddressSpace | type[auto]: + from loopy.kernel.data import ArrayArg, ValueArg if var in 
kernel.temporary_variables: address_space = kernel.temporary_variables[var].address_space else: @@ -1006,7 +1024,7 @@ def _get_address_space(kernel, var): return address_space -def _get_topological_order(kernel): +def _get_topological_order(kernel: LoopKernel) -> Sequence[str]: """ Returns a :class:`list` of insn ids of *kernel* in a topological sort order. @@ -1034,7 +1052,7 @@ def _get_topological_order(kernel): return order -def _check_variable_access_ordered_inner(kernel): +def _check_variable_access_ordered_inner(kernel: LoopKernel) -> None: from loopy.kernel.tools import find_aliasing_equivalence_classes from loopy.symbolic import AccessRangeOverlapChecker overlap_checker = AccessRangeOverlapChecker(kernel) @@ -1051,7 +1069,7 @@ def _check_variable_access_ordered_inner(kernel): # the mapping in both directions. # # Note: This can be worst-case O(n^2) in the number of instructions. - dep_reqs_to_vars = {} + dep_reqs_to_vars: dict[tuple[str, str], set[str]] = {} wmap = kernel.writer_map() rmap = kernel.reader_map() @@ -1082,14 +1100,16 @@ def _check_variable_access_ordered_inner(kernel): # {{{ compute rev_depends, depends_on # depends_on: mapping from insn_ids to their dependencies - depends_on = {insn.id: set() for insn in kernel.instructions} + depends_on: dict[str, set[str]] = { + not_none(insn.id): set() for insn in kernel.instructions} # rev_depends: mapping from insn_ids to their reverse deps. 
- rev_depends = {insn.id: set() for insn in kernel.instructions} + rev_depends: dict[str, set[str]] = { + not_none(insn.id): set() for insn in kernel.instructions} for insn in kernel.instructions: - depends_on[insn.id].update(insn.depends_on) + depends_on[not_none(insn.id)].update(insn.depends_on) for dep in insn.depends_on: - rev_depends[dep].add(insn.id) + rev_depends[dep].add(not_none(insn.id)) # }}} @@ -1097,7 +1117,8 @@ def _check_variable_access_ordered_inner(kernel): topological_order = _get_topological_order(kernel) - def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order): + # TODO: Type this + def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order) -> None: """ Considering a graph defined by *edges* (as ``key -> value``), remove pairs of nodes from *dep_reqs_to_vars* for which edges @@ -1215,8 +1236,8 @@ def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order): # }}} -@for_each_kernel -def check_variable_access_ordered(kernel): +@check_each_kernel +def check_variable_access_ordered(kernel: LoopKernel) -> None: """Checks that between each write to a variable and all other accesses to the variable there is either: @@ -1253,7 +1274,7 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(t_unit): +def pre_schedule_checks(t_unit: TranslationUnit) -> None: try: logger.debug("pre-schedule checks start for entrypoints: " f"{t_unit.entrypoints}.") @@ -1331,8 +1352,11 @@ def check_for_nested_base_storage(kernel: LoopKernel) -> None: # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, - sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk( + kernel: LoopKernel, + callables_table: CallablesTable, + sched_index: int | None = None + ) -> int: from loopy.schedule import ( Barrier, CallKernel, @@ -1343,6 +1367,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, gather_schedule_block, get_insn_ids_for_block_at, ) + assert kernel.linearization is 
not None if sched_index is None: group_axes = set() @@ -1443,7 +1468,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, return past_end_i -def check_for_unused_hw_axes_in_insns(kernel, callables_table): +def check_for_unused_hw_axes_in_insns( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: if kernel.linearization: _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table) @@ -1453,7 +1481,9 @@ def check_for_unused_hw_axes_in_insns(kernel, callables_table): # {{{ check that atomic ops are used exactly on atomic arrays -def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): +def check_that_atomic_ops_are_used_exactly_on_atomic_arrays( + kernel: LoopKernel + ) -> None: from loopy.kernel.data import ArrayBase, Assignment from loopy.types import AtomicType atomicity_candidates = ( @@ -1488,7 +1518,9 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used -def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): +def check_that_temporaries_are_defined_in_subkernels_where_used( + kernel: LoopKernel + ) -> None: from loopy.kernel.data import AddressSpace from loopy.kernel.tools import get_subkernels @@ -1541,9 +1573,10 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): # {{{ check that all instructions are scheduled -def check_that_all_insns_are_scheduled(kernel): +def check_that_all_insns_are_scheduled(kernel: LoopKernel) -> None: + assert kernel.linearization is not None - all_schedulable_insns = {insn.id for insn in kernel.instructions} + all_schedulable_insns = {not_none(insn.id) for insn in kernel.instructions} from loopy.schedule import sched_item_to_insn_id scheduled_insns = { insn_id @@ -1563,7 +1596,7 @@ def check_that_all_insns_are_scheduled(kernel): # {{{ check that shapes and strides are arguments -def check_that_shapes_and_strides_are_arguments(kernel): +def 
check_that_shapes_and_strides_are_arguments(kernel: LoopKernel) -> None: import loopy as lp from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag from loopy.kernel.data import ValueArg @@ -1573,12 +1606,12 @@ def check_that_shapes_and_strides_are_arguments(kernel): arg.name for arg in kernel.args if isinstance(arg, ValueArg) - and arg.dtype.is_integral()} + and not_none(arg.dtype).is_integral()} for arg in kernel.args: if isinstance(arg, ArrayBase): if isinstance(arg.shape, tuple): - shape_deps = set() + shape_deps: set[str] = set() for shape_axis in arg.shape: if shape_axis is not None: shape_deps.update(get_dependencies(shape_axis)) @@ -1607,14 +1640,21 @@ def check_that_shapes_and_strides_are_arguments(kernel): # {{{ validate_kernel_call_sites -def _get_sub_array_ref_swept_range(kernel, sar): +def _get_sub_array_ref_swept_range( + kernel: LoopKernel, + sar: SubArrayRef + ) -> isl.Set: from loopy.symbolic import get_access_map domain = kernel.get_inames_domain(frozenset({iname_var.name for iname_var in sar.swept_inames})) return get_access_map(domain, sar.swept_inames, kernel.assumptions).range() -def _are_sub_array_refs_equivalent(sar1, sar2, caller): +def _are_sub_array_refs_equivalent( + sar1: SubArrayRef, + sar2: SubArrayRef, + caller: LoopKernel + ) -> bool: """ Returns *True* iff *sar1* and *sar2* are equivalent :class:`loopy.SubArrayRef`s. 
@@ -1657,7 +1697,11 @@ def _are_sub_array_refs_equivalent(sar1, sar2, caller): return True -def _validate_kernel_call_insn(caller, call_insn, callee): +def _validate_kernel_call_insn( + caller: LoopKernel, + call_insn: CallInstruction, + callee: LoopKernel + ) -> None: assert call_insn.expression.function.name == callee.name from loopy.kernel.array import ArrayBase from loopy.symbolic import SubArrayRef @@ -1707,7 +1751,10 @@ def _validate_kernel_call_insn(caller, call_insn, callee): f" (got {in_val}, {out_val}).") -def _validate_kernel_call_sites_inner(kernel, callables): +def _validate_kernel_call_sites_inner( + kernel: LoopKernel, + callables: CallablesTable, + ) -> None: from pymbolic.primitives import Call from loopy.kernel.function_interface import CallableKernel @@ -1739,8 +1786,10 @@ def validate_kernel_call_sites(translation_unit: TranslationUnit) -> None: # {{{ check_all_callees_have_same_index_dtype -def check_all_callees_have_same_index_dtype(epoint: LoopKernel, - callables_table): +def check_all_callees_have_same_index_dtype( + epoint: LoopKernel, + callables_table: CallablesTable + ) -> None: from loopy.kernel.function_interface import CallableKernel epoint_clbl = callables_table[epoint.name] @@ -1757,7 +1806,10 @@ def check_all_callees_have_same_index_dtype(epoint: LoopKernel, # }}} -def pre_codegen_entrypoint_checks(kernel, callables_table): +def pre_codegen_entrypoint_checks( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: logger.debug("pre-codegen entrypoint check %s: start" % kernel.name) kernel.target.pre_codegen_entrypoint_check(kernel, callables_table) @@ -1780,7 +1832,7 @@ def pre_codegen_callable_checks(kernel, callables_table): logger.debug("pre-codegen callable check %s: done" % kernel.name) -def pre_codegen_checks(t_unit): +def pre_codegen_checks(t_unit: TranslationUnit) -> None: from loopy.kernel.function_interface import CallableKernel try: @@ -1803,7 +1855,11 @@ def pre_codegen_checks(t_unit): # {{{ sanity-check 
for implemented domains of each instruction -def check_implemented_domains(kernel, implemented_domains, code=None): +def check_implemented_domains( + kernel: LoopKernel, + implemented_domains: Mapping[str, isl.Set], + code: str | None = None, + ) -> bool: from islpy import align_two, dim_type last_idomains = None diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index 73bd5cdde..7ab5e376e 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -91,7 +91,7 @@ def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, new_kernel = kernel.copy(instructions=kernel.instructions + [barrier_to_add]) if insn_after is not None: - new_kernel = add_dependency(kernel=new_kernel, + new_kernel = add_dependency(new_kernel, insn_match=insn_after, depends_on="id:"+id) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 5a4888936..4afdfcef7 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -40,7 +40,7 @@ from warnings import warn from immutables import Map -from typing_extensions import Self +from typing_extensions import Concatenate, ParamSpec, Self from pymbolic.primitives import Call, Variable @@ -86,9 +86,16 @@ .. autofunction:: make_program +.. autofunction:: check_each_kernel + .. autofunction:: for_each_kernel .. autoclass:: TUnitOrKernelT + +.. class:: P + + A :class:`typing.ParamSpec` for use in annotating :func:`for_each_kernel` and + :func:`check_each_kernel`. 
""" @@ -760,21 +767,46 @@ def make_program(kernel: LoopKernel) -> TranslationUnit: entrypoints=frozenset()) -def for_each_kernel(transform): +P = ParamSpec("P") + + +def check_each_kernel( + check: Callable[Concatenate[LoopKernel, P], None] + ) -> Callable[Concatenate[TranslationUnit, P], None]: + def _collective_check( + t_unit_or_kernel: TranslationUnit | LoopKernel, /, + *args: P.args, + **kwargs: P.kwargs + ) -> None: + if isinstance(t_unit_or_kernel, TranslationUnit): + for clbl in t_unit_or_kernel.callables_table.values(): + if isinstance(clbl, CallableKernel): + check(clbl.subkernel, *args, **kwargs) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError(f"{type(clbl)}") + elif isinstance(t_unit_or_kernel, LoopKernel): + check(t_unit_or_kernel, *args, **kwargs) + else: + raise TypeError("expected LoopKernel or TranslationUnit") + + return wraps(check)(_collective_check) + + +def for_each_kernel( + transform: Callable[Concatenate[LoopKernel, P], LoopKernel] + ) -> Callable[Concatenate[TUnitOrKernelT, P], TUnitOrKernelT]: """ Function wrapper for transformations of the type ``transform(kernel: LoopKernel, *args, **kwargs) -> LoopKernel``. Returns a function that would apply *transform* to all callable kernels in a :class:`loopy.TranslationUnit`. 
""" - def _collective_transform(*args, **kwargs): - if "translation_unit" in kwargs: - t_unit_or_kernel = kwargs.pop("translation_unit") - elif "kernel" in kwargs: - t_unit_or_kernel = kwargs.pop("kernel") - else: - t_unit_or_kernel = args[0] - args = args[1:] - + def _collective_transform( + t_unit_or_kernel: TUnitOrKernelT, /, + *args: P.args, + **kwargs: P.kwargs + ) -> TUnitOrKernelT: if isinstance(t_unit_or_kernel, TranslationUnit): t_unit = t_unit_or_kernel new_callables = {} @@ -790,10 +822,11 @@ def _collective_transform(*args, **kwargs): new_callables[func_id] = clbl return t_unit.copy(callables_table=Map(new_callables)) - else: - assert isinstance(t_unit_or_kernel, LoopKernel) + elif isinstance(t_unit_or_kernel, LoopKernel): kernel = t_unit_or_kernel return transform(kernel, *args, **kwargs) + else: + raise TypeError("expected LoopKernel or TranslationUnit") return wraps(transform)(_collective_transform) From b700bc6144a80644ef3e06467f1940cb06860c8c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Aug 2024 00:05:22 +0200 Subject: [PATCH 42/68] Type stringify_instruction_list --- loopy/kernel/tools.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 2f2e9a8a9..9a14aedd5 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -34,6 +34,7 @@ import islpy as isl from islpy import dim_type from pytools import memoize_on_first_arg, natsorted +from pytools.tag import Tag from loopy.diagnostic import LoopyError, warn_with_kernel from loopy.kernel import LoopKernel @@ -1483,7 +1484,7 @@ def conform_to_uniform_length(s): # {{{ stringify_instruction_list -def stringify_instruction_tag(tag): +def stringify_instruction_tag(tag: Tag) -> str: from loopy.kernel.instruction import LegacyStringInstructionTag if isinstance(tag, LegacyStringInstructionTag): return f"S({tag.value})" @@ -1491,7 +1492,7 @@ def stringify_instruction_tag(tag): return str(tag) -def 
stringify_instruction_list(kernel): +def stringify_instruction_list(kernel: LoopKernel) -> list[str]: # {{{ topological sort printed_insn_ids = set() @@ -1525,7 +1526,7 @@ def insert_insn_into_order(insn): leader = " " * uniform_arrow_length lines = [] - current_inames = [set()] + current_inames: list[set[str]] = [set()] if uniform_arrow_length: indent_level = [1] @@ -1536,13 +1537,13 @@ def insert_insn_into_order(insn): iname_order = kernel._get_iname_order_for_printing() - def add_pre_line(s): + def add_pre_line(s: str) -> None: lines.append(leader + " " * indent_level[0] + s) - def add_main_line(s): + def add_main_line(s: str) -> None: lines.append(arrows + " " * indent_level[0] + s) - def add_post_line(s): + def add_post_line(s: str) -> None: lines.append(extender + " " * indent_level[0] + s) def adapt_to_new_inames_list(new_inames): From e9f2b962eb9fcad76e47a334f737b6bfd4780c31 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 24 Aug 2024 11:45:07 -0500 Subject: [PATCH 43/68] Make a type alias for iname strings --- loopy/kernel/instruction.py | 4 ++-- loopy/typing.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 28c75c31f..d564d5e36 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -34,7 +34,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import Optional as LoopyOptional from loopy.types import LoopyType -from loopy.typing import ExpressionT +from loopy.typing import ExpressionT, InameStr # {{{ instruction tags @@ -248,7 +248,7 @@ class InstructionBase(ImmutableRecord, Taggable): conflicts_with_groups: FrozenSet[str] no_sync_with: FrozenSet[Tuple[str, str]] predicates: FrozenSet[ExpressionT] - within_inames: FrozenSet[str] + within_inames: FrozenSet[InameStr] within_inames_is_final: bool priority: int diff --git a/loopy/typing.py b/loopy/typing.py index cbf417d2f..e6166fb7a 100644 --- a/loopy/typing.py +++ b/loopy/typing.py 
@@ -51,6 +51,8 @@ ShapeType: TypeAlias = Tuple[ExpressionT, ...] StridesType: TypeAlias = ShapeType +InameStr: TypeAlias = str + class auto: # noqa """A generic placeholder object for something that should be automatically From 736ccbaaf00856dd8e1bcc8e35b8d9d30009f83c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 24 Aug 2024 11:45:44 -0500 Subject: [PATCH 44/68] Convert LoopKernel to in-line attribute docs --- loopy/kernel/__init__.py | 137 ++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 81 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index faa9fcab6..d85b1568a 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -74,7 +74,7 @@ from loopy.target import TargetBase from loopy.tools import update_persistent_hash from loopy.types import LoopyType, NumpyType -from loopy.typing import ExpressionT +from loopy.typing import ExpressionT, InameStr if TYPE_CHECKING: @@ -117,82 +117,25 @@ class LoopKernel(Taggable): even if it contains mutable data types. See :meth:`copy` for an easy way of producing a modified copy. - .. attribute:: domains - - a list of :class:`islpy.BasicSet` instances representing the - :ref:`domain-tree`. - - .. attribute:: instructions - - A list of :class:`InstructionBase` instances, e.g. - :class:`Assignment`. See :ref:`instructions`. - - .. attribute:: args - - A list of :class:`loopy.KernelArgument` - - .. attribute:: schedule - - *None* or a list of :class:`loopy.schedule.ScheduleItem` - - .. attribute:: name - .. attribute:: preambles - .. attribute:: preamble_generators - .. attribute:: assumptions - - A :class:`islpy.BasicSet` parameter domain. - - .. attribute:: temporary_variables - - A :class:`dict` of mapping variable names to - :class:`loopy.TemporaryVariable` - instances. - - .. attribute:: symbol_manglers - - .. attribute:: substitutions - - a mapping from substitution names to - :class:`SubstitutionRule` objects - - .. 
attribute:: iname_slab_increments - - a dictionary mapping inames to (lower_incr, - upper_incr) tuples that will be separated out in the execution to generate - 'bulk' slabs with fewer conditionals. - - .. attribute:: loop_priority - - A frozenset of priority constraints to the kernel. Each such constraint - is a tuple of inames. Inames occurring in such a tuple will be scheduled - earlier than any iname following in the tuple. This applies only to inames - with non-parallel implementation tags. - - .. attribute:: silenced_warnings - - .. attribute:: applied_iname_rewrites - - A list of past substitution dictionaries that - were applied to the kernel. These are stored so that they may be repeated - on expressions the user specifies later. - - .. attribute:: options - - An instance of :class:`loopy.Options` - - .. attribute:: state - - A value from :class:`KernelState`. - - .. attribute:: target - - A subclass of :class:`loopy.TargetBase`. - - .. attribute:: inames - - An instance of :class:`dict`, a mapping from the names of kernel's - inames to their corresponding instances of :class:`loopy.kernel.data.Iname`. - An entry is guaranteed to be present for each iname. + .. autoattribute:: domains + .. autoattribute:: instructions + .. autoattribute:: args + .. autoattribute:: schedule + .. autoattribute:: name + .. autoattribute:: preambles + .. autoattribute:: preamble_generators + .. autoattribute:: assumptions + .. autoattribute:: temporary_variables + .. autoattribute:: symbol_manglers + .. autoattribute:: substitutions + .. autoattribute:: iname_slab_increments + .. autoattribute:: loop_priority + .. autoattribute:: silenced_warnings + .. autoattribute:: applied_iname_rewrites + .. autoattribute:: options + .. autoattribute:: state + .. autoattribute:: target + .. autoattribute:: inames .. automethod:: __call__ .. automethod:: copy @@ -201,11 +144,25 @@ class LoopKernel(Taggable): .. 
automethod:: without_tags """ domains: Sequence[isl.BasicSet] + """Represents the :ref:`domain-tree`.""" + instructions: Sequence[InstructionBase] + """ + See :ref:`instructions`. + """ + args: Sequence[KernelArgument] assumptions: isl.BasicSet + """ + Must be a :class:`islpy.BasicSet` parameter domain. + """ + temporary_variables: Mapping[str, TemporaryVariable] - inames: Mapping[str, Iname] + inames: Mapping[InameStr, Iname] + """ + An entry is guaranteed to be present for each iname. + """ + substitutions: Mapping[str, SubstitutionRule] options: Options target: TargetBase @@ -218,11 +175,29 @@ class LoopKernel(Taggable): symbol_manglers: Sequence[ Callable[["LoopKernel", str], Optional[Tuple[LoopyType, str]]]] = () linearization: Optional[Sequence[ScheduleItem]] = None - iname_slab_increments: Mapping[str, Tuple[int, int]] = field( + iname_slab_increments: Mapping[InameStr, Tuple[int, int]] = field( default_factory=Map) - loop_priority: FrozenSet[Tuple[str]] = field( + """ + A mapping from inames to (lower_incr, + upper_incr) tuples that will be separated out in the execution to generate + 'bulk' slabs with fewer conditionals. + """ + + loop_priority: FrozenSet[Tuple[InameStr, ...]] = field( default_factory=frozenset) - applied_iname_rewrites: Tuple[Dict[str, ExpressionT], ...] = () + """ + A frozenset of priority constraints to the kernel. Each such constraint + is a tuple of inames. Inames occurring in such a tuple will be scheduled + earlier than any iname following in the tuple. This applies only to inames + with non-parallel implementation tags. + """ + + applied_iname_rewrites: Tuple[Dict[InameStr, ExpressionT], ...] = () + """ + A list of past substitution dictionaries that + were applied to the kernel. These are stored so that they may be repeated + on expressions the user specifies later. 
+ """ index_dtype: NumpyType = NumpyType(np.dtype(np.int32)) silenced_warnings: FrozenSet[str] = frozenset() From 9d9b08fd77eacbffc698bce58fca03abd0d627fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Aug 2024 13:07:09 -0500 Subject: [PATCH 45/68] Add helpers to figure out loop nestings from a kernel This is a typed version of the code from this commit: https://github.com/inducer/loopy/pull/690/commits/be5a31849c8ea16670a416c83b7b86a0cc5f71ae Co-authored-by: Andreas Kloeckner --- loopy/schedule/tools.py | 448 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 444 insertions(+), 4 deletions(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index b2a44c499..a02028231 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -21,15 +21,23 @@ """ import enum +from collections.abc import Collection, Mapping from dataclasses import dataclass -from functools import cached_property -from typing import Dict, FrozenSet, List, Sequence, Set, Tuple +from functools import cached_property, reduce +from typing import AbstractSet, Dict, FrozenSet, List, Sequence, Set, Tuple + +from immutables import Map +from typing_extensions import TypeAlias import islpy as isl -from pytools import memoize_method +from pytools import memoize_method, memoize_on_first_arg +from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.data import AddressSpace, ArrayArg, TemporaryVariable +from loopy.schedule import ScheduleItem +from loopy.schedule.tree import Tree +from loopy.typing import InameStr, not_none # {{{ block boundary finder @@ -621,4 +629,436 @@ def do_accesses_result_in_races(self, insn1, insn1_dir, insn2, insn2_dir, # }}} -# vim: foldmethod=marker + +InameStrSet: TypeAlias = FrozenSet[InameStr] +LoopNestTree: TypeAlias = Tree[InameStrSet] +LoopTree: TypeAlias = Tree[InameStr] + + +class V2SchedulerNotImplementedError(LoopyError): + pass + + +def _pull_out_loop_nest( + tree: LoopNestTree, + loop_nests: 
Collection[InameStrSet], + inames_to_pull_out: InameStrSet + ) -> tuple[LoopNestTree, InameStrSet, InameStrSet | None]: + """ + Returns a copy of *tree* that realizes *inames_to_pull_out* as loop + nesting. + + :arg tree: A :class:`loopy.tools.Tree`, where each node is + :class:`frozenset` of inames representing a loop nest. + + :arg loop_nests: A collection of nodes in *tree* that cover + *inames_to_pull_out*. + + :returns: a :class:`tuple` ``(new_tree, outer_loop_nest, inner_loop_nest)``, + where outer_loop_nest is the identifier for the new outer and inner + loop nests so that *inames_to_pull_out* is a valid nesting. + + .. note:: + + We could compute *loop_nests* within this routine's implementation, but + computing would be expensive and hence we ask the caller for this info. + + Example:: + *tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k', 'l'}) + + *inames_to_pull_out*: frozenset({'k', 'i', 'j'}) + *loop_nests*: {frozenset({'j', 'i'}), frozenset({'k', 'l'})} + + Returns: + + *new_tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k'}) + └── frozenset({'l'}) + + *outer_loop_nest*: frozenset({'k'}) + *inner_loop_nest*: frozenset({'l'}) + """ + assert all(isinstance(loop_nest, frozenset) for loop_nest in loop_nests) + + # annotation to avoid https://github.com/python/mypy/issues/17693 + emptyset: InameStrSet = frozenset() + + assert inames_to_pull_out <= reduce(frozenset.union, loop_nests, emptyset) + + # {{{ sanity check to ensure the loop nest *inames_to_pull_out* is possible + + loop_nests = sorted(loop_nests, key=lambda nest: tree.depth(nest)) + + for outer, inner in zip(loop_nests[:-1], loop_nests[1:]): + if outer != tree.parent(inner): + raise LoopyError(f"Cannot schedule loop nest {inames_to_pull_out} " + f" in the nesting tree:\n{tree}") + + assert tree.depth(loop_nests[0]) == 0 + + # }}} + + innermost_loop_nest = loop_nests[-1] + # separate variable to avoid https://github.com/python/mypy/issues/17694 + outerer_loops = 
reduce(frozenset.union, loop_nests[:-1], emptyset) + new_outer_loop_nest = inames_to_pull_out - outerer_loops + new_inner_loop_nest = innermost_loop_nest - inames_to_pull_out + + if new_outer_loop_nest == innermost_loop_nest: + # such a loop nesting already exists => do nothing + return tree, new_outer_loop_nest, None + + # add the outer loop to our loop nest tree + tree = tree.add_node(new_outer_loop_nest, + parent=not_none(tree.parent(innermost_loop_nest))) + + # rename the old loop to the inner loop + tree = tree.replace_node(innermost_loop_nest, + new_node=new_inner_loop_nest) + + # set the parent of inner loop to be the outer loop + tree = tree.move_node(new_inner_loop_nest, new_parent=new_outer_loop_nest) + + return tree, new_outer_loop_nest, new_inner_loop_nest + + +def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest): + """ + Returns a copy of *tree* that nests *inner_loop_nest* inside *outer_loop_nest*. + """ + # add the inner loop nest to our loop nest tree + return tree.add_node(inner_loop_nest, parent=outer_loop_nest) + + +def _order_loop_nests( + loop_nest_tree: LoopNestTree, + strict_priorities: FrozenSet[Tuple[InameStr, ...]], + relaxed_priorities: FrozenSet[Tuple[InameStr, ...]], + iname_to_tree_node_id: Mapping[InameStr, InameStrSet], + ) -> LoopTree: + """ + Returns a loop nest where all nodes in the tree are instances of + :class:`str` denoting inames. Unlike *loop_nest_tree* which corresponds to + multiple loop nesting, this routine returns a unique loop nest that is + obtained after constraining *loop_nest_tree* with the constraints enforced + by *priorities*. + + :arg strict_priorities: Expresses strict nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priority`. These priorities are imposed + strictly i.e. if these conditions cannot be met a + :class:`loopy.diagnostic.LoopyError` is raised. + + :arg relaxed_priorities: Expresses nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priority`.
These nesting constraints are + treated as options. + + :arg iname_to_tree_node_id: A mapping from iname to the loop nesting its a + part of. + """ + from warnings import warn + + from pytools.graph import compute_topological_order as toposort + + loop_nests = set(iname_to_tree_node_id.values()) + + # flow_requirements: A mapping from the loop nest level to the nesting + # constraints applicable to it. + # Each nesting constraint is represented as a DAG. In the DAG, if there + # exists an edge from from iname 'i' -> iname 'j' => 'j' should be nested + # inside 'i'. + flow_requirements: dict[InameStrSet, dict[InameStr, InameStrSet]] = { + loop_nest: {iname: frozenset() for iname in loop_nest} + for loop_nest in loop_nests} + + # The plan here is populate DAGs in *flow_requirements* and then perform a + # toposort for each loop nest. + + def _update_flow_requirements(priorities, cannot_satisfy_callback): + """ + Records *priorities* in *flow_requirements* and calls + *cannot_satisfy_callback* with an appropriate error message if the + priorities cannot be met. 
+ """ + for priority in priorities: + for outer_iname, inner_iname in zip(priority[:-1], priority[1:]): + if inner_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {inner_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + if outer_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {outer_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + flow_requirements[inner_iname_nest][outer_iname] |= {inner_iname} + else: + ancestors_of_inner_iname = (loop_nest_tree + .ancestors(inner_iname_nest)) + ancestors_of_outer_iname = (loop_nest_tree + .ancestors(outer_iname_nest)) + if outer_iname in ancestors_of_inner_iname: + # nesting constraint already satisfied => do nothing + pass + elif inner_iname in ancestors_of_outer_iname: + cannot_satisfy_callback("Cannot satisfy constraint that" + f" iname '{inner_iname}' must be" + f" nested within '{outer_iname}''.") + else: + # inner iname and outer iname are indirect family members + # => must be realized via dependencies in the linearization + # phase, not implemented in v2-scheduler yet. + raise V2SchedulerNotImplementedError("cannot" + " schedule kernels with priority dependencies" + " between sibling loop nests") + + def _raise_loopy_err(x): + raise LoopyError(x) + + # record strict priorities + _update_flow_requirements(strict_priorities, _raise_loopy_err) + # record relaxed priorities + _update_flow_requirements(relaxed_priorities, warn) + + # ordered_loop_nests: A mapping from the unordered loop nests to their + # ordered couterparts. For example. 
If we had only one loop nest + # `frozenset({"i", "j", "k"})`, and the prioirities said added the + # constraint that "i" must be nested within "k", then `ordered_loop_nests` + # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop + # nests would now have an order. + ordered_loop_nests = {unordered_nest: toposort(flow, + key=lambda x: x) + for unordered_nest, flow in flow_requirements.items()} + + # {{{ combine 'loop_nest_tree' along with 'ordered_loop_nest_tree' + + assert loop_nest_tree.root == frozenset() + + new_tree = Tree.from_root("") + + old_to_new_parent = {} + + old_to_new_parent[loop_nest_tree.root] = "" + + # traversing 'tree' in an BFS fashion to create 'new_tree' + queue = list(loop_nest_tree.children(loop_nest_tree.root)) + + while queue: + current_nest = queue.pop(0) + + ordered_nest = ordered_loop_nests[current_nest] + new_tree = new_tree.add_node(ordered_nest[0], + parent=old_to_new_parent[not_none(loop_nest_tree + .parent(current_nest))]) + for new_parent, new_child in zip(ordered_nest[:-1], ordered_nest[1:]): + new_tree = new_tree.add_node(node=new_child, parent=new_parent) + + old_to_new_parent[current_nest] = ordered_nest[-1] + + queue.extend(list(loop_nest_tree.children(current_nest))) + + # }}} + + return new_tree + + +@memoize_on_first_arg +def _get_parallel_inames(kernel: LoopKernel) -> AbstractSet[str]: + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + return (concurrent_inames - ilp_inames - vec_inames) + + +def _get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree: + """ + Returns :class:`loopy.Tree` representing the *kernel*'s loop-nests. 
+ + Each node of the returned tree has a :class:`frozenset` of inames. + All the inames in the identifier of a parent node of a loop nest in the + tree must be nested outside all the iname in identifier of the loop nest. + + .. note:: + + This routine only takes into account the nesting dependency + constraints of :attr:`loopy.InstructionBase.within_inames` of all the + *kernel*'s instructions and the iname tags. This routine does *NOT* + include the nesting constraints imposed by the dependencies between the + instructions and the dependencies imposed by the kernel's domain tree. + """ + from loopy.kernel.data import IlpBaseTag + + # figuring the possible loop nestings minus the concurrent_inames as they + # are never realized as actual loops + iname_chains = {insn.within_inames - _get_parallel_inames(kernel) + for insn in kernel.instructions} + + root: InameStrSet = frozenset() + tree = Tree.from_root(root) + + # mapping from iname to the innermost loop nest they are part of in *tree*. + iname_to_tree_node_id: Dict[InameStr, InameStrSet] = {} + + # if there were any loop with no inames, those have been already account + # for as the root. + iname_chains = iname_chains - {root} + + for iname_chain in iname_chains: + not_seen_inames = frozenset(iname for iname in iname_chain + if iname not in iname_to_tree_node_id) + seen_inames = iname_chain - not_seen_inames + + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + seen_inames) + if not_seen_inames: + # make '_not_seen_inames' nest inside the seen ones. + # example: if there is already a loop nesting "i,j,k" + # and the current iname chain is "i,j,l". Only way this is possible + # is if "l" is nested within "i,j"-loops. 
+ tree = _add_inner_loops(tree, outer_loop, not_seen_inames) + + # {{{ update iname to node id + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + for iname in not_seen_inames: + iname_to_tree_node_id[iname] = not_seen_inames + + # }}} + + # {{{ make ILP tagged inames innermost + + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + + for iname_chain in iname_chains: + for ilp_iname in (ilp_inames & iname_chains): + # pull out other loops so that ilp_iname is the innermost + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + (iname_chain + - {ilp_iname})) + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + # }}} + + return tree + + +def _get_iname_to_tree_node_id_from_partial_loop_nest_tree( + tree: LoopNestTree, + ) -> Mapping[str, frozenset[str]]: + """ + Returns the mapping from the iname to the *tree*'s node that it was a part + of. + + :arg tree: A partial loop nest tree. + """ + iname_to_tree_node_id = {} + for node in tree.nodes(): + assert isinstance(node, frozenset) + for iname in node: + iname_to_tree_node_id[iname] = node + + return Map(iname_to_tree_node_id) + + +def get_loop_tree(kernel: LoopKernel) -> LoopTree: + """ + Returns ```tree``` (an instance of :class:`Tree`) representing the loop + nesting for *kernel*. Each node of ``tree`` is an instance of :class:`str` + corresponding to the inames of *kernel* that are realized as concrete + ``for-loops``. A parent node in `tree` is always nested outside all its + children. + + .. note:: + + Multiple loop nestings might exist for *kernel*, but this routine returns + one valid loop nesting. 
+ """ + from islpy import dim_type + + tree = _get_partial_loop_nest_tree(kernel) + iname_to_tree_node_id = ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree)) + + strict_loop_priorities: FrozenSet[Tuple[InameStr, ...]] = frozenset() + + # {{{ impose constraints by the domain tree + + # FIXME: These three could be one statement if it weren't for + # - https://github.com/python/mypy/issues/17693 + # - https://github.com/python/mypy/issues/17694 + emptyset: InameStrSet = frozenset() + loop_inames = reduce(frozenset.union, + (insn.within_inames + for insn in kernel.instructions), + emptyset) + loop_inames = loop_inames - _get_parallel_inames(kernel) + + for dom in kernel.domains: + for outer_iname in set(dom.get_var_names(dim_type.param)): + if outer_iname not in loop_inames: + continue + + for inner_iname in dom.get_var_names(dim_type.set): + if inner_iname not in loop_inames: + continue + + # either outer_iname and inner_iname should belong to the same + # loop nest level or outer should be strictly outside inner + # iname + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + strict_loop_priorities |= {(outer_iname, inner_iname)} + else: + ancestors_of_inner_iname = tree.ancestors(inner_iname_nest) + if outer_iname_nest not in ancestors_of_inner_iname: + raise LoopyError(f"Loop '{outer_iname}' cannot be nested" + f" outside '{inner_iname}'.") + + # }}} + + return _order_loop_nests(tree, + strict_loop_priorities, + kernel.loop_priority, + iname_to_tree_node_id) + +# vim: fdm=marker From 6ef3ac04a5429f7959722b77370aa3e664fcad11 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 24 Aug 2024 13:08:03 -0500 Subject: [PATCH 46/68] Type a few more bits of loopy.schedule.tools --- loopy/schedule/tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 
a02028231..5938d1e2b 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -42,7 +42,7 @@ # {{{ block boundary finder -def get_block_boundaries(schedule): +def get_block_boundaries(schedule: Sequence[ScheduleItem]) -> Mapping[int, int]: """ Return a dictionary mapping indices of :class:`loopy.schedule.BlockBeginItem`s to @@ -309,7 +309,7 @@ def get_subkernel_arg_info( # {{{ get_return_from_kernel_mapping -def get_return_from_kernel_mapping(kernel): +def get_return_from_kernel_mapping(kernel: LoopKernel) -> Mapping[int, int | None]: """ Returns a mapping from schedule index of every schedule item (S) in *kernel* to the schedule index of :class:`loopy.schedule.ReturnFromKernel` @@ -326,8 +326,8 @@ def get_return_from_kernel_mapping(kernel): ) assert isinstance(kernel, LoopKernel) assert isinstance(kernel.linearization, list) - return_from_kernel_idxs = {} - current_return_from_kernel = None + return_from_kernel_idxs: dict[int, int | None] = {} + current_return_from_kernel: int | None = None for sched_idx, sched_item in list(enumerate(kernel.linearization))[::-1]: if isinstance(sched_item, CallKernel): return_from_kernel_idxs[sched_idx] = current_return_from_kernel From adca92328b7868373a72544dd93cef62ec1507a3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 24 Aug 2024 14:34:48 -0500 Subject: [PATCH 47/68] Document tree and loopy.schedule.tools (and a few typing fixes) --- doc/ref_internals.rst | 4 + loopy/schedule/__init__.py | 3 +- loopy/schedule/tools.py | 184 +++++++++++++++++++++++-------------- loopy/schedule/tree.py | 6 ++ 4 files changed, 128 insertions(+), 69 deletions(-) diff --git a/doc/ref_internals.rst b/doc/ref_internals.rst index 3dc0a2bd7..86e2edad2 100644 --- a/doc/ref_internals.rst +++ b/doc/ref_internals.rst @@ -53,3 +53,7 @@ Schedule -------- .. automodule:: loopy.schedule +.. automodule:: loopy.schedule.tools +.. 
automodule:: loopy.schedule.tree + + diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 31f9bc4a8..b29541da8 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -65,12 +65,11 @@ __doc__ = """ -.. currentmodule:: loopy.schedule - .. autoclass:: ScheduleItem .. autoclass:: BeginBlockItem .. autoclass:: EndBlockItem .. autoclass:: CallKernel +.. autoclass:: ReturnFromKernel .. autoclass:: Barrier .. autoclass:: RunInstruction diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 5938d1e2b..a7860e343 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -1,4 +1,35 @@ -__copyright__ = "Copyright (C) 2016 Matt Wala" +__copyright__ = """ +Copyright (C) 2016 Matt Wala +Copyright (C) 2020 University of Illinois Board of Trustees +Copyright (C) 2022 Kaushik Kulkarni +""" + +__doc__ = """ +.. autofunction:: get_block_boundaries +.. autofunction:: temporaries_read_in_subkernel +.. autofunction:: args_read_in_subkernel +.. autofunction:: args_written_in_subkernel +.. autofunction:: supporting_temporary_names + +.. autoclass:: KernelArgInfo +.. autoclass:: SubKernelArgInfo + +.. autofunction:: get_kernel_arg_info +.. autofunction:: get_subkernel_arg_info + +.. autofunction:: get_return_from_kernel_mapping + +.. autoclass:: AccessMapDescriptor +.. autoclass:: WriteRaceChecker + +.. autoclass:: InameStrSet +.. autoclass:: LoopNestTree +.. autoclass:: LoopTree + +.. autofunction:: separate_loop_nest +.. autofunction:: get_partial_loop_nest_tree +.. 
autofunction:: get_loop_tree +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -21,7 +52,7 @@ """ import enum -from collections.abc import Collection, Mapping +from collections.abc import Callable, Collection, Mapping from dataclasses import dataclass from functools import cached_property, reduce from typing import AbstractSet, Dict, FrozenSet, List, Sequence, Set, Tuple @@ -43,10 +74,10 @@ # {{{ block boundary finder def get_block_boundaries(schedule: Sequence[ScheduleItem]) -> Mapping[int, int]: - """ + r""" Return a dictionary mapping indices of - :class:`loopy.schedule.BlockBeginItem`s to - :class:`loopy.schedule.BlockEndItem`s and vice versa. + :class:`loopy.schedule.BeginBlockItem`\ s to + :class:`loopy.schedule.EndBlockItem`\ s and vice versa. """ from loopy.schedule import BeginBlockItem, EndBlockItem block_bounds = {} @@ -139,8 +170,12 @@ def supporting_temporary_names( @dataclass(frozen=True) class KernelArgInfo: - passed_arg_names: Sequence[str] + """ + .. autoattribute:: passed_arg_names + .. autoattribute:: written_names + """ + passed_arg_names: Sequence[str] written_names: FrozenSet[str] @property @@ -150,6 +185,12 @@ def passed_names(self) -> Sequence[str]: @dataclass(frozen=True) class SubKernelArgInfo(KernelArgInfo): + """Inherits from :class:`KernelArgInfo`. + + .. autoattribute:: passed_inames + .. autoattribute:: passed_temporaries + """ + passed_inames: Sequence[str] passed_temporaries: Sequence[str] @@ -639,24 +680,31 @@ class V2SchedulerNotImplementedError(LoopyError): pass -def _pull_out_loop_nest( +def separate_loop_nest( tree: LoopNestTree, loop_nests: Collection[InameStrSet], - inames_to_pull_out: InameStrSet + inames_to_separate: InameStrSet ) -> tuple[LoopNestTree, InameStrSet, InameStrSet | None]: """ - Returns a copy of *tree* that realizes *inames_to_pull_out* as loop - nesting. 
- - :arg tree: A :class:`loopy.tools.Tree`, where each node is - :class:`frozenset` of inames representing a loop nest. + Returns a copy of *tree* that has *inames_to_separate* occur in + nodes that are not shared with other inames. + Returns a version of the loop nest tree *tree* so that every node in the tree is + either a subset of *outermost_inames* or has an empty intersection with + *outermost_inames*. + + This routine modifies at most one node of the tree. + All its ancestors must satisfy `ancestor <= outermost_inames`. + For the first node not satisfying this relationship, + if `node & outermost_inames` is empty, no modification is made. + Otherwise, if ``node & outermost_inames < node``, that node is split + so as to separate *outermost_inames* in their own node. :arg loop_nests: A collection of nodes in *tree* that cover - *inames_to_pull_out*. + *inames_to_separate*. :returns: a :class:`tuple` ``(new_tree, outer_loop_nest, inner_loop_nest)``, where outer_loop_nest is the identifier for the new outer and inner - loop nests so that *inames_to_pull_out* is a valid nesting. + loop nests so that *inames_to_separate* is a valid nesting. .. 
note:: @@ -668,7 +716,7 @@ def _pull_out_loop_nest( └── frozenset({'j', 'i'}) └── frozenset({'k', 'l'}) - *inames_to_pull_out*: frozenset({'k', 'i', 'j'}) + *inames_to_separate*: frozenset({'k', 'i', 'j'}) *loop_nests*: {frozenset({'j', 'i'}), frozenset({'k', 'l'})} Returns: @@ -686,43 +734,43 @@ def _pull_out_loop_nest( # annotation to avoid https://github.com/python/mypy/issues/17693 emptyset: InameStrSet = frozenset() - assert inames_to_pull_out <= reduce(frozenset.union, loop_nests, emptyset) + assert inames_to_separate <= reduce(frozenset.union, loop_nests, emptyset) - # {{{ sanity check to ensure the loop nest *inames_to_pull_out* is possible + # {{{ sanity check to ensure the loop nest *inames_to_separate* is possible loop_nests = sorted(loop_nests, key=lambda nest: tree.depth(nest)) for outer, inner in zip(loop_nests[:-1], loop_nests[1:]): if outer != tree.parent(inner): - raise LoopyError(f"Cannot schedule loop nest {inames_to_pull_out} " + raise LoopyError(f"Cannot schedule loop nest {inames_to_separate} " f" in the nesting tree:\n{tree}") assert tree.depth(loop_nests[0]) == 0 # }}} - innermost_loop_nest = loop_nests[-1] + innermost_node = loop_nests[-1] # separate variable to avoid https://github.com/python/mypy/issues/17694 outerer_loops = reduce(frozenset.union, loop_nests[:-1], emptyset) - new_outer_loop_nest = inames_to_pull_out - outerer_loops - new_inner_loop_nest = innermost_loop_nest - inames_to_pull_out + new_outer_node = inames_to_separate - outerer_loops + new_inner_node = innermost_node - inames_to_separate - if new_outer_loop_nest == innermost_loop_nest: + if new_outer_node == innermost_node: # such a loop nesting already exists => do nothing - return tree, new_outer_loop_nest, None + return tree, new_outer_node, None # add the outer loop to our loop nest tree - tree = tree.add_node(new_outer_loop_nest, - parent=not_none(tree.parent(innermost_loop_nest))) + tree = tree.add_node(new_outer_node, + parent=not_none(tree.parent(innermost_node))) 
# rename the old loop to the inner loop - tree = tree.replace_node(innermost_loop_nest, - new_node=new_inner_loop_nest) + tree = tree.replace_node(innermost_node, + new_node=new_inner_node) # set the parent of inner loop to be the outer loop - tree = tree.move_node(new_inner_loop_nest, new_parent=new_outer_loop_nest) + tree = tree.move_node(new_inner_node, new_parent=new_outer_node) - return tree, new_outer_loop_nest, new_inner_loop_nest + return tree, new_outer_node, new_inner_node def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest): @@ -746,14 +794,14 @@ def _order_loop_nests( obtained after constraining *loop_nest_tree* with the constraints enforced by *priorities*. - :arg strict_priorities: Expresses strict nesting constraints similar to - :attr:`loopy.LoopKernel.loop_priorities`. These priorities are imposed - strictly i.e. if these conditions cannot be met a + :arg strict_priorities: Expresses strict nesting constraints using the same + data structure as :attr:`loopy.LoopKernel.loop_priority`. + These priorities are imposed strictly i.e. if these conditions cannot be met a :class:`loopy.diagnostic.LoopyError` is raised. - :arg relaxed_priorities: Expresses strict nesting constraints similar to - :attr:`loopy.LoopKernel.loop_priorities`. These nesting constraints are - treated as options. + :arg relaxed_priorities: Expresses strict nesting constraints using the same + data structure as :attr:`loopy.LoopKernel.loop_priority`. + These nesting constraints are treated as optional. :arg iname_to_tree_node_id: A mapping from iname to the loop nesting its a part of. @@ -764,21 +812,24 @@ def _order_loop_nests( loop_nests = set(iname_to_tree_node_id.values()) - # flow_requirements: A mapping from the loop nest level to the nesting + # nesting_constraints: A mapping from the loop nest level to the nesting # constraints applicable to it. # Each nesting constraint is represented as a DAG. 
In the DAG, if there # exists an edge from from iname 'i' -> iname 'j' => 'j' should be nested # inside 'i'. - flow_requirements: dict[InameStrSet, dict[InameStr, InameStrSet]] = { + iname_to_nesting_constraints: dict[InameStrSet, dict[InameStr, InameStrSet]] = { loop_nest: {iname: frozenset() for iname in loop_nest} for loop_nest in loop_nests} - # The plan here is populate DAGs in *flow_requirements* and then perform a + # The plan here is populate DAGs in *nesting_constraints* and then perform a # toposort for each loop nest. - def _update_flow_requirements(priorities, cannot_satisfy_callback): + def _update_nesting_constraints( + priorities: FrozenSet[Tuple[InameStr, ...]], + cannot_satisfy_callback: Callable[[str], None] + ) -> None: """ - Records *priorities* in *flow_requirements* and calls + Records *priorities* in *nesting_constraints* and calls *cannot_satisfy_callback* with an appropriate error message if the priorities cannot be met. """ @@ -804,7 +855,8 @@ def _update_flow_requirements(priorities, cannot_satisfy_callback): outer_iname_nest = iname_to_tree_node_id[outer_iname] if inner_iname_nest == outer_iname_nest: - flow_requirements[inner_iname_nest][outer_iname] |= {inner_iname} + iname_to_nesting_constraints[ + inner_iname_nest][outer_iname] |= {inner_iname} else: ancestors_of_inner_iname = (loop_nest_tree .ancestors(inner_iname_nest)) @@ -829,9 +881,9 @@ def _raise_loopy_err(x): raise LoopyError(x) # record strict priorities - _update_flow_requirements(strict_priorities, _raise_loopy_err) + _update_nesting_constraints(strict_priorities, _raise_loopy_err) # record relaxed priorities - _update_flow_requirements(relaxed_priorities, warn) + _update_nesting_constraints(relaxed_priorities, warn) # ordered_loop_nests: A mapping from the unordered loop nests to their # ordered couterparts. For example. 
If we had only one loop nest @@ -839,9 +891,9 @@ def _raise_loopy_err(x): # constraint that "i" must be nested within "k", then `ordered_loop_nests` # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop # nests would now have an order. - ordered_loop_nests = {unordered_nest: toposort(flow, - key=lambda x: x) - for unordered_nest, flow in flow_requirements.items()} + ordered_loop_nests = { + unordered_nest: toposort(flow, key=lambda x: x) + for unordered_nest, flow in iname_to_nesting_constraints.items()} # {{{ combine 'loop_nest_tree' along with 'ordered_loop_nest_tree' @@ -868,7 +920,7 @@ def _raise_loopy_err(x): old_to_new_parent[current_nest] = ordered_nest[-1] - queue.extend(list(loop_nest_tree.children(current_nest))) + queue.extend(loop_nest_tree.children(current_nest)) # }}} @@ -888,9 +940,9 @@ def _get_parallel_inames(kernel: LoopKernel) -> AbstractSet[str]: return (concurrent_inames - ilp_inames - vec_inames) -def _get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree: +def get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree: """ - Returns :class:`loopy.Tree` representing the *kernel*'s loop-nests. + Returns a tree representing the *kernel*'s loop nests. Each node of the returned tree has a :class:`frozenset` of inames. 
All the inames in the identifier of a parent node of a loop nest in the @@ -908,8 +960,9 @@ def _get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree: # figuring the possible loop nestings minus the concurrent_inames as they # are never realized as actual loops - iname_chains = {insn.within_inames - _get_parallel_inames(kernel) - for insn in kernel.instructions} + insn_iname_sets = { + insn.within_inames - _get_parallel_inames(kernel) + for insn in kernel.instructions} root: InameStrSet = frozenset() tree = Tree.from_root(root) @@ -919,16 +972,16 @@ def _get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree: # if there were any loop with no inames, those have been already account # for as the root. - iname_chains = iname_chains - {root} + insn_iname_sets = insn_iname_sets - {root} - for iname_chain in iname_chains: - not_seen_inames = frozenset(iname for iname in iname_chain + for iname_set in insn_iname_sets: + not_seen_inames = frozenset(iname for iname in iname_set if iname not in iname_to_tree_node_id) - seen_inames = iname_chain - not_seen_inames + seen_inames = iname_set - not_seen_inames all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} - tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + tree, outer_loop, inner_loop = separate_loop_nest(tree, (all_nests | {frozenset()}), seen_inames) @@ -958,14 +1011,14 @@ def _get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree: ilp_inames = {iname for iname in kernel.all_inames() if kernel.iname_tags_of_type(iname, IlpBaseTag)} - for iname_chain in iname_chains: - for ilp_iname in (ilp_inames & iname_chains): + for iname_set in insn_iname_sets: + for ilp_iname in (ilp_inames & insn_iname_sets): # pull out other loops so that ilp_iname is the innermost all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} - tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + tree, outer_loop, inner_loop = separate_loop_nest(tree, (all_nests | {frozenset()}), - 
(iname_chain + (iname_set - {ilp_iname})) for iname in outer_loop: @@ -1000,11 +1053,8 @@ def _get_iname_to_tree_node_id_from_partial_loop_nest_tree( def get_loop_tree(kernel: LoopKernel) -> LoopTree: """ - Returns ```tree``` (an instance of :class:`Tree`) representing the loop - nesting for *kernel*. Each node of ``tree`` is an instance of :class:`str` - corresponding to the inames of *kernel* that are realized as concrete - ``for-loops``. A parent node in `tree` is always nested outside all its - children. + Returns a tree representing the loop nesting for *kernel*. A parent node in + the tree is always nested outside all its children. .. note:: @@ -1013,7 +1063,7 @@ def get_loop_tree(kernel: LoopKernel) -> LoopTree: """ from islpy import dim_type - tree = _get_partial_loop_nest_tree(kernel) + tree = get_partial_loop_nest_tree(kernel) iname_to_tree_node_id = ( _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree)) diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py index f91b36e90..253ff5f84 100644 --- a/loopy/schedule/tree.py +++ b/loopy/schedule/tree.py @@ -8,6 +8,12 @@ Copyright (C) 2022-24 University of Illinois Board of Trustees """ + +__doc__ = """ +.. autoclass:: NodeT +.. 
autoclass:: Tree +""" + __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 85ab47a4a4e42371b115ead930c96c685dbdf0b5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Aug 2021 12:22:01 -0500 Subject: [PATCH 48/68] adds loopy scheduler v2 --- loopy/schedule/__init__.py | 243 ++++++++++++++++++++++++++++++++----- 1 file changed, 215 insertions(+), 28 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index b29541da8..ef6022a56 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -53,11 +53,16 @@ from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel from loopy.kernel.instruction import InstructionBase from loopy.tools import LoopyKeyBuilder, caches +from loopy.typing import InameStr from loopy.version import DATA_MODEL_VERSION if TYPE_CHECKING: from loopy.kernel import LoopKernel + from loopy.schedule.tools import ( + InameStrSet, + LoopTree, + ) from loopy.translation_unit import CallablesTable, TranslationUnit @@ -879,7 +884,170 @@ def is_similar_to_template(insn): # }}} -# {{{ scheduling algorithm +# {{{ scheduling algorithm v2 + +def _get_outermost_diverging_inames( + tree: LoopTree, + within1: InameStrSet, + within2: InameStrSet + ) -> Tuple[InameStr, InameStr]: + """ + For loop nestings *within1* and *within2*, returns the first inames at which + the loops nests diverge in the loop nesting tree *tree*. 
+ """ + common_ancestors = (within1 & within2) | {""} + + innermost_parent = max(common_ancestors, + key=lambda k: tree.depth(k)) + iname1, = frozenset(tree.children(innermost_parent)) & within1 + iname2, = frozenset(tree.children(innermost_parent)) & within2 + + return iname1, iname2 + + +def _generate_loop_schedules_v2(kernel: LoopKernel) -> Sequence[ScheduleItem]: + from functools import reduce + + from pytools.graph import compute_topological_order + + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.schedule.tools import get_loop_tree + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + parallel_inames = (concurrent_inames - ilp_inames - vec_inames) + + # {{{ can v2 scheduler handle the kernel? + + from loopy.schedule.tools import V2SchedulerNotImplementedError + if any(insn.conflicts_with_groups for insn in kernel.instructions): + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " kernels with instruction having conflicts with groups.") + + if any(insn.priority != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " kernels with instruction priorities set.") + + if kernel.schedule is not None: + # cannot handle preschedule yet + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " prescheduled kernels.") + + if ilp_inames or vec_inames: + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " loops tagged with 'ilp'/'vec' as they are not guaranteed to" + " be single entry loops.") + + # }}} + + loop_tree = get_loop_tree(kernel) + + # loop_inames: inames that are realized as loops. 
Concurrent inames aren't + # realized as a loop in the generated code for a loopy.TargetBase. + + # FIXME: These three could be one statement if it weren't for + # - https://github.com/python/mypy/issues/17693 + # - https://github.com/python/mypy/issues/17694 + emptyset: frozenset[InameStr] = frozenset() + all_inames = reduce( + frozenset.union, + (insn.within_inames for insn in kernel.instructions), + emptyset) + loop_inames = all_inames - parallel_inames + + # The idea here is to build a DAG, where nodes are schedule items and if + # there exists an edge from schedule item A to schedule item B in the DAG => + # B *must* come after A in the linearized result. + + dag: dict[ScheduleItem, frozenset[ScheduleItem]] = {} + + # LeaveLoop(i) *must* follow EnterLoop(i) + dag.update({EnterLoop(iname=iname): frozenset({LeaveLoop(iname=iname)}) + for iname in loop_inames}) + dag.update({LeaveLoop(iname=iname): frozenset() + for iname in loop_inames}) + dag.update({RunInstruction(insn_id=not_none(insn.id)): frozenset() + for insn in kernel.instructions}) + + # {{{ add constraints imposed by the loop nesting + + for outer_loop in loop_tree.nodes(): + if outer_loop == "": + continue + + for child in loop_tree.children(outer_loop): + inner_loop = child + dag[EnterLoop(iname=outer_loop)] |= {EnterLoop(iname=inner_loop)} + dag[LeaveLoop(iname=inner_loop)] |= {LeaveLoop(iname=outer_loop)} + + # }}} + + # {{{ add deps. between schedule items coming from insn. depepdencies + + for insn in kernel.instructions: + assert insn.id is not None + + insn_loop_inames = insn.within_inames & loop_inames + for dep_id in insn.depends_on: + dep = kernel.id_to_insn[dep_id] + dep_loop_inames = dep.within_inames & loop_inames + # Enforce instruction dep: + dag[RunInstruction(insn_id=dep_id)] |= {RunInstruction(insn_id=insn.id)} + + # {{{ register deps on loop entry/leave because of insn. 
deps + + if dep_loop_inames < insn_loop_inames: + for iname in insn_loop_inames - dep_loop_inames: + dag[RunInstruction(insn_id=dep.id)] |= {EnterLoop(iname=iname)} + elif insn_loop_inames < dep_loop_inames: + for iname in dep_loop_inames - insn_loop_inames: + dag[LeaveLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + elif dep_loop_inames != insn_loop_inames: + insn_iname, dep_iname = _get_outermost_diverging_inames( + loop_tree, insn_loop_inames, dep_loop_inames) + dag[LeaveLoop(iname=dep_iname)] |= {EnterLoop(iname=insn_iname)} + else: + pass + + # }}} + + for iname in insn_loop_inames: + # For an insn within a loop nest 'i' + # for i + # insn + # end i + # 'insn' *must* come b/w 'for i' and 'end i' + dag[EnterLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + dag[RunInstruction(insn_id=insn.id)] |= {LeaveLoop(iname=iname)} + + # }}} + + def iname_key(iname: str) -> str: + all_ancestors = sorted(loop_tree.ancestors(iname), + key=lambda x: loop_tree.depth(x)) + return ",".join(all_ancestors+[iname]) + + def key(x: ScheduleItem) -> tuple[str, ...]: + if isinstance(x, RunInstruction): + iname = max((kernel.id_to_insn[x.insn_id].within_inames & loop_inames), + key=lambda k: loop_tree.depth(k), + default="") + return (iname_key(iname), x.insn_id) + elif isinstance(x, (EnterLoop, LeaveLoop)): + return (iname_key(x.iname),) + else: + raise NotImplementedError + + return compute_topological_order(dag, key=key) + +# }}} + + +# {{{ legacy scheduling algorithm def _generate_loop_schedules_internal( sched_state, debug=None): @@ -2031,6 +2199,40 @@ def generate_loop_schedules( callables_table, debug_args=debug_args) +def _postprocess_schedule(kernel, callables_table, gen_sched): + from loopy.kernel import KernelState + + gen_sched = convert_barrier_instructions_to_barriers( + kernel, gen_sched) + + gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) + + if (gsize or lsize): + if not kernel.options.disable_global_barriers: + 
logger.debug("%s: barrier insertion: global" % kernel.name) + gen_sched = insert_barriers(kernel, callables_table, gen_sched, + synchronization_kind="global", + verify_only=(not + kernel.options.insert_gbarriers)) + + logger.debug("%s: barrier insertion: local" % kernel.name) + gen_sched = insert_barriers(kernel, callables_table, gen_sched, + synchronization_kind="local", verify_only=False) + logger.debug("%s: barrier insertion: done" % kernel.name) + + new_kernel = kernel.copy( + linearization=gen_sched, + state=KernelState.LINEARIZED) + + from loopy.schedule.device_mapping import map_schedule_onto_host_or_device + if kernel.state != KernelState.LINEARIZED: + # Device mapper only gets run once. + new_kernel = map_schedule_onto_host_or_device(new_kernel) + + return new_kernel + + def _generate_loop_schedules_inner( kernel: LoopKernel, callables_table: CallablesTable, @@ -2043,6 +2245,17 @@ def _generate_loop_schedules_inner( raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") + from loopy.schedule.tools import V2SchedulerNotImplementedError + try: + gen_sched = _generate_loop_schedules_v2(kernel) + yield _postprocess_schedule(kernel, callables_table, gen_sched) + return + + except V2SchedulerNotImplementedError as e: + from warnings import warn + warn(f"Falling back to a slow scheduler implementation due to: {e}", + stacklevel=1) + schedule_count = 0 debug = ScheduleDebugger(**debug_args) @@ -2157,33 +2370,7 @@ def print_longest_dead_end(): sched_state, debug=debug, **schedule_gen_kwargs): debug.stop() - gen_sched = convert_barrier_instructions_to_barriers( - kernel, gen_sched) - - gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, - return_dict=True) - - if (gsize or lsize): - if not kernel.options.disable_global_barriers: - logger.debug("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, callables_table, gen_sched, - synchronization_kind="global", - verify_only=(not - 
kernel.options.insert_gbarriers)) - - logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, callables_table, gen_sched, - synchronization_kind="local", verify_only=False) - logger.debug("%s: barrier insertion: done" % kernel.name) - - new_kernel = kernel.copy( - linearization=gen_sched, - state=KernelState.LINEARIZED) - - from loopy.schedule.device_mapping import map_schedule_onto_host_or_device - if kernel.state != KernelState.LINEARIZED: - # Device mapper only gets run once. - new_kernel = map_schedule_onto_host_or_device(new_kernel) + new_kernel = _postprocess_schedule(kernel, callables_table, gen_sched) yield new_kernel From a933682eb7c0dc967736ce69fc3e8af476b3d8cd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Aug 2021 12:22:57 -0500 Subject: [PATCH 49/68] changes in docs to account for equivalent generated codes from the same LoopKernel --- doc/tutorial.rst | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index a697bed30..301411123 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -613,7 +613,7 @@ commonly called 'loop tiling': ... assumptions="n mod 16 = 0 and n >= 1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.split_iname(knl, "j", 16) - >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner") + >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner") >>> knl = lp.set_options(knl, write_code=True) >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) @@ -1032,8 +1032,8 @@ transformation exists in :func:`loopy.add_prefetch`: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... 
- acc_k = 0.0f; a_fetch = a[16 * gid(0) + lid(0)]; + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch; out[16 * gid(0) + lid(0)] = acc_k; @@ -1056,12 +1056,11 @@ earlier: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - acc_k = 0.0f; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch[lid(0)] = a[16 * gid(0) + lid(0)]; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch[lid(0)]; out[16 * gid(0) + lid(0)] = acc_k; @@ -1903,18 +1902,16 @@ Now to make things more interesting, we'll create a kernel with barriers: { __local int c[50 * 10 * 99]; - { - int const k_outer = 0; - + for (int i = 0; i <= 49; ++i) for (int j = 0; j <= 9; ++j) - for (int i = 0; i <= 49; ++i) - { - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; - c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; - e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; - } - } + { + int const k_outer = 0; + + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; + c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; + e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; + } } In this kernel, when a work-item performs the second instruction it uses data From a5b1452146d17aa7b59bcaf8852ea2eda0cbbbd3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 24 Aug 2024 22:39:12 -0500 Subject: [PATCH 50/68] Fix missing dependencies in test_duplicate_iname_not_read_only_nested --- test/test_transform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_transform.py b/test/test_transform.py index 
4ad1971fb..98398fefd 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1720,6 +1720,7 @@ def test_duplicate_iname_not_read_only_nested(ctx_factory): """, [lp.GlobalArg("A,x,y", shape=lp.auto, dtype=np.float32), ...], + seq_dependencies=True, ) ref_t_unit = t_unit From bb46dcefa5bfdd2552c14d44a668e048d67689e1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 16 Feb 2024 12:45:23 -0600 Subject: [PATCH 51/68] Add test_long_kernel On Python 3.12, this provokes a stack overflow in the scheduler. It is not quite clear why that's the case; pure-Python recursion even with generators seems to respond well to setrecursionlimit(): ```py def f(n): if n: yield from f(n-1) else: yield 5 import sys sys.setrecursionlimit(3500) print(list(f(3400))) ``` That said, there have been [behavior](https://github.com/python/cpython/pull/96510) [changes](https://github.com/python/cpython/issues/112215) in Py3.12 in this regard, but it is not clear what exactly about Loopy's behavior makes it fall into the 'bad' case. 
--- test/test_loopy.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/test_loopy.py b/test/test_loopy.py index 34310171f..bfa607328 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3689,6 +3689,21 @@ def test_no_unnecessary_lbarrier(ctx_factory): assert not barrier_between(knl, "write_s_a", "write_ao") +def test_long_kernel(): + n = 500 + insns = [ + f"a{i}[j{i}] = j{i}" + for i in range(n) + ] + domains = [ + f"{{ [j{i}]: 0<=j{i}<10 }}" + for i in range(n) + ] + t_unit = lp.make_kernel(domains, insns) + t_unit = lp.preprocess_kernel(t_unit) + lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From d5ee69004a12ec341546a5eb5b0136f3ead4d260 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 1 Sep 2024 14:04:18 -0500 Subject: [PATCH 52/68] Work around setuptools 64's breakage of static analysis tools --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 8d4d51cd8..4134ba24d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,10 @@ loopy = [ "py.typed", ] +[tool.setuptools.package-dir] +# https://github.com/Infleqtion/client-superstaq/pull/715 +"" = "." 
+ [tool.ruff] preview = true From 0f78426de9e1f74e3e678d1f6b8d91600339873c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 4 Sep 2024 15:11:03 -0500 Subject: [PATCH 53/68] Update tutorial for islpy 2024.2 --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 301411123..166357d7c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -822,7 +822,7 @@ enabling some cost savings: { int const i_outer = -1 + n + -1 * ((3 * n) / 4); - if (-1 + n >= 0) + if (i_outer >= 0) { a[4 * i_outer] = (float) (0.0f); if (-2 + -4 * i_outer + n >= 0) From 070df9f8393e75a241e5e1e5a77fe2824ec419d2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Mon, 9 Sep 2024 13:25:51 -0700 Subject: [PATCH 54/68] Require that happens_after is not mutable (#866) * Require that happens_after is not mutable * Tweak type tests for happens_after --------- Co-authored-by: Andreas Kloeckner --- loopy/kernel/instruction.py | 34 ++++++++++++++++++++++------------ loopy/tools.py | 9 +++++++++ pyproject.toml | 1 + 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index d564d5e36..a6420b8fc 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -20,7 +20,10 @@ THE SOFTWARE. 
""" -from collections.abc import Mapping as MappingABC, Set as abc_Set +from collections.abc import ( + Mapping as MappingABC, + Set as abc_Set, +) from dataclasses import dataclass from functools import cached_property from sys import intern @@ -283,6 +286,7 @@ def __init__(self, *, depends_on: Union[FrozenSet[str], str, None] = None, ) -> None: + from immutabledict import immutabledict if predicates is None: predicates = frozenset() @@ -314,28 +318,29 @@ def __init__(self, raise LoopyError("Setting depends_on_is_final to True requires " "actually specifying happens_after/depends_on") - if happens_after is None: - happens_after = {} + if isinstance(happens_after, immutabledict): + pass + elif happens_after is None: + happens_after = immutabledict() elif isinstance(happens_after, str): warn("Passing a string for happens_after/depends_on is deprecated and " "will stop working in 2025. Instead, pass a full-fledged " "happens_after data structure.", DeprecationWarning, stacklevel=2) - happens_after = { + happens_after = immutabledict({ after_id.strip(): HappensAfter( variable_name=None, instances_rel=None) for after_id in happens_after.split(",") - if after_id.strip()} + if after_id.strip()}) elif isinstance(happens_after, frozenset): - happens_after = { + happens_after = immutabledict({ after_id: HappensAfter( variable_name=None, instances_rel=None) - for after_id in happens_after} - elif isinstance(happens_after, MappingABC): - if isinstance(happens_after, dict): - happens_after = happens_after + for after_id in happens_after}) + elif isinstance(happens_after, dict): + happens_after = immutabledict(happens_after) else: raise TypeError("'happens_after' has unexpected type: " f"{type(happens_after)}") @@ -390,6 +395,9 @@ def __init__(self, assert isinstance(groups, abc_Set) assert isinstance(conflicts_with_groups, abc_Set) + from loopy.tools import is_hashable + assert is_hashable(happens_after) + ImmutableRecord.__init__(self, id=id, happens_after=happens_after, @@ 
-573,13 +581,15 @@ def update_persistent_hash(self, key_hash, key_builder): def __setstate__(self, val): super().__setstate__(val) + from immutabledict import immutabledict + from loopy.tools import intern_frozenset_of_ids if self.id is not None: # pylint:disable=access-member-before-definition self.id = intern(self.id) - self.happens_after = { + self.happens_after = immutabledict({ intern(after_id): ha - for after_id, ha in self.happens_after.items()} + for after_id, ha in self.happens_after.items()}) self.groups = intern_frozenset_of_ids(self.groups) self.conflicts_with_groups = ( intern_frozenset_of_ids(self.conflicts_with_groups)) diff --git a/loopy/tools.py b/loopy/tools.py index 50a523ee8..2e3b5db4f 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -972,4 +972,13 @@ def _get_persistent_hashable_arg(arg): # }}} + +def is_hashable(o: object) -> bool: + try: + hash(o) + except TypeError: + return False + return True + + # vim: fdm=marker diff --git a/pyproject.toml b/pyproject.toml index 4134ba24d..70672a1ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ dependencies = [ "Mako", "pyrsistent", "immutables", + "immutabledict", # for Self, TypeAlias "typing-extensions>=4; python_version<'3.12'", From 66389cdb1ff3392cb12ee7d3ee7216463ac6d41b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 21 Sep 2024 23:10:48 -0500 Subject: [PATCH 55/68] Use warn_with_kernel for V1-scheduler fallback --- loopy/schedule/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index ef6022a56..1364be850 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -2252,8 +2252,10 @@ def _generate_loop_schedules_inner( return except V2SchedulerNotImplementedError as e: - from warnings import warn - warn(f"Falling back to a slow scheduler implementation due to: {e}", + warn_with_kernel( + kernel, + "v1_scheduler_fallback", + f"Falling back to a slow 
scheduler implementation due to: {e}", stacklevel=1) schedule_count = 0 From 7b5d73d6ce8235d8cdb79225f7610d539994b0d1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 21 Sep 2024 23:54:55 -0500 Subject: [PATCH 56/68] Avoid setting loop priority for disjoint loops --- doc/tutorial.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 166357d7c..b0d9cebd4 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -438,7 +438,8 @@ with identical bounds, for the use of the transpose: ... out[ii,jj] = 2*out[ii,jj] {dep=transpose} ... """, ... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...]) - >>> knl = lp.prioritize_loops(knl, "i,j,ii,jj") + >>> knl = lp.prioritize_loops(knl, "i,j") + >>> knl = lp.prioritize_loops(knl, "ii,jj") :func:`loopy.duplicate_inames` can be used to achieve the same goal. Now the intended code is generated and our test passes. @@ -957,7 +958,7 @@ Consider the following example: ... "{ [i_outer,i_inner, k]: " ... "0<= 16*i_outer + i_inner a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10} + ... <> a_temp[i_inner] = a[16*i_outer + i_inner] ... out[16*i_outer + i_inner] = sum(k, a_temp[k]) ... """) >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) @@ -1208,6 +1209,12 @@ Let us start with an example. Consider the kernel from above with a ... assumptions="n mod 16 = 0") >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") +.. testsetup:: + + >>> prog = prog.with_kernel( + ... prog.default_entrypoint.copy( + ... 
silenced_warnings=["v1_scheduler_fallback"])) + Here is what happens when we try to generate code for the kernel: >>> cgr = lp.generate_code_v2(prog) From 6cc60f0bff20e1204320841d8b91eaf27c5acbb0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 8 Oct 2024 17:13:44 -0500 Subject: [PATCH 57/68] Unconditionally depend on typing-extensions --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70672a1ab..4b4334adb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,8 +47,7 @@ dependencies = [ "immutables", "immutabledict", - # for Self, TypeAlias - "typing-extensions>=4; python_version<'3.12'", + "typing-extensions>=4", ] [project.optional-dependencies] pyopencl = [ From db136129dc53b48a63cc5e5eb597a77540093132 Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 11 Oct 2024 10:42:30 -0500 Subject: [PATCH 58/68] Documentation defined the unpack argument twice. Now it only defines it once and also defines the pack argument. --- loopy/transform/pack_and_unpack_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 1c1b48c59..2a82952c2 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -48,7 +48,7 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, :arg call_name: An instance of :class:`str` denoting the function call in the *kernel*. - :arg args_to_unpack: A list of the arguments as instances of :class:`str` which + :arg args_to_pack: A list of the arguments as instances of :class:`str` which must be packed. If set *None*, it is interpreted that all the array arguments would be packed. 
:arg args_to_unpack: A list of the arguments as instances of :class:`str` From e390b53b1f126736ec65fa580edb1b3b5a47f2f1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 21 Oct 2024 21:24:38 -0500 Subject: [PATCH 59/68] RecursiveMapper -> Mapper --- loopy/expression.py | 4 ++-- loopy/target/c/codegen/expression.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/loopy/expression.py b/loopy/expression.py index 224521b03..2581ec022 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -23,7 +23,7 @@ import numpy as np -from pymbolic.mapper import RecursiveMapper +from pymbolic.mapper import Mapper from loopy.codegen import UnvectorizableError from loopy.diagnostic import LoopyError @@ -55,7 +55,7 @@ def dtype_to_type_context(target, dtype): # {{{ vectorizability checker -class VectorizabilityChecker(RecursiveMapper): +class VectorizabilityChecker(Mapper): """The return value from this mapper is a :class:`bool` indicating whether the result of the expression is vectorized along :attr:`vec_iname`. 
If the expression is not vectorizable, the mapper raises diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 276407fc1..810ac4379 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -28,7 +28,7 @@ import islpy as isl import pymbolic.primitives as p from pymbolic import var -from pymbolic.mapper import IdentityMapper, RecursiveMapper +from pymbolic.mapper import IdentityMapper, Mapper from pymbolic.mapper.stringifier import ( PREC_BITWISE_AND, PREC_BITWISE_OR, @@ -124,9 +124,8 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s): return s - def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None): - result = RecursiveMapper.rec(self, expr, type_context) - + def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None): # type: ignore[override] + result = Mapper.rec(self, expr, type_context) if needed_type is None: return result else: @@ -604,7 +603,7 @@ def map_nan(self, expr, type_context): # {{{ C expression to code mapper -class CExpressionToCodeMapper(RecursiveMapper): +class CExpressionToCodeMapper(Mapper): # {{{ helpers From 94d64dd527277742732090a8d310256d53fbf3b7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 3 Oct 2024 09:01:03 -0500 Subject: [PATCH 60/68] Limit Github PR CI concurrency --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2c6c77e77..234988280 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,10 @@ on: schedule: - cron: '17 3 * * 0' +concurrency: + group: ${{ github.head_ref || github.ref_name }} + cancel-in-progress: true + jobs: ruff: name: Ruff From 3009c048372b4c89ed8c1c3db8357abc4ed0c29a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 21 Oct 2024 21:24:20 -0500 Subject: [PATCH 61/68] Call flatten() on expressions that are assumed to be
simplified --- loopy/codegen/loop.py | 3 ++- loopy/symbolic.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 645a57e31..c64c2ea67 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -28,6 +28,7 @@ from loopy.codegen.control import build_loop_nest from loopy.codegen.result import merge_codegen_results from loopy.diagnostic import LoopyError, warn +from loopy.symbolic import flatten # {{{ conditional-reducing slab decomposition @@ -309,7 +310,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, codegen_state = codegen_state.intersect(slab) from loopy.symbolic import pw_aff_to_expr - hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound) + hw_axis_expr = flatten(hw_axis_expr + pw_aff_to_expr(lower_bound)) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 22dbd3bf5..6727423a8 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -54,6 +54,7 @@ ) from pymbolic.mapper.dependency import CachedDependencyMapper as DependencyMapperBase from pymbolic.mapper.evaluator import CachedEvaluationMapper as EvaluationMapperBase +from pymbolic.mapper.flattener import FlattenMapper as FlattenMapperBase from pymbolic.mapper.stringifier import StringifyMapper as StringifyMapperBase from pymbolic.mapper.substitutor import ( CachedSubstitutionMapper as SubstitutionMapperBase, @@ -195,6 +196,14 @@ def map_resolved_function(self, expr, *args, **kwargs): map_fortran_division = IdentityMapperBase.map_quotient +class FlattenMapper(FlattenMapperBase, IdentityMapperMixin): + pass + + +def flatten(expr: ExpressionT) -> ExpressionT: + return FlattenMapper()(expr) + + class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): pass @@ -1833,7 +1842,7 @@ def aff_to_expr(aff: isl.Aff) -> ExpressionT: if coeff: result += coeff*aff_to_expr(aff.get_div(i)) - return result // denom + return flatten(result // denom) def pw_aff_to_expr(pw_aff: isl.PwAff, 
int_ok: bool = False) -> ExpressionT: @@ -2178,14 +2187,17 @@ def qpolynomial_to_expr(qpoly): assert all(isinstance(num, int) for num in numerators) assert isinstance(common_denominator, int) + # FIXME: Delete if in favor of the general case once we depend on pymbolic 2024.1. if common_denominator == 1: - return sum(num * monomial + res = sum(num * monomial for num, monomial in zip(numerators, monomials)) else: - return FloorDiv(sum(num * monomial + res = FloorDiv(sum(num * monomial for num, monomial in zip(numerators, monomials)), common_denominator) + return flatten(res) + # }}} From 123f534bb31b0957ecaf9d04d85b8c2c90772843 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 30 Oct 2024 15:18:49 -0500 Subject: [PATCH 62/68] Add a few more calls to flatten for compat with pymbolic 2024.1 --- doc/tutorial.rst | 2 +- loopy/kernel/array.py | 4 +++- loopy/statistics.py | 4 ++-- loopy/symbolic.py | 26 ++++++++++++++++++++++---- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index b0d9cebd4..4aeb42428 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1318,7 +1318,7 @@ The kernel translates into two OpenCL kernels. int tmp; tmp = tmp_save_slot[16 * gid(0) + lid(0)]; - arr[(lid(0) + gid(0) * 16 + 1) % n] = tmp; + arr[(1 + lid(0) + gid(0) * 16) % n] = tmp; } Now we can execute the kernel. 
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 884c26d2f..84477749f 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1,5 +1,7 @@ from __future__ import annotations +from loopy.symbolic import flatten + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -1318,7 +1320,7 @@ def eval_expr_assert_integer_constant(i, expr): "make_temporaries_for_offsets_and_strides " "during preprocessing.") - subscripts[dim_tag.target_axis] += (stride // vector_size)*idx + subscripts[dim_tag.target_axis] += flatten((stride // vector_size)*idx) elif isinstance(dim_tag, SeparateArrayArrayDimTag): raise AssertionError() diff --git a/loopy/statistics.py b/loopy/statistics.py index 29ea91259..2d0537fdb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -36,7 +36,7 @@ from loopy.diagnostic import LoopyError, warn_with_kernel from loopy.kernel.data import AddressSpace, MultiAssignmentBase, TemporaryVariable from loopy.kernel.function_interface import CallableKernel -from loopy.symbolic import CoefficientCollector +from loopy.symbolic import CoefficientCollector, flatten from loopy.translation_unit import TranslationUnit @@ -1167,7 +1167,7 @@ def get_iname_strides(tag_to_iname_dict): total_iname_stride += axis_tag_stride*coeff - tag_to_stride_dict[tag] = total_iname_stride + tag_to_stride_dict[tag] = flatten(total_iname_stride) return tag_to_stride_dict diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6727423a8..86e854bd2 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -29,7 +29,17 @@ import re from functools import cached_property, reduce from sys import intern -from typing import TYPE_CHECKING, AbstractSet, Any, ClassVar, Mapping, Sequence, Tuple +from typing import ( + TYPE_CHECKING, + AbstractSet, + Any, + ClassVar, + Mapping, + Sequence, + Tuple, + TypeVar, + cast, +) import immutables import numpy as np @@ -39,6 +49,7 @@ import pymbolic.primitives as p import pytools.lex from islpy import dim_type +from pymbolic 
import ArithmeticExpressionT from pymbolic.mapper import ( CachedCombineMapper as CombineMapperBase, CachedIdentityMapper as IdentityMapperBase, @@ -200,8 +211,14 @@ class FlattenMapper(FlattenMapperBase, IdentityMapperMixin): pass -def flatten(expr: ExpressionT) -> ExpressionT: - return FlattenMapper()(expr) +ArithmeticOrExpressionT = TypeVar( + "ArithmeticOrExpressionT", + ArithmeticExpressionT, + ExpressionT) + + +def flatten(expr: ArithmeticOrExpressionT) -> ArithmeticOrExpressionT: + return cast(ArithmeticOrExpressionT, FlattenMapper()(expr)) class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): @@ -2127,7 +2144,8 @@ def simplify_using_aff(kernel, expr): try: aff = guarded_aff_from_expr(domain.space, expr) except ExpressionToAffineConversionError: - return expr + # Accomplish at least *some* simplification + return flatten(expr) # FIXME: Deal with assumptions, too. aff = aff.gist(domain) From 442d3ef25b782bee6d2fcf4dc0381708f849b4d7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 30 Oct 2024 15:51:36 -0500 Subject: [PATCH 63/68] Change deprecated calls to Expression.index --- loopy/target/execution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 21600c734..d1455530d 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -187,7 +187,7 @@ def generate_integer_arg_finding_from_array_data( if shape_i is not None: equations.append( _ArgFindingEquation( - lhs=var(arg.name).attr("shape").index(axis_nr), + lhs=var(arg.name).attr("shape")[axis_nr], rhs=shape_i, order=0, based_on_names=frozenset({arg.name}))) @@ -198,7 +198,7 @@ def generate_integer_arg_finding_from_array_data( equations.append( _ArgFindingEquation( lhs=var("_lpy_even_div")( - var(arg.name).attr("strides").index(axis_nr), + var(arg.name).attr("strides")[axis_nr], arg.dtype.itemsize), rhs=_str_to_expr(stride_i), order=0, From 510ad10385a135687d9c1e5717e11f2c4f314835 Mon Sep 17 
00:00:00 2001 From: Andreas Kloeckner Date: Wed, 30 Oct 2024 15:51:49 -0500 Subject: [PATCH 64/68] Silence spurious pylint warning in kernel creation --- loopy/kernel/creation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index d023015fa..4f1803f24 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1070,7 +1070,8 @@ def parse_domains(domains, defines): if isinstance(dom, str): dom, = expand_defines(dom, defines) - if not dom.lstrip().startswith("["): + # pylint warning is spurious + if not dom.lstrip().startswith("["): # pylint: disable=no-member # i.e. if no parameters are already given parameters = (_gather_isl_identifiers(dom) - _find_inames_in_set(dom) From e54799bc2776a0577395fe95c788dcdac3b18082 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Nov 2024 16:49:26 -0500 Subject: [PATCH 65/68] Add calls to flatten() in precompute and privatize --- loopy/transform/precompute.py | 3 ++- loopy/transform/privatize.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index d04fa5b2d..6208f90ff 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -47,6 +47,7 @@ RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, + flatten, get_dependencies, ) from loopy.transform.array_buffer_map import ( @@ -928,7 +929,7 @@ def add_assumptions(d): storage_axis_subst_dict[ prior_storage_axis_name_dict.get(arg_name, arg_name)] = \ - arg+base_index + flatten(arg+base_index) rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py index ef878c90f..ca31368d2 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ -43,7 +43,7 @@ # {{{ privatize temporaries with iname -from loopy.symbolic 
import IdentityMapper +from loopy.symbolic import IdentityMapper, flatten class ExtraInameIndexInserter(IdentityMapper): @@ -66,7 +66,7 @@ def map_subscript(self, expr): self.seen_priv_axis_inames.update(v.name for v in extra_idx) - new_idx = index + tuple(v - self.iname_to_lbound[v.name] + new_idx = index + tuple(flatten(v - self.iname_to_lbound[v.name]) for v in extra_idx) if len(new_idx) == 1: @@ -81,7 +81,7 @@ def map_variable(self, expr): else: self.seen_priv_axis_inames.update(v.name for v in new_idx) - new_idx = tuple(v - self.iname_to_lbound[v.name] + new_idx = tuple(flatten(v - self.iname_to_lbound[v.name]) for v in new_idx) if len(new_idx) == 1: From da8537c3558718c08d70cff147224e27bce77a73 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Nov 2024 16:49:38 -0500 Subject: [PATCH 66/68] Swap out a deprecated .index --- loopy/transform/precompute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 6208f90ff..2c91643ac 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -297,7 +297,7 @@ def map_substitution(self, name, tag, arguments, expn_state): new_outer_expr = var(self.temporary_name) if stor_subscript: - new_outer_expr = new_outer_expr.index(tuple(stor_subscript)) + new_outer_expr = new_outer_expr[tuple(stor_subscript)] # Can't possibly be nested, and no need to traverse # further as compute expression has already been seen From 7ac9fa6c31797da664688be6fb71a11d9aaae770 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Nov 2024 16:55:01 -0500 Subject: [PATCH 67/68] Fix some typos --- loopy/schedule/tools.py | 2 +- loopy/statistics.py | 8 ++++---- loopy/transform/realize_reduction.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index a7860e343..3858462b1 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -886,7 +886,7 @@ def 
_raise_loopy_err(x): _update_nesting_constraints(relaxed_priorities, warn) # ordered_loop_nests: A mapping from the unordered loop nests to their - # ordered couterparts. For example. If we had only one loop nest + # ordered counterparts. For example. If we had only one loop nest # `frozenset({"i", "j", "k"})`, and the prioirities said added the # constraint that "i" must be nested within "k", then `ordered_loop_nests` # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop diff --git a/loopy/statistics.py b/loopy/statistics.py index 2d0537fdb..99b163f80 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -422,16 +422,16 @@ def to_bytes(self): bytes_map = get_mem_access_map(knl).to_bytes() params = {"n": 512, "m": 256, "l": 128} - s1_g_ld_byt = bytes_map.filter_by( + s1_g_ld_bytes = bytes_map.filter_by( mtype=["global"], lid_strides={0: 1}, direction=["load"]).eval_and_sum(params) - s2_g_ld_byt = bytes_map.filter_by( + s2_g_ld_bytes = bytes_map.filter_by( mtype=["global"], lid_strides={0: 2}, direction=["load"]).eval_and_sum(params) - s1_g_st_byt = bytes_map.filter_by( + s1_g_st_bytes = bytes_map.filter_by( mtype=["global"], lid_strides={0: 1}, direction=["store"]).eval_and_sum(params) - s2_g_st_byt = bytes_map.filter_by( + s2_g_st_bytes = bytes_map.filter_by( mtype=["global"], lid_strides={0: 2}, direction=["store"]).eval_and_sum(params) diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index 8aea6541a..7d1f3c870 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -711,7 +711,7 @@ def _add_to_depends_on(insn_id, new_depends_on_params): needs_replacement = True - # {{{ generate a new assignent instruction + # {{{ generate a new assignment instruction new_assignee_name = var_name_gen( "{insn_id}_retval_{assignee_nr}" From da84302ab455de0eacb9a3fad449ef77ad9db604 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 6 Nov 2024 09:57:57 -0600 Subject: 
[PATCH 68/68] Un-type symbolic.flatten This restores compatibility with pymbolic 2022.2 x-ref: https://github.com/firedrakeproject/loopy/issues/27 --- loopy/symbolic.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 86e854bd2..f0e0333ec 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -37,8 +37,6 @@ Mapping, Sequence, Tuple, - TypeVar, - cast, ) import immutables @@ -49,7 +47,6 @@ import pymbolic.primitives as p import pytools.lex from islpy import dim_type -from pymbolic import ArithmeticExpressionT from pymbolic.mapper import ( CachedCombineMapper as CombineMapperBase, CachedIdentityMapper as IdentityMapperBase, @@ -211,14 +208,8 @@ class FlattenMapper(FlattenMapperBase, IdentityMapperMixin): pass -ArithmeticOrExpressionT = TypeVar( - "ArithmeticOrExpressionT", - ArithmeticExpressionT, - ExpressionT) - - -def flatten(expr: ArithmeticOrExpressionT) -> ArithmeticOrExpressionT: - return cast(ArithmeticOrExpressionT, FlattenMapper()(expr)) +def flatten(expr): + return FlattenMapper()(expr) class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):