diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 32707fe86..234988280 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,10 @@ on: schedule: - cron: '17 3 * * 0' +concurrency: + group: ${{ github.head_ref || github.ref_name }} + cancel-in-progress: true + jobs: ruff: name: Ruff @@ -20,6 +24,13 @@ jobs: pipx install ruff ruff check + typos: + name: Typos + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: crate-ci/typos@master + pylint: name: Pylint runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 7cf3c4751..4378c7122 100644 --- a/.gitignore +++ b/.gitignore @@ -21,8 +21,6 @@ lextab.py yacctab.py .pytest_cache/* -loopy/_git_rev.py - .cache .env virtualenv-[0-9]*[0-9] diff --git a/MEMO b/MEMO index f4e5c34e4..5a9438811 100644 --- a/MEMO +++ b/MEMO @@ -7,7 +7,7 @@ Documentation Notes Things to consider ^^^^^^^^^^^^^^^^^^ -- Depedencies are pointwise for shared loop dimensions +- Dependencies are pointwise for shared loop dimensions and global over non-shared ones (between dependent and ancestor) - multiple insns could fight over which iname gets local axis 0 diff --git a/README.rst b/README.rst index 1ef7773db..87c4887a3 100644 --- a/README.rst +++ b/README.rst @@ -4,9 +4,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code .. image:: https://gitlab.tiker.net/inducer/loopy/badges/main/pipeline.svg :alt: Gitlab Build Status :target: https://gitlab.tiker.net/inducer/loopy/commits/main -.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main&event=push +.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main :alt: Github Build Status - :target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI+event%3Apush + :target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI .. 
image:: https://badge.fury.io/py/loopy.png :alt: Python Package Index Release Page :target: https://pypi.org/project/loopy/ diff --git a/contrib/mem-pattern-explorer/pattern_vis.py b/contrib/mem-pattern-explorer/pattern_vis.py index 82a2b9602..bbde23174 100644 --- a/contrib/mem-pattern-explorer/pattern_vis.py +++ b/contrib/mem-pattern-explorer/pattern_vis.py @@ -76,7 +76,7 @@ def tick(self): class Array: def __init__(self, ctx, name, shape, strides, elements_per_row=None): # Each array element stores a tuple: - # (timestamp, subgroup, g0, g1, g2, ) of last acccess + # (timestamp, subgroup, g0, g1, g2, ) of last access assert len(shape) == len(strides) diff --git a/doc/misc.rst b/doc/misc.rst index 3fea6fdd4..be1c964cd 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -158,7 +158,7 @@ In the meantime, you can generate code simply by saying:: print(cg_result.host_code()) print(cg_result.device_code()) -Additionally, for C-based languages, header defintions are available via:: +Additionally, for C-based languages, header definitions are available via:: loopy.generate_header(knl) @@ -338,8 +338,8 @@ This list is always growing, but here are a few pointers: Use :func:`loopy.join_inames`. -In what sense does Loopy suport vectorization? ----------------------------------------------- +In what sense does Loopy support vectorization? +----------------------------------------------- There are really two ways in which the OpenCL/CUDA model of computation exposes vectorization: @@ -352,7 +352,7 @@ vectorization: e.g. ``float4``, which support arithmetic with implicit vector semantics as well as a number of 'intrinsic' functions. -Loopy suports both. The first one, SIMT, is accessible by tagging inames with, +Loopy supports both. The first one, SIMT, is accessible by tagging inames with, e.g., ``l.0```. Accessing the second one requires using both execution- and data-reshaping capabilities in loopy. 
To start with, you need an array that has an axis with the length of the desired vector. If that's not yet available, diff --git a/doc/ref_internals.rst b/doc/ref_internals.rst index 3dc0a2bd7..86e2edad2 100644 --- a/doc/ref_internals.rst +++ b/doc/ref_internals.rst @@ -53,3 +53,7 @@ Schedule -------- .. automodule:: loopy.schedule +.. automodule:: loopy.schedule.tools +.. automodule:: loopy.schedule.tree + + diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 1fa237b25..2962c23b8 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -262,6 +262,7 @@ Instructions .. {{{ +.. autoclass:: HappensAfter .. autoclass:: InstructionBase .. _assignments: diff --git a/doc/ref_other.rst b/doc/ref_other.rst index e5059380d..538f0cdb9 100644 --- a/doc/ref_other.rst +++ b/doc/ref_other.rst @@ -1,6 +1,11 @@ Reference: Other Functionality ============================== +Auxiliary Data Types +-------------------- + +.. automodule:: loopy.typing + Obtaining Kernel Performance Statistics --------------------------------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index a697bed30..4aeb42428 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -438,7 +438,8 @@ with identical bounds, for the use of the transpose: ... out[ii,jj] = 2*out[ii,jj] {dep=transpose} ... """, ... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...]) - >>> knl = lp.prioritize_loops(knl, "i,j,ii,jj") + >>> knl = lp.prioritize_loops(knl, "i,j") + >>> knl = lp.prioritize_loops(knl, "ii,jj") :func:`loopy.duplicate_inames` can be used to achieve the same goal. Now the intended code is generated and our test passes. @@ -613,7 +614,7 @@ commonly called 'loop tiling': ... 
assumptions="n mod 16 = 0 and n >= 1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.split_iname(knl, "j", 16) - >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner") + >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner") >>> knl = lp.set_options(knl, write_code=True) >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) @@ -822,7 +823,7 @@ enabling some cost savings: { int const i_outer = -1 + n + -1 * ((3 * n) / 4); - if (-1 + n >= 0) + if (i_outer >= 0) { a[4 * i_outer] = (float) (0.0f); if (-2 + -4 * i_outer + n >= 0) @@ -957,7 +958,7 @@ Consider the following example: ... "{ [i_outer,i_inner, k]: " ... "0<= 16*i_outer + i_inner a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10} + ... <> a_temp[i_inner] = a[16*i_outer + i_inner] ... out[16*i_outer + i_inner] = sum(k, a_temp[k]) ... """) >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) @@ -1032,8 +1033,8 @@ transformation exists in :func:`loopy.add_prefetch`: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - acc_k = 0.0f; a_fetch = a[16 * gid(0) + lid(0)]; + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch; out[16 * gid(0) + lid(0)] = acc_k; @@ -1056,12 +1057,11 @@ earlier: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - acc_k = 0.0f; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch[lid(0)] = a[16 * gid(0) + lid(0)]; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch[lid(0)]; out[16 * gid(0) + lid(0)] = acc_k; @@ -1209,6 +1209,12 @@ Let us start with an example. Consider the kernel from above with a ... assumptions="n mod 16 = 0") >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") +.. testsetup:: + + >>> prog = prog.with_kernel( + ... prog.default_entrypoint.copy( + ... 
silenced_warnings=["v1_scheduler_fallback"])) + Here is what happens when we try to generate code for the kernel: >>> cgr = lp.generate_code_v2(prog) @@ -1312,7 +1318,7 @@ The kernel translates into two OpenCL kernels. int tmp; tmp = tmp_save_slot[16 * gid(0) + lid(0)]; - arr[(lid(0) + gid(0) * 16 + 1) % n] = tmp; + arr[(1 + lid(0) + gid(0) * 16) % n] = tmp; } Now we can execute the kernel. @@ -1903,18 +1909,16 @@ Now to make things more interesting, we'll create a kernel with barriers: { __local int c[50 * 10 * 99]; - { - int const k_outer = 0; - + for (int i = 0; i <= 49; ++i) for (int j = 0; j <= 9; ++j) - for (int i = 0; i <= 49; ++i) - { - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; - c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; - e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; - } - } + { + int const k_outer = 0; + + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; + c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; + e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; + } } In this kernel, when a work-item performs the second instruction it uses data diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index d9ac1f1b2..64fcb0af4 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(prog)" + "print(prog) # noqa: F821" ] }, { @@ -105,17 +105,8 @@ "metadata": {}, "outputs": [], "source": [ - "print(prog)" + "print(prog) # noqa: F821" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - 
"collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 070e0d071..bf6e29e47 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -24,10 +24,7 @@ def transform(knl, vars, stream_dtype): knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1)) knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") - knl = lp.add_and_infer_dtypes(knl, { - var: stream_dtype - for var in vars - }) + knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype)) knl = lp.set_argument_order(knl, vars + ["n"]) diff --git a/loopy/__init__.py b/loopy/__init__.py index 275d4f26e..149a1af26 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -56,6 +56,7 @@ BarrierInstruction, CallInstruction, CInstruction, + HappensAfter, InstructionBase, LegacyStringInstructionTag, MemoryOrdering, @@ -203,15 +204,9 @@ find_rules_matching, ) from loopy.translation_unit import TranslationUnit, for_each_kernel, make_program - -# }}} from loopy.type_inference import infer_unknown_types from loopy.types import to_loopy_type - -# {{{ imported user interface from loopy.typing import auto - -# {{{ import transforms from loopy.version import MOST_RECENT_LANGUAGE_VERSION, VERSION @@ -242,6 +237,7 @@ "ExecutorBase", "GeneratedProgram", "GlobalArg", + "HappensAfter", "ISPCTarget", "ImageArg", "InKernelCallable", @@ -563,18 +559,18 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None): indices = ["i%d" % i for i in range(rank)] shape = ["n%d" % i for i in range(rank)] - commad_indices = ", ".join(indices) + command_indices = ", ".join(indices) bounds = " and ".join( f"0<={ind}<{shape_i}" for ind, shape_i in zip(indices, shape)) set_str = "{{[{}]: {} }}".format( - commad_indices, + command_indices, bounds ) result = make_kernel(set_str, "output[%s] = input[%s]" - % (commad_indices, commad_indices), + % (command_indices, command_indices), 
lang_version=MOST_RECENT_LANGUAGE_VERSION, default_offset=auto) diff --git a/loopy/check.py b/loopy/check.py index 17887cff5..31bbd7b0f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -22,6 +22,7 @@ import logging from collections import defaultdict +from collections.abc import Mapping, Sequence from functools import reduce from typing import List, Optional, Tuple, Union @@ -43,7 +44,15 @@ FixedStrideArrayDimTag, SeparateArrayArrayDimTag, ) -from loopy.kernel.data import ArrayArg, ArrayDimImplementationTag, auto +from loopy.kernel.data import ( + AddressSpace, + ArrayArg, + ArrayDimImplementationTag, + InameImplementationTag, + TemporaryVariable, + auto, +) +from loopy.kernel.function_interface import CallableKernel from loopy.kernel.instruction import ( CallInstruction, CInstruction, @@ -51,10 +60,14 @@ NoOpInstruction, _DataObliviousInstruction, ) -from loopy.symbolic import CombineMapper, ResolvedFunction, WalkMapper -from loopy.translation_unit import for_each_kernel +from loopy.symbolic import CombineMapper, ResolvedFunction, SubArrayRef, WalkMapper +from loopy.translation_unit import ( + CallablesTable, + TranslationUnit, + check_each_kernel, +) from loopy.type_inference import TypeReader -from loopy.typing import ExpressionT +from loopy.typing import ExpressionT, not_none logger = logging.getLogger(__name__) @@ -144,8 +157,8 @@ def map_constant(self, expr): map_nan = map_constant -@for_each_kernel -def check_functions_are_resolved(kernel): +@check_each_kernel +def check_functions_are_resolved(kernel: LoopKernel) -> None: """ Checks if all call nodes in the *kernel* expression have been resolved. """ @@ -166,7 +179,7 @@ def check_functions_are_resolved(kernel): raise NotImplementedError(type(insn)) -@for_each_kernel +@check_each_kernel def check_separated_array_consistency(kernel: LoopKernel) -> None: # Boo. This is (part of) the price of redundant representation. 
for arg in kernel.args: @@ -197,7 +210,7 @@ def check_separated_array_consistency(kernel: LoopKernel) -> None: f"'{sub_arg.name}' is not consistent.") -@for_each_kernel +@check_each_kernel def check_offsets_and_dim_tags(kernel: LoopKernel) -> None: from pymbolic.primitives import Expression, Variable @@ -356,8 +369,8 @@ def check_for_integer_subscript_indices(t_unit): raise NotImplementedError(type(clbl).__name__) -@for_each_kernel -def check_sub_array_ref_inames_not_within_or_redn_inames(kernel): +@check_each_kernel +def check_sub_array_ref_inames_not_within_or_redn_inames(kernel: LoopKernel) -> None: all_within_inames = frozenset().union(*(insn.within_inames for insn in kernel.instructions)) all_redn_inames = frozenset().union(*(insn.reduction_inames() @@ -378,8 +391,8 @@ def check_sub_array_ref_inames_not_within_or_redn_inames(kernel): " illegal.") -@for_each_kernel -def check_insn_attributes(kernel): +@check_each_kernel +def check_insn_attributes(kernel: LoopKernel) -> None: """ Check for legality of attributes of every instruction in *kernel*. """ @@ -412,8 +425,8 @@ def check_insn_attributes(kernel): ", ".join(no_sync_with_scopes - VALID_NOSYNC_SCOPES))) -@for_each_kernel -def check_for_duplicate_insn_ids(knl): +@check_each_kernel +def check_for_duplicate_insn_ids(knl: LoopKernel) -> None: """ Check if multiple instructions of *knl* have the same :attr:`loopy.InstructionBase.id`. @@ -428,8 +441,8 @@ def check_for_duplicate_insn_ids(knl): insn_ids.add(insn.id) -@for_each_kernel -def check_loop_priority_inames_known(kernel): +@check_each_kernel +def check_loop_priority_inames_known(kernel: LoopKernel) -> None: """ Checks if the inames in :attr:`loopy.LoopKernel.loop_priority` are part of the *kernel*'s domain. 
@@ -440,8 +453,8 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) -@for_each_kernel -def check_multiple_tags_allowed(kernel): +@check_each_kernel +def check_multiple_tags_allowed(kernel: LoopKernel) -> None: """ Checks if a multiple tags of an iname are compatible. """ @@ -465,7 +478,10 @@ def check_multiple_tags_allowed(kernel): "tags: {}".format(iname.name, iname.tags)) -def _check_for_double_use_of_hw_axes_inner(kernel, callables_table): +def _check_for_double_use_of_hw_axes_inner( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: from loopy.kernel.data import GroupInameTag, LocalInameTag, UniqueInameTag from loopy.kernel.instruction import CallInstruction from loopy.symbolic import ResolvedFunction @@ -489,7 +505,7 @@ def _check_for_double_use_of_hw_axes_inner(kernel, callables_table): insn_tag_keys.add(key) -def check_for_double_use_of_hw_axes(t_unit): +def check_for_double_use_of_hw_axes(t_unit: TranslationUnit) -> None: """ Check if any instruction of *kernel* is within multiple inames tagged with the same hw axis tag. @@ -505,8 +521,8 @@ def check_for_double_use_of_hw_axes(t_unit): raise NotImplementedError(type(clbl).__name__) -@for_each_kernel -def check_for_inactive_iname_access(kernel): +@check_each_kernel +def check_for_inactive_iname_access(kernel: LoopKernel) -> None: """ Check if any instruction accesses an iname but is not within it. """ @@ -523,8 +539,8 @@ def check_for_inactive_iname_access(kernel): - insn.within_inames), kernel.name)) -@for_each_kernel -def check_for_unused_inames(kernel): +@check_each_kernel +def check_for_unused_inames(kernel: LoopKernel) -> None: """ Check if there are any unused inames in the kernel. 
""" @@ -540,7 +556,7 @@ def check_for_unused_inames(kernel): % unused_inames) -def _is_racing_iname_tag(tv, tag): +def _is_racing_iname_tag(tv: TemporaryVariable, tag: InameImplementationTag) -> bool: from loopy.kernel.data import ( AddressSpace, ConcurrentTag, @@ -571,8 +587,8 @@ def _is_racing_iname_tag(tv, tag): "temporary variable '%s'" % tv.name) -@for_each_kernel -def check_for_write_races(kernel): +@check_each_kernel +def check_for_write_races(kernel: LoopKernel) -> None: """ Check if any memory accesses lead to write races. """ @@ -620,8 +636,8 @@ def check_for_write_races(kernel): WriteRaceConditionWarning) -@for_each_kernel -def check_for_data_dependent_parallel_bounds(kernel): +@check_each_kernel +def check_for_data_dependent_parallel_bounds(kernel: LoopKernel) -> None: """ Check that inames tagged as hw axes have bounds that are known at kernel launch. @@ -848,7 +864,7 @@ def map_call(self, expr, domain, insn_id): _check_bounds_inner_rec(subkernel, self.callables_table) -def _check_bounds_inner(kernel, callables_table): +def _check_bounds_inner(kernel: LoopKernel, callables_table: CallablesTable) -> None: from loopy.kernel.instruction import get_insn_domain temp_var_names = set(kernel.temporary_variables) @@ -874,7 +890,10 @@ def run_acm(expr): insn.with_transformed_expressions(run_acm) -def _check_bounds_inner_rec(kernel, callables_table): +def _check_bounds_inner_rec( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: if kernel.options.enforce_array_accesses_within_bounds not in [ "no_check", True, @@ -899,7 +918,7 @@ def _check_bounds_inner_rec(kernel, callables_table): warn_with_kernel(kernel, "array_access_out_of_bounds", str(e)) -def check_bounds(t_unit): +def check_bounds(t_unit: TranslationUnit) -> None: """ Performs out-of-bound check for every array access. 
""" @@ -912,8 +931,8 @@ def check_bounds(t_unit): # {{{ check write destinations -@for_each_kernel -def check_write_destinations(kernel): +@check_each_kernel +def check_write_destinations(kernel: LoopKernel) -> None: for insn in kernel.instructions: for wvar in insn.assignee_var_names(): if wvar in kernel.all_inames(): @@ -940,8 +959,8 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting -@for_each_kernel -def check_has_schedulable_iname_nesting(kernel): +@check_each_kernel +def check_has_schedulable_iname_nesting(kernel: LoopKernel) -> None: from loopy.transform.iname import ( get_iname_duplication_options, has_schedulable_iname_nesting, @@ -988,8 +1007,8 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync -def _get_address_space(kernel, var): - from loopy.kernel.data import AddressSpace, ArrayArg, ValueArg +def _get_address_space(kernel: LoopKernel, var: str) -> AddressSpace | type[auto]: + from loopy.kernel.data import ArrayArg, ValueArg if var in kernel.temporary_variables: address_space = kernel.temporary_variables[var].address_space else: @@ -1005,7 +1024,7 @@ def _get_address_space(kernel, var): return address_space -def _get_topological_order(kernel): +def _get_topological_order(kernel: LoopKernel) -> Sequence[str]: """ Returns a :class:`list` of insn ids of *kernel* in a topological sort order. @@ -1033,7 +1052,7 @@ def _get_topological_order(kernel): return order -def _check_variable_access_ordered_inner(kernel): +def _check_variable_access_ordered_inner(kernel: LoopKernel) -> None: from loopy.kernel.tools import find_aliasing_equivalence_classes from loopy.symbolic import AccessRangeOverlapChecker overlap_checker = AccessRangeOverlapChecker(kernel) @@ -1050,7 +1069,7 @@ def _check_variable_access_ordered_inner(kernel): # the mapping in both directions. # # Note: This can be worst-case O(n^2) in the number of instructions. 
- dep_reqs_to_vars = {} + dep_reqs_to_vars: dict[tuple[str, str], set[str]] = {} wmap = kernel.writer_map() rmap = kernel.reader_map() @@ -1081,14 +1100,16 @@ def _check_variable_access_ordered_inner(kernel): # {{{ compute rev_depends, depends_on # depends_on: mapping from insn_ids to their dependencies - depends_on = {insn.id: set() for insn in kernel.instructions} + depends_on: dict[str, set[str]] = { + not_none(insn.id): set() for insn in kernel.instructions} # rev_depends: mapping from insn_ids to their reverse deps. - rev_depends = {insn.id: set() for insn in kernel.instructions} + rev_depends: dict[str, set[str]] = { + not_none(insn.id): set() for insn in kernel.instructions} for insn in kernel.instructions: - depends_on[insn.id].update(insn.depends_on) + depends_on[not_none(insn.id)].update(insn.depends_on) for dep in insn.depends_on: - rev_depends[dep].add(insn.id) + rev_depends[dep].add(not_none(insn.id)) # }}} @@ -1096,7 +1117,8 @@ def _check_variable_access_ordered_inner(kernel): topological_order = _get_topological_order(kernel) - def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order): + # TODO: Type this + def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order) -> None: """ Considering a graph defined by *edges* (as ``key -> value``), remove pairs of nodes from *dep_reqs_to_vars* for which edges @@ -1123,7 +1145,7 @@ def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order): # for each *pred*, we will calculate all the direct/indirect # instructions that can be reached. 
seen_successors = set() - # first let us start with direct sucessors + # first let us start with direct successors to_check = edges[pred].copy() while to_check: successor = to_check.pop() @@ -1214,12 +1236,12 @@ def satisfy_dep_reqs_in_order(dep_reqs_to_vars, edges, order): # }}} -@for_each_kernel -def check_variable_access_ordered(kernel): +@check_each_kernel +def check_variable_access_ordered(kernel: LoopKernel) -> None: """Checks that between each write to a variable and all other accesses to the variable there is either: - * a direct/indirect depdendency edge, or + * a direct/indirect dependency edge, or * an explicit statement that no ordering is necessary (expressed through a bi-directional :attr:`loopy.InstructionBase.no_sync_with`) """ @@ -1252,7 +1274,7 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(t_unit): +def pre_schedule_checks(t_unit: TranslationUnit) -> None: try: logger.debug("pre-schedule checks start for entrypoints: " f"{t_unit.entrypoints}.") @@ -1312,9 +1334,12 @@ def check_for_nested_base_storage(kernel: LoopKernel) -> None: storage_array = name_to_array.get(ary.base_storage, None) if storage_array is None: - raise ValueError("nothing known about storage array " + raise LoopyError("Nothing known about storage array " f"'{ary.base_storage}' serving as base_storage of " - f"'{ary.name}'") + f"'{ary.name}'. " + "(Note: base storage is no longer automatically allocated. 
" + "Call allocate_temporaries_for_base_storage to automatically " + "allocate.)") if storage_array.base_storage: raise ValueError("storage array " @@ -1327,8 +1352,11 @@ def check_for_nested_base_storage(kernel: LoopKernel) -> None: # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, - sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk( + kernel: LoopKernel, + callables_table: CallablesTable, + sched_index: int | None = None + ) -> int: from loopy.schedule import ( Barrier, CallKernel, @@ -1339,6 +1367,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, gather_schedule_block, get_insn_ids_for_block_at, ) + assert kernel.linearization is not None if sched_index is None: group_axes = set() @@ -1439,7 +1468,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, return past_end_i -def check_for_unused_hw_axes_in_insns(kernel, callables_table): +def check_for_unused_hw_axes_in_insns( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: if kernel.linearization: _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table) @@ -1449,7 +1481,9 @@ def check_for_unused_hw_axes_in_insns(kernel, callables_table): # {{{ check that atomic ops are used exactly on atomic arrays -def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): +def check_that_atomic_ops_are_used_exactly_on_atomic_arrays( + kernel: LoopKernel + ) -> None: from loopy.kernel.data import ArrayBase, Assignment from loopy.types import AtomicType atomicity_candidates = ( @@ -1484,7 +1518,9 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used -def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): +def check_that_temporaries_are_defined_in_subkernels_where_used( + kernel: LoopKernel + ) -> None: from loopy.kernel.data import AddressSpace from loopy.kernel.tools import 
get_subkernels @@ -1537,9 +1573,10 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): # {{{ check that all instructions are scheduled -def check_that_all_insns_are_scheduled(kernel): +def check_that_all_insns_are_scheduled(kernel: LoopKernel) -> None: + assert kernel.linearization is not None - all_schedulable_insns = {insn.id for insn in kernel.instructions} + all_schedulable_insns = {not_none(insn.id) for insn in kernel.instructions} from loopy.schedule import sched_item_to_insn_id scheduled_insns = { insn_id @@ -1559,7 +1596,7 @@ def check_that_all_insns_are_scheduled(kernel): # {{{ check that shapes and strides are arguments -def check_that_shapes_and_strides_are_arguments(kernel): +def check_that_shapes_and_strides_are_arguments(kernel: LoopKernel) -> None: import loopy as lp from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag from loopy.kernel.data import ValueArg @@ -1569,12 +1606,12 @@ def check_that_shapes_and_strides_are_arguments(kernel): arg.name for arg in kernel.args if isinstance(arg, ValueArg) - and arg.dtype.is_integral()} + and not_none(arg.dtype).is_integral()} for arg in kernel.args: if isinstance(arg, ArrayBase): if isinstance(arg.shape, tuple): - shape_deps = set() + shape_deps: set[str] = set() for shape_axis in arg.shape: if shape_axis is not None: shape_deps.update(get_dependencies(shape_axis)) @@ -1603,14 +1640,21 @@ def check_that_shapes_and_strides_are_arguments(kernel): # {{{ validate_kernel_call_sites -def _get_sub_array_ref_swept_range(kernel, sar): +def _get_sub_array_ref_swept_range( + kernel: LoopKernel, + sar: SubArrayRef + ) -> isl.Set: from loopy.symbolic import get_access_map domain = kernel.get_inames_domain(frozenset({iname_var.name for iname_var in sar.swept_inames})) return get_access_map(domain, sar.swept_inames, kernel.assumptions).range() -def _are_sub_array_refs_equivalent(sar1, sar2, caller): +def _are_sub_array_refs_equivalent( + sar1: SubArrayRef, + sar2: SubArrayRef, + caller: 
LoopKernel + ) -> bool: """ Returns *True* iff *sar1* and *sar2* are equivalent :class:`loopy.SubArrayRef`s. @@ -1653,7 +1697,11 @@ def _are_sub_array_refs_equivalent(sar1, sar2, caller): return True -def _validate_kernel_call_insn(caller, call_insn, callee): +def _validate_kernel_call_insn( + caller: LoopKernel, + call_insn: CallInstruction, + callee: LoopKernel + ) -> None: assert call_insn.expression.function.name == callee.name from loopy.kernel.array import ArrayBase from loopy.symbolic import SubArrayRef @@ -1703,7 +1751,10 @@ def _validate_kernel_call_insn(caller, call_insn, callee): f" (got {in_val}, {out_val}).") -def _validate_kernel_call_sites_inner(kernel, callables): +def _validate_kernel_call_sites_inner( + kernel: LoopKernel, + callables: CallablesTable, + ) -> None: from pymbolic.primitives import Call from loopy.kernel.function_interface import CallableKernel @@ -1722,11 +1773,12 @@ def _validate_kernel_call_sites_inner(kernel, callables): raise NotImplementedError(type(insn)) -def validate_kernel_call_sites(translation_unit): +def validate_kernel_call_sites(translation_unit: TranslationUnit) -> None: for name in translation_unit.callables_table: - clbl = translation_unit[name] - if isinstance(clbl, LoopKernel): - _validate_kernel_call_sites_inner(clbl, translation_unit.callables_table) + clbl = translation_unit.callables_table[name] + if isinstance(clbl, CallableKernel): + _validate_kernel_call_sites_inner( + clbl.subkernel, translation_unit.callables_table) # }}} @@ -1734,8 +1786,10 @@ def validate_kernel_call_sites(translation_unit): # {{{ check_all_callees_have_same_index_dtype -def check_all_callees_have_same_index_dtype(epoint: LoopKernel, - callables_table): +def check_all_callees_have_same_index_dtype( + epoint: LoopKernel, + callables_table: CallablesTable + ) -> None: from loopy.kernel.function_interface import CallableKernel epoint_clbl = callables_table[epoint.name] @@ -1752,7 +1806,10 @@ def 
check_all_callees_have_same_index_dtype(epoint: LoopKernel, # }}} -def pre_codegen_entrypoint_checks(kernel, callables_table): +def pre_codegen_entrypoint_checks( + kernel: LoopKernel, + callables_table: CallablesTable + ) -> None: logger.debug("pre-codegen entrypoint check %s: start" % kernel.name) kernel.target.pre_codegen_entrypoint_check(kernel, callables_table) @@ -1775,7 +1832,7 @@ def pre_codegen_callable_checks(kernel, callables_table): logger.debug("pre-codegen callable check %s: done" % kernel.name) -def pre_codegen_checks(t_unit): +def pre_codegen_checks(t_unit: TranslationUnit) -> None: from loopy.kernel.function_interface import CallableKernel try: @@ -1798,7 +1855,11 @@ def pre_codegen_checks(t_unit): # {{{ sanity-check for implemented domains of each instruction -def check_implemented_domains(kernel, implemented_domains, code=None): +def check_implemented_domains( + kernel: LoopKernel, + implemented_domains: Mapping[str, isl.Set], + code: str | None = None, + ) -> bool: from islpy import align_two, dim_type last_idomains = None diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 645a57e31..c64c2ea67 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -28,6 +28,7 @@ from loopy.codegen.control import build_loop_nest from loopy.codegen.result import merge_codegen_results from loopy.diagnostic import LoopyError, warn +from loopy.symbolic import flatten # {{{ conditional-reducing slab decomposition @@ -309,7 +310,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, codegen_state = codegen_state.intersect(slab) from loopy.symbolic import pw_aff_to_expr - hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound) + hw_axis_expr = flatten(hw_axis_expr + pw_aff_to_expr(lower_bound)) # }}} diff --git a/loopy/expression.py b/loopy/expression.py index 224521b03..2581ec022 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -23,7 +23,7 @@ import numpy as np -from pymbolic.mapper import RecursiveMapper 
+from pymbolic.mapper import Mapper from loopy.codegen import UnvectorizableError from loopy.diagnostic import LoopyError @@ -55,7 +55,7 @@ def dtype_to_type_context(target, dtype): # {{{ vectorizability checker -class VectorizabilityChecker(RecursiveMapper): +class VectorizabilityChecker(Mapper): """The return value from this mapper is a :class:`bool` indicating whether the result of the expression is vectorized along :attr:`vec_iname`. If the expression is not vectorizable, the mapper raises diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 530e92678..fc9eace87 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -200,7 +200,7 @@ def get_type(self, name, none_ok=False): return None raise TranslationError( - "no type for '%s' found in 'implict none' routine" + "no type for '%s' found in 'implicit none' routine" % name) from None return self.implicit_types.get(name[0], np.dtype(np.int32)) @@ -426,7 +426,7 @@ def map_Implicit(self, node): scope.implicit_types = None for stmt, specs in node.items: - if scope.implict_types is None: + if scope.implict_types is None: # spellchecker: disable-line raise TranslationError("implicit decl not allowed after " "'implicit none'") tp = self.dtype_from_stmt(stmt) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 160b6415b..28aa3be30 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -186,7 +186,7 @@ def simplify_pw_aff(pw_aff, context=None): continue if aff_i.gist(dom_j).is_equal(aff_j): - # aff_i is sufficient to conver aff_j, eliminate aff_j + # aff_i is sufficient to cover aff_j, eliminate aff_j new_pieces = pieces[:] if i < j: new_pieces.pop(j) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index a9b3bb07e..d85b1568a 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -74,7 +74,7 @@ from loopy.target import TargetBase from loopy.tools import update_persistent_hash from 
loopy.types import LoopyType, NumpyType -from loopy.typing import ExpressionT +from loopy.typing import ExpressionT, InameStr if TYPE_CHECKING: @@ -117,82 +117,25 @@ class LoopKernel(Taggable): even if it contains mutable data types. See :meth:`copy` for an easy way of producing a modified copy. - .. attribute:: domains - - a list of :class:`islpy.BasicSet` instances representing the - :ref:`domain-tree`. - - .. attribute:: instructions - - A list of :class:`InstructionBase` instances, e.g. - :class:`Assignment`. See :ref:`instructions`. - - .. attribute:: args - - A list of :class:`loopy.KernelArgument` - - .. attribute:: schedule - - *None* or a list of :class:`loopy.schedule.ScheduleItem` - - .. attribute:: name - .. attribute:: preambles - .. attribute:: preamble_generators - .. attribute:: assumptions - - A :class:`islpy.BasicSet` parameter domain. - - .. attribute:: temporary_variables - - A :class:`dict` of mapping variable names to - :class:`loopy.TemporaryVariable` - instances. - - .. attribute:: symbol_manglers - - .. attribute:: substitutions - - a mapping from substitution names to - :class:`SubstitutionRule` objects - - .. attribute:: iname_slab_increments - - a dictionary mapping inames to (lower_incr, - upper_incr) tuples that will be separated out in the execution to generate - 'bulk' slabs with fewer conditionals. - - .. attribute:: loop_priority - - A frozenset of priority constraints to the kernel. Each such constraint - is a tuple of inames. Inames occuring in such a tuple will be scheduled - earlier than any iname following in the tuple. This applies only to inames - with non-parallel implementation tags. - - .. attribute:: silenced_warnings - - .. attribute:: applied_iname_rewrites - - A list of past substitution dictionaries that - were applied to the kernel. These are stored so that they may be repeated - on expressions the user specifies later. - - .. attribute:: options - - An instance of :class:`loopy.Options` - - .. 
attribute:: state - - A value from :class:`KernelState`. - - .. attribute:: target - - A subclass of :class:`loopy.TargetBase`. - - .. attribute:: inames - - An instance of :class:`dict`, a mapping from the names of kernel's - inames to their corresponding instances of :class:`loopy.kernel.data.Iname`. - An entry is guaranteed to be present for each iname. + .. autoattribute:: domains + .. autoattribute:: instructions + .. autoattribute:: args + .. autoattribute:: schedule + .. autoattribute:: name + .. autoattribute:: preambles + .. autoattribute:: preamble_generators + .. autoattribute:: assumptions + .. autoattribute:: temporary_variables + .. autoattribute:: symbol_manglers + .. autoattribute:: substitutions + .. autoattribute:: iname_slab_increments + .. autoattribute:: loop_priority + .. autoattribute:: silenced_warnings + .. autoattribute:: applied_iname_rewrites + .. autoattribute:: options + .. autoattribute:: state + .. autoattribute:: target + .. autoattribute:: inames .. automethod:: __call__ .. automethod:: copy @@ -201,11 +144,25 @@ class LoopKernel(Taggable): .. automethod:: without_tags """ domains: Sequence[isl.BasicSet] + """Represents the :ref:`domain-tree`.""" + instructions: Sequence[InstructionBase] + """ + See :ref:`instructions`. + """ + args: Sequence[KernelArgument] assumptions: isl.BasicSet + """ + Must be a :class:`islpy.BasicSet` parameter domain. + """ + temporary_variables: Mapping[str, TemporaryVariable] - inames: Mapping[str, Iname] + inames: Mapping[InameStr, Iname] + """ + An entry is guaranteed to be present for each iname. 
+ """ + substitutions: Mapping[str, SubstitutionRule] options: Options target: TargetBase @@ -218,11 +175,29 @@ class LoopKernel(Taggable): symbol_manglers: Sequence[ Callable[["LoopKernel", str], Optional[Tuple[LoopyType, str]]]] = () linearization: Optional[Sequence[ScheduleItem]] = None - iname_slab_increments: Mapping[str, Tuple[int, int]] = field( + iname_slab_increments: Mapping[InameStr, Tuple[int, int]] = field( default_factory=Map) - loop_priority: FrozenSet[Tuple[str]] = field( + """ + A mapping from inames to (lower_incr, + upper_incr) tuples that will be separated out in the execution to generate + 'bulk' slabs with fewer conditionals. + """ + + loop_priority: FrozenSet[Tuple[InameStr, ...]] = field( default_factory=frozenset) - applied_iname_rewrites: Tuple[Dict[str, ExpressionT], ...] = () + """ + A frozenset of priority constraints to the kernel. Each such constraint + is a tuple of inames. Inames occurring in such a tuple will be scheduled + earlier than any iname following in the tuple. This applies only to inames + with non-parallel implementation tags. + """ + + applied_iname_rewrites: Tuple[Dict[InameStr, ExpressionT], ...] = () + """ + A list of past substitution dictionaries that + were applied to the kernel. These are stored so that they may be repeated + on expressions the user specifies later. 
+ """ index_dtype: NumpyType = NumpyType(np.dtype(np.int32)) silenced_warnings: FrozenSet[str] = frozenset() @@ -274,11 +249,11 @@ def all_variable_names(self): | {arg.name for arg in self.args} | set(self.all_inames())) - def get_var_name_generator(self): + def get_var_name_generator(self) -> UniqueNameGenerator: return UniqueNameGenerator(self.all_variable_names()) - def get_instruction_id_generator(self, based_on="insn"): - used_ids = {insn.id for insn in self.instructions} + def get_instruction_id_generator(self, based_on="insn") -> UniqueNameGenerator: + used_ids = {insn.id for insn in self.instructions if insn.id is not None} return UniqueNameGenerator(used_ids) @@ -515,7 +490,7 @@ def get_leaf_domain_indices(self, inames): for iname in inames: home_domain_index = hdm[iname] if home_domain_index in domain_indices: - # nothin' new + # nothing new continue domain_path_to_root = [home_domain_index] + ppd[home_domain_index] diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 43e1f86b5..84477749f 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1,7 +1,7 @@ -"""Implementation tagging of array axes.""" - from __future__ import annotations +from loopy.symbolic import flatten + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -45,6 +45,7 @@ from warnings import warn import numpy as np # noqa +from typing_extensions import TypeAlias from pytools import ImmutableRecord from pytools.tag import Tag, Taggable @@ -52,7 +53,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType -from loopy.typing import ExpressionT, ShapeType +from loopy.typing import ExpressionT, ShapeType, auto if TYPE_CHECKING: @@ -69,8 +70,6 @@ __doc__ = """ -.. currentmodule:: loopy.kernel.array - .. autoclass:: ArrayDimImplementationTag .. autoclass:: _StrideArrayDimTagBase @@ -84,6 +83,23 @@ .. autoclass:: VectorArrayDimTag .. 
autofunction:: parse_array_dim_tags + +Cross-references +---------------- + +(This section shouldn't exist: Sphinx should be able to resolve these on its own.) + +.. class:: ShapeType + + See :class:`loopy.typing.ShapeType` + +.. class:: ExpressionT + + See :class:`loopy.typing.ExpressionT` + +.. class:: Tag + + See :class:`pytools.tag.Tag` """ @@ -593,132 +609,137 @@ def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes, # {{{ array base class (for arguments and temporary arrays) -def _pymbolic_parse_if_necessary(x): - if isinstance(x, str): - from pymbolic import parse - return parse(x) - else: - return x +ToShapeLikeConvertible: TypeAlias = (Tuple[ExpressionT | str, ...] + | ExpressionT | type[auto] | str | tuple[str, ...]) -def _parse_shape_or_strides(x): - import loopy as lp - if x == "auto": - warn("use of 'auto' as a shape or stride won't work " - "any more--use loopy.auto instead", - stacklevel=3) - x = _pymbolic_parse_if_necessary(x) - if isinstance(x, lp.auto): - return x - assert not isinstance(x, list) - if not isinstance(x, tuple): - assert x is not lp.auto - x = (x,) +def _parse_shape_or_strides( + x: ToShapeLikeConvertible, + ) -> ShapeType | type[auto]: + from pymbolic import parse - return tuple(_pymbolic_parse_if_necessary(xi) for xi in x) - - -class ArrayBase(ImmutableRecord, Taggable): - """ - .. attribute :: name - - .. attribute :: dtype - - The :class:`loopy.types.LoopyType` of the array. If this is *None*, - :mod:`loopy` will try to continue without knowing the type of this - array, where the idea is that precise knowledge of the type will become - available at invocation time. Calling the kernel - (via :meth:`loopy.LoopKernel.__call__`) - automatically adds this type information based on invocation arguments. 
+ if x == "auto": + raise ValueError("use of 'auto' as a shape or stride won't work " + "any more--use loopy.auto instead") - Note that some transformations, such as :func:`loopy.add_padding` - cannot be performed without knowledge of the exact *dtype*. + if x is auto: + return auto - .. attribute :: shape + if isinstance(x, str): + x = parse(x) - May be one of the following: + if isinstance(x, list): + raise ValueError("shape can't be a list") - * *None*. In this case, no shape is intended to be specified, - only the strides will be used to access the array. Bounds checking - will not be performed. + if not isinstance(x, tuple): + assert x is not auto + x = (x,) - * :class:`loopy.auto`. The shape will be determined by finding the - access footprint. + return tuple(parse(xi) if isinstance(xi, str) else xi for xi in x) - * a tuple like like :attr:`numpy.ndarray.shape`. - Each entry of the tuple is also allowed to be a :mod:`pymbolic` - expression involving kernel parameters, or a (potentially-comma - separated) or a string that can be parsed to such an expression. +class ArrayBase(ImmutableRecord, Taggable): + """ + .. autoattribute:: name + .. autoattribute:: dtype + .. autoattribute:: shape + .. autoattribute:: dim_tags + .. autoattribute:: offset + .. autoattribute:: dim_names + .. autoattribute:: alignment + .. autoattribute:: tags - Any element of the shape tuple not used to compute strides - may be *None*. + .. automethod:: __init__ + .. automethod:: __eq__ + .. automethod:: num_user_axes + .. automethod:: num_target_axes + .. automethod:: vector_size - .. attribute:: dim_tags + (supports persistent hashing) + """ + name: str - See :ref:`data-dim-tags`. + dtype: Optional[LoopyType] + """The :class:`loopy.types.LoopyType` of the array. If this is *None*, + :mod:`loopy` will try to continue without knowing the type of this + array, where the idea is that precise knowledge of the type will become + available at invocation time. 
Calling the kernel + (via :meth:`loopy.LoopKernel.__call__`) + automatically adds this type information based on invocation arguments. + + Note that some transformations, such as :func:`loopy.add_padding` + cannot be performed without knowledge of the exact *dtype*. + """ - .. attribute:: offset + shape: Union[ShapeType, Type["auto"], None] + """ + May be one of the following: - Offset from the beginning of the buffer to the point from - which the strides are counted, in units of the :attr:`dtype`. - May be one of + * *None*. In this case, no shape is intended to be specified, + only the strides will be used to access the array. Bounds checking + will not be performed. - * 0 or None - * a string (that is interpreted as an argument name). - * a pymbolic expression - * :class:`loopy.auto`, in which case an offset argument - is added automatically, immediately following this argument. + * :class:`loopy.auto`. The shape will be determined by finding the + access footprint. - .. attribute:: dim_names + * a tuple like like :attr:`numpy.ndarray.shape`. - A tuple of strings providing names for the array axes, or *None*. - If given, must have the same number of entries as :attr:`dim_tags` - and :attr:`dim_tags`. These do not live in any particular namespace - (i.e. collide with no other names) and serve a purely - informational/documentational purpose. On occasion, they are used - to generate more informative names than could be achieved by - axis numbers. + Each entry of the tuple is also allowed to be a :mod:`pymbolic` + expression involving kernel parameters, or a (potentially-comma + separated) or a string that can be parsed to such an expression. - .. attribute:: alignment + Any element of the shape tuple not used to compute strides + may be *None*. + """ - Memory alignment of the array in bytes. For temporary arrays, - this ensures they are allocated with this alignment. 
For arguments, - this entails a promise that the incoming array obeys this alignment - restriction. + dim_tags: Optional[Sequence[ArrayDimImplementationTag]] + """See :ref:`data-dim-tags`. + """ - Defaults to *None*. + offset: Union[ExpressionT, str, None] + """Offset from the beginning of the buffer to the point from + which the strides are counted, in units of the :attr:`dtype`. + May be one of + + * 0 or None + * a string (that is interpreted as an argument name). + * a pymbolic expression + * :class:`loopy.auto`, in which case an offset argument + is added automatically, immediately following this argument. + """ - If an integer N is given, the array would be declared - with ``__attribute__((aligned(N)))`` in code generation for - :class:`loopy.CFamilyTarget`. + dim_names: Optional[Tuple[str, ...]] + """A tuple of strings providing names for the array axes, or *None*. + If given, must have the same number of entries as :attr:`dim_tags` + and :attr:`dim_tags`. These do not live in any particular namespace + (i.e. collide with no other names) and serve a purely + informational/documentational purpose. On occasion, they are used + to generate more informative names than could be achieved by + axis numbers. + """ - .. versionadded:: 2018.1 + alignment: Optional[int] + """Memory alignment of the array in bytes. For temporary arrays, + this ensures they are allocated with this alignment. For arguments, + this entails a promise that the incoming array obeys this alignment + restriction. - .. attribute:: tags + Defaults to *None*. - A (possibly empty) frozenset of instances of - :class:`pytools.tag.Tag` intended for - consumption by an application. + If an integer N is given, the array would be declared + with ``__attribute__((aligned(N)))`` in code generation for + :class:`loopy.CFamilyTarget`. - .. versionadded:: 2020.2.2 + .. versionadded:: 2018.1 + """ - .. automethod:: __init__ - .. automethod:: __eq__ - .. automethod:: num_user_axes - .. 
automethod:: num_target_axes - .. automethod:: vector_size + tags: FrozenSet[Tag] + """A (possibly empty) frozenset of instances of + :class:`pytools.tag.Tag` intended for + consumption by an application. - (supports persistent hashing) + .. versionadded:: 2020.2.2 """ - name: str - dtype: Optional[LoopyType] - shape: Union[ShapeType, Type["auto"], None] - dim_tags: Optional[Sequence[ArrayDimImplementationTag]] - offset: Union[ExpressionT, str, None] - dim_names: Optional[Tuple[str, ...]] - alignment: Optional[int] - tags: FrozenSet[Tag] # Note that order may also wind up in attributes, if the # number of dimensions has not yet been determined. @@ -1299,7 +1320,7 @@ def eval_expr_assert_integer_constant(i, expr): "make_temporaries_for_offsets_and_strides " "during preprocessing.") - subscripts[dim_tag.target_axis] += (stride // vector_size)*idx + subscripts[dim_tag.target_axis] += flatten((stride // vector_size)*idx) elif isinstance(dim_tag, SeparateArrayArrayDimTag): raise AssertionError() diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c4cc880a0..4f1803f24 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -26,6 +26,7 @@ import logging import re from sys import intern +from typing import Any import numpy as np @@ -48,7 +49,7 @@ ) from loopy.symbolic import IdentityMapper, SubArrayRef, WalkMapper from loopy.tools import Optional, intern_frozenset_of_ids -from loopy.translation_unit import for_each_kernel +from loopy.translation_unit import TranslationUnit, for_each_kernel logger = logging.getLogger(__name__) @@ -1069,7 +1070,8 @@ def parse_domains(domains, defines): if isinstance(dom, str): dom, = expand_defines(dom, defines) - if not dom.lstrip().startswith("["): + # pylint warning is spurious + if not dom.lstrip().startswith("["): # pylint: disable=no-member # i.e. 
if no parameters are already given parameters = (_gather_isl_identifiers(dom) - _find_inames_in_set(dom) @@ -1884,7 +1886,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic @for_each_kernel -def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True, +def apply_single_writer_dependency_heuristic(kernel, warn_if_used=True, error_if_used=False): logger.debug("%s: default deps" % kernel.name) @@ -2023,7 +2025,7 @@ class SliceToInameReplacer(IdentityMapper): .. attribute:: subarray_ref_bounds A :class:`list` (one entry for each :class:`SubArrayRef` to be created) - of :class:`dict` instances to store the slices enountered in the + of :class:`dict` instances to store the slices encountered in the expressions as a mapping from ``iname`` to a tuple of ``(start, stop, step)``, which describes the boxy (i.e. affine) constraints imposed on the ``iname`` by the corresponding slice notation its intended to @@ -2536,13 +2538,6 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): assert len(knl.instructions) == len(inames_to_dup) - from loopy import duplicate_inames - from loopy.match import Id - for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup): - for old_iname, new_iname in insn_inames_to_dup: - knl = duplicate_inames(knl, old_iname, - within=Id(insn.id), new_inames=new_iname) - check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) @@ -2563,6 +2558,27 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): knl = add_inferred_inames(knl) from loopy.transform.parameter import fix_parameters knl = fix_parameters(knl, **fixed_parameters) + + # ------------------------------------------------------------------------- + # Ordering dependency: + # ------------------------------------------------------------------------- + # Must duplicate inames after adding all the inames to the instructions. 
+ # To duplicate an iname "i" in statement "S", lp.duplicate requires that + # the statement "S" be nested within the iname "i". + # ------------------------------------------------------------------------- + from loopy import duplicate_inames + from loopy.match import Id + for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup): + for old_iname, new_iname in insn_inames_to_dup: + knl = duplicate_inames(knl, old_iname, + within=Id(insn.id), new_inames=new_iname) + new_insn = knl.id_to_insn[insn.id] + assert old_iname not in ( + new_insn.within_inames + | new_insn.reduction_inames() + | new_insn.sub_array_ref_inames() + ) + # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- @@ -2574,7 +2590,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): knl = guess_arg_shape_if_requested(knl, default_order) knl = apply_default_order_to_args(knl, default_order) knl = resolve_dependencies(knl) - knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=False) + knl = apply_single_writer_dependency_heuristic(knl, warn_if_used=False) # ------------------------------------------------------------------------- # Ordering dependency: @@ -2600,7 +2616,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): # {{{ make_kernel -def make_kernel(*args, **kwargs): +def make_kernel(*args: Any, **kwargs: Any) -> TranslationUnit: tunit = make_function(*args, **kwargs) name, = tunit.callables_table return tunit.with_entrypoints(name) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index bdac071da..d2d80bedf 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -1,5 +1,7 @@ """Data used by the kernel object.""" +from __future__ import annotations + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -38,9 +40,9 @@ Union, cast, ) -from warnings import warn -import numpy as np # noqa +import 
numpy # FIXME: imported as numpy to allow sphinx to resolve things +import numpy as np from immutables import Map from pytools import ImmutableRecord @@ -61,8 +63,8 @@ VarAtomicity, make_assignment, ) -from loopy.types import LoopyType, auto -from loopy.typing import ExpressionT, ShapeType +from loopy.types import LoopyType, ToLoopyTypeConvertible +from loopy.typing import ExpressionT, ShapeType, auto __doc__ = """ @@ -390,12 +392,6 @@ class KernelArgument(ImmutableRecord): def __init__(self, **kwargs): kwargs["name"] = intern(kwargs.pop("name")) - target = kwargs.pop("target", None) - if target is not None: - warn("Passing 'target' is deprecated and will stop working in 2023. " - "It is already being ignored.", - DeprecationWarning, stacklevel=2) - dtype = kwargs.pop("dtype", None) for_atomic = kwargs.pop("for_atomic", False) @@ -521,7 +517,7 @@ def supporting_names(self) -> FrozenSet[str]: # Making this a function prevents incorrect use in isinstance. # Note: This is *not* deprecated, as it is super-common and # incrementally more convenient to use than ArrayArg directly. -def GlobalArg(*args, **kwargs): # noqa: N802 +def GlobalArg(*args, **kwargs) -> ArrayArg: # noqa: N802 address_space = kwargs.pop("address_space", None) if address_space is not None: raise TypeError("may not pass 'address_space' to GlobalArg") @@ -579,18 +575,15 @@ def supporting_names(self) -> FrozenSet[str]: ) -""" - :attribute tags: A (possibly empty) frozenset of instances of - :class:`pytools.tag.Tag` intended for consumption by an - application. 
- - ..versionadded: 2020.2.2 -""" - - class ValueArg(KernelArgument, Taggable): - def __init__(self, name, dtype=None, approximately=1000, target=None, - is_output=False, is_input=True, tags=None): + def __init__(self, + name: str, + dtype: ToLoopyTypeConvertible | None = None, + approximately: int = 1000, + is_output: bool = False, + is_input: bool = True, + tags: frozenset[Tag] | None = None, + ) -> None: """ :arg tags: A an instance of or Iterable of instances of :class:`pytools.tag.Tag` intended for consumption by an @@ -603,7 +596,6 @@ def __init__(self, name, dtype=None, approximately=1000, target=None, KernelArgument.__init__(self, name=name, dtype=dtype, approximately=approximately, - target=target, is_output=is_output, is_input=is_input, tags=tags) @@ -641,48 +633,42 @@ def get_arg_decl(self, ast_builder): class TemporaryVariable(ArrayBase): __doc__ = cast(str, ArrayBase.__doc__) + """ - .. attribute:: storage_shape - .. attribute:: base_indices - .. attribute:: address_space - - What memory this temporary variable lives in. - One of the values in :class:`AddressSpace`, - or :class:`loopy.auto` if this is - to be automatically determined. - - .. attribute:: base_storage - - The name of a storage array that is to be used to actually - hold the data in this temporary, or *None*. If not *None* or the name - of an existing variable, a variable of this name and appropriate size - will be created. - - .. attribute:: initializer - - *None* or a :class:`numpy.ndarray` of data to be used to initialize the - array. - - .. attribute:: read_only - - A :class:`bool` indicating whether the variable may be written during - its lifetime. If *True*, *initializer* must be given. - - .. attribute:: _base_storage_access_may_be_aliasing - - Whether the temporary is used to alias the underlying base storage. - Defaults to *False*. If *False*, C-based code generators will declare - the temporary as a ``restrict`` const pointer to the base storage - memory location. 
If *True*, the restrict part is omitted on this - declaration. + .. autoattribute:: storage_shape + .. autoattribute:: base_indices + .. autoattribute:: address_space + .. autoattribute:: base_storage + .. autoattribute:: initializer + .. autoattribute:: read_only + .. autoattribute:: _base_storage_access_may_be_aliasing """ storage_shape: Optional[ShapeType] base_indices: Optional[Tuple[ExpressionT, ...]] address_space: Union[AddressSpace, Type[auto]] base_storage: Optional[str] - initializer: Optional[np.ndarray] + """The name of a storage array that is to be used to actually + hold the data in this temporary, or *None*. If not *None* or the name + of an existing variable, a variable of this name and appropriate size + will be created. + """ + + initializer: Optional[numpy.ndarray] + """*None* or a :class:`numpy.ndarray` of data to be used to initialize the + array. + """ + read_only: bool + """A :class:`bool` indicating whether the variable may be written during + its lifetime. If *True*, *initializer* must be given. + """ + _base_storage_access_may_be_aliasing: bool + """Whether the temporary is used to alias the underlying base storage. + Defaults to *False*. If *False*, C-based code generators will declare + the temporary as a ``restrict`` const pointer to the base storage + memory location. If *True*, the restrict part is omitted on this + declaration. 
+ """ min_target_axes: ClassVar[int] = 0 max_target_axes: ClassVar[int] = 1 @@ -697,11 +683,28 @@ class TemporaryVariable(ArrayBase): "_base_storage_access_may_be_aliasing", ) - def __init__(self, name, dtype=None, shape=auto, address_space=None, - dim_tags=None, offset=0, dim_names=None, strides=None, order=None, - base_indices=None, storage_shape=None, - base_storage=None, initializer=None, read_only=False, - _base_storage_access_may_be_aliasing=False, **kwargs): + def __init__( + self, + name: str, + dtype: ToLoopyTypeConvertible = None, + shape: Union[ShapeType, Type["auto"], None] = auto, + address_space: Union[AddressSpace, Type[auto], None] = None, + dim_tags: Optional[Sequence[ArrayDimImplementationTag]] = None, + offset: Union[ExpressionT, str, None] = 0, + dim_names: Optional[Tuple[str, ...]] = None, + strides: Optional[Tuple[ExpressionT, ...]] = None, + order: str | None = None, + + base_indices: Optional[Tuple[ExpressionT, ...]] = None, + storage_shape: ShapeType | None = None, + + base_storage: Optional[str] = None, + initializer: Optional[np.ndarray] = None, + read_only: bool = False, + + _base_storage_access_may_be_aliasing: bool = False, + **kwargs: Any + ) -> None: """ :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype` :arg shape: :class:`loopy.auto` or a shape tuple @@ -711,12 +714,6 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None, if address_space is None: address_space = auto - if address_space is None: - raise LoopyError( - "temporary variable '%s': " - "address_space must not be None" - % name) - if initializer is None: pass elif isinstance(initializer, np.ndarray): @@ -751,7 +748,12 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None, if order is None: order = "C" - if base_indices is None and shape is not auto: + if shape is not None: + from loopy.kernel.array import _parse_shape_or_strides + shape = _parse_shape_or_strides(shape) + + if base_indices is None and shape is not auto and shape is 
not None: + assert isinstance(shape, tuple) base_indices = (0,) * len(shape) if not read_only and initializer is not None: @@ -790,7 +792,7 @@ def __init__(self, name, dtype=None, shape=auto, address_space=None, _base_storage_access_may_be_aliasing), **kwargs) - def copy(self, **kwargs): + def copy(self, **kwargs: Any) -> TemporaryVariable: address_space = kwargs.pop("address_space", None) if address_space is not None: @@ -799,15 +801,23 @@ def copy(self, **kwargs): return super().copy(**kwargs) @property - def nbytes(self): - shape = self.shape + def nbytes(self) -> ExpressionT: if self.storage_shape is not None: shape = self.storage_shape + else: + if self.shape is None: + raise ValueError("shape is None") + if self.shape is auto: + raise ValueError("shape is auto") + shape = cast(Tuple[ExpressionT], self.shape) + + if self.dtype is None: + raise ValueError("data type is indeterminate") from pytools import product return product(si for si in shape)*self.dtype.itemsize - def __str__(self): + def __str__(self) -> str: if self.address_space is auto: aspace_str = "auto" else: @@ -871,35 +881,27 @@ def supporting_names(self) -> FrozenSet[str]: # {{{ substitution rule -class SubstitutionRule(ImmutableRecord): +@dataclass(frozen=True) +class SubstitutionRule: """ - .. attribute:: name - .. attribute:: arguments - - A tuple of strings - - .. attribute:: expression + .. autoattribute:: name + .. autoattribute:: arguments + .. 
autoattribute:: expression """ - def __init__(self, name, arguments, expression): - assert isinstance(arguments, tuple) - - ImmutableRecord.__init__(self, - name=name, arguments=arguments, expression=expression) + name: str + arguments: Sequence[str] + expression: ExpressionT - def __str__(self): - return "{}({}) := {}".format( - self.name, ", ".join(self.arguments), self.expression) + def copy(self, **kwargs: Any) -> SubstitutionRule: + return replace(self, **kwargs) def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - """ - key_builder.rec(key_hash, self.name) key_builder.rec(key_hash, self.arguments) key_builder.update_for_pymbolic_expression(key_hash, self.expression) + # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e3fcf108a..c96cd0fbb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -23,7 +23,8 @@ THE SOFTWARE. """ -from typing import TYPE_CHECKING, ClassVar, FrozenSet, Tuple +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Callable, ClassVar, FrozenSet, Tuple, TypeVar from pytools import ImmutableRecord @@ -36,6 +37,8 @@ if TYPE_CHECKING: + from typing_extensions import Self + from loopy.translation_unit import CallablesTable, FunctionIdT __doc__ = """ @@ -57,7 +60,23 @@ # {{{ argument descriptors -class ValueArgDescriptor(ImmutableRecord): +ArgDescriptorT = TypeVar("ArgDescriptorT", bound="ArgDescriptor") + + +class ArgDescriptor(ABC, ImmutableRecord): + @abstractmethod + def map_expr( + self, + subst_mapper: Callable[[ArgDescriptorT], ArgDescriptorT] + ) -> Self: + ... + + @abstractmethod + def depends_on(self) -> frozenset[str]: + ... 
+ + +class ValueArgDescriptor(ArgDescriptor): hash_fields = () def map_expr(self, subst_mapper): @@ -69,7 +88,7 @@ def depends_on(self): update_persistent_hash = update_persistent_hash -class ArrayArgDescriptor(ImmutableRecord): +class ArrayArgDescriptor(ArgDescriptor): """ Records information about an array argument to an in-kernel callable. To be passed to and returned from @@ -672,7 +691,7 @@ def is_type_specialized(self): class CallableKernel(InKernelCallable): """ - Records informations about a callee kernel. Also provides interface through + Records information about a callee kernel. Also provides interface through member methods to make the callee kernel compatible to be called from a caller kernel. diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 198b7c03f..a6420b8fc 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -20,10 +20,14 @@ THE SOFTWARE. """ -from collections.abc import Set as abc_Set +from collections.abc import ( + Mapping as MappingABC, + Set as abc_Set, +) +from dataclasses import dataclass from functools import cached_property from sys import intern -from typing import FrozenSet +from typing import Any, FrozenSet, Mapping, Optional, Sequence, Tuple, Type, Union from warnings import warn import islpy as isl @@ -31,7 +35,9 @@ from pytools.tag import Tag, Taggable, tag_dataclass from loopy.diagnostic import LoopyError -from loopy.tools import Optional +from loopy.tools import Optional as LoopyOptional +from loopy.types import LoopyType +from loopy.typing import ExpressionT, InameStr # {{{ instruction tags @@ -77,6 +83,44 @@ class UseStreamingStoreTag(Tag): # }}} +# {{{ HappensAfter + +@dataclass(frozen=True) +class HappensAfter: + """A class representing a "happens-after" relationship between two + statements found in a :class:`loopy.LoopKernel`. Used to validate that a + given kernel transformation respects the data dependencies in a given + program. + + .. 
attribute:: variable_name + + The name of the variable responsible for the dependency. For + backward compatibility purposes, this may be *None*. In this case, the + dependency semantics revert to the deprecated, statement-level + dependencies of prior versions of :mod:`loopy`. + + .. attribute:: instances_rel + + An :class:`islpy.Map` representing the precise happens-after + relationship. The domain and range are sets of statement instances. The + instances in the domain are required to execute before the instances in + the range. + + Map dimensions are named according to the order of appearance of the + inames in a :mod:`loopy` program. The dimension names in the range are + appended with a prime to signify that the mapped instances are distinct. + + As a (deprecated) matter of backward compatibility, this may be *None*, + in which case the semantics revert to the (underspecified) + statement-level dependencies of prior versions of :mod:`loopy`. + """ + + variable_name: Optional[str] + instances_rel: Optional[isl.Map] + +# }}} + + # {{{ instructions: base class class InstructionBase(ImmutableRecord, Taggable): @@ -200,10 +244,20 @@ class InstructionBase(ImmutableRecord, Taggable): Inherits from :class:`pytools.tag.Taggable`. """ + id: Optional[str] + happens_after: Mapping[str, HappensAfter] + depends_on_is_final: bool + groups: FrozenSet[str] + conflicts_with_groups: FrozenSet[str] + no_sync_with: FrozenSet[Tuple[str, str]] + predicates: FrozenSet[ExpressionT] + within_inames: FrozenSet[InameStr] + within_inames_is_final: bool + priority: int # within_inames_is_final is deprecated and will be removed in version 2017.x. - fields = set("id depends_on depends_on_is_final " + fields = set("id depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " "predicates " @@ -216,12 +270,23 @@ class InstructionBase(ImmutableRecord, Taggable): # Names of fields that are sets of pymbolic expressions. 
Needed for key building pymbolic_set_fields = {"predicates"} - def __init__(self, id, depends_on, depends_on_is_final, - groups, conflicts_with_groups, - no_sync_with, - within_inames_is_final, within_inames, - priority, - predicates, tags): + def __init__(self, + id: Optional[str], + happens_after: Union[ + Mapping[str, HappensAfter], FrozenSet[str], str, None], + depends_on_is_final: Optional[bool], + groups: Optional[FrozenSet[str]], + conflicts_with_groups: Optional[FrozenSet[str]], + no_sync_with: Optional[FrozenSet[Tuple[str, str]]], + within_inames_is_final: Optional[bool], + within_inames: Optional[FrozenSet[str]], + priority: Optional[int], + predicates: Optional[FrozenSet[str]], + tags: Optional[FrozenSet[Tag]], + *, + depends_on: Union[FrozenSet[str], str, None] = None, + ) -> None: + from immutabledict import immutabledict if predicates is None: predicates = frozenset() @@ -237,8 +302,50 @@ def __init__(self, id, depends_on, depends_on_is_final, predicates = frozenset(new_predicates) del new_predicates - if depends_on is None: - depends_on = frozenset() + # {{{ process happens_after/depends_on + + if happens_after is not None and depends_on is not None: + raise TypeError("may not pass both happens_after and depends_on") + elif depends_on is not None: + # FIXME Enable once we realistically check detailed dependencies. + # warn("depends_on is deprecated and will stop working in 2026. " + # "Pass happens_after instead.", DeprecationWarning, stacklevel=2) + happens_after = depends_on + + del depends_on + + if depends_on_is_final and happens_after is None: + raise LoopyError("Setting depends_on_is_final to True requires " + "actually specifying happens_after/depends_on") + + if isinstance(happens_after, immutabledict): + pass + elif happens_after is None: + happens_after = immutabledict() + elif isinstance(happens_after, str): + warn("Passing a string for happens_after/depends_on is deprecated and " + "will stop working in 2025. 
Instead, pass a full-fledged " + "happens_after data structure.", DeprecationWarning, stacklevel=2) + + happens_after = immutabledict({ + after_id.strip(): HappensAfter( + variable_name=None, + instances_rel=None) + for after_id in happens_after.split(",") + if after_id.strip()}) + elif isinstance(happens_after, frozenset): + happens_after = immutabledict({ + after_id: HappensAfter( + variable_name=None, + instances_rel=None) + for after_id in happens_after}) + elif isinstance(happens_after, dict): + happens_after = immutabledict(happens_after) + else: + raise TypeError("'happens_after' has unexpected type: " + f"{type(happens_after)}") + + # }}} if groups is None: groups = frozenset() @@ -255,16 +362,12 @@ def __init__(self, id, depends_on, depends_on_is_final, if within_inames_is_final is None: within_inames_is_final = False - if isinstance(depends_on, str): - depends_on = frozenset( - s.strip() for s in depends_on.split(",") if s.strip()) - if depends_on_is_final is None: depends_on_is_final = False - if depends_on_is_final and not isinstance(depends_on, abc_Set): + if depends_on_is_final and not isinstance(happens_after, MappingABC): raise LoopyError("Setting depends_on_is_final to True requires " - "actually specifying depends_on") + "actually specifying happens_after/depends_on") if tags is None: tags = frozenset() @@ -288,13 +391,16 @@ def __init__(self, id, depends_on, depends_on_is_final, # assert all(is_interned(pred) for pred in predicates) assert isinstance(within_inames, abc_Set) - assert isinstance(depends_on, abc_Set) or depends_on is None + assert isinstance(happens_after, MappingABC) or happens_after is None assert isinstance(groups, abc_Set) assert isinstance(conflicts_with_groups, abc_Set) + from loopy.tools import is_hashable + assert is_hashable(happens_after) + ImmutableRecord.__init__(self, id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, no_sync_with=no_sync_with, groups=groups, 
conflicts_with_groups=conflicts_with_groups, @@ -307,6 +413,22 @@ def __init__(self, id, depends_on, depends_on_is_final, # The Taggable constructor call does extra validation. tags=tags) + def get_copy_kwargs(self, **kwargs): + passed_depends_on = "depends_on" in kwargs + + if passed_depends_on: + assert "happens_after" not in kwargs + + kwargs = super().get_copy_kwargs(**kwargs) + + if passed_depends_on: + # FIXME Enable once we realistically check detailed dependencies. + # warn("depends_on is deprecated and will stop working in 2026. " + # "Instead, use happens_after.", DeprecationWarning, stacklevel=2) + del kwargs["happens_after"] + + return kwargs + # {{{ abstract interface def read_dependency_names(self): @@ -348,6 +470,13 @@ def with_transformed_expressions(self, f, assignee_f=None): # }}} + @property + def depends_on(self): + # FIXME Enable once we realistically check detailed dependencies. + # warn("depends_on is deprecated and will stop working in 2026. " + # "Use happens_after instead.", DeprecationWarning, stacklevel=2) + return frozenset(self.happens_after) + @property def assignee_name(self): """A convenience wrapper around :meth:`assignee_var_names` @@ -452,17 +581,24 @@ def update_persistent_hash(self, key_hash, key_builder): def __setstate__(self, val): super().__setstate__(val) + from immutabledict import immutabledict + from loopy.tools import intern_frozenset_of_ids if self.id is not None: # pylint:disable=access-member-before-definition self.id = intern(self.id) - self.depends_on = intern_frozenset_of_ids(self.depends_on) + self.happens_after = immutabledict({ + intern(after_id): ha + for after_id, ha in self.happens_after.items()}) self.groups = intern_frozenset_of_ids(self.groups) self.conflicts_with_groups = ( intern_frozenset_of_ids(self.conflicts_with_groups)) self.within_inames = ( intern_frozenset_of_ids(self.within_inames)) + def _with_new_tags(self, tags: FrozenSet[Tag]): + return self.copy(tags=tags) + # }}} @@ -790,30 +926,44 @@ 
class Assignment(MultiAssignmentBase): .. automethod:: __init__ """ + assignee: ExpressionT + expression: ExpressionT + temp_var_type: LoopyOptional + atomicity: Tuple[VarAtomicity, ...] + fields = MultiAssignmentBase.fields | \ set("assignee temp_var_type atomicity".split()) pymbolic_fields = MultiAssignmentBase.pymbolic_fields | {"assignee"} def __init__(self, - assignee, expression, - id=None, - depends_on=None, - depends_on_is_final=None, - groups=None, - conflicts_with_groups=None, - no_sync_with=None, - within_inames_is_final=None, - within_inames=None, - tags=None, - temp_var_type=_not_provided, atomicity=(), - priority=0, predicates=frozenset()): + assignee: Union[str, ExpressionT], + expression: Union[str, ExpressionT], + id: Optional[str] = None, + happens_after: Union[ + Mapping[str, HappensAfter], FrozenSet[str], str, None] = None, + depends_on_is_final: Optional[bool] = None, + groups: Optional[FrozenSet[str]] = None, + conflicts_with_groups: Optional[FrozenSet[str]] = None, + no_sync_with: Optional[FrozenSet[Tuple[str, str]]] = None, + within_inames_is_final: Optional[bool] = None, + within_inames: Optional[FrozenSet[str]] = None, + priority: Optional[int] = None, + predicates: Optional[FrozenSet[str]] = None, + tags: Optional[FrozenSet[Tag]] = None, + temp_var_type: Union[ + Type[_not_provided], None, LoopyOptional, + LoopyType] = _not_provided, + atomicity: Tuple[VarAtomicity, ...] 
= (), + *, + depends_on: Union[FrozenSet[str], str, None] = None, + ) -> None: if temp_var_type is _not_provided: - temp_var_type = Optional() + temp_var_type = LoopyOptional() super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -822,7 +972,8 @@ def __init__(self, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + depends_on=depends_on) from loopy.symbolic import parse if isinstance(assignee, str): @@ -934,7 +1085,7 @@ class CallInstruction(MultiAssignmentBase): A tuple of `:class:loopy.Optional`. If an entry is not empty, it contains the type that will be assigned to the new temporary variable - created from the assigment. + created from the assignment. .. automethod:: __init__ """ @@ -946,7 +1097,7 @@ class CallInstruction(MultiAssignmentBase): def __init__(self, assignees, expression, id=None, - depends_on=None, + happens_after=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, @@ -955,11 +1106,12 @@ def __init__(self, within_inames=None, tags=None, temp_var_types=None, - priority=0, predicates=frozenset()): + priority=0, predicates=frozenset(), + depends_on=None): super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -968,7 +1120,8 @@ def __init__(self, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + depends_on=depends_on) from pymbolic.primitives import Call @@ -1001,7 +1154,7 @@ def __init__(self, self.expression = expression if temp_var_types is None: - self.temp_var_types = (Optional(),) * len(self.assignees) + self.temp_var_types = (LoopyOptional(),) * len(self.assignees) else: self.temp_var_types = tuple( _check_and_fix_temp_var_type(tvt, stacklevel=3) @@ -1099,7 +1252,7 @@ def 
is_array_call(assignees, expression): Returns *True* is the instruction is an array call. An array call is a function call applied to array type objects. If any of - the arguemnts or assignees to the function is an array, + the arguments or assignees to the function is an array, :meth:`is_array_call` will return *True*. """ from pymbolic.primitives import Call, Subscript @@ -1143,10 +1296,16 @@ def modify_assignee_for_array_call(assignee): "SubArrayRef as its inputs") -def make_assignment(assignees, expression, temp_var_types=None, **kwargs): +def make_assignment(assignees: tuple[ExpressionT, ...], + expression: ExpressionT, + temp_var_types: ( + Sequence[LoopyType | None] | None) = None, + **kwargs: Any) -> Assignment | CallInstruction: - if temp_var_types is None: - temp_var_types = (Optional(),) * len(assignees) + if temp_var_types is not None: + tv_types: Sequence[LoopyType | LoopyOptional | None] = temp_var_types + else: + tv_types = (LoopyOptional(),) * len(assignees) if len(assignees) != 1 or is_array_call(assignees, expression): atomicity = kwargs.pop("atomicity", ()) @@ -1176,7 +1335,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): assignees=tuple(modify_assignee_for_array_call( assignee) for assignee in assignees), expression=expression, - temp_var_types=temp_var_types, + temp_var_types=tuple(tv_types), **kwargs) else: def _is_array(expr): @@ -1196,10 +1355,13 @@ def _is_array(expr): raise LoopyError("Array calls only supported as instructions" " with function call as RHS for now.") + assignee, = assignees + tv_type, = tv_types + return Assignment( - assignee=assignees[0], + assignee=assignee, expression=expression, - temp_var_type=temp_var_types[0], + temp_var_type=tv_type, **kwargs) @@ -1248,12 +1410,13 @@ class CInstruction(InstructionBase): def __init__(self, iname_exprs, code, read_variables=frozenset(), assignees=(), - id=None, depends_on=None, depends_on_is_final=None, + id=None, happens_after=None, 
depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=0, - predicates=frozenset(), tags=None): + predicates=frozenset(), tags=None, + depends_on=None): """ :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, simple strings pepresenting inames are also allowed. A single @@ -1266,13 +1429,14 @@ def __init__(self, InstructionBase.__init__(self, id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, - priority=priority, predicates=predicates, tags=tags) + priority=priority, predicates=predicates, tags=tags, + depends_on=depends_on) # {{{ normalize iname_exprs @@ -1416,15 +1580,15 @@ class NoOpInstruction(_DataObliviousInstruction): ... nop """ - def __init__(self, id=None, depends_on=None, depends_on_is_final=None, + def __init__(self, id=None, happens_after=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, - predicates=None, tags=None): + predicates=None, tags=None, depends_on=None): super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -1433,7 +1597,8 @@ def __init__(self, id=None, depends_on=None, depends_on_is_final=None, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags) + tags=tags, + depends_on=depends_on) def __str__(self): first_line = "%s: ... nop" % self.id @@ -1460,7 +1625,7 @@ class BarrierInstruction(_DataObliviousInstruction): .. attribute:: mem_kind A string, ``"global"`` or ``"local"``. Chooses which memory type to - sychronize, for targets that require this (e.g. 
OpenCL) + synchronize, for targets that require this (e.g. OpenCL) The textual syntax in a :mod:`loopy` kernel is:: @@ -1475,20 +1640,21 @@ class BarrierInstruction(_DataObliviousInstruction): fields = _DataObliviousInstruction.fields | {"synchronization_kind", "mem_kind"} - def __init__(self, id, depends_on=None, depends_on_is_final=None, + def __init__(self, id, happens_after=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, predicates=None, tags=None, synchronization_kind="global", - mem_kind="local"): + mem_kind="local", + depends_on=None): if predicates: raise LoopyError("conditional barriers are not supported") super().__init__( id=id, - depends_on=depends_on, + happens_after=happens_after, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, @@ -1497,8 +1663,8 @@ def __init__(self, id, depends_on=None, depends_on_is_final=None, within_inames=within_inames, priority=priority, predicates=predicates, - tags=tags - ) + tags=tags, + depends_on=depends_on) self.synchronization_kind = synchronization_kind self.mem_kind = mem_kind diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 5ed9b2ad3..9a14aedd5 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -34,6 +34,7 @@ import islpy as isl from islpy import dim_type from pytools import memoize_on_first_arg, natsorted +from pytools.tag import Tag from loopy.diagnostic import LoopyError, warn_with_kernel from loopy.kernel import LoopKernel @@ -44,7 +45,8 @@ _DataObliviousInstruction, ) from loopy.symbolic import CombineMapper -from loopy.translation_unit import TranslationUnit, for_each_kernel +from loopy.translation_unit import TranslationUnit, TUnitOrKernelT, for_each_kernel +from loopy.types import ToLoopyTypeConvertible logger = logging.getLogger(__name__) @@ -52,15 +54,20 @@ # {{{ add and infer argument dtypes -def 
add_dtypes(prog_or_kernel, dtype_dict): +def add_dtypes( + kernel: TUnitOrKernelT, + dtype_dict: Mapping[str, ToLoopyTypeConvertible], + ) -> TUnitOrKernelT: """Specify remaining unspecified argument/temporary variable types. :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - if isinstance(prog_or_kernel, TranslationUnit): + if isinstance(kernel, TranslationUnit): + t_unit = kernel + del kernel kernel_names = [clbl.subkernel.name for clbl in - prog_or_kernel.callables_table.values() if isinstance(clbl, + t_unit.callables_table.values() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: raise LoopyError("add_dtypes may not take a TranslationUnit with more" @@ -69,10 +76,10 @@ def add_dtypes(prog_or_kernel, dtype_dict): kernel_name, = kernel_names - return prog_or_kernel.with_kernel( - add_dtypes(prog_or_kernel[kernel_name], dtype_dict)) + return t_unit.with_kernel( + add_dtypes(t_unit[kernel_name], dtype_dict)) - assert isinstance(prog_or_kernel, LoopKernel) + assert isinstance(kernel, LoopKernel) processed_dtype_dict = {} @@ -83,13 +90,13 @@ def add_dtypes(prog_or_kernel, dtype_dict): processed_dtype_dict[subkey] = v dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( - prog_or_kernel, processed_dtype_dict) + kernel, processed_dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) - return prog_or_kernel.copy(args=new_args, temporary_variables=new_temp_vars) + return kernel.copy(args=new_args, temporary_variables=new_temp_vars) def _add_dtypes_overdetermined(kernel, dtype_dict): @@ -263,7 +270,7 @@ def find_all_insn_inames(kernel): if insn.within_inames_is_final: continue - # {{{ depdency-based propagation + # {{{ dependency-based propagation inames_old = insn_id_to_inames[insn.id] inames_new = inames_old | guess_iname_deps_based_on_var_use( @@ -513,8 +520,8 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, """ # make 
sure all automatically added stuff shows up - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=False) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + kernel = apply_single_writer_dependency_heuristic(kernel, warn_if_used=False) if iname_cluster and not kernel.linearization: try: @@ -1252,9 +1259,9 @@ def find_recursive_dependencies(kernel, insn_ids): for insn_id in queue: insn = kernel.id_to_insn[insn_id] - additionals = insn.depends_on - result - result.update(additionals) - new_queue.extend(additionals) + additional = insn.depends_on - result + result.update(additional) + new_queue.extend(additional) queue = new_queue @@ -1477,7 +1484,7 @@ def conform_to_uniform_length(s): # {{{ stringify_instruction_list -def stringify_instruction_tag(tag): +def stringify_instruction_tag(tag: Tag) -> str: from loopy.kernel.instruction import LegacyStringInstructionTag if isinstance(tag, LegacyStringInstructionTag): return f"S({tag.value})" @@ -1485,7 +1492,7 @@ def stringify_instruction_tag(tag): return str(tag) -def stringify_instruction_list(kernel): +def stringify_instruction_list(kernel: LoopKernel) -> list[str]: # {{{ topological sort printed_insn_ids = set() @@ -1519,7 +1526,7 @@ def insert_insn_into_order(insn): leader = " " * uniform_arrow_length lines = [] - current_inames = [set()] + current_inames: list[set[str]] = [set()] if uniform_arrow_length: indent_level = [1] @@ -1530,13 +1537,13 @@ def insert_insn_into_order(insn): iname_order = kernel._get_iname_order_for_printing() - def add_pre_line(s): + def add_pre_line(s: str) -> None: lines.append(leader + " " * indent_level[0] + s) - def add_main_line(s): + def add_main_line(s: str) -> None: lines.append(arrows + " " * indent_level[0] + s) - def add_post_line(s): + def add_post_line(s: str) -> None: lines.append(extender + " " * indent_level[0] + s) def adapt_to_new_inames_list(new_inames): 
@@ -1735,7 +1742,7 @@ def get_global_barrier_order(kernel): @memoize_on_first_arg def find_most_recent_global_barrier(kernel, insn_id): - """Return the id of the latest occuring global barrier which the + """Return the id of the latest occurring global barrier which the given instruction (indirectly or directly) depends on, or *None* if this instruction does not depend on a global barrier. @@ -1995,7 +2002,7 @@ def infer_args_are_input_output(kernel): elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)): pass else: - raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + raise NotImplementedError("Unknown argument type %s." % type(arg)) if not (arg.is_input or arg.is_output): raise LoopyError("Kernel argument must be either input or output." diff --git a/loopy/match.py b/loopy/match.py index 889f4e74f..5e409791b 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -1,4 +1,4 @@ -"""Matching functionality for instruction ids and subsitution +"""Matching functionality for instruction ids and substitution rule invocations stacks.""" diff --git a/loopy/options.py b/loopy/options.py index 9c4fa0fb4..293670774 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -23,6 +23,7 @@ import os import re +from typing import Any from warnings import warn from pytools import ImmutableRecord @@ -118,7 +119,7 @@ class Options(ImmutableRecord): .. attribute:: cl_exec_manage_array_events - Within the PyOpenCL executor, respect and udpate + Within the PyOpenCL executor, respect and update :attr:`pyopencl.array.Array.events`. Defaults to *True*. @@ -156,7 +157,7 @@ class Options(ImmutableRecord): Allow re-ordering of floating point arithmetic. Re-ordering may give different results as floating point arithmetic is not - associative in addition and mulitplication. Default is *True*. + associative in addition and multiplication. Default is *True*. Note that the implementation of this option is currently incomplete. .. 
attribute:: build_options @@ -214,7 +215,7 @@ def __init__( # All defaults are further required to be False when cast to bool # for the update() functionality to work. - self, **kwargs): + self, **kwargs: Any) -> None: kwargs = _apply_legacy_map(self._legacy_options_map, kwargs) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d24e14cc2..3293e9a1e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -49,7 +49,11 @@ auto, filter_iname_tags_by_type, ) -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.function_interface import ( + ArgDescriptor, + CallableKernel, + ScalarCallable, +) # from loopy.transform.iname import remove_any_newly_unused_inames from loopy.kernel.instruction import ( @@ -655,7 +659,7 @@ def traverse_to_infer_arg_descr(kernel, callables_table): return descr_inferred_kernel, arg_descr_inf_mapper.clbl_inf_ctx -def infer_arg_descr(program): +def infer_arg_descr(t_unit: TranslationUnit) -> TranslationUnit: """ Returns a copy of *program* with the :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the @@ -666,12 +670,12 @@ def infer_arg_descr(program): from loopy.kernel.function_interface import ArrayArgDescriptor, ValueArgDescriptor from loopy.translation_unit import make_clbl_inf_ctx, resolve_callables - program = resolve_callables(program) + t_unit = resolve_callables(t_unit) - clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, - program.entrypoints) + clbl_inf_ctx = make_clbl_inf_ctx(t_unit.callables_table, + t_unit.entrypoints) - for e in program.entrypoints: + for e in t_unit.entrypoints: def _tuple_or_none(s): if isinstance(s, tuple): return s @@ -680,8 +684,8 @@ def _tuple_or_none(s): else: return s, - arg_id_to_descr = {} - for arg in program[e].args: + arg_id_to_descr: dict[str, ArgDescriptor] = {} + for arg in t_unit[e].args: if isinstance(arg, ArrayBase): if arg.shape not in (None, auto): arg_id_to_descr[arg.name] = ArrayArgDescriptor( @@ -691,12 +695,12 @@ 
def _tuple_or_none(s): arg_id_to_descr[arg.name] = ValueArgDescriptor() else: raise NotImplementedError() - new_callable, clbl_inf_ctx = program.callables_table[e].with_descrs( + new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_descrs( arg_id_to_descr, clbl_inf_ctx) clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, is_entrypoint=True) - return clbl_inf_ctx.finish_program(program) + return clbl_inf_ctx.finish_program(t_unit) # }}} @@ -825,8 +829,8 @@ def preprocess_program(t_unit: TranslationUnit) -> TranslationUnit: from loopy.transform.subst import expand_subst t_unit = expand_subst(t_unit) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - t_unit = apply_single_writer_depencency_heuristic(t_unit) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + t_unit = apply_single_writer_dependency_heuristic(t_unit) # Ordering restrictions: # diff --git a/loopy/py.typed b/loopy/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index ca45521e3..1364be850 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1,5 +1,7 @@ from __future__ import annotations +from loopy.typing import not_none + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -51,11 +53,16 @@ from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel from loopy.kernel.instruction import InstructionBase from loopy.tools import LoopyKeyBuilder, caches +from loopy.typing import InameStr from loopy.version import DATA_MODEL_VERSION if TYPE_CHECKING: from loopy.kernel import LoopKernel + from loopy.schedule.tools import ( + InameStrSet, + LoopTree, + ) from loopy.translation_unit import CallablesTable, TranslationUnit @@ -63,12 +70,11 @@ __doc__ = """ -.. currentmodule:: loopy.schedule - .. autoclass:: ScheduleItem .. autoclass:: BeginBlockItem .. autoclass:: EndBlockItem .. autoclass:: CallKernel +.. 
autoclass:: ReturnFromKernel .. autoclass:: Barrier .. autoclass:: RunInstruction @@ -713,12 +719,13 @@ def get_insns_in_topologically_sorted_order( from pytools.graph import compute_topological_order rev_dep_map: Dict[str, Set[str]] = { - insn.id: set() for insn in kernel.instructions} + not_none(insn.id): set() for insn in kernel.instructions} for insn in kernel.instructions: for dep in insn.depends_on: + assert insn.id is not None rev_dep_map[dep].add(insn.id) - # For breaking ties, we compare the features of an intruction + # For breaking ties, we compare the features of an instruction # so that instructions with the same set of features are lumped # together. This helps in :method:`schedule_as_many_run_insns_as_possible` # which bails after 5 insns that don't have the same feature. @@ -877,7 +884,170 @@ def is_similar_to_template(insn): # }}} -# {{{ scheduling algorithm +# {{{ scheduling algorithm v2 + +def _get_outermost_diverging_inames( + tree: LoopTree, + within1: InameStrSet, + within2: InameStrSet + ) -> Tuple[InameStr, InameStr]: + """ + For loop nestings *within1* and *within2*, returns the first inames at which + the loop nests diverge in the loop nesting tree *tree*.
+ """ + common_ancestors = (within1 & within2) | {""} + + innermost_parent = max(common_ancestors, + key=lambda k: tree.depth(k)) + iname1, = frozenset(tree.children(innermost_parent)) & within1 + iname2, = frozenset(tree.children(innermost_parent)) & within2 + + return iname1, iname2 + + +def _generate_loop_schedules_v2(kernel: LoopKernel) -> Sequence[ScheduleItem]: + from functools import reduce + + from pytools.graph import compute_topological_order + + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.schedule.tools import get_loop_tree + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + parallel_inames = (concurrent_inames - ilp_inames - vec_inames) + + # {{{ can v2 scheduler handle the kernel? + + from loopy.schedule.tools import V2SchedulerNotImplementedError + if any(insn.conflicts_with_groups for insn in kernel.instructions): + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " kernels with instruction having conflicts with groups.") + + if any(insn.priority != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " kernels with instruction priorities set.") + + if kernel.schedule is not None: + # cannot handle preschedule yet + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " prescheduled kernels.") + + if ilp_inames or vec_inames: + raise V2SchedulerNotImplementedError("v2 scheduler cannot schedule" + " loops tagged with 'ilp'/'vec' as they are not guaranteed to" + " be single entry loops.") + + # }}} + + loop_tree = get_loop_tree(kernel) + + # loop_inames: inames that are realized as loops. 
Concurrent inames aren't + # realized as a loop in the generated code for a loopy.TargetBase. + + # FIXME: These three could be one statement if it weren't for + # - https://github.com/python/mypy/issues/17693 + # - https://github.com/python/mypy/issues/17694 + emptyset: frozenset[InameStr] = frozenset() + all_inames = reduce( + frozenset.union, + (insn.within_inames for insn in kernel.instructions), + emptyset) + loop_inames = all_inames - parallel_inames + + # The idea here is to build a DAG, where nodes are schedule items and if + # there exists an edge from schedule item A to schedule item B in the DAG => + # B *must* come after A in the linearized result. + + dag: dict[ScheduleItem, frozenset[ScheduleItem]] = {} + + # LeaveLoop(i) *must* follow EnterLoop(i) + dag.update({EnterLoop(iname=iname): frozenset({LeaveLoop(iname=iname)}) + for iname in loop_inames}) + dag.update({LeaveLoop(iname=iname): frozenset() + for iname in loop_inames}) + dag.update({RunInstruction(insn_id=not_none(insn.id)): frozenset() + for insn in kernel.instructions}) + + # {{{ add constraints imposed by the loop nesting + + for outer_loop in loop_tree.nodes(): + if outer_loop == "": + continue + + for child in loop_tree.children(outer_loop): + inner_loop = child + dag[EnterLoop(iname=outer_loop)] |= {EnterLoop(iname=inner_loop)} + dag[LeaveLoop(iname=inner_loop)] |= {LeaveLoop(iname=outer_loop)} + + # }}} + + # {{{ add deps. between schedule items coming from insn. depepdencies + + for insn in kernel.instructions: + assert insn.id is not None + + insn_loop_inames = insn.within_inames & loop_inames + for dep_id in insn.depends_on: + dep = kernel.id_to_insn[dep_id] + dep_loop_inames = dep.within_inames & loop_inames + # Enforce instruction dep: + dag[RunInstruction(insn_id=dep_id)] |= {RunInstruction(insn_id=insn.id)} + + # {{{ register deps on loop entry/leave because of insn. 
deps + + if dep_loop_inames < insn_loop_inames: + for iname in insn_loop_inames - dep_loop_inames: + dag[RunInstruction(insn_id=dep.id)] |= {EnterLoop(iname=iname)} + elif insn_loop_inames < dep_loop_inames: + for iname in dep_loop_inames - insn_loop_inames: + dag[LeaveLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + elif dep_loop_inames != insn_loop_inames: + insn_iname, dep_iname = _get_outermost_diverging_inames( + loop_tree, insn_loop_inames, dep_loop_inames) + dag[LeaveLoop(iname=dep_iname)] |= {EnterLoop(iname=insn_iname)} + else: + pass + + # }}} + + for iname in insn_loop_inames: + # For an insn within a loop nest 'i' + # for i + # insn + # end i + # 'insn' *must* come b/w 'for i' and 'end i' + dag[EnterLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + dag[RunInstruction(insn_id=insn.id)] |= {LeaveLoop(iname=iname)} + + # }}} + + def iname_key(iname: str) -> str: + all_ancestors = sorted(loop_tree.ancestors(iname), + key=lambda x: loop_tree.depth(x)) + return ",".join(all_ancestors+[iname]) + + def key(x: ScheduleItem) -> tuple[str, ...]: + if isinstance(x, RunInstruction): + iname = max((kernel.id_to_insn[x.insn_id].within_inames & loop_inames), + key=lambda k: loop_tree.depth(k), + default="") + return (iname_key(iname), x.insn_id) + elif isinstance(x, (EnterLoop, LeaveLoop)): + return (iname_key(x.iname),) + else: + raise NotImplementedError + + return compute_topological_order(dag, key=key) + +# }}} + + +# {{{ legacy scheduling algorithm def _generate_loop_schedules_internal( sched_state, debug=None): @@ -1196,7 +1366,7 @@ def insn_sort_key(insn_id): print( "%(warn)swarning:%(reset_all)s '%(iname)s', " "which the schedule is " - "currently stuck inside of, seems mis-nested. " + "currently stuck inside of, seems misnested. 
def _postprocess_schedule(kernel, callables_table, gen_sched):
    """
    Turn a raw linearization *gen_sched* of *kernel* into a linearized kernel.

    Barrier instructions are converted to barrier schedule items, barriers
    are inserted (or only verified, for global barriers, unless
    ``kernel.options.insert_gbarriers`` is set) when the kernel has a
    non-empty grid, and the schedule is mapped onto host/device unless
    *kernel* was already linearized on entry.

    :arg kernel: the :class:`~loopy.LoopKernel` being scheduled.
    :arg callables_table: passed through to
        ``kernel.get_grid_size_upper_bounds`` and ``insert_barriers``.
    :arg gen_sched: the sequence of schedule items to post-process.
    :returns: a copy of *kernel* with ``linearization`` set and ``state``
        equal to ``KernelState.LINEARIZED``.
    """
    from loopy.kernel import KernelState

    gen_sched = convert_barrier_instructions_to_barriers(
            kernel, gen_sched)

    gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table,
                                                     return_dict=True)

    if (gsize or lsize):
        if not kernel.options.disable_global_barriers:
            # Lazy %-args: the message is only formatted if DEBUG is enabled.
            logger.debug("%s: barrier insertion: global", kernel.name)
            gen_sched = insert_barriers(kernel, callables_table, gen_sched,
                    synchronization_kind="global",
                    verify_only=(not
                        kernel.options.insert_gbarriers))

        logger.debug("%s: barrier insertion: local", kernel.name)
        gen_sched = insert_barriers(kernel, callables_table, gen_sched,
                synchronization_kind="local", verify_only=False)
        logger.debug("%s: barrier insertion: done", kernel.name)

    new_kernel = kernel.copy(
            linearization=gen_sched,
            state=KernelState.LINEARIZED)

    from loopy.schedule.device_mapping import map_schedule_onto_host_or_device
    # NOTE: this tests the state of the *incoming* kernel, not of new_kernel.
    if kernel.state != KernelState.LINEARIZED:
        # Device mapper only gets run once.
        new_kernel = map_schedule_onto_host_or_device(new_kernel)

    return new_kernel
- from loopy.schedule.device_mapping import map_schedule_onto_host_or_device - if kernel.state != KernelState.LINEARIZED: - # Device mapper only gets run once. - new_kernel = map_schedule_onto_host_or_device(new_kernel) + new_kernel = _postprocess_schedule(kernel, callables_table, gen_sched) yield new_kernel diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index b2a44c499..3858462b1 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -1,4 +1,35 @@ -__copyright__ = "Copyright (C) 2016 Matt Wala" +__copyright__ = """ +Copyright (C) 2016 Matt Wala +Copyright (C) 2020 University of Illinois Board of Trustees +Copyright (C) 2022 Kaushik Kulkarni +""" + +__doc__ = """ +.. autofunction:: get_block_boundaries +.. autofunction:: temporaries_read_in_subkernel +.. autofunction:: args_read_in_subkernel +.. autofunction:: args_written_in_subkernel +.. autofunction:: supporting_temporary_names + +.. autoclass:: KernelArgInfo +.. autoclass:: SubKernelArgInfo + +.. autofunction:: get_kernel_arg_info +.. autofunction:: get_subkernel_arg_info + +.. autofunction:: get_return_from_kernel_mapping + +.. autoclass:: AccessMapDescriptor +.. autoclass:: WriteRaceChecker + +.. autoclass:: InameStrSet +.. autoclass:: LoopNestTree +.. autoclass:: LoopTree + +.. autofunction:: separate_loop_nest +.. autofunction:: get_partial_loop_nest_tree +.. 
autofunction:: get_loop_tree +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -21,24 +52,32 @@ """ import enum +from collections.abc import Callable, Collection, Mapping from dataclasses import dataclass -from functools import cached_property -from typing import Dict, FrozenSet, List, Sequence, Set, Tuple +from functools import cached_property, reduce +from typing import AbstractSet, Dict, FrozenSet, List, Sequence, Set, Tuple + +from immutables import Map +from typing_extensions import TypeAlias import islpy as isl -from pytools import memoize_method +from pytools import memoize_method, memoize_on_first_arg +from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.data import AddressSpace, ArrayArg, TemporaryVariable +from loopy.schedule import ScheduleItem +from loopy.schedule.tree import Tree +from loopy.typing import InameStr, not_none # {{{ block boundary finder -def get_block_boundaries(schedule): - """ +def get_block_boundaries(schedule: Sequence[ScheduleItem]) -> Mapping[int, int]: + r""" Return a dictionary mapping indices of - :class:`loopy.schedule.BlockBeginItem`s to - :class:`loopy.schedule.BlockEndItem`s and vice versa. + :class:`loopy.schedule.BeginBlockItem`\ s to + :class:`loopy.schedule.EndBlockItem`\ s and vice versa. """ from loopy.schedule import BeginBlockItem, EndBlockItem block_bounds = {} @@ -131,8 +170,12 @@ def supporting_temporary_names( @dataclass(frozen=True) class KernelArgInfo: - passed_arg_names: Sequence[str] + """ + .. autoattribute:: passed_arg_names + .. autoattribute:: written_names + """ + passed_arg_names: Sequence[str] written_names: FrozenSet[str] @property @@ -142,6 +185,12 @@ def passed_names(self) -> Sequence[str]: @dataclass(frozen=True) class SubKernelArgInfo(KernelArgInfo): + """Inherits from :class:`KernelArgInfo`. + + .. autoattribute:: passed_inames + .. 
InameStrSet: TypeAlias = FrozenSet[InameStr]
LoopNestTree: TypeAlias = Tree[InameStrSet]
LoopTree: TypeAlias = Tree[InameStr]


class V2SchedulerNotImplementedError(LoopyError):
    """Raised when the v2 scheduler cannot handle a kernel."""


def separate_loop_nest(
        tree: LoopNestTree,
        loop_nests: Collection[InameStrSet],
        inames_to_separate: InameStrSet
        ) -> tuple[LoopNestTree, InameStrSet, InameStrSet | None]:
    """
    Returns a copy of *tree* that has *inames_to_separate* occur in
    nodes that are not shared with other inames, i.e. every node along the
    chain *loop_nests* is either a subset of *inames_to_separate* or has an
    empty intersection with it.

    This routine modifies at most one node of the tree: the innermost node of
    *loop_nests*. If that node contains inames outside *inames_to_separate*,
    it is split into an outer node (the part in *inames_to_separate*) and an
    inner node (the remainder) nested inside it.

    :arg loop_nests: A collection of nodes in *tree* that cover
        *inames_to_separate*. Sorted by depth they must form a parent-child
        chain starting at the root of *tree*.

    :returns: a :class:`tuple` ``(new_tree, outer_loop_nest, inner_loop_nest)``,
        where *outer_loop_nest* and *inner_loop_nest* are the identifiers of
        the outer and inner nodes produced by the split. *inner_loop_nest* is
        *None* (and *tree* is returned unchanged) if no split was necessary.

    :raises LoopyError: if *loop_nests* do not form a parent-child chain in
        *tree*.

    .. note::

        We could compute *loop_nests* within this routine's implementation, but
        computing would be expensive and hence we ask the caller for this info.

    Example::

        *tree*: frozenset()
                └── frozenset({'j', 'i'})
                    └── frozenset({'k', 'l'})

        *inames_to_separate*: frozenset({'k', 'i', 'j'})
        *loop_nests*: {frozenset({'j', 'i'}), frozenset({'k', 'l'})}

        Returns:

        *new_tree*: frozenset()
                    └── frozenset({'j', 'i'})
                        └── frozenset({'k'})
                            └── frozenset({'l'})

        *outer_loop_nest*: frozenset({'k'})
        *inner_loop_nest*: frozenset({'l'})
    """
    assert all(isinstance(loop_nest, frozenset) for loop_nest in loop_nests)

    # annotation to avoid https://github.com/python/mypy/issues/17693
    emptyset: InameStrSet = frozenset()

    assert inames_to_separate <= reduce(frozenset.union, loop_nests, emptyset)

    # {{{ sanity check to ensure the loop nest *inames_to_separate* is possible

    loop_nests = sorted(loop_nests, key=lambda nest: tree.depth(nest))

    for outer, inner in zip(loop_nests[:-1], loop_nests[1:]):
        if outer != tree.parent(inner):
            raise LoopyError(f"Cannot schedule loop nest {inames_to_separate} "
                             f" in the nesting tree:\n{tree}")

    assert tree.depth(loop_nests[0]) == 0

    # }}}

    innermost_node = loop_nests[-1]
    # separate variable to avoid https://github.com/python/mypy/issues/17694
    outerer_loops = reduce(frozenset.union, loop_nests[:-1], emptyset)
    new_outer_node = inames_to_separate - outerer_loops
    new_inner_node = innermost_node - inames_to_separate

    if new_outer_node == innermost_node:
        # such a loop nesting already exists => do nothing
        return tree, new_outer_node, None

    # add the outer loop to our loop nest tree
    tree = tree.add_node(new_outer_node,
                         parent=not_none(tree.parent(innermost_node)))

    # rename the old loop to the inner loop
    tree = tree.replace_node(innermost_node,
                             new_node=new_inner_node)

    # set the parent of inner loop to be the outer loop
    tree = tree.move_node(new_inner_node, new_parent=new_outer_node)

    return tree, new_outer_node, new_inner_node


def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest):
    """
    Returns a copy of *tree* that nests *inner_loop_nest* inside *outer_loop_nest*.
    """
    # add the inner loop nest as a child of the outer one
    return tree.add_node(inner_loop_nest, parent=outer_loop_nest)


def _order_loop_nests(
        loop_nest_tree: LoopNestTree,
        strict_priorities: FrozenSet[Tuple[InameStr, ...]],
        relaxed_priorities: FrozenSet[Tuple[InameStr, ...]],
        iname_to_tree_node_id: Mapping[InameStr, InameStrSet],
        ) -> LoopTree:
    """
    Returns a loop nest where all nodes in the tree are instances of
    :class:`str` denoting inames. Unlike *loop_nest_tree* which corresponds to
    multiple loop nesting, this routine returns a unique loop nest that is
    obtained after constraining *loop_nest_tree* with the constraints enforced
    by *strict_priorities* and *relaxed_priorities*.

    :arg strict_priorities: Expresses strict nesting constraints using the same
        data structure as :attr:`loopy.LoopKernel.loop_priority`.
        These priorities are imposed strictly i.e. if these conditions cannot
        be met a :class:`loopy.diagnostic.LoopyError` is raised.

    :arg relaxed_priorities: Expresses optional nesting constraints using the
        same data structure as :attr:`loopy.LoopKernel.loop_priority`.
        If one of them cannot be met, a warning is issued instead of an error.

    :arg iname_to_tree_node_id: A mapping from iname to the loop nesting it is
        a part of.

    :raises V2SchedulerNotImplementedError: if a priority relates inames of
        two loop nests neither of which is an ancestor of the other.
    """
    from collections import deque
    from warnings import warn

    from pytools.graph import compute_topological_order as toposort

    loop_nests = set(iname_to_tree_node_id.values())

    # iname_to_nesting_constraints: A mapping from each loop nest to the
    # nesting constraints applicable to it.
    # Each nesting constraint is represented as a DAG. In the DAG, if there
    # exists an edge from iname 'i' -> iname 'j' => 'j' should be nested
    # inside 'i'.
    iname_to_nesting_constraints: dict[InameStrSet, dict[InameStr, InameStrSet]] = {
        loop_nest: {iname: frozenset() for iname in loop_nest}
        for loop_nest in loop_nests}

    # The plan here is populate DAGs in *iname_to_nesting_constraints* and then
    # perform a toposort for each loop nest.

    def _update_nesting_constraints(
                priorities: FrozenSet[Tuple[InameStr, ...]],
                cannot_satisfy_callback: Callable[[str], None]
            ) -> None:
        """
        Records *priorities* in *iname_to_nesting_constraints* and calls
        *cannot_satisfy_callback* with an appropriate error message if the
        priorities cannot be met.
        """
        for priority in priorities:
            for outer_iname, inner_iname in zip(priority[:-1], priority[1:]):
                if inner_iname not in iname_to_tree_node_id:
                    cannot_satisfy_callback(f"Cannot enforce the constraint:"
                                            f" {inner_iname} to be nested within"
                                            f" {outer_iname}, as {inner_iname}"
                                            f" is either a parallel loop or"
                                            f" not an iname.")
                    continue

                if outer_iname not in iname_to_tree_node_id:
                    cannot_satisfy_callback(f"Cannot enforce the constraint:"
                                            f" {inner_iname} to be nested within"
                                            f" {outer_iname}, as {outer_iname}"
                                            f" is either a parallel loop or"
                                            f" not an iname.")
                    continue

                inner_iname_nest = iname_to_tree_node_id[inner_iname]
                outer_iname_nest = iname_to_tree_node_id[outer_iname]

                if inner_iname_nest == outer_iname_nest:
                    iname_to_nesting_constraints[
                        inner_iname_nest][outer_iname] |= {inner_iname}
                else:
                    # The tree's nodes are loop nests (frozensets), so the
                    # ancestor tests must compare nests, not iname strings.
                    ancestors_of_inner_iname = (loop_nest_tree
                                                .ancestors(inner_iname_nest))
                    ancestors_of_outer_iname = (loop_nest_tree
                                                .ancestors(outer_iname_nest))
                    if outer_iname_nest in ancestors_of_inner_iname:
                        # nesting constraint already satisfied => do nothing
                        pass
                    elif inner_iname_nest in ancestors_of_outer_iname:
                        cannot_satisfy_callback("Cannot satisfy constraint that"
                                                f" iname '{inner_iname}' must be"
                                                f" nested within '{outer_iname}'.")
                    else:
                        # inner iname and outer iname are indirect family members
                        # => must be realized via dependencies in the linearization
                        # phase, not implemented in v2-scheduler yet.
                        raise V2SchedulerNotImplementedError("cannot"
                                " schedule kernels with priority dependencies"
                                " between sibling loop nests")

    def _raise_loopy_err(x):
        raise LoopyError(x)

    # record strict priorities
    _update_nesting_constraints(strict_priorities, _raise_loopy_err)
    # record relaxed priorities
    _update_nesting_constraints(relaxed_priorities, warn)

    # ordered_loop_nests: A mapping from the unordered loop nests to their
    # ordered counterparts. For example, if we had only one loop nest
    # `frozenset({"i", "j", "k"})`, and the priorities added the
    # constraint that "i" must be nested within "k", then `ordered_loop_nests`
    # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop
    # nests would now have an order.
    ordered_loop_nests = {
        unordered_nest: toposort(flow, key=lambda x: x)
        for unordered_nest, flow in iname_to_nesting_constraints.items()}

    # {{{ combine 'loop_nest_tree' along with 'ordered_loop_nest_tree'

    assert loop_nest_tree.root == frozenset()

    new_tree = Tree.from_root("")

    # maps a node of 'loop_nest_tree' to the innermost iname of its ordered
    # counterpart in 'new_tree'
    old_to_new_parent = {}

    old_to_new_parent[loop_nest_tree.root] = ""

    # traversing 'loop_nest_tree' in a BFS fashion to create 'new_tree'
    queue = deque(loop_nest_tree.children(loop_nest_tree.root))

    while queue:
        current_nest = queue.popleft()

        ordered_nest = ordered_loop_nests[current_nest]
        new_tree = new_tree.add_node(ordered_nest[0],
                                     parent=old_to_new_parent[not_none(loop_nest_tree
                                                              .parent(current_nest))])
        for new_parent, new_child in zip(ordered_nest[:-1], ordered_nest[1:]):
            new_tree = new_tree.add_node(node=new_child, parent=new_parent)

        old_to_new_parent[current_nest] = ordered_nest[-1]

        queue.extend(loop_nest_tree.children(current_nest))

    # }}}

    return new_tree


@memoize_on_first_arg
def _get_parallel_inames(kernel: LoopKernel) -> AbstractSet[str]:
    """
    Returns the inames of *kernel* that carry a
    :class:`~loopy.kernel.data.ConcurrentTag` but neither an ILP nor a
    vectorize tag.
    """
    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag

    concurrent_inames = {iname for iname in kernel.all_inames()
                         if kernel.iname_tags_of_type(iname, ConcurrentTag)}
    ilp_inames = {iname for iname in kernel.all_inames()
                  if kernel.iname_tags_of_type(iname, IlpBaseTag)}
    vec_inames = {iname for iname in kernel.all_inames()
                  if kernel.iname_tags_of_type(iname, VectorizeTag)}
    return (concurrent_inames - ilp_inames - vec_inames)


def get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree:
    """
    Returns a tree representing the *kernel*'s loop nests.

    Each node of the returned tree has a :class:`frozenset` of inames.
    All the inames in the identifier of a parent node of a loop nest in the
    tree must be nested outside all the iname in identifier of the loop nest.

    .. note::

        This routine only takes into account the nesting dependency
        constraints of :attr:`loopy.InstructionBase.within_inames` of all the
        *kernel*'s instructions and the iname tags. This routine does *NOT*
        include the nesting constraints imposed by the dependencies between the
        instructions and the dependencies imposed by the kernel's domain tree.
    """
    from loopy.kernel.data import IlpBaseTag

    # figuring the possible loop nestings minus the concurrent_inames as they
    # are never realized as actual loops
    insn_iname_sets = {
        insn.within_inames - _get_parallel_inames(kernel)
        for insn in kernel.instructions}

    root: InameStrSet = frozenset()
    tree = Tree.from_root(root)

    # mapping from iname to the innermost loop nest they are part of in *tree*.
    iname_to_tree_node_id: Dict[InameStr, InameStrSet] = {}

    # if there were any instructions with no inames, those have been already
    # accounted for as the root.
    insn_iname_sets = insn_iname_sets - {root}

    for iname_set in insn_iname_sets:
        not_seen_inames = frozenset(iname for iname in iname_set
                                    if iname not in iname_to_tree_node_id)
        seen_inames = iname_set - not_seen_inames

        all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames}

        tree, outer_loop, inner_loop = separate_loop_nest(tree,
                                                          (all_nests
                                                           | {frozenset()}),
                                                          seen_inames)
        if not_seen_inames:
            # make 'not_seen_inames' nest inside the seen ones.
            # example: if there is already a loop nesting "i,j,k"
            # and the current iname chain is "i,j,l". Only way this is possible
            # is if "l" is nested within "i,j"-loops.
            tree = _add_inner_loops(tree, outer_loop, not_seen_inames)

        # {{{ update iname to node id

        for iname in outer_loop:
            iname_to_tree_node_id[iname] = outer_loop

        if inner_loop is not None:
            for iname in inner_loop:
                iname_to_tree_node_id[iname] = inner_loop

        for iname in not_seen_inames:
            iname_to_tree_node_id[iname] = not_seen_inames

        # }}}

    # {{{ make ILP tagged inames innermost

    ilp_inames = {iname for iname in kernel.all_inames()
                  if kernel.iname_tags_of_type(iname, IlpBaseTag)}

    for iname_set in insn_iname_sets:
        # Intersect with the inames of *this* instruction set; intersecting
        # with the set of sets would always be empty.
        for ilp_iname in (ilp_inames & iname_set):
            # pull out other loops so that ilp_iname is the innermost
            other_inames = iname_set - {ilp_iname}
            all_nests = {iname_to_tree_node_id[iname] for iname in other_inames}
            tree, outer_loop, inner_loop = separate_loop_nest(tree,
                                                              (all_nests
                                                               | {frozenset()}),
                                                              other_inames)

            for iname in outer_loop:
                iname_to_tree_node_id[iname] = outer_loop

            if inner_loop is not None:
                for iname in inner_loop:
                    iname_to_tree_node_id[iname] = inner_loop

    # }}}

    return tree


def _get_iname_to_tree_node_id_from_partial_loop_nest_tree(
            tree: LoopNestTree,
        ) -> Mapping[str, frozenset[str]]:
    """
    Returns the mapping from the iname to the *tree*'s node that it was a part
    of.

    :arg tree: A partial loop nest tree.
    """
    iname_to_tree_node_id = {}
    for node in tree.nodes():
        assert isinstance(node, frozenset)
        for iname in node:
            iname_to_tree_node_id[iname] = node

    return Map(iname_to_tree_node_id)


def get_loop_tree(kernel: LoopKernel) -> LoopTree:
    """
    Returns a tree representing the loop nesting for *kernel*. A parent node in
    the tree is always nested outside all its children.

    :raises LoopyError: if the kernel's domain tree requires an iname to be
        nested outside another iname, but the partial loop nest tree places
        them in nests where that is impossible.

    .. note::

        Multiple loop nestings might exist for *kernel*, but this routine returns
        one valid loop nesting.
    """
    from islpy import dim_type

    tree = get_partial_loop_nest_tree(kernel)
    iname_to_tree_node_id = (
        _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree))

    strict_loop_priorities: FrozenSet[Tuple[InameStr, ...]] = frozenset()

    # {{{ impose constraints by the domain tree

    # FIXME: These three could be one statement if it weren't for
    # - https://github.com/python/mypy/issues/17693
    # - https://github.com/python/mypy/issues/17694
    emptyset: InameStrSet = frozenset()
    loop_inames = reduce(frozenset.union,
                         (insn.within_inames
                          for insn in kernel.instructions),
                         emptyset)
    loop_inames = loop_inames - _get_parallel_inames(kernel)

    for dom in kernel.domains:
        # an iname appearing as a parameter of a domain must be nested
        # outside the inames that domain defines
        for outer_iname in set(dom.get_var_names(dim_type.param)):
            if outer_iname not in loop_inames:
                continue

            for inner_iname in dom.get_var_names(dim_type.set):
                if inner_iname not in loop_inames:
                    continue

                # either outer_iname and inner_iname should belong to the same
                # loop nest level or outer should be strictly outside inner
                # iname
                inner_iname_nest = iname_to_tree_node_id[inner_iname]
                outer_iname_nest = iname_to_tree_node_id[outer_iname]

                if inner_iname_nest == outer_iname_nest:
                    strict_loop_priorities |= {(outer_iname, inner_iname)}
                else:
                    ancestors_of_inner_iname = tree.ancestors(inner_iname_nest)
                    if outer_iname_nest not in ancestors_of_inner_iname:
                        raise LoopyError(f"Loop '{outer_iname}' cannot be nested"
                                         f" outside '{inner_iname}'.")

    # }}}

    return _order_loop_nests(tree,
                             strict_loop_priorities,
                             kernel.loop_priority,
                             iname_to_tree_node_id)
@dataclass(frozen=True)
class Tree(Generic[NodeT]):
    """
    An immutable tree containing nodes of type :class:`NodeT`.

    Every modifying operation returns a new :class:`Tree`; the relative
    order of a node's children is preserved by all of them.

    .. automethod:: ancestors
    .. automethod:: parent
    .. automethod:: children
    .. automethod:: add_node
    .. automethod:: depth
    .. automethod:: replace_node
    .. automethod:: move_node

    .. automethod:: __contains__

    .. note::

       Almost all the operations are implemented recursively. NOT suitable for
       deep trees. At the very least if the Python implementation is CPython
       this allocates a new stack frame for each iteration of the operation.
    """

    # node -> tuple of its children, in insertion order
    _parent_to_children: Map[NodeT, Tuple[NodeT, ...]]
    # node -> its parent; the root maps to None
    _child_to_parent: Map[NodeT, Optional[NodeT]]

    @staticmethod
    def from_root(root: NodeT) -> "Tree[NodeT]":
        """Returns a tree consisting only of the node *root*."""
        return Tree(Map({root: ()}),
                    Map({root: None}))

    @cached_property
    def root(self) -> NodeT:
        """The unique node of the tree that has no parent."""
        # Start from an arbitrary node and walk up until there is no parent.
        guess = next(iter(self._child_to_parent))
        parent_of_guess = self.parent(guess)
        while parent_of_guess is not None:
            guess = parent_of_guess
            parent_of_guess = self.parent(guess)

        return guess

    @memoize_method
    def ancestors(self, node: NodeT) -> Tuple[NodeT, ...]:
        """
        Returns a :class:`tuple` of nodes that are ancestors of *node*,
        ordered from the parent of *node* up to the root.
        """
        assert node in self

        if self.is_root(node):
            # => root
            return ()

        parent = self._child_to_parent[node]
        assert parent is not None

        return (parent,) + self.ancestors(parent)

    def parent(self, node: NodeT) -> Optional[NodeT]:
        """
        Returns the parent of *node*, or *None* if *node* is the root.
        """
        assert node in self

        return self._child_to_parent[node]

    def children(self, node: NodeT) -> Tuple[NodeT, ...]:
        """
        Returns the children of *node*.
        """
        assert node in self

        return self._parent_to_children[node]

    @memoize_method
    def depth(self, node: NodeT) -> int:
        """
        Returns the depth of *node*, with the root having depth 0.
        """
        assert node in self

        if self.is_root(node):
            # => root
            return 0

        parent_of_node = self.parent(node)
        assert parent_of_node is not None

        return 1 + self.depth(parent_of_node)

    def is_root(self, node: NodeT) -> bool:
        """Return *True* if *node* has no parent."""
        assert node in self

        return self.parent(node) is None

    def is_leaf(self, node: NodeT) -> bool:
        """Return *True* if *node* has no children."""
        assert node in self

        return len(self.children(node)) == 0

    def __contains__(self, node: NodeT) -> bool:
        """Return *True* if *node* is a node in the tree."""
        return node in self._child_to_parent

    def add_node(self, node: NodeT, parent: NodeT) -> "Tree[NodeT]":
        """
        Returns a :class:`Tree` with added node *node* having a parent
        *parent*. *node* becomes the last child of *parent*.

        :raises ValueError: if *node* is already in the tree.
        """
        if node in self:
            raise ValueError(f"'{node}' already present in tree.")

        siblings = self._parent_to_children[parent]

        return Tree((self._parent_to_children
                     .set(parent, siblings + (node,))
                     .set(node, ())),
                    self._child_to_parent.set(node, parent))

    def replace_node(self, node: NodeT, new_node: NodeT) -> "Tree[NodeT]":
        """
        Returns a copy of *self* with *node* replaced with *new_node*.
        *new_node* takes over the parent and the children of *node* and
        becomes the last child of its parent.

        :raises ValueError: if *node* is not in the tree or *new_node*
            already is.
        """
        if node not in self:
            raise ValueError(f"'{node}' not present in tree.")

        if new_node in self:
            raise ValueError(f"cannot replace with '{new_node}', as its already a part"
                             " of the tree.")

        parent = self.parent(node)
        children = self.children(node)

        # {{{ update child to parent

        child_to_parent_mut = self._child_to_parent.mutate()
        del child_to_parent_mut[node]
        child_to_parent_mut[new_node] = parent

        for child in children:
            child_to_parent_mut[child] = new_node

        # }}}

        # {{{ update parent_to_children

        parent_to_children_mut = self._parent_to_children.mutate()
        del parent_to_children_mut[node]
        parent_to_children_mut[new_node] = children

        if parent is not None:
            # update the child's name in the parent's children; filter the
            # tuple (rather than going through a set) to keep sibling order
            parent_to_children_mut[parent] = (
                *(sibling for sibling in self.children(parent)
                  if sibling != node),
                new_node,)

        # }}}

        return Tree(parent_to_children_mut.finish(),
                    child_to_parent_mut.finish())

    def move_node(self, node: NodeT, new_parent: Optional[NodeT]) -> "Tree[NodeT]":
        """
        Returns a copy of *self* with node *node* as a child of *new_parent*.
        *node* becomes the last child of *new_parent* and keeps its subtree.

        :raises ValueError: if *node* or *new_parent* is not in the tree, if
            the move would relocate the root or create a second root, or if
            *new_parent* is *node* itself or one of its descendants.
        """
        if node not in self:
            raise ValueError(f"'{node}' not a part of the tree => cannot move.")

        if self.is_root(node):
            if new_parent is None:
                return self
            else:
                raise ValueError("Moving root not allowed.")

        if new_parent is None:
            raise ValueError("Making multiple roots not allowed")

        if new_parent not in self:
            raise ValueError(f"Cannot move to '{new_parent}' as it's not in tree.")

        if new_parent == node or node in self.ancestors(new_parent):
            # would detach the subtree from the root and create a cycle
            raise ValueError(f"Cannot move '{node}' below itself or one of"
                             " its descendants.")

        parent = self.parent(node)
        assert parent is not None  # parent=root handled as a special case

        if parent == new_parent:
            # already in place; rebuilding would list *node* twice
            return self

        siblings = self.children(parent)
        # filter the tuple (rather than going through a set) to keep order
        parents_new_children = tuple(sibling for sibling in siblings
                                     if sibling != node)
        new_parents_children = self.children(new_parent) + (node,)

        new_child_to_parent = self._child_to_parent.set(node, new_parent)
        new_parent_to_children = (self._parent_to_children
                                  .set(parent, parents_new_children)
                                  .set(new_parent, new_parents_children))

        return Tree(new_parent_to_children,
                    new_child_to_parent)

    def __str__(self) -> str:
        """
        Stringifies the tree by using the box-drawing unicode characters.

        .. doctest::

            >>> from loopy.schedule.tree import Tree
            >>> tree = (Tree.from_root("Root")
            ...         .add_node("A", "Root")
            ...         .add_node("B", "Root")
            ...         .add_node("D", "B")
            ...         .add_node("E", "B")
            ...         .add_node("C", "A"))

            >>> print(tree)
            Root
            ├── A
            │   └── C
            └── B
                ├── D
                └── E
        """
        def rec(node: NodeT) -> List[str]:
            # lines of the rendering of the subtree rooted at *node*
            children_result = [rec(c) for c in self.children(node)]

            def post_process_non_last_child(children: Sequence[str]) -> list[str]:
                return ["├── " + children[0]] + [f"│   {c}" for c in children[1:]]

            def post_process_last_child(children: Sequence[str]) -> list[str]:
                return ["└── " + children[0]] + [f"    {c}" for c in children[1:]]

            children_result = ([post_process_non_last_child(c)
                                for c in children_result[:-1]]
                            + [post_process_last_child(c)
                                for c in children_result[-1:]])
            return [str(node)] + sum(children_result, start=[])

        return "\n".join(rec(self.root))

    def nodes(self) -> Iterator[NodeT]:
        """Returns an iterator over all nodes of the tree."""
        return iter(self._child_to_parent.keys())
mtype=["global"], lid_strides={0: 2}, direction=["store"]).eval_and_sum(params) @@ -629,7 +629,7 @@ class Op(ImmutableRecord): work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. + analogous to an NVIDIA CUDA warp. .. attribute:: kernel_name @@ -709,7 +709,7 @@ class MemAccess(ImmutableRecord): .. attribute:: variable_tags A :class:`frozenset` of subclasses of :class:`~pytools.tag.Tag` - that reflects :attr:`~loopy.symbolic.TaggedVariable.tags` of + that reflects :attr:`~loopy.TaggedVariable.tags` of an accessed variable. .. attribute:: count_granularity @@ -723,7 +723,7 @@ class MemAccess(ImmutableRecord): work-group executes on a single compute unit with all work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. + analogous to an NVIDIA CUDA warp. .. attribute:: kernel_name @@ -1109,7 +1109,7 @@ def _get_lid_and_gid_strides(knl, array, index): # create lid_strides and gid_strides dicts - # strides are coefficents in flattened index, i.e., we want + # strides are coefficients in flattened index, i.e., we want # lid_strides = {0:l0, 1:l1, 2:l2, ...} and # gid_strides = {0:g0, 1:g1, 2:g2, ...}, # where l0, l1, l2, g0, g1, and g2 come from flattened index @@ -1167,7 +1167,7 @@ def get_iname_strides(tag_to_iname_dict): total_iname_stride += axis_tag_stride*coeff - tag_to_stride_dict[tag] = total_iname_stride + tag_to_stride_dict[tag] = flatten(total_iname_stride) return tag_to_stride_dict @@ -1723,7 +1723,7 @@ def get_op_map(program, count_redundant_work=False, :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. 
An OpenCL sub-group is an implementation-dependent grouping of work-items within - a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, + a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be @@ -1921,7 +1921,7 @@ def get_mem_access_map(program, count_redundant_work=False, :arg subgroup_size: An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when + analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails @@ -2085,7 +2085,7 @@ def get_synchronization_map(program, subgroup_size=None, entrypoint=None): :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``"guess"``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within - a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, + a work-group, analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. 
If set to *None* an attempt to find the sub-group size using the device will be diff --git a/loopy/symbolic.py b/loopy/symbolic.py index d56b54e79..f0e0333ec 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1,5 +1,7 @@ """Pymbolic mappers for loopy.""" +from __future__ import annotations + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -27,12 +29,21 @@ import re from functools import cached_property, reduce from sys import intern -from typing import AbstractSet, ClassVar, Mapping, Sequence, Tuple +from typing import ( + TYPE_CHECKING, + AbstractSet, + Any, + ClassVar, + Mapping, + Sequence, + Tuple, +) import immutables import numpy as np import islpy as isl +import pymbolic.primitives # FIXME: also import by full name to allow sphinx to resolve import pymbolic.primitives as p import pytools.lex from islpy import dim_type @@ -51,6 +62,7 @@ ) from pymbolic.mapper.dependency import CachedDependencyMapper as DependencyMapperBase from pymbolic.mapper.evaluator import CachedEvaluationMapper as EvaluationMapperBase +from pymbolic.mapper.flattener import FlattenMapper as FlattenMapperBase from pymbolic.mapper.stringifier import StringifyMapper as StringifyMapperBase from pymbolic.mapper.substitutor import ( CachedSubstitutionMapper as SubstitutionMapperBase, @@ -58,19 +70,22 @@ from pymbolic.mapper.unifier import UnidirectionalUnifier as UnidirectionalUnifierBase from pymbolic.parser import Parser as ParserBase from pytools import ImmutableRecord, memoize, memoize_method, memoize_on_first_arg -from pytools.tag import Taggable +from pytools.tag import Tag, Taggable from loopy.diagnostic import ( ExpressionToAffineConversionError, LoopyError, UnableToDetermineAccessRangeError, ) +from loopy.types import ToLoopyTypeConvertible from loopy.typing import ExpressionT -__doc__ = """ -.. 
currentmodule:: loopy.symbolic +if TYPE_CHECKING: + from loopy.library.reduction import ReductionOperation + +__doc__ = """ Loopy-specific expression types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -82,6 +97,8 @@ .. autoclass:: TypedCSE +.. currentmodule:: loopy + .. autoclass:: TypeCast .. autoclass:: TaggedVariable @@ -90,6 +107,8 @@ .. autoclass:: LinearSubscript +.. currentmodule:: loopy.symbolic + .. autoclass:: RuleArgument .. autoclass:: ExpansionState @@ -185,6 +204,14 @@ def map_resolved_function(self, expr, *args, **kwargs): map_fortran_division = IdentityMapperBase.map_quotient +class FlattenMapper(FlattenMapperBase, IdentityMapperMixin): + pass + + +def flatten(expr): + return FlattenMapper()(expr) + + class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): pass @@ -640,8 +667,9 @@ class TypeCast(LoopyExpressionBase): The expression to be cast. """ + child: ExpressionT - def __init__(self, type, child): + def __init__(self, type: ToLoopyTypeConvertible, child: ExpressionT): super().__init__() from loopy.types import NumpyType, to_loopy_type @@ -678,13 +706,7 @@ class TaggedVariable(LoopyExpressionBase, p.Variable, Taggable): may then be used to address these uses--such as by prefetching only accesses tagged a certain way. - .. attribute:: tags - - A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to - provide metadata on this object. Legacy string tags are converted to - :class:`~loopy.LegacyStringInstructionTag` or, if they used to carry - a functional meaning, the tag carrying that same fucntional meaning - (e.g. :class:`~loopy.UseStreamingStoreTag`). + .. autoattribute:: tags Inherits from :class:`pymbolic.primitives.Variable` and :class:`pytools.tag.Taggable`. @@ -692,6 +714,14 @@ class TaggedVariable(LoopyExpressionBase, p.Variable, Taggable): init_arg_names = ("name", "tags") + tags: frozenset[Tag] + """A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to + provide metadata on this object. 
Legacy string tags are converted to + :class:`~loopy.LegacyStringInstructionTag` or, if they used to carry + a functional meaning, the tag carrying that same functional meaning + (e.g. :class:`~loopy.UseStreamingStoreTag`). + """ + def __init__(self, name, tags): p.Variable.__init__(self, name) if isinstance(tags, str): @@ -718,31 +748,42 @@ class Reduction(LoopyExpressionBase): """ Represents a reduction operation on :attr:`expr` across :attr:`inames`. - .. attribute:: operation - an instance of :class:`loopy.library.reduction.ReductionOperation` - - .. attribute:: inames + .. autoattribute:: operation + .. autoattribute:: inames + .. autoattribute:: expr + .. autoattribute:: allow_simultaneous + """ - a list of inames across which reduction on :attr:`expr` is being - carried out. + init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") - .. attribute:: expr + operation: ReductionOperation - An expression which may have tuple type. If the expression has tuple - type, it must be one of the following: - * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or - * a :class:`loopy.symbolic.Reduction`, or - * a function call or substitution rule invocation. + inames: Sequence[str] + """The inames across which reduction on :attr:`expr` is being + carried out. + """ - .. attribute:: allow_simultaneous + expr: ExpressionT + """An expression which may have tuple type. If the expression has tuple + type, it must be one of the following: - A :class:`bool`. If not *True*, an iname is allowed to be used - in precisely one reduction, to avoid mis-nesting errors. + * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or + * a :class:`loopy.symbolic.Reduction`, or + * a function call or substitution rule invocation. """ - init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") + allow_simultaneous: bool + """If not *True*, an iname is allowed to be used + in precisely one reduction, to avoid misnesting errors. 
+ """ - def __init__(self, operation, inames, expr, allow_simultaneous=False): + def __init__(self, + operation: ReductionOperation | str, + inames: (tuple[str | pymbolic.primitives.Variable, ...] + | pymbolic.primitives.Variable | str), + expr: ExpressionT, + allow_simultaneous: bool = False + ) -> None: if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -751,7 +792,7 @@ def __init__(self, operation, inames, expr, allow_simultaneous=False): assert isinstance(inames, tuple) - def strip_var(iname): + def strip_var(iname: Any) -> str: if isinstance(iname, p.Variable): iname = iname.name @@ -968,10 +1009,14 @@ class SubArrayRef(LoopyExpressionBase): .. automethod:: is_equal """ + swept_inames: tuple[p.Variable, ...] + subscript: p.Subscript init_arg_names = ("swept_inames", "subscript") - def __init__(self, swept_inames, subscript): + def __init__(self, + swept_inames: tuple[p.Variable, ...] | p.Variable, + subscript: p.Subscript) -> None: # {{{ sanity checks @@ -1787,7 +1832,7 @@ def map_subscript(self, expr): # {{{ (pw)aff to expr conversion -def aff_to_expr(aff): +def aff_to_expr(aff: isl.Aff) -> ExpressionT: from pymbolic import var denom = aff.get_denominator_val().to_python() @@ -1805,10 +1850,10 @@ def aff_to_expr(aff): if coeff: result += coeff*aff_to_expr(aff.get_div(i)) - return result // denom + return flatten(result // denom) -def pw_aff_to_expr(pw_aff, int_ok=False): +def pw_aff_to_expr(pw_aff: isl.PwAff, int_ok: bool = False) -> ExpressionT: if isinstance(pw_aff, int): if not int_ok: from warnings import warn @@ -1830,7 +1875,7 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return expr -def pw_aff_to_pw_aff_implemented_by_expr(pw_aff): +def pw_aff_to_pw_aff_implemented_by_expr(pw_aff: isl.PwAff) -> isl.PwAff: pieces = pw_aff.get_pieces() rest = isl.Set.universe(pw_aff.space.params()) @@ -1923,7 +1968,7 @@ def map_call(self, expr): "for as-pwaff evaluation") -def aff_from_expr(space, expr, vars_to_zero=None): +def 
aff_from_expr(space: isl.Space, expr: ExpressionT, vars_to_zero=None) -> isl.Aff: if vars_to_zero is None: vars_to_zero = frozenset() @@ -2090,7 +2135,8 @@ def simplify_using_aff(kernel, expr): try: aff = guarded_aff_from_expr(domain.space, expr) except ExpressionToAffineConversionError: - return expr + # Accomplish at least *some* simplification + return flatten(expr) # FIXME: Deal with assumptions, too. aff = aff.gist(domain) @@ -2150,14 +2196,17 @@ def qpolynomial_to_expr(qpoly): assert all(isinstance(num, int) for num in numerators) assert isinstance(common_denominator, int) + # FIXME: Delete if in favor of the general case once we depend on pymbolic 2024.1. if common_denominator == 1: - return sum(num * monomial + res = sum(num * monomial for num, monomial in zip(numerators, monomials)) else: - return FloorDiv(sum(num * monomial + res = FloorDiv(sum(num * monomial for num, monomial in zip(numerators, monomials)), common_denominator) + return flatten(res) + # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index a99b7d065..56e5dd872 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -35,6 +35,7 @@ Const, Declarator, Generable, + Initializer, NestedDeclarator, Pointer, ) @@ -800,7 +801,7 @@ def get_function_definition( kernel = codegen_state.kernel assert kernel.linearization is not None - from cgen import FunctionBody, Initializer, Line + from cgen import FunctionBody, Line result = [] @@ -883,6 +884,42 @@ def get_kernel_call(self, codegen_state: CodeGenerationState, lsize: Tuple[ExpressionT, ...]) -> Optional[Generable]: return None + def emit_temp_var_decl_for_tv_with_base_storage(self, + codegen_state: CodeGenerationState, + tv: TemporaryVariable) -> Generable: + """ + Returns the statement for initializing a :class:`loopy.TemporaryVariable` + with a user-provided :attr:`loopy.TemporaryVariable.base_storage`. 
+ """ + assert tv.base_storage is not None + assert isinstance(tv.address_space, AddressSpace) + ecm = codegen_state.expression_to_code_mapper + + cast_decl = POD(self, tv.dtype, "") + temp_var_decl = POD(self, tv.dtype, tv.name) + + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: + # The 'restrict' part of this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? + ptrtype = _ConstRestrictPointer + + cast_decl = self.wrap_decl_for_address_space( + ptrtype(cast_decl), tv.address_space) + temp_var_decl = self.wrap_decl_for_address_space( + ptrtype(temp_var_decl), tv.address_space) + + cast_tp, cast_d = cast_decl.get_decl_pair() + return Initializer( + temp_var_decl, + "({} {}) ({} + {})".format( + " ".join(cast_tp), cast_d, tv.base_storage, ecm(tv.offset) + ), + ) + def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import AddressSpace @@ -930,33 +967,9 @@ def get_temporary_decls(self, codegen_state, schedule_index): else: assert tv.initializer is None - - cast_decl = POD(self, tv.dtype, "") - temp_var_decl = POD(self, tv.dtype, tv.name) - - if tv._base_storage_access_may_be_aliasing: - ptrtype = _ConstPointer - else: - # The 'restrict' part of this is a complete lie--of course - # all these temporaries are aliased. But we're promising to - # not use them to shovel data from one representation to the - # other. That counts, right? 
- ptrtype = _ConstRestrictPointer - - cast_decl = self.wrap_decl_for_address_space( - ptrtype(cast_decl), tv.address_space) - temp_var_decl = self.wrap_decl_for_address_space( - ptrtype(temp_var_decl), tv.address_space) - - cast_tp, cast_d = cast_decl.get_decl_pair() - temp_var_decl = Initializer( - temp_var_decl, - "({} {}) ({} + {})".format( - " ".join(cast_tp), cast_d, - tv.base_storage, - ecm(tv.offset) - )) - + temp_var_decl = self.emit_temp_var_decl_for_tv_with_base_storage( + codegen_state, tv + ) temp_decls_using_base_storage.append(temp_var_decl) # }}} diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index fc3238e92..9cde501a7 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -94,21 +94,21 @@ def python_dtype_str_inner(self, dtype): return f"_lpy_np.dtype(_lpy_np.{name})" raise Exception(f"dtype: {dtype} not recognized") - # {{{ handle non numpy arguements + # {{{ handle non numpy arguments def handle_non_numpy_arg(self, gen, arg): pass # }}} - # {{{ handle allocation of unspecified arguements + # {{{ handle allocation of unspecified arguments def handle_alloc( self, gen: CodeGenerator, arg: ArrayArg, strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str], skip_arg_checks: bool) -> None: """ - Handle allocation of non-specified arguements for C-execution + Handle allocation of non-specified arguments for C-execution """ from pymbolic import var @@ -181,7 +181,7 @@ def target_specific_preamble(self, gen): def initialize_system_args(self, gen): """ - Initializes possibly empty system arguements + Initializes possibly empty system arguments """ pass @@ -238,7 +238,7 @@ class CCompiler: The general strategy here is as follows: 1. A :class:`codepy.Toolchain` is guessed from distutils. - The user may override any flags obtained therein by passing in arguements + The user may override any flags obtained therein by passing in arguments to cc, cflags, etc. 2. 
The kernel source is built into and object first, then made into a shared diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 276407fc1..810ac4379 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -28,7 +28,7 @@ import islpy as isl import pymbolic.primitives as p from pymbolic import var -from pymbolic.mapper import IdentityMapper, RecursiveMapper +from pymbolic.mapper import IdentityMapper, Mapper from pymbolic.mapper.stringifier import ( PREC_BITWISE_AND, PREC_BITWISE_OR, @@ -124,9 +124,8 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s): return s - def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None): - result = RecursiveMapper.rec(self, expr, type_context) - + def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None): # type: ignore[override] + result = Mapper.rec(self, expr, type_context) if needed_type is None: return result else: @@ -604,7 +603,7 @@ def map_nan(self, expr, type_context): # {{{ C expression to code mapper -class CExpressionToCodeMapper(RecursiveMapper): +class CExpressionToCodeMapper(Mapper): # {{{ helpers diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 3155b3bff..afeb5cee2 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -35,7 +35,13 @@ from loopy.codegen.result import CodeGenerationResult from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag -from loopy.kernel.data import AddressSpace, ArrayArg, ConstantArg, ImageArg +from loopy.kernel.data import ( + AddressSpace, + ArrayArg, + ConstantArg, + ImageArg, + TemporaryVariable, +) from loopy.kernel.function_interface import ScalarCallable from loopy.target.c import CFamilyASTBuilder, CFamilyTarget from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -462,6 +468,39 @@ def 
get_image_arg_declarator( self, arg: ImageArg, is_written: bool) -> Declarator: raise NotImplementedError("not yet: texture arguments in CUDA") + def emit_temp_var_decl_for_tv_with_base_storage(self, + codegen_state: CodeGenerationState, + tv: TemporaryVariable) -> Generable: + from cgen import Initializer + + from loopy.target.c import POD, _ConstPointer, _ConstRestrictPointer + + assert tv.base_storage is not None + ecm = codegen_state.expression_to_code_mapper + + cast_decl = POD(self, tv.dtype, "") + temp_var_decl = POD(self, tv.dtype, tv.name) + + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: + # The 'restrict' part of this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? + ptrtype = _ConstRestrictPointer + + cast_decl = ptrtype(cast_decl) + temp_var_decl = ptrtype(temp_var_decl) + + cast_tp, cast_d = cast_decl.get_decl_pair() + return Initializer( + temp_var_decl, + "({} {}) ({} + {})".format( + " ".join(cast_tp), cast_d, tv.base_storage, ecm(tv.offset) + ), + ) + # }}} # {{{ atomics diff --git a/loopy/target/execution.py b/loopy/target/execution.py index cb081a3e5..d1455530d 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -187,7 +187,7 @@ def generate_integer_arg_finding_from_array_data( if shape_i is not None: equations.append( _ArgFindingEquation( - lhs=var(arg.name).attr("shape").index(axis_nr), + lhs=var(arg.name).attr("shape")[axis_nr], rhs=shape_i, order=0, based_on_names=frozenset({arg.name}))) @@ -198,7 +198,7 @@ def generate_integer_arg_finding_from_array_data( equations.append( _ArgFindingEquation( lhs=var("_lpy_even_div")( - var(arg.name).attr("strides").index(axis_nr), + var(arg.name).attr("strides")[axis_nr], arg.dtype.itemsize), rhs=_str_to_expr(stride_i), order=0, @@ -377,21 +377,21 @@ def generate_value_arg_check( # }}} - # {{{ handle non numpy 
arguements + # {{{ handle non numpy arguments def handle_non_numpy_arg(self, gen: CodeGenerator, arg): raise NotImplementedError() # }}} - # {{{ handle allocation of unspecified arguements + # {{{ handle allocation of unspecified arguments def handle_alloc( self, gen: CodeGenerator, arg: ArrayArg, strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str], skip_arg_checks: bool) -> None: """ - Handle allocation of non-specified arguements for C-execution + Handle allocation of non-specified arguments for C-execution """ raise NotImplementedError() @@ -647,7 +647,7 @@ def target_specific_preamble(self, gen): def initialize_system_args(self, gen): """ - Override to intialize any default system args + Override to initialize any default system args """ raise NotImplementedError() @@ -674,7 +674,7 @@ def __call__(self, program, entrypoint, codegen_result): """ Generates the wrapping python invoker for this execution target - :arg kernel: the loopy :class:`LoopKernel`(s) to be executued + :arg kernel: the loopy :class:`LoopKernel`(s) to be executed :codegen_result: the loopy :class:`CodeGenerationResult` created by code generation @@ -944,7 +944,7 @@ def __call__(self, queue, **kwargs): # }}} -# {{{ code highlighers +# {{{ code highlighters def get_highlighted_code(text, python=False): diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index ce2a150b0..31d1cfd2d 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -112,7 +112,7 @@ def map_subscript(self, expr, type_context): if (isinstance(ary, TemporaryVariable) and ary.address_space == AddressSpace.PRIVATE): - # generate access code for acccess to private-index temporaries + # generate access code for access to private-index temporaries gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index e2f3ecda2..14383e54f 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -642,7 +642,7 @@ def 
get_function_declaration( from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) if not codegen_state.is_entrypoint: - # auxiliary kernels need not mention opencl speicific qualifiers + # auxiliary kernels need not mention opencl specific qualifiers # for a functions signature return preambles, fdecl @@ -908,7 +908,7 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, # }}} -# {{{ volatile mem acccess target +# {{{ volatile mem access target class VolatileMemExpressionToOpenCLCExpressionMapper( ExpressionToOpenCLCExpressionMapper): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ec702e39b..e4da6cd8b 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -324,7 +324,7 @@ def binary_tree_add(start, end): # -ffp-contract=fast which is the default for PTX codegen, but # for some unknown reason, clang fails to see the FMAs. # - # We need to do this only for complex as we haev temporaries + # We need to do this only for complex as we have temporaries # only in complex. 
For reals, the code generated looks like # # res = c + a * b @@ -879,7 +879,7 @@ def get_kernel_call( value_arg_code = generate_value_arg_setup( codegen_state.kernel, regular_arg_names) - arry_arg_code = generate_array_arg_setup( + array_arg_code = generate_array_arg_setup( codegen_state.kernel, regular_arg_names) if struct_overflow_arg_names: @@ -888,7 +888,7 @@ def get_kernel_call( struct_overflow_arg_names) py_passed_args = [] - struct_pack_types = [] + struct_pack_types: list[str] = [] struct_pack_args = [] for arg_name in skai.passed_names: @@ -952,7 +952,7 @@ def get_kernel_call( "argument count of the kernel ({_lpy_knl.num_args}).'"), Line(), value_arg_code, - arry_arg_code, + array_arg_code, overflow_args_code, Assign("_lpy_evt", f"{self.target.pyopencl_module_name}.enqueue_nd_range_kernel(" @@ -1207,7 +1207,7 @@ def get_expression_to_c_expression_mapper(self, codegen_state): # }}} -# {{{ volatile mem acccess target +# {{{ volatile mem access target class VolatileMemPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): def get_expression_to_c_expression_mapper(self, codegen_state): diff --git a/loopy/tools.py b/loopy/tools.py index 50a523ee8..2e3b5db4f 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -972,4 +972,13 @@ def _get_persistent_hashable_arg(arg): # }}} + +def is_hashable(o: object) -> bool: + try: + hash(o) + except TypeError: + return False + return True + + # vim: fdm=marker diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index 73bd5cdde..7ab5e376e 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -91,7 +91,7 @@ def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, new_kernel = kernel.copy(instructions=kernel.instructions + [barrier_to_add]) if insn_after is not None: - new_kernel = add_dependency(kernel=new_kernel, + new_kernel = add_dependency(new_kernel, insn_match=insn_after, depends_on="id:"+id) diff --git a/loopy/transform/array_buffer_map.py 
b/loopy/transform/array_buffer_map.py index ec3737233..7e7b6459c 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -413,17 +413,17 @@ def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs): except_inames=frozenset(self.primed_sweep_inames)) s2s_domain = stor2sweep.domain() - s2s_domain, aligned_g_s2s_parm_dom = isl.align_two( + s2s_domain, aligned_g_s2s_param_dom = isl.align_two( s2s_domain, global_s2s_par_dom) arg_restrictions = ( - aligned_g_s2s_parm_dom + aligned_g_s2s_param_dom .eliminate(dim_type.set, 0, - aligned_g_s2s_parm_dom.dim(dim_type.set)) + aligned_g_s2s_param_dom.dim(dim_type.set)) .remove_divs()) return (arg_restrictions & s2s_domain).is_subset( - aligned_g_s2s_parm_dom) + aligned_g_s2s_param_dom) class NoOpArrayToBufferMap(ArrayToBufferMapBase): diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 1fe40a370..6866f23f3 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -20,6 +20,8 @@ THE SOFTWARE. 
""" +from collections.abc import Sequence + from immutables import Map import islpy as isl @@ -27,7 +29,11 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.function_interface import ( + CallableKernel, + InKernelCallable, + ScalarCallable, +) from loopy.kernel.instruction import ( Assignment, CallInstruction, @@ -40,7 +46,7 @@ RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, ) -from loopy.translation_unit import TranslationUnit, for_each_kernel +from loopy.translation_unit import FunctionIdT, TranslationUnit, for_each_kernel __doc__ = """ @@ -80,10 +86,8 @@ def register_callable(translation_unit, function_identifier, callable_, callables_table=new_callables) -def merge(translation_units): +def merge(translation_units: Sequence[TranslationUnit]) -> TranslationUnit: """ - :param translation_units: A sequence of :class:`loopy.TranslationUnit`. - :returns: An instance of :class:`loopy.TranslationUnit` which contains all the callables from each of the *translation_units. 
""" @@ -102,7 +106,7 @@ def merge(translation_units): if (prg_i.callables_table[clbl_name] != prg_j.callables_table[clbl_name]): # TODO: generate unique names + rename for the colliding - # callables (if entrypoints are colliding that shuold still + # callables (if entrypoints are colliding that should still # be an error) raise NotImplementedError("Translation units to be merged" " must have different callable names" @@ -110,7 +114,7 @@ def merge(translation_units): # }}} - callables_table = {} + callables_table: dict[FunctionIdT, InKernelCallable] = {} for trans_unit in translation_units: callables_table.update(trans_unit.callables_table) @@ -534,7 +538,12 @@ def inline_callable_kernel(translation_unit, function_name): # {{{ rename_callable -def rename_callable(program, old_name, new_name=None, existing_ok=False): +def rename_callable( + t_unit: TranslationUnit, + old_name: str, + new_name: str | None = None, + existing_ok=False + ) -> TranslationUnit: """ :arg program: An instance of :class:`loopy.TranslationUnit` :arg old_name: The callable to be renamed @@ -548,21 +557,21 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False): SubstitutionRuleMappingContext, ) - assert isinstance(program, TranslationUnit) + assert isinstance(t_unit, TranslationUnit) assert isinstance(old_name, str) - if (new_name in program.callables_table) and not existing_ok: + if (new_name in t_unit.callables_table) and not existing_ok: raise LoopyError(f"callables named '{new_name}' already exists") if new_name is None: - namegen = UniqueNameGenerator(program.callables_table.keys()) + namegen = UniqueNameGenerator(t_unit.callables_table.keys()) new_name = namegen(old_name) assert isinstance(new_name, str) new_callables_table = {} - for name, clbl in program.callables_table.items(): + for name, clbl in t_unit.callables_table.items(): if name == old_name: name = new_name @@ -582,12 +591,12 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False): 
new_callables_table[name] = clbl - new_entrypoints = program.entrypoints.copy() + new_entrypoints = t_unit.entrypoints.copy() if old_name in new_entrypoints: new_entrypoints = ((new_entrypoints | frozenset([new_name])) - frozenset([old_name])) - return program.copy(callables_table=Map(new_callables_table), + return t_unit.copy(callables_table=Map(new_callables_table), entrypoints=new_entrypoints) # }}} diff --git a/loopy/transform/concatenate.py b/loopy/transform/concatenate.py index 8a4bb28fb..fcf2e07ca 100644 --- a/loopy/transform/concatenate.py +++ b/loopy/transform/concatenate.py @@ -85,6 +85,9 @@ def concatenate_arrays( axis_length += ary.shape[axis_nr] new_ary = arrays[0] + if not isinstance(new_ary.shape, tuple): + raise ValueError("one of the arrays has indeterminate shape") + new_shape = list(new_ary.shape) new_shape[axis_nr] = axis_length new_ary = new_ary.copy(shape=tuple(new_shape)) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 088d89643..ea6cc0fc1 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -252,7 +252,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, footprint_subscripts, var_descr) # Our _not_provided is actually a different object from the one in the - # precompute module, but precompute acutally uses that to adjust its + # precompute module, but precompute actually uses that to adjust its # warning message. 
from loopy.transform.precompute import precompute_for_single_kernel @@ -292,7 +292,13 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, return new_kernel -def add_prefetch(program, *args, **kwargs): +def add_prefetch(t_unit, + var_name, sweep_inames=None, dim_arg_names=None, + default_tag=None, + rule_name=None, temporary_name=None, + temporary_address_space=None, temporary_scope=None, + footprint_subscripts=None, fetch_bounding_box=False, + fetch_outer_inames=None, prefetch_insn_id=None, within=None): """Prefetch all accesses to the variable *var_name*, with all accesses being swept through *sweep_inames*. @@ -379,7 +385,7 @@ def add_prefetch(program, *args, **kwargs): :arg fetch_outer_inames: The inames within which the fetch instruction is nested. If *None*, make an educated guess. - :arg fetch_insn_id: The ID of the instruction generated to perform the + :arg prefetch_insn_id: The ID of the instruction generated to perform the prefetch. :arg within: a stack match as understood by @@ -388,14 +394,26 @@ def add_prefetch(program, *args, **kwargs): This function internally uses :func:`extract_subst` and :func:`precompute`. 
""" - assert isinstance(program, TranslationUnit) + assert isinstance(t_unit, TranslationUnit) new_callables = {} - for func_id, in_knl_callable in program.callables_table.items(): + for func_id, in_knl_callable in t_unit.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = add_prefetch_for_single_kernel( - in_knl_callable.subkernel, program.callables_table, - *args, **kwargs) + in_knl_callable.subkernel, t_unit.callables_table, + var_name=var_name, + sweep_inames=sweep_inames, + dim_arg_names=dim_arg_names, + default_tag=default_tag, + rule_name=rule_name, + temporary_name=temporary_name, + temporary_address_space=temporary_address_space, + temporary_scope=temporary_scope, + footprint_subscripts=footprint_subscripts, + fetch_bounding_box=fetch_bounding_box, + fetch_outer_inames=fetch_outer_inames, + prefetch_insn_id=prefetch_insn_id, + within=within) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -407,7 +425,7 @@ def add_prefetch(program, *args, **kwargs): new_callables[func_id] = in_knl_callable - return program.copy(callables_table=Map(new_callables)) + return t_unit.copy(callables_table=Map(new_callables)) # }}} @@ -653,7 +671,7 @@ def set_argument_order(kernel, arg_names): :arg arg_names: A list (or comma-separated string) or argument names. All arguments must be in this list. """ - # FIXME: @inducer -- shoulld this only affect the root kernel, or should it + # FIXME: @inducer -- should this only affect the root kernel, or should it # take a within? if isinstance(arg_names, str): @@ -1007,6 +1025,11 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel, raise LoopyError( f"Temporary '{tv.name}' has an offset and no base_storage. " "That's not allowed.") + if not isinstance(tv.dtype, LoopyType): + raise LoopyError( + f"Dtype of temporary '{tv.name}' " + " is not inferred. 
Call lp.infer_unknown_types" + " first.") if (tv.base_storage and tv.base_storage not in kernel.temporary_variables diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index bb828221f..6c2688d90 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -154,7 +154,7 @@ def map_call(self, expr, *args): dc = self.diff_context if expr.function.name in dc.kernel.substitutions: - # FIXME: Deal with subsitution rules + # FIXME: Deal with substitution rules # Need to use chain rule here, too. raise NotImplementedError("substitution rules in differentiation") else: @@ -382,8 +382,8 @@ def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", assert isinstance(kernel, LoopKernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + kernel = apply_single_writer_dependency_heuristic(kernel, warn_if_used=True) if isinstance(diff_outputs, str): diff_outputs = [ diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 18df3dae4..97257745c 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -21,7 +21,7 @@ """ -from typing import FrozenSet, Optional +from typing import Any, FrozenSet, Optional import islpy as isl from islpy import dim_type @@ -29,6 +29,7 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.instruction import InstructionBase from loopy.symbolic import ( RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, @@ -919,9 +920,13 @@ def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, old_to_new=dict(list(zip(inames, new_inames))), within=within) - def _does_access_old_inames(kernel, insn, *args): - return bool(frozenset(inames) & (insn.dependency_names() - | insn.reduction_inames())) + def 
_does_access_old_inames(kernel: LoopKernel, + insn: InstructionBase, + *args: Any) -> bool: + all_inames = (insn.within_inames + | insn.reduction_inames() + | insn.sub_array_ref_inames()) + return bool(frozenset(inames) & all_inames) kernel = rule_mapping_context.finish_kernel( indup.map_kernel(kernel, within=_does_access_old_inames, @@ -1540,7 +1545,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): break if not found: - raise LoopyError("invlaid tag kind: %s" % kind) + raise LoopyError("invalid tag kind: %s" % kind) from loopy.match import parse_match match = parse_match(insn_match) @@ -2265,7 +2270,7 @@ def add_inames_for_unused_hw_axes(kernel, within=None): Current limitations: * Only one iname in the kernel may be tagged with each of the unused hw axes. - * Occurence of an ``l.auto`` tag when an instruction is missing one of the + * Occurrence of an ``l.auto`` tag when an instruction is missing one of the local hw axes. :arg within: An instruction match as understood by diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 629916628..494bbf0bc 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -267,6 +267,7 @@ def replace_instruction_ids_in_insn( new_no_sync_with: List[Tuple[str, str]] = [] if insn.id in replacements: + assert isinstance(insn.id, str) insn = insn.copy(id=replacements[insn.id][0]) new_depends_on = list(insn.depends_on) @@ -425,7 +426,7 @@ def insns_in_conflicting_groups(insn1_id, insn2_id): if not nosync_to_add and not empty_ok: raise LoopyError("No nosync annotations were added as a result " "of this call. add_nosync will (by default) only add them to " - "accompany existing depencies or group exclusions. Maybe you want " + "accompany existing dependencies or group exclusions. 
Maybe you want " "to pass force=True?") new_instructions = list(kernel.instructions) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 1c1b48c59..2a82952c2 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -48,7 +48,7 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, :arg call_name: An instance of :class:`str` denoting the function call in the *kernel*. - :arg args_to_unpack: A list of the arguments as instances of :class:`str` which + :arg args_to_pack: A list of the arguments as instances of :class:`str` which must be packed. If set *None*, it is interpreted that all the array arguments would be packed. :arg args_to_unpack: A list of the arguments as instances of :class:`str` diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index d04fa5b2d..2c91643ac 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -47,6 +47,7 @@ RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, + flatten, get_dependencies, ) from loopy.transform.array_buffer_map import ( @@ -296,7 +297,7 @@ def map_substitution(self, name, tag, arguments, expn_state): new_outer_expr = var(self.temporary_name) if stor_subscript: - new_outer_expr = new_outer_expr.index(tuple(stor_subscript)) + new_outer_expr = new_outer_expr[tuple(stor_subscript)] # Can't possibly be nested, and no need to traverse # further as compute expression has already been seen @@ -928,7 +929,7 @@ def add_assumptions(d): storage_axis_subst_dict[ prior_storage_axis_name_dict.get(arg_name, arg_name)] = \ - arg+base_index + flatten(arg+base_index) rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py index e9b2b8c53..ca31368d2 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ 
-43,7 +43,7 @@ # {{{ privatize temporaries with iname -from loopy.symbolic import IdentityMapper +from loopy.symbolic import IdentityMapper, flatten class ExtraInameIndexInserter(IdentityMapper): @@ -66,7 +66,7 @@ def map_subscript(self, expr): self.seen_priv_axis_inames.update(v.name for v in extra_idx) - new_idx = index + tuple(v - self.iname_to_lbound[v.name] + new_idx = index + tuple(flatten(v - self.iname_to_lbound[v.name]) for v in extra_idx) if len(new_idx) == 1: @@ -81,7 +81,7 @@ def map_variable(self, expr): else: self.seen_priv_axis_inames.update(v.name for v in new_idx) - new_idx = tuple(v - self.iname_to_lbound[v.name] + new_idx = tuple(flatten(v - self.iname_to_lbound[v.name]) for v in new_idx) if len(new_idx) == 1: @@ -247,7 +247,7 @@ def privatize_temporaries_with_inames( "Kernel '%s': Instruction '%s': touched variable that " "(for privatization, e.g. as performed for ILP) " "required iname(s) '%s', but that the instruction was not " - "previously within the iname(s). To remedy this, first promote" + "previously within the iname(s). To remedy this, first promote " "the instruction into the iname." 
% (kernel.name, insn.id, ", ".join( eiii.seen_priv_axis_inames - insn.within_inames))) diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index 5161efba6..7d1f3c870 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -711,7 +711,7 @@ def _add_to_depends_on(insn_id, new_depends_on_params): needs_replacement = True - # {{{ generate a new assignent instruction + # {{{ generate a new assignment instruction new_assignee_name = var_name_gen( "{insn_id}_retval_{assignee_nr}" @@ -2019,7 +2019,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, | red_realize_ctx.surrounding_insn_add_within_inames)) kwargs.pop("id") - kwargs.pop("depends_on") + kwargs.pop("happens_after") kwargs.pop("expression") kwargs.pop("assignee", None) kwargs.pop("assignees", None) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index b5c7aa7a1..422d22568 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -327,8 +327,8 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ establish the relevant definition of lhs_name for each usage site dep_kernel = expand_subst(kernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - dep_kernel = apply_single_writer_depencency_heuristic(dep_kernel) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + dep_kernel = apply_single_writer_dependency_heuristic(dep_kernel) assigning_insn_ids = {insn.id for insn in dep_kernel.instructions if lhs_name in insn.assignee_var_names()} @@ -354,7 +354,7 @@ def get_relevant_definition_insn_id(usage_insn_id): if len(rel_def_ids) > 1: raise LoopyError("more than one write to '%s' found in " - "depdendencies of '%s'--definition cannot be resolved " + "dependencies of '%s'--definition cannot be resolved " "(writer instructions ids: %s)" % (lhs_name, usage_insn_id, ", ".join(rel_def_ids))) @@ -433,7 +433,7 @@ def 
_accesses_lhs(kernel, insn, *args): for i in indices: if not isinstance(i, Variable): raise LoopyError("In defining instruction '%s': " - "asignee index '%s' is not a plain variable. " + "assignee index '%s' is not a plain variable. " "Perhaps use loopy.affine_map_inames() " "to perform substitution." % (def_id, i)) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index c0d1b0b05..4afdfcef7 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -27,10 +27,20 @@ from collections.abc import Set as abc_Set from dataclasses import dataclass, field, replace from functools import wraps -from typing import TYPE_CHECKING, Any, Callable, FrozenSet, Mapping, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + FrozenSet, + Mapping, + Optional, + TypeVar, + Union, +) from warnings import warn from immutables import Map +from typing_extensions import Concatenate, ParamSpec, Self from pymbolic.primitives import Call, Variable @@ -76,8 +86,16 @@ .. autofunction:: make_program +.. autofunction:: check_each_kernel + .. autofunction:: for_each_kernel +.. autoclass:: TUnitOrKernelT + +.. class:: P + + A :class:`typing.ParamSpec` for use in annotating :func:`for_each_kernel` and + :func:`check_each_kernel`. """ @@ -182,6 +200,8 @@ class TranslationUnit: The :class:`~loopy.LoopKernel` representing the main entrypoint of the program, if defined. Currently, this attribute may only be accessed if there is exactly one entrypoint in the translation unit. + Will raise an error if the default entrypoint is not a + :class:`~loopy.LoopKernel`. .. attribute:: callables_table @@ -196,7 +216,7 @@ class TranslationUnit: .. attribute:: func_id_to_in_knl_callables_mappers A :class:`frozenset` of functions of the signature ``(target: - TargetBase, function_indentifier: str)`` that returns an instance + TargetBase, function_identifier: str)`` that returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. .. 
automethod:: executor @@ -226,9 +246,9 @@ def __post_init__(self): object.__setattr__(self, "_program_executor_cache", {}) - def copy(self, **kwargs): + def copy(self, **kwargs: Any) -> Self: target = kwargs.pop("target", None) - program = replace(self, **kwargs) + t_unit = replace(self, **kwargs) if target: from loopy.kernel import KernelState if max(callable_knl.subkernel.state @@ -240,7 +260,7 @@ def copy(self, **kwargs): "preprocessed, cannot modify target now.") new_callables = {} - for func_id, clbl in program.callables_table.items(): + for func_id, clbl in t_unit.callables_table.items(): if isinstance(clbl, CallableKernel): knl = clbl.subkernel knl = knl.copy(target=target) @@ -251,16 +271,12 @@ def copy(self, **kwargs): raise NotImplementedError() new_callables[func_id] = clbl - program = replace( + t_unit = replace( self, callables_table=Map(new_callables), target=target) - return program + return t_unit - def with_entrypoints(self, entrypoints): - """ - :param entrypoints: Either a comma-separated :class:`str` or - :class:`frozenset`. - """ + def with_entrypoints(self, entrypoints: str | frozenset[str]) -> Self: if isinstance(entrypoints, str): entrypoints = frozenset([e.strip() for e in entrypoints.split(",")]) @@ -278,7 +294,7 @@ def state(self): if isinstance(callable_knl, CallableKernel)), default=KernelState.INITIAL) - def with_kernel(self, kernel): + def with_kernel(self, kernel: LoopKernel) -> Self: """ If *self* contains a callable kernel with *kernel*'s name, replaces its subkernel and returns a copy of *self*. Else records a new callable @@ -300,9 +316,9 @@ def with_kernel(self, kernel): new_callables = self.callables_table.set(kernel.name, clbl) return self.copy(callables_table=new_callables) - def __getitem__(self, name): + def __getitem__(self, name) -> LoopKernel: """ - For the callable named *name*, return a :class:`loopy.LoopKernel` if + For the callable named *name*, return a :class:`loopy.LoopKernel`. 
if it's a :class:`~loopy.kernel.function_interface.CallableKernel` otherwise return the callable itself. """ @@ -310,13 +326,20 @@ def __getitem__(self, name): if isinstance(result, CallableKernel): return result.subkernel else: - return result + raise ValueError("TranslationUnit.__getitem__ " + "can only be used for instances of LoopKernel. " + "Access all other callables via callables_table.") @property - def default_entrypoint(self): + def default_entrypoint(self) -> LoopKernel: if len(self.entrypoints) == 1: - entrypoint, = self.entrypoints - return self[entrypoint] + ep_name, = self.entrypoints + entrypoint = self[ep_name] + + if not isinstance(entrypoint, LoopKernel): + raise ValueError("default entrypoint is not a kernel") + + return entrypoint else: raise ValueError("TranslationUnit has multiple possible entrypoints." " The default entrypoint kernel is not uniquely" @@ -726,6 +749,9 @@ def __getitem__(self, name): # }}} +TUnitOrKernelT = TypeVar("TUnitOrKernelT", LoopKernel, TranslationUnit) + + # {{{ helper functions def make_program(kernel: LoopKernel) -> TranslationUnit: @@ -741,21 +767,46 @@ def make_program(kernel: LoopKernel) -> TranslationUnit: entrypoints=frozenset()) -def for_each_kernel(transform): +P = ParamSpec("P") + + +def check_each_kernel( + check: Callable[Concatenate[LoopKernel, P], None] + ) -> Callable[Concatenate[TranslationUnit, P], None]: + def _collective_check( + t_unit_or_kernel: TranslationUnit | LoopKernel, /, + *args: P.args, + **kwargs: P.kwargs + ) -> None: + if isinstance(t_unit_or_kernel, TranslationUnit): + for clbl in t_unit_or_kernel.callables_table.values(): + if isinstance(clbl, CallableKernel): + check(clbl.subkernel, *args, **kwargs) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError(f"{type(clbl)}") + elif isinstance(t_unit_or_kernel, LoopKernel): + check(t_unit_or_kernel, *args, **kwargs) + else: + raise TypeError("expected LoopKernel or TranslationUnit") + + return 
wraps(check)(_collective_check) + + +def for_each_kernel( + transform: Callable[Concatenate[LoopKernel, P], LoopKernel] + ) -> Callable[Concatenate[TUnitOrKernelT, P], TUnitOrKernelT]: """ Function wrapper for transformations of the type ``transform(kernel: LoopKernel, *args, **kwargs) -> LoopKernel``. Returns a function that would apply *transform* to all callable kernels in a :class:`loopy.TranslationUnit`. """ - def _collective_transform(*args, **kwargs): - if "translation_unit" in kwargs: - t_unit_or_kernel = kwargs.pop("translation_unit") - elif "kernel" in kwargs: - t_unit_or_kernel = kwargs.pop("kernel") - else: - t_unit_or_kernel = args[0] - args = args[1:] - + def _collective_transform( + t_unit_or_kernel: TUnitOrKernelT, /, + *args: P.args, + **kwargs: P.kwargs + ) -> TUnitOrKernelT: if isinstance(t_unit_or_kernel, TranslationUnit): t_unit = t_unit_or_kernel new_callables = {} @@ -771,10 +822,11 @@ def _collective_transform(*args, **kwargs): new_callables[func_id] = clbl return t_unit.copy(callables_table=Map(new_callables)) - else: - assert isinstance(t_unit_or_kernel, LoopKernel) + elif isinstance(t_unit_or_kernel, LoopKernel): kernel = t_unit_or_kernel return transform(kernel, *args, **kwargs) + else: + raise TypeError("expected LoopKernel or TranslationUnit") return wraps(transform)(_collective_transform) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index b997cee25..b8e3db720 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -43,7 +43,11 @@ parse_tagged_name, ) from loopy.tools import is_integer -from loopy.translation_unit import CallablesInferenceContext, make_clbl_inf_ctx +from loopy.translation_unit import ( + CallablesInferenceContext, + TranslationUnit, + make_clbl_inf_ctx, +) from loopy.types import NumpyType @@ -396,7 +400,7 @@ def map_constant(self, expr): def map_type_cast(self, expr): subtype, = self.rec(expr.child) - if not issubclass(subtype.dtype.type, np.number): + if not 
issubclass(subtype.dtype.type, (np.number, np.bool_)): raise LoopyError(f"Can't cast a '{subtype}' to '{expr.type}'") return [expr.type] @@ -1026,31 +1030,34 @@ def _instruction_missed_during_inference(insn): return type_specialized_kernel, clbl_inf_ctx -def infer_unknown_types(program, expect_completion=False): +def infer_unknown_types( + t_unit: TranslationUnit, + expect_completion: bool = False + ) -> TranslationUnit: """Infer types on temporaries and arguments.""" from loopy.kernel.data import auto from loopy.translation_unit import resolve_callables - program = resolve_callables(program) + t_unit = resolve_callables(t_unit) # {{{ early-exit criterion if all(clbl.is_type_specialized() - for clbl in program.callables_table.values()): + for clbl in t_unit.callables_table.values()): # all the callables including the kernels have inferred their types # => no need for type inference - return program + return t_unit # }}} - clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, - program.entrypoints) + clbl_inf_ctx = make_clbl_inf_ctx(t_unit.callables_table, + t_unit.entrypoints) - for e in program.entrypoints: + for e in t_unit.entrypoints: logger.debug(f"Entering entrypoint: {e}") arg_id_to_dtype = {arg.name: arg.dtype for arg in - program[e].args if arg.dtype not in (None, auto)} - new_callable, clbl_inf_ctx = program.callables_table[e].with_types( + t_unit[e].args if arg.dtype not in (None, auto)} + new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_types( arg_id_to_dtype, clbl_inf_ctx) clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, is_entrypoint=True) @@ -1073,7 +1080,7 @@ def infer_unknown_types(program, expect_completion=False): raise LoopyError("could not determine type of" f" '{vars_not_inferred.pop()}' of kernel '{e}'.") - return clbl_inf_ctx.finish_program(program) + return clbl_inf_ctx.finish_program(t_unit) # }}} diff --git a/loopy/types.py b/loopy/types.py index 143715a39..a837d1c46 100644 --- a/loopy/types.py +++ 
b/loopy/types.py @@ -165,7 +165,7 @@ def __repr__(self): class OpaqueType(LoopyType): """An opaque data type is truly opaque - it has no allocations, no temporaries of that type, etc. The only thing allowed is to be pass in - through one ValueArg and go out to another. It is introduced to accomodate + through one ValueArg and go out to another. It is introduced to accommodate functional calls to external libraries. """ def __init__(self, name: str) -> None: diff --git a/loopy/typing.py b/loopy/typing.py index 948616578..e6166fb7a 100644 --- a/loopy/typing.py +++ b/loopy/typing.py @@ -1,3 +1,15 @@ +""" +.. autoclass:: IntegralT +.. autoclass:: FloatT +.. autoclass:: ExpressionT +.. autoclass:: ShapeType +.. autoclass:: auto +""" + + +from __future__ import annotations + + __copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees" __license__ = """ @@ -24,25 +36,28 @@ from typing import Optional, Tuple, TypeVar, Union import numpy as np +from typing_extensions import TypeAlias from pymbolic.primitives import Expression -IntegralT = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8, +IntegralT: TypeAlias = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64] -FloatT = Union[float, complex, np.float32, np.float64, np.complex64, +FloatT: TypeAlias = Union[float, complex, np.float32, np.float64, np.complex64, np.complex128] -ExpressionT = Union[IntegralT, FloatT, Expression] -ShapeType = Tuple[ExpressionT, ...] -StridesType = ShapeType +ExpressionT: TypeAlias = Union[IntegralT, FloatT, Expression] +ShapeType: TypeAlias = Tuple[ExpressionT, ...] +StridesType: TypeAlias = ShapeType + +InameStr: TypeAlias = str class auto: # noqa """A generic placeholder object for something that should be automatically determined. See, for example, the *shape* or *strides* argument of - :class:`ArrayArg`. + :class:`~loopy.ArrayArg`. 
""" diff --git a/loopy/version.py b/loopy/version.py index 09d8442a2..609e6c179 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -21,32 +21,16 @@ """ -# {{{ find install- or run-time git revision +import re +from importlib import metadata -import os +VERSION_TEXT = metadata.version("loopy") +_match = re.match("^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT) +assert _match is not None +VERSION_STATUS = _match.group(2) +VERSION = tuple(int(nr) for nr in _match.group(1).split(".")) -if os.environ.get("AKPYTHON_EXEC_IMPORT_UNAVAILABLE") is not None: - # We're just being exec'd by setup.py. We can't import anything. - _git_rev = None - -else: - import loopy._git_rev as _git_rev_mod # pylint: disable=no-name-in-module,import-error # noqa: E501 - _git_rev = _git_rev_mod.GIT_REVISION - - # If we're running from a dev tree, the last install (and hence the most - # recent update of the above git rev) could have taken place very long ago. - from pytools import find_module_git_revision - _runtime_git_rev = find_module_git_revision(__file__, n_levels_up=1) - if _runtime_git_rev is not None: - _git_rev = _runtime_git_rev - -# }}} - - -VERSION = (2024, 1) -VERSION_STATUS = "" -VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS try: import islpy.version @@ -62,8 +46,7 @@ else: _cgen_version = cgen.version.VERSION_TEXT -DATA_MODEL_VERSION = "{}-islpy{}-cgen{}-{}-v1".format( - VERSION_TEXT, _islpy_version, _cgen_version, _git_rev) +DATA_MODEL_VERSION = f"{VERSION_TEXT}-islpy{_islpy_version}-cgen{_cgen_version}-v1" FALLBACK_LANGUAGE_VERSION = (2018, 2) diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py index 7ee30313c..eb8125cdb 100644 --- a/proto-tests/test_tim.py +++ b/proto-tests/test_tim.py @@ -190,7 +190,7 @@ def test_tim3d(ctx_factory): knl = lp.split_iname(knl, "k", n, inner_tag="l.2") # , slabs=(0, 1)) knl = lp.split_iname(knl, "i", n, inner_tag="l.0") # , slabs=(0, 1)) -# knl = lp.tag_inames(knl, dict(k_nner="unr")) +# knl = lp.tag_inames(knl, 
dict(k_inner="unr")) knl = lp.tag_inames(knl, dict(o="unr")) knl = lp.tag_inames(knl, dict(m="unr")) diff --git a/pyproject.toml b/pyproject.toml index 6f7b977b6..4b4334adb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,86 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = [ + "setuptools>=63", +] + +[project] +name = "loopy" +version = "2024.1" +description = "A code generator for array-based code on CPUs and GPUs" +readme = "README.rst" +license = { text = "MIT" } +authors = [ + { name = "Andreas Kloeckner", email = "inform@tiker.net" }, +] +requires-python = ">=3.8" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Visualization", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", +] +dependencies = [ + "pytools>=2024.1.5", + "pymbolic>=2022.1", + "genpy>=2016.1.2", + + # https://github.com/inducer/loopy/pull/419 + "numpy>=1.19", + + "cgen>=2016.1", + "islpy>=2019.1", + "codepy>=2017.1", + "colorama", + "Mako", + "pyrsistent", + "immutables", + "immutabledict", + + "typing-extensions>=4", +] +[project.optional-dependencies] +pyopencl = [ + "pyopencl>=2022.3", +] +fortran = [ + # Note that this is *not* regular 'f2py2e', this is + # the Fortran parser from the (unfinished) third-edition + # f2py, as linked below. This package is not on the package index, AFAIK. 
+ # -AK, 2024-08-02 + "f2py @ git+https://github.com/pearu/f2py.git", + "ply>=3.6", +] + +[project.scripts] + +[project.urls] +Documentation = "https://documen.tician.de/loopy" +Homepage = "https://github.com/inducer/loopy" + +[tool.setuptools.packages.find] +include = [ + "loopy*", +] + +[tool.setuptools.package-data] +loopy = [ + "py.typed", +] + +[tool.setuptools.package-dir] +# https://github.com/Infleqtion/client-superstaq/pull/715 +"" = "." [tool.ruff] preview = true @@ -63,7 +146,7 @@ known-local-folder = [ lines-after-imports = 2 [tool.mypy] -python_version = 3.8 +python_version = "3.10" warn_unused_ignores = true # TODO @@ -90,3 +173,29 @@ module = [ "IPython.*", ] ignore_missing_imports = true + +[tool.typos.default] +extend-ignore-re = [ + "(?Rm)^.*(#|//)\\s*spellchecker:\\s*disable-line$" +] + +[tool.typos.default.extend-words] +# like the numpy function, array range +arange = "arange" +# N-Dimensional +ND = "ND" +# used for 'diff_output' +dout = "dout" +# an element-wise slice of array u +ue = "ue" +# used in an ordering context, "ab" / "ba" +ba = "ba" + +"dependees" = "dependees" + +[tool.typos.files] +extend-exclude = [ + "loopy/target/c/compyte", + "notes/*/*.eps", +] + diff --git a/setup.py b/setup.py deleted file mode 100644 index 0cf58f83a..000000000 --- a/setup.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python - -import os - -from setuptools import find_packages, setup - - -ver_dic = {} -version_file = open("loopy/version.py") -try: - version_file_contents = version_file.read() -finally: - version_file.close() - -os.environ["AKPYTHON_EXEC_IMPORT_UNAVAILABLE"] = "1" -exec(compile(version_file_contents, "loopy/version.py", "exec"), ver_dic) - - -# {{{ capture git revision at install time - -# authoritative version in pytools/__init__.py -def find_git_revision(tree_root): - # Keep this routine self-contained so that it can be copy-pasted into - # setup.py. 
- - from os.path import abspath, exists, join - tree_root = abspath(tree_root) - - if not exists(join(tree_root, ".git")): - return None - - from subprocess import PIPE, STDOUT, Popen - p = Popen(["git", "rev-parse", "HEAD"], shell=False, - stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True, - cwd=tree_root) - (git_rev, _) = p.communicate() - - git_rev = git_rev.decode() - - git_rev = git_rev.rstrip() - - retcode = p.returncode - assert retcode is not None - if retcode != 0: - from warnings import warn - warn("unable to find git revision", stacklevel=1) - return None - - return git_rev - - -def write_git_revision(package_name): - from os.path import dirname, join - dn = dirname(__file__) - git_rev = find_git_revision(dn) - - with open(join(dn, package_name, "_git_rev.py"), "w") as outf: - outf.write('GIT_REVISION = "%s"\n' % git_rev) - - -write_git_revision("loopy") - -# }}} - - -setup(name="loopy", - version=ver_dic["VERSION_TEXT"], - description="A code generator for array-based code on CPUs and GPUs", - long_description=open("README.rst").read(), - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Intended Audience :: Other Audience", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Natural Language :: English", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Information Analysis", - "Topic :: Scientific/Engineering :: Mathematics", - "Topic :: Scientific/Engineering :: Visualization", - "Topic :: Software Development :: Libraries", - "Topic :: Utilities", - ], - - python_requires="~=3.8", - install_requires=[ - "pytools>=2024.1.5", - "pymbolic>=2022.1", - "genpy>=2016.1.2", - - # https://github.com/inducer/loopy/pull/419 - "numpy>=1.19", - - "cgen>=2016.1", - "islpy>=2019.1", - "codepy>=2017.1", - "colorama", - "Mako", - "pyrsistent", - "immutables", - "typing_extensions", - 
], - - extras_require={ - "pyopencl": [ - "pyopencl>=2022.3", - ], - "fortran": [ - # Note that this is *not* regular 'f2py2e', this is - # the Fortran parser from the (unfinished) third-edition - # f2py, as linked below. - "f2py>=0.3.1", - "ply>=3.6", - ], - }, - - dependency_links=[ - "git+https://github.com/pearu/f2py.git" - ], - - scripts=["bin/loopy"], - - author="Andreas Kloeckner", - url="https://mathema.tician.de/software/loopy", - author_email="inform@tiker.net", - license="MIT", - packages=find_packages(), - ) diff --git a/test/test_apps.py b/test/test_apps.py index 207bc7ee2..c4cffaee1 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -324,7 +324,7 @@ def test_rob_stroud_bernstein_full(): def test_stencil(ctx_factory): ctx = ctx_factory() - # n=32 causes corner case behavior in size calculations for temprorary (a + # n=32 causes corner case behavior in size calculations for temporary (a # non-unifiable, two-constant-segments PwAff as the base index) n = 256 diff --git a/test/test_c_execution.py b/test/test_c_execution.py index e703d9415..6208b9aed 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -95,17 +95,17 @@ def test_c_target_strides_nonsquare(): from loopy.target.c import ExecutableCTarget def __get_kernel(order="C"): - indicies = ["i", "j", "k"] - sizes = tuple(np.random.randint(1, 11, size=len(indicies))) + indices = ["i", "j", "k"] + sizes = tuple(np.random.randint(1, 11, size=len(indices))) # create domain strings domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}" domains = [] - for idx, size in zip(indicies, sizes): + for idx, size in zip(indices, sizes): domains.append(domain_template.format( iname=idx, size=size)) statement = "out[{indexed}] = 2 * a[{indexed}]".format( - indexed=", ".join(indicies)) + indexed=", ".join(indices)) return lp.make_kernel( domains, statement, @@ -142,17 +142,17 @@ def test_c_optimizations(): from loopy.target.c import ExecutableCTarget def __get_kernel(order="C"): - indicies 
= ["i", "j", "k"] - sizes = tuple(np.random.randint(1, 11, size=len(indicies))) + indices = ["i", "j", "k"] + sizes = tuple(np.random.randint(1, 11, size=len(indices))) # create domain strings domain_template = "{{ [{iname}]: 0 <= {iname} < {size} }}" domains = [] - for idx, size in zip(indicies, sizes): + for idx, size in zip(indices, sizes): domains.append(domain_template.format( iname=idx, size=size)) statement = "out[{indexed}] = 2 * a[{indexed}]".format( - indexed=", ".join(indicies)) + indexed=", ".join(indices)) return lp.make_kernel( domains, statement, diff --git a/test/test_callables.py b/test/test_callables.py index d58247a75..44a94e43a 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -1397,8 +1397,8 @@ def test_inline_deps(ctx_factory): prg = lp.merge([parent_knl, child_knl]) inlined = lp.inline_callable_kernel(prg, "func") - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - apply_single_writer_depencency_heuristic(inlined, error_if_used=True) + from loopy.kernel.creation import apply_single_writer_dependency_heuristic + apply_single_writer_dependency_heuristic(inlined, error_if_used=True) _evt, (a_dev,) = inlined(cq) diff --git a/test/test_loopy.py b/test/test_loopy.py index e9aa47ef4..bfa607328 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2306,7 +2306,7 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): from testlib import GridOverride - # artifically expand via overridden_get_grid_sizes_for_insn_ids + # artificially expand via overridden_get_grid_sizes_for_insn_ids knl = prog["loopy_kernel"] knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) @@ -3689,6 +3689,21 @@ def test_no_unnecessary_lbarrier(ctx_factory): assert not barrier_between(knl, "write_s_a", "write_ao") +def test_long_kernel(): + n = 500 + insns = [ + f"a{i}[j{i}] = j{i}" + for i in range(n) + ] + domains = [ + f"{{ [j{i}]: 0<=j{i}<10 }}" + for i in range(n) + ] + t_unit = 
lp.make_kernel(domains, insns) + t_unit = lp.preprocess_kernel(t_unit) + lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_target.py b/test/test_target.py index 6c448debf..08bf286cd 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -314,10 +314,7 @@ def test_ispc_streaming_stores(): knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") knl = lp.tag_instructions(knl, "!streaming_store") - knl = lp.add_and_infer_dtypes(knl, { - var: stream_dtype - for var in vars - }) + knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype)) knl = lp.set_argument_order(knl, vars + ["n"]) diff --git a/test/test_transform.py b/test/test_transform.py index 8060d2038..98398fefd 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1704,6 +1704,36 @@ def test_precompute_lets_inner_length1_inames_live(): == parse("(e_inner + e_outer*16) / i_0")) +def test_duplicate_iname_not_read_only_nested(ctx_factory): + # See + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[i, j]: 0<=i,j<10}", + """ + for i + <> acc = 0 {id=init, tags=foo} + for j + acc = acc + A[i, j] * x[i, j] {id=update, tags=foo} + end + y[i] = acc {id=assign, tags=foo} + end + """, + [lp.GlobalArg("A,x,y", shape=lp.auto, dtype=np.float32), + ...], + seq_dependencies=True, + ) + ref_t_unit = t_unit + + t_unit = lp.duplicate_inames( + t_unit, + inames="i", within="tag:foo", new_inames="irow") + print(t_unit) + assert (t_unit.default_entrypoint.id_to_insn["init"].within_inames + == frozenset({"irow"})) + + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_tree.py b/test/test_tree.py new file mode 100644 index 000000000..3dea8470e --- /dev/null +++ b/test/test_tree.py @@ -0,0 +1,50 @@ +__copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees" + +__license__ 
= """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl as pytest_generate_tests, +) + +from loopy.schedule.tree import Tree + + +def test_tree_simple(): + tree = Tree.from_root("") + + tree = tree.add_node("bar", parent="") + tree = tree.add_node("baz", parent="bar") + + assert tree.depth("") == 0 + assert tree.depth("bar") == 1 + assert tree.depth("baz") == 2 + + assert "" in tree + assert "bar" in tree + assert "baz" in tree + assert "foo" not in tree + + tree = tree.replace_node("bar", "foo") + assert "bar" not in tree + assert "foo" in tree + + tree = tree.move_node("baz", new_parent="") + assert tree.depth("baz") == 1